skypilot-nightly 1.0.0.dev20250905__py3-none-any.whl → 1.0.0.dev20251203__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (397) hide show
  1. sky/__init__.py +10 -2
  2. sky/adaptors/aws.py +81 -16
  3. sky/adaptors/common.py +25 -2
  4. sky/adaptors/coreweave.py +278 -0
  5. sky/adaptors/do.py +8 -2
  6. sky/adaptors/gcp.py +11 -0
  7. sky/adaptors/ibm.py +5 -2
  8. sky/adaptors/kubernetes.py +64 -0
  9. sky/adaptors/nebius.py +3 -1
  10. sky/adaptors/primeintellect.py +1 -0
  11. sky/adaptors/seeweb.py +183 -0
  12. sky/adaptors/shadeform.py +89 -0
  13. sky/admin_policy.py +20 -0
  14. sky/authentication.py +157 -263
  15. sky/backends/__init__.py +3 -2
  16. sky/backends/backend.py +11 -3
  17. sky/backends/backend_utils.py +588 -184
  18. sky/backends/cloud_vm_ray_backend.py +1088 -904
  19. sky/backends/local_docker_backend.py +9 -5
  20. sky/backends/task_codegen.py +633 -0
  21. sky/backends/wheel_utils.py +18 -0
  22. sky/catalog/__init__.py +8 -0
  23. sky/catalog/aws_catalog.py +4 -0
  24. sky/catalog/common.py +19 -1
  25. sky/catalog/data_fetchers/fetch_aws.py +102 -80
  26. sky/catalog/data_fetchers/fetch_gcp.py +30 -3
  27. sky/catalog/data_fetchers/fetch_nebius.py +9 -6
  28. sky/catalog/data_fetchers/fetch_runpod.py +698 -0
  29. sky/catalog/data_fetchers/fetch_seeweb.py +329 -0
  30. sky/catalog/data_fetchers/fetch_shadeform.py +142 -0
  31. sky/catalog/kubernetes_catalog.py +24 -28
  32. sky/catalog/primeintellect_catalog.py +95 -0
  33. sky/catalog/runpod_catalog.py +5 -1
  34. sky/catalog/seeweb_catalog.py +184 -0
  35. sky/catalog/shadeform_catalog.py +165 -0
  36. sky/check.py +73 -43
  37. sky/client/cli/command.py +675 -412
  38. sky/client/cli/flags.py +4 -2
  39. sky/{volumes/utils.py → client/cli/table_utils.py} +111 -13
  40. sky/client/cli/utils.py +79 -0
  41. sky/client/common.py +12 -2
  42. sky/client/sdk.py +132 -63
  43. sky/client/sdk_async.py +34 -33
  44. sky/cloud_stores.py +82 -3
  45. sky/clouds/__init__.py +6 -0
  46. sky/clouds/aws.py +337 -129
  47. sky/clouds/azure.py +24 -18
  48. sky/clouds/cloud.py +40 -13
  49. sky/clouds/cudo.py +16 -13
  50. sky/clouds/do.py +9 -7
  51. sky/clouds/fluidstack.py +12 -5
  52. sky/clouds/gcp.py +14 -7
  53. sky/clouds/hyperbolic.py +12 -5
  54. sky/clouds/ibm.py +12 -5
  55. sky/clouds/kubernetes.py +80 -45
  56. sky/clouds/lambda_cloud.py +12 -5
  57. sky/clouds/nebius.py +23 -9
  58. sky/clouds/oci.py +19 -12
  59. sky/clouds/paperspace.py +4 -1
  60. sky/clouds/primeintellect.py +317 -0
  61. sky/clouds/runpod.py +85 -24
  62. sky/clouds/scp.py +12 -8
  63. sky/clouds/seeweb.py +477 -0
  64. sky/clouds/shadeform.py +400 -0
  65. sky/clouds/ssh.py +4 -2
  66. sky/clouds/utils/scp_utils.py +61 -50
  67. sky/clouds/vast.py +33 -27
  68. sky/clouds/vsphere.py +14 -16
  69. sky/core.py +174 -165
  70. sky/dashboard/out/404.html +1 -1
  71. sky/dashboard/out/_next/static/96_E2yl3QAiIJGOYCkSpB/_buildManifest.js +1 -0
  72. sky/dashboard/out/_next/static/chunks/1141-e6aa9ab418717c59.js +11 -0
  73. sky/dashboard/out/_next/static/chunks/1871-7e202677c42f43fe.js +6 -0
  74. sky/dashboard/out/_next/static/chunks/2260-7703229c33c5ebd5.js +1 -0
  75. sky/dashboard/out/_next/static/chunks/2369.fc20f0c2c8ed9fe7.js +15 -0
  76. sky/dashboard/out/_next/static/chunks/2755.edd818326d489a1d.js +26 -0
  77. sky/dashboard/out/_next/static/chunks/3294.20a8540fe697d5ee.js +1 -0
  78. sky/dashboard/out/_next/static/chunks/3785.7e245f318f9d1121.js +1 -0
  79. sky/dashboard/out/_next/static/chunks/{6601-06114c982db410b6.js → 3800-7b45f9fbb6308557.js} +1 -1
  80. sky/dashboard/out/_next/static/chunks/4725.172ede95d1b21022.js +1 -0
  81. sky/dashboard/out/_next/static/chunks/4937.a2baa2df5572a276.js +15 -0
  82. sky/dashboard/out/_next/static/chunks/6212-7bd06f60ba693125.js +13 -0
  83. sky/dashboard/out/_next/static/chunks/6856-8f27d1c10c98def8.js +1 -0
  84. sky/dashboard/out/_next/static/chunks/6990-9146207c4567fdfd.js +1 -0
  85. sky/dashboard/out/_next/static/chunks/7359-c8d04e06886000b3.js +30 -0
  86. sky/dashboard/out/_next/static/chunks/7615-019513abc55b3b47.js +1 -0
  87. sky/dashboard/out/_next/static/chunks/8640.5b9475a2d18c5416.js +16 -0
  88. sky/dashboard/out/_next/static/chunks/8969-452f9d5cbdd2dc73.js +1 -0
  89. sky/dashboard/out/_next/static/chunks/9025.fa408f3242e9028d.js +6 -0
  90. sky/dashboard/out/_next/static/chunks/9353-cff34f7e773b2e2b.js +1 -0
  91. sky/dashboard/out/_next/static/chunks/9360.a536cf6b1fa42355.js +31 -0
  92. sky/dashboard/out/_next/static/chunks/9847.3aaca6bb33455140.js +30 -0
  93. sky/dashboard/out/_next/static/chunks/pages/{_app-ce361c6959bc2001.js → _app-bde01e4a2beec258.js} +1 -1
  94. sky/dashboard/out/_next/static/chunks/pages/clusters/[cluster]/[job]-792db96d918c98c9.js +16 -0
  95. sky/dashboard/out/_next/static/chunks/pages/clusters/[cluster]-abfcac9c137aa543.js +1 -0
  96. sky/dashboard/out/_next/static/chunks/pages/clusters-ee39056f9851a3ff.js +1 -0
  97. sky/dashboard/out/_next/static/chunks/pages/infra/{[context]-6563820e094f68ca.js → [context]-c0b5935149902e6f.js} +1 -1
  98. sky/dashboard/out/_next/static/chunks/pages/{infra-aabba60d57826e0f.js → infra-aed0ea19df7cf961.js} +1 -1
  99. sky/dashboard/out/_next/static/chunks/pages/jobs/[job]-d66997e2bfc837cf.js +16 -0
  100. sky/dashboard/out/_next/static/chunks/pages/jobs/pools/[pool]-9faf940b253e3e06.js +21 -0
  101. sky/dashboard/out/_next/static/chunks/pages/jobs-2072b48b617989c9.js +1 -0
  102. sky/dashboard/out/_next/static/chunks/pages/users-f42674164aa73423.js +1 -0
  103. sky/dashboard/out/_next/static/chunks/pages/volumes-b84b948ff357c43e.js +1 -0
  104. sky/dashboard/out/_next/static/chunks/pages/workspaces/{[name]-af76bb06dbb3954f.js → [name]-84a40f8c7c627fe4.js} +1 -1
  105. sky/dashboard/out/_next/static/chunks/pages/{workspaces-7598c33a746cdc91.js → workspaces-531b2f8c4bf89f82.js} +1 -1
  106. sky/dashboard/out/_next/static/chunks/webpack-64e05f17bf2cf8ce.js +1 -0
  107. sky/dashboard/out/_next/static/css/0748ce22df867032.css +3 -0
  108. sky/dashboard/out/clusters/[cluster]/[job].html +1 -1
  109. sky/dashboard/out/clusters/[cluster].html +1 -1
  110. sky/dashboard/out/clusters.html +1 -1
  111. sky/dashboard/out/config.html +1 -1
  112. sky/dashboard/out/index.html +1 -1
  113. sky/dashboard/out/infra/[context].html +1 -1
  114. sky/dashboard/out/infra.html +1 -1
  115. sky/dashboard/out/jobs/[job].html +1 -1
  116. sky/dashboard/out/jobs/pools/[pool].html +1 -1
  117. sky/dashboard/out/jobs.html +1 -1
  118. sky/dashboard/out/users.html +1 -1
  119. sky/dashboard/out/volumes.html +1 -1
  120. sky/dashboard/out/workspace/new.html +1 -1
  121. sky/dashboard/out/workspaces/[name].html +1 -1
  122. sky/dashboard/out/workspaces.html +1 -1
  123. sky/data/data_utils.py +92 -1
  124. sky/data/mounting_utils.py +162 -29
  125. sky/data/storage.py +200 -19
  126. sky/data/storage_utils.py +10 -45
  127. sky/exceptions.py +18 -7
  128. sky/execution.py +74 -31
  129. sky/global_user_state.py +605 -191
  130. sky/jobs/__init__.py +2 -0
  131. sky/jobs/client/sdk.py +101 -4
  132. sky/jobs/client/sdk_async.py +31 -5
  133. sky/jobs/constants.py +15 -8
  134. sky/jobs/controller.py +726 -284
  135. sky/jobs/file_content_utils.py +128 -0
  136. sky/jobs/log_gc.py +193 -0
  137. sky/jobs/recovery_strategy.py +250 -100
  138. sky/jobs/scheduler.py +271 -173
  139. sky/jobs/server/core.py +367 -114
  140. sky/jobs/server/server.py +81 -35
  141. sky/jobs/server/utils.py +89 -35
  142. sky/jobs/state.py +1498 -620
  143. sky/jobs/utils.py +771 -306
  144. sky/logs/agent.py +40 -5
  145. sky/logs/aws.py +9 -19
  146. sky/metrics/utils.py +282 -39
  147. sky/optimizer.py +1 -1
  148. sky/provision/__init__.py +37 -1
  149. sky/provision/aws/config.py +34 -13
  150. sky/provision/aws/instance.py +5 -2
  151. sky/provision/azure/instance.py +5 -3
  152. sky/provision/common.py +2 -0
  153. sky/provision/cudo/instance.py +4 -3
  154. sky/provision/do/instance.py +4 -3
  155. sky/provision/docker_utils.py +97 -26
  156. sky/provision/fluidstack/instance.py +6 -5
  157. sky/provision/gcp/config.py +6 -1
  158. sky/provision/gcp/instance.py +4 -2
  159. sky/provision/hyperbolic/instance.py +4 -2
  160. sky/provision/instance_setup.py +66 -20
  161. sky/provision/kubernetes/__init__.py +2 -0
  162. sky/provision/kubernetes/config.py +7 -44
  163. sky/provision/kubernetes/constants.py +0 -1
  164. sky/provision/kubernetes/instance.py +609 -213
  165. sky/provision/kubernetes/manifests/fusermount-server-daemonset.yaml +1 -2
  166. sky/provision/kubernetes/network.py +12 -8
  167. sky/provision/kubernetes/network_utils.py +8 -25
  168. sky/provision/kubernetes/utils.py +382 -418
  169. sky/provision/kubernetes/volume.py +150 -18
  170. sky/provision/lambda_cloud/instance.py +16 -13
  171. sky/provision/nebius/instance.py +6 -2
  172. sky/provision/nebius/utils.py +103 -86
  173. sky/provision/oci/instance.py +4 -2
  174. sky/provision/paperspace/instance.py +4 -3
  175. sky/provision/primeintellect/__init__.py +10 -0
  176. sky/provision/primeintellect/config.py +11 -0
  177. sky/provision/primeintellect/instance.py +454 -0
  178. sky/provision/primeintellect/utils.py +398 -0
  179. sky/provision/provisioner.py +30 -9
  180. sky/provision/runpod/__init__.py +2 -0
  181. sky/provision/runpod/instance.py +4 -3
  182. sky/provision/runpod/volume.py +69 -13
  183. sky/provision/scp/instance.py +307 -130
  184. sky/provision/seeweb/__init__.py +11 -0
  185. sky/provision/seeweb/config.py +13 -0
  186. sky/provision/seeweb/instance.py +812 -0
  187. sky/provision/shadeform/__init__.py +11 -0
  188. sky/provision/shadeform/config.py +12 -0
  189. sky/provision/shadeform/instance.py +351 -0
  190. sky/provision/shadeform/shadeform_utils.py +83 -0
  191. sky/provision/vast/instance.py +5 -3
  192. sky/provision/volume.py +164 -0
  193. sky/provision/vsphere/common/ssl_helper.py +1 -1
  194. sky/provision/vsphere/common/vapiconnect.py +2 -1
  195. sky/provision/vsphere/common/vim_utils.py +3 -2
  196. sky/provision/vsphere/instance.py +8 -6
  197. sky/provision/vsphere/vsphere_utils.py +8 -1
  198. sky/resources.py +11 -3
  199. sky/schemas/api/responses.py +107 -6
  200. sky/schemas/db/global_user_state/008_skylet_ssh_tunnel_metadata.py +34 -0
  201. sky/schemas/db/global_user_state/009_last_activity_and_launched_at.py +89 -0
  202. sky/schemas/db/global_user_state/010_save_ssh_key.py +66 -0
  203. sky/schemas/db/global_user_state/011_is_ephemeral.py +34 -0
  204. sky/schemas/db/kv_cache/001_initial_schema.py +29 -0
  205. sky/schemas/db/serve_state/002_yaml_content.py +34 -0
  206. sky/schemas/db/skypilot_config/001_initial_schema.py +30 -0
  207. sky/schemas/db/spot_jobs/002_cluster_pool.py +3 -3
  208. sky/schemas/db/spot_jobs/004_job_file_contents.py +42 -0
  209. sky/schemas/db/spot_jobs/005_logs_gc.py +38 -0
  210. sky/schemas/db/spot_jobs/006_controller_pid_started_at.py +34 -0
  211. sky/schemas/db/spot_jobs/007_config_file_content.py +34 -0
  212. sky/schemas/generated/jobsv1_pb2.py +86 -0
  213. sky/schemas/generated/jobsv1_pb2.pyi +254 -0
  214. sky/schemas/generated/jobsv1_pb2_grpc.py +542 -0
  215. sky/schemas/generated/managed_jobsv1_pb2.py +76 -0
  216. sky/schemas/generated/managed_jobsv1_pb2.pyi +278 -0
  217. sky/schemas/generated/managed_jobsv1_pb2_grpc.py +278 -0
  218. sky/schemas/generated/servev1_pb2.py +58 -0
  219. sky/schemas/generated/servev1_pb2.pyi +115 -0
  220. sky/schemas/generated/servev1_pb2_grpc.py +322 -0
  221. sky/serve/autoscalers.py +2 -0
  222. sky/serve/client/impl.py +55 -21
  223. sky/serve/constants.py +4 -3
  224. sky/serve/controller.py +17 -11
  225. sky/serve/load_balancing_policies.py +1 -1
  226. sky/serve/replica_managers.py +219 -142
  227. sky/serve/serve_rpc_utils.py +179 -0
  228. sky/serve/serve_state.py +63 -54
  229. sky/serve/serve_utils.py +145 -109
  230. sky/serve/server/core.py +46 -25
  231. sky/serve/server/impl.py +311 -162
  232. sky/serve/server/server.py +21 -19
  233. sky/serve/service.py +84 -68
  234. sky/serve/service_spec.py +45 -7
  235. sky/server/auth/loopback.py +38 -0
  236. sky/server/auth/oauth2_proxy.py +12 -7
  237. sky/server/common.py +47 -24
  238. sky/server/config.py +62 -28
  239. sky/server/constants.py +9 -1
  240. sky/server/daemons.py +109 -38
  241. sky/server/metrics.py +76 -96
  242. sky/server/middleware_utils.py +166 -0
  243. sky/server/requests/executor.py +381 -145
  244. sky/server/requests/payloads.py +71 -18
  245. sky/server/requests/preconditions.py +15 -13
  246. sky/server/requests/request_names.py +121 -0
  247. sky/server/requests/requests.py +507 -157
  248. sky/server/requests/serializers/decoders.py +48 -17
  249. sky/server/requests/serializers/encoders.py +85 -20
  250. sky/server/requests/threads.py +117 -0
  251. sky/server/rest.py +116 -24
  252. sky/server/server.py +420 -172
  253. sky/server/stream_utils.py +219 -45
  254. sky/server/uvicorn.py +30 -19
  255. sky/setup_files/MANIFEST.in +6 -1
  256. sky/setup_files/alembic.ini +8 -0
  257. sky/setup_files/dependencies.py +62 -19
  258. sky/setup_files/setup.py +44 -44
  259. sky/sky_logging.py +13 -5
  260. sky/skylet/attempt_skylet.py +106 -24
  261. sky/skylet/configs.py +3 -1
  262. sky/skylet/constants.py +111 -26
  263. sky/skylet/events.py +64 -10
  264. sky/skylet/job_lib.py +141 -104
  265. sky/skylet/log_lib.py +233 -5
  266. sky/skylet/log_lib.pyi +40 -2
  267. sky/skylet/providers/ibm/node_provider.py +12 -8
  268. sky/skylet/providers/ibm/vpc_provider.py +13 -12
  269. sky/skylet/runtime_utils.py +21 -0
  270. sky/skylet/services.py +524 -0
  271. sky/skylet/skylet.py +22 -1
  272. sky/skylet/subprocess_daemon.py +104 -29
  273. sky/skypilot_config.py +99 -79
  274. sky/ssh_node_pools/server.py +9 -8
  275. sky/task.py +221 -104
  276. sky/templates/aws-ray.yml.j2 +1 -0
  277. sky/templates/azure-ray.yml.j2 +1 -0
  278. sky/templates/cudo-ray.yml.j2 +1 -0
  279. sky/templates/do-ray.yml.j2 +1 -0
  280. sky/templates/fluidstack-ray.yml.j2 +1 -0
  281. sky/templates/gcp-ray.yml.j2 +1 -0
  282. sky/templates/hyperbolic-ray.yml.j2 +1 -0
  283. sky/templates/ibm-ray.yml.j2 +2 -1
  284. sky/templates/jobs-controller.yaml.j2 +3 -0
  285. sky/templates/kubernetes-ray.yml.j2 +196 -55
  286. sky/templates/lambda-ray.yml.j2 +1 -0
  287. sky/templates/nebius-ray.yml.j2 +3 -0
  288. sky/templates/oci-ray.yml.j2 +1 -0
  289. sky/templates/paperspace-ray.yml.j2 +1 -0
  290. sky/templates/primeintellect-ray.yml.j2 +72 -0
  291. sky/templates/runpod-ray.yml.j2 +1 -0
  292. sky/templates/scp-ray.yml.j2 +1 -0
  293. sky/templates/seeweb-ray.yml.j2 +171 -0
  294. sky/templates/shadeform-ray.yml.j2 +73 -0
  295. sky/templates/vast-ray.yml.j2 +1 -0
  296. sky/templates/vsphere-ray.yml.j2 +1 -0
  297. sky/templates/websocket_proxy.py +188 -43
  298. sky/usage/usage_lib.py +16 -4
  299. sky/users/permission.py +60 -43
  300. sky/utils/accelerator_registry.py +6 -3
  301. sky/utils/admin_policy_utils.py +18 -5
  302. sky/utils/annotations.py +22 -0
  303. sky/utils/asyncio_utils.py +78 -0
  304. sky/utils/atomic.py +1 -1
  305. sky/utils/auth_utils.py +153 -0
  306. sky/utils/cli_utils/status_utils.py +12 -7
  307. sky/utils/cluster_utils.py +28 -6
  308. sky/utils/command_runner.py +88 -27
  309. sky/utils/command_runner.pyi +36 -3
  310. sky/utils/common.py +3 -1
  311. sky/utils/common_utils.py +37 -4
  312. sky/utils/config_utils.py +1 -14
  313. sky/utils/context.py +127 -40
  314. sky/utils/context_utils.py +73 -18
  315. sky/utils/controller_utils.py +229 -70
  316. sky/utils/db/db_utils.py +95 -18
  317. sky/utils/db/kv_cache.py +149 -0
  318. sky/utils/db/migration_utils.py +24 -7
  319. sky/utils/env_options.py +4 -0
  320. sky/utils/git.py +559 -1
  321. sky/utils/kubernetes/create_cluster.sh +15 -30
  322. sky/utils/kubernetes/delete_cluster.sh +10 -7
  323. sky/utils/kubernetes/{deploy_remote_cluster.py → deploy_ssh_node_pools.py} +258 -380
  324. sky/utils/kubernetes/generate_kind_config.py +6 -66
  325. sky/utils/kubernetes/gpu_labeler.py +13 -3
  326. sky/utils/kubernetes/k8s_gpu_labeler_job.yaml +2 -1
  327. sky/utils/kubernetes/k8s_gpu_labeler_setup.yaml +16 -16
  328. sky/utils/kubernetes/kubernetes_deploy_utils.py +213 -194
  329. sky/utils/kubernetes/rsync_helper.sh +11 -3
  330. sky/utils/kubernetes_enums.py +7 -15
  331. sky/utils/lock_events.py +4 -4
  332. sky/utils/locks.py +128 -31
  333. sky/utils/log_utils.py +0 -319
  334. sky/utils/resource_checker.py +13 -10
  335. sky/utils/resources_utils.py +53 -29
  336. sky/utils/rich_utils.py +8 -4
  337. sky/utils/schemas.py +107 -52
  338. sky/utils/subprocess_utils.py +17 -4
  339. sky/utils/thread_utils.py +91 -0
  340. sky/utils/timeline.py +2 -1
  341. sky/utils/ux_utils.py +35 -1
  342. sky/utils/volume.py +88 -4
  343. sky/utils/yaml_utils.py +9 -0
  344. sky/volumes/client/sdk.py +48 -10
  345. sky/volumes/server/core.py +59 -22
  346. sky/volumes/server/server.py +46 -17
  347. sky/volumes/volume.py +54 -42
  348. sky/workspaces/core.py +57 -21
  349. sky/workspaces/server.py +13 -12
  350. sky_templates/README.md +3 -0
  351. sky_templates/__init__.py +3 -0
  352. sky_templates/ray/__init__.py +0 -0
  353. sky_templates/ray/start_cluster +183 -0
  354. sky_templates/ray/stop_cluster +75 -0
  355. {skypilot_nightly-1.0.0.dev20250905.dist-info → skypilot_nightly-1.0.0.dev20251203.dist-info}/METADATA +331 -65
  356. skypilot_nightly-1.0.0.dev20251203.dist-info/RECORD +611 -0
  357. skypilot_nightly-1.0.0.dev20251203.dist-info/top_level.txt +2 -0
  358. sky/client/cli/git.py +0 -549
  359. sky/dashboard/out/_next/static/chunks/1121-408ed10b2f9fce17.js +0 -1
  360. sky/dashboard/out/_next/static/chunks/1141-943efc7aff0f0c06.js +0 -1
  361. sky/dashboard/out/_next/static/chunks/1836-37fede578e2da5f8.js +0 -40
  362. sky/dashboard/out/_next/static/chunks/3015-86cabed5d4669ad0.js +0 -1
  363. sky/dashboard/out/_next/static/chunks/3294.c80326aec9bfed40.js +0 -6
  364. sky/dashboard/out/_next/static/chunks/3785.4872a2f3aa489880.js +0 -1
  365. sky/dashboard/out/_next/static/chunks/4045.b30465273dc5e468.js +0 -21
  366. sky/dashboard/out/_next/static/chunks/4676-9da7fdbde90b5549.js +0 -10
  367. sky/dashboard/out/_next/static/chunks/4725.10f7a9a5d3ea8208.js +0 -1
  368. sky/dashboard/out/_next/static/chunks/5339.3fda4a4010ff4e06.js +0 -51
  369. sky/dashboard/out/_next/static/chunks/6135-4b4d5e824b7f9d3c.js +0 -1
  370. sky/dashboard/out/_next/static/chunks/649.b9d7f7d10c1b8c53.js +0 -45
  371. sky/dashboard/out/_next/static/chunks/6856-dca7962af4814e1b.js +0 -1
  372. sky/dashboard/out/_next/static/chunks/6990-08b2a1cae076a943.js +0 -1
  373. sky/dashboard/out/_next/static/chunks/7325.b4bc99ce0892dcd5.js +0 -6
  374. sky/dashboard/out/_next/static/chunks/754-d0da8ab45f9509e9.js +0 -18
  375. sky/dashboard/out/_next/static/chunks/7669.1f5d9a402bf5cc42.js +0 -36
  376. sky/dashboard/out/_next/static/chunks/8969-0be3036bf86f8256.js +0 -1
  377. sky/dashboard/out/_next/static/chunks/9025.c12318fb6a1a9093.js +0 -6
  378. sky/dashboard/out/_next/static/chunks/9037-fa1737818d0a0969.js +0 -6
  379. sky/dashboard/out/_next/static/chunks/pages/clusters/[cluster]/[job]-1cbba24bd1bd35f8.js +0 -16
  380. sky/dashboard/out/_next/static/chunks/pages/clusters/[cluster]-0b4b35dc1dfe046c.js +0 -16
  381. sky/dashboard/out/_next/static/chunks/pages/clusters-469814d711d63b1b.js +0 -1
  382. sky/dashboard/out/_next/static/chunks/pages/jobs/[job]-dd64309c3fe67ed2.js +0 -11
  383. sky/dashboard/out/_next/static/chunks/pages/jobs/pools/[pool]-07349868f7905d37.js +0 -16
  384. sky/dashboard/out/_next/static/chunks/pages/jobs-1f70d9faa564804f.js +0 -1
  385. sky/dashboard/out/_next/static/chunks/pages/users-018bf31cda52e11b.js +0 -1
  386. sky/dashboard/out/_next/static/chunks/pages/volumes-739726d6b823f532.js +0 -1
  387. sky/dashboard/out/_next/static/chunks/webpack-4fe903277b57b523.js +0 -1
  388. sky/dashboard/out/_next/static/css/4614e06482d7309e.css +0 -3
  389. sky/dashboard/out/_next/static/mS-4qZPSkRuA1u-g2wQhg/_buildManifest.js +0 -1
  390. sky/templates/kubernetes-ssh-jump.yml.j2 +0 -94
  391. sky/utils/kubernetes/ssh_jump_lifecycle_manager.py +0 -191
  392. skypilot_nightly-1.0.0.dev20250905.dist-info/RECORD +0 -547
  393. skypilot_nightly-1.0.0.dev20250905.dist-info/top_level.txt +0 -1
  394. /sky/dashboard/out/_next/static/{mS-4qZPSkRuA1u-g2wQhg → 96_E2yl3QAiIJGOYCkSpB}/_ssgManifest.js +0 -0
  395. {skypilot_nightly-1.0.0.dev20250905.dist-info → skypilot_nightly-1.0.0.dev20251203.dist-info}/WHEEL +0 -0
  396. {skypilot_nightly-1.0.0.dev20250905.dist-info → skypilot_nightly-1.0.0.dev20251203.dist-info}/entry_points.txt +0 -0
  397. {skypilot_nightly-1.0.0.dev20250905.dist-info → skypilot_nightly-1.0.0.dev20251203.dist-info}/licenses/LICENSE +0 -0
sky/serve/server/impl.py CHANGED
@@ -5,6 +5,7 @@ import shlex
5
5
  import signal
6
6
  import tempfile
7
7
  import threading
8
+ import typing
8
9
  from typing import Any, Dict, List, Optional, Set, Tuple, Union
9
10
  import uuid
10
11
 
@@ -17,12 +18,15 @@ from sky import execution
17
18
  from sky import sky_logging
18
19
  from sky import skypilot_config
19
20
  from sky import task as task_lib
21
+ from sky.adaptors import common as adaptors_common
20
22
  from sky.backends import backend_utils
21
23
  from sky.catalog import common as service_catalog_common
22
24
  from sky.data import storage as storage_lib
23
25
  from sky.serve import constants as serve_constants
26
+ from sky.serve import serve_rpc_utils
24
27
  from sky.serve import serve_state
25
28
  from sky.serve import serve_utils
29
+ from sky.server.requests import request_names
26
30
  from sky.skylet import constants
27
31
  from sky.skylet import job_lib
28
32
  from sky.utils import admin_policy_utils
@@ -36,6 +40,11 @@ from sky.utils import subprocess_utils
36
40
  from sky.utils import ux_utils
37
41
  from sky.utils import yaml_utils
38
42
 
43
+ if typing.TYPE_CHECKING:
44
+ import grpc
45
+ else:
46
+ grpc = adaptors_common.LazyImport('grpc')
47
+
39
48
  logger = sky_logging.init_logger(__name__)
40
49
 
41
50
 
@@ -78,24 +87,35 @@ def _get_service_record(
78
87
  """Get the service record."""
79
88
  noun = 'pool' if pool else 'service'
80
89
 
81
- code = serve_utils.ServeCodeGen.get_service_status([service_name],
82
- pool=pool)
83
- returncode, serve_status_payload, stderr = backend.run_on_head(
84
- handle,
85
- code,
86
- require_outputs=True,
87
- stream_logs=False,
88
- separate_stderr=True)
89
- try:
90
- subprocess_utils.handle_returncode(returncode,
91
- code,
92
- f'Failed to get {noun} status',
93
- stderr,
94
- stream_logs=True)
95
- except exceptions.CommandError as e:
96
- raise RuntimeError(e.error_msg) from e
90
+ assert isinstance(handle, backends.CloudVmRayResourceHandle)
91
+ use_legacy = not handle.is_grpc_enabled_with_flag
97
92
 
98
- service_statuses = serve_utils.load_service_status(serve_status_payload)
93
+ if not use_legacy:
94
+ try:
95
+ service_statuses = serve_rpc_utils.RpcRunner.get_service_status(
96
+ handle, [service_name], pool)
97
+ except exceptions.SkyletMethodNotImplementedError:
98
+ use_legacy = True
99
+
100
+ if use_legacy:
101
+ code = serve_utils.ServeCodeGen.get_service_status([service_name],
102
+ pool=pool)
103
+ returncode, serve_status_payload, stderr = backend.run_on_head(
104
+ handle,
105
+ code,
106
+ require_outputs=True,
107
+ stream_logs=False,
108
+ separate_stderr=True)
109
+ try:
110
+ subprocess_utils.handle_returncode(returncode,
111
+ code,
112
+ f'Failed to get {noun} status',
113
+ stderr,
114
+ stream_logs=True)
115
+ except exceptions.CommandError as e:
116
+ raise RuntimeError(e.error_msg) from e
117
+
118
+ service_statuses = serve_utils.load_service_status(serve_status_payload)
99
119
 
100
120
  assert len(service_statuses) <= 1, service_statuses
101
121
  if not service_statuses:
@@ -103,6 +123,18 @@ def _get_service_record(
103
123
  return service_statuses[0]
104
124
 
105
125
 
126
+ def _maybe_display_run_warning(task: 'task_lib.Task') -> None:
127
+ # We do not block the user from creating a pool with a run section
128
+ # in order to enable using the same yaml for pool creation
129
+ # and job submission. But we want to make it clear that 'run' will not
130
+ # be respected here.
131
+ if task.run is not None:
132
+ logger.warning(
133
+ f'{colorama.Fore.YELLOW} Pool creation does not support the '
134
+ '`run` section. Creating the pool while ignoring the '
135
+ f'`run` section.{colorama.Style.RESET_ALL}')
136
+
137
+
106
138
  def up(
107
139
  task: 'task_lib.Task',
108
140
  service_name: Optional[str] = None,
@@ -133,16 +165,15 @@ def up(
133
165
  # Always apply the policy again here, even though it might have been applied
134
166
  # in the CLI. This is to ensure that we apply the policy to the final DAG
135
167
  # and get the mutated config.
136
- dag, mutated_user_config = admin_policy_utils.apply(dag)
168
+ dag, mutated_user_config = admin_policy_utils.apply(
169
+ dag, request_name=request_names.AdminPolicyRequestName.SERVE_UP)
137
170
  dag.resolve_and_validate_volumes()
138
171
  dag.pre_mount_volumes()
139
172
  task = dag.tasks[0]
140
173
  assert task.service is not None
141
174
  if pool:
142
- if task.run is not None:
143
- logger.warning(f'{colorama.Fore.YELLOW}The `run` section will be '
144
- f'ignored for pool.{colorama.Style.RESET_ALL}')
145
- # Use dummy run script for cluster pool.
175
+ _maybe_display_run_warning(task)
176
+ # Use dummy run script for pool.
146
177
  task.run = serve_constants.POOL_DUMMY_RUN_COMMAND
147
178
 
148
179
  with rich_utils.safe_status(
@@ -258,6 +289,8 @@ def up(
258
289
  task=controller_task,
259
290
  cluster_name=controller_name,
260
291
  retry_until_up=True,
292
+ _request_name=request_names.AdminPolicyRequestName.
293
+ SERVE_LAUNCH_CONTROLLER,
261
294
  _disable_controller_check=True,
262
295
  )
263
296
  else:
@@ -280,38 +313,51 @@ def up(
280
313
  ]
281
314
  run_script = '\n'.join(env_cmds + [run_script])
282
315
  # Dump script for high availability recovery.
283
- if controller_utils.high_availability_specified(controller_name):
284
- serve_state.set_ha_recovery_script(service_name, run_script)
316
+ serve_state.set_ha_recovery_script(service_name, run_script)
285
317
  backend.run_on_head(controller_handle, run_script)
286
318
 
287
319
  style = colorama.Style
288
320
  fore = colorama.Fore
289
321
 
290
322
  assert controller_job_id is not None and controller_handle is not None
323
+ assert isinstance(controller_handle, backends.CloudVmRayResourceHandle)
324
+ backend = backend_utils.get_backend_from_handle(controller_handle)
325
+ assert isinstance(backend, backends.CloudVmRayBackend)
291
326
  # TODO(tian): Cache endpoint locally to speedup. Endpoint won't
292
327
  # change after the first time, so there is no consistency issue.
293
- with rich_utils.safe_status(
294
- ux_utils.spinner_message(
295
- f'Waiting for the {noun} to register')):
296
- # This function will check the controller job id in the database
297
- # and return the endpoint if the job id matches. Otherwise it will
298
- # return None.
299
- code = serve_utils.ServeCodeGen.wait_service_registration(
300
- service_name, controller_job_id, pool)
301
- backend = backend_utils.get_backend_from_handle(controller_handle)
302
- assert isinstance(backend, backends.CloudVmRayBackend)
303
- assert isinstance(controller_handle,
304
- backends.CloudVmRayResourceHandle)
305
- returncode, lb_port_payload, _ = backend.run_on_head(
306
- controller_handle,
307
- code,
308
- require_outputs=True,
309
- stream_logs=False)
310
328
  try:
311
- subprocess_utils.handle_returncode(
312
- returncode, code, f'Failed to wait for {noun} initialization',
313
- lb_port_payload)
314
- except exceptions.CommandError:
329
+ with rich_utils.safe_status(
330
+ ux_utils.spinner_message(
331
+ f'Waiting for the {noun} to register')):
332
+ # This function will check the controller job id in the database
333
+ # and return the endpoint if the job id matches. Otherwise it
334
+ # will return None.
335
+ use_legacy = not controller_handle.is_grpc_enabled_with_flag
336
+
337
+ if controller_handle.is_grpc_enabled_with_flag:
338
+ try:
339
+ lb_port = serve_rpc_utils.RpcRunner.wait_service_registration( # pylint: disable=line-too-long
340
+ controller_handle, service_name, controller_job_id,
341
+ pool)
342
+ except exceptions.SkyletMethodNotImplementedError:
343
+ use_legacy = True
344
+
345
+ if use_legacy:
346
+ code = serve_utils.ServeCodeGen.wait_service_registration(
347
+ service_name, controller_job_id, pool)
348
+ returncode, lb_port_payload, _ = backend.run_on_head(
349
+ controller_handle,
350
+ code,
351
+ require_outputs=True,
352
+ stream_logs=False)
353
+ subprocess_utils.handle_returncode(
354
+ returncode, code,
355
+ f'Failed to wait for {noun} initialization',
356
+ lb_port_payload)
357
+ lb_port = serve_utils.load_service_initialization_result(
358
+ lb_port_payload)
359
+ except (exceptions.CommandError, grpc.FutureTimeoutError,
360
+ grpc.RpcError):
315
361
  if serve_utils.is_consolidation_mode(pool):
316
362
  with ux_utils.print_exception_no_traceback():
317
363
  raise RuntimeError(
@@ -345,8 +391,6 @@ def up(
345
391
  'Failed to spin up the service. Please '
346
392
  'check the logs above for more details.') from None
347
393
  else:
348
- lb_port = serve_utils.load_service_initialization_result(
349
- lb_port_payload)
350
394
  if not serve_utils.is_consolidation_mode(pool) and not pool:
351
395
  socket_endpoint = backend_utils.get_endpoints(
352
396
  controller_handle.cluster_name,
@@ -381,6 +425,9 @@ def up(
381
425
  f'\n{ux_utils.INDENT_LAST_SYMBOL}To terminate the pool:\t'
382
426
  f'{ux_utils.BOLD}sky jobs pool down {service_name}'
383
427
  f'{ux_utils.RESET_BOLD}'
428
+ f'\n{ux_utils.INDENT_SYMBOL}To update the number of workers:\t'
429
+ f'{ux_utils.BOLD}sky jobs pool apply --pool {service_name} '
430
+ f'--workers 5{ux_utils.RESET_BOLD}'
384
431
  '\n\n' + ux_utils.finishing_message('Successfully created pool '
385
432
  f'{service_name!r}.'))
386
433
  else:
@@ -418,37 +465,15 @@ def up(
418
465
 
419
466
 
420
467
  def update(
421
- task: 'task_lib.Task',
468
+ task: Optional['task_lib.Task'],
422
469
  service_name: str,
423
470
  mode: serve_utils.UpdateMode = serve_utils.DEFAULT_UPDATE_MODE,
424
471
  pool: bool = False,
472
+ workers: Optional[int] = None,
425
473
  ) -> None:
426
474
  """Updates an existing service or pool."""
427
475
  noun = 'pool' if pool else 'service'
428
476
  capnoun = noun.capitalize()
429
- task.validate()
430
- serve_utils.validate_service_task(task, pool=pool)
431
-
432
- # Always apply the policy again here, even though it might have been applied
433
- # in the CLI. This is to ensure that we apply the policy to the final DAG
434
- # and get the mutated config.
435
- # TODO(cblmemo,zhwu): If a user sets a new skypilot_config, the update
436
- # will not apply the config.
437
- dag, _ = admin_policy_utils.apply(task)
438
- task = dag.tasks[0]
439
- if pool:
440
- if task.run is not None:
441
- logger.warning(f'{colorama.Fore.YELLOW}The `run` section will be '
442
- f'ignored for pool.{colorama.Style.RESET_ALL}')
443
- # Use dummy run script for cluster pool.
444
- task.run = serve_constants.POOL_DUMMY_RUN_COMMAND
445
-
446
- assert task.service is not None
447
- if not pool and task.service.tls_credential is not None:
448
- logger.warning('Updating TLS keyfile and certfile is not supported. '
449
- 'Any updates to the keyfile and certfile will not take '
450
- 'effect. To update TLS keyfile and certfile, please '
451
- 'tear down the service and spin up a new one.')
452
477
 
453
478
  controller_type = controller_utils.get_controller_for_pool(pool)
454
479
  handle = backend_utils.is_controller_accessible(
@@ -462,6 +487,7 @@ def update(
462
487
  f'use {ux_utils.BOLD}sky serve up{ux_utils.RESET_BOLD}',
463
488
  )
464
489
 
490
+ assert isinstance(handle, backends.CloudVmRayResourceHandle)
465
491
  backend = backend_utils.get_backend_from_handle(handle)
466
492
  assert isinstance(backend, backends.CloudVmRayBackend)
467
493
 
@@ -474,6 +500,58 @@ def update(
474
500
  f'To spin up a {noun}, use {ux_utils.BOLD}'
475
501
  f'{cmd}{ux_utils.RESET_BOLD}')
476
502
 
503
+ # If task is None and workers is specified, load existing configuration
504
+ # and update replica count.
505
+ if task is None:
506
+ if workers is None:
507
+ with ux_utils.print_exception_no_traceback():
508
+ raise ValueError(
509
+ f'Cannot update {noun} without specifying '
510
+ f'task or workers. Please provide either a task '
511
+ f'or specify the number of workers.')
512
+
513
+ if not pool:
514
+ with ux_utils.print_exception_no_traceback():
515
+ raise ValueError(
516
+ 'Non-pool service, trying to update replicas to '
517
+ f'{workers} is not supported. Ignoring the update.')
518
+
519
+ # Load the existing task configuration from the service's YAML file
520
+ yaml_content = service_record['yaml_content']
521
+
522
+ # Load the existing task configuration
523
+ task = task_lib.Task.from_yaml_str(yaml_content)
524
+
525
+ if task.service is None:
526
+ with ux_utils.print_exception_no_traceback():
527
+ raise RuntimeError('No service configuration found in '
528
+ f'existing {noun} {service_name!r}')
529
+ task.set_service(task.service.copy(min_replicas=workers))
530
+
531
+ task.validate()
532
+ serve_utils.validate_service_task(task, pool=pool)
533
+
534
+ # Now apply the policy and handle task-specific logic
535
+ # Always apply the policy again here, even though it might have been applied
536
+ # in the CLI. This is to ensure that we apply the policy to the final DAG
537
+ # and get the mutated config.
538
+ # TODO(cblmemo,zhwu): If a user sets a new skypilot_config, the update
539
+ # will not apply the config.
540
+ dag, _ = admin_policy_utils.apply(
541
+ task, request_name=request_names.AdminPolicyRequestName.SERVE_UPDATE)
542
+ task = dag.tasks[0]
543
+ if pool:
544
+ _maybe_display_run_warning(task)
545
+ # Use dummy run script for pool.
546
+ task.run = serve_constants.POOL_DUMMY_RUN_COMMAND
547
+
548
+ assert task.service is not None
549
+ if not pool and task.service.tls_credential is not None:
550
+ logger.warning('Updating TLS keyfile and certfile is not supported. '
551
+ 'Any updates to the keyfile and certfile will not take '
552
+ 'effect. To update TLS keyfile and certfile, please '
553
+ 'tear down the service and spin up a new one.')
554
+
477
555
  prompt = None
478
556
  if (service_record['status'] == serve_state.ServiceStatus.CONTROLLER_FAILED
479
557
  ):
@@ -504,29 +582,39 @@ def update(
504
582
  controller_utils.maybe_translate_local_file_mounts_and_sync_up(
505
583
  task, task_type='serve')
506
584
 
507
- code = serve_utils.ServeCodeGen.add_version(service_name)
508
- returncode, version_string_payload, stderr = backend.run_on_head(
509
- handle,
510
- code,
511
- require_outputs=True,
512
- stream_logs=False,
513
- separate_stderr=True)
514
- try:
515
- subprocess_utils.handle_returncode(returncode,
516
- code,
517
- 'Failed to add version',
518
- stderr,
519
- stream_logs=True)
520
- except exceptions.CommandError as e:
521
- raise RuntimeError(e.error_msg) from e
585
+ use_legacy = not handle.is_grpc_enabled_with_flag
522
586
 
523
- version_string = serve_utils.load_version_string(version_string_payload)
524
- try:
525
- current_version = int(version_string)
526
- except ValueError as e:
527
- with ux_utils.print_exception_no_traceback():
528
- raise ValueError(f'Failed to parse version: {version_string}; '
529
- f'Returncode: {returncode}') from e
587
+ if not use_legacy:
588
+ try:
589
+ current_version = serve_rpc_utils.RpcRunner.add_version(
590
+ handle, service_name)
591
+ except exceptions.SkyletMethodNotImplementedError:
592
+ use_legacy = True
593
+
594
+ if use_legacy:
595
+ code = serve_utils.ServeCodeGen.add_version(service_name)
596
+ returncode, version_string_payload, stderr = backend.run_on_head(
597
+ handle,
598
+ code,
599
+ require_outputs=True,
600
+ stream_logs=False,
601
+ separate_stderr=True)
602
+ try:
603
+ subprocess_utils.handle_returncode(returncode,
604
+ code,
605
+ 'Failed to add version',
606
+ stderr,
607
+ stream_logs=True)
608
+ except exceptions.CommandError as e:
609
+ raise RuntimeError(e.error_msg) from e
610
+
611
+ version_string = serve_utils.load_version_string(version_string_payload)
612
+ try:
613
+ current_version = int(version_string)
614
+ except ValueError as e:
615
+ with ux_utils.print_exception_no_traceback():
616
+ raise ValueError(f'Failed to parse version: {version_string}; '
617
+ f'Returncode: {returncode}') from e
530
618
 
531
619
  with tempfile.NamedTemporaryFile(
532
620
  prefix=f'{service_name}-v{current_version}',
@@ -541,23 +629,33 @@ def update(
541
629
  {remote_task_yaml_path: service_file.name},
542
630
  storage_mounts=None)
543
631
 
544
- code = serve_utils.ServeCodeGen.update_service(service_name,
545
- current_version,
546
- mode=mode.value,
547
- pool=pool)
548
- returncode, _, stderr = backend.run_on_head(handle,
549
- code,
550
- require_outputs=True,
551
- stream_logs=False,
552
- separate_stderr=True)
553
- try:
554
- subprocess_utils.handle_returncode(returncode,
555
- code,
556
- f'Failed to update {noun}s',
557
- stderr,
558
- stream_logs=True)
559
- except exceptions.CommandError as e:
560
- raise RuntimeError(e.error_msg) from e
632
+ use_legacy = not handle.is_grpc_enabled_with_flag
633
+
634
+ if not use_legacy:
635
+ try:
636
+ serve_rpc_utils.RpcRunner.update_service(
637
+ handle, service_name, current_version, mode, pool)
638
+ except exceptions.SkyletMethodNotImplementedError:
639
+ use_legacy = True
640
+
641
+ if use_legacy:
642
+ code = serve_utils.ServeCodeGen.update_service(service_name,
643
+ current_version,
644
+ mode=mode.value,
645
+ pool=pool)
646
+ returncode, _, stderr = backend.run_on_head(handle,
647
+ code,
648
+ require_outputs=True,
649
+ stream_logs=False,
650
+ separate_stderr=True)
651
+ try:
652
+ subprocess_utils.handle_returncode(returncode,
653
+ code,
654
+ f'Failed to update {noun}s',
655
+ stderr,
656
+ stream_logs=True)
657
+ except exceptions.CommandError as e:
658
+ raise RuntimeError(e.error_msg) from e
561
659
 
562
660
  cmd = 'sky jobs pool status' if pool else 'sky serve status'
563
661
  logger.info(
@@ -566,14 +664,25 @@ def update(
566
664
  f'Please use {ux_utils.BOLD}{cmd} {service_name} '
567
665
  f'{ux_utils.RESET_BOLD}to check the latest status.')
568
666
 
667
+ if pool:
668
+ logs_cmd = f'`sky jobs pool logs {service_name} <worker_id>`'
669
+ unit_noun = 'Workers'
670
+
671
+ else:
672
+ logs_cmd = f'`sky serve logs {service_name} <replica_id>`'
673
+ unit_noun = 'Replicas'
569
674
  logger.info(
570
675
  ux_utils.finishing_message(
571
676
  f'Successfully updated {noun} {service_name!r} '
572
- f'to version {current_version}.'))
677
+ f'to version {current_version}.',
678
+ follow_up_message=
679
+ f'\n{unit_noun} are updating, use {ux_utils.BOLD}{logs_cmd}'
680
+ f'{ux_utils.RESET_BOLD} to check their status.'))
573
681
 
574
682
 
575
683
  def apply(
576
684
  task: 'task_lib.Task',
685
+ workers: Optional[int],
577
686
  service_name: str,
578
687
  mode: serve_utils.UpdateMode = serve_utils.DEFAULT_UPDATE_MODE,
579
688
  pool: bool = False,
@@ -589,7 +698,7 @@ def apply(
589
698
  service_record = _get_service_record(service_name, pool, handle,
590
699
  backend)
591
700
  if service_record is not None:
592
- return update(task, service_name, mode, pool)
701
+ return update(task, service_name, mode, pool, workers)
593
702
  except exceptions.ClusterNotUpError:
594
703
  pass
595
704
  up(task, service_name, pool)
@@ -620,29 +729,44 @@ def down(
620
729
  raise ValueError(f'Can only specify one of {noun}_names or all. '
621
730
  f'Provided {argument_str!r}.')
622
731
 
623
- backend = backend_utils.get_backend_from_handle(handle)
624
- assert isinstance(backend, backends.CloudVmRayBackend)
625
732
  service_names = None if all else service_names
626
- code = serve_utils.ServeCodeGen.terminate_services(service_names, purge,
627
- pool)
628
733
 
629
734
  try:
630
- returncode, stdout, _ = backend.run_on_head(handle,
631
- code,
632
- require_outputs=True,
633
- stream_logs=False)
735
+ assert isinstance(handle, backends.CloudVmRayResourceHandle)
736
+ use_legacy = not handle.is_grpc_enabled_with_flag
737
+
738
+ if not use_legacy:
739
+ try:
740
+ stdout = serve_rpc_utils.RpcRunner.terminate_services(
741
+ handle, service_names, purge, pool)
742
+ except exceptions.SkyletMethodNotImplementedError:
743
+ use_legacy = True
744
+
745
+ if use_legacy:
746
+ backend = backend_utils.get_backend_from_handle(handle)
747
+ assert isinstance(backend, backends.CloudVmRayBackend)
748
+ code = serve_utils.ServeCodeGen.terminate_services(
749
+ service_names, purge, pool)
750
+
751
+ returncode, stdout, _ = backend.run_on_head(handle,
752
+ code,
753
+ require_outputs=True,
754
+ stream_logs=False)
755
+
756
+ subprocess_utils.handle_returncode(returncode, code,
757
+ f'Failed to terminate {noun}',
758
+ stdout)
634
759
  except exceptions.FetchClusterInfoError as e:
635
760
  raise RuntimeError(
636
761
  'Failed to fetch controller IP. Please refresh controller status '
637
- f'by `sky status -r {controller_type.value.cluster_name}` '
638
- 'and try again.') from e
639
-
640
- try:
641
- subprocess_utils.handle_returncode(returncode, code,
642
- f'Failed to terminate {noun}',
643
- stdout)
762
+ f'by `sky status -r {controller_type.value.cluster_name}` and try '
763
+ 'again.') from e
644
764
  except exceptions.CommandError as e:
645
765
  raise RuntimeError(e.error_msg) from e
766
+ except grpc.RpcError as e:
767
+ raise RuntimeError(f'{e.details()} ({e.code()})') from e
768
+ except grpc.FutureTimeoutError as e:
769
+ raise RuntimeError('gRPC timed out') from e
646
770
 
647
771
  logger.info(stdout)
648
772
 
@@ -670,27 +794,40 @@ def status(
670
794
  stopped_message=controller_type.value.default_hint_if_non_existent.
671
795
  replace('service', noun))
672
796
 
673
- backend = backend_utils.get_backend_from_handle(handle)
674
- assert isinstance(backend, backends.CloudVmRayBackend)
797
+ assert isinstance(handle, backends.CloudVmRayResourceHandle)
798
+ use_legacy = not handle.is_grpc_enabled_with_flag
675
799
 
676
- code = serve_utils.ServeCodeGen.get_service_status(service_names, pool=pool)
677
- returncode, serve_status_payload, stderr = backend.run_on_head(
678
- handle,
679
- code,
680
- require_outputs=True,
681
- stream_logs=False,
682
- separate_stderr=True)
800
+ if not use_legacy:
801
+ try:
802
+ service_records = serve_rpc_utils.RpcRunner.get_service_status(
803
+ handle, service_names, pool)
804
+ except exceptions.SkyletMethodNotImplementedError:
805
+ use_legacy = True
806
+
807
+ if use_legacy:
808
+ backend = backend_utils.get_backend_from_handle(handle)
809
+ assert isinstance(backend, backends.CloudVmRayBackend)
810
+
811
+ code = serve_utils.ServeCodeGen.get_service_status(service_names,
812
+ pool=pool)
813
+ returncode, serve_status_payload, stderr = backend.run_on_head(
814
+ handle,
815
+ code,
816
+ require_outputs=True,
817
+ stream_logs=False,
818
+ separate_stderr=True)
683
819
 
684
- try:
685
- subprocess_utils.handle_returncode(returncode,
686
- code,
687
- f'Failed to fetch {noun}s',
688
- stderr,
689
- stream_logs=True)
690
- except exceptions.CommandError as e:
691
- raise RuntimeError(e.error_msg) from e
820
+ try:
821
+ subprocess_utils.handle_returncode(returncode,
822
+ code,
823
+ f'Failed to fetch {noun}s',
824
+ stderr,
825
+ stream_logs=True)
826
+ except exceptions.CommandError as e:
827
+ raise RuntimeError(e.error_msg) from e
828
+
829
+ service_records = serve_utils.load_service_status(serve_status_payload)
692
830
 
693
- service_records = serve_utils.load_service_status(serve_status_payload)
694
831
  # Get the endpoint for each service
695
832
  for service_record in service_records:
696
833
  service_record['endpoint'] = None
@@ -793,25 +930,37 @@ def _get_all_replica_targets(
793
930
  handle: backends.CloudVmRayResourceHandle,
794
931
  pool: bool) -> Set[serve_utils.ServiceComponentTarget]:
795
932
  """Helper function to get targets for all live replicas."""
796
- code = serve_utils.ServeCodeGen.get_service_status([service_name],
797
- pool=pool)
798
- returncode, serve_status_payload, stderr = backend.run_on_head(
799
- handle,
800
- code,
801
- require_outputs=True,
802
- stream_logs=False,
803
- separate_stderr=True)
933
+ assert isinstance(handle, backends.CloudVmRayResourceHandle)
934
+ use_legacy = not handle.is_grpc_enabled_with_flag
804
935
 
805
- try:
806
- subprocess_utils.handle_returncode(returncode,
807
- code,
808
- 'Failed to fetch services',
809
- stderr,
810
- stream_logs=True)
811
- except exceptions.CommandError as e:
812
- raise RuntimeError(e.error_msg) from e
936
+ if not use_legacy:
937
+ try:
938
+ service_records = serve_rpc_utils.RpcRunner.get_service_status(
939
+ handle, [service_name], pool)
940
+ except exceptions.SkyletMethodNotImplementedError:
941
+ use_legacy = True
942
+
943
+ if use_legacy:
944
+ code = serve_utils.ServeCodeGen.get_service_status([service_name],
945
+ pool=pool)
946
+ returncode, serve_status_payload, stderr = backend.run_on_head(
947
+ handle,
948
+ code,
949
+ require_outputs=True,
950
+ stream_logs=False,
951
+ separate_stderr=True)
952
+
953
+ try:
954
+ subprocess_utils.handle_returncode(returncode,
955
+ code,
956
+ 'Failed to fetch services',
957
+ stderr,
958
+ stream_logs=True)
959
+ except exceptions.CommandError as e:
960
+ raise RuntimeError(e.error_msg) from e
961
+
962
+ service_records = serve_utils.load_service_status(serve_status_payload)
813
963
 
814
- service_records = serve_utils.load_service_status(serve_status_payload)
815
964
  if not service_records:
816
965
  raise ValueError(f'Service {service_name!r} not found.')
817
966
  assert len(service_records) == 1