skypilot-nightly 1.0.0.dev20250502__py3-none-any.whl → 1.0.0.dev20251203__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (546)
  1. sky/__init__.py +22 -6
  2. sky/adaptors/aws.py +81 -16
  3. sky/adaptors/common.py +25 -2
  4. sky/adaptors/coreweave.py +278 -0
  5. sky/adaptors/do.py +8 -2
  6. sky/adaptors/gcp.py +11 -0
  7. sky/adaptors/hyperbolic.py +8 -0
  8. sky/adaptors/ibm.py +5 -2
  9. sky/adaptors/kubernetes.py +149 -18
  10. sky/adaptors/nebius.py +173 -30
  11. sky/adaptors/primeintellect.py +1 -0
  12. sky/adaptors/runpod.py +68 -0
  13. sky/adaptors/seeweb.py +183 -0
  14. sky/adaptors/shadeform.py +89 -0
  15. sky/admin_policy.py +187 -4
  16. sky/authentication.py +179 -225
  17. sky/backends/__init__.py +4 -2
  18. sky/backends/backend.py +22 -9
  19. sky/backends/backend_utils.py +1323 -397
  20. sky/backends/cloud_vm_ray_backend.py +1749 -1029
  21. sky/backends/docker_utils.py +1 -1
  22. sky/backends/local_docker_backend.py +11 -6
  23. sky/backends/task_codegen.py +633 -0
  24. sky/backends/wheel_utils.py +55 -9
  25. sky/{clouds/service_catalog → catalog}/__init__.py +21 -19
  26. sky/{clouds/service_catalog → catalog}/aws_catalog.py +27 -8
  27. sky/{clouds/service_catalog → catalog}/azure_catalog.py +10 -7
  28. sky/{clouds/service_catalog → catalog}/common.py +90 -49
  29. sky/{clouds/service_catalog → catalog}/cudo_catalog.py +8 -5
  30. sky/{clouds/service_catalog → catalog}/data_fetchers/analyze.py +1 -1
  31. sky/{clouds/service_catalog → catalog}/data_fetchers/fetch_aws.py +116 -80
  32. sky/{clouds/service_catalog → catalog}/data_fetchers/fetch_cudo.py +38 -38
  33. sky/{clouds/service_catalog → catalog}/data_fetchers/fetch_gcp.py +70 -16
  34. sky/catalog/data_fetchers/fetch_hyperbolic.py +136 -0
  35. sky/{clouds/service_catalog → catalog}/data_fetchers/fetch_lambda_cloud.py +1 -0
  36. sky/catalog/data_fetchers/fetch_nebius.py +338 -0
  37. sky/catalog/data_fetchers/fetch_runpod.py +698 -0
  38. sky/catalog/data_fetchers/fetch_seeweb.py +329 -0
  39. sky/catalog/data_fetchers/fetch_shadeform.py +142 -0
  40. sky/{clouds/service_catalog → catalog}/data_fetchers/fetch_vast.py +1 -1
  41. sky/{clouds/service_catalog → catalog}/data_fetchers/fetch_vsphere.py +1 -1
  42. sky/{clouds/service_catalog → catalog}/do_catalog.py +5 -2
  43. sky/{clouds/service_catalog → catalog}/fluidstack_catalog.py +6 -3
  44. sky/{clouds/service_catalog → catalog}/gcp_catalog.py +41 -15
  45. sky/catalog/hyperbolic_catalog.py +136 -0
  46. sky/{clouds/service_catalog → catalog}/ibm_catalog.py +9 -6
  47. sky/{clouds/service_catalog → catalog}/kubernetes_catalog.py +36 -24
  48. sky/{clouds/service_catalog → catalog}/lambda_catalog.py +9 -6
  49. sky/{clouds/service_catalog → catalog}/nebius_catalog.py +9 -7
  50. sky/{clouds/service_catalog → catalog}/oci_catalog.py +9 -6
  51. sky/{clouds/service_catalog → catalog}/paperspace_catalog.py +5 -2
  52. sky/catalog/primeintellect_catalog.py +95 -0
  53. sky/{clouds/service_catalog → catalog}/runpod_catalog.py +11 -4
  54. sky/{clouds/service_catalog → catalog}/scp_catalog.py +9 -6
  55. sky/catalog/seeweb_catalog.py +184 -0
  56. sky/catalog/shadeform_catalog.py +165 -0
  57. sky/catalog/ssh_catalog.py +167 -0
  58. sky/{clouds/service_catalog → catalog}/vast_catalog.py +6 -3
  59. sky/{clouds/service_catalog → catalog}/vsphere_catalog.py +5 -2
  60. sky/check.py +533 -185
  61. sky/cli.py +5 -5975
  62. sky/client/{cli.py → cli/command.py} +2591 -1956
  63. sky/client/cli/deprecation_utils.py +99 -0
  64. sky/client/cli/flags.py +359 -0
  65. sky/client/cli/table_utils.py +322 -0
  66. sky/client/cli/utils.py +79 -0
  67. sky/client/common.py +78 -32
  68. sky/client/oauth.py +82 -0
  69. sky/client/sdk.py +1219 -319
  70. sky/client/sdk_async.py +827 -0
  71. sky/client/service_account_auth.py +47 -0
  72. sky/cloud_stores.py +82 -3
  73. sky/clouds/__init__.py +13 -0
  74. sky/clouds/aws.py +564 -164
  75. sky/clouds/azure.py +105 -83
  76. sky/clouds/cloud.py +140 -40
  77. sky/clouds/cudo.py +68 -50
  78. sky/clouds/do.py +66 -48
  79. sky/clouds/fluidstack.py +63 -44
  80. sky/clouds/gcp.py +339 -110
  81. sky/clouds/hyperbolic.py +293 -0
  82. sky/clouds/ibm.py +70 -49
  83. sky/clouds/kubernetes.py +570 -162
  84. sky/clouds/lambda_cloud.py +74 -54
  85. sky/clouds/nebius.py +210 -81
  86. sky/clouds/oci.py +88 -66
  87. sky/clouds/paperspace.py +61 -44
  88. sky/clouds/primeintellect.py +317 -0
  89. sky/clouds/runpod.py +164 -74
  90. sky/clouds/scp.py +89 -86
  91. sky/clouds/seeweb.py +477 -0
  92. sky/clouds/shadeform.py +400 -0
  93. sky/clouds/ssh.py +263 -0
  94. sky/clouds/utils/aws_utils.py +10 -4
  95. sky/clouds/utils/gcp_utils.py +87 -11
  96. sky/clouds/utils/oci_utils.py +38 -14
  97. sky/clouds/utils/scp_utils.py +231 -167
  98. sky/clouds/vast.py +99 -77
  99. sky/clouds/vsphere.py +51 -40
  100. sky/core.py +375 -173
  101. sky/dag.py +15 -0
  102. sky/dashboard/out/404.html +1 -1
  103. sky/dashboard/out/_next/static/96_E2yl3QAiIJGOYCkSpB/_buildManifest.js +1 -0
  104. sky/dashboard/out/_next/static/chunks/1141-e6aa9ab418717c59.js +11 -0
  105. sky/dashboard/out/_next/static/chunks/1272-1ef0bf0237faccdb.js +1 -0
  106. sky/dashboard/out/_next/static/chunks/1871-7e202677c42f43fe.js +6 -0
  107. sky/dashboard/out/_next/static/chunks/2260-7703229c33c5ebd5.js +1 -0
  108. sky/dashboard/out/_next/static/chunks/2350.fab69e61bac57b23.js +1 -0
  109. sky/dashboard/out/_next/static/chunks/2369.fc20f0c2c8ed9fe7.js +15 -0
  110. sky/dashboard/out/_next/static/chunks/2755.edd818326d489a1d.js +26 -0
  111. sky/dashboard/out/_next/static/chunks/3294.20a8540fe697d5ee.js +1 -0
  112. sky/dashboard/out/_next/static/chunks/3785.7e245f318f9d1121.js +1 -0
  113. sky/dashboard/out/_next/static/chunks/3800-7b45f9fbb6308557.js +1 -0
  114. sky/dashboard/out/_next/static/chunks/3850-ff4a9a69d978632b.js +1 -0
  115. sky/dashboard/out/_next/static/chunks/3937.210053269f121201.js +1 -0
  116. sky/dashboard/out/_next/static/chunks/4725.172ede95d1b21022.js +1 -0
  117. sky/dashboard/out/_next/static/chunks/4937.a2baa2df5572a276.js +15 -0
  118. sky/dashboard/out/_next/static/chunks/5739-d67458fcb1386c92.js +8 -0
  119. sky/dashboard/out/_next/static/chunks/6130-2be46d70a38f1e82.js +1 -0
  120. sky/dashboard/out/_next/static/chunks/616-3d59f75e2ccf9321.js +39 -0
  121. sky/dashboard/out/_next/static/chunks/6212-7bd06f60ba693125.js +13 -0
  122. sky/dashboard/out/_next/static/chunks/6856-8f27d1c10c98def8.js +1 -0
  123. sky/dashboard/out/_next/static/chunks/6989-01359c57e018caa4.js +1 -0
  124. sky/dashboard/out/_next/static/chunks/6990-9146207c4567fdfd.js +1 -0
  125. sky/dashboard/out/_next/static/chunks/7359-c8d04e06886000b3.js +30 -0
  126. sky/dashboard/out/_next/static/chunks/7411-b15471acd2cba716.js +41 -0
  127. sky/dashboard/out/_next/static/chunks/7615-019513abc55b3b47.js +1 -0
  128. sky/dashboard/out/_next/static/chunks/8640.5b9475a2d18c5416.js +16 -0
  129. sky/dashboard/out/_next/static/chunks/8969-452f9d5cbdd2dc73.js +1 -0
  130. sky/dashboard/out/_next/static/chunks/9025.fa408f3242e9028d.js +6 -0
  131. sky/dashboard/out/_next/static/chunks/9353-cff34f7e773b2e2b.js +1 -0
  132. sky/dashboard/out/_next/static/chunks/9360.a536cf6b1fa42355.js +31 -0
  133. sky/dashboard/out/_next/static/chunks/9847.3aaca6bb33455140.js +30 -0
  134. sky/dashboard/out/_next/static/chunks/fd9d1056-86323a29a8f7e46a.js +1 -0
  135. sky/dashboard/out/_next/static/chunks/framework-cf60a09ccd051a10.js +33 -0
  136. sky/dashboard/out/_next/static/chunks/main-app-587214043926b3cc.js +1 -0
  137. sky/dashboard/out/_next/static/chunks/main-f15ccb73239a3bf1.js +1 -0
  138. sky/dashboard/out/_next/static/chunks/pages/_app-bde01e4a2beec258.js +34 -0
  139. sky/dashboard/out/_next/static/chunks/pages/_error-c66a4e8afc46f17b.js +1 -0
  140. sky/dashboard/out/_next/static/chunks/pages/clusters/[cluster]/[job]-792db96d918c98c9.js +16 -0
  141. sky/dashboard/out/_next/static/chunks/pages/clusters/[cluster]-abfcac9c137aa543.js +1 -0
  142. sky/dashboard/out/_next/static/chunks/pages/clusters-ee39056f9851a3ff.js +1 -0
  143. sky/dashboard/out/_next/static/chunks/pages/config-dfb9bf07b13045f4.js +1 -0
  144. sky/dashboard/out/_next/static/chunks/pages/index-444f1804401f04ea.js +1 -0
  145. sky/dashboard/out/_next/static/chunks/pages/infra/[context]-c0b5935149902e6f.js +1 -0
  146. sky/dashboard/out/_next/static/chunks/pages/infra-aed0ea19df7cf961.js +1 -0
  147. sky/dashboard/out/_next/static/chunks/pages/jobs/[job]-d66997e2bfc837cf.js +16 -0
  148. sky/dashboard/out/_next/static/chunks/pages/jobs/pools/[pool]-9faf940b253e3e06.js +21 -0
  149. sky/dashboard/out/_next/static/chunks/pages/jobs-2072b48b617989c9.js +1 -0
  150. sky/dashboard/out/_next/static/chunks/pages/users-f42674164aa73423.js +1 -0
  151. sky/dashboard/out/_next/static/chunks/pages/volumes-b84b948ff357c43e.js +1 -0
  152. sky/dashboard/out/_next/static/chunks/pages/workspace/new-3f88a1c7e86a3f86.js +1 -0
  153. sky/dashboard/out/_next/static/chunks/pages/workspaces/[name]-84a40f8c7c627fe4.js +1 -0
  154. sky/dashboard/out/_next/static/chunks/pages/workspaces-531b2f8c4bf89f82.js +1 -0
  155. sky/dashboard/out/_next/static/chunks/webpack-64e05f17bf2cf8ce.js +1 -0
  156. sky/dashboard/out/_next/static/css/0748ce22df867032.css +3 -0
  157. sky/dashboard/out/clusters/[cluster]/[job].html +1 -1
  158. sky/dashboard/out/clusters/[cluster].html +1 -1
  159. sky/dashboard/out/clusters.html +1 -1
  160. sky/dashboard/out/config.html +1 -0
  161. sky/dashboard/out/index.html +1 -1
  162. sky/dashboard/out/infra/[context].html +1 -0
  163. sky/dashboard/out/infra.html +1 -0
  164. sky/dashboard/out/jobs/[job].html +1 -1
  165. sky/dashboard/out/jobs/pools/[pool].html +1 -0
  166. sky/dashboard/out/jobs.html +1 -1
  167. sky/dashboard/out/users.html +1 -0
  168. sky/dashboard/out/volumes.html +1 -0
  169. sky/dashboard/out/workspace/new.html +1 -0
  170. sky/dashboard/out/workspaces/[name].html +1 -0
  171. sky/dashboard/out/workspaces.html +1 -0
  172. sky/data/data_utils.py +137 -1
  173. sky/data/mounting_utils.py +269 -84
  174. sky/data/storage.py +1460 -1807
  175. sky/data/storage_utils.py +43 -57
  176. sky/exceptions.py +126 -2
  177. sky/execution.py +216 -63
  178. sky/global_user_state.py +2390 -586
  179. sky/jobs/__init__.py +7 -0
  180. sky/jobs/client/sdk.py +300 -58
  181. sky/jobs/client/sdk_async.py +161 -0
  182. sky/jobs/constants.py +15 -8
  183. sky/jobs/controller.py +848 -275
  184. sky/jobs/file_content_utils.py +128 -0
  185. sky/jobs/log_gc.py +193 -0
  186. sky/jobs/recovery_strategy.py +402 -152
  187. sky/jobs/scheduler.py +314 -189
  188. sky/jobs/server/core.py +836 -255
  189. sky/jobs/server/server.py +156 -115
  190. sky/jobs/server/utils.py +136 -0
  191. sky/jobs/state.py +2109 -706
  192. sky/jobs/utils.py +1306 -215
  193. sky/logs/__init__.py +21 -0
  194. sky/logs/agent.py +108 -0
  195. sky/logs/aws.py +243 -0
  196. sky/logs/gcp.py +91 -0
  197. sky/metrics/__init__.py +0 -0
  198. sky/metrics/utils.py +453 -0
  199. sky/models.py +78 -1
  200. sky/optimizer.py +164 -70
  201. sky/provision/__init__.py +90 -4
  202. sky/provision/aws/config.py +147 -26
  203. sky/provision/aws/instance.py +136 -50
  204. sky/provision/azure/instance.py +11 -6
  205. sky/provision/common.py +13 -1
  206. sky/provision/cudo/cudo_machine_type.py +1 -1
  207. sky/provision/cudo/cudo_utils.py +14 -8
  208. sky/provision/cudo/cudo_wrapper.py +72 -71
  209. sky/provision/cudo/instance.py +10 -6
  210. sky/provision/do/instance.py +10 -6
  211. sky/provision/do/utils.py +4 -3
  212. sky/provision/docker_utils.py +140 -33
  213. sky/provision/fluidstack/instance.py +13 -8
  214. sky/provision/gcp/__init__.py +1 -0
  215. sky/provision/gcp/config.py +301 -19
  216. sky/provision/gcp/constants.py +218 -0
  217. sky/provision/gcp/instance.py +36 -8
  218. sky/provision/gcp/instance_utils.py +18 -4
  219. sky/provision/gcp/volume_utils.py +247 -0
  220. sky/provision/hyperbolic/__init__.py +12 -0
  221. sky/provision/hyperbolic/config.py +10 -0
  222. sky/provision/hyperbolic/instance.py +437 -0
  223. sky/provision/hyperbolic/utils.py +373 -0
  224. sky/provision/instance_setup.py +101 -20
  225. sky/provision/kubernetes/__init__.py +5 -0
  226. sky/provision/kubernetes/config.py +9 -52
  227. sky/provision/kubernetes/constants.py +17 -0
  228. sky/provision/kubernetes/instance.py +919 -280
  229. sky/provision/kubernetes/manifests/fusermount-server-daemonset.yaml +1 -2
  230. sky/provision/kubernetes/network.py +27 -17
  231. sky/provision/kubernetes/network_utils.py +44 -43
  232. sky/provision/kubernetes/utils.py +1221 -534
  233. sky/provision/kubernetes/volume.py +343 -0
  234. sky/provision/lambda_cloud/instance.py +22 -16
  235. sky/provision/nebius/constants.py +50 -0
  236. sky/provision/nebius/instance.py +19 -6
  237. sky/provision/nebius/utils.py +237 -137
  238. sky/provision/oci/instance.py +10 -5
  239. sky/provision/paperspace/instance.py +10 -7
  240. sky/provision/paperspace/utils.py +1 -1
  241. sky/provision/primeintellect/__init__.py +10 -0
  242. sky/provision/primeintellect/config.py +11 -0
  243. sky/provision/primeintellect/instance.py +454 -0
  244. sky/provision/primeintellect/utils.py +398 -0
  245. sky/provision/provisioner.py +117 -36
  246. sky/provision/runpod/__init__.py +5 -0
  247. sky/provision/runpod/instance.py +27 -6
  248. sky/provision/runpod/utils.py +51 -18
  249. sky/provision/runpod/volume.py +214 -0
  250. sky/provision/scp/__init__.py +15 -0
  251. sky/provision/scp/config.py +93 -0
  252. sky/provision/scp/instance.py +707 -0
  253. sky/provision/seeweb/__init__.py +11 -0
  254. sky/provision/seeweb/config.py +13 -0
  255. sky/provision/seeweb/instance.py +812 -0
  256. sky/provision/shadeform/__init__.py +11 -0
  257. sky/provision/shadeform/config.py +12 -0
  258. sky/provision/shadeform/instance.py +351 -0
  259. sky/provision/shadeform/shadeform_utils.py +83 -0
  260. sky/provision/ssh/__init__.py +18 -0
  261. sky/provision/vast/instance.py +13 -8
  262. sky/provision/vast/utils.py +10 -7
  263. sky/provision/volume.py +164 -0
  264. sky/provision/vsphere/common/ssl_helper.py +1 -1
  265. sky/provision/vsphere/common/vapiconnect.py +2 -1
  266. sky/provision/vsphere/common/vim_utils.py +4 -4
  267. sky/provision/vsphere/instance.py +15 -10
  268. sky/provision/vsphere/vsphere_utils.py +17 -20
  269. sky/py.typed +0 -0
  270. sky/resources.py +845 -119
  271. sky/schemas/__init__.py +0 -0
  272. sky/schemas/api/__init__.py +0 -0
  273. sky/schemas/api/responses.py +227 -0
  274. sky/schemas/db/README +4 -0
  275. sky/schemas/db/env.py +90 -0
  276. sky/schemas/db/global_user_state/001_initial_schema.py +124 -0
  277. sky/schemas/db/global_user_state/002_add_workspace_to_cluster_history.py +35 -0
  278. sky/schemas/db/global_user_state/003_fix_initial_revision.py +61 -0
  279. sky/schemas/db/global_user_state/004_is_managed.py +34 -0
  280. sky/schemas/db/global_user_state/005_cluster_event.py +32 -0
  281. sky/schemas/db/global_user_state/006_provision_log.py +41 -0
  282. sky/schemas/db/global_user_state/007_cluster_event_request_id.py +34 -0
  283. sky/schemas/db/global_user_state/008_skylet_ssh_tunnel_metadata.py +34 -0
  284. sky/schemas/db/global_user_state/009_last_activity_and_launched_at.py +89 -0
  285. sky/schemas/db/global_user_state/010_save_ssh_key.py +66 -0
  286. sky/schemas/db/global_user_state/011_is_ephemeral.py +34 -0
  287. sky/schemas/db/kv_cache/001_initial_schema.py +29 -0
  288. sky/schemas/db/script.py.mako +28 -0
  289. sky/schemas/db/serve_state/001_initial_schema.py +67 -0
  290. sky/schemas/db/serve_state/002_yaml_content.py +34 -0
  291. sky/schemas/db/skypilot_config/001_initial_schema.py +30 -0
  292. sky/schemas/db/spot_jobs/001_initial_schema.py +97 -0
  293. sky/schemas/db/spot_jobs/002_cluster_pool.py +42 -0
  294. sky/schemas/db/spot_jobs/003_pool_hash.py +34 -0
  295. sky/schemas/db/spot_jobs/004_job_file_contents.py +42 -0
  296. sky/schemas/db/spot_jobs/005_logs_gc.py +38 -0
  297. sky/schemas/db/spot_jobs/006_controller_pid_started_at.py +34 -0
  298. sky/schemas/db/spot_jobs/007_config_file_content.py +34 -0
  299. sky/schemas/generated/__init__.py +0 -0
  300. sky/schemas/generated/autostopv1_pb2.py +36 -0
  301. sky/schemas/generated/autostopv1_pb2.pyi +43 -0
  302. sky/schemas/generated/autostopv1_pb2_grpc.py +146 -0
  303. sky/schemas/generated/jobsv1_pb2.py +86 -0
  304. sky/schemas/generated/jobsv1_pb2.pyi +254 -0
  305. sky/schemas/generated/jobsv1_pb2_grpc.py +542 -0
  306. sky/schemas/generated/managed_jobsv1_pb2.py +76 -0
  307. sky/schemas/generated/managed_jobsv1_pb2.pyi +278 -0
  308. sky/schemas/generated/managed_jobsv1_pb2_grpc.py +278 -0
  309. sky/schemas/generated/servev1_pb2.py +58 -0
  310. sky/schemas/generated/servev1_pb2.pyi +115 -0
  311. sky/schemas/generated/servev1_pb2_grpc.py +322 -0
  312. sky/serve/autoscalers.py +357 -5
  313. sky/serve/client/impl.py +310 -0
  314. sky/serve/client/sdk.py +47 -139
  315. sky/serve/client/sdk_async.py +130 -0
  316. sky/serve/constants.py +12 -9
  317. sky/serve/controller.py +68 -17
  318. sky/serve/load_balancer.py +106 -60
  319. sky/serve/load_balancing_policies.py +116 -2
  320. sky/serve/replica_managers.py +434 -249
  321. sky/serve/serve_rpc_utils.py +179 -0
  322. sky/serve/serve_state.py +569 -257
  323. sky/serve/serve_utils.py +775 -265
  324. sky/serve/server/core.py +66 -711
  325. sky/serve/server/impl.py +1093 -0
  326. sky/serve/server/server.py +21 -18
  327. sky/serve/service.py +192 -89
  328. sky/serve/service_spec.py +144 -20
  329. sky/serve/spot_placer.py +3 -0
  330. sky/server/auth/__init__.py +0 -0
  331. sky/server/auth/authn.py +50 -0
  332. sky/server/auth/loopback.py +38 -0
  333. sky/server/auth/oauth2_proxy.py +202 -0
  334. sky/server/common.py +478 -182
  335. sky/server/config.py +85 -23
  336. sky/server/constants.py +44 -6
  337. sky/server/daemons.py +295 -0
  338. sky/server/html/token_page.html +185 -0
  339. sky/server/metrics.py +160 -0
  340. sky/server/middleware_utils.py +166 -0
  341. sky/server/requests/executor.py +558 -138
  342. sky/server/requests/payloads.py +364 -24
  343. sky/server/requests/preconditions.py +21 -17
  344. sky/server/requests/process.py +112 -29
  345. sky/server/requests/request_names.py +121 -0
  346. sky/server/requests/requests.py +822 -226
  347. sky/server/requests/serializers/decoders.py +82 -31
  348. sky/server/requests/serializers/encoders.py +140 -22
  349. sky/server/requests/threads.py +117 -0
  350. sky/server/rest.py +455 -0
  351. sky/server/server.py +1309 -285
  352. sky/server/state.py +20 -0
  353. sky/server/stream_utils.py +327 -61
  354. sky/server/uvicorn.py +217 -3
  355. sky/server/versions.py +270 -0
  356. sky/setup_files/MANIFEST.in +11 -1
  357. sky/setup_files/alembic.ini +160 -0
  358. sky/setup_files/dependencies.py +139 -31
  359. sky/setup_files/setup.py +44 -42
  360. sky/sky_logging.py +114 -7
  361. sky/skylet/attempt_skylet.py +106 -24
  362. sky/skylet/autostop_lib.py +129 -8
  363. sky/skylet/configs.py +29 -20
  364. sky/skylet/constants.py +216 -25
  365. sky/skylet/events.py +101 -21
  366. sky/skylet/job_lib.py +345 -164
  367. sky/skylet/log_lib.py +297 -18
  368. sky/skylet/log_lib.pyi +44 -1
  369. sky/skylet/providers/ibm/node_provider.py +12 -8
  370. sky/skylet/providers/ibm/vpc_provider.py +13 -12
  371. sky/skylet/ray_patches/__init__.py +17 -3
  372. sky/skylet/ray_patches/autoscaler.py.diff +18 -0
  373. sky/skylet/ray_patches/cli.py.diff +19 -0
  374. sky/skylet/ray_patches/command_runner.py.diff +17 -0
  375. sky/skylet/ray_patches/log_monitor.py.diff +20 -0
  376. sky/skylet/ray_patches/resource_demand_scheduler.py.diff +32 -0
  377. sky/skylet/ray_patches/updater.py.diff +18 -0
  378. sky/skylet/ray_patches/worker.py.diff +41 -0
  379. sky/skylet/runtime_utils.py +21 -0
  380. sky/skylet/services.py +568 -0
  381. sky/skylet/skylet.py +72 -4
  382. sky/skylet/subprocess_daemon.py +104 -29
  383. sky/skypilot_config.py +506 -99
  384. sky/ssh_node_pools/__init__.py +1 -0
  385. sky/ssh_node_pools/core.py +135 -0
  386. sky/ssh_node_pools/server.py +233 -0
  387. sky/task.py +685 -163
  388. sky/templates/aws-ray.yml.j2 +11 -3
  389. sky/templates/azure-ray.yml.j2 +2 -1
  390. sky/templates/cudo-ray.yml.j2 +1 -0
  391. sky/templates/do-ray.yml.j2 +2 -1
  392. sky/templates/fluidstack-ray.yml.j2 +1 -0
  393. sky/templates/gcp-ray.yml.j2 +62 -1
  394. sky/templates/hyperbolic-ray.yml.j2 +68 -0
  395. sky/templates/ibm-ray.yml.j2 +2 -1
  396. sky/templates/jobs-controller.yaml.j2 +27 -24
  397. sky/templates/kubernetes-loadbalancer.yml.j2 +2 -0
  398. sky/templates/kubernetes-ray.yml.j2 +611 -50
  399. sky/templates/lambda-ray.yml.j2 +2 -1
  400. sky/templates/nebius-ray.yml.j2 +34 -12
  401. sky/templates/oci-ray.yml.j2 +1 -0
  402. sky/templates/paperspace-ray.yml.j2 +2 -1
  403. sky/templates/primeintellect-ray.yml.j2 +72 -0
  404. sky/templates/runpod-ray.yml.j2 +10 -1
  405. sky/templates/scp-ray.yml.j2 +4 -50
  406. sky/templates/seeweb-ray.yml.j2 +171 -0
  407. sky/templates/shadeform-ray.yml.j2 +73 -0
  408. sky/templates/sky-serve-controller.yaml.j2 +22 -2
  409. sky/templates/vast-ray.yml.j2 +1 -0
  410. sky/templates/vsphere-ray.yml.j2 +1 -0
  411. sky/templates/websocket_proxy.py +212 -37
  412. sky/usage/usage_lib.py +31 -15
  413. sky/users/__init__.py +0 -0
  414. sky/users/model.conf +15 -0
  415. sky/users/permission.py +397 -0
  416. sky/users/rbac.py +121 -0
  417. sky/users/server.py +720 -0
  418. sky/users/token_service.py +218 -0
  419. sky/utils/accelerator_registry.py +35 -5
  420. sky/utils/admin_policy_utils.py +84 -38
  421. sky/utils/annotations.py +38 -5
  422. sky/utils/asyncio_utils.py +78 -0
  423. sky/utils/atomic.py +1 -1
  424. sky/utils/auth_utils.py +153 -0
  425. sky/utils/benchmark_utils.py +60 -0
  426. sky/utils/cli_utils/status_utils.py +159 -86
  427. sky/utils/cluster_utils.py +31 -9
  428. sky/utils/command_runner.py +354 -68
  429. sky/utils/command_runner.pyi +93 -3
  430. sky/utils/common.py +35 -8
  431. sky/utils/common_utils.py +314 -91
  432. sky/utils/config_utils.py +74 -5
  433. sky/utils/context.py +403 -0
  434. sky/utils/context_utils.py +242 -0
  435. sky/utils/controller_utils.py +383 -89
  436. sky/utils/dag_utils.py +31 -12
  437. sky/utils/db/__init__.py +0 -0
  438. sky/utils/db/db_utils.py +485 -0
  439. sky/utils/db/kv_cache.py +149 -0
  440. sky/utils/db/migration_utils.py +137 -0
  441. sky/utils/directory_utils.py +12 -0
  442. sky/utils/env_options.py +13 -0
  443. sky/utils/git.py +567 -0
  444. sky/utils/git_clone.sh +460 -0
  445. sky/utils/infra_utils.py +195 -0
  446. sky/utils/kubernetes/cleanup-tunnel.sh +62 -0
  447. sky/utils/kubernetes/config_map_utils.py +133 -0
  448. sky/utils/kubernetes/create_cluster.sh +15 -29
  449. sky/utils/kubernetes/delete_cluster.sh +10 -7
  450. sky/utils/kubernetes/deploy_ssh_node_pools.py +1177 -0
  451. sky/utils/kubernetes/exec_kubeconfig_converter.py +22 -31
  452. sky/utils/kubernetes/generate_kind_config.py +6 -66
  453. sky/utils/kubernetes/generate_kubeconfig.sh +4 -1
  454. sky/utils/kubernetes/gpu_labeler.py +18 -8
  455. sky/utils/kubernetes/k8s_gpu_labeler_job.yaml +2 -1
  456. sky/utils/kubernetes/k8s_gpu_labeler_setup.yaml +16 -16
  457. sky/utils/kubernetes/kubernetes_deploy_utils.py +284 -114
  458. sky/utils/kubernetes/rsync_helper.sh +11 -3
  459. sky/utils/kubernetes/ssh-tunnel.sh +379 -0
  460. sky/utils/kubernetes/ssh_utils.py +221 -0
  461. sky/utils/kubernetes_enums.py +8 -15
  462. sky/utils/lock_events.py +94 -0
  463. sky/utils/locks.py +416 -0
  464. sky/utils/log_utils.py +82 -107
  465. sky/utils/perf_utils.py +22 -0
  466. sky/utils/resource_checker.py +298 -0
  467. sky/utils/resources_utils.py +249 -32
  468. sky/utils/rich_utils.py +217 -39
  469. sky/utils/schemas.py +955 -160
  470. sky/utils/serialize_utils.py +16 -0
  471. sky/utils/status_lib.py +10 -0
  472. sky/utils/subprocess_utils.py +29 -15
  473. sky/utils/tempstore.py +70 -0
  474. sky/utils/thread_utils.py +91 -0
  475. sky/utils/timeline.py +26 -53
  476. sky/utils/ux_utils.py +84 -15
  477. sky/utils/validator.py +11 -1
  478. sky/utils/volume.py +165 -0
  479. sky/utils/yaml_utils.py +111 -0
  480. sky/volumes/__init__.py +13 -0
  481. sky/volumes/client/__init__.py +0 -0
  482. sky/volumes/client/sdk.py +150 -0
  483. sky/volumes/server/__init__.py +0 -0
  484. sky/volumes/server/core.py +270 -0
  485. sky/volumes/server/server.py +124 -0
  486. sky/volumes/volume.py +215 -0
  487. sky/workspaces/__init__.py +0 -0
  488. sky/workspaces/core.py +655 -0
  489. sky/workspaces/server.py +101 -0
  490. sky/workspaces/utils.py +56 -0
  491. sky_templates/README.md +3 -0
  492. sky_templates/__init__.py +3 -0
  493. sky_templates/ray/__init__.py +0 -0
  494. sky_templates/ray/start_cluster +183 -0
  495. sky_templates/ray/stop_cluster +75 -0
  496. skypilot_nightly-1.0.0.dev20251203.dist-info/METADATA +676 -0
  497. skypilot_nightly-1.0.0.dev20251203.dist-info/RECORD +611 -0
  498. {skypilot_nightly-1.0.0.dev20250502.dist-info → skypilot_nightly-1.0.0.dev20251203.dist-info}/WHEEL +1 -1
  499. skypilot_nightly-1.0.0.dev20251203.dist-info/top_level.txt +2 -0
  500. sky/benchmark/benchmark_state.py +0 -256
  501. sky/benchmark/benchmark_utils.py +0 -641
  502. sky/clouds/service_catalog/constants.py +0 -7
  503. sky/dashboard/out/_next/static/GWvVBSCS7FmUiVmjaL1a7/_buildManifest.js +0 -1
  504. sky/dashboard/out/_next/static/chunks/236-2db3ee3fba33dd9e.js +0 -6
  505. sky/dashboard/out/_next/static/chunks/312-c3c8845990db8ffc.js +0 -15
  506. sky/dashboard/out/_next/static/chunks/37-0a572fe0dbb89c4d.js +0 -6
  507. sky/dashboard/out/_next/static/chunks/678-206dddca808e6d16.js +0 -59
  508. sky/dashboard/out/_next/static/chunks/845-9e60713e0c441abc.js +0 -1
  509. sky/dashboard/out/_next/static/chunks/979-7bf73a4c7cea0f5c.js +0 -1
  510. sky/dashboard/out/_next/static/chunks/fd9d1056-2821b0f0cabcd8bd.js +0 -1
  511. sky/dashboard/out/_next/static/chunks/framework-87d061ee6ed71b28.js +0 -33
  512. sky/dashboard/out/_next/static/chunks/main-app-241eb28595532291.js +0 -1
  513. sky/dashboard/out/_next/static/chunks/main-e0e2335212e72357.js +0 -1
  514. sky/dashboard/out/_next/static/chunks/pages/_app-e6b013bc3f77ad60.js +0 -1
  515. sky/dashboard/out/_next/static/chunks/pages/_error-1be831200e60c5c0.js +0 -1
  516. sky/dashboard/out/_next/static/chunks/pages/clusters/[cluster]/[job]-6ac338bc2239cb45.js +0 -1
  517. sky/dashboard/out/_next/static/chunks/pages/clusters/[cluster]-f383db7389368ea7.js +0 -1
  518. sky/dashboard/out/_next/static/chunks/pages/clusters-a93b93e10b8b074e.js +0 -1
  519. sky/dashboard/out/_next/static/chunks/pages/index-f9f039532ca8cbc4.js +0 -1
  520. sky/dashboard/out/_next/static/chunks/pages/jobs/[job]-1c519e1afc523dc9.js +0 -1
  521. sky/dashboard/out/_next/static/chunks/pages/jobs-a75029b67aab6a2e.js +0 -1
  522. sky/dashboard/out/_next/static/chunks/webpack-830f59b8404e96b8.js +0 -1
  523. sky/dashboard/out/_next/static/css/c6933bbb2ce7f4dd.css +0 -3
  524. sky/jobs/dashboard/dashboard.py +0 -223
  525. sky/jobs/dashboard/static/favicon.ico +0 -0
  526. sky/jobs/dashboard/templates/index.html +0 -831
  527. sky/jobs/server/dashboard_utils.py +0 -69
  528. sky/skylet/providers/scp/__init__.py +0 -2
  529. sky/skylet/providers/scp/config.py +0 -149
  530. sky/skylet/providers/scp/node_provider.py +0 -578
  531. sky/templates/kubernetes-ssh-jump.yml.j2 +0 -94
  532. sky/utils/db_utils.py +0 -100
  533. sky/utils/kubernetes/deploy_remote_cluster.sh +0 -308
  534. sky/utils/kubernetes/ssh_jump_lifecycle_manager.py +0 -191
  535. skypilot_nightly-1.0.0.dev20250502.dist-info/METADATA +0 -361
  536. skypilot_nightly-1.0.0.dev20250502.dist-info/RECORD +0 -396
  537. skypilot_nightly-1.0.0.dev20250502.dist-info/top_level.txt +0 -1
  538. /sky/{clouds/service_catalog → catalog}/config.py +0 -0
  539. /sky/{benchmark → catalog/data_fetchers}/__init__.py +0 -0
  540. /sky/{clouds/service_catalog → catalog}/data_fetchers/fetch_azure.py +0 -0
  541. /sky/{clouds/service_catalog → catalog}/data_fetchers/fetch_fluidstack.py +0 -0
  542. /sky/{clouds/service_catalog → catalog}/data_fetchers/fetch_ibm.py +0 -0
  543. /sky/{clouds/service_catalog/data_fetchers → client/cli}/__init__.py +0 -0
  544. /sky/dashboard/out/_next/static/{GWvVBSCS7FmUiVmjaL1a7 → 96_E2yl3QAiIJGOYCkSpB}/_ssgManifest.js +0 -0
  545. {skypilot_nightly-1.0.0.dev20250502.dist-info → skypilot_nightly-1.0.0.dev20251203.dist-info}/entry_points.txt +0 -0
  546. {skypilot_nightly-1.0.0.dev20250502.dist-info → skypilot_nightly-1.0.0.dev20251203.dist-info}/licenses/LICENSE +0 -0
sky/serve/serve_utils.py CHANGED
@@ -2,6 +2,7 @@
 import base64
 import collections
 import dataclasses
+import datetime
 import enum
 import os
 import pathlib
@@ -9,11 +10,11 @@ import pickle
 import re
 import shlex
 import shutil
-import threading
 import time
+import traceback
 import typing
-from typing import (Any, Callable, DefaultDict, Dict, Generic, Iterator, List,
-                    Optional, TextIO, Type, TypeVar, Union)
+from typing import (Any, Callable, DefaultDict, Deque, Dict, Iterator, List,
+                    Optional, TextIO, Type, Union)
 import uuid
 
 import colorama
@@ -22,19 +23,25 @@ import filelock
 from sky import backends
 from sky import exceptions
 from sky import global_user_state
+from sky import sky_logging
+from sky import skypilot_config
 from sky.adaptors import common as adaptors_common
+from sky.jobs import state as managed_job_state
 from sky.serve import constants
 from sky.serve import serve_state
 from sky.serve import spot_placer
 from sky.skylet import constants as skylet_constants
 from sky.skylet import job_lib
 from sky.utils import annotations
+from sky.utils import command_runner
 from sky.utils import common_utils
+from sky.utils import controller_utils
 from sky.utils import log_utils
 from sky.utils import message_utils
 from sky.utils import resources_utils
 from sky.utils import status_lib
 from sky.utils import ux_utils
+from sky.utils import yaml_utils
 
 if typing.TYPE_CHECKING:
     import fastapi
@@ -47,23 +54,19 @@ else:
     psutil = adaptors_common.LazyImport('psutil')
     requests = adaptors_common.LazyImport('requests')
 
-
-@annotations.lru_cache(scope='request')
-def get_num_service_threshold():
-    """Get number of services threshold, calculating it only when needed."""
-    system_memory_gb = psutil.virtual_memory().total // (1024**3)
-    return system_memory_gb // constants.CONTROLLER_MEMORY_USAGE_GB
-
+logger = sky_logging.init_logger(__name__)
 
 _CONTROLLER_URL = 'http://localhost:{CONTROLLER_PORT}'
 
-# NOTE(dev): We assume log paths are either in ~/sky_logs/... or ~/.sky/...
-# and always appear after a space. Be careful when changing UX as this
-# assumption is used to expand some log files while ignoring others.
-_SKYPILOT_LOG_DIRS = r'~/(sky_logs|\.sky)'
-_SKYPILOT_PROVISION_LOG_PATTERN = (
-    fr'.* ({_SKYPILOT_LOG_DIRS}/.*provision\.log)')
-_SKYPILOT_LOG_PATTERN = fr'.* ({_SKYPILOT_LOG_DIRS}/.*\.log)'
+# NOTE(dev): We assume log are print with the hint 'sky api logs -l'. Be careful
+# when changing UX as this assumption is used to expand some log files while
+# ignoring others.
+_SKYPILOT_LOG_HINT = r'.*sky api logs -l'
+_SKYPILOT_PROVISION_API_LOG_PATTERN = (
+    fr'{_SKYPILOT_LOG_HINT} (.*/provision\.log)')
+# New hint pattern for provision logs
+_SKYPILOT_PROVISION_LOG_CMD_PATTERN = r'.*sky logs --provision\s+(\S+)'
+_SKYPILOT_LOG_PATTERN = fr'{_SKYPILOT_LOG_HINT} (.*\.log)'
 
 # TODO(tian): Find all existing replica id and print here.
 _FAILED_TO_FIND_REPLICA_MSG = (
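A quick way to sanity-check the new hint-based patterns above; this is a standalone sketch, and the sample log line is made up for illustration:

```python
import re

_SKYPILOT_LOG_HINT = r'.*sky api logs -l'
_SKYPILOT_PROVISION_API_LOG_PATTERN = (
    fr'{_SKYPILOT_LOG_HINT} (.*/provision\.log)')

# Hypothetical hint line as it might appear in controller output.
line = 'Check logs with: sky api logs -l sky-2025-01-01-000000/provision.log'
match = re.match(_SKYPILOT_PROVISION_API_LOG_PATTERN, line)
assert match is not None
print(match.group(1))  # sky-2025-01-01-000000/provision.log
```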
@@ -154,50 +157,6 @@ _SIGNAL_TO_ERROR = {
     UserSignal.TERMINATE: exceptions.ServeUserTerminatedError,
 }
 
-# pylint: disable=invalid-name
-KeyType = TypeVar('KeyType')
-ValueType = TypeVar('ValueType')
-
-
-# Google style guide: Do not rely on the atomicity of built-in types.
-# Our launch and down process pool will be used by multiple threads,
-# therefore we need to use a thread-safe dict.
-# see https://google.github.io/styleguide/pyguide.html#218-threading
-class ThreadSafeDict(Generic[KeyType, ValueType]):
-    """A thread-safe dict."""
-
-    def __init__(self, *args: Any, **kwargs: Any) -> None:
-        self._dict: Dict[KeyType, ValueType] = dict(*args, **kwargs)
-        self._lock = threading.Lock()
-
-    def __getitem__(self, key: KeyType) -> ValueType:
-        with self._lock:
-            return self._dict.__getitem__(key)
-
-    def __setitem__(self, key: KeyType, value: ValueType) -> None:
-        with self._lock:
-            return self._dict.__setitem__(key, value)
-
-    def __delitem__(self, key: KeyType) -> None:
-        with self._lock:
-            return self._dict.__delitem__(key)
-
-    def __len__(self) -> int:
-        with self._lock:
-            return self._dict.__len__()
-
-    def __contains__(self, key: KeyType) -> bool:
-        with self._lock:
-            return self._dict.__contains__(key)
-
-    def items(self):
-        with self._lock:
-            return self._dict.items()
-
-    def values(self):
-        with self._lock:
-            return self._dict.values()
-
 
 class RequestsAggregator:
     """Base class for request aggregator."""
@@ -244,7 +203,120 @@ class RequestTimestamp(RequestsAggregator):
         return f'RequestTimestamp(timestamps={self.timestamps})'
 
 
-def validate_service_task(task: 'sky.Task') -> None:
+def get_service_filelock_path(pool: str) -> str:
+    path = (pathlib.Path(constants.SKYSERVE_METADATA_DIR) / pool /
+            'pool.lock').expanduser().absolute()
+    path.parents[0].mkdir(parents=True, exist_ok=True)
+    return str(path)
+
+
+def _validate_consolidation_mode_config(current_is_consolidation_mode: bool,
+                                        pool: bool) -> None:
+    """Validate the consolidation mode config."""
+    # Check whether the consolidation mode config is changed.
+    controller = controller_utils.get_controller_for_pool(pool).value
+    if current_is_consolidation_mode:
+        controller_cn = controller.cluster_name
+        if global_user_state.cluster_with_name_exists(controller_cn):
+            logger.warning(
+                f'{colorama.Fore.RED}Consolidation mode for '
+                f'{controller.controller_type} is enabled, but the controller '
+                f'cluster {controller_cn} is still running. Please terminate '
+                'the controller cluster first.'
+                f'{colorama.Style.RESET_ALL}')
+    else:
+        noun = 'pool' if pool else 'service'
+        all_services = [
+            svc for svc in serve_state.get_services() if svc['pool'] == pool
+        ]
+        if all_services:
+            logger.warning(
+                f'{colorama.Fore.RED}Consolidation mode for '
+                f'{controller.controller_type} is disabled, but there are '
+                f'still {len(all_services)} {noun}s running. Please terminate '
+                f'those {noun}s first.{colorama.Style.RESET_ALL}')
+
+
+@annotations.lru_cache(scope='request', maxsize=1)
+def is_consolidation_mode(pool: bool = False) -> bool:
+    # Use jobs config for pool consolidation mode.
+    controller = controller_utils.get_controller_for_pool(pool).value
+    consolidation_mode = skypilot_config.get_nested(
+        (controller.controller_type, 'controller', 'consolidation_mode'),
+        default_value=False)
+    if os.environ.get(skylet_constants.OVERRIDE_CONSOLIDATION_MODE) is not None:
+        # if we are in the job controller, we must always be in consolidation
+        # mode.
+        return True
+    # We should only do this check on API server, as the controller will not
+    # have related config and will always seemingly disabled for consolidation
+    # mode. Check #6611 for more details.
+    if os.environ.get(skylet_constants.ENV_VAR_IS_SKYPILOT_SERVER) is not None:
+        _validate_consolidation_mode_config(consolidation_mode, pool)
+    return consolidation_mode
+
+
+def ha_recovery_for_consolidation_mode(pool: bool):
+    """Recovery logic for HA mode."""
+    # No setup recovery is needed in consolidation mode, as the API server
+    # already has all runtime installed. Directly start jobs recovery here.
+    # Refers to sky/templates/kubernetes-ray.yml.j2 for more details.
+    runner = command_runner.LocalProcessCommandRunner()
+    noun = 'pool' if pool else 'serve'
+    capnoun = noun.capitalize()
+    prefix = f'{noun}_'
+    with open(skylet_constants.HA_PERSISTENT_RECOVERY_LOG_PATH.format(prefix),
+              'w',
+              encoding='utf-8') as f:
+        start = time.time()
+        f.write(f'Starting HA recovery at {datetime.datetime.now()}\n')
+        for service_name in serve_state.get_glob_service_names(None):
+            svc = _get_service_status(service_name,
+                                      pool=pool,
+                                      with_replica_info=False)
+            if svc is None:
+                continue
+            controller_pid = svc['controller_pid']
+            if controller_pid is not None:
+                try:
+                    if _controller_process_alive(controller_pid, service_name):
+                        f.write(f'Controller pid {controller_pid} for '
+                                f'{noun} {service_name} is still running. '
+                                'Skipping recovery.\n')
+                        continue
+                except Exception:  # pylint: disable=broad-except
+                    # _controller_process_alive may raise if psutil fails; we
+                    # should not crash the recovery logic because of this.
+                    f.write('Error checking controller pid '
+                            f'{controller_pid} for {noun} {service_name}\n')
+
+            script = serve_state.get_ha_recovery_script(service_name)
+            if script is None:
+                f.write(f'{capnoun} {service_name}\'s recovery script does '
+                        'not exist. Skipping recovery.\n')
+                continue
+            rc, out, err = runner.run(script, require_outputs=True)
+            if rc:
+                f.write(f'Recovery script returned {rc}. '
+                        f'Output: {out}\nError: {err}\n')
+            f.write(f'{capnoun} {service_name} completed recovery at '
+                    f'{datetime.datetime.now()}\n')
+        f.write(f'HA recovery completed at {datetime.datetime.now()}\n')
+        f.write(f'Total recovery time: {time.time() - start} seconds\n')
+
+
+def _controller_process_alive(pid: int, service_name: str) -> bool:
+    """Check if the controller process is alive."""
+    try:
+        process = psutil.Process(pid)
+        cmd_str = ' '.join(process.cmdline())
+        return process.is_running(
+        ) and f'--service-name {service_name}' in cmd_str
+    except psutil.NoSuchProcess:
+        return False
+
+
+def validate_service_task(task: 'sky.Task', pool: bool) -> None:
     """Validate the task for Sky Serve.
 
     Args:
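`_controller_process_alive` above guards against PID reuse by also checking that the process command line still names the service. A standalone sketch of the same psutil pattern, with an illustrative function name and marker:

```python
import psutil

def pid_alive_with_marker(pid: int, marker: str) -> bool:
    # A recycled PID may belong to an unrelated process, so also require
    # the marker (e.g. '--service-name my-svc') to appear in the cmdline.
    try:
        process = psutil.Process(pid)
        return process.is_running() and marker in ' '.join(process.cmdline())
    except psutil.NoSuchProcess:
        return False
```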
@@ -267,19 +339,43 @@ def validate_service_task(task: 'sky.Task') -> None:
                 'use `dynamic_ondemand_fallback` or set '
                 'base_ondemand_fallback_replicas.')
 
+    field_name = 'service' if not pool else 'pool'
     if task.service is None:
         with ux_utils.print_exception_no_traceback():
-            raise RuntimeError('Service section not found.')
+            raise RuntimeError(f'{field_name.capitalize()} section not found.')
+
+    if pool != task.service.pool:
+        with ux_utils.print_exception_no_traceback():
+            raise ValueError(f'{field_name.capitalize()} section in the YAML '
+                             f'file does not match the pool argument. '
+                             f'To fix, add a valid `{field_name}` field.')
 
     policy_description = ('on-demand'
                           if task.service.dynamic_ondemand_fallback else 'spot')
     for resource in list(task.resources):
         if resource.job_recovery is not None:
+            sys_name = 'SkyServe' if not pool else 'Pool'
             with ux_utils.print_exception_no_traceback():
-                raise ValueError('job_recovery is disabled for SkyServe. '
-                                 'SkyServe will replenish preempted spot '
+                raise ValueError(f'job_recovery is disabled for {sys_name}. '
+                                 f'{sys_name} will replenish preempted spot '
                                  f'with {policy_description} instances.')
 
+    if pool:
+        accelerators = set()
+        for resource in task.resources:
+            if resource.accelerators is not None:
+                if isinstance(resource.accelerators, str):
+                    accelerators.add(resource.accelerators)
+                elif isinstance(resource.accelerators, dict):
+                    accelerators.update(resource.accelerators.keys())
+                elif isinstance(resource.accelerators, list):
+                    accelerators.update(resource.accelerators)
+        if len(accelerators) > 1:
+            with ux_utils.print_exception_no_traceback():
+                raise ValueError('Heterogeneous clusters are not supported for '
+                                 'pools please specify one accelerator '
+                                 'for all workers.')
+
     # Try to create a spot placer from the task yaml. Check if the task yaml
     # is valid for spot placer.
     spot_placer.SpotPlacer.from_task(task.service, task)
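The pool branch above normalizes the three accepted `accelerators` forms (string, dict, or list) into one set before rejecting heterogeneous pools. A self-contained sketch of that normalization, using made-up specs:

```python
from typing import Iterable, Set, Union

AccSpec = Union[str, dict, list]

def collect_accelerators(specs: Iterable[AccSpec]) -> Set[str]:
    accelerators: Set[str] = set()
    for acc in specs:
        if isinstance(acc, str):
            accelerators.add(acc)
        elif isinstance(acc, dict):
            accelerators.update(acc.keys())
        elif isinstance(acc, list):
            accelerators.update(acc)
    return accelerators

# Two distinct accelerator types -> would be rejected for a pool.
assert len(collect_accelerators(['A100', {'L4': 1}])) > 1
```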
@@ -300,7 +396,7 @@ def validate_service_task(task: 'sky.Task') -> None:
             raise ValueError(
                 '`spot_placer` is only supported for spot resources. '
                 'Please explicitly specify `use_spot: true` in resources.')
-    if task.service.ports is None:
+    if not pool and task.service.ports is None:
         requested_ports = list(
             resources_utils.port_ranges_to_set(requested_resources.ports))
         if len(requested_ports) != 1:
@@ -320,10 +416,16 @@ def validate_service_task(task: 'sky.Task') -> None:
                     f'Got multiple ports: {service_port} and '
                     f'{replica_ingress_port} in different resources. '
                     'Please specify the same port instead.')
+    if pool:
+        if (task.service.ports is not None or
+                requested_resources.ports is not None):
+            with ux_utils.print_exception_no_traceback():
+                raise ValueError('Cannot specify ports in a pool.')
 
 
-def generate_service_name():
-    return f'sky-service-{uuid.uuid4().hex[:4]}'
+def generate_service_name(pool: bool = False):
+    noun = 'pool' if pool else 'service'
+    return f'sky-{noun}-{uuid.uuid4().hex[:4]}'
 
 
 def generate_remote_service_dir_name(service_name: str) -> str:
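With the new `pool` flag, auto-generated default names carry the matching prefix. A usage sketch (the hex suffixes are random):

```python
import uuid

def generate_service_name(pool: bool = False):
    noun = 'pool' if pool else 'service'
    return f'sky-{noun}-{uuid.uuid4().hex[:4]}'

print(generate_service_name())           # e.g. sky-service-1a2b
print(generate_service_name(pool=True))  # e.g. sky-pool-9f3c
```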
@@ -390,6 +492,8 @@ def generate_remote_tls_certfile_name(service_name: str) -> str:
 
 
 def generate_replica_cluster_name(service_name: str, replica_id: int) -> str:
+    # NOTE(dev): This format is used in sky/serve/service.py::_cleanup, for
+    # checking replica cluster existence. Be careful when changing it.
     return f'{service_name}-{replica_id}'
 
 
@@ -425,26 +529,63 @@ def set_service_status_and_active_versions_from_replica(
         active_versions=active_versions)
 
 
-def update_service_status() -> None:
-    services = serve_state.get_services()
-    for record in services:
-        if record['status'] == serve_state.ServiceStatus.SHUTTING_DOWN:
+def update_service_status(pool: bool) -> None:
+    noun = 'pool' if pool else 'serve'
+    capnoun = noun.capitalize()
+    service_names = serve_state.get_glob_service_names(None)
+    for service_name in service_names:
+        record = _get_service_status(service_name,
+                                     pool=pool,
+                                     with_replica_info=False)
+        if record is None:
+            continue
+        service_status = record['status']
+        if service_status == serve_state.ServiceStatus.SHUTTING_DOWN:
             # Skip services that is shutting down.
             continue
-        controller_job_id = record['controller_job_id']
-        assert controller_job_id is not None
-        controller_status = job_lib.get_status(controller_job_id)
-        if controller_status is None or controller_status.is_terminal():
-            # If controller job is not running, set it as controller failed.
-            serve_state.set_service_status_and_active_versions(
-                record['name'], serve_state.ServiceStatus.CONTROLLER_FAILED)
 
+        logger.info(f'Update {noun} status for {service_name!r} '
+                    f'with status {service_status}')
+
+        controller_pid = record['controller_pid']
+        if controller_pid is None:
+            logger.info(f'{capnoun} {service_name!r} controller pid is None. '
+                        f'Unexpected status {service_status}. Set to failure.')
+        elif controller_pid < 0:
+            # Backwards compatibility: this service was submitted when ray was
+            # still used for controller process management. We set the
+            # value_to_replace_existing_entries to -1 to indicate historical
+            # services.
+            # TODO(tian): Remove before 0.13.0.
+            controller_job_id = record['controller_job_id']
+            assert controller_job_id is not None
+            controller_status = job_lib.get_status(controller_job_id)
+            if (controller_status is not None and
+                    not controller_status.is_terminal()):
+                continue
+            logger.info(f'Updating {noun} {service_name!r} in old version. '
+                        f'SkyPilot job status: {controller_status}. '
+                        'Set to failure.')
+        else:
+            if _controller_process_alive(controller_pid, service_name):
+                # The controller is still running.
+                continue
+            logger.info(f'{capnoun} {service_name!r} controller pid '
+                        f'{controller_pid} is not alive. Set to failure.')
+
+        # If controller job is not running, set it as controller failed.
+        serve_state.set_service_status_and_active_versions(
+            service_name, serve_state.ServiceStatus.CONTROLLER_FAILED)
 
-def update_service_encoded(service_name: str, version: int, mode: str) -> str:
-    service_status = _get_service_status(service_name)
+
+def update_service_encoded(service_name: str, version: int, mode: str,
+                           pool: bool) -> str:
+    noun = 'pool' if pool else 'service'
+    capnoun = noun.capitalize()
+    service_status = _get_service_status(service_name, pool=pool)
     if service_status is None:
         with ux_utils.print_exception_no_traceback():
-            raise ValueError(f'Service {service_name!r} does not exist.')
+            raise ValueError(f'{capnoun} {service_name!r} does not exist.')
     controller_port = service_status['controller_port']
     resp = requests.post(
         _CONTROLLER_URL.format(CONTROLLER_PORT=controller_port) +
@@ -455,27 +596,30 @@ def update_service_encoded(service_name: str, version: int, mode: str) -> str:
         })
     if resp.status_code == 404:
         with ux_utils.print_exception_no_traceback():
+            # This only happens for services since pool is added after the
+            # update feature is introduced.
             raise ValueError(
                 'The service is up-ed in an old version and does not '
                 'support update. Please `sky serve down` '
                 'it first and relaunch the service. ')
     elif resp.status_code == 400:
         with ux_utils.print_exception_no_traceback():
-            raise ValueError(f'Client error during service update: {resp.text}')
+            raise ValueError(f'Client error during {noun} update: {resp.text}')
     elif resp.status_code == 500:
         with ux_utils.print_exception_no_traceback():
             raise RuntimeError(
-                f'Server error during service update: {resp.text}')
+                f'Server error during {noun} update: {resp.text}')
     elif resp.status_code != 200:
         with ux_utils.print_exception_no_traceback():
-            raise ValueError(f'Failed to update service: {resp.text}')
+            raise ValueError(f'Failed to update {noun}: {resp.text}')
 
     service_msg = resp.json()['message']
     return message_utils.encode_payload(service_msg)
 
 
 def terminate_replica(service_name: str, replica_id: int, purge: bool) -> str:
-    service_status = _get_service_status(service_name)
+    # TODO(tian): Currently pool does not support terminating replica.
+    service_status = _get_service_status(service_name, pool=False)
     if service_status is None:
         with ux_utils.print_exception_no_traceback():
             raise ValueError(f'Service {service_name!r} does not exist.')
@@ -504,8 +648,21 @@ def terminate_replica(service_name: str, replica_id: int, purge: bool) -> str:
     return message
 
 
+def get_yaml_content(service_name: str, version: int) -> str:
+    yaml_content = serve_state.get_yaml_content(service_name, version)
+    if yaml_content is not None:
+        return yaml_content
+    # Backward compatibility for old service records that
+    # does not dump the yaml content to version database.
+    # TODO(tian): Remove this after 2 minor releases, i.e. 0.13.0.
+    latest_yaml_path = generate_task_yaml_file_name(service_name, version)
+    with open(latest_yaml_path, 'r', encoding='utf-8') as f:
+        return f.read()
+
+
 def _get_service_status(
         service_name: str,
+        pool: bool,
         with_replica_info: bool = True) -> Optional[Dict[str, Any]]:
     """Get the status dict of the service.
 
@@ -520,34 +677,105 @@
     record = serve_state.get_service_from_name(service_name)
     if record is None:
         return None
+    if record['pool'] != pool:
+        return None
+
+    record['pool_yaml'] = ''
+    if record['pool']:
+        version = record['version']
+        try:
+            yaml_content = get_yaml_content(service_name, version)
+            raw_yaml_config = yaml_utils.read_yaml_str(yaml_content)
+        except Exception as e:  # pylint: disable=broad-except
+            # If this is a consolidation mode running without an PVC, the file
+            # might lost after an API server update (restart). In such case, we
+            # don't want it to crash the command. Fall back to an empty string.
+            logger.error(f'Failed to read YAML for service {service_name} '
+                         f'with version {version}: {e}')
+            record['pool_yaml'] = ''
+        else:
+            original_config = raw_yaml_config.get('_user_specified_yaml')
+            if original_config is None:
+                # Fall back to old display format.
+                original_config = raw_yaml_config
+                original_config.pop('run', None)
+                svc: Dict[str, Any] = original_config.pop('service')
+                if svc is not None:
+                    svc.pop('pool', None)  # Remove pool from service config
+                    original_config['pool'] = svc  # Add pool to root config
+            else:
+                original_config = yaml_utils.safe_load(original_config)
+            record['pool_yaml'] = yaml_utils.dump_yaml_str(original_config)
+
+    record['target_num_replicas'] = 0
+    try:
+        controller_port = record['controller_port']
+        resp = requests.get(
+            _CONTROLLER_URL.format(CONTROLLER_PORT=controller_port) +
+            '/autoscaler/info')
+        record['target_num_replicas'] = resp.json()['target_num_replicas']
+    except requests.exceptions.RequestException:
+        record['target_num_replicas'] = None
+    except Exception as e:  # pylint: disable=broad-except
+        logger.error(f'Failed to get autoscaler info for {service_name}: '
+                     f'{common_utils.format_exception(e)}\n'
+                     f'Traceback: {traceback.format_exc()}')
+
     if with_replica_info:
         record['replica_info'] = [
-            info.to_info_dict(with_handle=True)
+            info.to_info_dict(with_handle=True, with_url=not pool)
             for info in serve_state.get_replica_infos(service_name)
         ]
+        if pool:
+            for replica_info in record['replica_info']:
+                job_ids = managed_job_state.get_nonterminal_job_ids_by_pool(
+                    service_name, replica_info['name'])
+                replica_info['used_by'] = job_ids[0] if job_ids else None
     return record
 
 
-def get_service_status_encoded(service_names: Optional[List[str]]) -> str:
+def get_service_status_pickled(service_names: Optional[List[str]],
+                               pool: bool) -> List[Dict[str, str]]:
     service_statuses: List[Dict[str, str]] = []
     if service_names is None:
         # Get all service names
         service_names = serve_state.get_glob_service_names(None)
     for service_name in service_names:
-        service_status = _get_service_status(service_name)
+        service_status = _get_service_status(service_name, pool=pool)
         if service_status is None:
             continue
         service_statuses.append({
             k: base64.b64encode(pickle.dumps(v)).decode('utf-8')
             for k, v in service_status.items()
         })
+    return sorted(service_statuses, key=lambda x: x['name'])
+
+
+# TODO (kyuds): remove when serve codegen is removed
+def get_service_status_encoded(service_names: Optional[List[str]],
+                               pool: bool) -> str:
     # We have to use payload_type here to avoid the issue of
     # message_utils.decode_payload() not being able to correctly decode the
     # message with <sky-payload> tags.
+    service_statuses = get_service_status_pickled(service_names, pool)
     return message_utils.encode_payload(service_statuses,
                                         payload_type='service_status')
 
 
+def unpickle_service_status(
+        payload: List[Dict[str, str]]) -> List[Dict[str, Any]]:
+    service_statuses: List[Dict[str, Any]] = []
+    for service_status in payload:
+        if not isinstance(service_status, dict):
+            raise ValueError(f'Invalid service status: {service_status}')
+        service_statuses.append({
+            k: pickle.loads(base64.b64decode(v))
+            for k, v in service_status.items()
+        })
+    return service_statuses
+
+
+# TODO (kyuds): remove when serve codegen is removed
 def load_service_status(payload: str) -> List[Dict[str, Any]]:
     try:
         service_statuses_encoded = message_utils.decode_payload(
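The split into `get_service_status_pickled` and `unpickle_service_status` keeps the per-key base64+pickle encoding symmetric on both ends. A minimal round trip, with a plain dict standing in for a real status record:

```python
import base64
import pickle

status = {'name': 'svc-a', 'version': 3, 'active_versions': [1, 2, 3]}

# Encode: mirrors get_service_status_pickled's per-key encoding.
encoded = {
    k: base64.b64encode(pickle.dumps(v)).decode('utf-8')
    for k, v in status.items()
}

# Decode: mirrors unpickle_service_status.
decoded = {k: pickle.loads(base64.b64decode(v)) for k, v in encoded.items()}
assert decoded == status
```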
@@ -559,26 +787,85 @@ def load_service_status(payload: str) -> List[Dict[str, Any]]:
         service_statuses_encoded = message_utils.decode_payload(payload)
     else:
         raise
-    service_statuses: List[Dict[str, Any]] = []
-    for service_status in service_statuses_encoded:
-        if not isinstance(service_status, dict):
-            raise ValueError(f'Invalid service status: {service_status}')
-        service_statuses.append({
-            k: pickle.loads(base64.b64decode(v))
-            for k, v in service_status.items()
-        })
-    return service_statuses
+    return unpickle_service_status(service_statuses_encoded)
 
 
+# TODO (kyuds): remove when serve codegen is removed
 def add_version_encoded(service_name: str) -> str:
     new_version = serve_state.add_version(service_name)
     return message_utils.encode_payload(new_version)
 
 
+# TODO (kyuds): remove when serve codegen is removed
 def load_version_string(payload: str) -> str:
     return message_utils.decode_payload(payload)
 
 
+def get_ready_replicas(
+        service_name: str) -> List['replica_managers.ReplicaInfo']:
+    logger.info(f'Get ready replicas for pool {service_name!r}')
+    return [
+        info for info in serve_state.get_replica_infos(service_name)
+        if info.status == serve_state.ReplicaStatus.READY
+    ]
+
+
+def get_next_cluster_name(service_name: str, job_id: int) -> Optional[str]:
+    """Get the next available cluster name from idle replicas.
+
+    Args:
+        service_name: The name of the service.
+        job_id: The job ID to associate with the acquired cluster.
+
+    Returns:
+        The cluster name if an idle replica is found, None otherwise.
+    """
+    # Check that the service exists and is a pool.
+    service_status = _get_service_status(service_name,
+                                         pool=True,
+                                         with_replica_info=False)
+    if service_status is None:
+        logger.error(f'Service {service_name!r} does not exist.')
+        return None
+    if not service_status['pool']:
+        logger.error(f'Service {service_name!r} is not a pool.')
+        return None
+    with filelock.FileLock(get_service_filelock_path(service_name)):
+        logger.debug(f'Get next cluster name for pool {service_name!r}')
+        ready_replicas = get_ready_replicas(service_name)
+        idle_replicas: List['replica_managers.ReplicaInfo'] = []
+        for replica_info in ready_replicas:
+            jobs_on_replica = managed_job_state.get_nonterminal_job_ids_by_pool(
+                service_name, replica_info.cluster_name)
+            # TODO(tian): Make it resources-aware. Currently we allow, and only
+            # allow, one job per replica. In the following PR, we should:
+            # i) When the replica is launched with `any_of` resources (
+            #    replicas can have different resources), check whether the
+            #    resources that jobs require are available on the replica.
+            #    e.g., a job requiring A100:1 on a {L4:1, A100:1} pool should
+            #    only go to a replica with an A100.
+            # ii) When a job only requires a subset of the resources on the
+            #    replica, each replica should be able to handle multiple jobs
+            #    at the same time. e.g., a job requiring A100:1 on an A100:8
+            #    pool should be able to run 8 jobs at the same time.
+            if not jobs_on_replica:
+                idle_replicas.append(replica_info)
+        if not idle_replicas:
+            logger.info(f'No idle replicas found for pool {service_name!r}')
+            return None
+
+        # Select the first idle replica.
+        # TODO(tian): "Load balancing" policy.
+        replica_info = idle_replicas[0]
+        logger.info(f'Selected replica {replica_info.replica_id} with cluster '
+                    f'{replica_info.cluster_name!r} for job {job_id!r} in pool '
+                    f'{service_name!r}')
+        managed_job_state.set_current_cluster_name(job_id,
+                                                   replica_info.cluster_name)
+        return replica_info.cluster_name
+
+
 def _terminate_failed_services(
         service_name: str,
         service_status: Optional[serve_state.ServiceStatus]) -> Optional[str]:
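
The new pool scheduling entry point returns None when every READY worker is busy, so callers are expected to poll. A hypothetical caller sketch (the retry policy and helper name `acquire_cluster_with_retry` are illustrative and not part of this module; it assumes `get_next_cluster_name` above is importable):

import time

def acquire_cluster_with_retry(pool_name: str, job_id: int,
                               attempts: int = 5,
                               backoff_seconds: float = 2.0):
    """Poll the pool until an idle worker frees up or attempts run out."""
    for _ in range(attempts):
        cluster_name = get_next_cluster_name(pool_name, job_id)
        if cluster_name is not None:
            return cluster_name  # The job is now pinned to this worker.
        time.sleep(backoff_seconds)
    return None
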
@@ -598,8 +885,8 @@ def _terminate_failed_services(
     # replicas, so we don't need to try again here.
     for replica_info in serve_state.get_replica_infos(service_name):
         # TODO(tian): Refresh latest status of the cluster.
-        if global_user_state.get_cluster_from_name(
-                replica_info.cluster_name) is not None:
+        if global_user_state.cluster_with_name_exists(
+                replica_info.cluster_name):
             remaining_replica_clusters.append(f'{replica_info.cluster_name!r}')
         serve_state.remove_replica(service_name, replica_info.replica_id)
 
@@ -608,9 +895,11 @@ def _terminate_failed_services(
     shutil.rmtree(service_dir)
     serve_state.remove_service(service_name)
     serve_state.delete_all_versions(service_name)
+    serve_state.remove_ha_recovery_script(service_name)
 
     if not remaining_replica_clusters:
         return None
+    # TODO(tian): Try to terminate those replica clusters.
     remaining_identity = ', '.join(remaining_replica_clusters)
     return (f'{colorama.Fore.YELLOW}terminate service {service_name!r} with '
             f'failed status ({service_status}). This may indicate a resource '
@@ -618,17 +907,38 @@
             f'controller: {remaining_identity}{colorama.Style.RESET_ALL}')
 
 
-def terminate_services(service_names: Optional[List[str]], purge: bool) -> str:
+def terminate_services(service_names: Optional[List[str]], purge: bool,
+                       pool: bool) -> str:
+    noun = 'pool' if pool else 'service'
+    capnoun = noun.capitalize()
     service_names = serve_state.get_glob_service_names(service_names)
     terminated_service_names: List[str] = []
    messages: List[str] = []
     for service_name in service_names:
         service_status = _get_service_status(service_name,
+                                             pool=pool,
                                              with_replica_info=False)
+        if service_status is None:
+            continue
         if (service_status is not None and service_status['status']
                 == serve_state.ServiceStatus.SHUTTING_DOWN):
             # Already scheduled to be terminated.
             continue
+        if pool:
+            nonterminal_job_ids = (
+                managed_job_state.get_nonterminal_job_ids_by_pool(service_name))
+            if nonterminal_job_ids:
+                nonterminal_job_ids_str = ','.join(
+                    str(job_id) for job_id in nonterminal_job_ids)
+                num_nonterminal_jobs = len(nonterminal_job_ids)
+                messages.append(
+                    f'{colorama.Fore.YELLOW}{capnoun} {service_name!r} has '
+                    f'{num_nonterminal_jobs} nonterminal jobs: '
+                    f'{nonterminal_job_ids_str}. To terminate the {noun}, '
+                    f'please run `sky jobs cancel --pool {service_name}` to '
+                    'cancel all jobs in the pool first.'
+                    f'{colorama.Style.RESET_ALL}')
+                continue
         # If the `services` and `version_specs` tables are not aligned, it
         # might result in a None service status. In this case, the controller
         # process is not functioning well either, and we should also use the
@@ -636,10 +946,11 @@ def terminate_services(service_names: Optional[List[str]], purge: bool) -> str:
         # This is a safeguard for a rare case: an accidental abort
         # between `serve_state.add_service` and
         # `serve_state.add_or_update_version` in service.py.
-        if (service_status is None or service_status['status']
+        purge_cmd = (f'sky jobs pool down {service_name} --purge'
+                     if pool else f'sky serve down {service_name} --purge')
+        if (service_status['status']
                 in serve_state.ServiceStatus.failed_statuses()):
-            failed_status = (service_status['status']
-                             if service_status is not None else None)
+            failed_status = service_status['status']
             if purge:
                 message = _terminate_failed_services(service_name,
                                                      failed_status)
@@ -647,11 +958,10 @@ def terminate_services(service_names: Optional[List[str]], purge: bool) -> str:
                 messages.append(message)
             else:
                 messages.append(
-                    f'{colorama.Fore.YELLOW}Service {service_name!r} is in '
+                    f'{colorama.Fore.YELLOW}{capnoun} {service_name!r} is in '
                     f'failed status ({failed_status}). Skipping '
                     'its termination as it could lead to a resource leak. '
-                    f'(Use `sky serve down {service_name} --purge` to '
-                    'forcefully terminate the service.)'
+                    f'(Use `{purge_cmd}` to forcefully terminate the {noun}.)'
                     f'{colorama.Style.RESET_ALL}')
                 # Don't add to terminated_service_names since it's not
                 # actually terminated.
@@ -668,17 +978,18 @@ def terminate_services(service_names: Optional[List[str]], purge: bool) -> str:
             f.flush()
         terminated_service_names.append(f'{service_name!r}')
     if not terminated_service_names:
-        messages.append('No service to terminate.')
+        messages.append(f'No {noun} to terminate.')
     else:
-        identity_str = f'Service {terminated_service_names[0]} is'
+        identity_str = f'{capnoun} {terminated_service_names[0]} is'
         if len(terminated_service_names) > 1:
             terminated_service_names_str = ', '.join(terminated_service_names)
-            identity_str = f'Services {terminated_service_names_str} are'
+            identity_str = f'{capnoun}s {terminated_service_names_str} are'
         messages.append(f'{identity_str} scheduled to be terminated.')
     return '\n'.join(messages)
 
 
-def wait_service_registration(service_name: str, job_id: int) -> str:
+def wait_service_registration(service_name: str, job_id: int,
+                              pool: bool) -> str:
     """Util function to call at the end of `sky.serve.up()`.
 
     This function will:
@@ -691,49 +1002,67 @@ def wait_service_registration(service_name: str, job_id: int) -> str:
     Returns:
         Encoded load balancer port assigned to the service.
     """
+    # TODO (kyuds): when codegen is fully deprecated, return the lb port
+    # as an int directly instead of encoding it.
     start_time = time.time()
     setup_completed = False
+    noun = 'pool' if pool else 'service'
     while True:
-        job_status = job_lib.get_status(job_id)
-        if job_status is None or job_status < job_lib.JobStatus.RUNNING:
-            # Wait for the controller process to finish setting up. It can be
-            # slow if a lot of cloud dependencies are being installed.
-            if (time.time() - start_time >
-                    constants.CONTROLLER_SETUP_TIMEOUT_SECONDS):
-                with ux_utils.print_exception_no_traceback():
-                    raise RuntimeError(
-                        f'Failed to start the controller '
-                        f'process for the service {service_name!r} '
-                        f'within '
-                        f'{constants.CONTROLLER_SETUP_TIMEOUT_SECONDS} seconds.'
-                    )
-            # No need to check the service status as the controller process
-            # is still setting up.
-            time.sleep(1)
-            continue
+        # Only do this check for non-consolidation mode, as consolidation mode
+        # has no setup process.
+        if not is_consolidation_mode(pool):
+            job_status = job_lib.get_status(job_id)
+            if job_status is None or job_status < job_lib.JobStatus.RUNNING:
+                # Wait for the controller process to finish setting up. It
+                # can be slow if a lot of cloud dependencies are being
+                # installed.
+                if (time.time() - start_time >
+                        constants.CONTROLLER_SETUP_TIMEOUT_SECONDS):
+                    with ux_utils.print_exception_no_traceback():
+                        raise RuntimeError(
+                            f'Failed to start the controller process for '
+                            f'the {noun} {service_name!r} within '
+                            f'{constants.CONTROLLER_SETUP_TIMEOUT_SECONDS}'
+                            f' seconds.')
+                # No need to check the service status as the controller process
+                # is still setting up.
+                time.sleep(1)
+                continue
 
         if not setup_completed:
             setup_completed = True
             # Reset the start time to wait for the service to be registered.
             start_time = time.time()
 
-        record = serve_state.get_service_from_name(service_name)
+        record = _get_service_status(service_name,
+                                     pool=pool,
+                                     with_replica_info=False)
         if record is not None:
             if job_id != record['controller_job_id']:
+                if pool:
+                    command_to_run = 'sky jobs pool apply --pool'
+                else:
+                    command_to_run = 'sky serve update'
                 with ux_utils.print_exception_no_traceback():
                     raise ValueError(
-                        f'The service {service_name!r} is already running. '
-                        'Please specify a different name for your service. '
-                        'To update an existing service, run: sky serve update '
-                        f'{service_name} <new-service-yaml>')
+                        f'The {noun} {service_name!r} is already running. '
+                        f'Please specify a different name for your {noun}. '
+                        f'To update an existing {noun}, run: {command_to_run}'
+                        f' {service_name} <new-{noun}-yaml>')
             lb_port = record['load_balancer_port']
             if lb_port is not None:
                 return message_utils.encode_payload(lb_port)
-        elif len(serve_state.get_services()) >= get_num_service_threshold():
-            with ux_utils.print_exception_no_traceback():
-                raise RuntimeError('Max number of services reached. '
-                                   'To spin up more services, please '
-                                   'tear down some existing services.')
+        else:
+            controller_log_path = os.path.expanduser(
+                generate_remote_controller_log_file_name(service_name))
+            if os.path.exists(controller_log_path):
+                with open(controller_log_path, 'r', encoding='utf-8') as f:
+                    log_content = f.read()
+                if (constants.MAX_NUMBER_OF_SERVICES_REACHED_ERROR
+                        in log_content):
+                    with ux_utils.print_exception_no_traceback():
+                        raise RuntimeError('Max number of services reached. '
+                                           'To spin up more services, please '
+                                           'tear down some existing services.')
         elapsed = time.time() - start_time
         if elapsed > constants.SERVICE_REGISTER_TIMEOUT_SECONDS:
             # Print the controller log to help user debug.
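
Note the structure of the wait loop above: one timeout budget covers controller setup, then `start_time` is reset so registration gets a fresh budget of its own. A minimal standalone sketch of this two-phase polling pattern (predicate names and timeouts are hypothetical):

import time

def wait_two_phase(setup_done, registered, setup_timeout=60.0,
                   register_timeout=60.0, poll_interval=1.0):
    """Poll `setup_done`, then `registered`, each with its own time budget."""
    start = time.time()
    phase_two = False
    while True:
        if not phase_two:
            if setup_done():
                phase_two = True
                start = time.time()  # Reset the budget for phase two.
            elif time.time() - start > setup_timeout:
                raise TimeoutError('controller setup timed out')
        else:
            if registered():
                return
            if time.time() - start > register_timeout:
                raise TimeoutError('service registration timed out')
        time.sleep(poll_interval)
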
@@ -754,12 +1083,16 @@ def load_service_initialization_result(payload: str) -> int:
     return message_utils.decode_payload(payload)
 
 
-def check_service_status_healthy(service_name: str) -> Optional[str]:
-    service_record = serve_state.get_service_from_name(service_name)
+def _check_service_status_healthy(service_name: str,
+                                  pool: bool) -> Optional[str]:
+    service_record = _get_service_status(service_name,
+                                         pool,
+                                         with_replica_info=False)
+    capnoun = 'Pool' if pool else 'Service'
     if service_record is None:
-        return f'Service {service_name!r} does not exist.'
+        return f'{capnoun} {service_name!r} does not exist.'
     if service_record['status'] == serve_state.ServiceStatus.CONTROLLER_INIT:
-        return (f'Service {service_name!r} is still initializing its '
+        return (f'{capnoun} {service_name!r} is still initializing its '
                 'controller. Please try again later.')
     return None
 
@@ -782,6 +1115,89 @@ def get_latest_version_with_min_replicas(
     return active_versions[-1] if active_versions else None
 
 
+def _process_line(
+        line: str,
+        cluster_name: str,
+        stop_on_eof: bool = False,
+        streamed_provision_log_paths: Optional[set] = None) -> Iterator[str]:
+    # The line might be directing users to view logs, like
+    # `✓ Cluster launched: new-http. View logs at: *.log`
+    # We should tail the detailed logs for the user.
+    def cluster_is_up() -> bool:
+        status = global_user_state.get_status_from_cluster_name(cluster_name)
+        return status == status_lib.ClusterStatus.UP
+
+    provision_api_log_prompt = re.match(_SKYPILOT_PROVISION_API_LOG_PATTERN,
+                                        line)
+    provision_log_cmd_prompt = re.match(_SKYPILOT_PROVISION_LOG_CMD_PATTERN,
+                                        line)
+    log_prompt = re.match(_SKYPILOT_LOG_PATTERN, line)
+
+    def _stream_provision_path(p: pathlib.Path) -> Iterator[str]:
+        # Check if this provision log has already been streamed, to avoid
+        # duplicate expansion. When a Kubernetes cluster needs to pull a Docker
+        # image, rich spinner updates can produce hundreds of lines matching
+        # _SKYPILOT_PROVISION_LOG_CMD_PATTERN (e.g., "Launching (1 pod(s)
+        # pending due to Pulling)... View logs: sky logs --provision ...").
+        # Without this check, the same provision log would be expanded hundreds
+        # of times, creating huge log files (30M+) and making users think the
+        # system is stuck in an infinite loop.
+        if streamed_provision_log_paths is not None:
+            resolved_path = str(p.resolve())
+            if resolved_path in streamed_provision_log_paths:
+                return
+            streamed_provision_log_paths.add(resolved_path)
+
+        try:
+            with open(p, 'r', newline='', encoding='utf-8') as f:
+                # Exit if >10s pass without new content, to avoid hanging
+                # while the cluster status remains INIT.
+                yield from log_utils.follow_logs(f,
+                                                 should_stop=cluster_is_up,
+                                                 stop_on_eof=stop_on_eof,
+                                                 idle_timeout_seconds=10)
+        except FileNotFoundError:
+            # Fall back cleanly if the hinted path doesn't exist.
+            yield line
+            yield (f'{colorama.Fore.YELLOW}{colorama.Style.BRIGHT}'
+                   f'Tried to expand log file {p} but it was not found. '
+                   f'Skipping...{colorama.Style.RESET_ALL}')
+            return
+
+    if provision_api_log_prompt is not None:
+        rel_path = provision_api_log_prompt.group(1)
+        nested_log_path = pathlib.Path(
+            skylet_constants.SKY_LOGS_DIRECTORY).expanduser().joinpath(
+                rel_path).resolve()
+        yield from _stream_provision_path(nested_log_path)
+        return
+
+    if provision_log_cmd_prompt is not None:
+        # Resolve the provision log via the cluster table first, then history.
+        log_path_str = global_user_state.get_cluster_provision_log_path(
+            cluster_name)
+        if not log_path_str:
+            log_path_str = (
+                global_user_state.get_cluster_history_provision_log_path(
+                    cluster_name))
+        if not log_path_str:
+            yield line
+            return
+        yield from _stream_provision_path(
+            pathlib.Path(log_path_str).expanduser().resolve())
+        return
+
+    if log_prompt is not None:
+        # For now we skip other logs (file sync logs) since we lack a
+        # utility to determine when these log files are finished writing.
+        # TODO(tian): We should not skip these logs, since there is a small
+        # chance that an error will happen in file sync. Need to find a
+        # better way to do this.
+        return
+
+    yield line
+
+
 def _follow_logs_with_provision_expanding(
     file: TextIO,
     cluster_name: str,
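
The dedup guard in `_stream_provision_path` is a small but important pattern: canonicalize each path with `resolve()` and record it in a set, so repeated log hints expand a file only once. A standalone sketch (the helper name `expand_once` is illustrative):

import pathlib
from typing import Iterator, Set

def expand_once(path: pathlib.Path, seen: Set[str]) -> Iterator[str]:
    """Yield a file's lines only the first time its resolved path is seen."""
    key = str(path.resolve())
    if key in seen:
        return  # Already expanded once; skip to avoid duplicate output.
    seen.add(key)
    with open(path, 'r', encoding='utf-8') as f:
        yield from f

seen_paths: Set[str] = set()
# Two differently-spelled hints to the same file expand it only once:
# list(expand_once(pathlib.Path('provision.log'), seen_paths))    # lines
# list(expand_once(pathlib.Path('./provision.log'), seen_paths))  # []
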
@@ -803,52 +1219,14 @@ def _follow_logs_with_provision_expanding(
     Yields:
         Log lines, including expanded content from referenced provision logs.
     """
-
-    def cluster_is_up() -> bool:
-        cluster_record = global_user_state.get_cluster_from_name(cluster_name)
-        if cluster_record is None:
-            return False
-        return cluster_record['status'] == status_lib.ClusterStatus.UP
+    streamed_provision_log_paths: set = set()
 
     def process_line(line: str) -> Iterator[str]:
-        # The line might be directing users to view logs, like
-        # `✓ Cluster launched: new-http. View logs at: *.log`
-        # We should tail the detailed logs for user.
-        provision_log_prompt = re.match(_SKYPILOT_PROVISION_LOG_PATTERN, line)
-        log_prompt = re.match(_SKYPILOT_LOG_PATTERN, line)
-
-        if provision_log_prompt is not None:
-            nested_log_path = os.path.expanduser(provision_log_prompt.group(1))
-
-            try:
-                with open(nested_log_path, 'r', newline='',
-                          encoding='utf-8') as f:
-                    # We still exit if more than 10 seconds without new content
-                    # to avoid any internal bug that causes the launch to fail
-                    # while cluster status remains INIT.
-                    yield from log_utils.follow_logs(f,
-                                                     should_stop=cluster_is_up,
-                                                     stop_on_eof=stop_on_eof,
-                                                     idle_timeout_seconds=10)
-            except FileNotFoundError:
-                yield line
-
-                yield (f'{colorama.Fore.YELLOW}{colorama.Style.BRIGHT}'
-                       f'Try to expand log file {nested_log_path} but not '
-                       f'found. Skipping...{colorama.Style.RESET_ALL}')
-                pass
-            return
-
-        if log_prompt is not None:
-            # Now we skip other logs (file sync logs) since we lack
-            # utility to determine when these log files are finished
-            # writing.
-            # TODO(tian): We should not skip these logs since there are
-            # small chance that error will happen in file sync. Need to
-            # find a better way to do this.
-            return
-
-        yield line
+        yield from _process_line(
+            line,
+            cluster_name,
+            stop_on_eof=stop_on_eof,
+            streamed_provision_log_paths=streamed_provision_log_paths)
 
     return log_utils.follow_logs(file,
                                  should_stop=should_stop,
@@ -857,24 +1235,62 @@ def _follow_logs_with_provision_expanding(
                                  idle_timeout_seconds=idle_timeout_seconds)
 
 
-def stream_replica_logs(service_name: str, replica_id: int,
-                        follow: bool) -> str:
-    msg = check_service_status_healthy(service_name)
+def _capped_follow_logs_with_provision_expanding(
+    log_list: List[str],
+    cluster_name: str,
+    *,
+    line_cap: int = 100,
+) -> Iterator[str]:
+    """Follows logs and expands any provision.log references found.
+
+    Args:
+        log_list: List of log lines to read from.
+        cluster_name: Name of the cluster being launched.
+        line_cap: Number of trailing lines to return.
+
+    Yields:
+        Log lines, including expanded content from referenced provision logs.
+    """
+    all_lines: Deque[str] = collections.deque(maxlen=line_cap)
+    streamed_provision_log_paths: set = set()
+
+    for line in log_list:
+        for processed in _process_line(
+                line=line,
+                cluster_name=cluster_name,
+                stop_on_eof=False,
+                streamed_provision_log_paths=streamed_provision_log_paths):
+            all_lines.append(processed)
+
+    yield from all_lines
+
+
+def stream_replica_logs(service_name: str, replica_id: int, follow: bool,
+                        tail: Optional[int], pool: bool) -> str:
+    msg = _check_service_status_healthy(service_name, pool=pool)
     if msg is not None:
         return msg
+    repnoun = 'worker' if pool else 'replica'
+    caprepnoun = repnoun.capitalize()
     print(f'{colorama.Fore.YELLOW}Start streaming logs for launching process '
-          f'of replica {replica_id}.{colorama.Style.RESET_ALL}')
-
+          f'of {repnoun} {replica_id}.{colorama.Style.RESET_ALL}')
     log_file_name = generate_replica_log_file_name(service_name, replica_id)
     if os.path.exists(log_file_name):
-        with open(log_file_name, 'r', encoding='utf-8') as f:
-            print(f.read(), flush=True)
+        if tail is not None:
+            lines = common_utils.read_last_n_lines(log_file_name, tail)
+            for line in lines:
+                if not line.endswith('\n'):
+                    line += '\n'
+                print(line, end='', flush=True)
+        else:
+            with open(log_file_name, 'r', encoding='utf-8') as f:
+                print(f.read(), flush=True)
         return ''
 
     launch_log_file_name = generate_replica_launch_log_file_name(
         service_name, replica_id)
     if not os.path.exists(launch_log_file_name):
-        return (f'{colorama.Fore.RED}Replica {replica_id} doesn\'t exist.'
+        return (f'{colorama.Fore.RED}{caprepnoun} {replica_id} doesn\'t exist.'
                 f'{colorama.Style.RESET_ALL}')
 
     replica_cluster_name = generate_replica_cluster_name(
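
The line cap relies on `collections.deque(maxlen=...)`, which evicts the oldest entries automatically, so after the loop only the last `line_cap` processed lines remain. A minimal illustration:

import collections

last_three = collections.deque(maxlen=3)
for i in range(10):
    last_three.append(i)   # Older items are evicted automatically.
print(list(last_three))    # [7, 8, 9]
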
@@ -891,42 +1307,89 @@ def stream_replica_logs(service_name: str, replica_id: int,
 
     replica_provisioned = (
         lambda: _get_replica_status() != serve_state.ReplicaStatus.PROVISIONING)
-    with open(launch_log_file_name, 'r', newline='', encoding='utf-8') as f:
-        for line in _follow_logs_with_provision_expanding(
-                f,
-                replica_cluster_name,
-                should_stop=replica_provisioned,
-                stop_on_eof=not follow,
-        ):
-            print(line, end='', flush=True)
+
+    # Handle launch logs based on the tail parameter.
+    final_lines_to_print = []
+    if tail is not None:
+        static_lines = common_utils.read_last_n_lines(launch_log_file_name,
+                                                      tail)
+        lines = list(
+            _capped_follow_logs_with_provision_expanding(
+                log_list=static_lines,
+                cluster_name=replica_cluster_name,
+                line_cap=tail,
+            ))
+        final_lines_to_print += lines
+    else:
+        with open(launch_log_file_name, 'r', newline='', encoding='utf-8') as f:
+            for line in _follow_logs_with_provision_expanding(
+                    f,
+                    replica_cluster_name,
+                    should_stop=replica_provisioned,
+                    stop_on_eof=not follow,
+            ):
+                print(line, end='', flush=True)
 
     if (not follow and
             _get_replica_status() == serve_state.ReplicaStatus.PROVISIONING):
         # Early exit if not following the logs.
+        if tail is not None:
+            for line in final_lines_to_print:
+                if not line.endswith('\n'):
+                    line += '\n'
+                print(line, end='', flush=True)
         return ''
 
     backend = backends.CloudVmRayBackend()
     handle = global_user_state.get_handle_from_cluster_name(
         replica_cluster_name)
     if handle is None:
+        if tail is not None:
+            for line in final_lines_to_print:
+                if not line.endswith('\n'):
+                    line += '\n'
+                print(line, end='', flush=True)
         return _FAILED_TO_FIND_REPLICA_MSG.format(replica_id=replica_id)
     assert isinstance(handle, backends.CloudVmRayResourceHandle), handle
 
     # Notify the user here to make sure they won't think the log is finished.
     print(f'{colorama.Fore.YELLOW}Start streaming logs for task job '
-          f'of replica {replica_id}...{colorama.Style.RESET_ALL}')
+          f'of {repnoun} {replica_id}...{colorama.Style.RESET_ALL}')
 
     # Always tail the latest logs, which represent user setup & run.
-    returncode = backend.tail_logs(handle, job_id=None, follow=follow)
-    if returncode != 0:
-        return (f'{colorama.Fore.RED}Failed to stream logs for replica '
-                f'{replica_id}.{colorama.Style.RESET_ALL}')
+    if tail is None:
+        returncode = backend.tail_logs(handle, job_id=None, follow=follow)
+        if returncode != 0:
+            return (f'{colorama.Fore.RED}Failed to stream logs for {repnoun} '
+                    f'{replica_id}.{colorama.Style.RESET_ALL}')
+    elif not follow and tail > 0:
+        final = backend.tail_logs(handle,
+                                  job_id=None,
+                                  follow=follow,
+                                  tail=tail,
+                                  stream_logs=False,
+                                  require_outputs=True,
+                                  process_stream=True)
+        if isinstance(final, int) or (final[0] != 0 and final[0] != 101):
+            if tail is not None:
+                for line in final_lines_to_print:
+                    if not line.endswith('\n'):
+                        line += '\n'
+                    print(line, end='', flush=True)
+            return (f'{colorama.Fore.RED}Failed to stream logs for {repnoun} '
+                    f'{replica_id}.{colorama.Style.RESET_ALL}')
+        final_lines_to_print += final[1].splitlines()
+        for line in final_lines_to_print[-tail:]:
+            if not line.endswith('\n'):
+                line += '\n'
+            print(line, end='', flush=True)
     return ''
 
 
 def stream_serve_process_logs(service_name: str, stream_controller: bool,
-                              follow: bool) -> str:
-    msg = check_service_status_healthy(service_name)
+                              follow: bool, tail: Optional[int],
+                              pool: bool) -> str:
+    msg = _check_service_status_healthy(service_name, pool)
     if msg is not None:
         return msg
     if stream_controller:
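
The tail-printing loops repeated above all follow one idiom: guarantee exactly one trailing newline, then print with `end=''` so output is neither double-spaced nor run together. A small helper capturing the idiom (a sketch only; the module deliberately inlines the loop):

from typing import Iterable

def print_lines(lines: Iterable[str]) -> None:
    """Print lines with exactly one trailing newline each."""
    for line in lines:
        if not line.endswith('\n'):
            line += '\n'
        print(line, end='', flush=True)

print_lines(['no newline yet', 'already has one\n'])
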
@@ -935,19 +1398,31 @@ def stream_serve_process_logs(service_name: str, stream_controller: bool,
         log_file = generate_remote_load_balancer_log_file_name(service_name)
 
     def _service_is_terminal() -> bool:
-        record = serve_state.get_service_from_name(service_name)
+        record = _get_service_status(service_name,
+                                     pool,
+                                     with_replica_info=False)
         if record is None:
             return True
         return record['status'] in serve_state.ServiceStatus.failed_statuses()
 
-    with open(os.path.expanduser(log_file), 'r', newline='',
-              encoding='utf-8') as f:
-        for line in log_utils.follow_logs(
-                f,
-                should_stop=_service_is_terminal,
-                stop_on_eof=not follow,
-        ):
+    if tail is not None:
+        lines = common_utils.read_last_n_lines(os.path.expanduser(log_file),
+                                               tail)
+        for line in lines:
+            if not line.endswith('\n'):
+                line += '\n'
             print(line, end='', flush=True)
+    else:
+        with open(os.path.expanduser(log_file),
+                  'r',
+                  newline='',
+                  encoding='utf-8') as f:
+            for line in log_utils.follow_logs(
+                    f,
+                    should_stop=_service_is_terminal,
+                    stop_on_eof=not follow,
+            ):
+                print(line, end='', flush=True)
     return ''
 
 
@@ -965,18 +1440,25 @@ def _get_replicas(service_record: Dict[str, Any]) -> str:
     return f'{ready_replica_num}/{total_replica_num}'
 
 
-def format_service_table(service_records: List[Dict[str, Any]],
-                         show_all: bool) -> str:
+def format_service_table(service_records: List[Dict[str, Any]], show_all: bool,
+                         pool: bool) -> str:
+    noun = 'pool' if pool else 'service'
     if not service_records:
-        return 'No existing services.'
+        return f'No existing {noun}s.'
 
     service_columns = [
-        'NAME', 'VERSION', 'UPTIME', 'STATUS', 'REPLICAS', 'ENDPOINT'
+        'NAME', 'VERSION', 'UPTIME', 'STATUS',
+        'WORKERS' if pool else 'REPLICAS'
     ]
+    if not pool:
+        service_columns.append('ENDPOINT')
     if show_all:
         service_columns.extend([
             'AUTOSCALING_POLICY', 'LOAD_BALANCING_POLICY', 'REQUESTED_RESOURCES'
         ])
+        if pool:
+            # Remove the load balancing policy column for pools.
+            service_columns.pop(-2)
     service_table = log_utils.create_table(service_columns)
 
     replica_infos: List[Dict[str, Any]] = []
@@ -1007,37 +1489,44 @@ def format_service_table(service_records: List[Dict[str, Any]],
             uptime,
             status_str,
             replicas,
-            endpoint,
         ]
+        if not pool:
+            service_values.append(endpoint)
         if show_all:
             service_values.extend(
                 [policy, load_balancing_policy, requested_resources_str])
+            if pool:
+                service_values.pop(-2)
         service_table.add_row(service_values)
 
-    replica_table = _format_replica_table(replica_infos, show_all)
+    replica_table = _format_replica_table(replica_infos, show_all, pool)
+    replica_noun = 'Pool Workers' if pool else 'Service Replicas'
     return (f'{service_table}\n'
             f'\n{colorama.Fore.CYAN}{colorama.Style.BRIGHT}'
-            f'Service Replicas{colorama.Style.RESET_ALL}\n'
+            f'{replica_noun}{colorama.Style.RESET_ALL}\n'
             f'{replica_table}')
 
 
-def _format_replica_table(replica_records: List[Dict[str, Any]],
-                          show_all: bool) -> str:
+def _format_replica_table(replica_records: List[Dict[str, Any]], show_all: bool,
+                          pool: bool) -> str:
+    noun = 'worker' if pool else 'replica'
     if not replica_records:
-        return 'No existing replicas.'
+        return f'No existing {noun}s.'
 
     replica_columns = [
-        'SERVICE_NAME', 'ID', 'VERSION', 'ENDPOINT', 'LAUNCHED', 'RESOURCES',
-        'STATUS', 'REGION'
+        'POOL_NAME' if pool else 'SERVICE_NAME', 'ID', 'VERSION', 'ENDPOINT',
+        'LAUNCHED', 'INFRA', 'RESOURCES', 'STATUS'
     ]
-    if show_all:
-        replica_columns.append('ZONE')
+    if pool:
+        replica_columns.append('USED_BY')
+        # Remove the endpoint column for pool workers.
+        replica_columns.pop(3)
     replica_table = log_utils.create_table(replica_columns)
 
     truncate_hint = ''
     if not show_all:
         if len(replica_records) > _REPLICA_TRUNC_NUM:
-            truncate_hint = '\n... (use --all to show all replicas)'
+            truncate_hint = f'\n... (use --all to show all {noun}s)'
         replica_records = replica_records[:_REPLICA_TRUNC_NUM]
 
     for record in replica_records:
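
The pool-aware tables are built by conditionally appending and popping columns, and the value rows must mirror the column order exactly. A standalone sketch of the column logic under those assumptions (plain lists, no SkyPilot helpers):

def build_columns(pool: bool, show_all: bool) -> list:
    columns = ['NAME', 'VERSION', 'UPTIME', 'STATUS',
               'WORKERS' if pool else 'REPLICAS']
    if not pool:
        columns.append('ENDPOINT')
    if show_all:
        columns.extend(['AUTOSCALING_POLICY', 'LOAD_BALANCING_POLICY',
                        'REQUESTED_RESOURCES'])
        if pool:
            # Pools have no load balancer, so drop that column.
            columns.pop(-2)
    return columns

# Pools: no ENDPOINT column, and (with --all) no load-balancing policy.
assert build_columns(pool=True, show_all=False) == [
    'NAME', 'VERSION', 'UPTIME', 'STATUS', 'WORKERS']
assert 'LOAD_BALANCING_POLICY' not in build_columns(pool=True, show_all=True)
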
@@ -1047,21 +1536,26 @@ def _format_replica_table(replica_records: List[Dict[str, Any]],
         version = (record['version'] if 'version' in record else '-')
         replica_endpoint = endpoint if endpoint else '-'
         launched_at = log_utils.readable_time_duration(record['launched_at'])
+        infra = '-'
         resources_str = '-'
         replica_status = record['status']
         status_str = replica_status.colored_str()
-        region = '-'
-        zone = '-'
+        used_by = record.get('used_by', None)
+        used_by_str = str(used_by) if used_by is not None else '-'
 
         replica_handle: Optional['backends.CloudVmRayResourceHandle'] = record[
             'handle']
         if replica_handle is not None:
-            resources_str = resources_utils.get_readable_resources_repr(
-                replica_handle, simplify=not show_all)
-            if replica_handle.launched_resources.region is not None:
-                region = replica_handle.launched_resources.region
-            if replica_handle.launched_resources.zone is not None:
-                zone = replica_handle.launched_resources.zone
+            infra = replica_handle.launched_resources.infra.formatted_str()
+            simplified = not show_all
+            resources_str_simple, resources_str_full = (
+                resources_utils.get_readable_resources_repr(
+                    replica_handle, simplified_only=simplified))
+            if simplified:
+                resources_str = resources_str_simple
+            else:
+                assert resources_str_full is not None
+                resources_str = resources_str_full
 
         replica_values = [
             service_name,
@@ -1069,18 +1563,20 @@ def _format_replica_table(replica_records: List[Dict[str, Any]],
             version,
             replica_endpoint,
             launched_at,
+            infra,
             resources_str,
             status_str,
-            region,
         ]
-        if show_all:
-            replica_values.append(zone)
+        if pool:
+            replica_values.append(used_by_str)
+            replica_values.pop(3)
         replica_table.add_row(replica_values)
 
     return f'{replica_table}{truncate_hint}'
 
 
 # =========================== CodeGen for Sky Serve ===========================
+# TODO (kyuds): deprecate and remove serve codegen entirely.
 
 
 # TODO(tian): Use REST API instead of SSH in the future. This codegen pattern
@@ -1099,13 +1595,16 @@ class ServeCodeGen:
         'from sky.serve import serve_state',
         'from sky.serve import serve_utils',
         'from sky.serve import constants',
+        'serve_version = constants.SERVE_VERSION',
     ]
 
     @classmethod
-    def get_service_status(cls, service_names: Optional[List[str]]) -> str:
+    def get_service_status(cls, service_names: Optional[List[str]],
+                           pool: bool) -> str:
         code = [
-            f'msg = serve_utils.get_service_status_encoded({service_names!r})',
-            'print(msg, end="", flush=True)'
+            f'kwargs={{}} if serve_version < 3 else {{"pool": {pool}}}',
+            f'msg = serve_utils.get_service_status_encoded({service_names!r}, '
+            '**kwargs)', 'print(msg, end="", flush=True)'
         ]
         return cls._build(code)
 
@@ -1118,11 +1617,12 @@ class ServeCodeGen:
         return cls._build(code)
 
     @classmethod
-    def terminate_services(cls, service_names: Optional[List[str]],
-                           purge: bool) -> str:
+    def terminate_services(cls, service_names: Optional[List[str]], purge: bool,
+                           pool: bool) -> str:
         code = [
+            f'kwargs={{}} if serve_version < 3 else {{"pool": {pool}}}',
             f'msg = serve_utils.terminate_services({service_names!r}, '
-            f'purge={purge})', 'print(msg, end="", flush=True)'
+            f'purge={purge}, **kwargs)', 'print(msg, end="", flush=True)'
        ]
         return cls._build(code)
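
The `serve_version < N` guards are a backward-compatibility gate: the generated snippet executes on a remote controller whose `serve_utils` may predate the new `pool` keyword, so the kwarg is only passed when the remote SERVE_VERSION is new enough. A standalone sketch of the idea (version number and function are illustrative):

SERVE_VERSION = 3  # Imagine this was read on the remote controller.

def get_status(names, pool=False):  # The newer, pool-aware signature.
    return f'status(names={names}, pool={pool})'

# Older remotes would raise TypeError on `pool=`, so gate it on the version:
kwargs = {} if SERVE_VERSION < 3 else {'pool': True}
print(get_status(['svc-a'], **kwargs))
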
@@ -1139,29 +1639,48 @@ class ServeCodeGen:
         return cls._build(code)
 
     @classmethod
-    def wait_service_registration(cls, service_name: str, job_id: int) -> str:
+    def wait_service_registration(cls, service_name: str, job_id: int,
+                                  pool: bool) -> str:
         code = [
+            f'kwargs={{}} if serve_version < 4 else {{"pool": {pool}}}',
             'msg = serve_utils.wait_service_registration('
-            f'{service_name!r}, {job_id})', 'print(msg, end="", flush=True)'
+            f'{service_name!r}, {job_id}, **kwargs)',
+            'print(msg, end="", flush=True)'
         ]
         return cls._build(code)
 
     @classmethod
     def stream_replica_logs(cls, service_name: str, replica_id: int,
-                            follow: bool) -> str:
+                            follow: bool, tail: Optional[int],
+                            pool: bool) -> str:
         code = [
+            f'kwargs={{}} if serve_version < 5 else {{"pool": {pool}}}',
             'msg = serve_utils.stream_replica_logs('
-            f'{service_name!r}, {replica_id!r}, follow={follow})',
-            'print(msg, flush=True)'
+            f'{service_name!r}, {replica_id!r}, follow={follow}, tail={tail}, '
+            '**kwargs)', 'print(msg, flush=True)'
         ]
         return cls._build(code)
 
     @classmethod
     def stream_serve_process_logs(cls, service_name: str,
-                                  stream_controller: bool, follow: bool) -> str:
+                                  stream_controller: bool, follow: bool,
+                                  tail: Optional[int], pool: bool) -> str:
         code = [
+            f'kwargs={{}} if serve_version < 5 else {{"pool": {pool}}}',
             f'msg = serve_utils.stream_serve_process_logs({service_name!r}, '
-            f'{stream_controller}, follow={follow})', 'print(msg, flush=True)'
+            f'{stream_controller}, follow={follow}, tail={tail}, **kwargs)',
+            'print(msg, flush=True)'
+        ]
+        return cls._build(code)
+
+    @classmethod
+    def update_service(cls, service_name: str, version: int, mode: str,
+                       pool: bool) -> str:
+        code = [
+            f'kwargs={{}} if serve_version < 3 else {{"pool": {pool}}}',
+            f'msg = serve_utils.update_service_encoded({service_name!r}, '
+            f'{version}, mode={mode!r}, **kwargs)',
+            'print(msg, end="", flush=True)',
         ]
         return cls._build(code)
 
@@ -1175,12 +1694,3 @@ class ServeCodeGen:
             f'"{common_utils.get_user_hash()}"; '
             f'{skylet_constants.SKY_PYTHON_CMD} '
             f'-u -c {shlex.quote(generated_code)}')
-
-    @classmethod
-    def update_service(cls, service_name: str, version: int, mode: str) -> str:
-        code = [
-            f'msg = serve_utils.update_service_encoded({service_name!r}, '
-            f'{version}, mode={mode!r})',
-            'print(msg, end="", flush=True)',
-        ]
-        return cls._build(code)
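
For context on `_build`: the codegen pattern assembles small Python statements into one program and ships it through `python -u -c '<code>'` on the remote side, with `shlex.quote` protecting the payload from the shell. A minimal standalone sketch of that mechanism (assumes a `python` executable on PATH; the '; '-joining and statements are illustrative, not the exact `_build` internals):

import shlex
import subprocess

# Join generated statements into a single '-c' program, quoted for the shell.
statements = ['import math', 'print(math.pi)']
generated_code = '; '.join(statements)
cmd = f'python -u -c {shlex.quote(generated_code)}'
result = subprocess.run(cmd, shell=True, capture_output=True, text=True)
print(result.stdout, end='')  # -> 3.141592653589793
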