skypilot-nightly 1.0.0.dev20250509__py3-none-any.whl → 1.0.0.dev20251107__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Files changed (512)
  1. sky/__init__.py +22 -6
  2. sky/adaptors/aws.py +25 -7
  3. sky/adaptors/common.py +24 -1
  4. sky/adaptors/coreweave.py +278 -0
  5. sky/adaptors/do.py +8 -2
  6. sky/adaptors/hyperbolic.py +8 -0
  7. sky/adaptors/kubernetes.py +149 -18
  8. sky/adaptors/nebius.py +170 -17
  9. sky/adaptors/primeintellect.py +1 -0
  10. sky/adaptors/runpod.py +68 -0
  11. sky/adaptors/seeweb.py +167 -0
  12. sky/adaptors/shadeform.py +89 -0
  13. sky/admin_policy.py +187 -4
  14. sky/authentication.py +179 -225
  15. sky/backends/__init__.py +4 -2
  16. sky/backends/backend.py +22 -9
  17. sky/backends/backend_utils.py +1299 -380
  18. sky/backends/cloud_vm_ray_backend.py +1715 -518
  19. sky/backends/docker_utils.py +1 -1
  20. sky/backends/local_docker_backend.py +11 -6
  21. sky/backends/wheel_utils.py +37 -9
  22. sky/{clouds/service_catalog → catalog}/__init__.py +21 -19
  23. sky/{clouds/service_catalog → catalog}/aws_catalog.py +27 -8
  24. sky/{clouds/service_catalog → catalog}/azure_catalog.py +10 -7
  25. sky/{clouds/service_catalog → catalog}/common.py +89 -48
  26. sky/{clouds/service_catalog → catalog}/cudo_catalog.py +8 -5
  27. sky/{clouds/service_catalog → catalog}/data_fetchers/analyze.py +1 -1
  28. sky/{clouds/service_catalog → catalog}/data_fetchers/fetch_aws.py +30 -40
  29. sky/{clouds/service_catalog → catalog}/data_fetchers/fetch_cudo.py +38 -38
  30. sky/{clouds/service_catalog → catalog}/data_fetchers/fetch_gcp.py +42 -15
  31. sky/catalog/data_fetchers/fetch_hyperbolic.py +136 -0
  32. sky/{clouds/service_catalog → catalog}/data_fetchers/fetch_lambda_cloud.py +1 -0
  33. sky/catalog/data_fetchers/fetch_nebius.py +335 -0
  34. sky/catalog/data_fetchers/fetch_runpod.py +698 -0
  35. sky/catalog/data_fetchers/fetch_seeweb.py +329 -0
  36. sky/catalog/data_fetchers/fetch_shadeform.py +142 -0
  37. sky/{clouds/service_catalog → catalog}/data_fetchers/fetch_vast.py +1 -1
  38. sky/{clouds/service_catalog → catalog}/data_fetchers/fetch_vsphere.py +1 -1
  39. sky/{clouds/service_catalog → catalog}/do_catalog.py +5 -2
  40. sky/{clouds/service_catalog → catalog}/fluidstack_catalog.py +6 -3
  41. sky/{clouds/service_catalog → catalog}/gcp_catalog.py +41 -15
  42. sky/catalog/hyperbolic_catalog.py +136 -0
  43. sky/{clouds/service_catalog → catalog}/ibm_catalog.py +9 -6
  44. sky/{clouds/service_catalog → catalog}/kubernetes_catalog.py +36 -24
  45. sky/{clouds/service_catalog → catalog}/lambda_catalog.py +9 -6
  46. sky/{clouds/service_catalog → catalog}/nebius_catalog.py +9 -7
  47. sky/{clouds/service_catalog → catalog}/oci_catalog.py +9 -6
  48. sky/{clouds/service_catalog → catalog}/paperspace_catalog.py +5 -2
  49. sky/catalog/primeintellect_catalog.py +95 -0
  50. sky/{clouds/service_catalog → catalog}/runpod_catalog.py +11 -4
  51. sky/{clouds/service_catalog → catalog}/scp_catalog.py +9 -6
  52. sky/catalog/seeweb_catalog.py +184 -0
  53. sky/catalog/shadeform_catalog.py +165 -0
  54. sky/catalog/ssh_catalog.py +167 -0
  55. sky/{clouds/service_catalog → catalog}/vast_catalog.py +6 -3
  56. sky/{clouds/service_catalog → catalog}/vsphere_catalog.py +5 -2
  57. sky/check.py +491 -203
  58. sky/cli.py +5 -6005
  59. sky/client/{cli.py → cli/command.py} +2477 -1885
  60. sky/client/cli/deprecation_utils.py +99 -0
  61. sky/client/cli/flags.py +359 -0
  62. sky/client/cli/table_utils.py +320 -0
  63. sky/client/common.py +70 -32
  64. sky/client/oauth.py +82 -0
  65. sky/client/sdk.py +1203 -297
  66. sky/client/sdk_async.py +833 -0
  67. sky/client/service_account_auth.py +47 -0
  68. sky/cloud_stores.py +73 -0
  69. sky/clouds/__init__.py +13 -0
  70. sky/clouds/aws.py +358 -93
  71. sky/clouds/azure.py +105 -83
  72. sky/clouds/cloud.py +127 -36
  73. sky/clouds/cudo.py +68 -50
  74. sky/clouds/do.py +66 -48
  75. sky/clouds/fluidstack.py +63 -44
  76. sky/clouds/gcp.py +339 -110
  77. sky/clouds/hyperbolic.py +293 -0
  78. sky/clouds/ibm.py +70 -49
  79. sky/clouds/kubernetes.py +563 -162
  80. sky/clouds/lambda_cloud.py +74 -54
  81. sky/clouds/nebius.py +206 -80
  82. sky/clouds/oci.py +88 -66
  83. sky/clouds/paperspace.py +61 -44
  84. sky/clouds/primeintellect.py +317 -0
  85. sky/clouds/runpod.py +164 -74
  86. sky/clouds/scp.py +89 -83
  87. sky/clouds/seeweb.py +466 -0
  88. sky/clouds/shadeform.py +400 -0
  89. sky/clouds/ssh.py +263 -0
  90. sky/clouds/utils/aws_utils.py +10 -4
  91. sky/clouds/utils/gcp_utils.py +87 -11
  92. sky/clouds/utils/oci_utils.py +38 -14
  93. sky/clouds/utils/scp_utils.py +177 -124
  94. sky/clouds/vast.py +99 -77
  95. sky/clouds/vsphere.py +51 -40
  96. sky/core.py +349 -139
  97. sky/dag.py +15 -0
  98. sky/dashboard/out/404.html +1 -1
  99. sky/dashboard/out/_next/static/chunks/1141-e6aa9ab418717c59.js +11 -0
  100. sky/dashboard/out/_next/static/chunks/1272-1ef0bf0237faccdb.js +1 -0
  101. sky/dashboard/out/_next/static/chunks/1871-74503c8e80fd253b.js +6 -0
  102. sky/dashboard/out/_next/static/chunks/2260-7703229c33c5ebd5.js +1 -0
  103. sky/dashboard/out/_next/static/chunks/2350.fab69e61bac57b23.js +1 -0
  104. sky/dashboard/out/_next/static/chunks/2369.fc20f0c2c8ed9fe7.js +15 -0
  105. sky/dashboard/out/_next/static/chunks/2755.fff53c4a3fcae910.js +26 -0
  106. sky/dashboard/out/_next/static/chunks/3294.72362fa129305b19.js +1 -0
  107. sky/dashboard/out/_next/static/chunks/3785.ad6adaa2a0fa9768.js +1 -0
  108. sky/dashboard/out/_next/static/chunks/3850-ff4a9a69d978632b.js +1 -0
  109. sky/dashboard/out/_next/static/chunks/3937.210053269f121201.js +1 -0
  110. sky/dashboard/out/_next/static/chunks/4725.a830b5c9e7867c92.js +1 -0
  111. sky/dashboard/out/_next/static/chunks/4937.a2baa2df5572a276.js +15 -0
  112. sky/dashboard/out/_next/static/chunks/5739-d67458fcb1386c92.js +8 -0
  113. sky/dashboard/out/_next/static/chunks/6130-2be46d70a38f1e82.js +1 -0
  114. sky/dashboard/out/_next/static/chunks/616-3d59f75e2ccf9321.js +39 -0
  115. sky/dashboard/out/_next/static/chunks/6212-7bd06f60ba693125.js +13 -0
  116. sky/dashboard/out/_next/static/chunks/6601-06114c982db410b6.js +1 -0
  117. sky/dashboard/out/_next/static/chunks/6856-ef8ba11f96d8c4a3.js +1 -0
  118. sky/dashboard/out/_next/static/chunks/6989-01359c57e018caa4.js +1 -0
  119. sky/dashboard/out/_next/static/chunks/6990-32b6e2d3822301fa.js +1 -0
  120. sky/dashboard/out/_next/static/chunks/7359-c8d04e06886000b3.js +30 -0
  121. sky/dashboard/out/_next/static/chunks/7411-b15471acd2cba716.js +41 -0
  122. sky/dashboard/out/_next/static/chunks/7615-3301e838e5f25772.js +1 -0
  123. sky/dashboard/out/_next/static/chunks/8640.5b9475a2d18c5416.js +16 -0
  124. sky/dashboard/out/_next/static/chunks/8969-1e4613c651bf4051.js +1 -0
  125. sky/dashboard/out/_next/static/chunks/9025.fa408f3242e9028d.js +6 -0
  126. sky/dashboard/out/_next/static/chunks/9353-cff34f7e773b2e2b.js +1 -0
  127. sky/dashboard/out/_next/static/chunks/9360.7310982cf5a0dc79.js +31 -0
  128. sky/dashboard/out/_next/static/chunks/9847.3aaca6bb33455140.js +30 -0
  129. sky/dashboard/out/_next/static/chunks/fd9d1056-86323a29a8f7e46a.js +1 -0
  130. sky/dashboard/out/_next/static/chunks/framework-cf60a09ccd051a10.js +33 -0
  131. sky/dashboard/out/_next/static/chunks/main-app-587214043926b3cc.js +1 -0
  132. sky/dashboard/out/_next/static/chunks/main-f15ccb73239a3bf1.js +1 -0
  133. sky/dashboard/out/_next/static/chunks/pages/_app-bde01e4a2beec258.js +34 -0
  134. sky/dashboard/out/_next/static/chunks/pages/_error-c66a4e8afc46f17b.js +1 -0
  135. sky/dashboard/out/_next/static/chunks/pages/clusters/[cluster]/[job]-c736ead69c2d86ec.js +16 -0
  136. sky/dashboard/out/_next/static/chunks/pages/clusters/[cluster]-a37d2063af475a1c.js +1 -0
  137. sky/dashboard/out/_next/static/chunks/pages/clusters-d44859594e6f8064.js +1 -0
  138. sky/dashboard/out/_next/static/chunks/pages/config-dfb9bf07b13045f4.js +1 -0
  139. sky/dashboard/out/_next/static/chunks/pages/index-444f1804401f04ea.js +1 -0
  140. sky/dashboard/out/_next/static/chunks/pages/infra/[context]-c0b5935149902e6f.js +1 -0
  141. sky/dashboard/out/_next/static/chunks/pages/infra-aed0ea19df7cf961.js +1 -0
  142. sky/dashboard/out/_next/static/chunks/pages/jobs/[job]-5796e8d6aea291a0.js +16 -0
  143. sky/dashboard/out/_next/static/chunks/pages/jobs/pools/[pool]-6edeb7d06032adfc.js +21 -0
  144. sky/dashboard/out/_next/static/chunks/pages/jobs-479dde13399cf270.js +1 -0
  145. sky/dashboard/out/_next/static/chunks/pages/users-5ab3b907622cf0fe.js +1 -0
  146. sky/dashboard/out/_next/static/chunks/pages/volumes-b84b948ff357c43e.js +1 -0
  147. sky/dashboard/out/_next/static/chunks/pages/workspace/new-3f88a1c7e86a3f86.js +1 -0
  148. sky/dashboard/out/_next/static/chunks/pages/workspaces/[name]-c5a3eeee1c218af1.js +1 -0
  149. sky/dashboard/out/_next/static/chunks/pages/workspaces-22b23febb3e89ce1.js +1 -0
  150. sky/dashboard/out/_next/static/chunks/webpack-2679be77fc08a2f8.js +1 -0
  151. sky/dashboard/out/_next/static/css/0748ce22df867032.css +3 -0
  152. sky/dashboard/out/_next/static/zB0ed6ge_W1MDszVHhijS/_buildManifest.js +1 -0
  153. sky/dashboard/out/clusters/[cluster]/[job].html +1 -1
  154. sky/dashboard/out/clusters/[cluster].html +1 -1
  155. sky/dashboard/out/clusters.html +1 -1
  156. sky/dashboard/out/config.html +1 -0
  157. sky/dashboard/out/index.html +1 -1
  158. sky/dashboard/out/infra/[context].html +1 -0
  159. sky/dashboard/out/infra.html +1 -0
  160. sky/dashboard/out/jobs/[job].html +1 -1
  161. sky/dashboard/out/jobs/pools/[pool].html +1 -0
  162. sky/dashboard/out/jobs.html +1 -1
  163. sky/dashboard/out/users.html +1 -0
  164. sky/dashboard/out/volumes.html +1 -0
  165. sky/dashboard/out/workspace/new.html +1 -0
  166. sky/dashboard/out/workspaces/[name].html +1 -0
  167. sky/dashboard/out/workspaces.html +1 -0
  168. sky/data/data_utils.py +137 -1
  169. sky/data/mounting_utils.py +269 -84
  170. sky/data/storage.py +1451 -1807
  171. sky/data/storage_utils.py +43 -57
  172. sky/exceptions.py +132 -2
  173. sky/execution.py +206 -63
  174. sky/global_user_state.py +2374 -586
  175. sky/jobs/__init__.py +5 -0
  176. sky/jobs/client/sdk.py +242 -65
  177. sky/jobs/client/sdk_async.py +143 -0
  178. sky/jobs/constants.py +9 -8
  179. sky/jobs/controller.py +839 -277
  180. sky/jobs/file_content_utils.py +80 -0
  181. sky/jobs/log_gc.py +201 -0
  182. sky/jobs/recovery_strategy.py +398 -152
  183. sky/jobs/scheduler.py +315 -189
  184. sky/jobs/server/core.py +829 -255
  185. sky/jobs/server/server.py +156 -115
  186. sky/jobs/server/utils.py +136 -0
  187. sky/jobs/state.py +2092 -701
  188. sky/jobs/utils.py +1242 -160
  189. sky/logs/__init__.py +21 -0
  190. sky/logs/agent.py +108 -0
  191. sky/logs/aws.py +243 -0
  192. sky/logs/gcp.py +91 -0
  193. sky/metrics/__init__.py +0 -0
  194. sky/metrics/utils.py +443 -0
  195. sky/models.py +78 -1
  196. sky/optimizer.py +164 -70
  197. sky/provision/__init__.py +90 -4
  198. sky/provision/aws/config.py +147 -26
  199. sky/provision/aws/instance.py +135 -50
  200. sky/provision/azure/instance.py +10 -5
  201. sky/provision/common.py +13 -1
  202. sky/provision/cudo/cudo_machine_type.py +1 -1
  203. sky/provision/cudo/cudo_utils.py +14 -8
  204. sky/provision/cudo/cudo_wrapper.py +72 -71
  205. sky/provision/cudo/instance.py +10 -6
  206. sky/provision/do/instance.py +10 -6
  207. sky/provision/do/utils.py +4 -3
  208. sky/provision/docker_utils.py +114 -23
  209. sky/provision/fluidstack/instance.py +13 -8
  210. sky/provision/gcp/__init__.py +1 -0
  211. sky/provision/gcp/config.py +301 -19
  212. sky/provision/gcp/constants.py +218 -0
  213. sky/provision/gcp/instance.py +36 -8
  214. sky/provision/gcp/instance_utils.py +18 -4
  215. sky/provision/gcp/volume_utils.py +247 -0
  216. sky/provision/hyperbolic/__init__.py +12 -0
  217. sky/provision/hyperbolic/config.py +10 -0
  218. sky/provision/hyperbolic/instance.py +437 -0
  219. sky/provision/hyperbolic/utils.py +373 -0
  220. sky/provision/instance_setup.py +93 -14
  221. sky/provision/kubernetes/__init__.py +5 -0
  222. sky/provision/kubernetes/config.py +9 -52
  223. sky/provision/kubernetes/constants.py +17 -0
  224. sky/provision/kubernetes/instance.py +789 -247
  225. sky/provision/kubernetes/manifests/fusermount-server-daemonset.yaml +1 -2
  226. sky/provision/kubernetes/network.py +27 -17
  227. sky/provision/kubernetes/network_utils.py +40 -43
  228. sky/provision/kubernetes/utils.py +1192 -531
  229. sky/provision/kubernetes/volume.py +282 -0
  230. sky/provision/lambda_cloud/instance.py +22 -16
  231. sky/provision/nebius/constants.py +50 -0
  232. sky/provision/nebius/instance.py +19 -6
  233. sky/provision/nebius/utils.py +196 -91
  234. sky/provision/oci/instance.py +10 -5
  235. sky/provision/paperspace/instance.py +10 -7
  236. sky/provision/paperspace/utils.py +1 -1
  237. sky/provision/primeintellect/__init__.py +10 -0
  238. sky/provision/primeintellect/config.py +11 -0
  239. sky/provision/primeintellect/instance.py +454 -0
  240. sky/provision/primeintellect/utils.py +398 -0
  241. sky/provision/provisioner.py +110 -36
  242. sky/provision/runpod/__init__.py +5 -0
  243. sky/provision/runpod/instance.py +27 -6
  244. sky/provision/runpod/utils.py +51 -18
  245. sky/provision/runpod/volume.py +180 -0
  246. sky/provision/scp/__init__.py +15 -0
  247. sky/provision/scp/config.py +93 -0
  248. sky/provision/scp/instance.py +531 -0
  249. sky/provision/seeweb/__init__.py +11 -0
  250. sky/provision/seeweb/config.py +13 -0
  251. sky/provision/seeweb/instance.py +807 -0
  252. sky/provision/shadeform/__init__.py +11 -0
  253. sky/provision/shadeform/config.py +12 -0
  254. sky/provision/shadeform/instance.py +351 -0
  255. sky/provision/shadeform/shadeform_utils.py +83 -0
  256. sky/provision/ssh/__init__.py +18 -0
  257. sky/provision/vast/instance.py +13 -8
  258. sky/provision/vast/utils.py +10 -7
  259. sky/provision/vsphere/common/vim_utils.py +1 -2
  260. sky/provision/vsphere/instance.py +15 -10
  261. sky/provision/vsphere/vsphere_utils.py +9 -19
  262. sky/py.typed +0 -0
  263. sky/resources.py +844 -118
  264. sky/schemas/__init__.py +0 -0
  265. sky/schemas/api/__init__.py +0 -0
  266. sky/schemas/api/responses.py +225 -0
  267. sky/schemas/db/README +4 -0
  268. sky/schemas/db/env.py +90 -0
  269. sky/schemas/db/global_user_state/001_initial_schema.py +124 -0
  270. sky/schemas/db/global_user_state/002_add_workspace_to_cluster_history.py +35 -0
  271. sky/schemas/db/global_user_state/003_fix_initial_revision.py +61 -0
  272. sky/schemas/db/global_user_state/004_is_managed.py +34 -0
  273. sky/schemas/db/global_user_state/005_cluster_event.py +32 -0
  274. sky/schemas/db/global_user_state/006_provision_log.py +41 -0
  275. sky/schemas/db/global_user_state/007_cluster_event_request_id.py +34 -0
  276. sky/schemas/db/global_user_state/008_skylet_ssh_tunnel_metadata.py +34 -0
  277. sky/schemas/db/global_user_state/009_last_activity_and_launched_at.py +89 -0
  278. sky/schemas/db/global_user_state/010_save_ssh_key.py +66 -0
  279. sky/schemas/db/script.py.mako +28 -0
  280. sky/schemas/db/serve_state/001_initial_schema.py +67 -0
  281. sky/schemas/db/skypilot_config/001_initial_schema.py +30 -0
  282. sky/schemas/db/spot_jobs/001_initial_schema.py +97 -0
  283. sky/schemas/db/spot_jobs/002_cluster_pool.py +42 -0
  284. sky/schemas/db/spot_jobs/003_pool_hash.py +34 -0
  285. sky/schemas/db/spot_jobs/004_job_file_contents.py +42 -0
  286. sky/schemas/db/spot_jobs/005_logs_gc.py +38 -0
  287. sky/schemas/generated/__init__.py +0 -0
  288. sky/schemas/generated/autostopv1_pb2.py +36 -0
  289. sky/schemas/generated/autostopv1_pb2.pyi +43 -0
  290. sky/schemas/generated/autostopv1_pb2_grpc.py +146 -0
  291. sky/schemas/generated/jobsv1_pb2.py +86 -0
  292. sky/schemas/generated/jobsv1_pb2.pyi +254 -0
  293. sky/schemas/generated/jobsv1_pb2_grpc.py +542 -0
  294. sky/schemas/generated/managed_jobsv1_pb2.py +74 -0
  295. sky/schemas/generated/managed_jobsv1_pb2.pyi +278 -0
  296. sky/schemas/generated/managed_jobsv1_pb2_grpc.py +278 -0
  297. sky/schemas/generated/servev1_pb2.py +58 -0
  298. sky/schemas/generated/servev1_pb2.pyi +115 -0
  299. sky/schemas/generated/servev1_pb2_grpc.py +322 -0
  300. sky/serve/autoscalers.py +357 -5
  301. sky/serve/client/impl.py +310 -0
  302. sky/serve/client/sdk.py +47 -139
  303. sky/serve/client/sdk_async.py +130 -0
  304. sky/serve/constants.py +10 -8
  305. sky/serve/controller.py +64 -19
  306. sky/serve/load_balancer.py +106 -60
  307. sky/serve/load_balancing_policies.py +115 -1
  308. sky/serve/replica_managers.py +273 -162
  309. sky/serve/serve_rpc_utils.py +179 -0
  310. sky/serve/serve_state.py +554 -251
  311. sky/serve/serve_utils.py +733 -220
  312. sky/serve/server/core.py +66 -711
  313. sky/serve/server/impl.py +1093 -0
  314. sky/serve/server/server.py +21 -18
  315. sky/serve/service.py +133 -48
  316. sky/serve/service_spec.py +135 -16
  317. sky/serve/spot_placer.py +3 -0
  318. sky/server/auth/__init__.py +0 -0
  319. sky/server/auth/authn.py +50 -0
  320. sky/server/auth/loopback.py +38 -0
  321. sky/server/auth/oauth2_proxy.py +200 -0
  322. sky/server/common.py +475 -181
  323. sky/server/config.py +81 -23
  324. sky/server/constants.py +44 -6
  325. sky/server/daemons.py +229 -0
  326. sky/server/html/token_page.html +185 -0
  327. sky/server/metrics.py +160 -0
  328. sky/server/requests/executor.py +528 -138
  329. sky/server/requests/payloads.py +351 -17
  330. sky/server/requests/preconditions.py +21 -17
  331. sky/server/requests/process.py +112 -29
  332. sky/server/requests/request_names.py +120 -0
  333. sky/server/requests/requests.py +817 -224
  334. sky/server/requests/serializers/decoders.py +82 -31
  335. sky/server/requests/serializers/encoders.py +140 -22
  336. sky/server/requests/threads.py +106 -0
  337. sky/server/rest.py +417 -0
  338. sky/server/server.py +1290 -284
  339. sky/server/state.py +20 -0
  340. sky/server/stream_utils.py +345 -57
  341. sky/server/uvicorn.py +217 -3
  342. sky/server/versions.py +270 -0
  343. sky/setup_files/MANIFEST.in +5 -0
  344. sky/setup_files/alembic.ini +156 -0
  345. sky/setup_files/dependencies.py +136 -31
  346. sky/setup_files/setup.py +44 -42
  347. sky/sky_logging.py +102 -5
  348. sky/skylet/attempt_skylet.py +1 -0
  349. sky/skylet/autostop_lib.py +129 -8
  350. sky/skylet/configs.py +27 -20
  351. sky/skylet/constants.py +171 -19
  352. sky/skylet/events.py +105 -21
  353. sky/skylet/job_lib.py +335 -104
  354. sky/skylet/log_lib.py +297 -18
  355. sky/skylet/log_lib.pyi +44 -1
  356. sky/skylet/ray_patches/__init__.py +17 -3
  357. sky/skylet/ray_patches/autoscaler.py.diff +18 -0
  358. sky/skylet/ray_patches/cli.py.diff +19 -0
  359. sky/skylet/ray_patches/command_runner.py.diff +17 -0
  360. sky/skylet/ray_patches/log_monitor.py.diff +20 -0
  361. sky/skylet/ray_patches/resource_demand_scheduler.py.diff +32 -0
  362. sky/skylet/ray_patches/updater.py.diff +18 -0
  363. sky/skylet/ray_patches/worker.py.diff +41 -0
  364. sky/skylet/services.py +564 -0
  365. sky/skylet/skylet.py +63 -4
  366. sky/skylet/subprocess_daemon.py +103 -29
  367. sky/skypilot_config.py +506 -99
  368. sky/ssh_node_pools/__init__.py +1 -0
  369. sky/ssh_node_pools/core.py +135 -0
  370. sky/ssh_node_pools/server.py +233 -0
  371. sky/task.py +621 -137
  372. sky/templates/aws-ray.yml.j2 +10 -3
  373. sky/templates/azure-ray.yml.j2 +1 -1
  374. sky/templates/do-ray.yml.j2 +1 -1
  375. sky/templates/gcp-ray.yml.j2 +57 -0
  376. sky/templates/hyperbolic-ray.yml.j2 +67 -0
  377. sky/templates/jobs-controller.yaml.j2 +27 -24
  378. sky/templates/kubernetes-loadbalancer.yml.j2 +2 -0
  379. sky/templates/kubernetes-ray.yml.j2 +607 -51
  380. sky/templates/lambda-ray.yml.j2 +1 -1
  381. sky/templates/nebius-ray.yml.j2 +33 -12
  382. sky/templates/paperspace-ray.yml.j2 +1 -1
  383. sky/templates/primeintellect-ray.yml.j2 +71 -0
  384. sky/templates/runpod-ray.yml.j2 +9 -1
  385. sky/templates/scp-ray.yml.j2 +3 -50
  386. sky/templates/seeweb-ray.yml.j2 +108 -0
  387. sky/templates/shadeform-ray.yml.j2 +72 -0
  388. sky/templates/sky-serve-controller.yaml.j2 +22 -2
  389. sky/templates/websocket_proxy.py +178 -18
  390. sky/usage/usage_lib.py +18 -11
  391. sky/users/__init__.py +0 -0
  392. sky/users/model.conf +15 -0
  393. sky/users/permission.py +387 -0
  394. sky/users/rbac.py +121 -0
  395. sky/users/server.py +720 -0
  396. sky/users/token_service.py +218 -0
  397. sky/utils/accelerator_registry.py +34 -5
  398. sky/utils/admin_policy_utils.py +84 -38
  399. sky/utils/annotations.py +16 -5
  400. sky/utils/asyncio_utils.py +78 -0
  401. sky/utils/auth_utils.py +153 -0
  402. sky/utils/benchmark_utils.py +60 -0
  403. sky/utils/cli_utils/status_utils.py +159 -86
  404. sky/utils/cluster_utils.py +31 -9
  405. sky/utils/command_runner.py +354 -68
  406. sky/utils/command_runner.pyi +93 -3
  407. sky/utils/common.py +35 -8
  408. sky/utils/common_utils.py +310 -87
  409. sky/utils/config_utils.py +87 -5
  410. sky/utils/context.py +402 -0
  411. sky/utils/context_utils.py +222 -0
  412. sky/utils/controller_utils.py +264 -89
  413. sky/utils/dag_utils.py +31 -12
  414. sky/utils/db/__init__.py +0 -0
  415. sky/utils/db/db_utils.py +470 -0
  416. sky/utils/db/migration_utils.py +133 -0
  417. sky/utils/directory_utils.py +12 -0
  418. sky/utils/env_options.py +13 -0
  419. sky/utils/git.py +567 -0
  420. sky/utils/git_clone.sh +460 -0
  421. sky/utils/infra_utils.py +195 -0
  422. sky/utils/kubernetes/cleanup-tunnel.sh +62 -0
  423. sky/utils/kubernetes/config_map_utils.py +133 -0
  424. sky/utils/kubernetes/create_cluster.sh +13 -27
  425. sky/utils/kubernetes/delete_cluster.sh +10 -7
  426. sky/utils/kubernetes/deploy_remote_cluster.py +1299 -0
  427. sky/utils/kubernetes/exec_kubeconfig_converter.py +22 -31
  428. sky/utils/kubernetes/generate_kind_config.py +6 -66
  429. sky/utils/kubernetes/generate_kubeconfig.sh +4 -1
  430. sky/utils/kubernetes/gpu_labeler.py +5 -5
  431. sky/utils/kubernetes/kubernetes_deploy_utils.py +354 -47
  432. sky/utils/kubernetes/ssh-tunnel.sh +379 -0
  433. sky/utils/kubernetes/ssh_utils.py +221 -0
  434. sky/utils/kubernetes_enums.py +8 -15
  435. sky/utils/lock_events.py +94 -0
  436. sky/utils/locks.py +368 -0
  437. sky/utils/log_utils.py +300 -6
  438. sky/utils/perf_utils.py +22 -0
  439. sky/utils/resource_checker.py +298 -0
  440. sky/utils/resources_utils.py +249 -32
  441. sky/utils/rich_utils.py +213 -37
  442. sky/utils/schemas.py +905 -147
  443. sky/utils/serialize_utils.py +16 -0
  444. sky/utils/status_lib.py +10 -0
  445. sky/utils/subprocess_utils.py +38 -15
  446. sky/utils/tempstore.py +70 -0
  447. sky/utils/timeline.py +24 -52
  448. sky/utils/ux_utils.py +84 -15
  449. sky/utils/validator.py +11 -1
  450. sky/utils/volume.py +86 -0
  451. sky/utils/yaml_utils.py +111 -0
  452. sky/volumes/__init__.py +13 -0
  453. sky/volumes/client/__init__.py +0 -0
  454. sky/volumes/client/sdk.py +149 -0
  455. sky/volumes/server/__init__.py +0 -0
  456. sky/volumes/server/core.py +258 -0
  457. sky/volumes/server/server.py +122 -0
  458. sky/volumes/volume.py +212 -0
  459. sky/workspaces/__init__.py +0 -0
  460. sky/workspaces/core.py +655 -0
  461. sky/workspaces/server.py +101 -0
  462. sky/workspaces/utils.py +56 -0
  463. skypilot_nightly-1.0.0.dev20251107.dist-info/METADATA +675 -0
  464. skypilot_nightly-1.0.0.dev20251107.dist-info/RECORD +594 -0
  465. {skypilot_nightly-1.0.0.dev20250509.dist-info → skypilot_nightly-1.0.0.dev20251107.dist-info}/WHEEL +1 -1
  466. sky/benchmark/benchmark_state.py +0 -256
  467. sky/benchmark/benchmark_utils.py +0 -641
  468. sky/clouds/service_catalog/constants.py +0 -7
  469. sky/dashboard/out/_next/static/LksQgChY5izXjokL3LcEu/_buildManifest.js +0 -1
  470. sky/dashboard/out/_next/static/chunks/236-f49500b82ad5392d.js +0 -6
  471. sky/dashboard/out/_next/static/chunks/312-c3c8845990db8ffc.js +0 -15
  472. sky/dashboard/out/_next/static/chunks/37-0a572fe0dbb89c4d.js +0 -6
  473. sky/dashboard/out/_next/static/chunks/678-206dddca808e6d16.js +0 -59
  474. sky/dashboard/out/_next/static/chunks/845-0f8017370869e269.js +0 -1
  475. sky/dashboard/out/_next/static/chunks/979-7bf73a4c7cea0f5c.js +0 -1
  476. sky/dashboard/out/_next/static/chunks/fd9d1056-2821b0f0cabcd8bd.js +0 -1
  477. sky/dashboard/out/_next/static/chunks/framework-87d061ee6ed71b28.js +0 -33
  478. sky/dashboard/out/_next/static/chunks/main-app-241eb28595532291.js +0 -1
  479. sky/dashboard/out/_next/static/chunks/main-e0e2335212e72357.js +0 -1
  480. sky/dashboard/out/_next/static/chunks/pages/_app-e6b013bc3f77ad60.js +0 -1
  481. sky/dashboard/out/_next/static/chunks/pages/_error-1be831200e60c5c0.js +0 -1
  482. sky/dashboard/out/_next/static/chunks/pages/clusters/[cluster]/[job]-e15db85d0ea1fbe1.js +0 -1
  483. sky/dashboard/out/_next/static/chunks/pages/clusters/[cluster]-f383db7389368ea7.js +0 -1
  484. sky/dashboard/out/_next/static/chunks/pages/clusters-a93b93e10b8b074e.js +0 -1
  485. sky/dashboard/out/_next/static/chunks/pages/index-f9f039532ca8cbc4.js +0 -1
  486. sky/dashboard/out/_next/static/chunks/pages/jobs/[job]-03f279c6741fb48b.js +0 -1
  487. sky/dashboard/out/_next/static/chunks/pages/jobs-a75029b67aab6a2e.js +0 -1
  488. sky/dashboard/out/_next/static/chunks/webpack-830f59b8404e96b8.js +0 -1
  489. sky/dashboard/out/_next/static/css/c6933bbb2ce7f4dd.css +0 -3
  490. sky/jobs/dashboard/dashboard.py +0 -223
  491. sky/jobs/dashboard/static/favicon.ico +0 -0
  492. sky/jobs/dashboard/templates/index.html +0 -831
  493. sky/jobs/server/dashboard_utils.py +0 -69
  494. sky/skylet/providers/scp/__init__.py +0 -2
  495. sky/skylet/providers/scp/config.py +0 -149
  496. sky/skylet/providers/scp/node_provider.py +0 -578
  497. sky/templates/kubernetes-ssh-jump.yml.j2 +0 -94
  498. sky/utils/db_utils.py +0 -100
  499. sky/utils/kubernetes/deploy_remote_cluster.sh +0 -308
  500. sky/utils/kubernetes/ssh_jump_lifecycle_manager.py +0 -191
  501. skypilot_nightly-1.0.0.dev20250509.dist-info/METADATA +0 -361
  502. skypilot_nightly-1.0.0.dev20250509.dist-info/RECORD +0 -396
  503. /sky/{clouds/service_catalog → catalog}/config.py +0 -0
  504. /sky/{benchmark → catalog/data_fetchers}/__init__.py +0 -0
  505. /sky/{clouds/service_catalog → catalog}/data_fetchers/fetch_azure.py +0 -0
  506. /sky/{clouds/service_catalog → catalog}/data_fetchers/fetch_fluidstack.py +0 -0
  507. /sky/{clouds/service_catalog → catalog}/data_fetchers/fetch_ibm.py +0 -0
  508. /sky/{clouds/service_catalog/data_fetchers → client/cli}/__init__.py +0 -0
  509. /sky/dashboard/out/_next/static/{LksQgChY5izXjokL3LcEu → zB0ed6ge_W1MDszVHhijS}/_ssgManifest.js +0 -0
  510. {skypilot_nightly-1.0.0.dev20250509.dist-info → skypilot_nightly-1.0.0.dev20251107.dist-info}/entry_points.txt +0 -0
  511. {skypilot_nightly-1.0.0.dev20250509.dist-info → skypilot_nightly-1.0.0.dev20251107.dist-info}/licenses/LICENSE +0 -0
  512. {skypilot_nightly-1.0.0.dev20250509.dist-info → skypilot_nightly-1.0.0.dev20251107.dist-info}/top_level.txt +0 -0
sky/serve/serve_utils.py CHANGED
@@ -2,6 +2,7 @@
 import base64
 import collections
 import dataclasses
+import datetime
 import enum
 import os
 import pathlib
@@ -11,9 +12,10 @@ import shlex
 import shutil
 import threading
 import time
+import traceback
 import typing
-from typing import (Any, Callable, DefaultDict, Dict, Generic, Iterator, List,
-                    Optional, TextIO, Type, TypeVar, Union)
+from typing import (Any, Callable, DefaultDict, Deque, Dict, Generic, Iterator,
+                    List, Optional, TextIO, Type, TypeVar, Union)
 import uuid
 
 import colorama
@@ -22,19 +24,25 @@ import filelock
 from sky import backends
 from sky import exceptions
 from sky import global_user_state
+from sky import sky_logging
+from sky import skypilot_config
 from sky.adaptors import common as adaptors_common
+from sky.jobs import state as managed_job_state
 from sky.serve import constants
 from sky.serve import serve_state
 from sky.serve import spot_placer
 from sky.skylet import constants as skylet_constants
 from sky.skylet import job_lib
 from sky.utils import annotations
+from sky.utils import command_runner
 from sky.utils import common_utils
+from sky.utils import controller_utils
 from sky.utils import log_utils
 from sky.utils import message_utils
 from sky.utils import resources_utils
 from sky.utils import status_lib
 from sky.utils import ux_utils
+from sky.utils import yaml_utils
 
 if typing.TYPE_CHECKING:
     import fastapi
@@ -47,23 +55,19 @@ else:
     psutil = adaptors_common.LazyImport('psutil')
     requests = adaptors_common.LazyImport('requests')
 
-
-@annotations.lru_cache(scope='request')
-def get_num_service_threshold():
-    """Get number of services threshold, calculating it only when needed."""
-    system_memory_gb = psutil.virtual_memory().total // (1024**3)
-    return system_memory_gb // constants.CONTROLLER_MEMORY_USAGE_GB
-
+logger = sky_logging.init_logger(__name__)
 
 _CONTROLLER_URL = 'http://localhost:{CONTROLLER_PORT}'
 
-# NOTE(dev): We assume log paths are either in ~/sky_logs/... or ~/.sky/...
-# and always appear after a space. Be careful when changing UX as this
-# assumption is used to expand some log files while ignoring others.
-_SKYPILOT_LOG_DIRS = r'~/(sky_logs|\.sky)'
-_SKYPILOT_PROVISION_LOG_PATTERN = (
-    fr'.* ({_SKYPILOT_LOG_DIRS}/.*provision\.log)')
-_SKYPILOT_LOG_PATTERN = fr'.* ({_SKYPILOT_LOG_DIRS}/.*\.log)'
+# NOTE(dev): We assume logs are printed with the hint 'sky api logs -l'. Be
+# careful when changing UX, as this assumption is used to expand some log
+# files while ignoring others.
+_SKYPILOT_LOG_HINT = r'.*sky api logs -l'
+_SKYPILOT_PROVISION_API_LOG_PATTERN = (
+    fr'{_SKYPILOT_LOG_HINT} (.*/provision\.log)')
+# New hint pattern for provision logs.
+_SKYPILOT_PROVISION_LOG_CMD_PATTERN = r'.*sky logs --provision\s+(\S+)'
+_SKYPILOT_LOG_PATTERN = fr'{_SKYPILOT_LOG_HINT} (.*\.log)'
 
 # TODO(tian): Find all existing replica id and print here.
 _FAILED_TO_FIND_REPLICA_MSG = (
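
To make the pattern change above concrete, here is a minimal, self-contained sketch of how the new hint-based patterns are intended to match. The sample log lines are invented for illustration; only the pattern strings come from the diff.

    import re

    _SKYPILOT_LOG_HINT = r'.*sky api logs -l'
    _SKYPILOT_PROVISION_API_LOG_PATTERN = (
        fr'{_SKYPILOT_LOG_HINT} (.*/provision\.log)')
    _SKYPILOT_PROVISION_LOG_CMD_PATTERN = r'.*sky logs --provision\s+(\S+)'

    # Hypothetical hint lines of the kind the patterns expand.
    api_hint = 'See: sky api logs -l ~/sky_logs/sky-1234/provision.log'
    cmd_hint = 'See: sky logs --provision my-cluster'

    m = re.match(_SKYPILOT_PROVISION_API_LOG_PATTERN, api_hint)
    assert m is not None and m.group(1).endswith('provision.log')
    m = re.match(_SKYPILOT_PROVISION_LOG_CMD_PATTERN, cmd_hint)
    assert m is not None and m.group(1) == 'my-cluster'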
@@ -244,7 +248,123 @@ class RequestTimestamp(RequestsAggregator):
         return f'RequestTimestamp(timestamps={self.timestamps})'
 
 
-def validate_service_task(task: 'sky.Task') -> None:
+def get_service_filelock_path(pool: str) -> str:
+    path = (pathlib.Path(constants.SKYSERVE_METADATA_DIR) / pool /
+            'pool.lock').expanduser().absolute()
+    path.parents[0].mkdir(parents=True, exist_ok=True)
+    return str(path)
+
+
+def _validate_consolidation_mode_config(current_is_consolidation_mode: bool,
+                                        pool: bool) -> None:
+    """Validate the consolidation mode config."""
+    # Check whether the consolidation mode config has changed.
+    controller = controller_utils.get_controller_for_pool(pool).value
+    if current_is_consolidation_mode:
+        controller_cn = controller.cluster_name
+        if global_user_state.cluster_with_name_exists(controller_cn):
+            with ux_utils.print_exception_no_traceback():
+                raise exceptions.InconsistentConsolidationModeError(
+                    f'{colorama.Fore.RED}Consolidation mode for '
+                    f'{controller.controller_type} is enabled, but the '
+                    f'controller cluster {controller_cn} is still running. '
+                    'Please terminate the controller cluster first.'
+                    f'{colorama.Style.RESET_ALL}')
+    else:
+        noun = 'pool' if pool else 'service'
+        all_services = [
+            svc for svc in serve_state.get_services() if svc['pool'] == pool
+        ]
+        if all_services:
+            with ux_utils.print_exception_no_traceback():
+                raise exceptions.InconsistentConsolidationModeError(
+                    f'{colorama.Fore.RED}Consolidation mode for '
+                    f'{controller.controller_type} is disabled, but there are '
+                    f'still {len(all_services)} {noun}s running. Please '
+                    f'terminate those {noun}s first.{colorama.Style.RESET_ALL}')
+
+
+@annotations.lru_cache(scope='request', maxsize=1)
+def is_consolidation_mode(pool: bool = False) -> bool:
+    # Use the jobs config for pool consolidation mode.
+    controller = controller_utils.get_controller_for_pool(pool).value
+    consolidation_mode = skypilot_config.get_nested(
+        (controller.controller_type, 'controller', 'consolidation_mode'),
+        default_value=False)
+    # We should only do this check on the API server, as the controller will
+    # not have the related config, so consolidation mode would always appear
+    # disabled there. Check #6611 for more details.
+    if (os.environ.get(skylet_constants.OVERRIDE_CONSOLIDATION_MODE) is not None
+            and controller.controller_type == 'jobs'):
+        # If we are in the job controller, we must always be in consolidation
+        # mode.
+        return True
+    if os.environ.get(skylet_constants.ENV_VAR_IS_SKYPILOT_SERVER) is not None:
+        _validate_consolidation_mode_config(consolidation_mode, pool)
+    return consolidation_mode
+
+
+def ha_recovery_for_consolidation_mode(pool: bool):
+    """Recovery logic for HA mode."""
+    # No setup recovery is needed in consolidation mode, as the API server
+    # already has all runtime installed. Directly start jobs recovery here.
+    # Refer to sky/templates/kubernetes-ray.yml.j2 for more details.
+    runner = command_runner.LocalProcessCommandRunner()
+    noun = 'pool' if pool else 'serve'
+    capnoun = noun.capitalize()
+    prefix = f'{noun}_'
+    with open(skylet_constants.HA_PERSISTENT_RECOVERY_LOG_PATH.format(prefix),
+              'w',
+              encoding='utf-8') as f:
+        start = time.time()
+        f.write(f'Starting HA recovery at {datetime.datetime.now()}\n')
+        for service_name in serve_state.get_glob_service_names(None):
+            svc = _get_service_status(service_name,
+                                      pool=pool,
+                                      with_replica_info=False)
+            if svc is None:
+                continue
+            controller_pid = svc['controller_pid']
+            if controller_pid is not None:
+                try:
+                    if _controller_process_alive(controller_pid, service_name):
+                        f.write(f'Controller pid {controller_pid} for '
+                                f'{noun} {service_name} is still running. '
+                                'Skipping recovery.\n')
+                        continue
+                except Exception:  # pylint: disable=broad-except
+                    # _controller_process_alive may raise if psutil fails; we
+                    # should not crash the recovery logic because of this.
+                    f.write('Error checking controller pid '
+                            f'{controller_pid} for {noun} {service_name}\n')
+
+            script = serve_state.get_ha_recovery_script(service_name)
+            if script is None:
+                f.write(f'{capnoun} {service_name}\'s recovery script does '
+                        'not exist. Skipping recovery.\n')
+                continue
+            rc, out, err = runner.run(script, require_outputs=True)
+            if rc:
+                f.write(f'Recovery script returned {rc}. '
+                        f'Output: {out}\nError: {err}\n')
+            f.write(f'{capnoun} {service_name} completed recovery at '
+                    f'{datetime.datetime.now()}\n')
+        f.write(f'HA recovery completed at {datetime.datetime.now()}\n')
+        f.write(f'Total recovery time: {time.time() - start} seconds\n')
+
+
+def _controller_process_alive(pid: int, service_name: str) -> bool:
+    """Check if the controller process is alive."""
+    try:
+        process = psutil.Process(pid)
+        cmd_str = ' '.join(process.cmdline())
+        return process.is_running(
+        ) and f'--service-name {service_name}' in cmd_str
+    except psutil.NoSuchProcess:
+        return False
+
+
+def validate_service_task(task: 'sky.Task', pool: bool) -> None:
     """Validate the task for Sky Serve.
 
     Args:
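
The pid check added in `_controller_process_alive` above pairs `psutil.Process.is_running()` with a command-line match, which guards against OS pid reuse: a recycled pid will not carry `--service-name <name>` in its cmdline. A standalone sketch of the same idea (the marker string here is hypothetical):

    import psutil

    def process_alive_with_marker(pid: int, marker: str) -> bool:
        """True only if `pid` is running AND looks like our process."""
        try:
            process = psutil.Process(pid)
            # Matching on the command line avoids mistaking an unrelated
            # process that inherited a recycled pid for ours.
            return (process.is_running() and
                    marker in ' '.join(process.cmdline()))
        except psutil.NoSuchProcess:
            return False

    print(process_alive_with_marker(99999, '--service-name my-svc'))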
@@ -267,19 +387,43 @@ def validate_service_task(task: 'sky.Task') -> None:
                 'use `dynamic_ondemand_fallback` or set '
                 'base_ondemand_fallback_replicas.')
 
+    field_name = 'service' if not pool else 'pool'
     if task.service is None:
         with ux_utils.print_exception_no_traceback():
-            raise RuntimeError('Service section not found.')
+            raise RuntimeError(f'{field_name.capitalize()} section not found.')
+
+    if pool != task.service.pool:
+        with ux_utils.print_exception_no_traceback():
+            raise ValueError(f'{field_name.capitalize()} section in the YAML '
+                             f'file does not match the pool argument. '
+                             f'To fix, add a valid `{field_name}` field.')
 
     policy_description = ('on-demand'
                          if task.service.dynamic_ondemand_fallback else 'spot')
     for resource in list(task.resources):
         if resource.job_recovery is not None:
+            sys_name = 'SkyServe' if not pool else 'Cluster Pool'
             with ux_utils.print_exception_no_traceback():
-                raise ValueError('job_recovery is disabled for SkyServe. '
-                                 'SkyServe will replenish preempted spot '
+                raise ValueError(f'job_recovery is disabled for {sys_name}. '
+                                 f'{sys_name} will replenish preempted spot '
                                  f'with {policy_description} instances.')
 
+    if pool:
+        accelerators = set()
+        for resource in task.resources:
+            if resource.accelerators is not None:
+                if isinstance(resource.accelerators, str):
+                    accelerators.add(resource.accelerators)
+                elif isinstance(resource.accelerators, dict):
+                    accelerators.update(resource.accelerators.keys())
+                elif isinstance(resource.accelerators, list):
+                    accelerators.update(resource.accelerators)
+        if len(accelerators) > 1:
+            with ux_utils.print_exception_no_traceback():
+                raise ValueError('Heterogeneous clusters are not supported '
+                                 'for cluster pools; please specify one '
+                                 'accelerator for all workers.')
+
     # Try to create a spot placer from the task yaml. Check if the task yaml
     # is valid for spot placer.
     spot_placer.SpotPlacer.from_task(task.service, task)
@@ -300,7 +444,7 @@ def validate_service_task(task: 'sky.Task') -> None:
             raise ValueError(
                 '`spot_placer` is only supported for spot resources. '
                 'Please explicitly specify `use_spot: true` in resources.')
-    if task.service.ports is None:
+    if not pool and task.service.ports is None:
         requested_ports = list(
             resources_utils.port_ranges_to_set(requested_resources.ports))
         if len(requested_ports) != 1:
@@ -320,10 +464,16 @@
                     f'Got multiple ports: {service_port} and '
                     f'{replica_ingress_port} in different resources. '
                     'Please specify the same port instead.')
+    if pool:
+        if (task.service.ports is not None or
+                requested_resources.ports is not None):
+            with ux_utils.print_exception_no_traceback():
+                raise ValueError('Cannot specify ports in a cluster pool.')
 
 
-def generate_service_name():
-    return f'sky-service-{uuid.uuid4().hex[:4]}'
+def generate_service_name(pool: bool = False):
+    noun = 'pool' if pool else 'service'
+    return f'sky-{noun}-{uuid.uuid4().hex[:4]}'
 
 
 def generate_remote_service_dir_name(service_name: str) -> str:
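
A quick illustration of the naming change (the 4-hex suffix comes from `uuid.uuid4()` and will differ on every call; the values shown are placeholders):

    generate_service_name()           # e.g. 'sky-service-1a2b'
    generate_service_name(pool=True)  # e.g. 'sky-pool-1a2b'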
@@ -390,6 +540,8 @@ def generate_remote_tls_certfile_name(service_name: str) -> str:
 
 
 def generate_replica_cluster_name(service_name: str, replica_id: int) -> str:
+    # NOTE(dev): This format is used in sky/serve/service.py::_cleanup for
+    # checking replica cluster existence. Be careful when changing it.
     return f'{service_name}-{replica_id}'
 
 
@@ -425,26 +577,63 @@ def set_service_status_and_active_versions_from_replica(
         active_versions=active_versions)
 
 
-def update_service_status() -> None:
-    services = serve_state.get_services()
-    for record in services:
-        if record['status'] == serve_state.ServiceStatus.SHUTTING_DOWN:
+def update_service_status(pool: bool) -> None:
+    noun = 'pool' if pool else 'serve'
+    capnoun = noun.capitalize()
+    service_names = serve_state.get_glob_service_names(None)
+    for service_name in service_names:
+        record = _get_service_status(service_name,
+                                     pool=pool,
+                                     with_replica_info=False)
+        if record is None:
+            continue
+        service_status = record['status']
+        if service_status == serve_state.ServiceStatus.SHUTTING_DOWN:
             # Skip services that are shutting down.
             continue
-        controller_job_id = record['controller_job_id']
-        assert controller_job_id is not None
-        controller_status = job_lib.get_status(controller_job_id)
-        if controller_status is None or controller_status.is_terminal():
-            # If controller job is not running, set it as controller failed.
-            serve_state.set_service_status_and_active_versions(
-                record['name'], serve_state.ServiceStatus.CONTROLLER_FAILED)
+
+        logger.info(f'Update {noun} status for {service_name!r} '
+                    f'with status {service_status}')
+
+        controller_pid = record['controller_pid']
+        if controller_pid is None:
+            logger.info(f'{capnoun} {service_name!r} controller pid is None. '
+                        f'Unexpected status {service_status}. Set to failure.')
+        elif controller_pid < 0:
+            # Backwards compatibility: this service was submitted when ray was
+            # still used for controller process management. We set the
+            # value_to_replace_existing_entries to -1 to indicate historical
+            # services.
+            # TODO(tian): Remove before 0.13.0.
+            controller_job_id = record['controller_job_id']
+            assert controller_job_id is not None
+            controller_status = job_lib.get_status(controller_job_id)
+            if (controller_status is not None and
+                    not controller_status.is_terminal()):
+                continue
+            logger.info(f'Updating {noun} {service_name!r} in old version. '
+                        f'SkyPilot job status: {controller_status}. '
+                        'Set to failure.')
+        else:
+            if _controller_process_alive(controller_pid, service_name):
+                # The controller is still running.
+                continue
+            logger.info(f'{capnoun} {service_name!r} controller pid '
+                        f'{controller_pid} is not alive. Set to failure.')
+
+        # If the controller job is not running, set it as controller failed.
+        serve_state.set_service_status_and_active_versions(
+            service_name, serve_state.ServiceStatus.CONTROLLER_FAILED)
 
 
-def update_service_encoded(service_name: str, version: int, mode: str) -> str:
-    service_status = _get_service_status(service_name)
+def update_service_encoded(service_name: str, version: int, mode: str,
+                           pool: bool) -> str:
+    noun = 'pool' if pool else 'service'
+    capnoun = noun.capitalize()
+    service_status = _get_service_status(service_name, pool=pool)
     if service_status is None:
         with ux_utils.print_exception_no_traceback():
-            raise ValueError(f'Service {service_name!r} does not exist.')
+            raise ValueError(f'{capnoun} {service_name!r} does not exist.')
     controller_port = service_status['controller_port']
     resp = requests.post(
         _CONTROLLER_URL.format(CONTROLLER_PORT=controller_port) +
@@ -455,27 +644,30 @@ def update_service_encoded(service_name: str, version: int, mode: str) -> str:
         })
     if resp.status_code == 404:
         with ux_utils.print_exception_no_traceback():
+            # This only happens for services, since pools were added after the
+            # update feature was introduced.
             raise ValueError(
                 'The service is up-ed in an old version and does not '
                 'support update. Please `sky serve down` '
                 'it first and relaunch the service. ')
     elif resp.status_code == 400:
         with ux_utils.print_exception_no_traceback():
-            raise ValueError(f'Client error during service update: {resp.text}')
+            raise ValueError(f'Client error during {noun} update: {resp.text}')
     elif resp.status_code == 500:
         with ux_utils.print_exception_no_traceback():
             raise RuntimeError(
-                f'Server error during service update: {resp.text}')
+                f'Server error during {noun} update: {resp.text}')
     elif resp.status_code != 200:
         with ux_utils.print_exception_no_traceback():
-            raise ValueError(f'Failed to update service: {resp.text}')
+            raise ValueError(f'Failed to update {noun}: {resp.text}')
 
     service_msg = resp.json()['message']
     return message_utils.encode_payload(service_msg)
 
 
 def terminate_replica(service_name: str, replica_id: int, purge: bool) -> str:
-    service_status = _get_service_status(service_name)
+    # TODO(tian): Currently pool does not support terminating replica.
+    service_status = _get_service_status(service_name, pool=False)
     if service_status is None:
         with ux_utils.print_exception_no_traceback():
             raise ValueError(f'Service {service_name!r} does not exist.')
@@ -506,6 +698,7 @@
 
 def _get_service_status(
         service_name: str,
+        pool: bool,
         with_replica_info: bool = True) -> Optional[Dict[str, Any]]:
     """Get the status dict of the service.
 
@@ -520,34 +713,96 @@
     record = serve_state.get_service_from_name(service_name)
     if record is None:
         return None
+    if record['pool'] != pool:
+        return None
+
+    record['pool_yaml'] = ''
+    if record['pool']:
+        latest_yaml_path = generate_task_yaml_file_name(service_name,
+                                                        record['version'])
+        raw_yaml_config = yaml_utils.read_yaml(latest_yaml_path)
+        original_config = raw_yaml_config.get('_user_specified_yaml')
+        if original_config is None:
+            # Fall back to the old display format.
+            original_config = raw_yaml_config
+            original_config.pop('run', None)
+            svc: Dict[str, Any] = original_config.pop('service')
+            if svc is not None:
+                svc.pop('pool', None)  # Remove pool from service config
+                original_config['pool'] = svc  # Add pool to root config
+        else:
+            original_config = yaml_utils.safe_load(original_config)
+        record['pool_yaml'] = yaml_utils.dump_yaml_str(original_config)
+
+    record['target_num_replicas'] = 0
+    try:
+        controller_port = record['controller_port']
+        resp = requests.get(
+            _CONTROLLER_URL.format(CONTROLLER_PORT=controller_port) +
+            '/autoscaler/info')
+        record['target_num_replicas'] = resp.json()['target_num_replicas']
+    except requests.exceptions.RequestException:
+        record['target_num_replicas'] = None
+    except Exception as e:  # pylint: disable=broad-except
+        logger.error(f'Failed to get autoscaler info for {service_name}: '
+                     f'{common_utils.format_exception(e)}\n'
+                     f'Traceback: {traceback.format_exc()}')
+
     if with_replica_info:
         record['replica_info'] = [
-            info.to_info_dict(with_handle=True)
+            info.to_info_dict(with_handle=True, with_url=not pool)
             for info in serve_state.get_replica_infos(service_name)
         ]
+        if pool:
+            for replica_info in record['replica_info']:
+                job_ids = managed_job_state.get_nonterminal_job_ids_by_pool(
+                    service_name, replica_info['name'])
+                replica_info['used_by'] = job_ids[0] if job_ids else None
     return record
 
 
-def get_service_status_encoded(service_names: Optional[List[str]]) -> str:
+def get_service_status_pickled(service_names: Optional[List[str]],
+                               pool: bool) -> List[Dict[str, str]]:
     service_statuses: List[Dict[str, str]] = []
     if service_names is None:
         # Get all service names
         service_names = serve_state.get_glob_service_names(None)
     for service_name in service_names:
-        service_status = _get_service_status(service_name)
+        service_status = _get_service_status(service_name, pool=pool)
         if service_status is None:
             continue
         service_statuses.append({
             k: base64.b64encode(pickle.dumps(v)).decode('utf-8')
             for k, v in service_status.items()
         })
+    return sorted(service_statuses, key=lambda x: x['name'])
+
+
+# TODO (kyuds): remove when serve codegen is removed
+def get_service_status_encoded(service_names: Optional[List[str]],
+                               pool: bool) -> str:
     # We have to use payload_type here to avoid the issue of
     # message_utils.decode_payload() not being able to correctly decode the
     # message with <sky-payload> tags.
+    service_statuses = get_service_status_pickled(service_names, pool)
     return message_utils.encode_payload(service_statuses,
                                         payload_type='service_status')
 
 
+def unpickle_service_status(
+        payload: List[Dict[str, str]]) -> List[Dict[str, Any]]:
+    service_statuses: List[Dict[str, Any]] = []
+    for service_status in payload:
+        if not isinstance(service_status, dict):
+            raise ValueError(f'Invalid service status: {service_status}')
+        service_statuses.append({
+            k: pickle.loads(base64.b64decode(v))
+            for k, v in service_status.items()
+        })
+    return service_statuses
+
+
+# TODO (kyuds): remove when serve codegen is removed
 def load_service_status(payload: str) -> List[Dict[str, Any]]:
     try:
         service_statuses_encoded = message_utils.decode_payload(
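
The helpers above ship status records as strings by pickling each field and base64-encoding the result; `unpickle_service_status` is the exact inverse. A self-contained sketch of that round-trip (the record fields are invented for illustration):

    import base64
    import pickle

    record = {'name': 'my-pool', 'version': 3, 'pool': True}

    # Encode: one base64 string per field, as in get_service_status_pickled().
    encoded = {
        k: base64.b64encode(pickle.dumps(v)).decode('utf-8')
        for k, v in record.items()
    }
    # Decode: the inverse, as in unpickle_service_status().
    decoded = {k: pickle.loads(base64.b64decode(v)) for k, v in encoded.items()}
    assert decoded == record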
@@ -559,26 +814,85 @@ def load_service_status(payload: str) -> List[Dict[str, Any]]:
             service_statuses_encoded = message_utils.decode_payload(payload)
         else:
             raise
-    service_statuses: List[Dict[str, Any]] = []
-    for service_status in service_statuses_encoded:
-        if not isinstance(service_status, dict):
-            raise ValueError(f'Invalid service status: {service_status}')
-        service_statuses.append({
-            k: pickle.loads(base64.b64decode(v))
-            for k, v in service_status.items()
-        })
-    return service_statuses
+    return unpickle_service_status(service_statuses_encoded)
 
 
+# TODO (kyuds): remove when serve codegen is removed
 def add_version_encoded(service_name: str) -> str:
     new_version = serve_state.add_version(service_name)
     return message_utils.encode_payload(new_version)
 
 
+# TODO (kyuds): remove when serve codegen is removed
 def load_version_string(payload: str) -> str:
     return message_utils.decode_payload(payload)
 
 
+def get_ready_replicas(
+        service_name: str) -> List['replica_managers.ReplicaInfo']:
+    logger.info(f'Get number of replicas for pool {service_name!r}')
+    return [
+        info for info in serve_state.get_replica_infos(service_name)
+        if info.status == serve_state.ReplicaStatus.READY
+    ]
+
+
+def get_next_cluster_name(service_name: str, job_id: int) -> Optional[str]:
+    """Get the next available cluster name from idle replicas.
+
+    Args:
+        service_name: The name of the service.
+        job_id: Optional job ID to associate with the acquired cluster.
+            If None, a placeholder will be used.
+
+    Returns:
+        The cluster name if an idle replica is found, None otherwise.
+    """
+    # Check if the service exists.
+    service_status = _get_service_status(service_name,
+                                         pool=True,
+                                         with_replica_info=False)
+    if service_status is None:
+        logger.error(f'Service {service_name!r} does not exist.')
+        return None
+    if not service_status['pool']:
+        logger.error(f'Service {service_name!r} is not a cluster pool.')
+        return None
+    with filelock.FileLock(get_service_filelock_path(service_name)):
+        logger.debug(f'Get next cluster name for pool {service_name!r}')
+        ready_replicas = get_ready_replicas(service_name)
+        idle_replicas: List['replica_managers.ReplicaInfo'] = []
+        for replica_info in ready_replicas:
+            jobs_on_replica = managed_job_state.get_nonterminal_job_ids_by_pool(
+                service_name, replica_info.cluster_name)
+            # TODO(tian): Make it resources-aware. Currently we allow and only
+            # allow one job per replica. In the following PR, we should:
+            #   i) When the replica is launched with `any_of` resources
+            #      (replicas can have different resources), check if the
+            #      resources that jobs require are available on the replica.
+            #      e.g., if a job requires A100:1 on a {L4:1, A100:1} pool, it
+            #      should only go to the replica with A100.
+            #  ii) When a job only requires a subset of the resources on the
+            #      replica, each replica should be able to handle multiple jobs
+            #      at the same time. e.g., if a job requires A100:1 on an
+            #      A100:8 pool, it should be able to run 8 jobs at once.
+            if not jobs_on_replica:
+                idle_replicas.append(replica_info)
+        if not idle_replicas:
+            logger.info(f'No idle replicas found for pool {service_name!r}')
+            return None
+
+        # Select the first idle replica.
+        # TODO(tian): "Load balancing" policy.
+        replica_info = idle_replicas[0]
+        logger.info(f'Selected replica {replica_info.replica_id} with cluster '
+                    f'{replica_info.cluster_name!r} for job {job_id!r} in pool '
+                    f'{service_name!r}')
+        managed_job_state.set_current_cluster_name(job_id,
+                                                   replica_info.cluster_name)
+        return replica_info.cluster_name
+
+
 def _terminate_failed_services(
         service_name: str,
         service_status: Optional[serve_state.ServiceStatus]) -> Optional[str]:
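
`get_next_cluster_name` above performs its scan-and-assign step under a per-pool `filelock.FileLock`, so two concurrent submissions cannot be handed the same idle replica. A minimal sketch of that acquire-under-lock pattern (the lock path and replica names are hypothetical):

    from typing import List, Optional

    import filelock

    def acquire_idle_replica(lock_path: str,
                             idle: List[str]) -> Optional[str]:
        # Holding the lock from the idleness check through the assignment
        # makes check-then-assign atomic across processes.
        with filelock.FileLock(lock_path):
            return idle.pop(0) if idle else None

    print(acquire_idle_replica('/tmp/my-pool.lock', ['my-pool-1', 'my-pool-2']))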
@@ -598,8 +912,8 @@
     # replicas, so we don't need to try again here.
     for replica_info in serve_state.get_replica_infos(service_name):
         # TODO(tian): Refresh latest status of the cluster.
-        if global_user_state.get_cluster_from_name(
-                replica_info.cluster_name) is not None:
+        if global_user_state.cluster_with_name_exists(
+                replica_info.cluster_name):
             remaining_replica_clusters.append(f'{replica_info.cluster_name!r}')
         serve_state.remove_replica(service_name, replica_info.replica_id)
 
@@ -608,9 +922,11 @@
         shutil.rmtree(service_dir)
     serve_state.remove_service(service_name)
     serve_state.delete_all_versions(service_name)
+    serve_state.remove_ha_recovery_script(service_name)
 
     if not remaining_replica_clusters:
         return None
+    # TODO(tian): Try to terminate those replica clusters.
     remaining_identity = ', '.join(remaining_replica_clusters)
     return (f'{colorama.Fore.YELLOW}terminate service {service_name!r} with '
             f'failed status ({service_status}). This may indicate a resource '
@@ -618,17 +934,38 @@
             f'controller: {remaining_identity}{colorama.Style.RESET_ALL}')
 
 
-def terminate_services(service_names: Optional[List[str]], purge: bool) -> str:
+def terminate_services(service_names: Optional[List[str]], purge: bool,
+                       pool: bool) -> str:
+    noun = 'pool' if pool else 'service'
+    capnoun = noun.capitalize()
     service_names = serve_state.get_glob_service_names(service_names)
     terminated_service_names: List[str] = []
     messages: List[str] = []
     for service_name in service_names:
         service_status = _get_service_status(service_name,
+                                             pool=pool,
                                              with_replica_info=False)
+        if service_status is None:
+            continue
         if (service_status is not None and service_status['status']
                 == serve_state.ServiceStatus.SHUTTING_DOWN):
             # Already scheduled to be terminated.
             continue
+        if pool:
+            nonterminal_job_ids = (
+                managed_job_state.get_nonterminal_job_ids_by_pool(service_name))
+            if nonterminal_job_ids:
+                nonterminal_job_ids_str = ','.join(
+                    str(job_id) for job_id in nonterminal_job_ids)
+                num_nonterminal_jobs = len(nonterminal_job_ids)
+                messages.append(
+                    f'{colorama.Fore.YELLOW}{capnoun} {service_name!r} has '
+                    f'{num_nonterminal_jobs} nonterminal jobs: '
+                    f'{nonterminal_job_ids_str}. To terminate the {noun}, '
+                    f'please run `sky jobs cancel --pool {service_name}` to '
+                    'cancel all jobs in the pool first.'
+                    f'{colorama.Style.RESET_ALL}')
+                continue
         # If the `services` and `version_specs` tables are not aligned, it
         # might result in a None service status. In this case, the controller
         # process is not functioning well either, and we should also use the
@@ -636,10 +973,11 @@ def terminate_services(service_names: Optional[List[str]], purge: bool) -> str:
         # This is a safeguard for a rare case, that is accidentally abort
         # between `serve_state.add_service` and
         # `serve_state.add_or_update_version` in service.py.
-        if (service_status is None or service_status['status']
+        purge_cmd = (f'sky jobs pool down {service_name} --purge'
+                     if pool else f'sky serve down {service_name} --purge')
+        if (service_status['status']
                 in serve_state.ServiceStatus.failed_statuses()):
-            failed_status = (service_status['status']
-                             if service_status is not None else None)
+            failed_status = service_status['status']
             if purge:
                 message = _terminate_failed_services(service_name,
                                                      failed_status)
@@ -647,11 +985,10 @@ def terminate_services(service_names: Optional[List[str]], purge: bool) -> str:
                 messages.append(message)
             else:
                 messages.append(
-                    f'{colorama.Fore.YELLOW}Service {service_name!r} is in '
+                    f'{colorama.Fore.YELLOW}{capnoun} {service_name!r} is in '
                     f'failed status ({failed_status}). Skipping '
                     'its termination as it could lead to a resource leak. '
-                    f'(Use `sky serve down {service_name} --purge` to '
-                    'forcefully terminate the service.)'
+                    f'(Use `{purge_cmd}` to forcefully terminate the {noun}.)'
                     f'{colorama.Style.RESET_ALL}')
                 # Don't add to terminated_service_names since it's not
                 # actually terminated.
@@ -668,17 +1005,18 @@ def terminate_services(service_names: Optional[List[str]], purge: bool) -> str:
             f.flush()
         terminated_service_names.append(f'{service_name!r}')
     if not terminated_service_names:
-        messages.append('No service to terminate.')
+        messages.append(f'No {noun} to terminate.')
     else:
-        identity_str = f'Service {terminated_service_names[0]} is'
+        identity_str = f'{capnoun} {terminated_service_names[0]} is'
         if len(terminated_service_names) > 1:
             terminated_service_names_str = ', '.join(terminated_service_names)
-            identity_str = f'Services {terminated_service_names_str} are'
+            identity_str = f'{capnoun}s {terminated_service_names_str} are'
         messages.append(f'{identity_str} scheduled to be terminated.')
     return '\n'.join(messages)
 
 
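
With the pool-aware guards, terminate_services now has three early exits per name: the record is already SHUTTING_DOWN, the pool still has nonterminal jobs, or the status is failed and --purge was not given. A condensed sketch of that decision ladder (status names here are illustrative, not the exact serve_state enum values):

    def can_terminate(status: str, is_pool: bool, nonterminal_jobs: int,
                      purge: bool) -> str:
        """Classify one service/pool as 'skip', 'refuse', or 'terminate'."""
        if status == 'SHUTTING_DOWN':
            return 'skip'      # already scheduled for teardown
        if is_pool and nonterminal_jobs > 0:
            return 'refuse'    # jobs must be cancelled first
        if status.startswith('FAILED') and not purge:
            return 'refuse'    # resource-leak risk; require --purge
        return 'terminate'
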
-def wait_service_registration(service_name: str, job_id: int) -> str:
+def wait_service_registration(service_name: str, job_id: int,
+                              pool: bool) -> str:
     """Util function to call at the end of `sky.serve.up()`.
 
     This function will:
@@ -691,49 +1029,67 @@ def wait_service_registration(service_name: str, job_id: int) -> str:
     Returns:
         Encoded load balancer port assigned to the service.
     """
+    # TODO (kyuds): when codegen is fully deprecated, return the lb port
+    # as an int directly instead of encoding it.
     start_time = time.time()
     setup_completed = False
+    noun = 'pool' if pool else 'service'
     while True:
-        job_status = job_lib.get_status(job_id)
-        if job_status is None or job_status < job_lib.JobStatus.RUNNING:
-            # Wait for the controller process to finish setting up. It can be
-            # slow if a lot cloud dependencies are being installed.
-            if (time.time() - start_time >
-                    constants.CONTROLLER_SETUP_TIMEOUT_SECONDS):
-                with ux_utils.print_exception_no_traceback():
-                    raise RuntimeError(
-                        f'Failed to start the controller '
-                        f'process for the service {service_name!r} '
-                        f'within '
-                        f'{constants.CONTROLLER_SETUP_TIMEOUT_SECONDS} seconds.'
-                    )
-            # No need to check the service status as the controller process
-            # is still setting up.
-            time.sleep(1)
-            continue
+        # Only do this check for non-consolidation mode as consolidation mode
+        # has no setup process.
+        if not is_consolidation_mode(pool):
+            job_status = job_lib.get_status(job_id)
+            if job_status is None or job_status < job_lib.JobStatus.RUNNING:
+                # Wait for the controller process to finish setting up. It
+                # can be slow if a lot of cloud dependencies are being installed.
+                if (time.time() - start_time >
+                        constants.CONTROLLER_SETUP_TIMEOUT_SECONDS):
+                    with ux_utils.print_exception_no_traceback():
+                        raise RuntimeError(
+                            f'Failed to start the controller process for '
+                            f'the {noun} {service_name!r} within '
+                            f'{constants.CONTROLLER_SETUP_TIMEOUT_SECONDS}'
+                            f' seconds.')
+                # No need to check the service status as the controller process
+                # is still setting up.
+                time.sleep(1)
+                continue
 
         if not setup_completed:
             setup_completed = True
             # Reset the start time to wait for the service to be registered.
             start_time = time.time()
 
-        record = serve_state.get_service_from_name(service_name)
+        record = _get_service_status(service_name,
+                                     pool=pool,
+                                     with_replica_info=False)
         if record is not None:
             if job_id != record['controller_job_id']:
+                if pool:
+                    command_to_run = 'sky jobs pool apply --pool'
+                else:
+                    command_to_run = 'sky serve update'
                 with ux_utils.print_exception_no_traceback():
                     raise ValueError(
-                        f'The service {service_name!r} is already running. '
-                        'Please specify a different name for your service. '
-                        'To update an existing service, run: sky serve update '
-                        f'{service_name} <new-service-yaml>')
+                        f'The {noun} {service_name!r} is already running. '
+                        f'Please specify a different name for your {noun}. '
+                        f'To update an existing {noun}, run: {command_to_run}'
+                        f' {service_name} <new-{noun}-yaml>')
             lb_port = record['load_balancer_port']
             if lb_port is not None:
                 return message_utils.encode_payload(lb_port)
-        elif len(serve_state.get_services()) >= get_num_service_threshold():
-            with ux_utils.print_exception_no_traceback():
-                raise RuntimeError('Max number of services reached. '
-                                   'To spin up more services, please '
-                                   'tear down some existing services.')
+        else:
+            controller_log_path = os.path.expanduser(
+                generate_remote_controller_log_file_name(service_name))
+            if os.path.exists(controller_log_path):
+                with open(controller_log_path, 'r', encoding='utf-8') as f:
+                    log_content = f.read()
+                if (constants.MAX_NUMBER_OF_SERVICES_REACHED_ERROR
+                        in log_content):
+                    with ux_utils.print_exception_no_traceback():
+                        raise RuntimeError('Max number of services reached. '
+                                           'To spin up more services, please '
+                                           'tear down some existing services.')
         elapsed = time.time() - start_time
         if elapsed > constants.SERVICE_REGISTER_TIMEOUT_SECONDS:
             # Print the controller log to help user debug.
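
The loop above is a two-phase poll: first wait (with its own timeout) for the controller process to come up, then reset the clock and wait for the service record to appear with a load balancer port. A self-contained sketch of that shape, with hypothetical callables and timeouts standing in for job_lib, serve_state and the constants module:

    import time

    SETUP_TIMEOUT = 300     # stands in for CONTROLLER_SETUP_TIMEOUT_SECONDS
    REGISTER_TIMEOUT = 60   # stands in for SERVICE_REGISTER_TIMEOUT_SECONDS

    def wait_two_phase(controller_running, get_record) -> int:
        """Poll in two phases, restarting the clock between them."""
        start = time.time()
        setup_done = False
        while True:
            if not setup_done:
                if not controller_running():
                    if time.time() - start > SETUP_TIMEOUT:
                        raise RuntimeError('controller never started')
                    time.sleep(1)
                    continue
                setup_done = True
                start = time.time()  # phase two gets a fresh budget
            record = get_record()
            if record and record.get('load_balancer_port') is not None:
                return record['load_balancer_port']
            if time.time() - start > REGISTER_TIMEOUT:
                raise RuntimeError('service never registered')
            time.sleep(1)
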
@@ -754,12 +1110,16 @@ def load_service_initialization_result(payload: str) -> int:
     return message_utils.decode_payload(payload)
 
 
-def check_service_status_healthy(service_name: str) -> Optional[str]:
-    service_record = serve_state.get_service_from_name(service_name)
+def _check_service_status_healthy(service_name: str,
+                                  pool: bool) -> Optional[str]:
+    service_record = _get_service_status(service_name,
+                                         pool,
+                                         with_replica_info=False)
+    capnoun = 'Service' if not pool else 'Pool'
     if service_record is None:
-        return f'Service {service_name!r} does not exist.'
+        return f'{capnoun} {service_name!r} does not exist.'
     if service_record['status'] == serve_state.ServiceStatus.CONTROLLER_INIT:
-        return (f'Service {service_name!r} is still initializing its '
+        return (f'{capnoun} {service_name!r} is still initializing its '
                 'controller. Please try again later.')
     return None
 
@@ -782,6 +1142,73 @@ def get_latest_version_with_min_replicas(
     return active_versions[-1] if active_versions else None
 
 
+def _process_line(line: str,
+                  cluster_name: str,
+                  stop_on_eof: bool = False) -> Iterator[str]:
+    # The line might be directing users to view logs, like
+    # `✓ Cluster launched: new-http. View logs at: *.log`
+    # We should tail the detailed logs for the user.
+    def cluster_is_up() -> bool:
+        status = global_user_state.get_status_from_cluster_name(cluster_name)
+        return status == status_lib.ClusterStatus.UP
+
+    provision_api_log_prompt = re.match(_SKYPILOT_PROVISION_API_LOG_PATTERN,
+                                        line)
+    provision_log_cmd_prompt = re.match(_SKYPILOT_PROVISION_LOG_CMD_PATTERN,
+                                        line)
+    log_prompt = re.match(_SKYPILOT_LOG_PATTERN, line)
+
+    def _stream_provision_path(p: pathlib.Path) -> Iterator[str]:
+        try:
+            with open(p, 'r', newline='', encoding='utf-8') as f:
+                # Exit if >10s without new content to avoid hanging when INIT
+                yield from log_utils.follow_logs(f,
+                                                 should_stop=cluster_is_up,
+                                                 stop_on_eof=stop_on_eof,
+                                                 idle_timeout_seconds=10)
+        except FileNotFoundError:
+            # Fall back cleanly if the hinted path doesn't exist
+            yield line
+            yield (f'{colorama.Fore.YELLOW}{colorama.Style.BRIGHT}'
+                   f'Try to expand log file {p} but not found. Skipping...'
+                   f'{colorama.Style.RESET_ALL}')
+            return
+
+    if provision_api_log_prompt is not None:
+        rel_path = provision_api_log_prompt.group(1)
+        nested_log_path = pathlib.Path(
+            skylet_constants.SKY_LOGS_DIRECTORY).expanduser().joinpath(
+                rel_path).resolve()
+        yield from _stream_provision_path(nested_log_path)
+        return
+
+    if provision_log_cmd_prompt is not None:
+        # Resolve provision log via cluster table first, then history.
+        log_path_str = global_user_state.get_cluster_provision_log_path(
+            cluster_name)
+        if not log_path_str:
+            log_path_str = (
+                global_user_state.get_cluster_history_provision_log_path(
+                    cluster_name))
+        if not log_path_str:
+            yield line
+            return
+        yield from _stream_provision_path(
+            pathlib.Path(log_path_str).expanduser().resolve())
+        return
+
+    if log_prompt is not None:
+        # Now we skip other logs (file sync logs) since we lack
+        # utility to determine when these log files are finished
+        # writing.
+        # TODO(tian): We should not skip these logs since there is a
+        # small chance that an error will happen in file sync. Need to
+        # find a better way to do this.
+        return
+
+    yield line
+
+
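
_process_line is a per-line dispatcher: each regex classifies the line as a provision-log hint (expand the referenced file inline), a file-sync log hint (skip), or an ordinary line (pass through). The core pattern, reduced to one hypothetical hint regex and an injected file reader:

    import re
    from typing import Callable, Iterator

    # Hypothetical pattern; the real _SKYPILOT_* patterns live in serve_utils.
    HINT = re.compile(r'.*View logs at: (?P<path>\S+)')

    def expand_line(line: str,
                    read_file: Callable[[str], Iterator[str]]) -> Iterator[str]:
        """Expand a referenced log file inline, or pass the line through."""
        m = HINT.match(line)
        if m is None:
            yield line  # ordinary line: emit unchanged
            return
        try:
            yield from read_file(m.group('path'))  # splice in the nested log
        except FileNotFoundError:
            yield line  # fall back to the original hint line
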
 def _follow_logs_with_provision_expanding(
     file: TextIO,
     cluster_name: str,
@@ -804,51 +1231,8 @@ def _follow_logs_with_provision_expanding(
         Log lines, including expanded content from referenced provision logs.
     """
 
-    def cluster_is_up() -> bool:
-        cluster_record = global_user_state.get_cluster_from_name(cluster_name)
-        if cluster_record is None:
-            return False
-        return cluster_record['status'] == status_lib.ClusterStatus.UP
-
     def process_line(line: str) -> Iterator[str]:
-        # The line might be directing users to view logs, like
-        # `✓ Cluster launched: new-http. View logs at: *.log`
-        # We should tail the detailed logs for user.
-        provision_log_prompt = re.match(_SKYPILOT_PROVISION_LOG_PATTERN, line)
-        log_prompt = re.match(_SKYPILOT_LOG_PATTERN, line)
-
-        if provision_log_prompt is not None:
-            nested_log_path = os.path.expanduser(provision_log_prompt.group(1))
-
-            try:
-                with open(nested_log_path, 'r', newline='',
-                          encoding='utf-8') as f:
-                    # We still exit if more than 10 seconds without new content
-                    # to avoid any internal bug that causes the launch to fail
-                    # while cluster status remains INIT.
-                    yield from log_utils.follow_logs(f,
-                                                     should_stop=cluster_is_up,
-                                                     stop_on_eof=stop_on_eof,
-                                                     idle_timeout_seconds=10)
-            except FileNotFoundError:
-                yield line
-
-                yield (f'{colorama.Fore.YELLOW}{colorama.Style.BRIGHT}'
-                       f'Try to expand log file {nested_log_path} but not '
-                       f'found. Skipping...{colorama.Style.RESET_ALL}')
-                pass
-            return
-
-        if log_prompt is not None:
-            # Now we skip other logs (file sync logs) since we lack
-            # utility to determine when these log files are finished
-            # writing.
-            # TODO(tian): We should not skip these logs since there are
-            # small chance that error will happen in file sync. Need to
-            # find a better way to do this.
-            return
-
-        yield line
+        yield from _process_line(line, cluster_name, stop_on_eof=stop_on_eof)
 
     return log_utils.follow_logs(file,
                                  should_stop=should_stop,
@@ -857,24 +1241,59 @@ def _follow_logs_with_provision_expanding(
                                  idle_timeout_seconds=idle_timeout_seconds)
 
 
-def stream_replica_logs(service_name: str, replica_id: int,
-                        follow: bool) -> str:
-    msg = check_service_status_healthy(service_name)
+def _capped_follow_logs_with_provision_expanding(
+    log_list: List[str],
+    cluster_name: str,
+    *,
+    line_cap: int = 100,
+) -> Iterator[str]:
+    """Follows logs and expands any provision.log references found.
+
+    Args:
+        log_list: List of log lines to read from.
+        cluster_name: Name of the cluster being launched.
+        line_cap: Number of last lines to return.
+
+    Yields:
+        Log lines, including expanded content from referenced provision logs.
+    """
+    all_lines: Deque[str] = collections.deque(maxlen=line_cap)
+
+    for line in log_list:
+        for processed in _process_line(line=line,
+                                       cluster_name=cluster_name,
+                                       stop_on_eof=False):
+            all_lines.append(processed)
+
+    yield from all_lines
+
+
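
The bounded deque is what enforces the cap: because expansion can turn one hint line into many, the limit is applied after processing, and older lines are evicted automatically. The idiom in isolation:

    import collections
    from typing import Deque, Iterable, Iterator

    def tail_cap(lines: Iterable[str], cap: int = 100) -> Iterator[str]:
        """Yield only the last `cap` lines of an iterable."""
        last: Deque[str] = collections.deque(maxlen=cap)
        for line in lines:
            last.append(line)  # older entries fall off the left automatically
        yield from last

    # list(tail_cap((f'line {i}' for i in range(1000)), cap=3))
    # -> ['line 997', 'line 998', 'line 999']
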
+def stream_replica_logs(service_name: str, replica_id: int, follow: bool,
+                        tail: Optional[int], pool: bool) -> str:
+    msg = _check_service_status_healthy(service_name, pool=pool)
     if msg is not None:
         return msg
+    repnoun = 'worker' if pool else 'replica'
+    caprepnoun = repnoun.capitalize()
     print(f'{colorama.Fore.YELLOW}Start streaming logs for launching process '
-          f'of replica {replica_id}.{colorama.Style.RESET_ALL}')
-
+          f'of {repnoun} {replica_id}.{colorama.Style.RESET_ALL}')
     log_file_name = generate_replica_log_file_name(service_name, replica_id)
     if os.path.exists(log_file_name):
-        with open(log_file_name, 'r', encoding='utf-8') as f:
-            print(f.read(), flush=True)
+        if tail is not None:
+            lines = common_utils.read_last_n_lines(log_file_name, tail)
+            for line in lines:
+                if not line.endswith('\n'):
+                    line += '\n'
+                print(line, end='', flush=True)
+        else:
+            with open(log_file_name, 'r', encoding='utf-8') as f:
+                print(f.read(), flush=True)
         return ''
 
     launch_log_file_name = generate_replica_launch_log_file_name(
         service_name, replica_id)
     if not os.path.exists(launch_log_file_name):
-        return (f'{colorama.Fore.RED}Replica {replica_id} doesn\'t exist.'
+        return (f'{colorama.Fore.RED}{caprepnoun} {replica_id} doesn\'t exist.'
                 f'{colorama.Style.RESET_ALL}')
 
     replica_cluster_name = generate_replica_cluster_name(
@@ -891,42 +1310,89 @@ def stream_replica_logs(service_name: str, replica_id: int,
 
     replica_provisioned = (
         lambda: _get_replica_status() != serve_state.ReplicaStatus.PROVISIONING)
-    with open(launch_log_file_name, 'r', newline='', encoding='utf-8') as f:
-        for line in _follow_logs_with_provision_expanding(
-                f,
-                replica_cluster_name,
-                should_stop=replica_provisioned,
-                stop_on_eof=not follow,
-        ):
-            print(line, end='', flush=True)
+
+    # Handle launch logs based on the tail parameter.
+    final_lines_to_print = []
+    if tail is not None:
+        static_lines = common_utils.read_last_n_lines(launch_log_file_name,
+                                                      tail)
+        lines = list(
+            _capped_follow_logs_with_provision_expanding(
+                log_list=static_lines,
+                cluster_name=replica_cluster_name,
+                line_cap=tail,
+            ))
+        final_lines_to_print += lines
+    else:
+        with open(launch_log_file_name, 'r', newline='', encoding='utf-8') as f:
+            for line in _follow_logs_with_provision_expanding(
+                    f,
+                    replica_cluster_name,
+                    should_stop=replica_provisioned,
+                    stop_on_eof=not follow,
+            ):
+                print(line, end='', flush=True)
 
     if (not follow and
             _get_replica_status() == serve_state.ReplicaStatus.PROVISIONING):
         # Early exit if not following the logs.
+        if tail is not None:
+            for line in final_lines_to_print:
+                if not line.endswith('\n'):
+                    line += '\n'
+                print(line, end='', flush=True)
         return ''
 
     backend = backends.CloudVmRayBackend()
     handle = global_user_state.get_handle_from_cluster_name(
         replica_cluster_name)
     if handle is None:
+        if tail is not None:
+            for line in final_lines_to_print:
+                if not line.endswith('\n'):
+                    line += '\n'
+                print(line, end='', flush=True)
         return _FAILED_TO_FIND_REPLICA_MSG.format(replica_id=replica_id)
     assert isinstance(handle, backends.CloudVmRayResourceHandle), handle
 
     # Notify user here to make sure user won't think the log is finished.
     print(f'{colorama.Fore.YELLOW}Start streaming logs for task job '
-          f'of replica {replica_id}...{colorama.Style.RESET_ALL}')
+          f'of {repnoun} {replica_id}...{colorama.Style.RESET_ALL}')
 
     # Always tail the latest logs, which represent user setup & run.
-    returncode = backend.tail_logs(handle, job_id=None, follow=follow)
-    if returncode != 0:
-        return (f'{colorama.Fore.RED}Failed to stream logs for replica '
-                f'{replica_id}.{colorama.Style.RESET_ALL}')
+    if tail is None:
+        returncode = backend.tail_logs(handle, job_id=None, follow=follow)
+        if returncode != 0:
+            return (f'{colorama.Fore.RED}Failed to stream logs for {repnoun} '
+                    f'{replica_id}.{colorama.Style.RESET_ALL}')
+    elif not follow and tail > 0:
+        final = backend.tail_logs(handle,
+                                  job_id=None,
+                                  follow=follow,
+                                  tail=tail,
+                                  stream_logs=False,
+                                  require_outputs=True,
+                                  process_stream=True)
+        if isinstance(final, int) or (final[0] != 0 and final[0] != 101):
+            if tail is not None:
+                for line in final_lines_to_print:
+                    if not line.endswith('\n'):
+                        line += '\n'
+                    print(line, end='', flush=True)
+            return (f'{colorama.Fore.RED}Failed to stream logs for replica '
+                    f'{replica_id}.{colorama.Style.RESET_ALL}')
+        final_lines_to_print += final[1].splitlines()
+        for line in final_lines_to_print[-tail:]:
+            if not line.endswith('\n'):
+                line += '\n'
+            print(line, end='', flush=True)
     return ''
 
 
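
The endswith('\n') normalization is repeated in each tail branch above; a small helper (hypothetical, not part of serve_utils) captures the idiom:

    from typing import Iterable

    def print_lines(lines: Iterable[str]) -> None:
        """Print log lines with exactly one trailing newline each."""
        for line in lines:
            out = line if line.endswith('\n') else line + '\n'
            print(out, end='', flush=True)
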
 def stream_serve_process_logs(service_name: str, stream_controller: bool,
-                              follow: bool) -> str:
-    msg = check_service_status_healthy(service_name)
+                              follow: bool, tail: Optional[int],
+                              pool: bool) -> str:
+    msg = _check_service_status_healthy(service_name, pool)
     if msg is not None:
         return msg
     if stream_controller:
@@ -935,19 +1401,31 @@ def stream_serve_process_logs(service_name: str, stream_controller: bool,
         log_file = generate_remote_load_balancer_log_file_name(service_name)
 
     def _service_is_terminal() -> bool:
-        record = serve_state.get_service_from_name(service_name)
+        record = _get_service_status(service_name,
+                                     pool,
+                                     with_replica_info=False)
         if record is None:
             return True
         return record['status'] in serve_state.ServiceStatus.failed_statuses()
 
-    with open(os.path.expanduser(log_file), 'r', newline='',
-              encoding='utf-8') as f:
-        for line in log_utils.follow_logs(
-                f,
-                should_stop=_service_is_terminal,
-                stop_on_eof=not follow,
-        ):
+    if tail is not None:
+        lines = common_utils.read_last_n_lines(os.path.expanduser(log_file),
+                                               tail)
+        for line in lines:
+            if not line.endswith('\n'):
+                line += '\n'
             print(line, end='', flush=True)
+    else:
+        with open(os.path.expanduser(log_file),
+                  'r',
+                  newline='',
+                  encoding='utf-8') as f:
+            for line in log_utils.follow_logs(
+                    f,
+                    should_stop=_service_is_terminal,
+                    stop_on_eof=not follow,
+            ):
+                print(line, end='', flush=True)
     return ''
 
 
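
Every --tail path above funnels through common_utils.read_last_n_lines. One way such a helper can be implemented (a sketch assuming UTF-8 text files, not SkyPilot's actual code) is to read fixed-size blocks backwards from the end until enough newlines have been seen:

    import os
    from typing import List

    def read_last_n_lines(path: str, n: int, block_size: int = 4096) -> List[str]:
        """Return the last n lines without reading the whole file."""
        with open(path, 'rb') as f:
            f.seek(0, os.SEEK_END)
            end = f.tell()
            data = b''
            while end > 0 and data.count(b'\n') <= n:
                start = max(0, end - block_size)
                f.seek(start)
                data = f.read(end - start) + data
                end = start
        lines = data.splitlines(keepends=True)
        return [l.decode('utf-8', errors='replace') for l in lines[-n:]]
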
@@ -965,18 +1443,25 @@ def _get_replicas(service_record: Dict[str, Any]) -> str:
     return f'{ready_replica_num}/{total_replica_num}'
 
 
-def format_service_table(service_records: List[Dict[str, Any]],
-                         show_all: bool) -> str:
+def format_service_table(service_records: List[Dict[str, Any]], show_all: bool,
+                         pool: bool) -> str:
+    noun = 'pool' if pool else 'service'
     if not service_records:
-        return 'No existing services.'
+        return f'No existing {noun}s.'
 
     service_columns = [
-        'NAME', 'VERSION', 'UPTIME', 'STATUS', 'REPLICAS', 'ENDPOINT'
+        'NAME', 'VERSION', 'UPTIME', 'STATUS',
+        'REPLICAS' if not pool else 'WORKERS'
     ]
+    if not pool:
+        service_columns.append('ENDPOINT')
     if show_all:
         service_columns.extend([
             'AUTOSCALING_POLICY', 'LOAD_BALANCING_POLICY', 'REQUESTED_RESOURCES'
         ])
+        if pool:
+            # Remove the load balancing policy column for pools.
+            service_columns.pop(-2)
     service_table = log_utils.create_table(service_columns)
 
     replica_infos: List[Dict[str, Any]] = []
@@ -1007,37 +1492,44 @@ def format_service_table(service_records: List[Dict[str, Any]],
             uptime,
             status_str,
             replicas,
-            endpoint,
         ]
+        if not pool:
+            service_values.append(endpoint)
         if show_all:
             service_values.extend(
                 [policy, load_balancing_policy, requested_resources_str])
+            if pool:
+                service_values.pop(-2)
         service_table.add_row(service_values)
 
-    replica_table = _format_replica_table(replica_infos, show_all)
+    replica_table = _format_replica_table(replica_infos, show_all, pool)
+    replica_noun = 'Pool Workers' if pool else 'Service Replicas'
    return (f'{service_table}\n'
            f'\n{colorama.Fore.CYAN}{colorama.Style.BRIGHT}'
-           f'Service Replicas{colorama.Style.RESET_ALL}\n'
+           f'{replica_noun}{colorama.Style.RESET_ALL}\n'
            f'{replica_table}')
 
 
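
Pools reuse the service table rather than defining a second schema: columns and row values are appended and popped in lockstep. The list manipulation in isolation (the column names come from this function; the helper itself is illustrative):

    from typing import List

    def build_columns(pool: bool, show_all: bool) -> List[str]:
        cols = ['NAME', 'VERSION', 'UPTIME', 'STATUS',
                'WORKERS' if pool else 'REPLICAS']
        if not pool:
            cols.append('ENDPOINT')
        if show_all:
            cols.extend(['AUTOSCALING_POLICY', 'LOAD_BALANCING_POLICY',
                         'REQUESTED_RESOURCES'])
            if pool:
                cols.pop(-2)  # pools have no load balancer column
        return cols

    # build_columns(pool=True, show_all=True)
    # -> ['NAME', 'VERSION', 'UPTIME', 'STATUS', 'WORKERS',
    #     'AUTOSCALING_POLICY', 'REQUESTED_RESOURCES']
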
-def _format_replica_table(replica_records: List[Dict[str, Any]],
-                          show_all: bool) -> str:
+def _format_replica_table(replica_records: List[Dict[str, Any]], show_all: bool,
+                          pool: bool) -> str:
+    noun = 'worker' if pool else 'replica'
     if not replica_records:
-        return 'No existing replicas.'
+        return f'No existing {noun}s.'
 
     replica_columns = [
-        'SERVICE_NAME', 'ID', 'VERSION', 'ENDPOINT', 'LAUNCHED', 'RESOURCES',
-        'STATUS', 'REGION'
+        'POOL_NAME' if pool else 'SERVICE_NAME', 'ID', 'VERSION', 'ENDPOINT',
+        'LAUNCHED', 'INFRA', 'RESOURCES', 'STATUS'
     ]
-    if show_all:
-        replica_columns.append('ZONE')
+    if pool:
+        replica_columns.append('USED_BY')
+        # Remove the endpoint column for pool workers.
+        replica_columns.pop(3)
     replica_table = log_utils.create_table(replica_columns)
 
     truncate_hint = ''
     if not show_all:
         if len(replica_records) > _REPLICA_TRUNC_NUM:
-            truncate_hint = '\n... (use --all to show all replicas)'
+            truncate_hint = f'\n... (use --all to show all {noun}s)'
         replica_records = replica_records[:_REPLICA_TRUNC_NUM]
 
     for record in replica_records:
@@ -1047,21 +1539,26 @@ def _format_replica_table(replica_records: List[Dict[str, Any]],
         version = (record['version'] if 'version' in record else '-')
         replica_endpoint = endpoint if endpoint else '-'
         launched_at = log_utils.readable_time_duration(record['launched_at'])
+        infra = '-'
         resources_str = '-'
         replica_status = record['status']
         status_str = replica_status.colored_str()
-        region = '-'
-        zone = '-'
+        used_by = record.get('used_by', None)
+        used_by_str = str(used_by) if used_by is not None else '-'
 
         replica_handle: Optional['backends.CloudVmRayResourceHandle'] = record[
             'handle']
         if replica_handle is not None:
-            resources_str = resources_utils.get_readable_resources_repr(
-                replica_handle, simplify=not show_all)
-            if replica_handle.launched_resources.region is not None:
-                region = replica_handle.launched_resources.region
-            if replica_handle.launched_resources.zone is not None:
-                zone = replica_handle.launched_resources.zone
+            infra = replica_handle.launched_resources.infra.formatted_str()
+            simplified = not show_all
+            resources_str_simple, resources_str_full = (
+                resources_utils.get_readable_resources_repr(
+                    replica_handle, simplified_only=simplified))
+            if simplified:
+                resources_str = resources_str_simple
+            else:
+                assert resources_str_full is not None
+                resources_str = resources_str_full
 
 
         replica_values = [
@@ -1069,18 +1566,20 @@ def _format_replica_table(replica_records: List[Dict[str, Any]],
             version,
             replica_endpoint,
             launched_at,
+            infra,
             resources_str,
             status_str,
-            region,
         ]
-        if show_all:
-            replica_values.append(zone)
+        if pool:
+            replica_values.append(used_by_str)
+            replica_values.pop(3)
         replica_table.add_row(replica_values)
 
     return f'{replica_table}{truncate_hint}'
 
 
 # =========================== CodeGen for Sky Serve ===========================
+# TODO (kyuds): deprecate and remove serve codegen entirely.
 
 
 # TODO(tian): Use REST API instead of SSH in the future. This codegen pattern
@@ -1099,13 +1598,16 @@ class ServeCodeGen:
         'from sky.serve import serve_state',
         'from sky.serve import serve_utils',
         'from sky.serve import constants',
+        'serve_version = constants.SERVE_VERSION',
     ]
 
     @classmethod
-    def get_service_status(cls, service_names: Optional[List[str]]) -> str:
+    def get_service_status(cls, service_names: Optional[List[str]],
+                           pool: bool) -> str:
         code = [
-            f'msg = serve_utils.get_service_status_encoded({service_names!r})',
-            'print(msg, end="", flush=True)'
+            f'kwargs={{}} if serve_version < 3 else {{"pool": {pool}}}',
+            f'msg = serve_utils.get_service_status_encoded({service_names!r}, '
+            '**kwargs)', 'print(msg, end="", flush=True)'
         ]
         return cls._build(code)
 
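
The new 'serve_version = constants.SERVE_VERSION' line in the generated prologue is what makes the **kwargs gating work: the snippet reads the controller's own SERVE_VERSION at run time, so a new client can talk to an old controller without passing keyword arguments the old serve_utils does not accept. Conceptually, the controller executes something like this (values are examples, not real output):

    # Executed on the controller; serve_version comes from its local
    # sky.serve.constants, which may be older than the client.
    serve_version = 2  # e.g., an old controller
    kwargs = {} if serve_version < 3 else {'pool': True}
    # msg = serve_utils.get_service_status_encoded(['my-svc'], **kwargs)
    # -> the 'pool' kwarg is only forwarded when the controller understands it.
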
@@ -1118,11 +1620,12 @@ class ServeCodeGen:
         return cls._build(code)
 
     @classmethod
-    def terminate_services(cls, service_names: Optional[List[str]],
-                           purge: bool) -> str:
+    def terminate_services(cls, service_names: Optional[List[str]], purge: bool,
+                           pool: bool) -> str:
         code = [
+            f'kwargs={{}} if serve_version < 3 else {{"pool": {pool}}}',
             f'msg = serve_utils.terminate_services({service_names!r}, '
-            f'purge={purge})', 'print(msg, end="", flush=True)'
+            f'purge={purge}, **kwargs)', 'print(msg, end="", flush=True)'
         ]
         return cls._build(code)
 
@@ -1139,29 +1642,48 @@ class ServeCodeGen:
         return cls._build(code)
 
     @classmethod
-    def wait_service_registration(cls, service_name: str, job_id: int) -> str:
+    def wait_service_registration(cls, service_name: str, job_id: int,
+                                  pool: bool) -> str:
         code = [
+            f'kwargs={{}} if serve_version < 4 else {{"pool": {pool}}}',
             'msg = serve_utils.wait_service_registration('
-            f'{service_name!r}, {job_id})', 'print(msg, end="", flush=True)'
+            f'{service_name!r}, {job_id}, **kwargs)',
+            'print(msg, end="", flush=True)'
         ]
         return cls._build(code)
 
     @classmethod
     def stream_replica_logs(cls, service_name: str, replica_id: int,
-                            follow: bool) -> str:
+                            follow: bool, tail: Optional[int],
+                            pool: bool) -> str:
         code = [
+            f'kwargs={{}} if serve_version < 5 else {{"pool": {pool}}}',
             'msg = serve_utils.stream_replica_logs('
-            f'{service_name!r}, {replica_id!r}, follow={follow})',
-            'print(msg, flush=True)'
+            f'{service_name!r}, {replica_id!r}, follow={follow}, tail={tail}, '
+            '**kwargs)', 'print(msg, flush=True)'
         ]
         return cls._build(code)
 
     @classmethod
     def stream_serve_process_logs(cls, service_name: str,
-                                  stream_controller: bool, follow: bool) -> str:
+                                  stream_controller: bool, follow: bool,
+                                  tail: Optional[int], pool: bool) -> str:
         code = [
+            f'kwargs={{}} if serve_version < 5 else {{"pool": {pool}}}',
             f'msg = serve_utils.stream_serve_process_logs({service_name!r}, '
-            f'{stream_controller}, follow={follow})', 'print(msg, flush=True)'
+            f'{stream_controller}, follow={follow}, tail={tail}, **kwargs)',
+            'print(msg, flush=True)'
+        ]
+        return cls._build(code)
+
+    @classmethod
+    def update_service(cls, service_name: str, version: int, mode: str,
+                       pool: bool) -> str:
+        code = [
+            f'kwargs={{}} if serve_version < 3 else {{"pool": {pool}}}',
+            f'msg = serve_utils.update_service_encoded({service_name!r}, '
+            f'{version}, mode={mode!r}, **kwargs)',
+            'print(msg, end="", flush=True)',
         ]
         return cls._build(code)
 
@@ -1175,12 +1697,3 @@ class ServeCodeGen:
             f'"{common_utils.get_user_hash()}"; '
             f'{skylet_constants.SKY_PYTHON_CMD} '
             f'-u -c {shlex.quote(generated_code)}')
-
-    @classmethod
-    def update_service(cls, service_name: str, version: int, mode: str) -> str:
-        code = [
-            f'msg = serve_utils.update_service_encoded({service_name!r}, '
-            f'{version}, mode={mode!r})',
-            'print(msg, end="", flush=True)',
-        ]
-        return cls._build(code)
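
Each classmethod above returns a shell-ready string: _build prepends the import prologue (including the serve_version probe) and wraps the statements in a `python -u -c` invocation that is then run on the controller over SSH. A hedged usage sketch (the runner object is hypothetical):

    # Generate the remote command locally...
    cmd = ServeCodeGen.get_service_status(['my-service'], pool=False)
    # ...then hand it to whatever executes commands on the controller.
    # runner.run(cmd)  # e.g., an SSH command runner
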