skypilot-nightly 1.0.0.dev20250905__py3-none-any.whl → 1.0.0.dev20251210__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (429) hide show
  1. sky/__init__.py +12 -2
  2. sky/adaptors/aws.py +27 -22
  3. sky/adaptors/common.py +25 -2
  4. sky/adaptors/coreweave.py +278 -0
  5. sky/adaptors/do.py +8 -2
  6. sky/adaptors/gcp.py +11 -0
  7. sky/adaptors/ibm.py +5 -2
  8. sky/adaptors/kubernetes.py +64 -0
  9. sky/adaptors/nebius.py +3 -1
  10. sky/adaptors/primeintellect.py +1 -0
  11. sky/adaptors/seeweb.py +183 -0
  12. sky/adaptors/shadeform.py +89 -0
  13. sky/adaptors/slurm.py +478 -0
  14. sky/admin_policy.py +20 -0
  15. sky/authentication.py +157 -263
  16. sky/backends/__init__.py +3 -2
  17. sky/backends/backend.py +11 -3
  18. sky/backends/backend_utils.py +630 -185
  19. sky/backends/cloud_vm_ray_backend.py +1111 -928
  20. sky/backends/local_docker_backend.py +9 -5
  21. sky/backends/task_codegen.py +971 -0
  22. sky/backends/wheel_utils.py +18 -0
  23. sky/catalog/__init__.py +8 -3
  24. sky/catalog/aws_catalog.py +4 -0
  25. sky/catalog/common.py +19 -1
  26. sky/catalog/data_fetchers/fetch_aws.py +102 -80
  27. sky/catalog/data_fetchers/fetch_gcp.py +30 -3
  28. sky/catalog/data_fetchers/fetch_nebius.py +9 -6
  29. sky/catalog/data_fetchers/fetch_runpod.py +698 -0
  30. sky/catalog/data_fetchers/fetch_seeweb.py +329 -0
  31. sky/catalog/data_fetchers/fetch_shadeform.py +142 -0
  32. sky/catalog/kubernetes_catalog.py +36 -32
  33. sky/catalog/primeintellect_catalog.py +95 -0
  34. sky/catalog/runpod_catalog.py +5 -1
  35. sky/catalog/seeweb_catalog.py +184 -0
  36. sky/catalog/shadeform_catalog.py +165 -0
  37. sky/catalog/slurm_catalog.py +243 -0
  38. sky/check.py +87 -46
  39. sky/client/cli/command.py +1004 -434
  40. sky/client/cli/flags.py +4 -2
  41. sky/{volumes/utils.py → client/cli/table_utils.py} +111 -13
  42. sky/client/cli/utils.py +79 -0
  43. sky/client/common.py +12 -2
  44. sky/client/sdk.py +188 -65
  45. sky/client/sdk_async.py +34 -33
  46. sky/cloud_stores.py +82 -3
  47. sky/clouds/__init__.py +8 -0
  48. sky/clouds/aws.py +337 -129
  49. sky/clouds/azure.py +24 -18
  50. sky/clouds/cloud.py +47 -13
  51. sky/clouds/cudo.py +16 -13
  52. sky/clouds/do.py +9 -7
  53. sky/clouds/fluidstack.py +12 -5
  54. sky/clouds/gcp.py +14 -7
  55. sky/clouds/hyperbolic.py +12 -5
  56. sky/clouds/ibm.py +12 -5
  57. sky/clouds/kubernetes.py +80 -45
  58. sky/clouds/lambda_cloud.py +12 -5
  59. sky/clouds/nebius.py +23 -9
  60. sky/clouds/oci.py +19 -12
  61. sky/clouds/paperspace.py +4 -1
  62. sky/clouds/primeintellect.py +317 -0
  63. sky/clouds/runpod.py +85 -24
  64. sky/clouds/scp.py +12 -8
  65. sky/clouds/seeweb.py +477 -0
  66. sky/clouds/shadeform.py +400 -0
  67. sky/clouds/slurm.py +578 -0
  68. sky/clouds/ssh.py +6 -3
  69. sky/clouds/utils/scp_utils.py +61 -50
  70. sky/clouds/vast.py +43 -27
  71. sky/clouds/vsphere.py +14 -16
  72. sky/core.py +296 -195
  73. sky/dashboard/out/404.html +1 -1
  74. sky/dashboard/out/_next/static/KYAhEFa3FTfq4JyKVgo-s/_buildManifest.js +1 -0
  75. sky/dashboard/out/_next/static/chunks/1141-9c810f01ff4f398a.js +11 -0
  76. sky/dashboard/out/_next/static/chunks/1871-7e202677c42f43fe.js +6 -0
  77. sky/dashboard/out/_next/static/chunks/2260-7703229c33c5ebd5.js +1 -0
  78. sky/dashboard/out/_next/static/chunks/2369.fc20f0c2c8ed9fe7.js +15 -0
  79. sky/dashboard/out/_next/static/chunks/2755.edd818326d489a1d.js +26 -0
  80. sky/dashboard/out/_next/static/chunks/3294.ddda8c6c6f9f24dc.js +1 -0
  81. sky/dashboard/out/_next/static/chunks/3785.7e245f318f9d1121.js +1 -0
  82. sky/dashboard/out/_next/static/chunks/{6601-06114c982db410b6.js → 3800-b589397dc09c5b4e.js} +1 -1
  83. sky/dashboard/out/_next/static/chunks/3850-fd5696f3bbbaddae.js +1 -0
  84. sky/dashboard/out/_next/static/chunks/4725.172ede95d1b21022.js +1 -0
  85. sky/dashboard/out/_next/static/chunks/4937.a2baa2df5572a276.js +15 -0
  86. sky/dashboard/out/_next/static/chunks/6212-7bd06f60ba693125.js +13 -0
  87. sky/dashboard/out/_next/static/chunks/6856-da20c5fd999f319c.js +1 -0
  88. sky/dashboard/out/_next/static/chunks/6990-09cbf02d3cd518c3.js +1 -0
  89. sky/dashboard/out/_next/static/chunks/7359-c8d04e06886000b3.js +30 -0
  90. sky/dashboard/out/_next/static/chunks/7615-019513abc55b3b47.js +1 -0
  91. sky/dashboard/out/_next/static/chunks/8640.5b9475a2d18c5416.js +16 -0
  92. sky/dashboard/out/_next/static/chunks/8969-452f9d5cbdd2dc73.js +1 -0
  93. sky/dashboard/out/_next/static/chunks/9025.fa408f3242e9028d.js +6 -0
  94. sky/dashboard/out/_next/static/chunks/9353-8369df1cf105221c.js +1 -0
  95. sky/dashboard/out/_next/static/chunks/9360.a536cf6b1fa42355.js +31 -0
  96. sky/dashboard/out/_next/static/chunks/9847.3aaca6bb33455140.js +30 -0
  97. sky/dashboard/out/_next/static/chunks/pages/_app-68b647e26f9d2793.js +34 -0
  98. sky/dashboard/out/_next/static/chunks/pages/clusters/[cluster]/[job]-33f525539665fdfd.js +16 -0
  99. sky/dashboard/out/_next/static/chunks/pages/clusters/[cluster]-a7565f586ef86467.js +1 -0
  100. sky/dashboard/out/_next/static/chunks/pages/clusters-9e5d47818b9bdadd.js +1 -0
  101. sky/dashboard/out/_next/static/chunks/pages/{config-dfb9bf07b13045f4.js → config-718cdc365de82689.js} +1 -1
  102. sky/dashboard/out/_next/static/chunks/pages/infra/{[context]-6563820e094f68ca.js → [context]-12c559ec4d81fdbd.js} +1 -1
  103. sky/dashboard/out/_next/static/chunks/pages/{infra-aabba60d57826e0f.js → infra-d187cd0413d72475.js} +1 -1
  104. sky/dashboard/out/_next/static/chunks/pages/jobs/[job]-895847b6cf200b04.js +16 -0
  105. sky/dashboard/out/_next/static/chunks/pages/jobs/pools/[pool]-8d0f4655400b4eb9.js +21 -0
  106. sky/dashboard/out/_next/static/chunks/pages/jobs-e5a98f17f8513a96.js +1 -0
  107. sky/dashboard/out/_next/static/chunks/pages/plugins/[...slug]-4f46050ca065d8f8.js +1 -0
  108. sky/dashboard/out/_next/static/chunks/pages/users-2f7646eb77785a2c.js +1 -0
  109. sky/dashboard/out/_next/static/chunks/pages/volumes-ef19d49c6d0e8500.js +1 -0
  110. sky/dashboard/out/_next/static/chunks/pages/workspaces/{[name]-af76bb06dbb3954f.js → [name]-96e0f298308da7e2.js} +1 -1
  111. sky/dashboard/out/_next/static/chunks/pages/{workspaces-7598c33a746cdc91.js → workspaces-cb4da3abe08ebf19.js} +1 -1
  112. sky/dashboard/out/_next/static/chunks/webpack-fba3de387ff6bb08.js +1 -0
  113. sky/dashboard/out/_next/static/css/c5a4cfd2600fc715.css +3 -0
  114. sky/dashboard/out/clusters/[cluster]/[job].html +1 -1
  115. sky/dashboard/out/clusters/[cluster].html +1 -1
  116. sky/dashboard/out/clusters.html +1 -1
  117. sky/dashboard/out/config.html +1 -1
  118. sky/dashboard/out/index.html +1 -1
  119. sky/dashboard/out/infra/[context].html +1 -1
  120. sky/dashboard/out/infra.html +1 -1
  121. sky/dashboard/out/jobs/[job].html +1 -1
  122. sky/dashboard/out/jobs/pools/[pool].html +1 -1
  123. sky/dashboard/out/jobs.html +1 -1
  124. sky/dashboard/out/plugins/[...slug].html +1 -0
  125. sky/dashboard/out/users.html +1 -1
  126. sky/dashboard/out/volumes.html +1 -1
  127. sky/dashboard/out/workspace/new.html +1 -1
  128. sky/dashboard/out/workspaces/[name].html +1 -1
  129. sky/dashboard/out/workspaces.html +1 -1
  130. sky/data/data_utils.py +92 -1
  131. sky/data/mounting_utils.py +177 -30
  132. sky/data/storage.py +200 -19
  133. sky/data/storage_utils.py +10 -45
  134. sky/exceptions.py +18 -7
  135. sky/execution.py +74 -31
  136. sky/global_user_state.py +605 -191
  137. sky/jobs/__init__.py +2 -0
  138. sky/jobs/client/sdk.py +101 -4
  139. sky/jobs/client/sdk_async.py +31 -5
  140. sky/jobs/constants.py +15 -8
  141. sky/jobs/controller.py +726 -284
  142. sky/jobs/file_content_utils.py +128 -0
  143. sky/jobs/log_gc.py +193 -0
  144. sky/jobs/recovery_strategy.py +250 -100
  145. sky/jobs/scheduler.py +271 -173
  146. sky/jobs/server/core.py +367 -114
  147. sky/jobs/server/server.py +81 -35
  148. sky/jobs/server/utils.py +89 -35
  149. sky/jobs/state.py +1498 -620
  150. sky/jobs/utils.py +771 -306
  151. sky/logs/agent.py +40 -5
  152. sky/logs/aws.py +9 -19
  153. sky/metrics/utils.py +282 -39
  154. sky/models.py +2 -0
  155. sky/optimizer.py +7 -6
  156. sky/provision/__init__.py +38 -1
  157. sky/provision/aws/config.py +34 -13
  158. sky/provision/aws/instance.py +5 -2
  159. sky/provision/azure/instance.py +5 -3
  160. sky/provision/common.py +22 -0
  161. sky/provision/cudo/instance.py +4 -3
  162. sky/provision/do/instance.py +4 -3
  163. sky/provision/docker_utils.py +112 -28
  164. sky/provision/fluidstack/instance.py +6 -5
  165. sky/provision/gcp/config.py +6 -1
  166. sky/provision/gcp/instance.py +4 -2
  167. sky/provision/hyperbolic/instance.py +4 -2
  168. sky/provision/instance_setup.py +66 -20
  169. sky/provision/kubernetes/__init__.py +2 -0
  170. sky/provision/kubernetes/config.py +7 -44
  171. sky/provision/kubernetes/constants.py +0 -1
  172. sky/provision/kubernetes/instance.py +609 -213
  173. sky/provision/kubernetes/manifests/fusermount-server-daemonset.yaml +1 -2
  174. sky/provision/kubernetes/network.py +12 -8
  175. sky/provision/kubernetes/network_utils.py +8 -25
  176. sky/provision/kubernetes/utils.py +422 -422
  177. sky/provision/kubernetes/volume.py +150 -18
  178. sky/provision/lambda_cloud/instance.py +16 -13
  179. sky/provision/nebius/instance.py +6 -2
  180. sky/provision/nebius/utils.py +103 -86
  181. sky/provision/oci/instance.py +4 -2
  182. sky/provision/paperspace/instance.py +4 -3
  183. sky/provision/primeintellect/__init__.py +10 -0
  184. sky/provision/primeintellect/config.py +11 -0
  185. sky/provision/primeintellect/instance.py +454 -0
  186. sky/provision/primeintellect/utils.py +398 -0
  187. sky/provision/provisioner.py +45 -15
  188. sky/provision/runpod/__init__.py +2 -0
  189. sky/provision/runpod/instance.py +4 -3
  190. sky/provision/runpod/volume.py +69 -13
  191. sky/provision/scp/instance.py +307 -130
  192. sky/provision/seeweb/__init__.py +11 -0
  193. sky/provision/seeweb/config.py +13 -0
  194. sky/provision/seeweb/instance.py +812 -0
  195. sky/provision/shadeform/__init__.py +11 -0
  196. sky/provision/shadeform/config.py +12 -0
  197. sky/provision/shadeform/instance.py +351 -0
  198. sky/provision/shadeform/shadeform_utils.py +83 -0
  199. sky/provision/slurm/__init__.py +12 -0
  200. sky/provision/slurm/config.py +13 -0
  201. sky/provision/slurm/instance.py +572 -0
  202. sky/provision/slurm/utils.py +583 -0
  203. sky/provision/vast/instance.py +9 -4
  204. sky/provision/vast/utils.py +10 -6
  205. sky/provision/volume.py +164 -0
  206. sky/provision/vsphere/common/ssl_helper.py +1 -1
  207. sky/provision/vsphere/common/vapiconnect.py +2 -1
  208. sky/provision/vsphere/common/vim_utils.py +3 -2
  209. sky/provision/vsphere/instance.py +8 -6
  210. sky/provision/vsphere/vsphere_utils.py +8 -1
  211. sky/resources.py +11 -3
  212. sky/schemas/api/responses.py +107 -6
  213. sky/schemas/db/global_user_state/008_skylet_ssh_tunnel_metadata.py +34 -0
  214. sky/schemas/db/global_user_state/009_last_activity_and_launched_at.py +89 -0
  215. sky/schemas/db/global_user_state/010_save_ssh_key.py +66 -0
  216. sky/schemas/db/global_user_state/011_is_ephemeral.py +34 -0
  217. sky/schemas/db/kv_cache/001_initial_schema.py +29 -0
  218. sky/schemas/db/serve_state/002_yaml_content.py +34 -0
  219. sky/schemas/db/skypilot_config/001_initial_schema.py +30 -0
  220. sky/schemas/db/spot_jobs/002_cluster_pool.py +3 -3
  221. sky/schemas/db/spot_jobs/004_job_file_contents.py +42 -0
  222. sky/schemas/db/spot_jobs/005_logs_gc.py +38 -0
  223. sky/schemas/db/spot_jobs/006_controller_pid_started_at.py +34 -0
  224. sky/schemas/db/spot_jobs/007_config_file_content.py +34 -0
  225. sky/schemas/generated/jobsv1_pb2.py +86 -0
  226. sky/schemas/generated/jobsv1_pb2.pyi +254 -0
  227. sky/schemas/generated/jobsv1_pb2_grpc.py +542 -0
  228. sky/schemas/generated/managed_jobsv1_pb2.py +76 -0
  229. sky/schemas/generated/managed_jobsv1_pb2.pyi +278 -0
  230. sky/schemas/generated/managed_jobsv1_pb2_grpc.py +278 -0
  231. sky/schemas/generated/servev1_pb2.py +58 -0
  232. sky/schemas/generated/servev1_pb2.pyi +115 -0
  233. sky/schemas/generated/servev1_pb2_grpc.py +322 -0
  234. sky/serve/autoscalers.py +2 -0
  235. sky/serve/client/impl.py +55 -21
  236. sky/serve/constants.py +4 -3
  237. sky/serve/controller.py +17 -11
  238. sky/serve/load_balancing_policies.py +1 -1
  239. sky/serve/replica_managers.py +219 -142
  240. sky/serve/serve_rpc_utils.py +179 -0
  241. sky/serve/serve_state.py +63 -54
  242. sky/serve/serve_utils.py +145 -109
  243. sky/serve/server/core.py +46 -25
  244. sky/serve/server/impl.py +311 -162
  245. sky/serve/server/server.py +21 -19
  246. sky/serve/service.py +84 -68
  247. sky/serve/service_spec.py +45 -7
  248. sky/server/auth/loopback.py +38 -0
  249. sky/server/auth/oauth2_proxy.py +12 -7
  250. sky/server/common.py +47 -24
  251. sky/server/config.py +62 -28
  252. sky/server/constants.py +9 -1
  253. sky/server/daemons.py +109 -38
  254. sky/server/metrics.py +76 -96
  255. sky/server/middleware_utils.py +166 -0
  256. sky/server/plugins.py +222 -0
  257. sky/server/requests/executor.py +384 -145
  258. sky/server/requests/payloads.py +83 -19
  259. sky/server/requests/preconditions.py +15 -13
  260. sky/server/requests/request_names.py +123 -0
  261. sky/server/requests/requests.py +511 -157
  262. sky/server/requests/serializers/decoders.py +48 -17
  263. sky/server/requests/serializers/encoders.py +102 -20
  264. sky/server/requests/serializers/return_value_serializers.py +60 -0
  265. sky/server/requests/threads.py +117 -0
  266. sky/server/rest.py +116 -24
  267. sky/server/server.py +497 -179
  268. sky/server/server_utils.py +30 -0
  269. sky/server/stream_utils.py +219 -45
  270. sky/server/uvicorn.py +30 -19
  271. sky/setup_files/MANIFEST.in +6 -1
  272. sky/setup_files/alembic.ini +8 -0
  273. sky/setup_files/dependencies.py +64 -19
  274. sky/setup_files/setup.py +44 -44
  275. sky/sky_logging.py +13 -5
  276. sky/skylet/attempt_skylet.py +116 -24
  277. sky/skylet/configs.py +3 -1
  278. sky/skylet/constants.py +139 -29
  279. sky/skylet/events.py +74 -14
  280. sky/skylet/executor/__init__.py +1 -0
  281. sky/skylet/executor/slurm.py +189 -0
  282. sky/skylet/job_lib.py +143 -105
  283. sky/skylet/log_lib.py +252 -8
  284. sky/skylet/log_lib.pyi +47 -7
  285. sky/skylet/providers/ibm/node_provider.py +12 -8
  286. sky/skylet/providers/ibm/vpc_provider.py +13 -12
  287. sky/skylet/runtime_utils.py +21 -0
  288. sky/skylet/services.py +524 -0
  289. sky/skylet/skylet.py +27 -2
  290. sky/skylet/subprocess_daemon.py +104 -28
  291. sky/skypilot_config.py +99 -79
  292. sky/ssh_node_pools/constants.py +12 -0
  293. sky/ssh_node_pools/core.py +40 -3
  294. sky/ssh_node_pools/deploy/__init__.py +4 -0
  295. sky/ssh_node_pools/deploy/deploy.py +952 -0
  296. sky/ssh_node_pools/deploy/tunnel_utils.py +199 -0
  297. sky/ssh_node_pools/deploy/utils.py +173 -0
  298. sky/ssh_node_pools/server.py +20 -21
  299. sky/{utils/kubernetes/ssh_utils.py → ssh_node_pools/utils.py} +9 -6
  300. sky/task.py +221 -104
  301. sky/templates/aws-ray.yml.j2 +1 -0
  302. sky/templates/azure-ray.yml.j2 +1 -0
  303. sky/templates/cudo-ray.yml.j2 +1 -0
  304. sky/templates/do-ray.yml.j2 +1 -0
  305. sky/templates/fluidstack-ray.yml.j2 +1 -0
  306. sky/templates/gcp-ray.yml.j2 +1 -0
  307. sky/templates/hyperbolic-ray.yml.j2 +1 -0
  308. sky/templates/ibm-ray.yml.j2 +2 -1
  309. sky/templates/jobs-controller.yaml.j2 +3 -0
  310. sky/templates/kubernetes-ray.yml.j2 +204 -55
  311. sky/templates/lambda-ray.yml.j2 +1 -0
  312. sky/templates/nebius-ray.yml.j2 +3 -0
  313. sky/templates/oci-ray.yml.j2 +1 -0
  314. sky/templates/paperspace-ray.yml.j2 +1 -0
  315. sky/templates/primeintellect-ray.yml.j2 +72 -0
  316. sky/templates/runpod-ray.yml.j2 +1 -0
  317. sky/templates/scp-ray.yml.j2 +1 -0
  318. sky/templates/seeweb-ray.yml.j2 +171 -0
  319. sky/templates/shadeform-ray.yml.j2 +73 -0
  320. sky/templates/slurm-ray.yml.j2 +85 -0
  321. sky/templates/vast-ray.yml.j2 +2 -0
  322. sky/templates/vsphere-ray.yml.j2 +1 -0
  323. sky/templates/websocket_proxy.py +188 -43
  324. sky/usage/usage_lib.py +16 -4
  325. sky/users/model.conf +1 -1
  326. sky/users/permission.py +84 -44
  327. sky/users/rbac.py +31 -3
  328. sky/utils/accelerator_registry.py +6 -3
  329. sky/utils/admin_policy_utils.py +18 -5
  330. sky/utils/annotations.py +128 -6
  331. sky/utils/asyncio_utils.py +78 -0
  332. sky/utils/atomic.py +1 -1
  333. sky/utils/auth_utils.py +153 -0
  334. sky/utils/cli_utils/status_utils.py +12 -7
  335. sky/utils/cluster_utils.py +28 -6
  336. sky/utils/command_runner.py +283 -30
  337. sky/utils/command_runner.pyi +63 -7
  338. sky/utils/common.py +3 -1
  339. sky/utils/common_utils.py +55 -7
  340. sky/utils/config_utils.py +1 -14
  341. sky/utils/context.py +127 -40
  342. sky/utils/context_utils.py +73 -18
  343. sky/utils/controller_utils.py +229 -70
  344. sky/utils/db/db_utils.py +95 -18
  345. sky/utils/db/kv_cache.py +149 -0
  346. sky/utils/db/migration_utils.py +24 -7
  347. sky/utils/env_options.py +4 -0
  348. sky/utils/git.py +559 -1
  349. sky/utils/kubernetes/create_cluster.sh +15 -30
  350. sky/utils/kubernetes/delete_cluster.sh +10 -7
  351. sky/utils/kubernetes/generate_kind_config.py +6 -66
  352. sky/utils/kubernetes/gpu_labeler.py +13 -3
  353. sky/utils/kubernetes/k8s_gpu_labeler_job.yaml +2 -1
  354. sky/utils/kubernetes/k8s_gpu_labeler_setup.yaml +16 -16
  355. sky/utils/kubernetes/kubernetes_deploy_utils.py +187 -260
  356. sky/utils/kubernetes/rsync_helper.sh +11 -3
  357. sky/utils/kubernetes/ssh-tunnel.sh +7 -376
  358. sky/utils/kubernetes_enums.py +7 -15
  359. sky/utils/lock_events.py +4 -4
  360. sky/utils/locks.py +128 -31
  361. sky/utils/log_utils.py +0 -319
  362. sky/utils/resource_checker.py +13 -10
  363. sky/utils/resources_utils.py +53 -29
  364. sky/utils/rich_utils.py +8 -4
  365. sky/utils/schemas.py +138 -52
  366. sky/utils/subprocess_utils.py +17 -4
  367. sky/utils/thread_utils.py +91 -0
  368. sky/utils/timeline.py +2 -1
  369. sky/utils/ux_utils.py +35 -1
  370. sky/utils/volume.py +88 -4
  371. sky/utils/yaml_utils.py +9 -0
  372. sky/volumes/client/sdk.py +48 -10
  373. sky/volumes/server/core.py +59 -22
  374. sky/volumes/server/server.py +46 -17
  375. sky/volumes/volume.py +54 -42
  376. sky/workspaces/core.py +57 -21
  377. sky/workspaces/server.py +13 -12
  378. sky_templates/README.md +3 -0
  379. sky_templates/__init__.py +3 -0
  380. sky_templates/ray/__init__.py +0 -0
  381. sky_templates/ray/start_cluster +183 -0
  382. sky_templates/ray/stop_cluster +75 -0
  383. {skypilot_nightly-1.0.0.dev20250905.dist-info → skypilot_nightly-1.0.0.dev20251210.dist-info}/METADATA +343 -65
  384. skypilot_nightly-1.0.0.dev20251210.dist-info/RECORD +629 -0
  385. skypilot_nightly-1.0.0.dev20251210.dist-info/top_level.txt +2 -0
  386. sky/client/cli/git.py +0 -549
  387. sky/dashboard/out/_next/static/chunks/1121-408ed10b2f9fce17.js +0 -1
  388. sky/dashboard/out/_next/static/chunks/1141-943efc7aff0f0c06.js +0 -1
  389. sky/dashboard/out/_next/static/chunks/1836-37fede578e2da5f8.js +0 -40
  390. sky/dashboard/out/_next/static/chunks/3015-86cabed5d4669ad0.js +0 -1
  391. sky/dashboard/out/_next/static/chunks/3294.c80326aec9bfed40.js +0 -6
  392. sky/dashboard/out/_next/static/chunks/3785.4872a2f3aa489880.js +0 -1
  393. sky/dashboard/out/_next/static/chunks/3850-ff4a9a69d978632b.js +0 -1
  394. sky/dashboard/out/_next/static/chunks/4045.b30465273dc5e468.js +0 -21
  395. sky/dashboard/out/_next/static/chunks/4676-9da7fdbde90b5549.js +0 -10
  396. sky/dashboard/out/_next/static/chunks/4725.10f7a9a5d3ea8208.js +0 -1
  397. sky/dashboard/out/_next/static/chunks/5339.3fda4a4010ff4e06.js +0 -51
  398. sky/dashboard/out/_next/static/chunks/6135-4b4d5e824b7f9d3c.js +0 -1
  399. sky/dashboard/out/_next/static/chunks/649.b9d7f7d10c1b8c53.js +0 -45
  400. sky/dashboard/out/_next/static/chunks/6856-dca7962af4814e1b.js +0 -1
  401. sky/dashboard/out/_next/static/chunks/6990-08b2a1cae076a943.js +0 -1
  402. sky/dashboard/out/_next/static/chunks/7325.b4bc99ce0892dcd5.js +0 -6
  403. sky/dashboard/out/_next/static/chunks/754-d0da8ab45f9509e9.js +0 -18
  404. sky/dashboard/out/_next/static/chunks/7669.1f5d9a402bf5cc42.js +0 -36
  405. sky/dashboard/out/_next/static/chunks/8969-0be3036bf86f8256.js +0 -1
  406. sky/dashboard/out/_next/static/chunks/9025.c12318fb6a1a9093.js +0 -6
  407. sky/dashboard/out/_next/static/chunks/9037-fa1737818d0a0969.js +0 -6
  408. sky/dashboard/out/_next/static/chunks/pages/_app-ce361c6959bc2001.js +0 -34
  409. sky/dashboard/out/_next/static/chunks/pages/clusters/[cluster]/[job]-1cbba24bd1bd35f8.js +0 -16
  410. sky/dashboard/out/_next/static/chunks/pages/clusters/[cluster]-0b4b35dc1dfe046c.js +0 -16
  411. sky/dashboard/out/_next/static/chunks/pages/clusters-469814d711d63b1b.js +0 -1
  412. sky/dashboard/out/_next/static/chunks/pages/jobs/[job]-dd64309c3fe67ed2.js +0 -11
  413. sky/dashboard/out/_next/static/chunks/pages/jobs/pools/[pool]-07349868f7905d37.js +0 -16
  414. sky/dashboard/out/_next/static/chunks/pages/jobs-1f70d9faa564804f.js +0 -1
  415. sky/dashboard/out/_next/static/chunks/pages/users-018bf31cda52e11b.js +0 -1
  416. sky/dashboard/out/_next/static/chunks/pages/volumes-739726d6b823f532.js +0 -1
  417. sky/dashboard/out/_next/static/chunks/webpack-4fe903277b57b523.js +0 -1
  418. sky/dashboard/out/_next/static/css/4614e06482d7309e.css +0 -3
  419. sky/dashboard/out/_next/static/mS-4qZPSkRuA1u-g2wQhg/_buildManifest.js +0 -1
  420. sky/templates/kubernetes-ssh-jump.yml.j2 +0 -94
  421. sky/utils/kubernetes/cleanup-tunnel.sh +0 -62
  422. sky/utils/kubernetes/deploy_remote_cluster.py +0 -1299
  423. sky/utils/kubernetes/ssh_jump_lifecycle_manager.py +0 -191
  424. skypilot_nightly-1.0.0.dev20250905.dist-info/RECORD +0 -547
  425. skypilot_nightly-1.0.0.dev20250905.dist-info/top_level.txt +0 -1
  426. /sky/dashboard/out/_next/static/{mS-4qZPSkRuA1u-g2wQhg → KYAhEFa3FTfq4JyKVgo-s}/_ssgManifest.js +0 -0
  427. {skypilot_nightly-1.0.0.dev20250905.dist-info → skypilot_nightly-1.0.0.dev20251210.dist-info}/WHEEL +0 -0
  428. {skypilot_nightly-1.0.0.dev20250905.dist-info → skypilot_nightly-1.0.0.dev20251210.dist-info}/entry_points.txt +0 -0
  429. {skypilot_nightly-1.0.0.dev20250905.dist-info → skypilot_nightly-1.0.0.dev20251210.dist-info}/licenses/LICENSE +0 -0
sky/serve/serve_utils.py CHANGED
@@ -10,12 +10,11 @@ import pickle
10
10
  import re
11
11
  import shlex
12
12
  import shutil
13
- import threading
14
13
  import time
15
14
  import traceback
16
15
  import typing
17
- from typing import (Any, Callable, DefaultDict, Deque, Dict, Generic, Iterator,
18
- List, Optional, TextIO, Type, TypeVar, Union)
16
+ from typing import (Any, Callable, DefaultDict, Deque, Dict, Iterator, List,
17
+ Optional, TextIO, Type, Union)
19
18
  import uuid
20
19
 
21
20
  import colorama
@@ -158,50 +157,6 @@ _SIGNAL_TO_ERROR = {
158
157
  UserSignal.TERMINATE: exceptions.ServeUserTerminatedError,
159
158
  }
160
159
 
161
- # pylint: disable=invalid-name
162
- KeyType = TypeVar('KeyType')
163
- ValueType = TypeVar('ValueType')
164
-
165
-
166
- # Google style guide: Do not rely on the atomicity of built-in types.
167
- # Our launch and down process pool will be used by multiple threads,
168
- # therefore we need to use a thread-safe dict.
169
- # see https://google.github.io/styleguide/pyguide.html#218-threading
170
- class ThreadSafeDict(Generic[KeyType, ValueType]):
171
- """A thread-safe dict."""
172
-
173
- def __init__(self, *args: Any, **kwargs: Any) -> None:
174
- self._dict: Dict[KeyType, ValueType] = dict(*args, **kwargs)
175
- self._lock = threading.Lock()
176
-
177
- def __getitem__(self, key: KeyType) -> ValueType:
178
- with self._lock:
179
- return self._dict.__getitem__(key)
180
-
181
- def __setitem__(self, key: KeyType, value: ValueType) -> None:
182
- with self._lock:
183
- return self._dict.__setitem__(key, value)
184
-
185
- def __delitem__(self, key: KeyType) -> None:
186
- with self._lock:
187
- return self._dict.__delitem__(key)
188
-
189
- def __len__(self) -> int:
190
- with self._lock:
191
- return self._dict.__len__()
192
-
193
- def __contains__(self, key: KeyType) -> bool:
194
- with self._lock:
195
- return self._dict.__contains__(key)
196
-
197
- def items(self):
198
- with self._lock:
199
- return self._dict.items()
200
-
201
- def values(self):
202
- with self._lock:
203
- return self._dict.values()
204
-
205
160
 
206
161
  class RequestsAggregator:
207
162
  """Base class for request aggregator."""
@@ -262,26 +217,24 @@ def _validate_consolidation_mode_config(current_is_consolidation_mode: bool,
262
217
  controller = controller_utils.get_controller_for_pool(pool).value
263
218
  if current_is_consolidation_mode:
264
219
  controller_cn = controller.cluster_name
265
- if global_user_state.get_cluster_from_name(controller_cn) is not None:
266
- with ux_utils.print_exception_no_traceback():
267
- raise exceptions.InconsistentConsolidationModeError(
268
- f'{colorama.Fore.RED}Consolidation mode for '
269
- f'{controller.controller_type} is enabled, but the '
270
- f'controller cluster {controller_cn} is still running. '
271
- 'Please terminate the controller cluster first.'
272
- f'{colorama.Style.RESET_ALL}')
220
+ if global_user_state.cluster_with_name_exists(controller_cn):
221
+ logger.warning(
222
+ f'{colorama.Fore.RED}Consolidation mode for '
223
+ f'{controller.controller_type} is enabled, but the controller '
224
+ f'cluster {controller_cn} is still running. Please terminate '
225
+ 'the controller cluster first.'
226
+ f'{colorama.Style.RESET_ALL}')
273
227
  else:
274
228
  noun = 'pool' if pool else 'service'
275
229
  all_services = [
276
230
  svc for svc in serve_state.get_services() if svc['pool'] == pool
277
231
  ]
278
232
  if all_services:
279
- with ux_utils.print_exception_no_traceback():
280
- raise exceptions.InconsistentConsolidationModeError(
281
- f'{colorama.Fore.RED}Consolidation mode for '
282
- f'{controller.controller_type} is disabled, but there are '
283
- f'still {len(all_services)} {noun}s running. Please '
284
- f'terminate those {noun}s first.{colorama.Style.RESET_ALL}')
233
+ logger.warning(
234
+ f'{colorama.Fore.RED}Consolidation mode for '
235
+ f'{controller.controller_type} is disabled, but there are '
236
+ f'still {len(all_services)} {noun}s running. Please terminate '
237
+ f'those {noun}s first.{colorama.Style.RESET_ALL}')
285
238
 
286
239
 
287
240
  @annotations.lru_cache(scope='request', maxsize=1)
@@ -291,6 +244,10 @@ def is_consolidation_mode(pool: bool = False) -> bool:
291
244
  consolidation_mode = skypilot_config.get_nested(
292
245
  (controller.controller_type, 'controller', 'consolidation_mode'),
293
246
  default_value=False)
247
+ if os.environ.get(skylet_constants.OVERRIDE_CONSOLIDATION_MODE) is not None:
248
+ # if we are in the job controller, we must always be in consolidation
249
+ # mode.
250
+ return True
294
251
  # We should only do this check on API server, as the controller will not
295
252
  # have related config and will always seemingly disabled for consolidation
296
253
  # mode. Check #6611 for more details.
@@ -397,12 +354,28 @@ def validate_service_task(task: 'sky.Task', pool: bool) -> None:
397
354
  if task.service.dynamic_ondemand_fallback else 'spot')
398
355
  for resource in list(task.resources):
399
356
  if resource.job_recovery is not None:
400
- sys_name = 'SkyServe' if not pool else 'Cluster Pool'
357
+ sys_name = 'SkyServe' if not pool else 'Pool'
401
358
  with ux_utils.print_exception_no_traceback():
402
359
  raise ValueError(f'job_recovery is disabled for {sys_name}. '
403
360
  f'{sys_name} will replenish preempted spot '
404
361
  f'with {policy_description} instances.')
405
362
 
363
+ if pool:
364
+ accelerators = set()
365
+ for resource in task.resources:
366
+ if resource.accelerators is not None:
367
+ if isinstance(resource.accelerators, str):
368
+ accelerators.add(resource.accelerators)
369
+ elif isinstance(resource.accelerators, dict):
370
+ accelerators.update(resource.accelerators.keys())
371
+ elif isinstance(resource.accelerators, list):
372
+ accelerators.update(resource.accelerators)
373
+ if len(accelerators) > 1:
374
+ with ux_utils.print_exception_no_traceback():
375
+ raise ValueError('Heterogeneous clusters are not supported for '
376
  'pools, please specify one accelerator '
377
+ 'for all workers.')
378
+
406
379
  # Try to create a spot placer from the task yaml. Check if the task yaml
407
380
  # is valid for spot placer.
408
381
  spot_placer.SpotPlacer.from_task(task.service, task)
@@ -447,7 +420,7 @@ def validate_service_task(task: 'sky.Task', pool: bool) -> None:
447
420
  if (task.service.ports is not None or
448
421
  requested_resources.ports is not None):
449
422
  with ux_utils.print_exception_no_traceback():
450
- raise ValueError('Cannot specify ports in a cluster pool.')
423
+ raise ValueError('Cannot specify ports in a pool.')
451
424
 
452
425
 
453
426
  def generate_service_name(pool: bool = False):
@@ -675,6 +648,18 @@ def terminate_replica(service_name: str, replica_id: int, purge: bool) -> str:
675
648
  return message
676
649
 
677
650
 
651
+ def get_yaml_content(service_name: str, version: int) -> str:
652
+ yaml_content = serve_state.get_yaml_content(service_name, version)
653
+ if yaml_content is not None:
654
+ return yaml_content
655
+ # Backward compatibility for old service records that
656
  # do not dump the yaml content to the version database.
657
+ # TODO(tian): Remove this after 2 minor releases, i.e. 0.13.0.
658
+ latest_yaml_path = generate_task_yaml_file_name(service_name, version)
659
+ with open(latest_yaml_path, 'r', encoding='utf-8') as f:
660
+ return f.read()
661
+
662
+
678
663
  def _get_service_status(
679
664
  service_name: str,
680
665
  pool: bool,
@@ -697,21 +682,30 @@ def _get_service_status(
697
682
 
698
683
  record['pool_yaml'] = ''
699
684
  if record['pool']:
700
- latest_yaml_path = generate_task_yaml_file_name(service_name,
701
- record['version'])
702
- raw_yaml_config = yaml_utils.read_yaml(latest_yaml_path)
703
- original_config = raw_yaml_config.get('_user_specified_yaml')
704
- if original_config is None:
705
- # Fall back to old display format.
706
- original_config = raw_yaml_config
707
- original_config.pop('run', None)
708
- svc: Dict[str, Any] = original_config.pop('service')
709
- if svc is not None:
710
- svc.pop('pool', None) # Remove pool from service config
711
- original_config['pool'] = svc # Add pool to root config
685
+ version = record['version']
686
+ try:
687
+ yaml_content = get_yaml_content(service_name, version)
688
+ raw_yaml_config = yaml_utils.read_yaml_str(yaml_content)
689
+ except Exception as e: # pylint: disable=broad-except
690
  # If this is a consolidation mode running without a PVC, the file
691
  # might be lost after an API server update (restart). In such case, we
692
+ # don't want it to crash the command. Fall back to an empty string.
693
+ logger.error(f'Failed to read YAML for service {service_name} '
694
+ f'with version {version}: {e}')
695
+ record['pool_yaml'] = ''
712
696
  else:
713
- original_config = yaml_utils.safe_load(original_config)
714
- record['pool_yaml'] = yaml_utils.dump_yaml_str(original_config)
697
+ original_config = raw_yaml_config.get('_user_specified_yaml')
698
+ if original_config is None:
699
+ # Fall back to old display format.
700
+ original_config = raw_yaml_config
701
+ original_config.pop('run', None)
702
+ svc: Dict[str, Any] = original_config.pop('service')
703
+ if svc is not None:
704
+ svc.pop('pool', None) # Remove pool from service config
705
+ original_config['pool'] = svc # Add pool to root config
706
+ else:
707
+ original_config = yaml_utils.safe_load(original_config)
708
+ record['pool_yaml'] = yaml_utils.dump_yaml_str(original_config)
715
709
 
716
710
  record['target_num_replicas'] = 0
717
711
  try:
@@ -740,8 +734,8 @@ def _get_service_status(
740
734
  return record
741
735
 
742
736
 
743
- def get_service_status_encoded(service_names: Optional[List[str]],
744
- pool: bool) -> str:
737
+ def get_service_status_pickled(service_names: Optional[List[str]],
738
+ pool: bool) -> List[Dict[str, str]]:
745
739
  service_statuses: List[Dict[str, str]] = []
746
740
  if service_names is None:
747
741
  # Get all service names
@@ -754,14 +748,34 @@ def get_service_status_encoded(service_names: Optional[List[str]],
754
748
  k: base64.b64encode(pickle.dumps(v)).decode('utf-8')
755
749
  for k, v in service_status.items()
756
750
  })
757
- service_statuses = sorted(service_statuses, key=lambda x: x['name'])
751
+ return sorted(service_statuses, key=lambda x: x['name'])
752
+
753
+
754
+ # TODO (kyuds): remove when serve codegen is removed
755
+ def get_service_status_encoded(service_names: Optional[List[str]],
756
+ pool: bool) -> str:
758
757
  # We have to use payload_type here to avoid the issue of
759
758
  # message_utils.decode_payload() not being able to correctly decode the
760
759
  # message with <sky-payload> tags.
760
+ service_statuses = get_service_status_pickled(service_names, pool)
761
761
  return message_utils.encode_payload(service_statuses,
762
762
  payload_type='service_status')
763
763
 
764
764
 
765
+ def unpickle_service_status(
766
+ payload: List[Dict[str, str]]) -> List[Dict[str, Any]]:
767
+ service_statuses: List[Dict[str, Any]] = []
768
+ for service_status in payload:
769
+ if not isinstance(service_status, dict):
770
+ raise ValueError(f'Invalid service status: {service_status}')
771
+ service_statuses.append({
772
+ k: pickle.loads(base64.b64decode(v))
773
+ for k, v in service_status.items()
774
+ })
775
+ return service_statuses
776
+
777
+
778
+ # TODO (kyuds): remove when serve codegen is removed
765
779
  def load_service_status(payload: str) -> List[Dict[str, Any]]:
766
780
  try:
767
781
  service_statuses_encoded = message_utils.decode_payload(
@@ -773,22 +787,16 @@ def load_service_status(payload: str) -> List[Dict[str, Any]]:
773
787
  service_statuses_encoded = message_utils.decode_payload(payload)
774
788
  else:
775
789
  raise
776
- service_statuses: List[Dict[str, Any]] = []
777
- for service_status in service_statuses_encoded:
778
- if not isinstance(service_status, dict):
779
- raise ValueError(f'Invalid service status: {service_status}')
780
- service_statuses.append({
781
- k: pickle.loads(base64.b64decode(v))
782
- for k, v in service_status.items()
783
- })
784
- return service_statuses
790
+ return unpickle_service_status(service_statuses_encoded)
785
791
 
786
792
 
793
+ # TODO (kyuds): remove when serve codegen is removed
787
794
  def add_version_encoded(service_name: str) -> str:
788
795
  new_version = serve_state.add_version(service_name)
789
796
  return message_utils.encode_payload(new_version)
790
797
 
791
798
 
799
+ # TODO (kyuds): remove when serve codegen is removed
792
800
  def load_version_string(payload: str) -> str:
793
801
  return message_utils.decode_payload(payload)
794
802
 
@@ -821,7 +829,7 @@ def get_next_cluster_name(service_name: str, job_id: int) -> Optional[str]:
821
829
  logger.error(f'Service {service_name!r} does not exist.')
822
830
  return None
823
831
  if not service_status['pool']:
824
- logger.error(f'Service {service_name!r} is not a cluster pool.')
832
+ logger.error(f'Service {service_name!r} is not a pool.')
825
833
  return None
826
834
  with filelock.FileLock(get_service_filelock_path(service_name)):
827
835
  logger.debug(f'Get next cluster name for pool {service_name!r}')
@@ -877,8 +885,8 @@ def _terminate_failed_services(
877
885
  # replicas, so we don't need to try again here.
878
886
  for replica_info in serve_state.get_replica_infos(service_name):
879
887
  # TODO(tian): Refresh latest status of the cluster.
880
- if global_user_state.get_cluster_from_name(
881
- replica_info.cluster_name) is not None:
888
+ if global_user_state.cluster_with_name_exists(
889
+ replica_info.cluster_name):
882
890
  remaining_replica_clusters.append(f'{replica_info.cluster_name!r}')
883
891
  serve_state.remove_replica(service_name, replica_info.replica_id)
884
892
 
@@ -994,6 +1002,8 @@ def wait_service_registration(service_name: str, job_id: int,
994
1002
  Returns:
995
1003
  Encoded load balancer port assigned to the service.
996
1004
  """
1005
+ # TODO (kyuds): when codegen is fully deprecated, return the lb port
1006
+ # as an int directly instead of encoding it.
997
1007
  start_time = time.time()
998
1008
  setup_completed = False
999
1009
  noun = 'pool' if pool else 'service'
@@ -1105,17 +1115,17 @@ def get_latest_version_with_min_replicas(
1105
1115
  return active_versions[-1] if active_versions else None
1106
1116
 
1107
1117
 
1108
- def _process_line(line: str,
1109
- cluster_name: str,
1110
- stop_on_eof: bool = False) -> Iterator[str]:
1118
+ def _process_line(
1119
+ line: str,
1120
+ cluster_name: str,
1121
+ stop_on_eof: bool = False,
1122
+ streamed_provision_log_paths: Optional[set] = None) -> Iterator[str]:
1111
1123
  # The line might be directing users to view logs, like
1112
1124
  # `✓ Cluster launched: new-http. View logs at: *.log`
1113
1125
  # We should tail the detailed logs for user.
1114
1126
  def cluster_is_up() -> bool:
1115
- cluster_record = global_user_state.get_cluster_from_name(cluster_name)
1116
- if cluster_record is None:
1117
- return False
1118
- return cluster_record['status'] == status_lib.ClusterStatus.UP
1127
+ status = global_user_state.get_status_from_cluster_name(cluster_name)
1128
+ return status == status_lib.ClusterStatus.UP
1119
1129
 
1120
1130
  provision_api_log_prompt = re.match(_SKYPILOT_PROVISION_API_LOG_PATTERN,
1121
1131
  line)
@@ -1124,6 +1134,20 @@ def _process_line(line: str,
1124
1134
  log_prompt = re.match(_SKYPILOT_LOG_PATTERN, line)
1125
1135
 
1126
1136
  def _stream_provision_path(p: pathlib.Path) -> Iterator[str]:
1137
+ # Check if this provision log has already been streamed to avoid
1138
+ # duplicate expansion. When a Kubernetes cluster needs to pull a Docker
1139
+ # image, rich spinner updates can produce hundreds of lines matching
1140
+ # _SKYPILOT_PROVISION_LOG_CMD_PATTERN (e.g., "Launching (1 pod(s)
1141
+ # pending due to Pulling)... View logs: sky logs --provision ...").
1142
+ # Without this check, the same provision log would be expanded hundreds
1143
+ # of times, creating huge log files (30M+) and making users think the
1144
+ # system is stuck in an infinite loop.
1145
+ if streamed_provision_log_paths is not None:
1146
+ resolved_path = str(p.resolve())
1147
+ if resolved_path in streamed_provision_log_paths:
1148
+ return
1149
+ streamed_provision_log_paths.add(resolved_path)
1150
+
1127
1151
  try:
1128
1152
  with open(p, 'r', newline='', encoding='utf-8') as f:
1129
1153
  # Exit if >10s without new content to avoid hanging when INIT
@@ -1195,9 +1219,14 @@ def _follow_logs_with_provision_expanding(
1195
1219
  Yields:
1196
1220
  Log lines, including expanded content from referenced provision logs.
1197
1221
  """
1222
+ streamed_provision_log_paths: set = set()
1198
1223
 
1199
1224
  def process_line(line: str) -> Iterator[str]:
1200
- yield from _process_line(line, cluster_name, stop_on_eof=stop_on_eof)
1225
+ yield from _process_line(
1226
+ line,
1227
+ cluster_name,
1228
+ stop_on_eof=stop_on_eof,
1229
+ streamed_provision_log_paths=streamed_provision_log_paths)
1201
1230
 
1202
1231
  return log_utils.follow_logs(file,
1203
1232
  should_stop=should_stop,
@@ -1223,11 +1252,14 @@ def _capped_follow_logs_with_provision_expanding(
1223
1252
  Log lines, including expanded content from referenced provision logs.
1224
1253
  """
1225
1254
  all_lines: Deque[str] = collections.deque(maxlen=line_cap)
1255
+ streamed_provision_log_paths: set = set()
1226
1256
 
1227
1257
  for line in log_list:
1228
- for processed in _process_line(line=line,
1229
- cluster_name=cluster_name,
1230
- stop_on_eof=False):
1258
+ for processed in _process_line(
1259
+ line=line,
1260
+ cluster_name=cluster_name,
1261
+ stop_on_eof=False,
1262
+ streamed_provision_log_paths=streamed_provision_log_paths):
1231
1263
  all_lines.append(processed)
1232
1264
 
1233
1265
  yield from all_lines
@@ -1308,10 +1340,6 @@ def stream_replica_logs(service_name: str, replica_id: int, follow: bool,
1308
1340
  print(line, end='', flush=True)
1309
1341
  return ''
1310
1342
 
1311
- # For pools, we don't stream the job logs as the run section is ignored.
1312
- if pool:
1313
- return ''
1314
-
1315
1343
  backend = backends.CloudVmRayBackend()
1316
1344
  handle = global_user_state.get_handle_from_cluster_name(
1317
1345
  replica_cluster_name)
@@ -1519,8 +1547,15 @@ def _format_replica_table(replica_records: List[Dict[str, Any]], show_all: bool,
1519
1547
  'handle']
1520
1548
  if replica_handle is not None:
1521
1549
  infra = replica_handle.launched_resources.infra.formatted_str()
1522
- resources_str = resources_utils.get_readable_resources_repr(
1523
- replica_handle, simplify=not show_all)
1550
+ simplified = not show_all
1551
+ resources_str_simple, resources_str_full = (
1552
+ resources_utils.get_readable_resources_repr(
1553
+ replica_handle, simplified_only=simplified))
1554
+ if simplified:
1555
+ resources_str = resources_str_simple
1556
+ else:
1557
+ assert resources_str_full is not None
1558
+ resources_str = resources_str_full
1524
1559
 
1525
1560
  replica_values = [
1526
1561
  service_name,
@@ -1541,6 +1576,7 @@ def _format_replica_table(replica_records: List[Dict[str, Any]], show_all: bool,
1541
1576
 
1542
1577
 
1543
1578
  # =========================== CodeGen for Sky Serve ===========================
1579
+ # TODO (kyuds): deprecate and remove serve codegen entirely.
1544
1580
 
1545
1581
 
1546
1582
  # TODO(tian): Use REST API instead of SSH in the future. This codegen pattern
sky/serve/server/core.py CHANGED
@@ -5,7 +5,9 @@ from typing import Any, Dict, List, Optional, Tuple, Union
5
5
  from sky import backends
6
6
  from sky import exceptions
7
7
  from sky import sky_logging
8
+ from sky.adaptors import common as adaptors_common
8
9
  from sky.backends import backend_utils
10
+ from sky.serve import serve_rpc_utils
9
11
  from sky.serve import serve_utils
10
12
  from sky.serve.server import impl
11
13
  from sky.usage import usage_lib
@@ -13,7 +15,11 @@ from sky.utils import controller_utils
13
15
  from sky.utils import subprocess_utils
14
16
 
15
17
  if typing.TYPE_CHECKING:
18
+ import grpc
19
+
16
20
  import sky
21
+ else:
22
+ grpc = adaptors_common.LazyImport('grpc')
17
23
 
18
24
  logger = sky_logging.init_logger(__name__)
19
25
 
@@ -40,20 +46,23 @@ def up(
40
46
 
41
47
 
42
48
  @usage_lib.entrypoint
43
- def update(
44
- task: 'sky.Task',
45
- service_name: str,
46
- mode: serve_utils.UpdateMode = serve_utils.DEFAULT_UPDATE_MODE) -> None:
49
+ def update(task: Optional['sky.Task'],
50
+ service_name: str,
51
+ mode: serve_utils.UpdateMode = serve_utils.DEFAULT_UPDATE_MODE,
52
+ workers: Optional[int] = None) -> None:
47
53
  """Updates an existing service.
48
54
 
49
55
  Please refer to the sky.cli.serve_update for the document.
50
56
 
51
57
  Args:
52
- task: sky.Task to update.
58
+ task: sky.Task to update, or None if updating
59
+ the number of workers/replicas.
53
60
  service_name: Name of the service.
54
61
  mode: Update mode.
62
+ workers: Number of workers/replicas to set for the service when
63
+ task is None.
55
64
  """
56
- return impl.update(task, service_name, mode, pool=False)
65
+ return impl.update(task, service_name, mode, pool=False, workers=workers)
57
66
 
58
67
 
59
68
  @usage_lib.entrypoint
@@ -105,25 +114,37 @@ def terminate_replica(service_name: str, replica_id: int, purge: bool) -> None:
105
114
  'Please spin up a service first.',
106
115
  )
107
116
 
108
- backend = backend_utils.get_backend_from_handle(handle)
109
- assert isinstance(backend, backends.CloudVmRayBackend)
110
-
111
- code = serve_utils.ServeCodeGen.terminate_replica(service_name, replica_id,
112
- purge)
113
- returncode, stdout, stderr = backend.run_on_head(handle,
114
- code,
115
- require_outputs=True,
116
- stream_logs=False,
117
- separate_stderr=True)
118
-
119
- try:
120
- subprocess_utils.handle_returncode(returncode,
121
- code,
122
- 'Failed to terminate the replica',
123
- stderr,
124
- stream_logs=True)
125
- except exceptions.CommandError as e:
126
- raise RuntimeError(e.error_msg) from e
117
+ assert isinstance(handle, backends.CloudVmRayResourceHandle)
118
+ use_legacy = not handle.is_grpc_enabled_with_flag
119
+
120
+ if not use_legacy:
121
+ try:
122
+ stdout = serve_rpc_utils.RpcRunner.terminate_replica(
123
+ handle, service_name, replica_id, purge)
124
+ except exceptions.SkyletMethodNotImplementedError:
125
+ use_legacy = True
126
+
127
+ if use_legacy:
128
+ backend = backend_utils.get_backend_from_handle(handle)
129
+ assert isinstance(backend, backends.CloudVmRayBackend)
130
+
131
+ code = serve_utils.ServeCodeGen.terminate_replica(
132
+ service_name, replica_id, purge)
133
+ returncode, stdout, stderr = backend.run_on_head(handle,
134
+ code,
135
+ require_outputs=True,
136
+ stream_logs=False,
137
+ separate_stderr=True)
138
+
139
+ try:
140
+ subprocess_utils.handle_returncode(
141
+ returncode,
142
+ code,
143
+ 'Failed to terminate the replica',
144
+ stderr,
145
+ stream_logs=True)
146
+ except exceptions.CommandError as e:
147
+ raise RuntimeError(e.error_msg) from e
127
148
 
128
149
  sky_logging.print(stdout)
129
150