sglang 0.4.7__py3-none-any.whl → 0.4.7.post1__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- sglang/__init__.py +2 -0
- sglang/api.py +7 -0
- sglang/bench_serving.py +1 -1
- sglang/lang/interpreter.py +40 -1
- sglang/lang/ir.py +27 -0
- sglang/math_utils.py +8 -0
- sglang/srt/configs/model_config.py +6 -0
- sglang/srt/conversation.py +6 -0
- sglang/srt/disaggregation/base/__init__.py +1 -1
- sglang/srt/disaggregation/base/conn.py +25 -11
- sglang/srt/disaggregation/common/__init__.py +5 -1
- sglang/srt/disaggregation/common/utils.py +42 -0
- sglang/srt/disaggregation/decode.py +196 -51
- sglang/srt/disaggregation/fake/__init__.py +1 -1
- sglang/srt/disaggregation/fake/conn.py +15 -9
- sglang/srt/disaggregation/mooncake/__init__.py +1 -1
- sglang/srt/disaggregation/mooncake/conn.py +18 -13
- sglang/srt/disaggregation/nixl/__init__.py +6 -1
- sglang/srt/disaggregation/nixl/conn.py +17 -12
- sglang/srt/disaggregation/prefill.py +128 -43
- sglang/srt/disaggregation/utils.py +127 -123
- sglang/srt/entrypoints/engine.py +15 -1
- sglang/srt/entrypoints/http_server.py +13 -2
- sglang/srt/eplb_simulator/__init__.py +1 -0
- sglang/srt/eplb_simulator/reader.py +51 -0
- sglang/srt/layers/activation.py +19 -0
- sglang/srt/layers/attention/aiter_backend.py +15 -2
- sglang/srt/layers/attention/cutlass_mla_backend.py +38 -15
- sglang/srt/layers/attention/flashattention_backend.py +53 -64
- sglang/srt/layers/attention/flashinfer_backend.py +1 -2
- sglang/srt/layers/attention/flashinfer_mla_backend.py +22 -24
- sglang/srt/layers/attention/flashmla_backend.py +2 -10
- sglang/srt/layers/attention/triton_backend.py +119 -119
- sglang/srt/layers/attention/triton_ops/decode_attention.py +2 -7
- sglang/srt/layers/attention/vision.py +51 -24
- sglang/srt/layers/communicator.py +23 -5
- sglang/srt/layers/linear.py +0 -4
- sglang/srt/layers/logits_processor.py +0 -12
- sglang/srt/layers/moe/ep_moe/kernels.py +6 -5
- sglang/srt/layers/moe/ep_moe/layer.py +42 -32
- sglang/srt/layers/moe/ep_moe/token_dispatcher.py +11 -37
- sglang/srt/layers/moe/fused_moe_triton/fused_moe.py +1 -4
- sglang/srt/layers/moe/topk.py +16 -8
- sglang/srt/layers/pooler.py +56 -0
- sglang/srt/layers/quantization/deep_gemm_wrapper/__init__.py +1 -0
- sglang/srt/layers/quantization/{deep_gemm.py → deep_gemm_wrapper/compile_utils.py} +23 -80
- sglang/srt/layers/quantization/deep_gemm_wrapper/configurer.py +32 -0
- sglang/srt/layers/quantization/deep_gemm_wrapper/entrypoint.py +110 -0
- sglang/srt/layers/quantization/fp8_kernel.py +44 -15
- sglang/srt/layers/quantization/fp8_utils.py +87 -22
- sglang/srt/layers/radix_attention.py +2 -3
- sglang/srt/lora/lora_manager.py +79 -34
- sglang/srt/lora/mem_pool.py +4 -5
- sglang/srt/managers/cache_controller.py +2 -1
- sglang/srt/managers/io_struct.py +28 -4
- sglang/srt/managers/multimodal_processors/base_processor.py +2 -2
- sglang/srt/managers/multimodal_processors/vila.py +85 -0
- sglang/srt/managers/schedule_batch.py +39 -6
- sglang/srt/managers/scheduler.py +73 -17
- sglang/srt/managers/tokenizer_manager.py +29 -2
- sglang/srt/mem_cache/chunk_cache.py +1 -0
- sglang/srt/mem_cache/hiradix_cache.py +4 -2
- sglang/srt/mem_cache/memory_pool.py +111 -407
- sglang/srt/mem_cache/memory_pool_host.py +380 -0
- sglang/srt/mem_cache/radix_cache.py +36 -12
- sglang/srt/model_executor/cuda_graph_runner.py +122 -55
- sglang/srt/model_executor/forward_batch_info.py +14 -5
- sglang/srt/model_executor/model_runner.py +6 -6
- sglang/srt/model_loader/loader.py +8 -1
- sglang/srt/models/bert.py +113 -13
- sglang/srt/models/deepseek_v2.py +113 -155
- sglang/srt/models/internvl.py +46 -102
- sglang/srt/models/roberta.py +117 -9
- sglang/srt/models/vila.py +305 -0
- sglang/srt/openai_api/adapter.py +162 -4
- sglang/srt/openai_api/protocol.py +37 -1
- sglang/srt/sampling/sampling_batch_info.py +24 -0
- sglang/srt/sampling/sampling_params.py +2 -0
- sglang/srt/server_args.py +318 -233
- sglang/srt/speculative/build_eagle_tree.py +1 -1
- sglang/srt/speculative/eagle_draft_cuda_graph_runner.py +4 -3
- sglang/srt/speculative/eagle_draft_extend_cuda_graph_runner.py +5 -2
- sglang/srt/speculative/eagle_utils.py +389 -109
- sglang/srt/speculative/eagle_worker.py +134 -43
- sglang/srt/two_batch_overlap.py +4 -2
- sglang/srt/utils.py +58 -0
- sglang/test/attention/test_prefix_chunk_info.py +2 -0
- sglang/test/runners.py +38 -3
- sglang/test/test_block_fp8.py +1 -0
- sglang/test/test_block_fp8_deep_gemm_blackwell.py +252 -0
- sglang/test/test_block_fp8_ep.py +1 -0
- sglang/test/test_utils.py +3 -1
- sglang/utils.py +9 -0
- sglang/version.py +1 -1
- {sglang-0.4.7.dist-info → sglang-0.4.7.post1.dist-info}/METADATA +5 -5
- {sglang-0.4.7.dist-info → sglang-0.4.7.post1.dist-info}/RECORD +99 -88
- {sglang-0.4.7.dist-info → sglang-0.4.7.post1.dist-info}/WHEEL +0 -0
- {sglang-0.4.7.dist-info → sglang-0.4.7.post1.dist-info}/licenses/LICENSE +0 -0
- {sglang-0.4.7.dist-info → sglang-0.4.7.post1.dist-info}/top_level.txt +0 -0
sglang/srt/disaggregation/nixl/conn.py

```diff
@@ -24,10 +24,8 @@ from sglang.srt.disaggregation.common.conn import (
     CommonKVManager,
     CommonKVReceiver,
 )
-from sglang.srt.disaggregation.utils import (
-    DisaggregationMode,
-    group_concurrent_contiguous,
-)
+from sglang.srt.disaggregation.common.utils import group_concurrent_contiguous
+from sglang.srt.disaggregation.utils import DisaggregationMode
 from sglang.srt.server_args import ServerArgs
 from sglang.srt.utils import get_local_ip_by_remote

@@ -46,7 +44,7 @@ class TransferInfo:
     agent_metadata: bytes
     agent_name: str
     dst_kv_ptrs: list[int]
-    dst_kv_indices: npt.NDArray[np.
+    dst_kv_indices: npt.NDArray[np.int32]
     dst_aux_ptrs: list[int]
     dst_aux_index: int
     dst_gpu_id: int
@@ -64,7 +62,7 @@ class TransferInfo:
             agent_metadata=msg[3],
             agent_name=msg[4].decode("ascii"),
             dst_kv_ptrs=list(struct.unpack(f"{len(msg[5])//8}Q", msg[5])),
-            dst_kv_indices=np.frombuffer(msg[6], dtype=np.
+            dst_kv_indices=np.frombuffer(msg[6], dtype=np.int32),
             dst_aux_ptrs=list(struct.unpack(f"{len(msg[7])//8}Q", msg[7])),
             dst_aux_index=int(msg[8].decode("ascii")),
             dst_gpu_id=int(msg[9].decode("ascii")),
@@ -164,9 +162,9 @@ class NixlKVManager(CommonKVManager):
     def send_kvcache(
         self,
         peer_name: str,
-        prefill_kv_indices: npt.NDArray[np.
+        prefill_kv_indices: npt.NDArray[np.int32],
         dst_kv_ptrs: list[int],
-        dst_kv_indices: npt.NDArray[np.
+        dst_kv_indices: npt.NDArray[np.int32],
         dst_gpu_id: int,
         notif: str,
     ):
@@ -248,7 +246,7 @@ class NixlKVManager(CommonKVManager):
     def add_transfer_request(
         self,
         bootstrap_room: int,
-        kv_indices: npt.NDArray[np.
+        kv_indices: npt.NDArray[np.int32],
         index_slice: slice,
         is_last: bool,
         chunk_id: int,
@@ -350,7 +348,14 @@ class NixlKVManager(CommonKVManager):

 class NixlKVSender(BaseKVSender):

-    def __init__(
+    def __init__(
+        self,
+        mgr: NixlKVManager,
+        bootstrap_addr: str,
+        bootstrap_room: int,
+        dest_tp_ranks: List[int],
+        pp_rank: int,
+    ):
         self.kv_mgr = mgr
         self.bootstrap_room = bootstrap_room
         self.aux_index = None
@@ -368,7 +373,7 @@ class NixlKVSender(BaseKVSender):

     def send(
         self,
-        kv_indices: npt.NDArray[np.
+        kv_indices: npt.NDArray[np.int32],
     ):
         index_slice = slice(self.curr_idx, self.curr_idx + len(kv_indices))
         self.curr_idx += len(kv_indices)
@@ -412,7 +417,7 @@ class NixlKVReceiver(CommonKVReceiver):
         self.started_transfer = False
        super().__init__(mgr, bootstrap_addr, bootstrap_room, data_parallel_rank)

-    def init(self, kv_indices: npt.NDArray[np.
+    def init(self, kv_indices: npt.NDArray[np.int32], aux_index: Optional[int] = None):
         for bootstrap_info in self.bootstrap_infos:
             self.prefill_server_url = (
                 f"{bootstrap_info['rank_ip']}:{bootstrap_info['rank_port']}"
```
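The recurring change in this file narrows the KV-index annotations to `npt.NDArray[np.int32]` (the removed side is truncated in this view, so the previous dtype is not shown), and `NixlKVSender.__init__` gains explicit `dest_tp_ranks` and `pp_rank` parameters. A minimal, self-contained sketch of what the int32 wire format implies for the `msg[6]` round-trip seen in `TransferInfo.from_zmq`; the sample values and the use of `tobytes()` on the sending side are illustrative assumptions, not taken from the package:

```python
import numpy as np

# The post1 annotations make dst_kv_indices an int32 array, and the receiver
# reconstructs it with np.frombuffer(msg[6], dtype=np.int32) as shown above.
dst_kv_indices = np.array([40960, 40961, 40962, 40963], dtype=np.int32)

payload = dst_kv_indices.tobytes()                 # 4 bytes per index on the wire
restored = np.frombuffer(payload, dtype=np.int32)  # mirrors the msg[6] decoding path

assert restored.tolist() == dst_kv_indices.tolist()
assert len(payload) == dst_kv_indices.size * 4
```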
sglang/srt/disaggregation/prefill.py

```diff
@@ -25,12 +25,13 @@ from collections import deque
 from http import HTTPStatus
 from typing import TYPE_CHECKING, List, Optional

+import numpy as np
 import torch

-from sglang.srt.disaggregation.base import BaseKVManager,
+from sglang.srt.disaggregation.base import BaseKVManager, KVPoll
 from sglang.srt.disaggregation.utils import (
+    FAKE_BOOTSTRAP_HOST,
     DisaggregationMode,
-    FakeBootstrapHost,
     KVClassType,
     MetadataBuffers,
     ReqToMetadataIdxAllocator,
@@ -51,7 +52,6 @@ if TYPE_CHECKING:
     from sglang.srt.managers.scheduler import GenerationBatchResult, Scheduler
     from sglang.srt.mem_cache.memory_pool import KVCache

-
 logger = logging.getLogger(__name__)


@@ -68,35 +68,45 @@ class PrefillBootstrapQueue:
         metadata_buffers: MetadataBuffers,
         tp_rank: int,
         tp_size: int,
+        gpu_id: int,
         bootstrap_port: int,
         gloo_group: ProcessGroup,
-
+        max_total_num_tokens: int,
+        decode_tp_size: int,
+        decode_dp_size: int,
         scheduler: Scheduler,
+        pp_rank: int,
+        pp_size: int,
+        transfer_backend: TransferBackend,
     ):
         self.token_to_kv_pool = token_to_kv_pool
         self.draft_token_to_kv_pool = draft_token_to_kv_pool
-
         self.is_mla_backend = is_mla_backend(token_to_kv_pool)
-
         self.metadata_buffers = metadata_buffers
         self.req_to_metadata_buffer_idx_allocator = req_to_metadata_buffer_idx_allocator
         self.tp_rank = tp_rank
         self.tp_size = tp_size
-        self.
-        self.
-        self.
+        self.decode_tp_size = decode_tp_size
+        self.decode_dp_size = decode_dp_size
+        self.pp_rank = pp_rank
+        self.pp_size = pp_size
+        self.gpu_id = gpu_id
+        self.bootstrap_port = bootstrap_port
         self.queue: List[Req] = []
+        self.pp_rank = pp_rank
+        self.pp_size = pp_size
         self.gloo_group = gloo_group
-        self.
-
-
-
-        output_id_buffer = self.metadata_buffers[0]
-        output_id_buffer[idx] = token_id
+        self.max_total_num_tokens = max_total_num_tokens
+        self.scheduler = scheduler
+        self.transfer_backend = transfer_backend
+        self.kv_manager = self._init_kv_manager()

     def _init_kv_manager(self) -> BaseKVManager:
-
+        kv_args_class = get_kv_class(self.transfer_backend, KVClassType.KVARGS)
+        kv_args = kv_args_class()
         kv_args.engine_rank = self.tp_rank
+        kv_args.decode_tp_size = self.decode_tp_size // self.decode_dp_size
+        kv_args.prefill_pp_size = self.pp_size
         kv_data_ptrs, kv_data_lens, kv_item_lens = (
             self.token_to_kv_pool.get_contiguous_buf_infos()
         )
```
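`_init_kv_manager` now advertises the decode side's per-data-parallel-group tensor-parallel size as `kv_args.decode_tp_size = self.decode_tp_size // self.decode_dp_size`, alongside `prefill_pp_size`. A tiny worked example of that division, with made-up cluster sizes:

```python
# Made-up cluster shape: 16 decode TP ranks organized as 2 data-parallel groups.
decode_tp_size = 16  # total decode tensor-parallel ranks
decode_dp_size = 2   # decode data-parallel groups

# What the prefill side would report to the KV manager under the new code path:
decode_tp_size_per_dp_group = decode_tp_size // decode_dp_size
assert decode_tp_size_per_dp_group == 8
```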
```diff
@@ -115,12 +125,12 @@ class PrefillBootstrapQueue:
         kv_args.kv_data_lens = kv_data_lens
         kv_args.kv_item_lens = kv_item_lens

-        # Define req -> input ids buffer
         kv_args.aux_data_ptrs, kv_args.aux_data_lens, kv_args.aux_item_lens = (
             self.metadata_buffers.get_buf_infos()
         )
         kv_args.ib_device = self.scheduler.server_args.disaggregation_ib_device
         kv_args.gpu_id = self.scheduler.gpu_id
+
         kv_manager_class = get_kv_class(self.transfer_backend, KVClassType.MANAGER)
         kv_manager = kv_manager_class(
             kv_args,
@@ -130,23 +140,39 @@ class PrefillBootstrapQueue:
         )
         return kv_manager

-    def add(self, req: Req) -> None:
-        if req
-
+    def add(self, req: Req, num_kv_heads: int) -> None:
+        if self._check_if_req_exceed_kv_capacity(req):
+            return
+
+        if req.bootstrap_host == FAKE_BOOTSTRAP_HOST:
             kv_sender_class = get_kv_class(TransferBackend.FAKE, KVClassType.SENDER)
         else:
             kv_sender_class = get_kv_class(self.transfer_backend, KVClassType.SENDER)
+
+        dest_tp_ranks = [self.tp_rank]
+
         req.disagg_kv_sender = kv_sender_class(
             mgr=self.kv_manager,
             bootstrap_addr=f"{req.bootstrap_host}:{self.bootstrap_port}",
             bootstrap_room=req.bootstrap_room,
+            dest_tp_ranks=dest_tp_ranks,
+            pp_rank=self.pp_rank,
         )
         self._process_req(req)
         self.queue.append(req)

-    def extend(self, reqs: List[Req]) -> None:
+    def extend(self, reqs: List[Req], num_kv_heads: int) -> None:
         for req in reqs:
-            self.add(req)
+            self.add(req, num_kv_heads)
+
+    def _check_if_req_exceed_kv_capacity(self, req: Req) -> bool:
+        if len(req.origin_input_ids) > self.max_total_num_tokens:
+            message = f"Request {req.rid} exceeds the maximum number of tokens: {len(req.origin_input_ids)} > {self.max_total_num_tokens}"
+            logger.error(message)
+            prepare_abort(req, message)
+            self.scheduler.stream_output([req], req.return_logprob)
+            return True
+        return False

     def _process_req(self, req: Req) -> None:
         """
```
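`add()` now takes a `num_kv_heads` argument and refuses requests whose prompt alone cannot fit in the KV pool, aborting them before a sender is created. A standalone sketch of that guard; the function name and the stubbed-out abort path are illustrative, not the package's API:

```python
import logging

logger = logging.getLogger(__name__)

def check_if_req_exceeds_kv_capacity(origin_input_ids: list,
                                     max_total_num_tokens: int,
                                     rid: str) -> bool:
    """Loosely mirrors the new _check_if_req_exceed_kv_capacity: reject instead of enqueueing."""
    if len(origin_input_ids) > max_total_num_tokens:
        message = (
            f"Request {rid} exceeds the maximum number of tokens: "
            f"{len(origin_input_ids)} > {max_total_num_tokens}"
        )
        logger.error(message)
        # In the real code: prepare_abort(req, message); scheduler.stream_output([req], ...)
        return True
    return False

# A prompt longer than the KV pool is refused before any KV sender is constructed.
assert check_if_req_exceeds_kv_capacity(list(range(5000)), 4096, "req-1") is True
assert check_if_req_exceeds_kv_capacity(list(range(100)), 4096, "req-2") is False
```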
```diff
@@ -154,19 +180,40 @@ class PrefillBootstrapQueue:
         """
         req.sampling_params.max_new_tokens = 1

-    def pop_bootstrapped(
-
+    def pop_bootstrapped(
+        self,
+        return_failed_reqs: bool = False,
+        rids_to_check: Optional[List[str]] = None,
+    ) -> List[Req]:
+        """
+        pop the reqs which has finished bootstrapping
+
+        return_failed_reqs: For PP, on rank 0, also return the failed reqs to notify the next rank
+        rids_to_check: For PP, on rank > 0, check the rids from the previous rank has consensus with the current rank.
+        """
+
         bootstrapped_reqs = []
+        failed_reqs = []
         indices_to_remove = set()

         if len(self.queue) == 0:
-
+            if return_failed_reqs is False:
+                return []
+            else:
+                return [], []

         polls = poll_and_all_reduce(
             [req.disagg_kv_sender for req in self.queue], self.gloo_group
         )
-
         for i, (req, poll) in enumerate(zip(self.queue, polls)):
+
+            if rids_to_check is not None:
+                # if req not in reqs_info_to_check, skip
+                if req.rid not in rids_to_check:
+                    continue
+                # Either waiting for input or failed
+                assert poll == KVPoll.WaitingForInput or poll == KVPoll.Failed
+
             if poll == KVPoll.Bootstrapping:
                 continue
             elif poll == KVPoll.Failed:
@@ -181,9 +228,10 @@ class PrefillBootstrapQueue:
                 )
                 self.scheduler.stream_output([req], req.return_logprob)
                 indices_to_remove.add(i)
+                failed_reqs.append(req)
                 continue

-            # KV.WaitingForInput
+            # KV.WaitingForInput - init here
             num_kv_indices = len(req.origin_input_ids)
             if self.req_to_metadata_buffer_idx_allocator.available_size() == 0:
                 break
@@ -192,9 +240,9 @@ class PrefillBootstrapQueue:
                 self.req_to_metadata_buffer_idx_allocator.alloc()
             )
             assert req.metadata_buffer_index is not None
+
             num_pages = kv_to_page_num(num_kv_indices, self.token_to_kv_pool.page_size)
             req.disagg_kv_sender.init(num_pages, req.metadata_buffer_index)
-
             bootstrapped_reqs.append(req)
             indices_to_remove.add(i)

@@ -202,7 +250,10 @@ class PrefillBootstrapQueue:
             entry for i, entry in enumerate(self.queue) if i not in indices_to_remove
         ]

-
+        if return_failed_reqs is False:
+            return bootstrapped_reqs
+        else:
+            return bootstrapped_reqs, failed_reqs


 class SchedulerDisaggregationPrefillMixin:
```
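`pop_bootstrapped` keeps its old single-list return for ordinary callers but returns a `(bootstrapped, failed)` pair when `return_failed_reqs=True`, and can be restricted to a set of request ids via `rids_to_check`. A toy model of the return-shape contract only, with plain strings standing in for `Req` objects:

```python
from typing import List, Tuple, Union

def pop_bootstrapped_shape(bootstrapped: List[str],
                           failed: List[str],
                           return_failed_reqs: bool = False,
                           ) -> Union[List[str], Tuple[List[str], List[str]]]:
    """Toy model of the new return contract (request ids stand in for Req objects)."""
    if return_failed_reqs is False:
        return bootstrapped
    else:
        return bootstrapped, failed

# Non-PP callers keep the old single-list shape ...
reqs = pop_bootstrapped_shape(["a", "b"], ["c"])
assert reqs == ["a", "b"]

# ... while PP rank 0 also collects the failed requests to forward to the next rank.
reqs, failed = pop_bootstrapped_shape(["a", "b"], ["c"], return_failed_reqs=True)
assert failed == ["c"]
```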
```diff
@@ -211,7 +262,7 @@ class SchedulerDisaggregationPrefillMixin:
     """

     @torch.no_grad()
-    def event_loop_normal_disagg_prefill(self: Scheduler):
+    def event_loop_normal_disagg_prefill(self: Scheduler) -> None:
         """A normal scheduler loop for prefill worker in disaggregation mode."""

         while True:
@@ -229,7 +280,6 @@ class SchedulerDisaggregationPrefillMixin:
                 or self.server_args.enable_sp_layernorm
             ):
                 batch, _ = self.prepare_dp_attn_batch(batch)
-
             self.cur_batch = batch

             if batch:
@@ -242,6 +292,7 @@ class SchedulerDisaggregationPrefillMixin:
             if batch is None and len(self.disagg_prefill_inflight_queue) == 0:
                 self.check_memory()
                 self.new_token_ratio = self.init_new_token_ratio
+                self.maybe_sleep_on_idle()

             self.last_batch = batch
             # HACK (byronhsu): reset the batch_is_full flag because we never enter update_running_batch which resets it
@@ -249,7 +300,7 @@ class SchedulerDisaggregationPrefillMixin:
             self.running_batch.batch_is_full = False

     @torch.no_grad()
-    def event_loop_overlap_disagg_prefill(self: Scheduler):
+    def event_loop_overlap_disagg_prefill(self: Scheduler) -> None:
         self.result_queue = deque()

         while True:
@@ -267,9 +318,7 @@ class SchedulerDisaggregationPrefillMixin:
                 or self.server_args.enable_sp_layernorm
             ):
                 batch, _ = self.prepare_dp_attn_batch(batch)
-
             self.cur_batch = batch
-
             if batch:
                 result = self.run_batch(batch)
                 self.result_queue.append((batch.copy(), result))
@@ -286,6 +335,9 @@ class SchedulerDisaggregationPrefillMixin:

             if self.last_batch:
                 tmp_batch, tmp_result = self.result_queue.popleft()
+                tmp_batch.next_batch_sampling_info = (
+                    self.tp_worker.cur_sampling_info if batch else None
+                )
                 self.process_batch_result_disagg_prefill(tmp_batch, tmp_result)

             if len(self.disagg_prefill_inflight_queue) > 0:
@@ -294,6 +346,7 @@ class SchedulerDisaggregationPrefillMixin:
             if batch is None and len(self.disagg_prefill_inflight_queue) == 0:
                 self.check_memory()
                 self.new_token_ratio = self.init_new_token_ratio
+                self.maybe_sleep_on_idle()

             self.last_batch = batch
             # HACK (byronhsu): reset the batch_is_full flag because we never enter update_running_batch which resets it
@@ -307,7 +360,7 @@ class SchedulerDisaggregationPrefillMixin:
         launch_done: Optional[threading.Event] = None,
     ) -> None:
         """
-        Transfer kv for prefill completed requests and add it into
+        Transfer kv for prefill completed requests and add it into disagg_prefill_inflight_queue
         Adapted from process_batch_result_prefill
         """
         (
@@ -323,7 +376,7 @@ class SchedulerDisaggregationPrefillMixin:
         )

         logprob_pt = 0
-        # Transfer kv for prefill completed requests and add it into
+        # Transfer kv for prefill completed requests and add it into disagg_prefill_inflight_queue
         if self.enable_overlap:
             # wait
             logits_output, next_token_ids, _ = self.tp_worker.resolve_last_batch_result(
@@ -395,11 +448,15 @@ class SchedulerDisaggregationPrefillMixin:
         # We need to remove the sync in the following function for overlap schedule.
         self.set_next_batch_sampling_info_done(batch)

-    def process_disagg_prefill_inflight_queue(
+    def process_disagg_prefill_inflight_queue(
+        self: Scheduler, rids_to_check: Optional[List[str]] = None
+    ) -> List[Req]:
         """
         Poll the requests in the middle of transfer. If done, return the request.
+        rids_to_check: For PP, on rank > 0, check the rids from the previous rank has consensus with the current rank.
         """
-
+        if len(self.disagg_prefill_inflight_queue) == 0:
+            return []

         done_reqs = []

@@ -411,6 +468,14 @@ class SchedulerDisaggregationPrefillMixin:
         undone_reqs: List[Req] = []
         # Check .poll() for the reqs in disagg_prefill_inflight_queue. If Success, respond to the client and remove it from the queue
         for req, poll in zip(self.disagg_prefill_inflight_queue, polls):
+
+            if rids_to_check is not None:
+                if req.rid not in rids_to_check:
+                    undone_reqs.append(req)
+                    continue
+
+                assert poll == KVPoll.Success or poll == KVPoll.Failed
+
             if poll in [KVPoll.WaitingForInput, KVPoll.Transferring]:
                 undone_reqs.append(req)
             elif poll == KVPoll.Success:  # transfer done
@@ -432,11 +497,8 @@ class SchedulerDisaggregationPrefillMixin:
                     req, error_message, status_code=HTTPStatus.INTERNAL_SERVER_ERROR
                 )
                 done_reqs.append(req)
-
-
-                self.disagg_prefill_bootstrap_queue.req_to_metadata_buffer_idx_allocator.free(
-                    req.metadata_buffer_index
-                )
+            else:
+                assert False, f"Unexpected polling state {poll=}"

         # Stream requests which have finished transfer
         self.stream_output(
@@ -444,9 +506,32 @@ class SchedulerDisaggregationPrefillMixin:
             any(req.return_logprob for req in done_reqs),
             None,
         )
+        for req in done_reqs:
+            req: Req
+            self.req_to_metadata_buffer_idx_allocator.free(req.metadata_buffer_index)
+            req.metadata_buffer_index = -1

         self.disagg_prefill_inflight_queue = undone_reqs

+        return done_reqs
+
+    def get_transferred_rids(self: Scheduler) -> List[str]:
+        """
+        Used by PP, get the transferred rids but **do not pop**
+        """
+        polls = poll_and_all_reduce(
+            [req.disagg_kv_sender for req in self.disagg_prefill_inflight_queue],
+            self.tp_worker.get_tp_group().cpu_group,
+        )
+
+        transferred_rids: List[str] = []
+
+        for req, poll in zip(self.disagg_prefill_inflight_queue, polls):
+            if poll == KVPoll.Success or poll == KVPoll.Failed:
+                transferred_rids.append(req.rid)
+
+        return transferred_rids
+
     def process_prefill_chunk(self: Scheduler) -> None:
         if self.last_batch and self.last_batch.forward_mode.is_extend():
             if self.chunked_req:
```