sglang 0.5.3.post2__py3-none-any.whl → 0.5.3.post3__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (39)
  1. sglang/bench_one_batch.py +13 -8
  2. sglang/srt/disaggregation/base/conn.py +17 -4
  3. sglang/srt/disaggregation/common/conn.py +1 -0
  4. sglang/srt/disaggregation/decode.py +113 -8
  5. sglang/srt/disaggregation/fake/conn.py +11 -3
  6. sglang/srt/disaggregation/mooncake/conn.py +148 -17
  7. sglang/srt/disaggregation/nixl/conn.py +7 -1
  8. sglang/srt/disaggregation/prefill.py +71 -1
  9. sglang/srt/distributed/device_communicators/custom_all_reduce.py +3 -3
  10. sglang/srt/environ.py +3 -3
  11. sglang/srt/layers/attention/ascend_backend.py +17 -0
  12. sglang/srt/layers/layernorm.py +41 -9
  13. sglang/srt/layers/logits_processor.py +1 -1
  14. sglang/srt/layers/moe/utils.py +4 -2
  15. sglang/srt/layers/rotary_embedding.py +16 -2
  16. sglang/srt/layers/sampler.py +3 -3
  17. sglang/srt/managers/scheduler.py +0 -6
  18. sglang/srt/mem_cache/allocator_ascend.py +1 -1
  19. sglang/srt/mem_cache/common.py +1 -5
  20. sglang/srt/mem_cache/memory_pool.py +248 -137
  21. sglang/srt/model_executor/model_runner.py +28 -13
  22. sglang/srt/model_executor/npu_graph_runner.py +2 -2
  23. sglang/srt/model_loader/weight_utils.py +2 -2
  24. sglang/srt/models/deepseek_v2.py +1 -0
  25. sglang/srt/models/glm4_moe.py +4 -2
  26. sglang/srt/server_args.py +31 -9
  27. sglang/srt/speculative/eagle_worker.py +2 -2
  28. sglang/srt/speculative/spec_info.py +2 -0
  29. sglang/srt/speculative/standalone_worker.py +1 -1
  30. sglang/test/runners.py +1 -1
  31. sglang/test/send_one.py +27 -1
  32. sglang/test/test_disaggregation_utils.py +33 -15
  33. sglang/test/test_utils.py +37 -2
  34. sglang/version.py +1 -1
  35. {sglang-0.5.3.post2.dist-info → sglang-0.5.3.post3.dist-info}/METADATA +1 -1
  36. {sglang-0.5.3.post2.dist-info → sglang-0.5.3.post3.dist-info}/RECORD +39 -39
  37. {sglang-0.5.3.post2.dist-info → sglang-0.5.3.post3.dist-info}/WHEEL +0 -0
  38. {sglang-0.5.3.post2.dist-info → sglang-0.5.3.post3.dist-info}/licenses/LICENSE +0 -0
  39. {sglang-0.5.3.post2.dist-info → sglang-0.5.3.post3.dist-info}/top_level.txt +0 -0
sglang/bench_one_batch.py CHANGED
@@ -72,6 +72,8 @@ from sglang.srt.speculative.spec_info import SpeculativeAlgorithm
 from sglang.srt.utils import (
     configure_logger,
     get_bool_env_var,
+    is_cuda_alike,
+    is_xpu,
     kill_process_tree,
     require_mlp_sync,
     require_mlp_tp_gather,
@@ -80,6 +82,15 @@ from sglang.srt.utils import (
 )
 from sglang.srt.utils.hf_transformers_utils import get_tokenizer
 
+profile_activities = [torch.profiler.ProfilerActivity.CPU] + [
+    profiler_activity
+    for available, profiler_activity in [
+        (is_cuda_alike(), torch.profiler.ProfilerActivity.CUDA),
+        (is_xpu(), torch.profiler.ProfilerActivity.XPU),
+    ]
+    if available
+]
+
 
 @dataclasses.dataclass
 class BenchArgs:
@@ -424,10 +435,7 @@ def latency_test_run_once(
     profiler = None
     if profile:
         profiler = torch.profiler.profile(
-            activities=[
-                torch.profiler.ProfilerActivity.CPU,
-                torch.profiler.ProfilerActivity.CUDA,
-            ],
+            activities=profile_activities,
             with_stack=True,
             record_shapes=profile_record_shapes,
         )
@@ -460,10 +468,7 @@
         if profile and i == output_len / 2:
             profiler = None
             profiler = torch.profiler.profile(
-                activities=[
-                    torch.profiler.ProfilerActivity.CPU,
-                    torch.profiler.ProfilerActivity.CUDA,
-                ],
+                activities=profile_activities,
                 with_stack=True,
                 record_shapes=profile_record_shapes,
            )
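The new module-level profile_activities list builds the profiler activity set once: CPU is always profiled, and the CUDA/XPU activities are appended only where the corresponding backend is available, so both call sites in latency_test_run_once can share it. A minimal self-contained sketch of the same pattern follows; the two probe stubs are assumptions for illustration (the real helpers live in sglang.srt.utils), and ProfilerActivity.XPU requires a recent PyTorch build:

    import torch

    def is_cuda_alike() -> bool:
        # Stub for sglang.srt.utils.is_cuda_alike (CUDA or ROCm); assumption for this sketch.
        return torch.cuda.is_available()

    def is_xpu() -> bool:
        # Stub for sglang.srt.utils.is_xpu; assumption for this sketch.
        return hasattr(torch, "xpu") and torch.xpu.is_available()

    # Always profile CPU; add each accelerator activity only when present.
    profile_activities = [torch.profiler.ProfilerActivity.CPU] + [
        activity
        for available, activity in [
            (is_cuda_alike(), torch.profiler.ProfilerActivity.CUDA),
            (is_xpu(), torch.profiler.ProfilerActivity.XPU),
        ]
        if available
    ]

    with torch.profiler.profile(activities=profile_activities) as prof:
        torch.ones(4) @ torch.ones(4, 4)  # any small workload
    print(prof.key_averages().table(row_limit=5))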
sglang/srt/disaggregation/base/conn.py CHANGED
@@ -20,6 +20,10 @@ class KVArgs:
     aux_data_ptrs: List[int]
     aux_data_lens: List[int]
     aux_item_lens: List[int]
+    state_data_ptrs: List[int]
+    state_data_lens: List[int]
+    state_item_lens: List[int]
+    state_type: str  # "none", "mamba", "swa"
     ib_device: str
     ib_traffic_class: str
     gpu_id: int
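To make the four new fields concrete, here is a hypothetical population for a Mamba-hybrid worker, assuming KVArgs stays an annotation-only class that can be instantiated bare; every numeric value below is invented for illustration:

    kv_args = KVArgs()
    # One entry per state buffer registered with the transfer engine.
    kv_args.state_data_ptrs = [0x7F0000000000]    # device address of the state pool (invented)
    kv_args.state_data_lens = [64 * 1024 * 1024]  # total bytes in that pool (invented)
    kv_args.state_item_lens = [256 * 1024]        # bytes per per-request state slot (invented)
    kv_args.state_type = "mamba"                  # decode.py also sets "swa", "nsa", or "none"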
@@ -76,9 +80,13 @@ class BaseKVSender(ABC):
         ...
 
     @abstractmethod
-    def send(self, kv_indices: npt.NDArray[np.int32]):
+    def send(
+        self,
+        kv_indices: npt.NDArray[np.int32],
+        state_indices: Optional[List[int]] = None,
+    ):
         """
-        Send the kv cache at the given kv indices to the decoder server
+        Send the kv cache at the given kv indices and the extra cache/state at the given indices to the decoder server
         """
         ...
 
@@ -108,9 +116,14 @@ class BaseKVReceiver(ABC):
     ): ...
 
     @abstractmethod
-    def init(self, kv_indices: npt.NDArray[np.int32], aux_index: Optional[int] = None):
+    def init(
+        self,
+        kv_indices: npt.NDArray[np.int32],
+        aux_index: Optional[int] = None,
+        state_indices: Optional[List[int]] = None,
+    ):
         """
-        Notify the prefill server about the kv indices and aux index
+        Notify the prefill server about the kv indices, aux index, and state_indices.
         """
         ...
 
sglang/srt/disaggregation/common/conn.py CHANGED
@@ -201,6 +201,7 @@ class CommonKVSender(BaseKVSender):
     def send(
         self,
         kv_indices: npt.NDArray[np.int32],
+        state_indices: Optional[List[int]] = None,
     ):
         pass
 
sglang/srt/disaggregation/decode.py CHANGED
@@ -25,11 +25,12 @@ import time
 from collections import deque
 from dataclasses import dataclass
 from http import HTTPStatus
-from typing import TYPE_CHECKING, List, Optional, Tuple, Type, Union
+from typing import TYPE_CHECKING, Dict, List, Optional, Tuple, Type, Union
 
 import torch
 from torch.distributed import ProcessGroup
 
+from sglang.srt.configs.mamba_utils import Mamba2CacheParams
 from sglang.srt.constants import GPU_MEMORY_TYPE_KV_CACHE
 from sglang.srt.disaggregation.base import BaseKVManager, BaseKVReceiver, KVPoll
 from sglang.srt.disaggregation.utils import (
@@ -47,9 +48,19 @@ from sglang.srt.disaggregation.utils import (
 )
 from sglang.srt.layers.dp_attention import get_attention_tp_size
 from sglang.srt.managers.schedule_batch import FINISH_ABORT, RequestStage, ScheduleBatch
-from sglang.srt.mem_cache.allocator import BaseTokenToKVPoolAllocator
+from sglang.srt.mem_cache.allocator import (
+    BaseTokenToKVPoolAllocator,
+    SWATokenToKVPoolAllocator,
+)
 from sglang.srt.mem_cache.base_prefix_cache import BasePrefixCache
-from sglang.srt.mem_cache.memory_pool import KVCache, ReqToTokenPool
+from sglang.srt.mem_cache.memory_pool import (
+    HybridLinearKVPool,
+    HybridReqToTokenPool,
+    KVCache,
+    NSATokenToKVPool,
+    ReqToTokenPool,
+    SWAKVPool,
+)
 from sglang.srt.model_executor.forward_batch_info import ForwardMode
 from sglang.srt.utils import get_int_env_var, require_mlp_sync
 from sglang.srt.utils.torch_memory_saver_adapter import TorchMemorySaverAdapter
@@ -124,6 +135,35 @@ class DecodeReqToTokenPool:
         self.free_slots = list(range(self.size + self.pre_alloc_size))
 
 
+class HybridMambaDecodeReqToTokenPool(HybridReqToTokenPool):
+
+    def __init__(
+        self,
+        size: int,
+        max_context_len: int,
+        device: str,
+        enable_memory_saver: bool,
+        cache_params: "Mamba2CacheParams",
+        speculative_num_draft_tokens: int,
+        pre_alloc_size: int,
+    ):
+        DecodeReqToTokenPool.__init__(
+            self,
+            size=size,
+            max_context_len=max_context_len,
+            device=device,
+            enable_memory_saver=enable_memory_saver,
+            pre_alloc_size=pre_alloc_size,
+        )
+        self._init_mamba_pool(
+            size + pre_alloc_size, cache_params, device, speculative_num_draft_tokens
+        )
+
+    def clear(self):
+        self.free_slots = list(range(self.size + self.pre_alloc_size))
+        self.mamba_pool.clear()
+
+
 @dataclass
 class DecodeRequest:
     req: Req
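Note the initialization pattern above: the class subclasses HybridReqToTokenPool for its mamba-aware interface, but calls DecodeReqToTokenPool.__init__ directly (not super().__init__()) so that free_slots covers the pre-allocated region. A generic sketch of that borrow-an-initializer pattern, with stand-in classes that are not sglang APIs:

    class SlotPool:  # stand-in for DecodeReqToTokenPool
        def __init__(self, size: int, pre_alloc_size: int):
            self.size = size
            self.pre_alloc_size = pre_alloc_size
            self.free_slots = list(range(size + pre_alloc_size))

    class HybridPool:  # stand-in for HybridReqToTokenPool
        def _init_extra_pool(self, total_slots: int):
            self.extra_pool = [None] * total_slots

    class DecodeHybridPool(HybridPool):  # stand-in for HybridMambaDecodeReqToTokenPool
        def __init__(self, size: int, pre_alloc_size: int):
            # Borrow SlotPool's initializer directly, bypassing the MRO,
            # then run the hybrid-specific setup over all slots.
            SlotPool.__init__(self, size, pre_alloc_size)
            self._init_extra_pool(size + pre_alloc_size)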
@@ -217,6 +257,28 @@ class DecodePreallocQueue:
             self.metadata_buffers.get_buf_infos()
         )
 
+        if hasattr(self.token_to_kv_pool, "get_state_buf_infos"):
+            state_data_ptrs, state_data_lens, state_item_lens = (
+                self.token_to_kv_pool.get_state_buf_infos()
+            )
+            kv_args.state_data_ptrs = state_data_ptrs
+            kv_args.state_data_lens = state_data_lens
+            kv_args.state_item_lens = state_item_lens
+
+            if isinstance(self.token_to_kv_pool, SWAKVPool):
+                kv_args.state_type = "swa"
+            elif isinstance(self.token_to_kv_pool, HybridLinearKVPool):
+                kv_args.state_type = "mamba"
+            elif isinstance(self.token_to_kv_pool, NSATokenToKVPool):
+                kv_args.state_type = "nsa"
+            else:
+                kv_args.state_type = "none"
+        else:
+            kv_args.state_data_ptrs = []
+            kv_args.state_data_lens = []
+            kv_args.state_item_lens = []
+            kv_args.state_type = "none"
+
         kv_args.ib_device = self.scheduler.server_args.disaggregation_ib_device
         kv_args.gpu_id = self.scheduler.gpu_id
         kv_manager_class: Type[BaseKVManager] = get_kv_class(
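The check is duck-typed: any KV pool exposing get_state_buf_infos() that returns parallel lists of (device pointers, total byte lengths, per-slot byte lengths) gets its state buffers advertised, and the isinstance chain only picks the state_type label. A hypothetical minimal pool satisfying that contract (class name and sizes invented):

    import torch

    class ToyStatePool:
        def __init__(self, num_slots: int, slot_bytes: int):
            self.slot_bytes = slot_bytes
            self.state_buf = torch.zeros(num_slots * slot_bytes, dtype=torch.uint8)

        def get_state_buf_infos(self):
            return (
                [self.state_buf.data_ptr()],  # state_data_ptrs
                [self.state_buf.numel()],     # state_data_lens (total bytes)
                [self.slot_bytes],            # state_item_lens (bytes per slot)
            )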
@@ -414,16 +476,56 @@
                 .cpu()
                 .numpy()
             )
+            page_size = self.token_to_kv_pool_allocator.page_size
+
+            # Prepare extra pool indices for hybrid models
+            if isinstance(self.token_to_kv_pool, HybridLinearKVPool):
+                # Mamba hybrid model: single mamba state index
+                state_indices = [
+                    self.req_to_token_pool.req_index_to_mamba_index_mapping[
+                        decode_req.req.req_pool_idx
+                    ]
+                    .cpu()
+                    .numpy()
+                ]
+            elif isinstance(self.token_to_kv_pool, SWAKVPool):
+                # SWA hybrid model: send decode-side SWA window indices
+                seq_len = len(decode_req.req.origin_input_ids)
+                window_size = self.scheduler.sliding_window_size
+
+                window_start = max(0, seq_len - window_size)
+                window_start = (window_start // page_size) * page_size
+                window_kv_indices_full = self.req_to_token_pool.req_to_token[
+                    decode_req.req.req_pool_idx, window_start:seq_len
+                ]
+
+                # Translate to SWA pool indices
+                window_kv_indices_swa = (
+                    self.token_to_kv_pool_allocator.translate_loc_from_full_to_swa(
+                        window_kv_indices_full
+                    )
+                )
+                state_indices = window_kv_indices_swa.cpu().numpy()
+                state_indices = kv_to_page_indices(state_indices, page_size)
+            elif isinstance(self.token_to_kv_pool, NSATokenToKVPool):
+                seq_len = len(decode_req.req.origin_input_ids)
+                kv_indices_full = self.req_to_token_pool.req_to_token[
+                    decode_req.req.req_pool_idx, :seq_len
+                ]
+                state_indices = kv_indices_full.cpu().numpy()
+                state_indices = kv_to_page_indices(state_indices, page_size)
+            else:
+                state_indices = None
 
             decode_req.metadata_buffer_index = (
                 self.req_to_metadata_buffer_idx_allocator.alloc()
             )
             assert decode_req.metadata_buffer_index is not None
-            page_indices = kv_to_page_indices(
-                kv_indices, self.token_to_kv_pool_allocator.page_size
+            page_indices = kv_to_page_indices(kv_indices, page_size)
+            decode_req.kv_receiver.init(
+                page_indices, decode_req.metadata_buffer_index, state_indices
             )
-            decode_req.kv_receiver.init(page_indices, decode_req.metadata_buffer_index)
-
+            decode_req.req.add_latency(RequestStage.DECODE_BOOTSTRAP)
             preallocated_reqs.append(decode_req)
             indices_to_remove.add(i)
             decode_req.req.time_stats.decode_transfer_queue_entry_time = (
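In the SWA branch, the window start is rounded down to a page boundary so the transferred slice always begins on a whole page. Worked through with invented numbers:

    seq_len = 1000      # prompt tokens on the prefill side
    window_size = 256   # SWA sliding window
    page_size = 64

    window_start = max(0, seq_len - window_size)            # 744
    window_start = (window_start // page_size) * page_size  # 704, page-aligned
    assert window_start % page_size == 0
    # 1000 - 704 = 296 tokens (out of 1000) are translated to SWA pool
    # indices and sent, rather than the full sequence.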
@@ -503,7 +605,10 @@
 
     def _pre_alloc(self, req: Req) -> torch.Tensor:
         """Pre-allocate the memory for req_to_token and token_kv_pool"""
-        req_pool_indices = self.req_to_token_pool.alloc(1)
+        if isinstance(self.req_to_token_pool, HybridMambaDecodeReqToTokenPool):
+            req_pool_indices = self.req_to_token_pool.alloc(1, [req])
+        else:
+            req_pool_indices = self.req_to_token_pool.alloc(1)
 
         assert (
             req_pool_indices is not None
sglang/srt/disaggregation/fake/conn.py CHANGED
@@ -48,9 +48,12 @@ class FakeKVSender(BaseKVSender):
     def send(
         self,
         kv_indices: npt.NDArray[np.int32],
+        state_indices: Optional[List[int]] = None,
     ):
         self.has_sent = True
-        logger.debug(f"FakeKVSender send with kv_indices: {kv_indices}")
+        logger.debug(
+            f"FakeKVSender send with kv_indices: {kv_indices}, state_indices: {state_indices}"
+        )
 
     def failure_exception(self):
         raise Exception("Fake KVSender Exception")
@@ -75,10 +78,15 @@ class FakeKVReceiver(BaseKVReceiver):
         logger.debug("FakeKVReceiver poll success")
         return KVPoll.Success
 
-    def init(self, kv_indices: list[int], aux_index: Optional[int] = None):
+    def init(
+        self,
+        kv_indices: list[int],
+        aux_index: Optional[int] = None,
+        state_indices: Optional[List[int]] = None,
+    ):
         self.has_init = True
         logger.debug(
-            f"FakeKVReceiver init with kv_indices: {kv_indices}, aux_index: {aux_index}"
+            f"FakeKVReceiver init with kv_indices: {kv_indices}, aux_index: {aux_index}, state_indices: {state_indices}"
         )
 
     def failure_exception(self):
sglang/srt/disaggregation/mooncake/conn.py CHANGED
@@ -58,6 +58,7 @@ class TransferKVChunk:
     index_slice: slice
     is_last: bool
     prefill_aux_index: Optional[int]
+    state_indices: Optional[List[int]]
 
 
 # decode
@@ -69,6 +70,7 @@ class TransferInfo:
     mooncake_session_id: str
     dst_kv_indices: npt.NDArray[np.int32]
     dst_aux_index: int
+    dst_state_indices: List[int]
     required_dst_info_num: int
     is_dummy: bool
 
@@ -78,9 +80,14 @@
             is_dummy = True
             dst_kv_indices = np.array([], dtype=np.int32)
             dst_aux_index = None
+            dst_state_indices = []
         else:
             dst_kv_indices = np.frombuffer(msg[4], dtype=np.int32)
             dst_aux_index = int(msg[5].decode("ascii"))
+            if msg[6] == b"":
+                dst_state_indices = []
+            else:
+                dst_state_indices = list(np.frombuffer(msg[6], dtype=np.int32))
             is_dummy = False
         return cls(
             room=int(msg[0].decode("ascii")),
@@ -89,7 +96,8 @@
             mooncake_session_id=msg[3].decode("ascii"),
             dst_kv_indices=dst_kv_indices,
             dst_aux_index=dst_aux_index,
-            required_dst_info_num=int(msg[6].decode("ascii")),
+            dst_state_indices=dst_state_indices,
+            required_dst_info_num=int(msg[7].decode("ascii")),
             is_dummy=is_dummy,
         )
 
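Because a new frame carrying the serialized state indices is inserted at position 6, required_dst_info_num moves from msg[6] to msg[7] (the bootstrap-thread parser and KVArgsRegisterInfo shift the same way below). A round-trip sketch of just the framing, with invented frame contents; the real message travels over a ZMQ socket:

    import numpy as np

    state_indices = [3, 4, 5]
    msg = [
        b"1234",                                            # msg[0] bootstrap room
        b"host",                                            # msg[1] endpoint (invented)
        b"5757",                                            # msg[2] dst port (invented)
        b"session-0",                                       # msg[3] mooncake session id
        np.array([7, 8], dtype=np.int32).tobytes(),         # msg[4] dst kv indices
        b"0",                                               # msg[5] dst aux index
        np.array(state_indices, dtype=np.int32).tobytes(),  # msg[6] NEW: dst state indices
        b"1",                                               # msg[7] required_dst_info_num (was msg[6])
    ]

    # Prefill-side decoding, mirroring TransferInfo.from_zmq:
    dst_state_indices = [] if msg[6] == b"" else list(np.frombuffer(msg[6], dtype=np.int32))
    required_dst_info_num = int(msg[7].decode("ascii"))
    assert list(dst_state_indices) == [3, 4, 5] and required_dst_info_num == 1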
@@ -103,6 +111,7 @@ class KVArgsRegisterInfo:
     mooncake_session_id: str
     dst_kv_ptrs: list[int]
     dst_aux_ptrs: list[int]
+    dst_state_data_ptrs: list[int]
     dst_tp_rank: int
     dst_attn_tp_size: int
     dst_kv_item_len: int
@@ -116,9 +125,10 @@
             mooncake_session_id=msg[3].decode("ascii"),
             dst_kv_ptrs=list(struct.unpack(f"{len(msg[4])//8}Q", msg[4])),
             dst_aux_ptrs=list(struct.unpack(f"{len(msg[5])//8}Q", msg[5])),
-            dst_tp_rank=int(msg[6].decode("ascii")),
-            dst_attn_tp_size=int(msg[7].decode("ascii")),
-            dst_kv_item_len=int(msg[8].decode("ascii")),
+            dst_state_data_ptrs=list(struct.unpack(f"{len(msg[6])//8}Q", msg[6])),
+            dst_tp_rank=int(msg[7].decode("ascii")),
+            dst_attn_tp_size=int(msg[8].decode("ascii")),
+            dst_kv_item_len=int(msg[9].decode("ascii")),
         )
 
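The pointer lists travel as packed native-endian unsigned 64-bit integers, one "Q" per pointer, so len(frame) // 8 recovers the count. Round trip of the encoding used for the new dst_state_data_ptrs frame (pointer values invented):

    import struct

    state_data_ptrs = [0x7F0000000000, 0x7F0040000000]  # invented device addresses

    # Decode side (MooncakeKVReceiver): pack one uint64 per pointer.
    packed = b"".join(struct.pack("Q", ptr) for ptr in state_data_ptrs)

    # Prefill side (KVArgsRegisterInfo.from_zmq): unpack them all back.
    unpacked = list(struct.unpack(f"{len(packed)//8}Q", packed))
    assert unpacked == state_data_ptrs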
@@ -180,6 +190,9 @@ class MooncakeKVManager(CommonKVManager):
             )
             for _ in range(transfer_queue_size)
         ]
+        self.state_executors = concurrent.futures.ThreadPoolExecutor(
+            transfer_thread_pool_size // transfer_queue_size
+        )
         for queue, executor in zip(self.transfer_queues, self.executors):
             threading.Thread(
                 target=self.transfer_worker, args=(queue, executor), daemon=True
@@ -239,6 +252,12 @@
             self.kv_args.aux_data_ptrs, self.kv_args.aux_data_lens
         )
 
+        # Batch register state/extra pool data buffers
+        if self.kv_args.state_data_ptrs and self.kv_args.state_data_lens:
+            self.engine.batch_register(
+                self.kv_args.state_data_ptrs, self.kv_args.state_data_lens
+            )
+
     def _transfer_data(self, mooncake_session_id, transfer_blocks):
         if not transfer_blocks:
             return 0
@@ -248,17 +267,23 @@
             mooncake_session_id, list(src_addrs), list(dst_addrs), list(lengths)
         )
 
-    def send_kvcache(
+    def _send_kvcache_generic(
         self,
         mooncake_session_id: str,
-        prefill_kv_indices: npt.NDArray[np.int32],
-        dst_kv_ptrs: list[int],
-        dst_kv_indices: npt.NDArray[np.int32],
+        src_data_ptrs: list[int],
+        dst_data_ptrs: list[int],
+        item_lens: list[int],
+        prefill_data_indices: npt.NDArray[np.int32],
+        dst_data_indices: npt.NDArray[np.int32],
         executor: concurrent.futures.ThreadPoolExecutor,
-    ):
-        # Group by indices
+    ) -> int:
+        """
+        Generic KV cache transfer supporting both MHA and MLA architectures.
+        This method is used by both send_kvcache (full pool) and maybe_send_extra.
+        """
+        # Group by indices for optimization
         prefill_kv_blocks, dst_kv_blocks = group_concurrent_contiguous(
-            prefill_kv_indices, dst_kv_indices
+            prefill_data_indices, dst_data_indices
         )
 
         layers_params = None
@@ -266,9 +291,9 @@
         # pp is not supported on the decode side yet
         if self.is_mla_backend:
             src_kv_ptrs, dst_kv_ptrs, layers_current_pp_stage = (
-                self.get_mla_kv_ptrs_with_pp(self.kv_args.kv_data_ptrs, dst_kv_ptrs)
+                self.get_mla_kv_ptrs_with_pp(src_data_ptrs, dst_data_ptrs)
             )
-            kv_item_len = self.kv_args.kv_item_lens[0]
+            kv_item_len = item_lens[0]
             layers_params = [
                 (
                     src_kv_ptrs[layer_id],
@@ -279,9 +304,9 @@
             ]
         else:
             src_k_ptrs, src_v_ptrs, dst_k_ptrs, dst_v_ptrs, layers_current_pp_stage = (
-                self.get_mha_kv_ptrs_with_pp(self.kv_args.kv_data_ptrs, dst_kv_ptrs)
+                self.get_mha_kv_ptrs_with_pp(src_data_ptrs, dst_data_ptrs)
             )
-            kv_item_len = self.kv_args.kv_item_lens[0]
+            kv_item_len = item_lens[0]
             layers_params = [
                 (
                     src_k_ptrs[layer_id],
@@ -345,6 +370,24 @@
 
         return 0
 
+    def send_kvcache(
+        self,
+        mooncake_session_id: str,
+        prefill_kv_indices: npt.NDArray[np.int32],
+        dst_kv_ptrs: list[int],
+        dst_kv_indices: npt.NDArray[np.int32],
+        executor: concurrent.futures.ThreadPoolExecutor,
+    ):
+        return self._send_kvcache_generic(
+            mooncake_session_id=mooncake_session_id,
+            src_data_ptrs=self.kv_args.kv_data_ptrs,
+            dst_data_ptrs=dst_kv_ptrs,
+            item_lens=self.kv_args.kv_item_lens,
+            prefill_data_indices=prefill_kv_indices,
+            dst_data_indices=dst_kv_indices,
+            executor=executor,
+        )
+
     def send_kvcache_slice(
         self,
         mooncake_session_id: str,
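Before anything is sent, group_concurrent_contiguous coalesces index pairs into runs that are contiguous on both the source and destination side, so each run becomes one large transfer instead of many per-page ones. A simplified illustrative reimplementation of that grouping (not the sglang source):

    import numpy as np

    def group_runs(src: np.ndarray, dst: np.ndarray):
        """Split (src, dst) index pairs into runs contiguous in both arrays."""
        runs, start = [], 0
        for i in range(1, len(src)):
            if src[i] != src[i - 1] + 1 or dst[i] != dst[i - 1] + 1:
                runs.append((src[start:i], dst[start:i]))
                start = i
        if len(src):
            runs.append((src[start:], dst[start:]))
        return runs

    src = np.array([10, 11, 12, 40, 41])
    dst = np.array([0, 1, 2, 3, 4])
    # src jumps at 40 while dst stays contiguous, so two runs come back:
    # ([10, 11, 12], [0, 1, 2]) and ([40, 41], [3, 4]).
    print(group_runs(src, dst))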
@@ -593,6 +636,58 @@
             f"Received AUX_DATA for bootstrap_room {room} with length:{len(data)}"
         )
 
+    def maybe_send_extra(
+        self,
+        req: TransferInfo,
+        prefill_state_indices: list[int],
+        dst_state_data_ptrs: list[int],
+    ):
+        """Send state or extra pool data with type-specific handling."""
+        state_type = getattr(self.kv_args, "state_type", "none")
+
+        if state_type == "mamba":
+            return self._send_mamba_state(
+                req,
+                prefill_state_indices,
+                dst_state_data_ptrs,
+            )
+        elif state_type in ["swa", "nsa"]:
+            # Reuse _send_kvcache_generic interface to send extra pool data
+            prefill_state_indices = np.array(prefill_state_indices, dtype=np.int32)
+            dst_state_indices = np.array(req.dst_state_indices, dtype=np.int32)
+            return self._send_kvcache_generic(
+                mooncake_session_id=req.mooncake_session_id,
+                src_data_ptrs=self.kv_args.state_data_ptrs,
+                dst_data_ptrs=dst_state_data_ptrs,
+                item_lens=self.kv_args.state_item_lens,
+                prefill_data_indices=prefill_state_indices,
+                dst_data_indices=dst_state_indices,
+                executor=self.state_executors,
+            )
+        else:
+            return 0
+
+    def _send_mamba_state(
+        self,
+        req: TransferInfo,
+        prefill_mamba_index: list[int],
+        dst_state_data_ptrs: list[int],
+    ):
+        """Transfer Mamba states."""
+        assert len(prefill_mamba_index) == 1, "Mamba should have single state index"
+
+        transfer_blocks = []
+        prefill_state_data_ptrs = self.kv_args.state_data_ptrs
+        prefill_state_item_lens = self.kv_args.state_item_lens
+
+        for i, dst_state_ptr in enumerate(dst_state_data_ptrs):
+            length = prefill_state_item_lens[i]
+            src_addr = prefill_state_data_ptrs[i] + length * int(prefill_mamba_index[0])
+            dst_addr = dst_state_ptr + length * int(req.dst_state_indices[0])
+            transfer_blocks.append((src_addr, dst_addr, length))
+
+        return self._transfer_data(req.mooncake_session_id, transfer_blocks)
+
     def sync_status_to_decode_endpoint(
         self, remote: str, dst_port: int, room: int, status: int, prefill_rank: int
     ):
702
797
  break
703
798
 
704
799
  if kv_chunk.is_last:
800
+ if kv_chunk.state_indices is not None:
801
+ if not self.is_mla_backend and (
802
+ self.attn_tp_size
803
+ != target_rank_registration_info.dst_attn_tp_size
804
+ ):
805
+ raise RuntimeError(
806
+ f"PD Disaggregation does NOT support PD different TP sizes for non-MLA hybrid models yet."
807
+ )
808
+
809
+ self.maybe_send_extra(
810
+ req,
811
+ kv_chunk.state_indices,
812
+ target_rank_registration_info.dst_state_data_ptrs,
813
+ )
814
+
705
815
  if self.pp_group.is_last_rank:
706
816
  # Only the last chunk we need to send the aux data
707
817
  ret = self.send_aux(
@@ -765,7 +875,7 @@
                 )
                 continue
             else:
-                required_dst_info_num = int(waiting_req_bytes[6].decode("ascii"))
+                required_dst_info_num = int(waiting_req_bytes[7].decode("ascii"))
             room = int(room)
             if room not in self.transfer_infos:
                 self.transfer_infos[room] = {}
@@ -876,6 +986,7 @@
         index_slice: slice,
         is_last: bool,
         aux_index: Optional[int] = None,
+        state_indices: Optional[List[int]] = None,
     ):
         assert self.disaggregation_mode == DisaggregationMode.PREFILL
         assert not is_last or (is_last and aux_index is not None)
@@ -909,6 +1020,7 @@
                 index_slice=index_slice,
                 is_last=is_last,
                 prefill_aux_index=aux_index,
+                state_indices=state_indices,
             )
         )
 
@@ -989,6 +1101,7 @@ class MooncakeKVSender(CommonKVSender):
     def send(
         self,
         kv_indices: npt.NDArray[np.int32],
+        state_indices: Optional[List[int]] = None,
    ):
         index_slice = slice(self.curr_idx, self.curr_idx + len(kv_indices))
         self.curr_idx += len(kv_indices)
@@ -1008,6 +1121,7 @@
             index_slice,
             True,
             aux_index=self.aux_index,
+            state_indices=state_indices,
         )
 
     def poll(self) -> KVPoll:
@@ -1110,6 +1224,9 @@ class MooncakeKVReceiver(CommonKVReceiver):
         packed_aux_data_ptrs = b"".join(
             struct.pack("Q", ptr) for ptr in self.kv_mgr.kv_args.aux_data_ptrs
         )
+        packed_state_data_ptrs = b"".join(
+            struct.pack("Q", ptr) for ptr in self.kv_mgr.kv_args.state_data_ptrs
+        )
         # Note(shangming): No need to add pp rank here since pp is not supported on the decode side yet
         tp_rank = self.kv_mgr.kv_args.engine_rank
         kv_item_len = self.kv_mgr.kv_args.kv_item_lens[0]
@@ -1127,13 +1244,19 @@
                 self.session_id.encode("ascii"),
                 packed_kv_data_ptrs,
                 packed_aux_data_ptrs,
+                packed_state_data_ptrs,
                 dst_tp_rank,
                 dst_attn_tp_size,
                 dst_kv_item_len,
             ]
         )
 
-    def init(self, kv_indices: npt.NDArray[np.int32], aux_index: Optional[int] = None):
+    def init(
+        self,
+        kv_indices: npt.NDArray[np.int32],
+        aux_index: Optional[int] = None,
+        state_indices: Optional[List[int]] = None,
+    ):
         for bootstrap_info in self.bootstrap_infos:
             sock, lock = self._connect_to_bootstrap_server(bootstrap_info)
             is_dummy = bootstrap_info["is_dummy"]
@@ -1147,6 +1270,14 @@
                     self.session_id.encode("ascii"),
                     kv_indices.tobytes() if not is_dummy else b"",
                     str(aux_index).encode("ascii") if not is_dummy else b"",
+                    (
+                        np.array(
+                            state_indices,
+                            dtype=np.int32,
+                        ).tobytes()
+                        if not is_dummy and state_indices is not None
+                        else b""
+                    ),
                     str(self.required_dst_info_num).encode("ascii"),
                 ]
             )
sglang/srt/disaggregation/nixl/conn.py CHANGED
@@ -704,6 +704,7 @@ class NixlKVSender(CommonKVSender):
     def send(
         self,
         kv_indices: npt.NDArray[np.int32],
+        state_indices: Optional[List[int]] = None,
     ):
         index_slice = slice(self.curr_idx, self.curr_idx + len(kv_indices))
         self.curr_idx += len(kv_indices)
@@ -755,7 +756,12 @@ class NixlKVReceiver(CommonKVReceiver):
             self.bootstrap_room
         )
 
-    def init(self, kv_indices: npt.NDArray[np.int32], aux_index: Optional[int] = None):
+    def init(
+        self,
+        kv_indices: npt.NDArray[np.int32],
+        aux_index: Optional[int] = None,
+        state_indices: Optional[List[int]] = None,
+    ):
         for bootstrap_info in self.bootstrap_infos:
             logger.debug(
                 f"Fetched bootstrap info: {bootstrap_info} for engine rank: {self.kv_mgr.kv_args.engine_rank}"