sglang 0.5.3__py3-none-any.whl → 0.5.3.post1__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- sglang/bench_one_batch.py +0 -2
- sglang/bench_serving.py +224 -127
- sglang/compile_deep_gemm.py +3 -0
- sglang/launch_server.py +0 -14
- sglang/srt/configs/__init__.py +2 -0
- sglang/srt/configs/falcon_h1.py +12 -58
- sglang/srt/configs/mamba_utils.py +117 -0
- sglang/srt/configs/model_config.py +68 -31
- sglang/srt/configs/nemotron_h.py +286 -0
- sglang/srt/configs/qwen3_next.py +11 -43
- sglang/srt/disaggregation/decode.py +7 -18
- sglang/srt/disaggregation/decode_kvcache_offload_manager.py +1 -1
- sglang/srt/disaggregation/nixl/conn.py +55 -23
- sglang/srt/disaggregation/prefill.py +17 -32
- sglang/srt/entrypoints/engine.py +2 -2
- sglang/srt/entrypoints/grpc_request_manager.py +10 -23
- sglang/srt/entrypoints/grpc_server.py +220 -80
- sglang/srt/entrypoints/http_server.py +49 -1
- sglang/srt/entrypoints/openai/protocol.py +159 -31
- sglang/srt/entrypoints/openai/serving_chat.py +13 -71
- sglang/srt/entrypoints/openai/serving_tokenize.py +144 -0
- sglang/srt/environ.py +4 -0
- sglang/srt/function_call/function_call_parser.py +8 -6
- sglang/srt/grpc/sglang_scheduler_pb2.py +78 -70
- sglang/srt/grpc/sglang_scheduler_pb2.pyi +64 -6
- sglang/srt/grpc/sglang_scheduler_pb2_grpc.py +88 -0
- sglang/srt/layers/attention/attention_registry.py +31 -22
- sglang/srt/layers/attention/fla/layernorm_gated.py +47 -30
- sglang/srt/layers/attention/flashattention_backend.py +0 -1
- sglang/srt/layers/attention/flashinfer_backend.py +223 -6
- sglang/srt/layers/attention/flashinfer_mla_backend.py +1 -1
- sglang/srt/layers/attention/hybrid_linear_attn_backend.py +165 -59
- sglang/srt/layers/attention/mamba/causal_conv1d.py +1 -1
- sglang/srt/layers/attention/mamba/causal_conv1d_triton.py +9 -4
- sglang/srt/layers/attention/mamba/mamba.py +189 -241
- sglang/srt/layers/attention/mamba/mamba2_metadata.py +211 -0
- sglang/srt/layers/attention/mamba/mixer2_rms_norm_gated.py +120 -0
- sglang/srt/layers/attention/mamba/ops/ssd_bmm.py +0 -50
- sglang/srt/layers/attention/mamba/ops/ssd_chunk_scan.py +0 -60
- sglang/srt/layers/attention/mamba/ops/ssd_chunk_state.py +0 -111
- sglang/srt/layers/attention/mamba/ops/ssd_state_passing.py +0 -11
- sglang/srt/layers/attention/triton_backend.py +1 -1
- sglang/srt/layers/logits_processor.py +136 -6
- sglang/srt/layers/modelopt_utils.py +11 -0
- sglang/srt/layers/moe/cutlass_w4a8_moe.py +18 -21
- sglang/srt/layers/moe/ep_moe/kernels.py +31 -452
- sglang/srt/layers/moe/ep_moe/layer.py +8 -286
- sglang/srt/layers/moe/fused_moe_triton/layer.py +6 -11
- sglang/srt/layers/moe/moe_runner/deep_gemm.py +304 -0
- sglang/srt/layers/moe/moe_runner/runner.py +3 -0
- sglang/srt/layers/moe/utils.py +7 -1
- sglang/srt/layers/quantization/__init__.py +1 -1
- sglang/srt/layers/quantization/fp8.py +84 -18
- sglang/srt/layers/quantization/modelopt_quant.py +1 -1
- sglang/srt/layers/quantization/quark/quark.py +3 -1
- sglang/srt/layers/quantization/w4afp8.py +2 -16
- sglang/srt/lora/lora_manager.py +0 -8
- sglang/srt/managers/overlap_utils.py +18 -16
- sglang/srt/managers/schedule_batch.py +119 -90
- sglang/srt/managers/schedule_policy.py +1 -1
- sglang/srt/managers/scheduler.py +213 -126
- sglang/srt/managers/scheduler_metrics_mixin.py +1 -1
- sglang/srt/managers/scheduler_output_processor_mixin.py +180 -86
- sglang/srt/managers/tokenizer_manager.py +270 -53
- sglang/srt/managers/tp_worker.py +39 -28
- sglang/srt/mem_cache/allocator.py +7 -2
- sglang/srt/mem_cache/chunk_cache.py +1 -1
- sglang/srt/mem_cache/memory_pool.py +162 -68
- sglang/srt/mem_cache/radix_cache.py +8 -3
- sglang/srt/mem_cache/swa_radix_cache.py +70 -14
- sglang/srt/model_executor/cuda_graph_runner.py +1 -1
- sglang/srt/model_executor/forward_batch_info.py +4 -18
- sglang/srt/model_executor/model_runner.py +55 -51
- sglang/srt/model_loader/__init__.py +1 -1
- sglang/srt/model_loader/loader.py +187 -6
- sglang/srt/model_loader/weight_utils.py +3 -0
- sglang/srt/models/falcon_h1.py +11 -9
- sglang/srt/models/gemma3_mm.py +16 -0
- sglang/srt/models/grok.py +5 -13
- sglang/srt/models/mixtral.py +1 -3
- sglang/srt/models/mllama4.py +11 -1
- sglang/srt/models/nemotron_h.py +514 -0
- sglang/srt/models/utils.py +5 -1
- sglang/srt/sampling/sampling_batch_info.py +11 -9
- sglang/srt/server_args.py +100 -33
- sglang/srt/speculative/eagle_worker.py +11 -13
- sglang/srt/speculative/ngram_worker.py +12 -11
- sglang/srt/speculative/spec_utils.py +0 -1
- sglang/srt/two_batch_overlap.py +1 -0
- sglang/srt/utils/common.py +18 -0
- sglang/srt/utils/hf_transformers_utils.py +2 -0
- sglang/test/longbench_v2/__init__.py +1 -0
- sglang/test/longbench_v2/test_longbench_v2_eval.py +238 -0
- sglang/test/longbench_v2/validate_longbench_v2.py +337 -0
- sglang/test/longbench_v2/validate_longbench_v2_standalone.py +306 -0
- sglang/test/run_eval.py +40 -0
- sglang/test/simple_eval_longbench_v2.py +332 -0
- sglang/test/test_cutlass_w4a8_moe.py +9 -19
- sglang/test/test_deterministic.py +18 -2
- sglang/test/test_deterministic_utils.py +81 -0
- sglang/test/test_disaggregation_utils.py +63 -0
- sglang/test/test_utils.py +32 -11
- sglang/version.py +1 -1
- {sglang-0.5.3.dist-info → sglang-0.5.3.post1.dist-info}/METADATA +4 -4
- {sglang-0.5.3.dist-info → sglang-0.5.3.post1.dist-info}/RECORD +109 -98
- sglang/srt/layers/attention/mamba/mamba_utils.py +0 -81
- sglang/srt/managers/tp_worker_overlap_thread.py +0 -311
- sglang/test/test_block_fp8_ep.py +0 -358
- /sglang/srt/speculative/{ngram_utils.py → ngram_info.py} +0 -0
- {sglang-0.5.3.dist-info → sglang-0.5.3.post1.dist-info}/WHEEL +0 -0
- {sglang-0.5.3.dist-info → sglang-0.5.3.post1.dist-info}/licenses/LICENSE +0 -0
- {sglang-0.5.3.dist-info → sglang-0.5.3.post1.dist-info}/top_level.txt +0 -0
sglang/srt/mem_cache/memory_pool.py

@@ -15,6 +15,9 @@ limitations under the License.
 
 from __future__ import annotations
 
+from dataclasses import dataclass
+
+from sglang.srt.configs.mamba_utils import Mamba2CacheParams
 from sglang.srt.layers.attention.nsa import index_buf_accessor
 from sglang.srt.layers.attention.nsa.quant_k_cache import quantize_k_cache
 from sglang.srt.torch_memory_saver_adapter import TorchMemorySaverAdapter
@@ -109,17 +112,38 @@ class ReqToTokenPool:
 
 
 class MambaPool:
+    @dataclass(frozen=True, kw_only=True)
+    class State:
+        conv: torch.Tensor
+        temporal: torch.Tensor
+
+        def at_layer_idx(self, layer: int):
+            return type(self)(**{k: v[layer] for k, v in vars(self).items()})
+
+        def mem_usage_bytes(self):
+            return sum(get_tensor_size_bytes(t) for t in vars(self).values())
+
+    @dataclass(frozen=True, kw_only=True)
+    class SpeculativeState(State):
+        intermediate_ssm: torch.Tensor
+        intermediate_conv_window: torch.Tensor
+
     def __init__(
         self,
+        *,
         size: int,
-        conv_dtype: torch.dtype,
-        ssm_dtype: torch.dtype,
-        num_mamba_layers: int,
-        conv_state_shape: Tuple[int, int],
-        temporal_state_shape: Tuple[int, int],
+        cache_params: "Mamba2CacheParams",
         device: str,
         speculative_num_draft_tokens: Optional[int] = None,
     ):
+        conv_state_shape = cache_params.shape.conv
+        temporal_state_shape = cache_params.shape.temporal
+        conv_dtype = cache_params.dtype.conv
+        ssm_dtype = cache_params.dtype.temporal
+        num_mamba_layers = len(cache_params.layers)
+
+        # assume conv_state = (dim, state_len)
+        assert conv_state_shape[0] > conv_state_shape[1]
         conv_state = torch.zeros(
             size=(num_mamba_layers, size + 1) + conv_state_shape,
             dtype=conv_dtype,
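The `State`/`SpeculativeState` containers above replace the old positional tuple: every field stacks all Mamba layers along dim 0, `at_layer_idx` rebuilds the dataclass from one layer's views, and `mem_usage_bytes` sums the backing storage. A standalone sketch of the same pattern, with made-up tensor shapes and a local stand-in for `get_tensor_size_bytes`:

```python
from dataclasses import dataclass

import torch


def get_tensor_size_bytes(t: torch.Tensor) -> int:
    # Same accounting the pool uses: element count times element size.
    return t.numel() * t.element_size()


@dataclass(frozen=True, kw_only=True)
class State:
    conv: torch.Tensor      # (num_layers, pool_size + 1, *conv_state_shape)
    temporal: torch.Tensor  # (num_layers, pool_size + 1, *temporal_state_shape)

    def at_layer_idx(self, layer: int):
        # Rebuild the same dataclass from per-layer views (no copy).
        return type(self)(**{k: v[layer] for k, v in vars(self).items()})

    def mem_usage_bytes(self) -> int:
        return sum(get_tensor_size_bytes(t) for t in vars(self).values())


# Illustrative shapes only, not real model dimensions.
state = State(
    conv=torch.zeros(4, 9, 1536, 3),
    temporal=torch.zeros(4, 9, 128, 64),
)
layer0 = state.at_layer_idx(0)
print(layer0.conv.shape)        # torch.Size([9, 1536, 3])
print(state.mem_usage_bytes())  # total bytes across both buffers
```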
@@ -158,11 +182,11 @@ class MambaPool:
                 dtype=conv_dtype,
                 device="cuda",
             )
-            self.mamba_cache = (
-                conv_state,
-                temporal_state,
-                intermediate_ssm_state_cache,
-                intermediate_conv_window_cache,
+            self.mamba_cache = self.SpeculativeState(
+                conv=conv_state,
+                temporal=temporal_state,
+                intermediate_ssm=intermediate_ssm_state_cache,
+                intermediate_conv_window=intermediate_conv_window_cache,
             )
             logger.info(
                 f"Mamba Cache is allocated. "
@@ -172,7 +196,7 @@ class MambaPool:
                 f"intermediate_conv_window_cache size: {get_tensor_size_bytes(intermediate_conv_window_cache) / GB:.2f}GB "
             )
         else:
-            self.mamba_cache = (conv_state, temporal_state)
+            self.mamba_cache = self.State(conv=conv_state, temporal=temporal_state)
         logger.info(
             f"Mamba Cache is allocated. "
             f"conv_state size: {get_tensor_size_bytes(conv_state) / GB:.2f}GB, "
@@ -180,16 +204,14 @@ class MambaPool:
         )
         self.size = size
         self.free_slots = list(range(size))
-        self.mem_usage = self.
-
-    def get_mamba_params_all_layers(self):
-        return [self.mamba_cache[i] for i in range(len(self.mamba_cache))]
+        self.mem_usage = self.mamba_cache.mem_usage_bytes() / GB
 
-    def
-
+    def get_speculative_mamba2_params_all_layers(self) -> SpeculativeState:
+        assert isinstance(self.mamba_cache, self.SpeculativeState)
+        return self.mamba_cache
 
-    def
-        return
+    def mamba2_layer_cache(self, layer_id: int):
+        return self.mamba_cache.at_layer_idx(layer_id)
 
     def available_size(self):
         return len(self.free_slots)
@@ -208,7 +230,9 @@ class MambaPool:
             self.free_slots.append(free_index)
         else:
             self.free_slots.extend(free_index)
-        self.mamba_cache[
+        self.mamba_cache.conv[:, free_index] = self.mamba_cache.temporal[
+            :, free_index
+        ] = 0
 
     def clear(self):
         self.free_slots = list(range(self.size))
@@ -219,16 +243,13 @@ class HybridReqToTokenPool(ReqToTokenPool):
 
     def __init__(
         self,
+        *,
         size: int,
         max_context_len: int,
         device: str,
         enable_memory_saver: bool,
-
-
-        mamba_layers: List[int],
-        conv_state_shape: Tuple[int, int],
-        temporal_state_shape: Tuple[int, int],
-        speculative_num_draft_tokens: int,
+        cache_params: "Mamba2CacheParams",
+        speculative_num_draft_tokens: int = None,
     ):
         super().__init__(
             size=size,
@@ -238,16 +259,12 @@ class HybridReqToTokenPool(ReqToTokenPool):
         )
 
         self.mamba_pool = MambaPool(
-            size,
-
-
-
-            conv_state_shape,
-            temporal_state_shape,
-            device,
-            speculative_num_draft_tokens,
+            size=size,
+            cache_params=cache_params,
+            device=device,
+            speculative_num_draft_tokens=speculative_num_draft_tokens,
         )
-        self.mamba_map = {layer_id: i for i, layer_id in enumerate(
+        self.mamba_map = {layer_id: i for i, layer_id in enumerate(cache_params.layers)}
 
         self.device = device
         self.req_index_to_mamba_index_mapping: torch.Tensor = torch.zeros(
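Both pool constructors now accept a single `cache_params` object from the new `sglang/srt/configs/mamba_utils.py` instead of separate dtype/shape/layer arguments. Only the attributes read in the hunks above (`shape.conv`, `shape.temporal`, `dtype.conv`, `dtype.temporal`, `layers`) are known from this diff; the mock below is an illustrative stand-in, not the real `Mamba2CacheParams`:

```python
from dataclasses import dataclass
from typing import List, Tuple

import torch


@dataclass(frozen=True)
class _Shapes:
    conv: Tuple[int, int]
    temporal: Tuple[int, int]


@dataclass(frozen=True)
class _Dtypes:
    conv: torch.dtype
    temporal: torch.dtype


@dataclass(frozen=True)
class MockMamba2CacheParams:
    # Mirrors only the attributes MambaPool/HybridReqToTokenPool read in this diff.
    shape: _Shapes
    dtype: _Dtypes
    layers: List[int]  # model layer ids that carry Mamba state


params = MockMamba2CacheParams(
    shape=_Shapes(conv=(1536, 3), temporal=(128, 64)),  # illustrative sizes
    dtype=_Dtypes(conv=torch.bfloat16, temporal=torch.float32),
    layers=[0, 2, 4, 6],
)

# The pool derives everything it used to take as separate arguments:
num_mamba_layers = len(params.layers)
conv_state_shape = params.shape.conv
assert conv_state_shape[0] > conv_state_shape[1]  # same sanity check as the pool
```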
@@ -287,12 +304,12 @@ class HybridReqToTokenPool(ReqToTokenPool):
     def get_mamba_indices(self, req_indices: torch.Tensor) -> torch.Tensor:
         return self.req_index_to_mamba_index_mapping[req_indices]
 
-    def
+    def mamba2_layer_cache(self, layer_id: int):
         assert layer_id in self.mamba_map
-        return self.mamba_pool.
+        return self.mamba_pool.mamba2_layer_cache(self.mamba_map[layer_id])
 
-    def
-        return self.mamba_pool.
+    def get_speculative_mamba2_params_all_layers(self) -> MambaPool.SpeculativeState:
+        return self.mamba_pool.get_speculative_mamba2_params_all_layers()
 
     # For chunk prefill, we can not free mamba cache, we need use it in the future
     def free(self, free_index: Union[int, List[int]], free_mamba_cache: bool = True):
@@ -415,6 +432,7 @@ class MHATokenToKVPool(KVCache):
         enable_memory_saver: bool,
         start_layer: Optional[int] = None,
         end_layer: Optional[int] = None,
+        enable_kv_cache_copy: bool = False,
     ):
         super().__init__(
             size,
@@ -446,8 +464,57 @@ class MHATokenToKVPool(KVCache):
 
         self.device_module = torch.get_device_module(self.device)
         self.alt_stream = self.device_module.Stream() if _is_cuda else None
+
+        if enable_kv_cache_copy:
+            self._init_kv_copy_and_warmup()
+        else:
+            self._kv_copy_config = None
+
         self._finalize_allocation_log(size)
 
+    def _init_kv_copy_and_warmup(self):
+        # Heuristics for KV copy tiling
+        _KV_COPY_STRIDE_THRESHOLD_LARGE = 8192
+        _KV_COPY_STRIDE_THRESHOLD_MEDIUM = 4096
+        _KV_COPY_TILE_SIZE_LARGE = 512
+        _KV_COPY_TILE_SIZE_MEDIUM = 256
+        _KV_COPY_TILE_SIZE_SMALL = 128
+        _KV_COPY_NUM_WARPS_LARGE_TILE = 8
+        _KV_COPY_NUM_WARPS_SMALL_TILE = 4
+
+        stride_bytes = int(self.data_strides[0].item())
+        if stride_bytes >= _KV_COPY_STRIDE_THRESHOLD_LARGE:
+            bytes_per_tile = _KV_COPY_TILE_SIZE_LARGE
+        elif stride_bytes >= _KV_COPY_STRIDE_THRESHOLD_MEDIUM:
+            bytes_per_tile = _KV_COPY_TILE_SIZE_MEDIUM
+        else:
+            bytes_per_tile = _KV_COPY_TILE_SIZE_SMALL
+
+        self._kv_copy_config = {
+            "bytes_per_tile": bytes_per_tile,
+            "byte_tiles": (stride_bytes + bytes_per_tile - 1) // bytes_per_tile,
+            "num_warps": (
+                _KV_COPY_NUM_WARPS_SMALL_TILE
+                if bytes_per_tile <= _KV_COPY_TILE_SIZE_MEDIUM
+                else _KV_COPY_NUM_WARPS_LARGE_TILE
+            ),
+        }
+
+        dummy_loc = torch.zeros(1, dtype=torch.int32, device=self.device)
+        grid = (self.data_ptrs.numel(), self._kv_copy_config["byte_tiles"])
+
+        copy_all_layer_kv_cache_tiled[grid](
+            self.data_ptrs,
+            self.data_strides,
+            dummy_loc,
+            dummy_loc,
+            1,
+            1,
+            BYTES_PER_TILE=self._kv_copy_config["bytes_per_tile"],
+            num_warps=self._kv_copy_config["num_warps"],
+            num_stages=2,
+        )
+
     def _create_buffers(self):
         with self.memory_saver_adapter.region(GPU_MEMORY_TYPE_KV_CACHE):
             with (
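The warmup above derives a byte-tile size from the per-layer stride and a 2D launch grid from that tile size. A small self-contained sketch of the same heuristic (thresholds copied from the hunk; the example stride is made up):

```python
def kv_copy_tiling(stride_bytes: int) -> dict:
    # Same thresholds as _init_kv_copy_and_warmup above.
    if stride_bytes >= 8192:
        bytes_per_tile = 512
    elif stride_bytes >= 4096:
        bytes_per_tile = 256
    else:
        bytes_per_tile = 128
    return {
        "bytes_per_tile": bytes_per_tile,
        # Ceiling division: tiles needed to cover one token's row of bytes.
        "byte_tiles": (stride_bytes + bytes_per_tile - 1) // bytes_per_tile,
        "num_warps": 4 if bytes_per_tile <= 256 else 8,
    }


# Example: 2048 bytes per token per layer buffer (made-up number).
cfg = kv_copy_tiling(2048)
assert cfg == {"bytes_per_tile": 128, "byte_tiles": 16, "num_warps": 4}

# The kernel then launches on a 2D grid:
#   axis 0 -> one program per layer K/V buffer (len(data_ptrs)),
#   axis 1 -> one program per byte tile of a token row (byte_tiles).
```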
@@ -642,13 +709,28 @@ class MHATokenToKVPool(KVCache):
         self.v_buffer[layer_id - self.start_layer][loc] = cache_v
 
     def move_kv_cache(self, tgt_loc: torch.Tensor, src_loc: torch.Tensor):
-
+        N = tgt_loc.numel()
+        if N == 0:
+            return
+
+        assert (
+            self._kv_copy_config is not None
+        ), "KV copy not initialized. Set enable_kv_cache_copy=True in __init__"
+
+        cfg = self._kv_copy_config
+        N_upper = next_power_of_2(N)
+        grid = (self.data_ptrs.numel(), cfg["byte_tiles"])
+
+        copy_all_layer_kv_cache_tiled[grid](
             self.data_ptrs,
             self.data_strides,
             tgt_loc,
             src_loc,
-
-
+            N,
+            N_upper,
+            BYTES_PER_TILE=cfg["bytes_per_tile"],
+            num_warps=cfg["num_warps"],
+            num_stages=2,
         )
 
 
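`num_locs_upper` feeds `tl.arange`, which needs a compile-time power-of-two extent, so `move_kv_cache` rounds the location count up with `next_power_of_2` and the kernel masks the padding. A plain-Python equivalent of that rounding (Triton ships `triton.next_power_of_2` for this):

```python
def next_power_of_2(n: int) -> int:
    # Smallest power of two >= n (for n >= 1), e.g. 5 -> 8, 8 -> 8.
    return 1 << (n - 1).bit_length()


assert [next_power_of_2(n) for n in (1, 3, 5, 8, 1000)] == [1, 4, 8, 8, 1024]
# Inside the kernel, loc_idx = tl.arange(0, num_locs_upper) is masked with
# loc_idx < num_locs, so the padded slots never read or write anything.
```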
@@ -749,6 +831,7 @@ class SWAKVPool(KVCache):
         self,
         size: int,
         size_swa: int,
+        dtype: torch.dtype,
         swa_attention_layer_ids: List[int],
         full_attention_layer_ids: List[int],
         enable_kvcache_transpose: bool,
@@ -757,6 +840,7 @@ class SWAKVPool(KVCache):
     ):
         self.size = size
         self.size_swa = size_swa
+        self.dtype = dtype
         self.swa_layer_nums = len(swa_attention_layer_ids)
         self.full_layer_nums = len(full_attention_layer_ids)
         kwargs["page_size"] = 1
@@ -766,11 +850,13 @@ class SWAKVPool(KVCache):
 
         self.swa_kv_pool = token_to_kv_pool_class(
             size=size_swa,
+            dtype=dtype,
             layer_num=self.swa_layer_nums,
             **kwargs,
         )
         self.full_kv_pool = token_to_kv_pool_class(
             size=size,
+            dtype=dtype,
             layer_num=self.full_layer_nums,
             **kwargs,
         )
@@ -1091,7 +1177,9 @@ class MLATokenToKVPool(KVCache):
             dtype=torch.uint64,
             device=self.device,
         )
-        self._finalize_allocation_log(size)
+        if not use_nsa:
+            # NSA will allocate indexer KV cache later and then log the total size
+            self._finalize_allocation_log(size)
 
     def get_kv_size_bytes(self):
         assert hasattr(self, "kv_buffer")
@@ -1212,6 +1300,9 @@
 
 
 class NSATokenToKVPool(MLATokenToKVPool):
+    quant_block_size = 128
+    index_k_with_scale_buffer_dtype = torch.uint8
+
     def __init__(
         self,
         size: int,
@@ -1245,8 +1336,6 @@ class NSATokenToKVPool(MLATokenToKVPool):
         # num head == 1 and head dim == 128 for index_k in NSA
         assert index_head_dim == 128
 
-        self.quant_block_size = 128
-
         assert self.page_size == 64
         self.index_k_with_scale_buffer = [
             torch.zeros(
@@ -1261,11 +1350,12 @@
                     self.page_size
                     * (index_head_dim + index_head_dim // self.quant_block_size * 4),
                 ),
-                dtype=torch.uint8,
+                dtype=self.index_k_with_scale_buffer_dtype,
                 device=device,
             )
             for _ in range(layer_num)
         ]
+        self._finalize_allocation_log(size)
 
     def get_index_k_with_scale_buffer(self, layer_id: int) -> torch.Tensor:
         if self.layer_transfer_counter is not None:
@@ -1307,6 +1397,12 @@
             pool=self, buf=buf, loc=loc, index_k=index_k, index_k_scale=index_k_scale
         )
 
+    def get_kv_size_bytes(self):
+        kv_size_bytes = super().get_kv_size_bytes()
+        for index_k_cache in self.index_k_with_scale_buffer:
+            kv_size_bytes += get_tensor_size_bytes(index_k_cache)
+        return kv_size_bytes
+
 
 class AscendMLAPagedTokenToKVPool(MLATokenToKVPool):
     def __init__(
@@ -1584,38 +1680,36 @@ class DoubleSparseTokenToKVPool(KVCache):
 
 
 @triton.jit
-def
+def copy_all_layer_kv_cache_tiled(
     data_ptrs,
     strides,
     tgt_loc_ptr,
     src_loc_ptr,
     num_locs,
     num_locs_upper: tl.constexpr,
+    BYTES_PER_TILE: tl.constexpr,
 ):
-
-
+    """2D tiled kernel. Safe for in-place copy."""
     bid = tl.program_id(0)
+    tid = tl.program_id(1)
+
     stride = tl.load(strides + bid)
+    base_ptr = tl.load(data_ptrs + bid)
+    base_ptr = tl.cast(base_ptr, tl.pointer_type(tl.uint8))
 
-
-
+    byte_off = tid * BYTES_PER_TILE + tl.arange(0, BYTES_PER_TILE)
+    mask_byte = byte_off < stride
+    tl.multiple_of(byte_off, 16)
 
-
-
-    src_locs = tl.load(src_loc_ptr + num_locs_offset, mask=num_locs_offset < num_locs)
+    loc_idx = tl.arange(0, num_locs_upper)
+    mask_loc = loc_idx < num_locs
 
-
-
+    src = tl.load(src_loc_ptr + loc_idx, mask=mask_loc, other=0)
+    tgt = tl.load(tgt_loc_ptr + loc_idx, mask=mask_loc, other=0)
 
-
-
-
-
-
-
-    )
-    tl.store(
-        data_ptr + tgt_locs[:, None] * stride + copy_offset[None, :],
-        value,
-        mask=mask,
-    )
+    src_ptr = base_ptr + src[:, None] * stride + byte_off[None, :]
+    tgt_ptr = base_ptr + tgt[:, None] * stride + byte_off[None, :]
+
+    mask = mask_loc[:, None] & mask_byte[None, :]
+    vals = tl.load(src_ptr, mask=mask)
+    tl.store(tgt_ptr, vals, mask=mask)
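Behaviorally, the rewritten kernel is still a batched row copy inside each layer's K/V buffer: for every pair `i`, the `stride` bytes at row `src_loc[i]` move to row `tgt_loc[i]`; the second grid axis only splits each row into `BYTES_PER_TILE` chunks for parallelism. A toy PyTorch reference of those semantics (tiling omitted, buffer contents invented):

```python
import torch


def move_kv_cache_reference(buffers, tgt_loc, src_loc):
    # buffers: one (num_tokens, ...) tensor per layer K/V buffer.
    # tgt_loc/src_loc: 1D index tensors of equal length.
    for buf in buffers:
        # RHS is materialized before the scatter, so overlap is safe,
        # mirroring the load-then-store order in the kernel.
        buf[tgt_loc] = buf[src_loc]


k = torch.arange(12, dtype=torch.float32).reshape(6, 2)  # toy "KV buffer"
move_kv_cache_reference(
    [k], tgt_loc=torch.tensor([0, 1]), src_loc=torch.tensor([4, 5])
)
print(k[:2])  # rows 4 and 5 copied into rows 0 and 1
```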
sglang/srt/mem_cache/radix_cache.py

@@ -326,6 +326,8 @@ class RadixCache(BasePrefixCache):
 
         token_ids = (req.origin_input_ids + req.output_ids)[:-1]
         all_token_len = len(token_ids)
+        # For EAGLE radix cache, we will convert the key to bigram key, e.g. [1,2,3,4] -> [(1,2), (2,3), (3,4)], the length will -1. ((len([(1,2), (2,3), (3,4)]) = len([1,2,3,4]) - 1))
+        # So for the corresponding kv length should also -1. Then we get the actual_kv_len, and use it to do later calculation and slicing.
         actual_kv_len = all_token_len - 1 if self.is_eagle else all_token_len
         kv_indices = self.req_to_token_pool.req_to_token[
             req.req_pool_idx, :all_token_len
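The bigram-key comment added above is easier to read with a concrete conversion. The real helper is `_convert_to_bigram_key` (re-exported to `swa_radix_cache.py` later in this diff); the sketch below is an illustrative reimplementation, not the library code:

```python
def to_bigram_key(token_ids):
    # [1, 2, 3, 4] -> [(1, 2), (2, 3), (3, 4)]; length shrinks by one.
    return list(zip(token_ids, token_ids[1:]))


token_ids = [1, 2, 3, 4]
bigrams = to_bigram_key(token_ids)
assert bigrams == [(1, 2), (2, 3), (3, 4)]
assert len(bigrams) == len(token_ids) - 1  # hence actual_kv_len = all_token_len - 1
```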
@@ -349,7 +351,8 @@ class RadixCache(BasePrefixCache):
 
         old_prefix_len = len(req.prefix_indices)
         if self.is_eagle and old_prefix_len > req.last_matched_prefix_len:
-            #
+            # In EAGLE chunked prefill case, the prefix_indices included one unmatched token (kv_indices[actual_kv_len:])
+            # Here we -1 to make sure the kv of the unmatched token can be freed correctly to avoid memory leak
             old_prefix_len -= 1
 
         # Radix Cache takes one ref in memory pool
|
|
370
373
|
|
371
374
|
token_ids = req.fill_ids
|
372
375
|
all_token_len = len(token_ids)
|
373
|
-
#
|
376
|
+
# For EAGLE radix cache, we will convert the key to bigram key, e.g. [1,2,3,4] -> [(1,2), (2,3), (3,4)], the length will -1. ((len([(1,2), (2,3), (3,4)]) = len([1,2,3,4]) - 1))
|
377
|
+
# So for the corresponding kv length should also -1. Then we get the actual_kv_len, and use it to do later calculation and slicing.
|
374
378
|
actual_kv_len = all_token_len - 1 if self.is_eagle else all_token_len
|
375
379
|
kv_indices = self.req_to_token_pool.req_to_token[
|
376
380
|
req.req_pool_idx, :all_token_len
|
@@ -393,7 +397,8 @@ class RadixCache(BasePrefixCache):
|
|
393
397
|
|
394
398
|
old_prefix_len = len(req.prefix_indices)
|
395
399
|
if self.is_eagle and old_prefix_len > req.last_matched_prefix_len:
|
396
|
-
#
|
400
|
+
# In EAGLE chunked prefill case, the prefix_indices included one unmatched token (kv_indices[actual_kv_len:])
|
401
|
+
# Here we -1 to make sure the kv of the unmatched token can be freed correctly to avoid memory leak
|
397
402
|
old_prefix_len -= 1
|
398
403
|
|
399
404
|
# Radix Cache takes one ref in memory pool
|
sglang/srt/mem_cache/swa_radix_cache.py

@@ -32,6 +32,7 @@ from sglang.srt.mem_cache.base_prefix_cache import BasePrefixCache, MatchResult
 from sglang.srt.mem_cache.memory_pool import ReqToTokenPool
 from sglang.srt.mem_cache.radix_cache import (
     RadixKey,
+    _convert_to_bigram_key,
     _key_match_page_size1,
     _key_match_paged,
     get_child_key,
@@ -327,12 +328,14 @@ class SWARadixCache(BasePrefixCache):
         sliding_window_size: int,
         page_size: int,
         disable: bool = False,
+        is_eagle: bool = False,
     ):
         assert isinstance(token_to_kv_pool_allocator, SWATokenToKVPoolAllocator)
         self.req_to_token_pool = req_to_token_pool
         self.token_to_kv_pool_allocator = token_to_kv_pool_allocator
         self.page_size = page_size
         self.disable = disable
+        self.is_eagle = is_eagle
 
         if self.token_to_kv_pool_allocator:
             self.device = self.token_to_kv_pool_allocator.device
@@ -346,6 +349,11 @@ class SWARadixCache(BasePrefixCache):
             self.key_match_fn = partial(_key_match_paged, page_size=page_size)
             self.get_child_key_fn = partial(get_child_key, page_size=page_size)
 
+        if is_eagle:
+            self.key_convert_fn = _convert_to_bigram_key
+        else:
+            self.key_convert_fn = lambda key: key
+
         self.sliding_window_size = sliding_window_size
         self.reset()
 
@@ -376,6 +384,8 @@ class SWARadixCache(BasePrefixCache):
         The last node create a new child if the prefix is shorter
         than the last node's value.
         """
+        key.token_ids = self.key_convert_fn(key.token_ids)
+
         if self.disable or len(key) == 0:
             return MatchResult(
                 device_indices=torch.empty(
@@ -406,8 +416,15 @@ class SWARadixCache(BasePrefixCache):
         if self.disable:
             return 0
 
+        key.token_ids = self.key_convert_fn(key.token_ids)
+
         if value is None:
             value = torch.tensor([x for x in key.token_ids], dtype=torch.int64)
+
+        if self.is_eagle:
+            # Make sure the value len equal to the EAGLE bigram key len
+            value = value[: len(key)]
+
         return self._insert_helper(self.root_node, key, value, prev_prefix_len)
 
     def cache_finished_req(self, req: Req) -> None:
@@ -422,25 +439,41 @@ class SWARadixCache(BasePrefixCache):
             return
 
         token_ids = (req.origin_input_ids + req.output_ids)[:-1]
+        all_token_len = len(token_ids)
+        # For EAGLE radix cache, we will convert the key to bigram key, e.g. [1,2,3,4] -> [(1,2), (2,3), (3,4)], the length will -1. ((len([(1,2), (2,3), (3,4)]) = len([1,2,3,4]) - 1))
+        # So for the corresponding kv length should also -1. Then we get the actual_kv_len, and use it to do later calculation and slicing.
+        actual_kv_len = all_token_len - 1 if self.is_eagle else all_token_len
         kv_indices = self.req_to_token_pool.req_to_token[
-            req.req_pool_idx, :
+            req.req_pool_idx, :all_token_len
         ]
 
         if self.page_size != 1:
-            page_aligned_len =
+            page_aligned_len = actual_kv_len // self.page_size * self.page_size
             page_aligned_kv_indices = kv_indices[:page_aligned_len].clone()
             self.token_to_kv_pool_allocator.free(kv_indices[page_aligned_len:])
         else:
-            page_aligned_len =
+            page_aligned_len = actual_kv_len
             page_aligned_kv_indices = kv_indices.clone()
+            if self.is_eagle:
+                self.token_to_kv_pool_allocator.free(kv_indices[page_aligned_len:])
+
+        page_aligned_token_len = (
+            page_aligned_len + 1 if self.is_eagle else page_aligned_len
+        )
+
+        old_prefix_len = len(req.prefix_indices)
+        if self.is_eagle and old_prefix_len > req.last_matched_prefix_len:
+            # In EAGLE chunked prefill case, the prefix_indices included one unmatched token (kv_indices[actual_kv_len:])
+            # Here we -1 to make sure the kv of the unmatched token can be freed correctly to avoid memory leak
+            old_prefix_len -= 1
 
         # Radix Cache takes one ref in memory pool
         # insert the token_ids and kv_indices into the radix tree
         # Note: the insert function already frees the overlapped kv_indices
         new_prefix_len = self.insert(
-            RadixKey(token_ids[:
+            RadixKey(token_ids[:page_aligned_token_len], req.extra_key),
             page_aligned_kv_indices,
-
+            old_prefix_len,
         )
 
         # Remove req slot release the cache lock
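The EAGLE length bookkeeping in this hunk reduces to small integer adjustments; a worked example with invented numbers shows why the key slice gets `+1` back after the KV length lost `-1`:

```python
is_eagle = True
page_size = 4
all_token_len = 11  # tokens whose KV this request actually cached

# Bigram keys are one shorter than the token sequence.
actual_kv_len = all_token_len - 1 if is_eagle else all_token_len           # 10

# Page alignment is done on the KV (bigram) length...
page_aligned_len = actual_kv_len // page_size * page_size                  # 8

# ...but the radix key is sliced from the original token ids, so add 1 back.
page_aligned_token_len = (
    page_aligned_len + 1 if is_eagle else page_aligned_len                 # 9
)

assert (actual_kv_len, page_aligned_len, page_aligned_token_len) == (10, 8, 9)
```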
@@ -459,39 +492,56 @@ class SWARadixCache(BasePrefixCache):
             return
 
         token_ids = req.fill_ids
+        all_token_len = len(token_ids)
+        # For EAGLE radix cache, we will convert the key to bigram key, e.g. [1,2,3,4] -> [(1,2), (2,3), (3,4)], the length will -1. ((len([(1,2), (2,3), (3,4)]) = len([1,2,3,4]) - 1))
+        # So for the corresponding kv length should also -1. Then we get the actual_kv_len, and use it to do later calculation and slicing.
+        actual_kv_len = all_token_len - 1 if self.is_eagle else all_token_len
         kv_indices = self.req_to_token_pool.req_to_token[
-            req.req_pool_idx, :
+            req.req_pool_idx, :all_token_len
         ]
 
         if self.page_size != 1:
-            page_aligned_len =
+            page_aligned_len = actual_kv_len // self.page_size * self.page_size
             page_aligned_kv_indices = kv_indices[:page_aligned_len].clone()
         else:
-            page_aligned_len =
+            page_aligned_len = actual_kv_len
             page_aligned_kv_indices = kv_indices.clone()
-
+
+        # For EAGLE, the page_aligned_len is for the bigram key, the normal key len should +1
+        page_aligned_token_len = (
+            page_aligned_len + 1 if self.is_eagle else page_aligned_len
+        )
+        page_aligned_token_ids = token_ids[:page_aligned_token_len]
+
+        old_prefix_len = len(req.prefix_indices)
+        if self.is_eagle and old_prefix_len > req.last_matched_prefix_len:
+            # In EAGLE chunked prefill case, the prefix_indices included one unmatched token (kv_indices[actual_kv_len:])
+            # Here we -1 to make sure the kv of the unmatched token can be freed correctly to avoid memory leak
+            old_prefix_len -= 1
 
         # Radix Cache takes one ref in memory pool
         # Note: the insert function already frees the overlapped kv_indices
         new_prefix_len = self.insert(
             RadixKey(page_aligned_token_ids, req.extra_key),
             page_aligned_kv_indices,
-
+            old_prefix_len,
         )
 
         # The prefix indices could be updated, reuse it
         new_indices, new_last_node, _, _ = self.match_prefix(
             RadixKey(page_aligned_token_ids, req.extra_key)
         )
-        assert
+        assert old_prefix_len <= len(
             new_indices
         ), f"{req.prefix_indices=}, {new_indices=}"
         assert new_prefix_len <= len(new_indices), f"{new_prefix_len=}, {new_indices=}"
         self.req_to_token_pool.write(
-            (req.req_pool_idx, slice(
-            new_indices[
+            (req.req_pool_idx, slice(old_prefix_len, len(new_indices))),
+            new_indices[old_prefix_len:],
         )
 
+        req.last_matched_prefix_len = len(new_indices)
+
         self.dec_lock_ref(req.last_node, req.swa_uuid_for_lock)
         swa_uuid_for_lock = self.inc_lock_ref(new_last_node)
 
@@ -501,7 +551,13 @@ class SWARadixCache(BasePrefixCache):
                 [new_indices, kv_indices[len(new_indices) :]]
             )
         else:
-            req.prefix_indices = new_indices
+            if self.is_eagle:
+                # Attach the kv index of the last token for EAGLE, it can be used in chunked prefill
+                req.prefix_indices = torch.cat(
+                    [new_indices, kv_indices[actual_kv_len:]]
+                )
+            else:
+                req.prefix_indices = new_indices
         req.last_node = new_last_node
         req.swa_uuid_for_lock = swa_uuid_for_lock
 
sglang/srt/model_executor/cuda_graph_runner.py

@@ -849,7 +849,7 @@ class CudaGraphRunner:
             )
 
         elif self.model_runner.spec_algorithm.is_ngram():
-            from sglang.srt.speculative.ngram_utils import NgramVerifyInput
+            from sglang.srt.speculative.ngram_info import NgramVerifyInput
 
             spec_info = NgramVerifyInput(
                 draft_token=None,
|