sglang 0.4.4.post2__py3-none-any.whl → 0.4.4.post3__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (57)
  1. sglang/bench_serving.py +23 -3
  2. sglang/srt/configs/deepseekvl2.py +10 -1
  3. sglang/srt/configs/model_config.py +5 -16
  4. sglang/srt/distributed/device_communicators/custom_all_reduce.py +1 -1
  5. sglang/srt/distributed/parallel_state.py +32 -5
  6. sglang/srt/entrypoints/http_server.py +7 -1
  7. sglang/srt/entrypoints/verl_engine.py +2 -0
  8. sglang/srt/function_call_parser.py +0 -1
  9. sglang/srt/layers/attention/flashattention_backend.py +218 -79
  10. sglang/srt/layers/dp_attention.py +12 -1
  11. sglang/srt/layers/moe/topk.py +30 -3
  12. sglang/srt/layers/quantization/__init__.py +134 -165
  13. sglang/srt/layers/quantization/awq.py +200 -0
  14. sglang/srt/layers/quantization/fp8_kernel.py +2 -1
  15. sglang/srt/layers/quantization/gptq.py +30 -40
  16. sglang/srt/layers/quantization/w8a8_fp8.py +1 -1
  17. sglang/srt/layers/rotary_embedding.py +12 -0
  18. sglang/srt/lora/backend/base_backend.py +4 -4
  19. sglang/srt/lora/backend/flashinfer_backend.py +12 -9
  20. sglang/srt/lora/backend/triton_backend.py +5 -8
  21. sglang/srt/lora/layers.py +19 -33
  22. sglang/srt/lora/lora_manager.py +20 -7
  23. sglang/srt/lora/mem_pool.py +12 -6
  24. sglang/srt/lora/triton_ops/gate_up_lora_b.py +10 -4
  25. sglang/srt/lora/triton_ops/qkv_lora_b.py +8 -3
  26. sglang/srt/lora/triton_ops/sgemm_lora_a.py +16 -5
  27. sglang/srt/lora/triton_ops/sgemm_lora_b.py +11 -6
  28. sglang/srt/lora/utils.py +6 -0
  29. sglang/srt/managers/io_struct.py +4 -2
  30. sglang/srt/managers/multimodal_processors/clip.py +63 -0
  31. sglang/srt/managers/schedule_batch.py +1 -0
  32. sglang/srt/managers/scheduler.py +25 -19
  33. sglang/srt/managers/tokenizer_manager.py +0 -1
  34. sglang/srt/managers/tp_worker.py +3 -0
  35. sglang/srt/model_executor/cuda_graph_runner.py +9 -8
  36. sglang/srt/model_executor/model_runner.py +9 -6
  37. sglang/srt/model_loader/loader.py +11 -1
  38. sglang/srt/model_loader/weight_utils.py +6 -3
  39. sglang/srt/models/clip.py +563 -0
  40. sglang/srt/models/deepseek_janus_pro.py +2 -2
  41. sglang/srt/models/deepseek_v2.py +151 -26
  42. sglang/srt/models/gemma3_causal.py +12 -2
  43. sglang/srt/models/gemma3_mm.py +6 -0
  44. sglang/srt/openai_api/adapter.py +88 -87
  45. sglang/srt/openai_api/protocol.py +10 -5
  46. sglang/srt/patch_torch.py +71 -0
  47. sglang/srt/server_args.py +21 -11
  48. sglang/srt/speculative/eagle_worker.py +1 -1
  49. sglang/srt/utils.py +33 -0
  50. sglang/test/runners.py +27 -2
  51. sglang/test/test_utils.py +1 -1
  52. sglang/version.py +1 -1
  53. {sglang-0.4.4.post2.dist-info → sglang-0.4.4.post3.dist-info}/METADATA +8 -4
  54. {sglang-0.4.4.post2.dist-info → sglang-0.4.4.post3.dist-info}/RECORD +57 -53
  55. {sglang-0.4.4.post2.dist-info → sglang-0.4.4.post3.dist-info}/WHEEL +0 -0
  56. {sglang-0.4.4.post2.dist-info → sglang-0.4.4.post3.dist-info}/licenses/LICENSE +0 -0
  57. {sglang-0.4.4.post2.dist-info → sglang-0.4.4.post3.dist-info}/top_level.txt +0 -0
sglang/bench_serving.py CHANGED
@@ -965,7 +965,7 @@ async def benchmark(
     request_rate: float,
     max_concurrency: Optional[int],
     disable_tqdm: bool,
-    lora_name: str,
+    lora_names: List[str],
     extra_request_body: Dict[str, Any],
     profile: bool,
     pd_seperated: bool = False,
@@ -988,6 +988,11 @@ async def benchmark(
     # Warmup
     print("Starting initial single prompt test run...")
     test_prompt, test_prompt_len, test_output_len = input_requests[0]
+    if lora_names != None and len(lora_names) != 0:
+        lora_name = lora_names[0]
+    else:
+        lora_name = None
+
     test_input = RequestFuncInput(
         model=model_id,
         prompt=test_prompt,
@@ -1028,6 +1033,12 @@ async def benchmark(
     tasks: List[asyncio.Task] = []
     async for request in get_request(input_requests, request_rate):
         prompt, prompt_len, output_len = request
+        if lora_names != None and len(lora_names) != 0:
+            idx = random.randint(0, len(lora_names) - 1)
+            lora_name = lora_names[idx]
+        else:
+            lora_name = None
+
         request_func_input = RequestFuncInput(
             model=model_id,
             prompt=prompt,
@@ -1347,7 +1358,7 @@ def run_benchmark(args_: argparse.Namespace):
             request_rate=args.request_rate,
             max_concurrency=args.max_concurrency,
             disable_tqdm=args.disable_tqdm,
-            lora_name=args.lora_name,
+            lora_names=args.lora_name,
             extra_request_body=extra_request_body,
             profile=args.profile,
             pd_seperated=args.pd_seperated,
@@ -1366,6 +1377,13 @@ def set_ulimit(target_soft_limit=65535):
         print(f"Fail to set RLIMIT_NOFILE: {e}")


+class LoRAPathAction(argparse.Action):
+    def __call__(self, parser, namespace, values, option_string=None):
+        setattr(namespace, self.dest, [])
+        for lora_name in values:
+            getattr(namespace, self.dest).append(lora_name)
+
+
 if __name__ == "__main__":
     parser = ArgumentParser(description="Benchmark the online serving throughput.")
     parser.add_argument(
@@ -1509,8 +1527,10 @@ if __name__ == "__main__":
     parser.add_argument(
         "--lora-name",
         type=str,
+        nargs="*",
         default=None,
-        help="The name of LoRA adapter",
+        action=LoRAPathAction,
+        help="The names of LoRA adapters. You can provide a list of names in the format {name} {name} {name}...",
     )
     parser.add_argument(
         "--prompt-suffix",
sglang/srt/configs/deepseekvl2.py CHANGED
@@ -4,7 +4,6 @@ from dataclasses import dataclass
 from typing import Dict, List, Optional, Tuple

 import torch
-import torchvision.transforms as T
 from PIL import Image, ImageOps
 from transformers import (
     AutoProcessor,
@@ -76,6 +75,16 @@ class ImageTransform(object):
         self.std = std
         self.normalize = normalize

+        # only load torchvision.transforms when needed
+        try:
+            import torchvision.transforms as T
+
+            # FIXME: add version check for gguf
+        except ImportError as err:
+            raise ImportError(
+                "Please install torchvision via `pip install torchvision` to use Deepseek-VL2."
+            ) from err
+
         transform_pipelines = [T.ToTensor()]

         if normalize:
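The torchvision dependency is now imported lazily inside `ImageTransform`, so importing the DeepSeek-VL2 config no longer requires torchvision to be installed. A generic sketch of the deferred-import pattern used here (the helper name is hypothetical):

```python
def make_to_tensor():
    # Import torchvision only when a transform is actually constructed, and turn
    # a missing dependency into an actionable error message.
    try:
        import torchvision.transforms as T
    except ImportError as err:
        raise ImportError(
            "Please install torchvision via `pip install torchvision` to use Deepseek-VL2."
        ) from err
    return T.ToTensor()
```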
sglang/srt/configs/model_config.py CHANGED
@@ -22,11 +22,7 @@ import torch
 from transformers import PretrainedConfig

 from sglang.srt.hf_transformers_utils import get_config, get_context_length
-from sglang.srt.layers.quantization import (
-    BASE_QUANTIZATION_METHODS,
-    QUANTIZATION_METHODS,
-    VLLM_AVAILABLE,
-)
+from sglang.srt.layers.quantization import QUANTIZATION_METHODS
 from sglang.srt.utils import get_bool_env_var, is_hip

 logger = logging.getLogger(__name__)
@@ -239,12 +235,7 @@ class ModelConfig:

     # adapted from https://github.com/vllm-project/vllm/blob/v0.6.4.post1/vllm/config.py
     def _verify_quantization(self) -> None:
-        # Select supported quantization methods based on vllm availability
-        if VLLM_AVAILABLE:
-            supported_quantization = [*QUANTIZATION_METHODS]
-        else:
-            supported_quantization = [*BASE_QUANTIZATION_METHODS]
-
+        supported_quantization = [*QUANTIZATION_METHODS]
         rocm_supported_quantization = [
             "awq",
             "gptq",
@@ -282,11 +273,7 @@ class ModelConfig:
             quant_method = quant_cfg.get("quant_method", "").lower()

             # Detect which checkpoint is it
-            # Only iterate through currently available quantization methods
-            available_methods = (
-                QUANTIZATION_METHODS if VLLM_AVAILABLE else BASE_QUANTIZATION_METHODS
-            )
-            for _, method in available_methods.items():
+            for _, method in QUANTIZATION_METHODS.items():
                 quantization_override = method.override_quantization_method(
                     quant_cfg, self.quantization
                 )
@@ -467,6 +454,7 @@ def is_generation_model(model_architectures: List[str], is_embedding: bool = Fal
         or "InternLM2ForRewardModel" in model_architectures
         or "Qwen2ForRewardModel" in model_architectures
         or "Qwen2ForSequenceClassification" in model_architectures
+        or "CLIPModel" in model_architectures
     ):
         return False
     else:
@@ -488,6 +476,7 @@ multimodal_model_archs = [
     "MllamaForConditionalGeneration",
     "Qwen2VLForConditionalGeneration",
     "Qwen2_5_VLForConditionalGeneration",
+    "CLIPModel",
 ]


sglang/srt/distributed/device_communicators/custom_all_reduce.py CHANGED
@@ -5,7 +5,7 @@ import logging
 import os
 from contextlib import contextmanager
 from functools import wraps
-from typing import Callable, List, Optional, TypeVar, Union
+from typing import Any, Callable, List, Optional, TypeVar, Union

 import torch
 import torch.distributed as dist
sglang/srt/distributed/parallel_state.py CHANGED
@@ -264,10 +264,16 @@ class GroupCoordinator:
         self.ca_comm: Optional[CustomAllreduce] = None
         if use_custom_allreduce and self.world_size > 1:
             # Initialize a custom fast all-reduce implementation.
-            self.ca_comm = CustomAllreduce(
-                group=self.cpu_group,
-                device=self.device,
-            )
+            try:
+                self.ca_comm = CustomAllreduce(
+                    group=self.cpu_group,
+                    device=self.device,
+                )
+            except Exception as e:
+                logger.warning(
+                    f"Setup Custom allreduce failed with {e}. To silence this "
+                    "warning, specify --disable-custom-all-reduce explicitly."
+                )

         from sglang.srt.distributed.device_communicators.hpu_communicator import (
             HpuCommunicator,
@@ -439,6 +445,15 @@ class GroupCoordinator:
         else:
             torch.distributed.all_reduce(input_, group=self.device_group)

+    def reduce_scatter(
+        self,
+        output: torch.Tensor,
+        input_list: List[torch.Tensor],
+    ) -> None:
+        # TODO(ch-wan): support other backends
+        torch.distributed.reduce_scatter(output, input_list, group=self.device_group)
+        return output
+
     def _all_gather_into_tensor(self, output: torch.Tensor, input: torch.Tensor):
         pynccl_comm = self.pynccl_comm
         if pynccl_comm is not None and not pynccl_comm.disabled:
@@ -456,11 +471,23 @@ class GroupCoordinator:
                 output, input, group_name=self.unique_name
             )

-    def all_gather(self, input_: torch.Tensor, dim: int = -1) -> torch.Tensor:
+    def all_gather(
+        self,
+        input_: torch.Tensor,
+        dim: int = -1,
+        tensor_list: List[torch.Tensor] = None,
+    ) -> torch.Tensor:
         world_size = self.world_size
         # Bypass the function if we are using only 1 GPU.
         if world_size == 1:
             return input_
+
+        if tensor_list is not None:
+            # TODO(ch-wan): support other backends
+            return torch.distributed.all_gather(
+                tensor_list, input_, group=self.device_group
+            )
+
         assert (
             -input_.dim() <= dim < input_.dim()
         ), f"Invalid dim ({dim}) for input tensor with shape {input_.size()}"
sglang/srt/entrypoints/http_server.py CHANGED
@@ -561,7 +561,13 @@ def available_models():
     served_model_names = [_global_state.tokenizer_manager.served_model_name]
     model_cards = []
     for served_model_name in served_model_names:
-        model_cards.append(ModelCard(id=served_model_name, root=served_model_name))
+        model_cards.append(
+            ModelCard(
+                id=served_model_name,
+                root=served_model_name,
+                max_model_len=_global_state.tokenizer_manager.model_config.context_len,
+            )
+        )
     return ModelList(data=model_cards)


sglang/srt/entrypoints/verl_engine.py CHANGED
@@ -19,6 +19,7 @@ import torch.distributed as dist
 from torch.distributed.tensor import DeviceMesh, DTensor

 from sglang.srt.model_executor.model_runner import LocalSerializedTensor
+from sglang.srt.patch_torch import monkey_patch_torch_reductions
 from sglang.srt.server import Engine
 from sglang.srt.utils import MultiprocessingSerializer, broadcast_pyobj

@@ -30,6 +31,7 @@ class VerlEngine:
         nnodes: int = 1,
         **kwargs,
     ):
+        monkey_patch_torch_reductions()
         self._device_mesh_cpu = device_mesh_cpu
         self._tp_rank = device_mesh_cpu.get_local_rank()
         self._tp_size = device_mesh_cpu.size()
sglang/srt/function_call_parser.py CHANGED
@@ -290,7 +290,6 @@ class BaseFormatDetector(ABC):
                 calls=[
                     ToolCallItem(
                         tool_index=self.current_tool_id,
-                        name="",
                         parameters=argument_diff,
                     )
                 ],
sglang/srt/layers/attention/flashattention_backend.py CHANGED
@@ -13,7 +13,9 @@ from typing import TYPE_CHECKING, Optional, Union

 import torch

+from sglang.srt.configs.model_config import AttentionArch
 from sglang.srt.layers.attention.base_attn_backend import AttentionBackend
+from sglang.srt.managers.schedule_batch import global_server_args_dict
 from sglang.srt.model_executor.forward_batch_info import ForwardBatch, ForwardMode

 if TYPE_CHECKING:
@@ -29,11 +31,11 @@ class FlashAttentionMetadata:

     cu_seqlens_q: torch.Tensor = None
     cu_seqlens_k: torch.Tensor = None
+    max_seq_len_q: int = 0
     max_seq_len_k: int = 0
     window_size: tuple = (-1, -1)
     page_table: torch.Tensor = None
     cache_seqlens_int32: torch.Tensor = None
-    max_seq_len_q: int = 0


 class FlashAttentionBackend(AttentionBackend):
@@ -57,13 +59,16 @@ class FlashAttentionBackend(AttentionBackend):
         self.device = model_runner.device
         self.decode_cuda_graph_metadata = {}
         self.req_to_token = model_runner.req_to_token_pool.req_to_token
+        self.page_size = model_runner.page_size
+        self.use_mla = (
+            model_runner.model_config.attention_arch == AttentionArch.MLA
+        ) and (not global_server_args_dict["disable_mla"])

     def init_forward_metadata(self, forward_batch: ForwardBatch):
         """Initialize forward metadata to cache repetitive calculations."""
         # Create metadata based on forward mode
         metadata = FlashAttentionMetadata()

-        extend_seq_lens = forward_batch.extend_seq_lens
         # Get sequence information
         seqlens_in_batch = forward_batch.seq_lens
         # Precompute int32 version of sequence lengths
@@ -79,21 +84,33 @@ class FlashAttentionBackend(AttentionBackend):
         metadata.page_table = forward_batch.req_to_token_pool.req_to_token[
             forward_batch.req_pool_indices, : metadata.max_seq_len_k
         ]
+
+        # Precompute strided indices
+        # [0, page_size, 2 * page_size, ...]
+        if self.page_size > 1:
+            self.strided_indices = torch.arange(
+                0, metadata.page_table.shape[1], self.page_size, device=self.device
+            )
+            metadata.page_table = (
+                metadata.page_table[:, self.strided_indices] // self.page_size
+            )
+
         if forward_batch.forward_mode == ForwardMode.DECODE:
             # Precompute cumulative sequence lengths
             metadata.cu_seqlens_q = torch.arange(
                 0, batch_size + 1, dtype=torch.int32, device=device
             )
         else:
-            extend_no_prefix = not any(forward_batch.extend_prefix_lens)
             # Precompute cumulative sequence lengths
-            if not extend_no_prefix:
+            if any(forward_batch.extend_prefix_lens_cpu):
+                extend_seq_lens = forward_batch.extend_seq_lens
                 metadata.cu_seqlens_q = torch.nn.functional.pad(
                     torch.cumsum(extend_seq_lens, dim=0, dtype=torch.int32), (1, 0)
                 )
+                metadata.max_seq_len_q = max(forward_batch.extend_seq_lens_cpu)
             else:
                 metadata.cu_seqlens_q = metadata.cu_seqlens_k
-                metadata.max_seq_len_q = seqlens_in_batch.max().item()
+                metadata.max_seq_len_q = metadata.max_seq_len_k
         self.forward_metadata = metadata

     def forward_extend(
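When `page_size > 1`, `init_forward_metadata` turns the token-level `req_to_token` mapping into page indices by sampling every `page_size`-th column and dividing by `page_size`. A toy illustration of that transformation with made-up cache locations (assumes a paged allocator keeps each request's tokens page-aligned):

```python
import torch

page_size = 4
# Toy req_to_token row: token slot -> KV cache location for one request of 8 tokens.
req_to_token = torch.tensor([[20, 21, 22, 23, 36, 37, 38, 39]])

# [0, page_size, 2 * page_size, ...]: one representative slot per page.
strided_indices = torch.arange(0, req_to_token.shape[1], page_size)
page_table = req_to_token[:, strided_indices] // page_size
print(page_table)  # tensor([[5, 9]]) -> this request occupies pages 5 and 9
```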
@@ -105,23 +122,30 @@ class FlashAttentionBackend(AttentionBackend):
         forward_batch: ForwardBatch,
         save_kv_cache=True,
     ):
-        cache_loc = (
-            forward_batch.out_cache_loc
-            if not layer.is_cross_attention
-            else forward_batch.encoder_out_cache_loc
-        )

         if k is not None:
             assert v is not None
             if save_kv_cache:
-                forward_batch.token_to_kv_pool.set_kv_buffer(
-                    layer, cache_loc, k, v, layer.k_scale, layer.v_scale
+                cache_loc = (
+                    forward_batch.out_cache_loc
+                    if not layer.is_cross_attention
+                    else forward_batch.encoder_out_cache_loc
                 )
+                if not self.use_mla:
+                    forward_batch.token_to_kv_pool.set_kv_buffer(
+                        layer, cache_loc, k, v, layer.k_scale, layer.v_scale
+                    )
+                else:
+                    forward_batch.token_to_kv_pool.set_kv_buffer(
+                        layer,
+                        cache_loc,
+                        k,
+                        v,
+                    )

         # Use precomputed metadata
         metadata = self.forward_metadata

-        # # Use Flash Attention for prefill
         # Calculate window size (can be moved to metadata if layer properties don't change)
         # we don't do layer.sliding_window_size - 1 since in model.get_attention_sliding_window_size() we already - 1
         # here is two side inclusive
@@ -130,26 +154,72 @@ class FlashAttentionBackend(AttentionBackend):
             if layer.sliding_window_size is not None
             else (-1, -1)
         )
-        kv_cache = forward_batch.token_to_kv_pool.get_kv_buffer(layer.layer_id)
-        key_cache, value_cache = kv_cache[0], kv_cache[1]
-        o = flash_attn_with_kvcache(
-            q=q.contiguous().view(-1, layer.tp_q_head_num, layer.head_dim),
-            k_cache=key_cache.unsqueeze(1),
-            v_cache=value_cache.unsqueeze(1),
-            page_table=metadata.page_table,
-            cache_seqlens=metadata.cache_seqlens_int32,
-            cu_seqlens_q=metadata.cu_seqlens_q,
-            cu_seqlens_k_new=metadata.cu_seqlens_k,
-            max_seqlen_q=metadata.max_seq_len_q,
-            softmax_scale=layer.scaling,
-            causal=True,
-            window_size=window_size,
-            softcap=layer.logit_cap,
-            k_descale=layer.k_scale,
-            v_descale=layer.v_scale,
-        )

-        return o.view(-1, layer.tp_q_head_num * layer.head_dim)
+        page_table = metadata.page_table
+
+        # # Use Flash Attention for prefill
+        if not self.use_mla:
+            # Do multi-head attention
+            kv_cache = forward_batch.token_to_kv_pool.get_kv_buffer(layer.layer_id)
+            key_cache, value_cache = kv_cache[0], kv_cache[1]
+            key_cache = key_cache.view(
+                -1, self.page_size, layer.tp_k_head_num, layer.head_dim
+            )
+            value_cache = value_cache.view(
+                -1, self.page_size, layer.tp_v_head_num, layer.head_dim
+            )
+            o = flash_attn_with_kvcache(
+                q=q.contiguous().view(-1, layer.tp_q_head_num, layer.head_dim),
+                k_cache=key_cache,
+                v_cache=value_cache,
+                page_table=page_table,
+                cache_seqlens=metadata.cache_seqlens_int32,
+                cu_seqlens_q=metadata.cu_seqlens_q,
+                cu_seqlens_k_new=metadata.cu_seqlens_k,
+                max_seqlen_q=metadata.max_seq_len_q,
+                softmax_scale=layer.scaling,
+                causal=True,
+                window_size=window_size,
+                softcap=layer.logit_cap,
+                k_descale=layer.k_scale,
+                v_descale=layer.v_scale,
+            )
+        else:
+            # Do absorbed multi-latent attention
+            kv_cache = forward_batch.token_to_kv_pool.get_key_buffer(layer.layer_id)
+            k_rope = kv_cache[:, :, layer.v_head_dim :]
+            c_kv = kv_cache[:, :, : layer.v_head_dim]
+            k_rope_cache = k_rope.view(
+                -1,
+                self.page_size,
+                layer.tp_k_head_num,
+                layer.head_dim - layer.v_head_dim,
+            )
+            c_kv_cache = c_kv.view(
+                -1, self.page_size, layer.tp_v_head_num, layer.v_head_dim
+            )
+
+            q_all = q.contiguous().view(-1, layer.tp_q_head_num, layer.head_dim)
+            q_nope = q_all[:, :, : layer.v_head_dim]
+            q_rope = q_all[:, :, layer.v_head_dim :]
+            o = flash_attn_with_kvcache(
+                q=q_rope,
+                k_cache=k_rope_cache,
+                v_cache=c_kv_cache,
+                qv=q_nope,
+                page_table=page_table,
+                cache_seqlens=metadata.cache_seqlens_int32,
+                cu_seqlens_q=metadata.cu_seqlens_q,
+                cu_seqlens_k_new=metadata.cu_seqlens_k,
+                max_seqlen_q=metadata.max_seq_len_q,
+                softmax_scale=layer.scaling,
+                causal=True,
+                softcap=layer.logit_cap,
+                k_descale=layer.k_scale,
+                v_descale=layer.v_scale,
+            )
+
+        return o.view(-1, layer.tp_q_head_num * layer.v_head_dim)

     def forward_decode(
         self,
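In the MLA branch, a single compressed buffer is stored per token: the first `v_head_dim` entries are the latent `c_kv` and the remainder is the rotary part `k_rope`, and the query is split the same way into `q_nope`/`q_rope` before `flash_attn_with_kvcache` is called with `qv=q_nope`. A shape-only sketch of that slicing with toy dimensions (plain torch, no FlashAttention call; the 512/64 split is just an example):

```python
import torch

num_tokens, tp_q_head_num = 3, 16
v_head_dim, rope_dim = 512, 64      # example stand-ins for the latent and rope widths
head_dim = v_head_dim + rope_dim    # width of the stored key buffer

q = torch.randn(num_tokens, tp_q_head_num, head_dim)
kv_cache = torch.randn(num_tokens, 1, head_dim)  # one latent "head" per token

q_nope, q_rope = q[:, :, :v_head_dim], q[:, :, v_head_dim:]
c_kv, k_rope = kv_cache[:, :, :v_head_dim], kv_cache[:, :, v_head_dim:]

print(q_nope.shape, q_rope.shape)  # torch.Size([3, 16, 512]) torch.Size([3, 16, 64])
print(c_kv.shape, k_rope.shape)    # torch.Size([3, 1, 512]) torch.Size([3, 1, 64])
```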
@@ -162,26 +232,29 @@ class FlashAttentionBackend(AttentionBackend):
     ) -> torch.Tensor:
         """Forward pass with FlashAttention using precomputed metadata."""
         # Save KV cache if needed
-        if k is not None and v is not None and save_kv_cache:
-            cache_loc = (
-                forward_batch.out_cache_loc
-                if not layer.is_cross_attention
-                else forward_batch.encoder_out_cache_loc
-            )
-            forward_batch.token_to_kv_pool.set_kv_buffer(
-                layer, cache_loc, k, v, layer.k_scale, layer.v_scale
-            )
-
-        # Get KV cache
-        kv_cache = forward_batch.token_to_kv_pool.get_kv_buffer(layer.layer_id)
-        key_cache, value_cache = kv_cache[0], kv_cache[1]
+        if k is not None:
+            assert v is not None
+            if save_kv_cache:
+                cache_loc = (
+                    forward_batch.out_cache_loc
+                    if not layer.is_cross_attention
+                    else forward_batch.encoder_out_cache_loc
+                )
+                if not self.use_mla:
+                    forward_batch.token_to_kv_pool.set_kv_buffer(
+                        layer, cache_loc, k, v, layer.k_scale, layer.v_scale
+                    )
+                else:
+                    forward_batch.token_to_kv_pool.set_kv_buffer(
+                        layer,
+                        cache_loc,
+                        k,
+                        v,
+                    )

         # Use precomputed metadata
         metadata = self.forward_metadata

-        # Pre-reshape query tensor
-        q_reshaped = q.contiguous().view(-1, layer.tp_q_head_num, layer.head_dim)
-
         # Calculate window size (can be moved to metadata if layer properties don't change)
         # we don't do layer.sliding_window_size - 1 since in model.get_attention_sliding_window_size() we already - 1
         # here is two side inclusive
@@ -190,25 +263,79 @@ class FlashAttentionBackend(AttentionBackend):
             if layer.sliding_window_size is not None
             else (-1, -1)
         )
-        # Run attention with precomputed values
-        o = flash_attn_with_kvcache(
-            q=q_reshaped,
-            k_cache=key_cache.unsqueeze(1),
-            v_cache=value_cache.unsqueeze(1),
-            page_table=metadata.page_table,
-            cache_seqlens=metadata.cache_seqlens_int32,
-            cu_seqlens_q=metadata.cu_seqlens_q,
-            cu_seqlens_k_new=metadata.cu_seqlens_k,
-            max_seqlen_q=1,
-            softmax_scale=layer.scaling,
-            causal=True,
-            window_size=window_size,
-            softcap=layer.logit_cap,
-            k_descale=layer.k_scale,
-            v_descale=layer.v_scale,
-        )

-        return o.view(-1, layer.tp_q_head_num * layer.head_dim)
+        page_table = metadata.page_table
+
+        if not self.use_mla:
+            # Do multi-head attention
+
+            # Get KV cache
+            kv_cache = forward_batch.token_to_kv_pool.get_kv_buffer(layer.layer_id)
+            key_cache, value_cache = kv_cache[0], kv_cache[1]
+            key_cache = key_cache.view(
+                -1, self.page_size, layer.tp_k_head_num, layer.head_dim
+            )
+            value_cache = value_cache.view(
+                -1, self.page_size, layer.tp_v_head_num, layer.head_dim
+            )
+
+            # Pre-reshape query tensor
+            q_reshaped = q.contiguous().view(-1, layer.tp_q_head_num, layer.head_dim)
+
+            # Run attention with precomputed values
+            o = flash_attn_with_kvcache(
+                q=q_reshaped,
+                k_cache=key_cache,
+                v_cache=value_cache,
+                page_table=page_table,
+                cache_seqlens=metadata.cache_seqlens_int32,
+                cu_seqlens_q=metadata.cu_seqlens_q,
+                cu_seqlens_k_new=metadata.cu_seqlens_k,
+                max_seqlen_q=1,
+                softmax_scale=layer.scaling,
+                causal=True,
+                window_size=window_size,
+                softcap=layer.logit_cap,
+                k_descale=layer.k_scale,
+                v_descale=layer.v_scale,
+            )
+        else:
+            # Do absorbed multi-latent attention
+            kv_cache = forward_batch.token_to_kv_pool.get_key_buffer(layer.layer_id)
+            k_rope = kv_cache[:, :, layer.v_head_dim :]
+            c_kv = kv_cache[:, :, : layer.v_head_dim]
+            k_rope_cache = k_rope.view(
+                -1,
+                self.page_size,
+                layer.tp_k_head_num,
+                layer.head_dim - layer.v_head_dim,
+            )
+            c_kv_cache = c_kv.view(
+                -1, self.page_size, layer.tp_v_head_num, layer.v_head_dim
+            )
+
+            q_all = q.contiguous().view(-1, layer.tp_q_head_num, layer.head_dim)
+            q_nope = q_all[:, :, : layer.v_head_dim]
+            q_rope = q_all[:, :, layer.v_head_dim :]
+
+            o = flash_attn_with_kvcache(
+                q=q_rope,
+                k_cache=k_rope_cache,
+                v_cache=c_kv_cache,
+                qv=q_nope,
+                page_table=page_table,
+                cache_seqlens=metadata.cache_seqlens_int32,
+                cu_seqlens_q=metadata.cu_seqlens_q,
+                cu_seqlens_k_new=metadata.cu_seqlens_k,
+                max_seqlen_q=1,
+                softmax_scale=layer.scaling,
+                causal=True,
+                softcap=layer.logit_cap,
+                k_descale=layer.k_scale,
+                v_descale=layer.v_scale,
+            )
+
+        return o.view(-1, layer.tp_q_head_num * layer.v_head_dim)

     def init_cuda_graph_state(self, max_bs: int):
         """Initialize CUDA graph state for the attention backend.
@@ -223,7 +350,13 @@ class FlashAttentionBackend(AttentionBackend):
         self.decode_cuda_graph_metadata = {
             # Page table for token mapping (batch_size, max_context_len)
             "page_table": torch.zeros(
-                max_bs, self.max_context_len, dtype=torch.int32, device=self.device
+                max_bs,
+                (self.max_context_len + self.page_size - 1) // self.page_size,
+                dtype=torch.int32,
+                device=self.device,
+            ),
+            "strided_indices": torch.arange(
+                0, self.max_context_len, self.page_size, device=self.device
             ),
         }

@@ -274,21 +407,27 @@ class FlashAttentionBackend(AttentionBackend):
         seq_lens_cpu: Optional[torch.Tensor],
     ):
         # """Initialize forward metadata for replaying CUDA graph."""
-        seqlens_in_batch = seq_lens[:bs]
         metadata = self.decode_cuda_graph_metadata[bs]
-        metadata.cache_seqlens_int32 = seqlens_in_batch.to(torch.int32)
+
+        # For CPU operations
+        max_len = seq_lens_cpu[:bs].max().item()
+        metadata.max_seq_len_k = max_len
+
+        # For GPU operations
+        seq_lens_in_batch = seq_lens[:bs]
+        metadata.cache_seqlens_int32 = seq_lens_in_batch.to(torch.int32)
         metadata.cu_seqlens_k = torch.nn.functional.pad(
-            torch.cumsum(seqlens_in_batch, dim=0, dtype=torch.int32), (1, 0)
+            torch.cumsum(seq_lens_in_batch, dim=0, dtype=torch.int32), (1, 0)
         )
-        # Precompute maximum sequence length
-        metadata.max_seq_len_k = seqlens_in_batch.max().item()
-        # Only zero out the part out of max_len_k
-        metadata.page_table[:, metadata.max_seq_len_k :].fill_(0)
-        # Then do the copy
-        metadata.page_table[:, : metadata.max_seq_len_k].copy_(
-            self.req_to_token[req_pool_indices[:bs], : metadata.max_seq_len_k]
-        )
-        self.forward_decode_metadata = metadata
+
+        max_seq_pages = (metadata.max_seq_len_k + self.page_size - 1) // self.page_size
+        page_indices = self.req_to_token[
+            :, self.decode_cuda_graph_metadata["strided_indices"][:max_seq_pages]
+        ]
+        page_indices = page_indices[req_pool_indices[:bs]] // self.page_size
+        metadata.page_table[:, :max_seq_pages].copy_(page_indices)
+        metadata.page_table[:, max_seq_pages:].fill_(0)
+        self.forward_metadata = metadata

     def get_cuda_graph_seq_len_fill_value(self):
         """Get the fill value for sequence length in CUDA graph."""