sglang 0.2.9__py3-none-any.whl → 0.2.10__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- sglang/bench_latency.py +114 -63
- sglang/check_env.py +2 -0
- sglang/lang/backend/runtime_endpoint.py +0 -11
- sglang/srt/hf_transformers_utils.py +2 -2
- sglang/srt/layers/extend_attention.py +59 -7
- sglang/srt/layers/radix_attention.py +22 -9
- sglang/srt/layers/token_attention.py +28 -2
- sglang/srt/managers/io_struct.py +9 -4
- sglang/srt/managers/schedule_batch.py +15 -11
- sglang/srt/managers/tokenizer_manager.py +28 -13
- sglang/srt/mem_cache/memory_pool.py +65 -24
- sglang/srt/model_config.py +11 -0
- sglang/srt/model_executor/model_runner.py +52 -21
- sglang/srt/models/deepseek_v2.py +198 -16
- sglang/srt/openai_api/adapter.py +120 -20
- sglang/srt/openai_api/protocol.py +1 -1
- sglang/srt/server.py +87 -78
- sglang/srt/server_args.py +8 -2
- sglang/srt/utils.py +25 -20
- sglang/test/run_eval.py +21 -10
- sglang/test/runners.py +237 -0
- sglang/test/simple_eval_common.py +12 -12
- sglang/test/simple_eval_gpqa.py +92 -0
- sglang/test/simple_eval_humaneval.py +5 -5
- sglang/test/simple_eval_math.py +72 -0
- sglang/test/test_utils.py +94 -13
- sglang/utils.py +15 -37
- sglang/version.py +1 -1
- {sglang-0.2.9.dist-info → sglang-0.2.10.dist-info}/METADATA +29 -27
- {sglang-0.2.9.dist-info → sglang-0.2.10.dist-info}/RECORD +33 -30
- {sglang-0.2.9.dist-info → sglang-0.2.10.dist-info}/LICENSE +0 -0
- {sglang-0.2.9.dist-info → sglang-0.2.10.dist-info}/WHEEL +0 -0
- {sglang-0.2.9.dist-info → sglang-0.2.10.dist-info}/top_level.txt +0 -0
sglang/srt/managers/io_struct.py
CHANGED
@@ -92,7 +92,7 @@ class GenerateReqInput:
                 for element in parallel_sample_num_list
             )
             if parallel_sample_num > 1 and (not all_equal):
-
+                # TODO cope with the case that the parallel_sample_num is different for different samples
                 raise ValueError(
                     "The parallel_sample_num should be the same for all samples in sample params."
                 )
@@ -103,14 +103,19 @@ class GenerateReqInput:
         if parallel_sample_num != 1:
             # parallel sampling +1 represents the original prefill stage
             num = parallel_sample_num + 1
-            if isinstance(self.text,
-
+            if isinstance(self.text, list):
+                # suppot batch operation
                 self.batch_size = len(self.text)
                 num = num * len(self.text)
+            elif isinstance(self.input_ids, list) and isinstance(
+                self.input_ids[0], list
+            ):
+                self.batch_size = len(self.input_ids)
+                num = num * len(self.input_ids)
             else:
                 self.batch_size = 1
         else:
-
+            # support select operation
             num = len(self.text) if self.text is not None else len(self.input_ids)
             self.batch_size = num
 
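The second hunk extends parallel sampling to batched inputs: each prompt gets one extra prefill request on top of its n sampled generations, whether the batch arrives as a list of strings or as a list of token-id lists. A minimal sketch of that bookkeeping, with illustrative prompts and n that are not taken from the diff:

# Sketch of the request-count bookkeeping for batched parallel sampling.
# The prompts and parallel_sample_num below are illustrative, not from the diff.
prompts = ["a", "b", "c"]          # batch of 3 prompts
parallel_sample_num = 2            # sampling_params["n"]

if parallel_sample_num != 1:
    num = parallel_sample_num + 1  # +1 for the shared prefill request per prompt
    batch_size = len(prompts)
    num = num * len(prompts)       # 3 prompts * (2 samples + 1 prefill) = 9 requests
else:
    num = len(prompts)
    batch_size = num

print(batch_size, num)             # 3 9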
sglang/srt/managers/schedule_batch.py
CHANGED
@@ -29,7 +29,7 @@ from sglang.global_config import global_config
 from sglang.srt.constrained import RegexGuide
 from sglang.srt.constrained.jump_forward import JumpForwardMap
 from sglang.srt.mem_cache.chunk_cache import ChunkCache
-from sglang.srt.mem_cache.memory_pool import
+from sglang.srt.mem_cache.memory_pool import BaseTokenToKVPool, ReqToTokenPool
 from sglang.srt.mem_cache.radix_cache import RadixCache
 
 INIT_INCREMENTAL_DETOKENIZATION_OFFSET = 5
@@ -39,6 +39,7 @@ global_server_args_dict = {
     "disable_flashinfer": False,
     "disable_flashinfer_sampling": False,
     "attention_reduce_in_fp32": False,
+    "enable_mla": False,
 }
 
 
@@ -289,7 +290,7 @@ class Batch:
     # Request, memory pool, and cache
     reqs: List[Req]
     req_to_token_pool: ReqToTokenPool
-    token_to_kv_pool:
+    token_to_kv_pool: BaseTokenToKVPool
     tree_cache: RadixCache
 
     # Batched arguments to model runner
@@ -380,13 +381,15 @@ class Batch:
         extend_num_tokens = seq_lens.sum() - prefix_lens.sum()
         out_cache_loc = self.token_to_kv_pool.alloc(extend_num_tokens)
         if out_cache_loc is None:
-            self.tree_cache
-
+            if self.tree_cache is not None:
+                self.tree_cache.evict(extend_num_tokens, self.token_to_kv_pool.free)
+                out_cache_loc = self.token_to_kv_pool.alloc(extend_num_tokens)
 
             if out_cache_loc is None:
-                logger.error("Prefill out of memory.
-                self.tree_cache
-
+                logger.error("Prefill out of memory. Try to lower your batch size.")
+                if self.tree_cache is not None:
+                    self.tree_cache.pretty_print()
+                exit(1)
 
         pt = 0
         for i in range(bs):
@@ -637,9 +640,10 @@ class Batch:
         self.out_cache_loc = self.token_to_kv_pool.alloc(bs)
 
         if self.out_cache_loc is None:
-            logger.error("Decode out of memory.
-            self.tree_cache
-
+            logger.error("Decode out of memory. Try to lower your batch size.")
+            if self.tree_cache is not None:
+                self.tree_cache.pretty_print()
+            exit(1)
 
         self.req_to_token_pool.req_to_token[
             self.req_pool_indices, self.seq_lens - 1
@@ -777,7 +781,7 @@ class InputMetadata:
     seq_lens: torch.Tensor
     positions: torch.Tensor
     req_to_token_pool: ReqToTokenPool
-    token_to_kv_pool:
+    token_to_kv_pool: BaseTokenToKVPool
 
     # For extend
     extend_seq_lens: torch.Tensor
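The two out-of-memory hunks in Batch follow the same evict-then-retry pattern: allocate from the token-to-KV pool, evict entries from the radix tree cache if the pool is exhausted, retry once, and abort with a diagnostic if that still fails. A standalone sketch of the pattern; Pool and Cache here are simplified stand-ins, not the sglang classes:

# Standalone sketch of the evict-then-retry allocation pattern used in Batch.
# Pool and Cache are simplified stand-ins for token_to_kv_pool / tree_cache.
class Pool:
    def __init__(self, size):
        self.free_slots = list(range(size))

    def alloc(self, n):
        if len(self.free_slots) < n:
            return None
        out, self.free_slots = self.free_slots[:n], self.free_slots[n:]
        return out

    def free(self, slots):
        self.free_slots.extend(slots)


class Cache:
    def __init__(self, cached_slots):
        self.cached_slots = cached_slots

    def evict(self, n, free_fn):
        free_fn(self.cached_slots[:n])
        del self.cached_slots[:n]


def alloc_with_eviction(pool, cache, n):
    out = pool.alloc(n)
    if out is None and cache is not None:
        cache.evict(n, pool.free)      # give cached slots back to the pool
        out = pool.alloc(n)            # retry once after eviction
    if out is None:
        raise MemoryError("out of KV cache memory; lower the batch size")
    return out


pool, cache = Pool(4), Cache(cached_slots=[100, 101, 102, 103])
pool.alloc(3)                               # leaves one free slot
print(alloc_with_eviction(pool, cache, 3))  # eviction frees enough to satisfy the retry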
sglang/srt/managers/tokenizer_manager.py
CHANGED
@@ -153,8 +153,9 @@ class TokenizerManager:
     async def _handle_single_request(
         self, obj, request, index=None, is_cache_for_prefill=False
     ):
-        if not is_cache_for_prefill:
-            not_use_index =
+        if not is_cache_for_prefill:  # The normal case with a single prompt
+            not_use_index = index is None
+
             rid = obj.rid if not_use_index else obj.rid[index]
             input_text = obj.text if not_use_index else obj.text[index]
             input_ids = (
@@ -182,14 +183,27 @@ class TokenizerManager:
             top_logprobs_num = (
                 obj.top_logprobs_num if not_use_index else obj.top_logprobs_num[index]
             )
-        else:
-            if
-
-
+        else:  # A prefill request to cache the common prompt for parallel sampling
+            if obj.text is not None:
+                if isinstance(obj.text, list):
+                    input_text = obj.text[index]
+                    rid = obj.rid[index]
+                else:
+                    input_text = obj.text
+                    rid = obj.rid[0]
+                input_ids = self.tokenizer.encode(input_text)
             else:
-                input_text =
-
-
+                input_text = None
+                if isinstance(obj.input_ids, list) and isinstance(
+                    obj.input_ids[0], list
+                ):
+                    # when obj["input_ids"] is List[List[int]]
+                    input_ids = obj.input_ids[index]
+                    rid = obj.rid[index]
+                else:
+                    input_ids = obj.input_ids
+                    rid = obj.rid[0]
+
             sampling_params = SamplingParams(**obj.sampling_params[0])
             sampling_params.max_new_tokens = 0
             pixel_values, image_hash, image_size = await self._get_pixel_values(
@@ -240,11 +254,11 @@ class TokenizerManager:
         ):
             if input_id_result is not None:
                 input_id_result.append(input_id)
-
-        if len(input_id_result) > 1 and input_id_result is not None:
+        if input_id_result is not None and len(input_id_result) > 1:
             obj.input_ids = input_id_result
         elif input_id_result is not None:
             obj.input_ids = input_id_result[0]
+
         # First send out all requests
         for i in range(batch_size):
             for j in range(parallel_sample_num):
@@ -264,11 +278,12 @@ class TokenizerManager:
                         input_text = None
                         input_ids = obj.input_ids[i]
                 else:
+                    assert obj.input_ids is not None
                     if batch_size == 1:
-                        input_text =
+                        input_text = None
                         input_ids = obj.input_ids
                     else:
-                        input_text =
+                        input_text = None
                         input_ids = obj.input_ids[i]
                 sampling_params = self._get_sampling_params(obj.sampling_params[index])
                 pixel_values, image_hash, image_size = await self._get_pixel_values(
sglang/srt/mem_cache/memory_pool.py
CHANGED
@@ -57,32 +57,18 @@ class ReqToTokenPool:
         self.can_use_mem_size = len(self.mem_state)
 
 
-class
+class BaseTokenToKVPool:
     """A memory pool that maps a token to its kv cache locations"""
 
     def __init__(
         self,
         size: int,
-        dtype: torch.dtype,
-        head_num: int,
-        head_dim: int,
-        layer_num: int,
     ):
         self.size = size
 
         # We also add one slot. This slot is used for writing dummy output from padded tokens.
         self.mem_state = torch.ones((self.size + 1,), dtype=torch.bool, device="cuda")
 
-        # [size, head_num, head_dim] for each layer
-        self.k_buffer = [
-            torch.empty((size + 1, head_num, head_dim), dtype=dtype, device="cuda")
-            for _ in range(layer_num)
-        ]
-        self.v_buffer = [
-            torch.empty((size + 1, head_num, head_dim), dtype=dtype, device="cuda")
-            for _ in range(layer_num)
-        ]
-
         # Prefetch buffer
         self.prefetch_buffer = torch.empty(0, device="cuda", dtype=torch.int32)
         self.prefetch_chunk_size = 512
@@ -90,15 +76,6 @@ class TokenToKVPool:
         self.can_use_mem_size = self.size
         self.clear()
 
-    def get_key_buffer(self, layer_id: int):
-        return self.k_buffer[layer_id]
-
-    def get_value_buffer(self, layer_id: int):
-        return self.v_buffer[layer_id]
-
-    def get_kv_buffer(self, layer_id: int):
-        return self.k_buffer[layer_id], self.v_buffer[layer_id]
-
     def available_size(self):
         return self.can_use_mem_size + len(self.prefetch_buffer)
 
@@ -139,3 +116,67 @@ class TokenToKVPool:
 
         # We also add one slot. This slot is used for writing dummy output from padded tokens.
         self.mem_state[0] = False
+
+
+class MHATokenToKVPool(BaseTokenToKVPool):
+
+    def __init__(
+        self,
+        size: int,
+        dtype: torch.dtype,
+        head_num: int,
+        head_dim: int,
+        layer_num: int,
+    ):
+        super().__init__(size)
+
+        # [size, head_num, head_dim] for each layer
+        self.k_buffer = [
+            torch.empty((size + 1, head_num, head_dim), dtype=dtype, device="cuda")
+            for _ in range(layer_num)
+        ]
+        self.v_buffer = [
+            torch.empty((size + 1, head_num, head_dim), dtype=dtype, device="cuda")
+            for _ in range(layer_num)
+        ]
+
+    def get_key_buffer(self, layer_id: int):
+        return self.k_buffer[layer_id]
+
+    def get_value_buffer(self, layer_id: int):
+        return self.v_buffer[layer_id]
+
+    def get_kv_buffer(self, layer_id: int):
+        return self.k_buffer[layer_id], self.v_buffer[layer_id]
+
+
+class MLATokenToKVPool(BaseTokenToKVPool):
+
+    def __init__(
+        self,
+        size: int,
+        dtype: torch.dtype,
+        kv_lora_rank: int,
+        qk_rope_head_dim: int,
+        layer_num: int,
+    ):
+        super().__init__(size)
+
+        self.kv_lora_rank = kv_lora_rank
+        self.kv_buffer = [
+            torch.empty(
+                (size + 1, 1, kv_lora_rank + qk_rope_head_dim),
+                dtype=dtype,
+                device="cuda",
+            )
+            for _ in range(layer_num)
+        ]
+
+    def get_key_buffer(self, layer_id: int):
+        return self.kv_buffer[layer_id]
+
+    def get_value_buffer(self, layer_id: int):
+        return self.kv_buffer[layer_id][..., : self.kv_lora_rank]
+
+    def get_kv_buffer(self, layer_id: int):
+        return self.get_key_buffer(layer_id), self.get_value_buffer(layer_id)
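The MLA pool stores a single compressed vector of width kv_lora_rank + qk_rope_head_dim per token and layer, and get_value_buffer returns a slice of the same storage rather than a second buffer. A CPU-only sketch of that layout with small illustrative dimensions (the real pool allocates on CUDA):

# CPU sketch of the MLA pool layout: one [slots, 1, kv_lora_rank + rope_dim] buffer
# per layer, with the "value" view being the first kv_lora_rank channels of it.
# Shapes below are small illustrative values, not the real pool sizes.
import torch

size, kv_lora_rank, qk_rope_head_dim, layer_num = 8, 4, 2, 3
kv_buffer = [
    torch.zeros((size + 1, 1, kv_lora_rank + qk_rope_head_dim))
    for _ in range(layer_num)
]

key_view = kv_buffer[0]                        # full compressed KV + rope part
value_view = kv_buffer[0][..., :kv_lora_rank]  # first kv_lora_rank channels only

value_view[3] = 1.0                            # writing through the value view ...
print(key_view[3, 0])                          # ... shows up in the key buffer (shared storage)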
sglang/srt/model_config.py
CHANGED
@@ -13,6 +13,7 @@ See the License for the specific language governing permissions and
 limitations under the License.
 """
 
+from enum import IntEnum, auto
 from typing import Optional
 
 from transformers import PretrainedConfig
@@ -20,6 +21,11 @@ from transformers import PretrainedConfig
 from sglang.srt.hf_transformers_utils import get_config, get_context_length
 
 
+class AttentionArch(IntEnum):
+    MLA = auto()
+    MHA = auto()
+
+
 class ModelConfig:
     def __init__(
         self,
@@ -55,6 +61,11 @@ class ModelConfig:
         # FIXME: temporary special judge for deepseek v2 MLA architecture
         if "DeepseekV2ForCausalLM" in self.hf_config.architectures:
             self.head_dim = 256
+            self.attention_arch = AttentionArch.MLA
+            self.kv_lora_rank = self.hf_config.kv_lora_rank
+            self.qk_rope_head_dim = self.hf_config.qk_rope_head_dim
+        else:
+            self.attention_arch = AttentionArch.MHA
 
         self.num_attention_heads = self.hf_config.num_attention_heads
         self.num_key_value_heads = getattr(self.hf_config, "num_key_value_heads", None)
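The ModelConfig hunk only records which attention flavor the checkpoint uses and copies the two MLA dimensions out of the HF config. A minimal standalone sketch of that detection, where SimpleNamespace and the numeric values stand in for a real PretrainedConfig:

# Sketch of the architecture detection added above; the config object and its
# values are illustrative stand-ins, not a real HF PretrainedConfig.
from enum import IntEnum, auto
from types import SimpleNamespace


class AttentionArch(IntEnum):
    MLA = auto()
    MHA = auto()


def detect_attention_arch(hf_config):
    if "DeepseekV2ForCausalLM" in hf_config.architectures:
        return AttentionArch.MLA, hf_config.kv_lora_rank, hf_config.qk_rope_head_dim
    return AttentionArch.MHA, None, None


cfg = SimpleNamespace(
    architectures=["DeepseekV2ForCausalLM"], kv_lora_rank=512, qk_rope_head_dim=64
)
print(detect_attention_arch(cfg))  # (<AttentionArch.MLA: 1>, 512, 64)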
sglang/srt/model_executor/model_runner.py
CHANGED
@@ -47,7 +47,12 @@ from sglang.srt.managers.schedule_batch import (
     InputMetadata,
     global_server_args_dict,
 )
-from sglang.srt.mem_cache.memory_pool import
+from sglang.srt.mem_cache.memory_pool import (
+    MHATokenToKVPool,
+    MLATokenToKVPool,
+    ReqToTokenPool,
+)
+from sglang.srt.model_config import AttentionArch
 from sglang.srt.server_args import ServerArgs
 from sglang.srt.utils import (
     get_available_gpu_memory,
@@ -86,6 +91,7 @@ class ModelRunner:
                 "disable_flashinfer": server_args.disable_flashinfer,
                 "disable_flashinfer_sampling": server_args.disable_flashinfer_sampling,
                 "attention_reduce_in_fp32": server_args.attention_reduce_in_fp32,
+                "enable_mla": server_args.enable_mla,
             }
         )
 
@@ -193,15 +199,23 @@ class ModelRunner:
         available_gpu_memory = get_available_gpu_memory(
             self.gpu_id, distributed=self.tp_size > 1
         )
-
-
-
-
-
-
-
-
-
+        if (
+            self.model_config.attention_arch == AttentionArch.MLA
+            and self.server_args.enable_mla
+        ):
+            cell_size = (
+                (self.model_config.kv_lora_rank + self.model_config.qk_rope_head_dim)
+                * self.model_config.num_hidden_layers
+                * torch._utils._element_size(self.dtype)
+            )
+        else:
+            cell_size = (
+                self.model_config.get_num_kv_heads(self.tp_size)
+                * self.model_config.head_dim
+                * self.model_config.num_hidden_layers
+                * 2
+                * torch._utils._element_size(self.dtype)
+            )
         rest_memory = available_gpu_memory - total_gpu_memory * (
             1 - self.mem_fraction_static
         )
@@ -241,13 +255,28 @@ class ModelRunner:
             max_num_reqs,
             self.model_config.context_len + 8,
         )
-
-        self.
-
-
-
-
-
+        if (
+            self.model_config.attention_arch == AttentionArch.MLA
+            and self.server_args.enable_mla
+        ):
+            self.token_to_kv_pool = MLATokenToKVPool(
+                self.max_total_num_tokens,
+                dtype=self.dtype,
+                kv_lora_rank=self.model_config.kv_lora_rank,
+                qk_rope_head_dim=self.model_config.qk_rope_head_dim,
+                layer_num=self.model_config.num_hidden_layers,
+            )
+            logger.info("using MLA Triton implementaion, flashinfer is disabled")
+            # FIXME: temporarily only Triton MLA is supported
+            self.server_args.disable_flashinfer = True
+        else:
+            self.token_to_kv_pool = MHATokenToKVPool(
+                self.max_total_num_tokens,
+                dtype=self.dtype,
+                head_num=self.model_config.get_num_kv_heads(self.tp_size),
+                head_dim=self.model_config.head_dim,
+                layer_num=self.model_config.num_hidden_layers,
+            )
         logger.info(
             f"[gpu={self.gpu_id}] Memory pool end. "
             f"avail mem={get_available_gpu_memory(self.gpu_id):.2f} GB"
@@ -312,10 +341,12 @@ class ModelRunner:
             self.cuda_graph_runner.capture(batch_size_list)
         except RuntimeError as e:
             raise Exception(
-                f"Capture cuda graph failed: {e}
-
-
-
+                f"Capture cuda graph failed: {e}\n"
+                "Possible solutions:\n"
+                "1. disable torch compile by not using --enable-torch-compile\n"
+                "2. disable cuda graph by --disable-cuda-graph\n"
+                "3. set --mem-fraction-static to a smaller value\n"
+                "Open an issue on GitHub https://github.com/sgl-project/sglang/issues/new/choose \n"
             )
 
     @torch.inference_mode()
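The cell_size branch above is what drives the KV-pool capacity estimate: the MLA path prices one compressed vector per token and layer, while the MHA path prices separate K and V tensors per KV head. A back-of-the-envelope comparison; all model numbers below are illustrative DeepSeek-V2-like assumptions, not values read from the diff:

# Rough per-token KV cache cell sizes behind the branch above.
# All configuration numbers are illustrative assumptions.
elem_size = 2                                        # bytes per element for bf16/fp16
layer_num = 60

# MHA path: kv_heads * head_dim * layers * 2 (K and V) * element size
mha_cell = 128 * 256 * layer_num * 2 * elem_size     # ~7.5 MiB per token

# MLA path: (kv_lora_rank + qk_rope_head_dim) * layers * element size
mla_cell = (512 + 64) * layer_num * elem_size        # ~67.5 KiB per token

free_gib = 60                                        # hypothetical memory left for the KV pool
for name, cell in [("MHA", mha_cell), ("MLA", mla_cell)]:
    print(name, cell, "bytes/token ->", int(free_gib * 2**30 // cell), "tokens")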
sglang/srt/models/deepseek_v2.py
CHANGED
@@ -45,6 +45,7 @@ from vllm.model_executor.model_loader.weight_utils import default_weight_loader
 
 from sglang.srt.layers.logits_processor import LogitsProcessor
 from sglang.srt.layers.radix_attention import RadixAttention
+from sglang.srt.managers.schedule_batch import global_server_args_dict
 from sglang.srt.model_executor.model_runner import InputMetadata
 
 
@@ -312,6 +313,165 @@ class DeepseekV2Attention(nn.Module):
         return output
 
 
+class DeepseekV2AttentionMLA(nn.Module):
+
+    def __init__(
+        self,
+        config: PretrainedConfig,
+        hidden_size: int,
+        num_heads: int,
+        qk_nope_head_dim: int,
+        qk_rope_head_dim: int,
+        v_head_dim: int,
+        q_lora_rank: int,
+        kv_lora_rank: int,
+        rope_theta: float = 10000,
+        rope_scaling: Optional[Dict[str, Any]] = None,
+        max_position_embeddings: int = 8192,
+        cache_config: Optional[CacheConfig] = None,
+        quant_config: Optional[QuantizationConfig] = None,
+        layer_id=None,
+    ) -> None:
+        super().__init__()
+        self.layer_id = layer_id
+        self.hidden_size = hidden_size
+        self.qk_nope_head_dim = qk_nope_head_dim
+        self.qk_rope_head_dim = qk_rope_head_dim
+        self.qk_head_dim = qk_nope_head_dim + qk_rope_head_dim
+        self.v_head_dim = v_head_dim
+        self.q_lora_rank = q_lora_rank
+        self.kv_lora_rank = kv_lora_rank
+        self.num_heads = num_heads
+        tp_size = get_tensor_model_parallel_world_size()
+        assert num_heads % tp_size == 0
+        self.num_local_heads = num_heads // tp_size
+        self.scaling = self.qk_head_dim**-0.5
+        self.rope_theta = rope_theta
+        self.max_position_embeddings = max_position_embeddings
+
+        if self.q_lora_rank is not None:
+            self.q_a_proj = ReplicatedLinear(
+                self.hidden_size,
+                self.q_lora_rank,
+                bias=False,
+                quant_config=quant_config,
+            )
+            self.q_a_layernorm = RMSNorm(self.q_lora_rank, eps=config.rms_norm_eps)
+            self.q_b_proj = ColumnParallelLinear(
+                q_lora_rank,
+                self.num_heads * self.qk_head_dim,
+                bias=False,
+                quant_config=quant_config,
+            )
+        else:
+            self.q_proj = ColumnParallelLinear(
+                self.hidden_size,
+                self.num_heads * self.qk_head_dim,
+                bias=False,
+                quant_config=quant_config,
+            )
+
+        self.kv_a_proj_with_mqa = ReplicatedLinear(
+            self.hidden_size,
+            self.kv_lora_rank + self.qk_rope_head_dim,
+            bias=False,
+            quant_config=quant_config,
+        )
+        self.kv_a_layernorm = RMSNorm(self.kv_lora_rank, eps=config.rms_norm_eps)
+        self.kv_b_proj = ColumnParallelLinear(
+            self.kv_lora_rank,
+            self.num_heads * (self.qk_nope_head_dim + self.v_head_dim),
+            bias=False,
+            quant_config=quant_config,
+        )
+        # O projection.
+        self.o_proj = RowParallelLinear(
+            self.num_heads * self.v_head_dim,
+            self.hidden_size,
+            bias=False,
+            quant_config=quant_config,
+        )
+        rope_scaling["type"] = "deepseek_yarn"
+        self.rotary_emb = get_rope(
+            qk_rope_head_dim,
+            rotary_dim=qk_rope_head_dim,
+            max_position=max_position_embeddings,
+            base=rope_theta,
+            rope_scaling=rope_scaling,
+            is_neox_style=False,
+        )
+
+        if rope_scaling:
+            mscale_all_dim = rope_scaling.get("mscale_all_dim", False)
+            scaling_factor = rope_scaling["factor"]
+            mscale = yarn_get_mscale(scaling_factor, float(mscale_all_dim))
+            self.scaling = self.scaling * mscale * mscale
+
+        self.attn = RadixAttention(
+            self.num_local_heads,
+            self.kv_lora_rank + self.qk_rope_head_dim,
+            self.scaling,
+            num_kv_heads=1,
+            layer_id=layer_id,
+            v_head_dim=self.kv_lora_rank,
+        )
+
+        kv_b_proj = self.kv_b_proj
+        w_kc, w_vc = kv_b_proj.weight.unflatten(
+            0, (-1, qk_nope_head_dim + v_head_dim)
+        ).split([qk_nope_head_dim, v_head_dim], dim=1)
+        self.w_kc = w_kc
+        self.w_vc = w_vc
+
+    def forward(
+        self,
+        positions: torch.Tensor,
+        hidden_states: torch.Tensor,
+        input_metadata: InputMetadata,
+    ) -> torch.Tensor:
+        q_len = hidden_states.shape[0]
+        q_input = hidden_states.new_empty(
+            q_len, self.num_local_heads, self.kv_lora_rank + self.qk_rope_head_dim
+        )
+        if self.q_lora_rank is not None:
+            q = self.q_a_proj(hidden_states)[0]
+            q = self.q_a_layernorm(q)
+            q = self.q_b_proj(q)[0].view(-1, self.num_local_heads, self.qk_head_dim)
+        else:
+            q = self.q_proj(hidden_states)[0].view(
+                -1, self.num_local_heads, self.qk_head_dim
+            )
+        q_nope, q_pe = q.split([self.qk_nope_head_dim, self.qk_rope_head_dim], dim=-1)
+        q_nope_out = q_input[..., : self.kv_lora_rank]
+        torch.bmm(q_nope.transpose(0, 1), self.w_kc, out=q_nope_out.transpose(0, 1))
+
+        k_input = self.kv_a_proj_with_mqa(hidden_states)[0].unsqueeze(1)
+        k_pe = k_input[..., self.kv_lora_rank :]
+        v_input = k_input[..., : self.kv_lora_rank]
+        v_input = self.kv_a_layernorm(v_input.contiguous())
+        k_input[..., : self.kv_lora_rank] = v_input
+
+        q_pe, k_pe = self.rotary_emb(positions, q_pe, k_pe)
+        q_input[..., self.kv_lora_rank :] = q_pe
+        k_input[..., self.kv_lora_rank :] = k_pe
+
+        attn_output = self.attn(q_input, k_input, v_input, input_metadata)
+        attn_output = attn_output.view(-1, self.num_local_heads, self.kv_lora_rank)
+        attn_bmm_output = attn_output.new_empty(
+            q_len, self.num_local_heads, self.v_head_dim
+        )
+        torch.bmm(
+            attn_output.transpose(0, 1),
+            self.w_vc.transpose(1, 2).contiguous(),
+            out=attn_bmm_output.transpose(0, 1),
+        )
+
+        attn_output = attn_bmm_output.flatten(1, 2)
+        output, _ = self.o_proj(attn_output)
+
+        return output
+
+
 class DeepseekV2DecoderLayer(nn.Module):
 
     def __init__(
@@ -326,22 +486,44 @@ class DeepseekV2DecoderLayer(nn.Module):
         rope_theta = getattr(config, "rope_theta", 10000)
         rope_scaling = getattr(config, "rope_scaling", None)
         max_position_embeddings = getattr(config, "max_position_embeddings", 8192)
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
+        if global_server_args_dict["enable_mla"]:
+            self.self_attn = DeepseekV2AttentionMLA(
+                config=config,
+                hidden_size=self.hidden_size,
+                num_heads=config.num_attention_heads,
+                qk_nope_head_dim=config.qk_nope_head_dim,
+                qk_rope_head_dim=config.qk_rope_head_dim,
+                v_head_dim=config.v_head_dim,
+                q_lora_rank=(
+                    config.q_lora_rank if hasattr(config, "q_lora_rank") else None
+                ),
+                kv_lora_rank=config.kv_lora_rank,
+                rope_theta=rope_theta,
+                rope_scaling=rope_scaling,
+                max_position_embeddings=max_position_embeddings,
+                cache_config=cache_config,
+                quant_config=quant_config,
+                layer_id=layer_id,
+            )
+        else:
+            self.self_attn = DeepseekV2Attention(
+                config=config,
+                hidden_size=self.hidden_size,
+                num_heads=config.num_attention_heads,
+                qk_nope_head_dim=config.qk_nope_head_dim,
+                qk_rope_head_dim=config.qk_rope_head_dim,
+                v_head_dim=config.v_head_dim,
+                q_lora_rank=(
+                    config.q_lora_rank if hasattr(config, "q_lora_rank") else None
+                ),
+                kv_lora_rank=config.kv_lora_rank,
+                rope_theta=rope_theta,
+                rope_scaling=rope_scaling,
+                max_position_embeddings=max_position_embeddings,
+                cache_config=cache_config,
+                quant_config=quant_config,
+                layer_id=layer_id,
+            )
         if (
             config.n_routed_experts is not None
             and layer_id >= config.first_k_dense_replace