sglang 0.4.4.post4__py3-none-any.whl → 0.4.5.post1__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (134)
  1. sglang/bench_one_batch.py +21 -0
  2. sglang/bench_serving.py +10 -4
  3. sglang/lang/chat_template.py +24 -0
  4. sglang/srt/configs/model_config.py +40 -4
  5. sglang/srt/constrained/base_grammar_backend.py +26 -5
  6. sglang/srt/constrained/llguidance_backend.py +1 -0
  7. sglang/srt/constrained/outlines_backend.py +1 -0
  8. sglang/srt/constrained/reasoner_grammar_backend.py +101 -0
  9. sglang/srt/constrained/xgrammar_backend.py +1 -0
  10. sglang/srt/conversation.py +29 -4
  11. sglang/srt/disaggregation/base/__init__.py +8 -0
  12. sglang/srt/disaggregation/base/conn.py +113 -0
  13. sglang/srt/disaggregation/decode.py +18 -5
  14. sglang/srt/disaggregation/mini_lb.py +53 -122
  15. sglang/srt/disaggregation/mooncake/__init__.py +6 -0
  16. sglang/srt/disaggregation/mooncake/conn.py +615 -0
  17. sglang/srt/disaggregation/mooncake/transfer_engine.py +108 -0
  18. sglang/srt/disaggregation/prefill.py +43 -19
  19. sglang/srt/disaggregation/utils.py +31 -0
  20. sglang/srt/entrypoints/EngineBase.py +53 -0
  21. sglang/srt/entrypoints/engine.py +36 -8
  22. sglang/srt/entrypoints/http_server.py +37 -8
  23. sglang/srt/entrypoints/http_server_engine.py +142 -0
  24. sglang/srt/entrypoints/verl_engine.py +37 -10
  25. sglang/srt/hf_transformers_utils.py +4 -0
  26. sglang/srt/layers/attention/flashattention_backend.py +609 -202
  27. sglang/srt/layers/attention/flashinfer_backend.py +13 -7
  28. sglang/srt/layers/attention/vision.py +1 -1
  29. sglang/srt/layers/dp_attention.py +2 -4
  30. sglang/srt/layers/elementwise.py +15 -2
  31. sglang/srt/layers/linear.py +1 -0
  32. sglang/srt/layers/moe/ep_moe/token_dispatcher.py +145 -118
  33. sglang/srt/layers/moe/fused_moe_native.py +5 -0
  34. sglang/srt/layers/moe/fused_moe_triton/configs/E=128,N=512,device_name=NVIDIA_H100_80GB_HBM3.json +146 -0
  35. sglang/srt/layers/moe/fused_moe_triton/configs/E=144,N=512,device_name=NVIDIA_H100_80GB_HBM3.json +146 -0
  36. sglang/srt/layers/moe/fused_moe_triton/configs/E=16,N=1024,device_name=NVIDIA_H100_80GB_HBM3.json +146 -0
  37. sglang/srt/layers/moe/fused_moe_triton/configs/E=16,N=1024,device_name=NVIDIA_H200.json +146 -0
  38. sglang/srt/layers/moe/fused_moe_triton/configs/E=16,N=2048,device_name=NVIDIA_H100_80GB_HBM3.json +146 -0
  39. sglang/srt/layers/moe/fused_moe_triton/configs/E=20,N=2048,device_name=NVIDIA_H100_80GB_HBM3.json +146 -0
  40. sglang/srt/layers/moe/fused_moe_triton/configs/E=24,N=1024,device_name=NVIDIA_H100_80GB_HBM3.json +146 -0
  41. sglang/srt/layers/moe/fused_moe_triton/configs/E=256,N=128,device_name=NVIDIA_H20,dtype=fp8_w8a8,block_shape=[128, 128].json +146 -0
  42. sglang/srt/layers/moe/fused_moe_triton/configs/E=264,N=128,device_name=NVIDIA_A800-SXM4-80GB,dtype=int8_w8a8.json +146 -0
  43. sglang/srt/layers/moe/fused_moe_triton/configs/{E=257,N=256,device_name=NVIDIA_H200,dtype=fp8_w8a8,block_shape=[128, 128].json → E=264,N=256,device_name=NVIDIA_H20,dtype=fp8_w8a8,block_shape=[128, 128].json } +34 -34
  44. sglang/srt/layers/moe/fused_moe_triton/configs/E=272,N=128,device_name=NVIDIA_A800-SXM4-80GB,dtype=int8_w8a8.json +146 -0
  45. sglang/srt/layers/moe/fused_moe_triton/configs/E=272,N=64,device_name=NVIDIA_A800-SXM4-80GB.json +146 -0
  46. sglang/srt/layers/moe/fused_moe_triton/configs/E=288,N=64,device_name=NVIDIA_A800-SXM4-80GB.json +146 -0
  47. sglang/srt/layers/moe/fused_moe_triton/fused_moe.py +51 -24
  48. sglang/srt/layers/moe/fused_moe_triton/layer.py +7 -0
  49. sglang/srt/layers/moe/router.py +7 -1
  50. sglang/srt/layers/moe/topk.py +37 -16
  51. sglang/srt/layers/quantization/__init__.py +13 -5
  52. sglang/srt/layers/quantization/blockwise_int8.py +2 -0
  53. sglang/srt/layers/quantization/compressed_tensors/compressed_tensors.py +4 -0
  54. sglang/srt/layers/quantization/compressed_tensors/compressed_tensors_moe.py +68 -45
  55. sglang/srt/layers/quantization/fp8.py +28 -14
  56. sglang/srt/layers/quantization/fp8_kernel.py +130 -4
  57. sglang/srt/layers/quantization/fp8_utils.py +34 -6
  58. sglang/srt/layers/quantization/kv_cache.py +43 -52
  59. sglang/srt/layers/quantization/modelopt_quant.py +271 -4
  60. sglang/srt/layers/quantization/moe_wna16.py +2 -0
  61. sglang/srt/layers/quantization/w8a8_fp8.py +154 -4
  62. sglang/srt/layers/quantization/w8a8_int8.py +3 -0
  63. sglang/srt/layers/radix_attention.py +14 -0
  64. sglang/srt/layers/rotary_embedding.py +75 -1
  65. sglang/srt/managers/io_struct.py +254 -97
  66. sglang/srt/managers/mm_utils.py +3 -2
  67. sglang/srt/managers/multimodal_processors/base_processor.py +114 -77
  68. sglang/srt/managers/multimodal_processors/janus_pro.py +3 -1
  69. sglang/srt/managers/multimodal_processors/mllama4.py +146 -0
  70. sglang/srt/managers/schedule_batch.py +62 -21
  71. sglang/srt/managers/scheduler.py +71 -14
  72. sglang/srt/managers/tokenizer_manager.py +17 -3
  73. sglang/srt/managers/tp_worker.py +1 -0
  74. sglang/srt/mem_cache/memory_pool.py +14 -1
  75. sglang/srt/metrics/collector.py +9 -0
  76. sglang/srt/model_executor/cuda_graph_runner.py +7 -4
  77. sglang/srt/model_executor/forward_batch_info.py +234 -15
  78. sglang/srt/model_executor/model_runner.py +49 -9
  79. sglang/srt/model_loader/loader.py +31 -4
  80. sglang/srt/model_loader/weight_utils.py +4 -2
  81. sglang/srt/models/baichuan.py +2 -0
  82. sglang/srt/models/chatglm.py +1 -0
  83. sglang/srt/models/commandr.py +1 -0
  84. sglang/srt/models/dbrx.py +1 -0
  85. sglang/srt/models/deepseek.py +1 -0
  86. sglang/srt/models/deepseek_v2.py +248 -61
  87. sglang/srt/models/exaone.py +1 -0
  88. sglang/srt/models/gemma.py +1 -0
  89. sglang/srt/models/gemma2.py +1 -0
  90. sglang/srt/models/gemma3_causal.py +1 -0
  91. sglang/srt/models/gpt2.py +1 -0
  92. sglang/srt/models/gpt_bigcode.py +1 -0
  93. sglang/srt/models/granite.py +1 -0
  94. sglang/srt/models/grok.py +1 -0
  95. sglang/srt/models/internlm2.py +1 -0
  96. sglang/srt/models/llama.py +13 -4
  97. sglang/srt/models/llama4.py +487 -0
  98. sglang/srt/models/minicpm.py +1 -0
  99. sglang/srt/models/minicpm3.py +2 -0
  100. sglang/srt/models/mixtral.py +1 -0
  101. sglang/srt/models/mixtral_quant.py +1 -0
  102. sglang/srt/models/mllama.py +51 -8
  103. sglang/srt/models/mllama4.py +227 -0
  104. sglang/srt/models/olmo.py +1 -0
  105. sglang/srt/models/olmo2.py +1 -0
  106. sglang/srt/models/olmoe.py +1 -0
  107. sglang/srt/models/phi3_small.py +1 -0
  108. sglang/srt/models/qwen.py +1 -0
  109. sglang/srt/models/qwen2.py +1 -0
  110. sglang/srt/models/qwen2_5_vl.py +35 -70
  111. sglang/srt/models/qwen2_moe.py +1 -0
  112. sglang/srt/models/qwen2_vl.py +27 -25
  113. sglang/srt/models/stablelm.py +1 -0
  114. sglang/srt/models/xverse.py +1 -0
  115. sglang/srt/models/xverse_moe.py +1 -0
  116. sglang/srt/openai_api/adapter.py +4 -1
  117. sglang/srt/patch_torch.py +11 -0
  118. sglang/srt/server_args.py +34 -0
  119. sglang/srt/speculative/eagle_draft_cuda_graph_runner.py +4 -4
  120. sglang/srt/speculative/eagle_utils.py +1 -11
  121. sglang/srt/speculative/eagle_worker.py +6 -2
  122. sglang/srt/utils.py +120 -9
  123. sglang/test/attention/test_flashattn_backend.py +259 -221
  124. sglang/test/attention/test_flashattn_mla_backend.py +285 -0
  125. sglang/test/attention/test_prefix_chunk_info.py +224 -0
  126. sglang/test/test_block_fp8.py +57 -0
  127. sglang/test/test_utils.py +19 -8
  128. sglang/version.py +1 -1
  129. {sglang-0.4.4.post4.dist-info → sglang-0.4.5.post1.dist-info}/METADATA +14 -4
  130. {sglang-0.4.4.post4.dist-info → sglang-0.4.5.post1.dist-info}/RECORD +133 -109
  131. sglang/srt/disaggregation/conn.py +0 -81
  132. {sglang-0.4.4.post4.dist-info → sglang-0.4.5.post1.dist-info}/WHEEL +0 -0
  133. {sglang-0.4.4.post4.dist-info → sglang-0.4.5.post1.dist-info}/licenses/LICENSE +0 -0
  134. {sglang-0.4.4.post4.dist-info → sglang-0.4.5.post1.dist-info}/top_level.txt +0 -0
sglang/srt/models/deepseek_v2.py CHANGED
@@ -18,6 +18,7 @@
 
 import logging
 import os
+from enum import IntEnum, auto
 from typing import Any, Dict, Iterable, Optional, Tuple
 
 import torch
@@ -50,13 +51,13 @@ from sglang.srt.layers.linear import (
 )
 from sglang.srt.layers.logits_processor import LogitsProcessor
 from sglang.srt.layers.moe.ep_moe.layer import DeepEPMoE, EPMoE
-from sglang.srt.layers.moe.ep_moe.token_dispatcher import DeepEPDispatcher
 from sglang.srt.layers.moe.fused_moe_triton import FusedMoE
 from sglang.srt.layers.moe.topk import select_experts
 from sglang.srt.layers.quantization.base_config import QuantizationConfig
+from sglang.srt.layers.quantization.fp8_kernel import per_tensor_quant_mla_fp8
 from sglang.srt.layers.quantization.fp8_utils import (
     block_quant_to_tensor_quant,
-    input_to_float8,
+    channel_quant_to_tensor_quant,
     normalize_e4m3fn_to_e4m3fnuz,
 )
 from sglang.srt.layers.quantization.int8_utils import (
@@ -78,7 +79,9 @@ _is_hip = is_hip()
 _is_cuda = is_cuda()
 
 if _is_cuda:
-    from sgl_kernel import awq_dequantize, bmm_fp8
+    from sgl_kernel import awq_dequantize, bmm_fp8, merge_state_v2
+
+    from sglang.srt.layers.moe.ep_moe.token_dispatcher import DeepEPDispatcher
 else:
     from vllm import _custom_ops as ops
 
@@ -92,6 +95,19 @@ expert_distribution_recorder = ExpertDistributionRecorder()
 logger = logging.getLogger(__name__)
 
 
+class AttnForwardMethod(IntEnum):
+
+    # Use multi-head attention
+    MHA = auto()
+
+    # Use absorbed multi-latent attention
+    MLA = auto()
+
+    # Use multi-head attention, but with KV cache chunked.
+    # This method can avoid OOM when prefix lengths are long.
+    MHA_CHUNKED_KV = auto()
+
+
 class DeepseekV2MLP(nn.Module):
     def __init__(
         self,
@@ -178,7 +194,6 @@ class DeepseekV2MoE(nn.Module):
             else 0
         )
 
-        self.routed_scaling_factor = config.routed_scaling_factor
         if self.tp_size > config.n_routed_experts:
             raise ValueError(
                 f"Tensor parallel size {self.tp_size} is greater than "
@@ -278,10 +293,7 @@ class DeepseekV2MoE(nn.Module):
             return self.forward_deepep(hidden_states, forward_mode)
 
     def forward_normal(self, hidden_states: torch.Tensor) -> torch.Tensor:
-        if self.n_shared_experts is not None and self.n_share_experts_fusion == 0:
-            shared_output = self.shared_experts(hidden_states)
-        else:
-            shared_output = None
+        shared_output = self._forward_shared_experts(hidden_states)
         # router_logits: (num_tokens, n_experts)
         router_logits = self.gate(hidden_states)
         final_hidden_states = (
@@ -311,8 +323,7 @@ class DeepseekV2MoE(nn.Module):
         ):
             # router_logits: (num_tokens, n_experts)
             router_logits = self.gate(hidden_states)
-            if self.n_shared_experts is not None:
-                shared_output = self.shared_experts(hidden_states)
+            shared_output = self._forward_shared_experts(hidden_states)
             topk_weights, topk_idx = select_experts(
                 hidden_states=hidden_states,
                 router_logits=router_logits,
@@ -324,6 +335,7 @@ class DeepseekV2MoE(nn.Module):
                 correction_bias=self.correction_bias,
             )
         if self.ep_size > 1:
+            # TODO(ch-wan): allow users to set num_max_dispatch_tokens_per_rank value
            (
                hidden_states,
                topk_idx,
@@ -336,19 +348,15 @@ class DeepseekV2MoE(nn.Module):
                 hidden_states,
                 topk_idx,
                 topk_weights,
-                self.num_experts,
                 forward_mode=forward_mode,
             )
-        final_hidden_states = (
-            self.experts(
-                hidden_states=hidden_states,
-                reorder_topk_ids=reorder_topk_ids,
-                seg_indptr=seg_indptr,
-                masked_m=masked_m,
-                expected_m=expected_m,
-                forward_mode=forward_mode,
-            )
-            * self.routed_scaling_factor
+        final_hidden_states = self.experts(
+            hidden_states=hidden_states,
+            reorder_topk_ids=reorder_topk_ids,
+            seg_indptr=seg_indptr,
+            masked_m=masked_m,
+            expected_m=expected_m,
+            forward_mode=forward_mode,
         )
         if self.ep_size > 1:
             final_hidden_states = self.deepep_dispatcher.combine(
@@ -357,11 +365,19 @@ class DeepseekV2MoE(nn.Module):
                 topk_weights,
                 forward_mode,
             )
+        final_hidden_states *= self.routed_scaling_factor
+
         if shared_output is not None:
             final_hidden_states = final_hidden_states + shared_output
 
         return final_hidden_states
 
+    def _forward_shared_experts(self, hidden_states):
+        if self.n_shared_experts is not None and self.n_share_experts_fusion == 0:
+            return self.shared_experts(hidden_states)
+        else:
+            return None
+
 
 def yarn_get_mscale(scale: float = 1, mscale: float = 1) -> float:
     import math
@@ -489,6 +505,7 @@ class DeepseekV2Attention(nn.Module):
             self.scaling,
             num_kv_heads=self.num_local_heads,
             layer_id=layer_id,
+            quant_config=quant_config,
             prefix=add_prefix("attn", prefix),
         )
 
@@ -669,6 +686,7 @@ class DeepseekV2AttentionMLA(nn.Module):
             num_kv_heads=1,
             layer_id=layer_id,
             v_head_dim=self.kv_lora_rank,
+            quant_config=quant_config,
             prefix=add_prefix("attn_mqa", prefix),
         )
 
@@ -679,6 +697,7 @@ class DeepseekV2AttentionMLA(nn.Module):
             num_kv_heads=self.num_local_heads,
             layer_id=layer_id,
             v_head_dim=self.v_head_dim,
+            quant_config=quant_config,
            prefix=add_prefix("attn_mha", prefix),
         )
 
@@ -689,30 +708,54 @@ class DeepseekV2AttentionMLA(nn.Module):
         self.flashinfer_mla_disable_ragged = global_server_args_dict[
             "flashinfer_mla_disable_ragged"
         ]
+        self.disable_chunked_prefix_cache = global_server_args_dict[
+            "disable_chunked_prefix_cache"
+        ]
         self.attention_backend = global_server_args_dict["attention_backend"]
         self.rocm_fused_decode_mla = os.getenv("SGLANG_ROCM_FUSED_DECODE_MLA") == "1"
 
-    def no_absorb(self, forward_batch: ForwardBatch) -> bool:
+        # TODO: Design a finer way to determine the threshold
+        self.chunked_prefix_cache_threshold = 8192
+
+    def dispatch_attn_forward_method(
+        self, forward_batch: ForwardBatch
+    ) -> AttnForwardMethod:
         if self.attention_backend == "flashinfer":
             # Flashinfer MLA: Do not absorb when enabling ragged prefill
-            return (
+            if (
                 not self.flashinfer_mla_disable_ragged
                 and forward_batch.forward_mode.is_extend()
                 and not forward_batch.forward_mode.is_target_verify()
                 and not forward_batch.forward_mode.is_draft_extend()
                 and sum(forward_batch.extend_prefix_lens_cpu) == 0
-            )
+            ):
+                return AttnForwardMethod.MHA
+            else:
+                return AttnForwardMethod.MLA
         elif self.attention_backend == "fa3":
-            # Flash Attention: Keep absorbing for all extend/decode
-            return False
+            # Flash Attention: Use MHA with chunked KV cache when prefilling on long sequences.
+            if (
+                forward_batch.forward_mode.is_extend()
+                and not self.disable_chunked_prefix_cache
+                and not forward_batch.forward_mode.is_target_verify()
+                and not forward_batch.forward_mode.is_draft_extend()
+                and sum(forward_batch.extend_prefix_lens_cpu)
+                >= self.chunked_prefix_cache_threshold
+            ):
+                return AttnForwardMethod.MHA_CHUNKED_KV
+            else:
+                return AttnForwardMethod.MLA
         else:
             # Triton: Use normal computation for prefill and use weight absorption for extend/decode
-            return (
+            if (
                 forward_batch.forward_mode.is_extend()
                 and not forward_batch.forward_mode.is_target_verify()
                 and not forward_batch.forward_mode.is_draft_extend()
                 and sum(forward_batch.extend_prefix_lens_cpu) == 0
-            )
+            ):
+                return AttnForwardMethod.MHA
+            else:
+                return AttnForwardMethod.MLA
 
     def forward(
         self,
726
769
  ), "short-circuiting allreduce will lead to hangs"
727
770
  return hidden_states
728
771
 
729
- if self.no_absorb(forward_batch):
772
+ attn_forward_method = self.dispatch_attn_forward_method(forward_batch)
773
+
774
+ if attn_forward_method == AttnForwardMethod.MHA:
730
775
  return self.forward_normal(positions, hidden_states, forward_batch)
776
+ elif attn_forward_method == AttnForwardMethod.MHA_CHUNKED_KV:
777
+ return self.forward_normal_chunked_kv(
778
+ positions, hidden_states, forward_batch
779
+ )
731
780
  else:
732
781
  if _is_hip:
733
782
  if (
@@ -811,8 +860,8 @@ class DeepseekV2AttentionMLA(nn.Module):
811
860
  self.w_kc.to(torch.bfloat16) * self.w_scale,
812
861
  )
813
862
  elif self.w_kc.dtype == torch.float8_e4m3fn:
814
- q_nope_val, q_nope_scale = input_to_float8(
815
- q_nope.transpose(0, 1), torch.float8_e4m3fn
863
+ q_nope_val, q_nope_scale = per_tensor_quant_mla_fp8(
864
+ q_nope.transpose(0, 1), dtype=torch.float8_e4m3fn
816
865
  )
817
866
  q_nope_out = bmm_fp8(
818
867
  q_nope_val, self.w_kc, q_nope_scale, self.w_scale, torch.bfloat16
@@ -842,8 +891,8 @@ class DeepseekV2AttentionMLA(nn.Module):
842
891
  self.w_vc.to(torch.bfloat16) * self.w_scale,
843
892
  )
844
893
  elif self.w_vc.dtype == torch.float8_e4m3fn:
845
- attn_output_val, attn_output_scale = input_to_float8(
846
- attn_output.transpose(0, 1), torch.float8_e4m3fn
894
+ attn_output_val, attn_output_scale = per_tensor_quant_mla_fp8(
895
+ attn_output.transpose(0, 1), dtype=torch.float8_e4m3fn
847
896
  )
848
897
  attn_bmm_output = bmm_fp8(
849
898
  attn_output_val,
@@ -889,8 +938,8 @@ class DeepseekV2AttentionMLA(nn.Module):
889
938
  self.w_kc.to(torch.bfloat16) * self.w_scale,
890
939
  )
891
940
  elif self.w_kc.dtype == torch.float8_e4m3fn:
892
- q_nope_val, q_nope_scale = input_to_float8(
893
- q_nope.transpose(0, 1), torch.float8_e4m3fn
941
+ q_nope_val, q_nope_scale = per_tensor_quant_mla_fp8(
942
+ q_nope.transpose(0, 1), dtype=torch.float8_e4m3fn
894
943
  )
895
944
  q_nope_out = bmm_fp8(
896
945
  q_nope_val, self.w_kc, q_nope_scale, self.w_scale, torch.bfloat16
@@ -985,8 +1034,8 @@ class DeepseekV2AttentionMLA(nn.Module):
985
1034
  self.w_vc.to(torch.bfloat16) * self.w_scale,
986
1035
  )
987
1036
  elif self.w_vc.dtype == torch.float8_e4m3fn:
988
- attn_output_val, attn_output_scale = input_to_float8(
989
- attn_output.transpose(0, 1), torch.float8_e4m3fn
1037
+ attn_output_val, attn_output_scale = per_tensor_quant_mla_fp8(
1038
+ attn_output.transpose(0, 1), dtype=torch.float8_e4m3fn
990
1039
  )
991
1040
  attn_bmm_output = bmm_fp8(
992
1041
  attn_output_val,
@@ -1002,6 +1051,127 @@
 
         return output
 
+    def _chunked_prefix_attn_mha(
+        self,
+        q: torch.Tensor,
+        accum_output: torch.Tensor,
+        accum_lse: torch.Tensor,
+        forward_batch: ForwardBatch,
+    ) -> torch.Tensor:
+
+        assert forward_batch.num_prefix_chunks is not None
+        for i in range(forward_batch.num_prefix_chunks):
+            forward_batch.set_prefix_chunk_idx(i)
+
+            # Fetch latent cache from memory pool with precomputed chunked kv indices
+            latent_cache_buf = forward_batch.token_to_kv_pool.get_key_buffer(
+                self.attn_mha.layer_id
+            )
+            latent_cache = latent_cache_buf[
+                forward_batch.prefix_chunk_kv_indices[i]
+            ].contiguous()
+
+            kv_a_normed, k_pe = latent_cache.split(
+                [self.kv_lora_rank, self.qk_rope_head_dim], dim=-1
+            )
+            kv_a_normed = kv_a_normed.squeeze(1).contiguous()
+            kv = self.kv_b_proj(kv_a_normed)[0]
+            kv = kv.view(
+                -1, self.num_local_heads, self.qk_nope_head_dim + self.v_head_dim
+            )
+            v = kv[..., self.qk_nope_head_dim :]
+            k_nope = kv[..., : self.qk_nope_head_dim]
+
+            k = torch.empty(
+                (
+                    k_nope.shape[0],
+                    self.num_local_heads,
+                    self.qk_nope_head_dim + self.qk_rope_head_dim,
+                ),
+                dtype=v.dtype,
+                device=v.device,
+            )
+            k[..., : self.qk_nope_head_dim] = k_nope
+            k[..., self.qk_nope_head_dim :] = k_pe
+
+            output, lse = self.attn_mha(q, k, v, forward_batch, save_kv_cache=False)
+            lse = torch.transpose(lse, 0, 1).contiguous()
+            tmp_output = torch.empty_like(accum_output)
+            tmp_lse = torch.empty_like(accum_lse)
+            merge_state_v2(output, lse, accum_output, accum_lse, tmp_output, tmp_lse)
+            accum_output, accum_lse = tmp_output, tmp_lse
+
+        return accum_output
+
+    def forward_normal_chunked_kv(
+        self,
+        positions: torch.Tensor,
+        hidden_states: torch.Tensor,
+        forward_batch: ForwardBatch,
+    ) -> torch.Tensor:
+        # In normal mha, the k and v tensors will become overly large when the prefix length is long.
+        # To avoid this, we split the kv cache into chunks and process them one after another.
+        # Since mha is compute friendly, the for loop induced here will not introduce significant overhead.
+        # The top comments in https://github.com/vllm-project/vllm/blob/main/vllm/v1/attention/backends/mla/common.py
+        # will be helpful for understanding the purpose of this function.
+
+        # First do normal mha forward to get output for extended part
+        if self.q_lora_rank is not None:
+            q = self.q_a_proj(hidden_states)[0]
+            q = self.q_a_layernorm(q)
+            q = self.q_b_proj(q)[0].view(-1, self.num_local_heads, self.qk_head_dim)
+        else:
+            q = self.q_proj(hidden_states)[0].view(
+                -1, self.num_local_heads, self.qk_head_dim
+            )
+        _, q_pe = q.split([self.qk_nope_head_dim, self.qk_rope_head_dim], dim=-1)
+        latent_cache = self.kv_a_proj_with_mqa(hidden_states)[0]
+        kv_a, _ = latent_cache.split([self.kv_lora_rank, self.qk_rope_head_dim], dim=-1)
+        latent_cache = latent_cache.unsqueeze(1)
+        kv_a = self.kv_a_layernorm(kv_a.contiguous())
+        kv = self.kv_b_proj(kv_a)[0]
+        kv = kv.view(-1, self.num_local_heads, self.qk_nope_head_dim + self.v_head_dim)
+        k_nope = kv[..., : self.qk_nope_head_dim]
+        v = kv[..., self.qk_nope_head_dim :]
+        k_pe = latent_cache[:, :, self.kv_lora_rank :]
+
+        q_pe, k_pe = self.rotary_emb(positions, q_pe, k_pe)
+        q[..., self.qk_nope_head_dim :] = q_pe
+        k = torch.empty_like(q)
+        k[..., : self.qk_nope_head_dim] = k_nope
+        k[..., self.qk_nope_head_dim :] = k_pe
+
+        latent_cache[:, :, : self.kv_lora_rank] = kv_a.unsqueeze(1)
+        latent_cache[:, :, self.kv_lora_rank :] = k_pe
+
+        # Save latent cache
+        forward_batch.token_to_kv_pool.set_kv_buffer(
+            self.attn_mha, forward_batch.out_cache_loc, latent_cache, None
+        )
+
+        # Do mha for extended part without prefix
+        forward_batch.set_attn_attend_prefix_cache(False)
+        attn_output, lse = self.attn_mha(q, k, v, forward_batch, save_kv_cache=False)
+        lse = torch.transpose(lse, 0, 1).contiguous()
+
+        # Do mha attention with chunked prefix cache if there are any sequence with prefix
+        if any(forward_batch.extend_prefix_lens_cpu):
+            # Only initialize the info once
+            if forward_batch.num_prefix_chunks is None:
+                forward_batch.prepare_chunked_prefix_cache_info(q.device)
+
+            forward_batch.set_attn_attend_prefix_cache(True)
+            attn_output = self._chunked_prefix_attn_mha(
+                q=q,
+                accum_output=attn_output,
+                accum_lse=lse,
+                forward_batch=forward_batch,
+            )
+
+        attn_output = attn_output.reshape(-1, self.num_local_heads * self.v_head_dim)
+        output, _ = self.o_proj(attn_output)
+        return output
+
 
 class DeepseekV2DecoderLayer(nn.Module):
 
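Note (not part of the diff): in the chunk loop above, each chunk's attention output is folded into the running result with merge_state_v2 from sgl_kernel, which combines two partial softmax-attention outputs using their log-sum-exp (LSE) values. A minimal PyTorch sketch of that merge rule, assuming outputs shaped [num_tokens, num_heads, head_dim] and LSE tensors shaped [num_tokens, num_heads]:

import torch

def merge_attn_states(o_a, lse_a, o_b, lse_b):
    # Each partial output is a softmax-weighted sum over its own KV chunk,
    # so reweight by exp(lse_chunk - lse_total) and add the two results.
    lse = torch.logaddexp(lse_a, lse_b)         # LSE over the union of keys
    w_a = torch.exp(lse_a - lse).unsqueeze(-1)  # weight of chunk A
    w_b = torch.exp(lse_b - lse).unsqueeze(-1)  # weight of chunk B
    return w_a * o_a + w_b * o_b, lse

The in-kernel variant writes into preallocated buffers, which is consistent with the torch.empty_like allocations for tmp_output/tmp_lse before each call in the loop above.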
@@ -1407,27 +1577,34 @@ class DeepseekV2ForCausalLM(nn.Module):
             w = self_attn.kv_b_proj.weight
             # NOTE(HandH1998): Since `bmm_fp8` only supports per-tensor scale, we have to requantize `self_attn.kv_b_proj`.
             # This may affect the accuracy of fp8 model.
-            if hasattr(self.quant_config, "weight_block_size") and w.dtype in (
+            if w.dtype in (
                 torch.float8_e4m3fn,
                 torch.float8_e4m3fnuz,
             ):
-                weight_block_size = self.quant_config.weight_block_size
-                if weight_block_size is not None:
-                    assert hasattr(self_attn.kv_b_proj, "weight_scale_inv")
-                    if _is_hip:
-                        weight, weight_scale, _ = normalize_e4m3fn_to_e4m3fnuz(
-                            weight=w,
-                            weight_scale=self_attn.kv_b_proj.weight_scale_inv,
-                            input_scale=None,
+                if hasattr(self.quant_config, "weight_block_size"):
+                    weight_block_size = self.quant_config.weight_block_size
+                    if weight_block_size is not None:
+                        assert hasattr(self_attn.kv_b_proj, "weight_scale_inv")
+                        if _is_hip:
+                            weight, weight_scale, _ = normalize_e4m3fn_to_e4m3fnuz(
+                                weight=w,
+                                weight_scale=self_attn.kv_b_proj.weight_scale_inv,
+                                input_scale=None,
+                            )
+                        else:
+                            weight = w
+                            weight_scale = self_attn.kv_b_proj.weight_scale_inv
+
+                        w, scale = block_quant_to_tensor_quant(
+                            weight, weight_scale, weight_block_size
                         )
-                    else:
-                        weight = w
-                        weight_scale = self_attn.kv_b_proj.weight_scale_inv
-
-                    w, scale = block_quant_to_tensor_quant(
-                        weight, weight_scale, weight_block_size
-                    )
+                        self_attn.w_scale = scale
+                else:
+                    weight = w
+                    weight_scale = self_attn.kv_b_proj.weight_scale
+                    w, scale = channel_quant_to_tensor_quant(weight, weight_scale)
                     self_attn.w_scale = scale
+
 
             if w.dtype == torch.int8:
                 # block-wise int8 need it
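Note (not part of the diff): channel_quant_to_tensor_quant collapses per-channel FP8 scales into a single per-tensor scale so kv_b_proj can be fed to bmm_fp8, which only accepts per-tensor scales; that is the accuracy trade-off the NOTE in the hunk above mentions. A conceptual sketch of the idea (shapes, names, and clamping are assumptions, not the sglang implementation):

import torch

def channel_to_tensor_requant(w_q: torch.Tensor, channel_scale: torch.Tensor):
    # w_q: fp8 weight quantized per output channel; channel_scale broadcasts over w_q.
    finfo = torch.finfo(torch.float8_e4m3fn)
    # Dequantize with the per-channel scales, then pick one scale for the whole tensor.
    w_fp32 = w_q.to(torch.float32) * channel_scale.to(torch.float32)
    tensor_scale = w_fp32.abs().max() / finfo.max
    # Re-quantize; channels that originally used smaller scales lose some precision here.
    w_requant = (w_fp32 / tensor_scale).clamp(finfo.min, finfo.max).to(torch.float8_e4m3fn)
    return w_requant, tensor_scale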
@@ -1466,14 +1643,24 @@
         if self.n_share_experts_fusion is not None and self.n_share_experts_fusion > 0:
             weights_list = list(weights)
             weights_dict = dict(weights_list)
-            suffix_list = [
-                "down_proj.weight",
-                "down_proj.weight_scale_inv",
-                "gate_proj.weight",
-                "gate_proj.weight_scale_inv",
-                "up_proj.weight",
-                "up_proj.weight_scale_inv",
-            ]
+            if self.quant_config.get_name() == "w8a8_int8":
+                suffix_list = [
+                    "down_proj.weight",
+                    "down_proj.weight_scale",
+                    "gate_proj.weight",
+                    "gate_proj.weight_scale",
+                    "up_proj.weight",
+                    "up_proj.weight_scale",
+                ]
+            else:
+                suffix_list = [
+                    "down_proj.weight",
+                    "down_proj.weight_scale_inv",
+                    "gate_proj.weight",
+                    "gate_proj.weight_scale_inv",
+                    "up_proj.weight",
+                    "up_proj.weight_scale_inv",
+                ]
             names_to_remove = []
             for moe_layer in tqdm(
                 range(
sglang/srt/models/exaone.py CHANGED
@@ -155,6 +155,7 @@ class ExaoneAttention(nn.Module):
             self.scaling,
             num_kv_heads=self.num_kv_heads,
             layer_id=layer_id,
+            quant_config=quant_config,
         )
 
     def forward(
sglang/srt/models/gemma.py CHANGED
@@ -137,6 +137,7 @@ class GemmaAttention(nn.Module):
             self.scaling,
             num_kv_heads=self.num_kv_heads,
             layer_id=layer_id,
+            quant_config=quant_config,
             prefix=add_prefix("attn", prefix),
         )
 
sglang/srt/models/gemma2.py CHANGED
@@ -163,6 +163,7 @@ class Gemma2Attention(nn.Module):
                 if use_sliding_window
                 else None
             ),
+            quant_config=quant_config,
             prefix=add_prefix("attn", prefix),
         )
 
sglang/srt/models/gemma3_causal.py CHANGED
@@ -193,6 +193,7 @@ class Gemma3Attention(nn.Module):
             # Module must also define `get_attention_sliding_window_size` to correctly initialize
             # attention backend in `ForwardBatch`.
             sliding_window_size=self.sliding_window,
+            quant_config=quant_config,
             prefix=add_prefix("attn", prefix),
         )
 
sglang/srt/models/gpt2.py CHANGED
@@ -78,6 +78,7 @@ class GPT2Attention(nn.Module):
             scaling=self.scale,
             num_kv_heads=total_num_heads,
             layer_id=layer_id,
+            quant_config=quant_config,
         )
 
     def forward(
sglang/srt/models/gpt_bigcode.py CHANGED
@@ -87,6 +87,7 @@ class GPTBigCodeAttention(nn.Module):
             scaling=self.scale,
             num_kv_heads=self.num_kv_heads,
             layer_id=layer_id,
+            quant_config=quant_config,
             prefix=add_prefix("attn", prefix),
         )
 
sglang/srt/models/granite.py CHANGED
@@ -158,6 +158,7 @@ class GraniteAttention(nn.Module):
             self.scaling,
             num_kv_heads=self.num_kv_heads,
             layer_id=layer_id,
+            quant_config=quant_config,
             prefix=add_prefix("attn", prefix),
         )
 
sglang/srt/models/grok.py CHANGED
@@ -215,6 +215,7 @@ class Grok1Attention(nn.Module):
             num_kv_heads=self.num_kv_heads,
             layer_id=layer_id,
             logit_cap=logit_cap,
+            quant_config=quant_config,
         )
 
     def forward(
sglang/srt/models/internlm2.py CHANGED
@@ -145,6 +145,7 @@ class InternLM2Attention(nn.Module):
             self.scaling,
             self.num_kv_heads,
             layer_id,
+            quant_config=quant_config,
             prefix=add_prefix("attn", prefix),
         )
 
sglang/srt/models/llama.py CHANGED
@@ -63,6 +63,7 @@ class LlamaMLP(nn.Module):
         hidden_act: str,
         quant_config: Optional[QuantizationConfig] = None,
         prefix: str = "",
+        reduce_results: bool = True,
     ) -> None:
         super().__init__()
         self.gate_up_proj = MergedColumnParallelLinear(
@@ -78,6 +79,7 @@ class LlamaMLP(nn.Module):
             bias=False,
             quant_config=quant_config,
             prefix=add_prefix("down_proj", prefix),
+            reduce_results=reduce_results,
         )
         if hidden_act != "silu":
             raise ValueError(
@@ -168,6 +170,7 @@ class LlamaAttention(nn.Module):
             self.scaling,
             num_kv_heads=self.num_kv_heads,
             layer_id=layer_id,
+            quant_config=quant_config,
             prefix=add_prefix("attn", prefix),
         )
 
@@ -281,7 +284,7 @@ class LlamaModel(nn.Module):
         self.layers = make_layers(
             config.num_hidden_layers,
             lambda idx, prefix: LlamaDecoderLayer(
-                config=config, quant_config=quant_config, layer_id=idx, prefix=prefix
+                config=config, layer_id=idx, quant_config=quant_config, prefix=prefix
             ),
             prefix="model.layers",
         )
@@ -375,9 +378,7 @@ class LlamaForCausalLM(nn.Module):
         super().__init__()
         self.config = config
         self.quant_config = quant_config
-        self.model = LlamaModel(
-            config, quant_config=quant_config, prefix=add_prefix("model", prefix)
-        )
+        self.model = self._init_model(config, quant_config, add_prefix("model", prefix))
         # Llama 3.2 1B Instruct set tie_word_embeddings to True
         # Llama 3.1 8B Instruct set tie_word_embeddings to False
         if self.config.tie_word_embeddings:
@@ -402,6 +403,14 @@ class LlamaForCausalLM(nn.Module):
 
         self.capture_aux_hidden_states = False
 
+    def _init_model(
+        self,
+        config: LlamaConfig,
+        quant_config: Optional[QuantizationConfig] = None,
+        prefix: str = "",
+    ):
+        return LlamaModel(config, quant_config=quant_config, prefix=prefix)
+
     @torch.no_grad()
     def forward(
         self,
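Note (not part of the diff): the new _init_model hook lets subclasses reuse LlamaForCausalLM's head setup and weight loading while swapping in a different backbone (the Llama 4 model files added in this release presumably build on it). A hypothetical subclass, shown only as an illustration; MyLlamaVariantForCausalLM is not a real sglang class:

from typing import Optional

from transformers import LlamaConfig

from sglang.srt.layers.quantization.base_config import QuantizationConfig
from sglang.srt.models.llama import LlamaForCausalLM, LlamaModel


class MyLlamaVariantForCausalLM(LlamaForCausalLM):
    # Hypothetical example: override the hook to return a customized backbone.
    def _init_model(
        self,
        config: LlamaConfig,
        quant_config: Optional[QuantizationConfig] = None,
        prefix: str = "",
    ):
        # LlamaModel is used as a stand-in; a real variant would return its own model class.
        return LlamaModel(config, quant_config=quant_config, prefix=prefix)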