sglang 0.4.3.post1__py3-none-any.whl → 0.4.3.post3__py3-none-any.whl

This diff shows the contents of publicly available package versions that have been released to a supported registry. It is provided for informational purposes only and reflects the changes between the two versions as they appear in the public registry.
Files changed (219)
  1. sglang/api.py +1 -1
  2. sglang/bench_offline_throughput.py +19 -0
  3. sglang/bench_one_batch.py +2 -2
  4. sglang/bench_serving.py +123 -79
  5. sglang/global_config.py +8 -3
  6. sglang/lang/backend/runtime_endpoint.py +1 -1
  7. sglang/lang/ir.py +1 -1
  8. sglang/srt/_custom_ops.py +83 -91
  9. sglang/srt/configs/load_config.py +4 -1
  10. sglang/srt/configs/model_config.py +48 -2
  11. sglang/srt/configs/qwen2_5_vl_config.py +5 -2
  12. sglang/srt/constrained/base_grammar_backend.py +117 -15
  13. sglang/srt/constrained/llguidance_backend.py +151 -0
  14. sglang/srt/constrained/outlines_backend.py +24 -33
  15. sglang/srt/constrained/xgrammar_backend.py +69 -38
  16. sglang/srt/distributed/device_communicators/custom_all_reduce.py +225 -80
  17. sglang/srt/distributed/parallel_state.py +48 -3
  18. sglang/srt/entrypoints/engine.py +67 -9
  19. sglang/srt/entrypoints/http_server.py +190 -41
  20. sglang/srt/entrypoints/verl_engine.py +147 -0
  21. sglang/srt/function_call_parser.py +0 -1
  22. sglang/srt/layers/activation.py +11 -0
  23. sglang/srt/layers/attention/{__init__.py → base_attn_backend.py} +14 -6
  24. sglang/srt/layers/attention/double_sparsity_backend.py +1 -1
  25. sglang/srt/layers/attention/flashinfer_backend.py +208 -295
  26. sglang/srt/layers/attention/flashinfer_mla_backend.py +582 -0
  27. sglang/srt/layers/attention/torch_native_backend.py +1 -1
  28. sglang/srt/layers/attention/triton_backend.py +9 -6
  29. sglang/srt/layers/attention/triton_ops/decode_attention.py +3 -0
  30. sglang/srt/layers/attention/triton_ops/extend_attention.py +20 -4
  31. sglang/srt/layers/attention/triton_ops/rocm_mla_decode_rope.py +439 -0
  32. sglang/srt/layers/attention/utils.py +39 -0
  33. sglang/srt/layers/attention/vision.py +60 -63
  34. sglang/srt/layers/dp_attention.py +142 -1
  35. sglang/srt/layers/layernorm.py +1 -1
  36. sglang/srt/layers/linear.py +3 -1
  37. sglang/srt/layers/logits_processor.py +281 -45
  38. sglang/srt/layers/moe/ep_moe/kernels.py +126 -8
  39. sglang/srt/layers/moe/ep_moe/layer.py +140 -28
  40. sglang/srt/layers/moe/fused_moe_native.py +2 -0
  41. sglang/srt/layers/moe/fused_moe_triton/configs/E=256,N=128,device_name=NVIDIA_A100-SXM4-80GB,dtype=int8_w8a8,block_shape=[128, 128].json +146 -0
  42. sglang/srt/layers/moe/fused_moe_triton/configs/E=256,N=128,device_name=NVIDIA_L20Y,dtype=fp8_w8a8,block_shape=[128, 128].json +146 -0
  43. sglang/srt/layers/moe/fused_moe_triton/configs/E=256,N=256,device_name=AMD_Radeon_Graphics,dtype=fp8_w8a8,block_shape=[128, 128].json +50 -50
  44. sglang/srt/layers/moe/fused_moe_triton/configs/E=8,N=14336,device_name=AMD_Instinct_MI300X.json +18 -18
  45. sglang/srt/layers/moe/fused_moe_triton/configs/E=8,N=14336,device_name=AMD_Instinct_MI325X.json +18 -18
  46. sglang/srt/layers/moe/fused_moe_triton/configs/E=8,N=14336,device_name=AMD_Radeon_Graphics.json +18 -18
  47. sglang/srt/layers/moe/fused_moe_triton/configs/E=8,N=1792,device_name=AMD_Instinct_MI300X.json +18 -18
  48. sglang/srt/layers/moe/fused_moe_triton/configs/E=8,N=1792,device_name=AMD_Instinct_MI325X.json +18 -18
  49. sglang/srt/layers/moe/fused_moe_triton/configs/E=8,N=1792,device_name=AMD_Radeon_Graphics.json +18 -18
  50. sglang/srt/layers/moe/fused_moe_triton/configs/E=8,N=3584,device_name=AMD_Instinct_MI300X.json +18 -18
  51. sglang/srt/layers/moe/fused_moe_triton/configs/E=8,N=3584,device_name=AMD_Instinct_MI325X.json +18 -18
  52. sglang/srt/layers/moe/fused_moe_triton/configs/E=8,N=3584,device_name=AMD_Radeon_Graphics.json +18 -18
  53. sglang/srt/layers/moe/fused_moe_triton/configs/E=8,N=4096,device_name=AMD_Instinct_MI300X,dtype=fp8_w8a8.json +16 -16
  54. sglang/srt/layers/moe/fused_moe_triton/configs/E=8,N=4096,device_name=AMD_Instinct_MI325X,dtype=fp8_w8a8.json +16 -16
  55. sglang/srt/layers/moe/fused_moe_triton/configs/E=8,N=4096,device_name=AMD_Radeon_Graphics,dtype=fp8_w8a8.json +16 -16
  56. sglang/srt/layers/moe/fused_moe_triton/configs/E=8,N=7168,device_name=AMD_Instinct_MI300X.json +18 -18
  57. sglang/srt/layers/moe/fused_moe_triton/configs/E=8,N=7168,device_name=AMD_Instinct_MI325X.json +18 -18
  58. sglang/srt/layers/moe/fused_moe_triton/configs/E=8,N=7168,device_name=AMD_Radeon_Graphics.json +18 -18
  59. sglang/srt/layers/moe/fused_moe_triton/configs/E=8,N=8192,device_name=AMD_Instinct_MI300X,dtype=fp8_w8a8.json +15 -15
  60. sglang/srt/layers/moe/fused_moe_triton/configs/E=8,N=8192,device_name=AMD_Instinct_MI325X,dtype=fp8_w8a8.json +15 -15
  61. sglang/srt/layers/moe/fused_moe_triton/configs/E=8,N=8192,device_name=AMD_Radeon_Graphics,dtype=fp8_w8a8.json +15 -15
  62. sglang/srt/layers/moe/fused_moe_triton/fused_moe.py +88 -20
  63. sglang/srt/layers/moe/fused_moe_triton/layer.py +34 -13
  64. sglang/srt/layers/moe/topk.py +13 -4
  65. sglang/srt/layers/quantization/__init__.py +111 -7
  66. sglang/srt/layers/quantization/blockwise_int8.py +409 -0
  67. sglang/srt/layers/quantization/configs/N=1536,K=1536,device_name=NVIDIA_A100-SXM4-80GB,dtype=int8_w8a8,block_shape=[128, 128].json +146 -0
  68. sglang/srt/layers/quantization/configs/N=1536,K=1536,device_name=NVIDIA_L20Y,dtype=fp8_w8a8,block_shape=[128, 128].json +26 -0
  69. sglang/srt/layers/quantization/configs/N=1536,K=7168,device_name=NVIDIA_A100-SXM4-80GB,dtype=int8_w8a8,block_shape=[128, 128].json +146 -0
  70. sglang/srt/layers/quantization/configs/N=1536,K=7168,device_name=NVIDIA_L20Y,dtype=fp8_w8a8,block_shape=[128, 128].json +26 -0
  71. sglang/srt/layers/quantization/configs/N=2048,K=512,device_name=NVIDIA_A100-SXM4-80GB,dtype=int8_w8a8,block_shape=[128, 128].json +146 -0
  72. sglang/srt/layers/quantization/configs/N=2048,K=512,device_name=NVIDIA_L20Y,dtype=fp8_w8a8,block_shape=[128, 128].json +26 -0
  73. sglang/srt/layers/quantization/configs/N=2304,K=7168,device_name=NVIDIA_A100-SXM4-80GB,dtype=int8_w8a8,block_shape=[128, 128].json +146 -0
  74. sglang/srt/layers/quantization/configs/N=2304,K=7168,device_name=NVIDIA_L20Y,dtype=fp8_w8a8,block_shape=[128, 128].json +26 -0
  75. sglang/srt/layers/quantization/configs/N=24576,K=1536,device_name=AMD_Instinct_MI300X,dtype=fp8_w8a8,block_shape=[128, 128].json +164 -0
  76. sglang/srt/layers/quantization/configs/N=24576,K=1536,device_name=AMD_Instinct_MI325X,dtype=fp8_w8a8,block_shape=[128, 128].json +164 -0
  77. sglang/srt/layers/quantization/configs/N=24576,K=1536,device_name=AMD_Radeon_Graphics,dtype=fp8_w8a8,block_shape=[128, 128].json +164 -0
  78. sglang/srt/layers/quantization/configs/N=24576,K=7168,device_name=NVIDIA_A100-SXM4-80GB,dtype=int8_w8a8,block_shape=[128, 128].json +146 -0
  79. sglang/srt/layers/quantization/configs/N=24576,K=7168,device_name=NVIDIA_L20Y,dtype=fp8_w8a8,block_shape=[128, 128].json +26 -0
  80. sglang/srt/layers/quantization/configs/N=256,K=7168,device_name=NVIDIA_A100-SXM4-80GB,dtype=int8_w8a8,block_shape=[128, 128].json +146 -0
  81. sglang/srt/layers/quantization/configs/N=256,K=7168,device_name=NVIDIA_L20Y,dtype=fp8_w8a8,block_shape=[128, 128].json +26 -0
  82. sglang/srt/layers/quantization/configs/N=32768,K=512,device_name=AMD_Instinct_MI300X,dtype=fp8_w8a8,block_shape=[128, 128].json +164 -0
  83. sglang/srt/layers/quantization/configs/N=32768,K=512,device_name=AMD_Instinct_MI325X,dtype=fp8_w8a8,block_shape=[128, 128].json +164 -0
  84. sglang/srt/layers/quantization/configs/N=32768,K=512,device_name=AMD_Radeon_Graphics,dtype=fp8_w8a8,block_shape=[128, 128].json +164 -0
  85. sglang/srt/layers/quantization/configs/N=32768,K=512,device_name=NVIDIA_A100-SXM4-80GB,dtype=int8_w8a8,block_shape=[128, 128].json +146 -0
  86. sglang/srt/layers/quantization/configs/N=32768,K=512,device_name=NVIDIA_L20Y,dtype=fp8_w8a8,block_shape=[128, 128].json +26 -0
  87. sglang/srt/layers/quantization/configs/N=576,K=7168,device_name=NVIDIA_A100-SXM4-80GB,dtype=int8_w8a8,block_shape=[128, 128].json +146 -0
  88. sglang/srt/layers/quantization/configs/N=576,K=7168,device_name=NVIDIA_L20Y,dtype=fp8_w8a8,block_shape=[128, 128].json +26 -0
  89. sglang/srt/layers/quantization/configs/N=7168,K=1024,device_name=NVIDIA_A100-SXM4-80GB,dtype=int8_w8a8,block_shape=[128, 128].json +146 -0
  90. sglang/srt/layers/quantization/configs/N=7168,K=1024,device_name=NVIDIA_L20Y,dtype=fp8_w8a8,block_shape=[128, 128].json +26 -0
  91. sglang/srt/layers/quantization/configs/N=7168,K=1152,device_name=NVIDIA_A100-SXM4-80GB,dtype=int8_w8a8,block_shape=[128, 128].json +146 -0
  92. sglang/srt/layers/quantization/configs/N=7168,K=1152,device_name=NVIDIA_L20Y,dtype=fp8_w8a8,block_shape=[128, 128].json +26 -0
  93. sglang/srt/layers/quantization/configs/N=7168,K=128,device_name=NVIDIA_A100-SXM4-80GB,dtype=int8_w8a8,block_shape=[128, 128].json +146 -0
  94. sglang/srt/layers/quantization/configs/N=7168,K=128,device_name=NVIDIA_L20Y,dtype=fp8_w8a8,block_shape=[128, 128].json +26 -0
  95. sglang/srt/layers/quantization/configs/N=7168,K=16384,device_name=AMD_Instinct_MI300X,dtype=fp8_w8a8,block_shape=[128, 128].json +164 -0
  96. sglang/srt/layers/quantization/configs/N=7168,K=16384,device_name=AMD_Instinct_MI325X,dtype=fp8_w8a8,block_shape=[128, 128].json +164 -0
  97. sglang/srt/layers/quantization/configs/N=7168,K=16384,device_name=AMD_Radeon_Graphics,dtype=fp8_w8a8,block_shape=[128, 128].json +164 -0
  98. sglang/srt/layers/quantization/configs/N=7168,K=16384,device_name=NVIDIA_A100-SXM4-80GB,dtype=int8_w8a8,block_shape=[128, 128].json +146 -0
  99. sglang/srt/layers/quantization/configs/N=7168,K=16384,device_name=NVIDIA_L20Y,dtype=fp8_w8a8,block_shape=[128, 128].json +26 -0
  100. sglang/srt/layers/quantization/configs/N=7168,K=18432,device_name=NVIDIA_A100-SXM4-80GB,dtype=int8_w8a8,block_shape=[128, 128].json +146 -0
  101. sglang/srt/layers/quantization/configs/N=7168,K=18432,device_name=NVIDIA_L20Y,dtype=fp8_w8a8,block_shape=[128, 128].json +26 -0
  102. sglang/srt/layers/quantization/fp8.py +69 -28
  103. sglang/srt/layers/quantization/fp8_utils.py +17 -1
  104. sglang/srt/layers/quantization/gptq.py +416 -0
  105. sglang/srt/layers/quantization/int8_kernel.py +327 -0
  106. sglang/srt/layers/quantization/int8_utils.py +73 -0
  107. sglang/srt/layers/quantization/modelopt_quant.py +18 -1
  108. sglang/srt/layers/radix_attention.py +1 -0
  109. sglang/srt/layers/rotary_embedding.py +0 -1
  110. sglang/srt/layers/sampler.py +76 -31
  111. sglang/srt/layers/vocab_parallel_embedding.py +14 -13
  112. sglang/srt/lora/lora.py +17 -1
  113. sglang/srt/lora/lora_config.py +5 -0
  114. sglang/srt/lora/lora_manager.py +1 -3
  115. sglang/srt/managers/cache_controller.py +193 -62
  116. sglang/srt/managers/configure_logging.py +2 -1
  117. sglang/srt/managers/data_parallel_controller.py +6 -2
  118. sglang/srt/managers/detokenizer_manager.py +124 -102
  119. sglang/srt/managers/image_processor.py +2 -1
  120. sglang/srt/managers/io_struct.py +143 -6
  121. sglang/srt/managers/schedule_batch.py +238 -197
  122. sglang/srt/managers/schedule_policy.py +29 -29
  123. sglang/srt/managers/scheduler.py +681 -259
  124. sglang/srt/managers/session_controller.py +6 -2
  125. sglang/srt/managers/tokenizer_manager.py +224 -68
  126. sglang/srt/managers/tp_worker.py +15 -4
  127. sglang/srt/managers/tp_worker_overlap_thread.py +3 -4
  128. sglang/srt/mem_cache/chunk_cache.py +18 -11
  129. sglang/srt/mem_cache/hiradix_cache.py +394 -0
  130. sglang/srt/mem_cache/memory_pool.py +44 -18
  131. sglang/srt/mem_cache/radix_cache.py +58 -47
  132. sglang/srt/metrics/collector.py +94 -36
  133. sglang/srt/model_executor/cuda_graph_runner.py +55 -24
  134. sglang/srt/model_executor/forward_batch_info.py +49 -16
  135. sglang/srt/model_executor/model_runner.py +209 -28
  136. sglang/srt/model_loader/loader.py +3 -3
  137. sglang/srt/model_loader/weight_utils.py +36 -14
  138. sglang/srt/models/baichuan.py +31 -6
  139. sglang/srt/models/chatglm.py +39 -7
  140. sglang/srt/models/commandr.py +29 -5
  141. sglang/srt/models/dbrx.py +31 -5
  142. sglang/srt/models/deepseek.py +43 -6
  143. sglang/srt/models/deepseek_nextn.py +32 -19
  144. sglang/srt/models/deepseek_v2.py +265 -29
  145. sglang/srt/models/exaone.py +19 -9
  146. sglang/srt/models/gemma.py +22 -8
  147. sglang/srt/models/gemma2.py +25 -12
  148. sglang/srt/models/gemma2_reward.py +5 -1
  149. sglang/srt/models/gpt2.py +28 -13
  150. sglang/srt/models/gpt_bigcode.py +27 -5
  151. sglang/srt/models/granite.py +21 -9
  152. sglang/srt/models/grok.py +21 -4
  153. sglang/srt/models/internlm2.py +36 -6
  154. sglang/srt/models/internlm2_reward.py +5 -1
  155. sglang/srt/models/llama.py +26 -9
  156. sglang/srt/models/llama_classification.py +5 -1
  157. sglang/srt/models/llama_eagle.py +17 -4
  158. sglang/srt/models/llama_embedding.py +5 -1
  159. sglang/srt/models/llama_reward.py +7 -2
  160. sglang/srt/models/llava.py +19 -3
  161. sglang/srt/models/llavavid.py +10 -1
  162. sglang/srt/models/minicpm.py +26 -2
  163. sglang/srt/models/minicpm3.py +39 -3
  164. sglang/srt/models/minicpmv.py +45 -14
  165. sglang/srt/models/mixtral.py +20 -9
  166. sglang/srt/models/mixtral_quant.py +50 -8
  167. sglang/srt/models/mllama.py +57 -11
  168. sglang/srt/models/olmo.py +34 -6
  169. sglang/srt/models/olmo2.py +34 -13
  170. sglang/srt/models/olmoe.py +26 -4
  171. sglang/srt/models/phi3_small.py +29 -10
  172. sglang/srt/models/qwen.py +26 -3
  173. sglang/srt/models/qwen2.py +26 -4
  174. sglang/srt/models/qwen2_5_vl.py +46 -8
  175. sglang/srt/models/qwen2_eagle.py +17 -5
  176. sglang/srt/models/qwen2_moe.py +44 -6
  177. sglang/srt/models/qwen2_rm.py +78 -0
  178. sglang/srt/models/qwen2_vl.py +39 -8
  179. sglang/srt/models/stablelm.py +32 -5
  180. sglang/srt/models/torch_native_llama.py +5 -2
  181. sglang/srt/models/xverse.py +21 -9
  182. sglang/srt/models/xverse_moe.py +45 -7
  183. sglang/srt/models/yivl.py +2 -1
  184. sglang/srt/openai_api/adapter.py +109 -24
  185. sglang/srt/openai_api/protocol.py +17 -1
  186. sglang/srt/reasoning_parser.py +154 -0
  187. sglang/srt/sampling/penaltylib/__init__.py +4 -6
  188. sglang/srt/sampling/penaltylib/frequency_penalty.py +66 -0
  189. sglang/srt/sampling/penaltylib/{penalizers/min_new_tokens.py → min_new_tokens.py} +15 -23
  190. sglang/srt/sampling/penaltylib/orchestrator.py +39 -188
  191. sglang/srt/sampling/penaltylib/presence_penalty.py +66 -0
  192. sglang/srt/sampling/sampling_batch_info.py +79 -157
  193. sglang/srt/sampling/sampling_params.py +16 -13
  194. sglang/srt/server_args.py +136 -52
  195. sglang/srt/speculative/build_eagle_tree.py +2 -8
  196. sglang/srt/speculative/eagle_draft_cuda_graph_runner.py +0 -1
  197. sglang/srt/speculative/eagle_utils.py +92 -58
  198. sglang/srt/speculative/eagle_worker.py +186 -94
  199. sglang/srt/speculative/spec_info.py +1 -13
  200. sglang/srt/utils.py +43 -17
  201. sglang/srt/warmup.py +47 -0
  202. sglang/test/few_shot_gsm8k.py +4 -1
  203. sglang/test/runners.py +389 -126
  204. sglang/test/send_one.py +88 -0
  205. sglang/test/test_block_fp8_ep.py +361 -0
  206. sglang/test/test_programs.py +1 -1
  207. sglang/test/test_utils.py +138 -84
  208. sglang/utils.py +50 -60
  209. sglang/version.py +1 -1
  210. {sglang-0.4.3.post1.dist-info → sglang-0.4.3.post3.dist-info}/METADATA +21 -15
  211. {sglang-0.4.3.post1.dist-info → sglang-0.4.3.post3.dist-info}/RECORD +214 -166
  212. {sglang-0.4.3.post1.dist-info → sglang-0.4.3.post3.dist-info}/WHEEL +1 -1
  213. sglang/bench_latency.py +0 -1
  214. sglang/srt/sampling/penaltylib/penalizers/frequency_penalty.py +0 -75
  215. sglang/srt/sampling/penaltylib/penalizers/presence_penalty.py +0 -74
  216. sglang/srt/sampling/penaltylib/penalizers/repetition_penalty.py +0 -85
  217. sglang/test/srt/sampling/penaltylib/utils.py +0 -344
  218. {sglang-0.4.3.post1.dist-info → sglang-0.4.3.post3.dist-info}/LICENSE +0 -0
  219. {sglang-0.4.3.post1.dist-info → sglang-0.4.3.post3.dist-info}/top_level.txt +0 -0
sglang/srt/models/deepseek_v2.py

@@ -16,6 +16,7 @@
 # https://github.com/vllm-project/vllm/blob/fb6af8bc086328ca6659e72d11ffd4309ce4de22/vllm/model_executor/models/deepseek_v2.py
 """Inference-only DeepseekV2 model."""
 
+import os
 from typing import Any, Dict, Iterable, Optional, Tuple
 
 import torch
@@ -31,6 +32,9 @@ from sglang.srt.distributed import (
     tensor_model_parallel_all_reduce,
 )
 from sglang.srt.layers.activation import SiluAndMul
+from sglang.srt.layers.attention.triton_ops.rocm_mla_decode_rope import (
+    decode_attention_fwd_grouped_rope,
+)
 from sglang.srt.layers.layernorm import RMSNorm
 from sglang.srt.layers.linear import (
     ColumnParallelLinear,
@@ -47,6 +51,9 @@ from sglang.srt.layers.quantization.fp8_utils import (
     input_to_float8,
     normalize_e4m3fn_to_e4m3fnuz,
 )
+from sglang.srt.layers.quantization.int8_utils import (
+    block_dequant as int8_block_dequant,
+)
 from sglang.srt.layers.radix_attention import RadixAttention
 from sglang.srt.layers.rotary_embedding import get_rope, get_rope_wrapper
 from sglang.srt.layers.vocab_parallel_embedding import (
@@ -56,7 +63,7 @@ from sglang.srt.layers.vocab_parallel_embedding import (
 from sglang.srt.managers.schedule_batch import global_server_args_dict
 from sglang.srt.model_executor.forward_batch_info import ForwardBatch
 from sglang.srt.model_loader.weight_utils import default_weight_loader
-from sglang.srt.utils import is_cuda_available, is_hip
+from sglang.srt.utils import add_prefix, is_cuda_available, is_hip
 
 is_hip_ = is_hip()
 
@@ -72,10 +79,15 @@ class DeepseekV2MLP(nn.Module):
         hidden_act: str,
         quant_config: Optional[QuantizationConfig] = None,
         reduce_results: bool = True,
+        prefix: str = "",
     ) -> None:
         super().__init__()
         self.gate_up_proj = MergedColumnParallelLinear(
-            hidden_size, [intermediate_size] * 2, bias=False, quant_config=quant_config
+            hidden_size,
+            [intermediate_size] * 2,
+            bias=False,
+            quant_config=quant_config,
+            prefix=add_prefix("gate_up_proj", prefix),
         )
         self.down_proj = RowParallelLinear(
             intermediate_size,
@@ -83,6 +95,7 @@ class DeepseekV2MLP(nn.Module):
             bias=False,
             quant_config=quant_config,
             reduce_results=reduce_results,
+            prefix=add_prefix("down_proj", prefix),
         )
         if hidden_act != "silu":
             raise ValueError(
@@ -99,7 +112,11 @@
 
 
 class MoEGate(nn.Module):
-    def __init__(self, config):
+    def __init__(
+        self,
+        config,
+        prefix: str = "",
+    ):
         super().__init__()
         self.weight = nn.Parameter(
             torch.empty((config.n_routed_experts, config.hidden_size))
@@ -122,6 +139,7 @@ class DeepseekV2MoE(nn.Module):
         self,
         config: PretrainedConfig,
         quant_config: Optional[QuantizationConfig] = None,
+        prefix: str = "",
     ):
         super().__init__()
         self.tp_size = get_tensor_model_parallel_world_size()
@@ -140,7 +158,7 @@
                 "Only silu is supported for now."
             )
 
-        self.gate = MoEGate(config=config)
+        self.gate = MoEGate(config=config, prefix=add_prefix("gate", prefix))
 
         MoEImpl = EPMoE if global_server_args_dict["enable_ep_moe"] else FusedMoE
         self.experts = MoEImpl(
@@ -154,6 +172,7 @@ class DeepseekV2MoE(nn.Module):
             num_expert_group=config.n_group,
             topk_group=config.topk_group,
             correction_bias=self.gate.e_score_correction_bias,
+            prefix=add_prefix("experts", prefix),
         )
 
         if config.n_shared_experts is not None:
@@ -164,6 +183,7 @@ class DeepseekV2MoE(nn.Module):
                 hidden_act=config.hidden_act,
                 quant_config=quant_config,
                 reduce_results=False,
+                prefix=add_prefix("shared_experts", prefix),
             )
 
     def forward(self, hidden_states: torch.Tensor) -> torch.Tensor:
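The add_prefix(...) calls threaded through the constructors above, and throughout the rest of this diff, build dotted module paths (for example model.layers.3.mlp.experts) so that quantization configs and weight loaders can match parameters by their fully qualified names. A minimal sketch of the helper's assumed behavior; the real definition lives in sglang/srt/utils.py and may differ in detail:

# Hedged reconstruction of add_prefix; treat the body as an assumption.
def add_prefix(name: str, prefix: str) -> str:
    # Join a child module name onto its parent's dotted path; an empty prefix
    # (the default in every constructor above) leaves the name unchanged.
    return name if not prefix else f"{prefix}.{name}"

# How prefixes compose through DeepseekV2Model -> DecoderLayer -> MoE -> experts:
assert (
    add_prefix("experts", add_prefix("mlp", add_prefix("layers.3", "model")))
    == "model.layers.3.mlp.experts"
)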
@@ -210,6 +230,7 @@ class DeepseekV2Attention(nn.Module):
         max_position_embeddings: int = 8192,
         quant_config: Optional[QuantizationConfig] = None,
         layer_id=None,
+        prefix: str = "",
     ) -> None:
         super().__init__()
         self.layer_id = layer_id
@@ -234,6 +255,7 @@ class DeepseekV2Attention(nn.Module):
                 self.q_lora_rank,
                 bias=False,
                 quant_config=quant_config,
+                prefix=add_prefix("q_a_proj", prefix),
             )
             self.q_a_layernorm = RMSNorm(self.q_lora_rank, eps=config.rms_norm_eps)
             self.q_b_proj = ColumnParallelLinear(
@@ -241,6 +263,7 @@ class DeepseekV2Attention(nn.Module):
                 self.num_heads * self.qk_head_dim,
                 bias=False,
                 quant_config=quant_config,
+                prefix=add_prefix("q_b_proj", prefix),
             )
         else:
             self.q_proj = ColumnParallelLinear(
@@ -248,6 +271,7 @@ class DeepseekV2Attention(nn.Module):
                 self.num_heads * self.qk_head_dim,
                 bias=False,
                 quant_config=quant_config,
+                prefix=add_prefix("q_proj", prefix),
             )
 
         self.kv_a_proj_with_mqa = ReplicatedLinear(
@@ -255,8 +279,7 @@ class DeepseekV2Attention(nn.Module):
             self.kv_lora_rank + self.qk_rope_head_dim,
             bias=False,
             quant_config=quant_config,
-            # FIXME: quick fix for skip quantization
-            prefix=f"self_attn.kv_a_proj_with_mqa",
+            prefix=add_prefix("kv_a_proj_with_mqa", prefix),
         )
         self.kv_a_layernorm = RMSNorm(self.kv_lora_rank, eps=config.rms_norm_eps)
         self.kv_b_proj = ColumnParallelLinear(
@@ -264,6 +287,7 @@ class DeepseekV2Attention(nn.Module):
             self.num_heads * (self.qk_nope_head_dim + self.v_head_dim),
             bias=False,
             quant_config=quant_config,
+            prefix=add_prefix("kv_b_proj", prefix),
         )
         # O projection.
         self.o_proj = RowParallelLinear(
@@ -271,6 +295,7 @@ class DeepseekV2Attention(nn.Module):
             self.hidden_size,
             bias=False,
             quant_config=quant_config,
+            prefix=add_prefix("o_proj", prefix),
         )
         rope_scaling["rope_type"] = "deepseek_yarn"
         self.rotary_emb = get_rope_wrapper(
@@ -296,6 +321,7 @@ class DeepseekV2Attention(nn.Module):
             self.scaling,
             num_kv_heads=self.num_local_heads,
             layer_id=layer_id,
+            prefix=add_prefix("attn", prefix),
         )
 
     def forward(
@@ -361,6 +387,7 @@ class DeepseekV2AttentionMLA(nn.Module):
         quant_config: Optional[QuantizationConfig] = None,
         layer_id=None,
         use_dp=False,
+        prefix: str = "",
     ) -> None:
         super().__init__()
         self.layer_id = layer_id
@@ -387,6 +414,7 @@ class DeepseekV2AttentionMLA(nn.Module):
                     self.q_lora_rank,
                     bias=False,
                     quant_config=quant_config,
+                    prefix=add_prefix("q_a_proj", prefix),
                 )
                 self.q_a_layernorm = RMSNorm(self.q_lora_rank, eps=config.rms_norm_eps)
                 self.q_b_proj = ReplicatedLinear(
@@ -394,6 +422,7 @@ class DeepseekV2AttentionMLA(nn.Module):
                     self.num_heads * self.qk_head_dim,
                     bias=False,
                     quant_config=quant_config,
+                    prefix=add_prefix("q_b_proj", prefix),
                 )
             else:
                 self.q_proj = ReplicatedLinear(
@@ -401,12 +430,14 @@ class DeepseekV2AttentionMLA(nn.Module):
                     self.num_heads * self.qk_head_dim,
                     bias=False,
                     quant_config=quant_config,
+                    prefix=add_prefix("q_proj", prefix),
                 )
             self.kv_b_proj = ReplicatedLinear(
                 self.kv_lora_rank,
                 self.num_heads * (self.qk_nope_head_dim + self.v_head_dim),
                 bias=False,
                 quant_config=quant_config,
+                prefix=add_prefix("kv_b_proj", prefix),
             )
             # O projection.
             self.o_proj = ReplicatedLinear(
@@ -414,6 +445,7 @@ class DeepseekV2AttentionMLA(nn.Module):
                 self.hidden_size,
                 bias=False,
                 quant_config=quant_config,
+                prefix=add_prefix("o_proj", prefix),
             )
         else:
             # For tensor parallel attention
@@ -423,6 +455,7 @@ class DeepseekV2AttentionMLA(nn.Module):
                     self.q_lora_rank,
                     bias=False,
                     quant_config=quant_config,
+                    prefix=add_prefix("q_a_proj", prefix),
                 )
                 self.q_a_layernorm = RMSNorm(self.q_lora_rank, eps=config.rms_norm_eps)
                 self.q_b_proj = ColumnParallelLinear(
@@ -430,6 +463,7 @@ class DeepseekV2AttentionMLA(nn.Module):
                     self.num_heads * self.qk_head_dim,
                     bias=False,
                     quant_config=quant_config,
+                    prefix=add_prefix("q_b_proj", prefix),
                 )
             else:
                 self.q_proj = ColumnParallelLinear(
@@ -437,12 +471,14 @@ class DeepseekV2AttentionMLA(nn.Module):
                     self.num_heads * self.qk_head_dim,
                     bias=False,
                     quant_config=quant_config,
+                    prefix=add_prefix("q_proj", prefix),
                 )
             self.kv_b_proj = ColumnParallelLinear(
                 self.kv_lora_rank,
                 self.num_heads * (self.qk_nope_head_dim + self.v_head_dim),
                 bias=False,
                 quant_config=quant_config,
+                prefix=add_prefix("kv_b_proj", prefix),
             )
             # O projection.
             self.o_proj = RowParallelLinear(
@@ -450,6 +486,7 @@ class DeepseekV2AttentionMLA(nn.Module):
                 self.hidden_size,
                 bias=False,
                 quant_config=quant_config,
+                prefix=add_prefix("o_proj", prefix),
             )
 
         self.kv_a_proj_with_mqa = ReplicatedLinear(
@@ -457,8 +494,7 @@ class DeepseekV2AttentionMLA(nn.Module):
             self.kv_lora_rank + self.qk_rope_head_dim,
             bias=False,
             quant_config=quant_config,
-            # FIXME: quick fix for skip quantization
-            prefix=f"self_attn.kv_a_proj_with_mqa",
+            prefix=add_prefix("kv_a_proj_with_mqa", prefix),
         )
         self.kv_a_layernorm = RMSNorm(self.kv_lora_rank, eps=config.rms_norm_eps)
 
@@ -489,6 +525,7 @@ class DeepseekV2AttentionMLA(nn.Module):
             num_kv_heads=1,
             layer_id=layer_id,
             v_head_dim=self.kv_lora_rank,
+            prefix=add_prefix("attn_mqa", prefix),
         )
 
         self.attn_mha = RadixAttention(
@@ -498,6 +535,7 @@ class DeepseekV2AttentionMLA(nn.Module):
             num_kv_heads=self.num_local_heads,
             layer_id=layer_id,
             v_head_dim=self.v_head_dim,
+            prefix=add_prefix("attn_mha", prefix),
        )
 
         self.w_kc = None
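The w_kc and w_vc buffers initialized to None here are filled from the kv_b_proj weight in load_weights (see the change near the end of this diff) and consumed by the weight-absorbed forward paths. A small, self-contained shape check of the per-head bmm those paths perform; the dimensions are inferred from the code in this diff and the sizes are illustrative:

import torch

q_len, num_heads = 4, 16
qk_nope_head_dim, kv_lora_rank = 128, 512  # illustrative DeepSeek-V2-style sizes

q_nope = torch.randn(q_len, num_heads, qk_nope_head_dim)
# w_kc: one (qk_nope_head_dim, kv_lora_rank) slice of kv_b_proj per head, i.e. the
# first half of w.unflatten(0, (-1, qk_nope + v)).split(...) in load_weights below.
w_kc = torch.randn(num_heads, qk_nope_head_dim, kv_lora_rank)

# Same contraction as torch.bmm(q_nope.transpose(0, 1), self.w_kc) in the absorbed
# path: project the no-RoPE part of the query directly into the compressed KV latent space.
q_latent = torch.bmm(q_nope.transpose(0, 1), w_kc).transpose(0, 1)
assert q_latent.shape == (q_len, num_heads, kv_lora_rank)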
@@ -510,20 +548,37 @@ class DeepseekV2AttentionMLA(nn.Module):
         hidden_states: torch.Tensor,
         forward_batch: ForwardBatch,
     ) -> torch.Tensor:
-        if global_server_args_dict["enable_flashinfer_mla"]:
-            if forward_batch.forward_mode.is_extend():
-                return self.forward_normal(positions, hidden_states, forward_batch)
+
+        def no_absorb() -> bool:
+            if global_server_args_dict["enable_flashinfer_mla"]:
+                # Flashinfer MLA: Do not absorb when enabling ragged prefill
+                return (
+                    not global_server_args_dict["flashinfer_mla_disable_ragged"]
+                    and forward_batch.forward_mode.is_extend()
+                    and forward_batch.extend_prefix_lens.sum() == 0
+                )
             else:
-                return self.forward_absorb(positions, hidden_states, forward_batch)
+                # Triton: Use normal computation for prefill and use weight absorption for extend/decode
+                return (
+                    forward_batch.forward_mode.is_extend()
+                    and not forward_batch.forward_mode.is_target_verify()
+                    and not forward_batch.forward_mode.is_draft_extend()
+                    and forward_batch.extend_prefix_lens.sum() == 0
+                )
+
+        if no_absorb():
+            return self.forward_normal(positions, hidden_states, forward_batch)
         else:
-            # Triton: Use normal computation for prefill and use weight absorption for extend/decode
-            if (
-                forward_batch.forward_mode.is_extend()
-                and not forward_batch.forward_mode.is_target_verify()
-                and not forward_batch.forward_mode.is_draft_extend()
-                and forward_batch.extend_prefix_lens.sum() == 0
-            ):
-                return self.forward_normal(positions, hidden_states, forward_batch)
+            if is_hip_:
+                if (
+                    os.getenv("SGLANG_ROCM_FUSED_DECODE_MLA") == "1"
+                    and forward_batch.forward_mode.is_decode()
+                ):
+                    return self.forward_absorb_fused_mla_rope(
+                        positions, hidden_states, forward_batch
+                    )
+                else:
+                    return self.forward_absorb(positions, hidden_states, forward_batch)
             else:
                 return self.forward_absorb(positions, hidden_states, forward_batch)
 
@@ -644,6 +699,149 @@ class DeepseekV2AttentionMLA(nn.Module):
 
         return output
 
+    def forward_absorb_fused_mla_rope(
+        self,
+        positions: torch.Tensor,
+        hidden_states: torch.Tensor,
+        forward_batch: ForwardBatch,
+    ) -> torch.Tensor:
+        enable_rope_fusion = (
+            os.getenv("SGLANG_FUSED_MLA_ENABLE_ROPE_FUSION", "1") == "1"
+        )
+        q_len = hidden_states.shape[0]
+        q_input = hidden_states.new_empty(
+            q_len, self.num_local_heads, self.kv_lora_rank + self.qk_rope_head_dim
+        )
+        if self.q_lora_rank is not None:
+            q = self.q_a_proj(hidden_states)[0]
+            q = self.q_a_layernorm(q)
+            q = self.q_b_proj(q)[0].view(-1, self.num_local_heads, self.qk_head_dim)
+        else:
+            q = self.q_proj(hidden_states)[0].view(
+                -1, self.num_local_heads, self.qk_head_dim
+            )
+        q_nope, q_pe = q.split([self.qk_nope_head_dim, self.qk_rope_head_dim], dim=-1)
+
+        if self.w_kc.dtype == torch.float8_e4m3fnuz:
+            # TODO(kernel): add bmm_fp8 for torch.float8_e4m3fnuz
+            q_nope_out = torch.bmm(
+                q_nope.to(torch.bfloat16).transpose(0, 1),
+                self.w_kc.to(torch.bfloat16) * self.w_scale,
+            )
+        elif self.w_kc.dtype == torch.float8_e4m3fn:
+            q_nope_val, q_nope_scale = input_to_float8(
+                q_nope.transpose(0, 1), torch.float8_e4m3fn
+            )
+            q_nope_out = bmm_fp8(
+                q_nope_val, self.w_kc, q_nope_scale, self.w_scale, torch.bfloat16
+            )
+        else:
+            q_nope_out = torch.bmm(q_nope.transpose(0, 1), self.w_kc)
+        q_input[..., : self.kv_lora_rank] = q_nope_out.transpose(0, 1)
+
+        latent_cache = self.kv_a_proj_with_mqa(hidden_states)[0]
+        v_input = latent_cache[..., : self.kv_lora_rank]
+        v_input = self.kv_a_layernorm(v_input.contiguous()).unsqueeze(1)
+        k_input = latent_cache.unsqueeze(1)
+        k_input[..., : self.kv_lora_rank] = v_input
+
+        if not enable_rope_fusion:
+            k_pe = k_input[..., self.kv_lora_rank :]
+            q_pe, k_pe = self.rotary_emb(positions, q_pe, k_pe)
+            q_input[..., self.kv_lora_rank :] = q_pe
+            k_input[..., self.kv_lora_rank :] = k_pe
+            k_pe_output = None
+        else:
+            k_pe_output = torch.empty_like(k_input[..., self.kv_lora_rank :])
+
+        q_input[..., self.kv_lora_rank :] = q_pe
+
+        # attn_output = self.attn_mqa(q_input, k_input, v_input, forward_batch)
+        # Use Fused ROPE with use_rope=OFF.
+        attn_output = torch.empty(
+            (q_len, self.num_local_heads, self.kv_lora_rank),
+            dtype=q.dtype,
+            device=q.device,
+        )
+        attn_logits, _, kv_indptr, kv_indices, _, _, _ = (
+            forward_batch.attn_backend.forward_metadata
+        )
+        cos_sin_cache = self.rotary_emb.cos_sin_cache
+        num_kv_split = forward_batch.attn_backend.num_kv_splits
+        sm_scale = self.attn_mqa.scaling
+        if attn_logits is None:
+            attn_logits = torch.empty(
+                (
+                    forward_batch.batch_size,
+                    self.num_local_heads,
+                    num_kv_split,
+                    self.kv_lora_rank + 1,
+                ),
+                dtype=torch.float32,
+                device=q.device,
+            )
+
+        # save current latent cache.
+        forward_batch.token_to_kv_pool.set_kv_buffer(
+            self.attn_mqa, forward_batch.out_cache_loc, k_input, None
+        )
+        key_cache_buf = forward_batch.token_to_kv_pool.get_key_buffer(
+            self.attn_mqa.layer_id
+        )
+        val_cache_buf = key_cache_buf[..., : self.kv_lora_rank]
+
+        decode_attention_fwd_grouped_rope(
+            q_input,
+            key_cache_buf,
+            val_cache_buf,
+            attn_output,
+            kv_indptr,
+            kv_indices,
+            k_pe_output,
+            self.kv_lora_rank,
+            self.rotary_emb.rotary_dim,
+            cos_sin_cache,
+            positions,
+            attn_logits,
+            num_kv_split,
+            sm_scale,
+            logit_cap=self.attn_mqa.logit_cap,
+            use_rope=enable_rope_fusion,
+            is_neox_style=self.rotary_emb.is_neox_style,
+        )
+
+        if enable_rope_fusion:
+            k_input[..., self.kv_lora_rank :] = k_pe_output
+            forward_batch.token_to_kv_pool.set_kv_buffer(
+                self.attn_mqa, forward_batch.out_cache_loc, k_input, None
+            )
+
+        attn_output = attn_output.view(-1, self.num_local_heads, self.kv_lora_rank)
+
+        if self.w_vc.dtype == torch.float8_e4m3fnuz:
+            # TODO(kernel): add bmm_fp8 for torch.float8_e4m3fnuz
+            attn_bmm_output = torch.bmm(
+                attn_output.to(torch.bfloat16).transpose(0, 1),
+                self.w_vc.to(torch.bfloat16) * self.w_scale,
+            )
+        elif self.w_vc.dtype == torch.float8_e4m3fn:
+            attn_output_val, attn_output_scale = input_to_float8(
+                attn_output.transpose(0, 1), torch.float8_e4m3fn
+            )
+            attn_bmm_output = bmm_fp8(
+                attn_output_val,
+                self.w_vc,
+                attn_output_scale,
+                self.w_scale,
+                torch.bfloat16,
+            )
+        else:
+            attn_bmm_output = torch.bmm(attn_output.transpose(0, 1), self.w_vc)
+        attn_output = attn_bmm_output.transpose(0, 1).flatten(1, 2)
+        output, _ = self.o_proj(attn_output)
+
+        return output
+
 
 def all_gather(
     input_tensor: torch.Tensor, forward_batch: ForwardBatch, rank, world_size, group
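Both environment variables used by the new ROCm decode path appear in this diff: SGLANG_ROCM_FUSED_DECODE_MLA gates the dispatch in forward(), and SGLANG_FUSED_MLA_ENABLE_ROPE_FUSION controls whether the grouped decode kernel applies RoPE itself. A minimal sketch of opting in before the engine is created; only the variable names are taken from the code above, the launch sequence is illustrative:

import os

# Route decode batches on HIP through forward_absorb_fused_mla_rope (checked in forward() above).
os.environ["SGLANG_ROCM_FUSED_DECODE_MLA"] = "1"

# Keep RoPE fused inside decode_attention_fwd_grouped_rope; defaults to "1".
# Set it to "0" to apply self.rotary_emb separately before the kernel call.
os.environ["SGLANG_FUSED_MLA_ENABLE_ROPE_FUSION"] = "1"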
@@ -651,16 +849,14 @@ def all_gather(
     if world_size == 1:
         return input_tensor
 
-    all_lens = forward_batch.global_num_tokens
-    max_len = max(forward_batch.global_num_tokens)
+    all_lens = forward_batch.global_num_tokens_cpu
+    max_len = max(forward_batch.global_num_tokens_cpu)
 
     padded_tensor = torch.nn.functional.pad(
         input_tensor, (0, 0, 0, max_len - input_tensor.shape[0])
     )
 
-    torch.distributed.all_gather_into_tensor(
-        forward_batch.gathered_buffer, padded_tensor, group=group
-    )
+    group.all_gather_into_tensor(forward_batch.gathered_buffer, padded_tensor)
 
     gathered_tensors = torch.concat(
         [
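The rewritten all_gather keeps the same data-parallel gather pattern but reads token counts from global_num_tokens_cpu and calls all_gather_into_tensor on the group object itself (the get_tp_group() handle stored by the decoder layer below) rather than going through torch.distributed directly. An illustrative single-process sketch of the underlying pad / gather / trim logic, with no process group involved:

import torch

# Token counts per data-parallel rank, as if read from forward_batch.global_num_tokens_cpu.
all_lens = [3, 5, 2]
max_len, hidden = max(all_lens), 8
local = [torch.randn(n, hidden) for n in all_lens]

# Pad every rank to max_len (the torch.nn.functional.pad call above), emulate the
# gathered buffer by stacking, then keep only each rank's valid prefix.
gathered_buffer = torch.stack(
    [torch.nn.functional.pad(t, (0, 0, 0, max_len - t.shape[0])) for t in local]
)
gathered = torch.concat([gathered_buffer[i, :n] for i, n in enumerate(all_lens)])
assert gathered.shape == (sum(all_lens), hidden)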
@@ -683,6 +879,7 @@ class DeepseekV2DecoderLayer(nn.Module):
         layer_id: int,
         quant_config: Optional[QuantizationConfig] = None,
         is_nextn: bool = False,
+        prefix: str = "",
     ) -> None:
         super().__init__()
         self.hidden_size = config.hidden_size
@@ -696,7 +893,7 @@ class DeepseekV2DecoderLayer(nn.Module):
         if self.enable_dp_attention:
             self.tp_rank = get_tensor_model_parallel_rank()
             self.tp_size = get_tensor_model_parallel_world_size()
-            self.tp_group = get_tp_group().device_group
+            self.tp_group = get_tp_group()
         if not global_server_args_dict["disable_mla"]:
             self.self_attn = DeepseekV2AttentionMLA(
                 config=config,
@@ -715,6 +912,7 @@ class DeepseekV2DecoderLayer(nn.Module):
                 quant_config=quant_config,
                 layer_id=layer_id,
                 use_dp=self.enable_dp_attention,
+                prefix=add_prefix("self_attn", prefix),
             )
         else:
             self.self_attn = DeepseekV2Attention(
@@ -733,19 +931,25 @@ class DeepseekV2DecoderLayer(nn.Module):
                 max_position_embeddings=max_position_embeddings,
                 quant_config=quant_config,
                 layer_id=layer_id,
+                prefix=add_prefix("self_attn", prefix),
             )
         if is_nextn or (
             config.n_routed_experts is not None
             and layer_id >= config.first_k_dense_replace
             and layer_id % config.moe_layer_freq == 0
         ):
-            self.mlp = DeepseekV2MoE(config=config, quant_config=quant_config)
+            self.mlp = DeepseekV2MoE(
+                config=config,
+                quant_config=quant_config,
+                prefix=add_prefix("mlp", prefix),
+            )
         else:
             self.mlp = DeepseekV2MLP(
                 hidden_size=config.hidden_size,
                 intermediate_size=config.intermediate_size,
                 hidden_act=config.hidden_act,
                 quant_config=quant_config,
+                prefix=add_prefix("mlp", prefix),
             )
         self.input_layernorm = RMSNorm(config.hidden_size, eps=config.rms_norm_eps)
         self.post_attention_layernorm = RMSNorm(
@@ -797,6 +1001,7 @@ class DeepseekV2Model(nn.Module):
         self,
         config: PretrainedConfig,
         quant_config: Optional[QuantizationConfig] = None,
+        prefix: str = "",
     ) -> None:
         super().__init__()
         self.padding_id = config.pad_token_id
@@ -813,6 +1018,7 @@ class DeepseekV2Model(nn.Module):
                     config,
                     layer_id,
                     quant_config=quant_config,
+                    prefix=add_prefix(f"layers.{layer_id}", prefix),
                 )
                 for layer_id in range(config.num_hidden_layers)
             ]
@@ -843,21 +1049,28 @@ class DeepseekV2ForCausalLM(nn.Module):
         self,
         config: PretrainedConfig,
         quant_config: Optional[QuantizationConfig] = None,
+        prefix: str = "",
     ) -> None:
         super().__init__()
         self.config = config
         self.quant_config = quant_config
-        self.model = DeepseekV2Model(config, quant_config)
+        self.model = DeepseekV2Model(
+            config, quant_config, prefix=add_prefix("model", prefix)
+        )
         if global_server_args_dict["enable_dp_attention"]:
             self.lm_head = ReplicatedLinear(
                 config.hidden_size,
                 config.vocab_size,
                 bias=False,
+                prefix=add_prefix("lm_head", prefix),
             )
             self.logits_processor = LogitsProcessor(config, skip_all_gather=True)
         else:
             self.lm_head = ParallelLMHead(
-                config.vocab_size, config.hidden_size, quant_config=quant_config
+                config.vocab_size,
+                config.hidden_size,
+                quant_config=quant_config,
+                prefix=add_prefix("lm_head", prefix),
             )
             self.logits_processor = LogitsProcessor(config)
 
@@ -989,6 +1202,18 @@ class DeepseekV2ForCausalLM(nn.Module):
                             weight, weight_scale, weight_block_size
                         )
                         self_attn.w_scale = scale
+                if (
+                    hasattr(self.quant_config, "weight_block_size")
+                    and w.dtype == torch.int8
+                ):
+                    weight_block_size = self.quant_config.weight_block_size
+                    if weight_block_size is not None:
+                        assert hasattr(self_attn.kv_b_proj, "weight_scale_inv")
+                        weight = w
+                        weight_scale = self_attn.kv_b_proj.weight_scale_inv
+                        w = int8_block_dequant(
+                            weight, weight_scale, weight_block_size
+                        ).to(torch.bfloat16)
                 w_kc, w_vc = w.unflatten(
                     0, (-1, self_attn.qk_nope_head_dim + self_attn.v_head_dim)
                 ).split([self_attn.qk_nope_head_dim, self_attn.v_head_dim], dim=1)
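int8_block_dequant reverses block-wise int8 quantization before the kv_b_proj weight is split into w_kc / w_vc: the int8 weight carries one floating-point scale per block (matching the block_shape=[128, 128] kernel configs added elsewhere in this release), and dequantization multiplies each block by its scale. A minimal reference sketch of that operation; the real implementation in sglang/srt/layers/quantization/int8_utils.py may differ in layout and is likely kernel-backed:

import torch

def block_dequant_reference(weight_q, scale, block_shape):
    # weight_q: int8 [N, K]; scale: float [ceil(N / bn), ceil(K / bk)], one value per block.
    bn, bk = block_shape
    n, k = weight_q.shape
    scale_full = (
        scale.repeat_interleave(bn, dim=0)[:n].repeat_interleave(bk, dim=1)[:, :k]
    )
    return weight_q.to(torch.float32) * scale_full

w_q = torch.randint(-128, 127, (256, 512), dtype=torch.int8)
s = torch.rand(256 // 128, 512 // 128)  # per-block scales
w = block_dequant_reference(w_q, s, (128, 128)).to(torch.bfloat16)
assert w.shape == w_q.shape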
@@ -1002,6 +1227,17 @@ class DeepseekV2ForCausalLM(nn.Module):
                 if is_hip_:
                     self_attn.w_scale *= 2.0
 
+    def get_embed_and_head(self):
+        return self.model.embed_tokens.weight, self.lm_head.weight
+
+    def set_embed_and_head(self, embed, head):
+        del self.model.embed_tokens.weight
+        del self.lm_head.weight
+        self.model.embed_tokens.weight = embed
+        self.lm_head.weight = head
+        torch.cuda.empty_cache()
+        torch.cuda.synchronize()
+
 
 class DeepseekV3ForCausalLM(DeepseekV2ForCausalLM):
     pass