sglang 0.4.5.post1__py3-none-any.whl → 0.4.5.post2__py3-none-any.whl

This diff compares the contents of two publicly released package versions as they appear in their public registry. It is provided for informational purposes only.
Files changed (95)
  1. sglang/__init__.py +2 -4
  2. sglang/bench_one_batch.py +2 -2
  3. sglang/bench_serving.py +0 -4
  4. sglang/lang/backend/anthropic.py +0 -4
  5. sglang/lang/backend/base_backend.py +1 -1
  6. sglang/lang/backend/openai.py +1 -1
  7. sglang/lang/backend/vertexai.py +0 -1
  8. sglang/lang/compiler.py +1 -7
  9. sglang/lang/tracer.py +3 -7
  10. sglang/srt/_custom_ops.py +0 -2
  11. sglang/srt/constrained/outlines_jump_forward.py +14 -1
  12. sglang/srt/constrained/triton_ops/bitmask_ops.py +141 -0
  13. sglang/srt/constrained/xgrammar_backend.py +26 -4
  14. sglang/srt/custom_op.py +0 -62
  15. sglang/srt/disaggregation/decode.py +62 -6
  16. sglang/srt/disaggregation/mini_lb.py +5 -1
  17. sglang/srt/disaggregation/mooncake/conn.py +32 -62
  18. sglang/srt/disaggregation/mooncake/transfer_engine.py +30 -61
  19. sglang/srt/disaggregation/prefill.py +40 -4
  20. sglang/srt/disaggregation/utils.py +15 -0
  21. sglang/srt/entrypoints/verl_engine.py +7 -5
  22. sglang/srt/layers/activation.py +6 -8
  23. sglang/srt/layers/attention/flashattention_backend.py +114 -71
  24. sglang/srt/layers/attention/flashinfer_backend.py +5 -2
  25. sglang/srt/layers/attention/torch_native_backend.py +6 -1
  26. sglang/srt/layers/attention/triton_backend.py +6 -0
  27. sglang/srt/layers/attention/triton_ops/extend_attention.py +13 -2
  28. sglang/srt/layers/layernorm.py +1 -1
  29. sglang/srt/layers/linear.py +17 -3
  30. sglang/srt/layers/moe/ep_moe/layer.py +15 -29
  31. sglang/srt/layers/moe/fused_moe_native.py +4 -0
  32. sglang/srt/layers/moe/fused_moe_triton/fused_moe.py +14 -19
  33. sglang/srt/layers/moe/fused_moe_triton/layer.py +7 -0
  34. sglang/srt/layers/moe/topk.py +27 -30
  35. sglang/srt/layers/parameter.py +0 -2
  36. sglang/srt/layers/quantization/__init__.py +1 -0
  37. sglang/srt/layers/quantization/blockwise_int8.py +2 -0
  38. sglang/srt/layers/quantization/compressed_tensors/compressed_tensors.py +8 -2
  39. sglang/srt/layers/quantization/compressed_tensors/compressed_tensors_moe.py +16 -44
  40. sglang/srt/layers/quantization/compressed_tensors/schemes/compressed_tensors_w8a8_fp8.py +4 -7
  41. sglang/srt/layers/quantization/fp8.py +115 -132
  42. sglang/srt/layers/quantization/fp8_kernel.py +213 -57
  43. sglang/srt/layers/quantization/fp8_utils.py +187 -262
  44. sglang/srt/layers/quantization/moe_wna16.py +2 -0
  45. sglang/srt/layers/quantization/utils.py +5 -11
  46. sglang/srt/layers/quantization/w8a8_fp8.py +2 -0
  47. sglang/srt/layers/quantization/w8a8_int8.py +7 -7
  48. sglang/srt/layers/radix_attention.py +15 -0
  49. sglang/srt/layers/rotary_embedding.py +3 -2
  50. sglang/srt/layers/sampler.py +5 -10
  51. sglang/srt/lora/backend/base_backend.py +18 -2
  52. sglang/srt/lora/backend/flashinfer_backend.py +1 -1
  53. sglang/srt/lora/backend/triton_backend.py +1 -1
  54. sglang/srt/lora/layers.py +1 -1
  55. sglang/srt/lora/lora.py +1 -1
  56. sglang/srt/lora/lora_manager.py +1 -1
  57. sglang/srt/managers/detokenizer_manager.py +0 -1
  58. sglang/srt/managers/io_struct.py +1 -0
  59. sglang/srt/managers/mm_utils.py +4 -3
  60. sglang/srt/managers/multimodal_processor.py +0 -2
  61. sglang/srt/managers/multimodal_processors/base_processor.py +3 -2
  62. sglang/srt/managers/schedule_batch.py +2 -4
  63. sglang/srt/managers/scheduler.py +12 -71
  64. sglang/srt/managers/tokenizer_manager.py +1 -0
  65. sglang/srt/mem_cache/hiradix_cache.py +5 -1
  66. sglang/srt/mem_cache/memory_pool.py +7 -2
  67. sglang/srt/model_executor/cuda_graph_runner.py +2 -2
  68. sglang/srt/model_executor/model_runner.py +20 -27
  69. sglang/srt/models/bert.py +398 -0
  70. sglang/srt/models/deepseek.py +1 -1
  71. sglang/srt/models/deepseek_nextn.py +74 -70
  72. sglang/srt/models/deepseek_v2.py +289 -348
  73. sglang/srt/models/llama.py +5 -5
  74. sglang/srt/models/minicpm3.py +29 -201
  75. sglang/srt/models/qwen2.py +4 -1
  76. sglang/srt/models/qwen2_moe.py +14 -13
  77. sglang/srt/models/qwen3.py +335 -0
  78. sglang/srt/models/qwen3_moe.py +423 -0
  79. sglang/srt/reasoning_parser.py +0 -1
  80. sglang/srt/sampling/sampling_batch_info.py +2 -3
  81. sglang/srt/server_args.py +34 -32
  82. sglang/srt/speculative/eagle_worker.py +4 -7
  83. sglang/srt/utils.py +16 -1
  84. sglang/test/runners.py +5 -1
  85. sglang/test/test_block_fp8.py +167 -0
  86. sglang/test/test_custom_ops.py +1 -1
  87. sglang/version.py +1 -1
  88. {sglang-0.4.5.post1.dist-info → sglang-0.4.5.post2.dist-info}/METADATA +3 -3
  89. {sglang-0.4.5.post1.dist-info → sglang-0.4.5.post2.dist-info}/RECORD +92 -91
  90. {sglang-0.4.5.post1.dist-info → sglang-0.4.5.post2.dist-info}/WHEEL +1 -1
  91. sglang/lang/__init__.py +0 -0
  92. sglang/srt/lora/backend/__init__.py +0 -25
  93. sglang/srt/server.py +0 -18
  94. {sglang-0.4.5.post1.dist-info → sglang-0.4.5.post2.dist-info}/licenses/LICENSE +0 -0
  95. {sglang-0.4.5.post1.dist-info → sglang-0.4.5.post2.dist-info}/top_level.txt +0 -0
sglang/srt/models/bert.py
@@ -0,0 +1,398 @@
+# SPDX-License-Identifier: Apache-2.0
+from typing import Any, Dict, Iterable, Optional, Set, Tuple
+
+import torch
+from torch import nn
+
+from sglang.srt.distributed import get_tensor_model_parallel_world_size
+from sglang.srt.layers.activation import get_act_fn
+from sglang.srt.layers.linear import (
+    ColumnParallelLinear,
+    QKVParallelLinear,
+    RowParallelLinear,
+)
+from sglang.srt.layers.pooler import EmbeddingPoolerOutput, Pooler, PoolingType
+from sglang.srt.layers.quantization.base_config import QuantizationConfig
+from sglang.srt.layers.radix_attention import AttentionType, RadixAttention
+from sglang.srt.layers.vocab_parallel_embedding import VocabParallelEmbedding
+from sglang.srt.model_executor.forward_batch_info import ForwardBatch
+from sglang.srt.model_loader.weight_utils import default_weight_loader
+
+BertConfig = None
+
+
+class BertEmbedding(nn.Module):
+
+    def __init__(self, config: BertConfig):
+
+        super().__init__()
+        self.size = config.hidden_size
+        self.word_embeddings = VocabParallelEmbedding(
+            config.vocab_size, config.hidden_size
+        )
+        self.position_embeddings = VocabParallelEmbedding(
+            config.max_position_embeddings, config.hidden_size
+        )
+        self.token_type_embeddings = VocabParallelEmbedding(
+            config.type_vocab_size, config.hidden_size
+        )
+        self.LayerNorm = nn.LayerNorm(config.hidden_size, eps=config.layer_norm_eps)
+        self.position_ids = nn.Parameter(
+            torch.empty((1, config.max_position_embeddings)),
+        )
+
+        self.position_embedding_type = config.position_embedding_type
+        if self.position_embedding_type != "absolute":
+            raise ValueError(
+                "Only 'absolute' position_embedding_type" + " is supported"
+            )
+
+    def forward(
+        self,
+        input_ids: torch.Tensor,
+        position_ids: torch.Tensor,
+    ) -> torch.Tensor:
+        input_shape = input_ids.size()
+
+        # Input embeddings.
+        inputs_embeds = self.word_embeddings(input_ids)
+
+        # Position embeddings.
+        position_embeddings = self.position_embeddings(position_ids)
+
+        token_type_ids = torch.zeros(
+            input_shape, dtype=torch.long, device=inputs_embeds.device
+        )
+
+        token_type_embeddings = self.token_type_embeddings(token_type_ids)
+
+        embeddings = inputs_embeds + token_type_embeddings + position_embeddings
+        embeddings = self.LayerNorm(embeddings)
+        return embeddings
+
+
+class BertEncoder(nn.Module):
+
+    def __init__(
+        self,
+        config: BertConfig,
+        quant_config: Optional[QuantizationConfig] = None,
+        prefix: str = "",
+    ):
+        super().__init__()
+        self.config = config
+        self.quant_config = quant_config
+        self.layer = nn.ModuleList(
+            [
+                BertLayer(
+                    config=config,
+                    layer_id=layer_idx,
+                    quant_config=quant_config,
+                    prefix=f"{prefix}.layer.{layer_idx}",
+                )
+                for layer_idx in range(config.num_hidden_layers)
+            ]
+        )
+
+    def forward(
+        self, hidden_states: torch.Tensor, forward_batch: ForwardBatch
+    ) -> torch.Tensor:
+        for layer in self.layer:
+            hidden_states = layer(hidden_states, forward_batch)
+        return hidden_states
+
+
+class BertLayer(nn.Module):
+
+    def __init__(
+        self,
+        config: BertConfig,
+        layer_id: int = 0,
+        quant_config: Optional[QuantizationConfig] = None,
+        prefix: str = "",
+    ):
+        super().__init__()
+
+        self.attention = BertAttention(
+            hidden_size=config.hidden_size,
+            num_attention_heads=config.num_attention_heads,
+            layer_id=layer_id,
+            layer_norm_eps=config.layer_norm_eps,
+            quant_config=quant_config,
+            prefix=f"{prefix}.attention",
+        )
+
+        self.intermediate = BertIntermediate(
+            hidden_size=config.hidden_size,
+            intermediate_size=config.intermediate_size,
+            hidden_act=config.hidden_act,
+            quant_config=quant_config,
+            prefix=f"{prefix}.intermediate",
+        )
+
+        self.output = BertOutput(
+            hidden_size=config.hidden_size,
+            intermediate_size=config.intermediate_size,
+            layer_norm_eps=config.layer_norm_eps,
+            quant_config=quant_config,
+            prefix=f"{prefix}.output",
+        )
+
+    def forward(self, hidden_states: torch.Tensor, forward_batch: ForwardBatch):
+        attn_output = self.attention(hidden_states, forward_batch)
+        intermediate_output = self.intermediate(attn_output)
+        output = self.output(intermediate_output, attn_output)
+        return output
+
+
+class BertAttention(nn.Module):
+
+    def __init__(
+        self,
+        hidden_size: int,
+        num_attention_heads: int,
+        layer_norm_eps: float,
+        layer_id: int = 0,
+        quant_config: Optional[QuantizationConfig] = None,
+        prefix: str = "",
+    ):
+        super().__init__()
+
+        self.self_attn = BertSelfAttention(
+            hidden_size=hidden_size,
+            num_attention_heads=num_attention_heads,
+            layer_id=layer_id,
+            quant_config=quant_config,
+            prefix=f"{prefix}.output",
+        )
+
+        self.output = BertSelfOutput(
+            hidden_size=hidden_size,
+            layer_norm_eps=layer_norm_eps,
+            quant_config=quant_config,
+            prefix=f"{prefix}.output",
+        )
+
+    def forward(
+        self, hidden_states: torch.Tensor, forward_batch: ForwardBatch
+    ) -> torch.Tensor:
+        self_output = self.self_attn(hidden_states, forward_batch)
+        return self.output(self_output, hidden_states)
+
+
+class BertSelfAttention(nn.Module):
+
+    def __init__(
+        self,
+        hidden_size: int,
+        num_attention_heads: int,
+        layer_id: int = 0,
+        quant_config: Optional[QuantizationConfig] = None,
+        prefix: str = "",
+    ):
+        super().__init__()
+        self.hidden_size = hidden_size
+        tp_size = get_tensor_model_parallel_world_size()
+
+        self.total_num_heads = num_attention_heads
+        assert self.total_num_heads % tp_size == 0
+
+        self.num_heads = self.total_num_heads // tp_size
+        self.total_num_kv_heads = self.total_num_heads
+        self.head_dim = self.hidden_size // self.total_num_heads
+        assert self.head_dim * self.total_num_heads == self.hidden_size
+
+        self.num_kv_heads = max(1, self.total_num_kv_heads // tp_size)
+
+        self.q_size = self.num_heads * self.head_dim
+        self.kv_size = self.num_kv_heads * self.head_dim
+        self.scaling = self.head_dim**-0.5
+        self.qkv_proj = QKVParallelLinear(
+            hidden_size=self.hidden_size,
+            head_size=self.head_dim,
+            total_num_heads=self.total_num_heads,
+            total_num_kv_heads=self.total_num_kv_heads,
+            bias=True,
+            quant_config=quant_config,
+            prefix=f"{prefix}.qkv_proj",
+        )
+
+        self.attn = RadixAttention(
+            num_heads=self.num_heads,
+            head_dim=self.head_dim,
+            scaling=self.scaling,
+            num_kv_heads=self.num_kv_heads,
+            layer_id=layer_id,
+            prefix=f"{prefix}.attn",
+            attn_type=AttentionType.ENCODER_ONLY,
+        )
+
+    def forward(
+        self, hidden_states: torch.Tensor, forward_batch: ForwardBatch
+    ) -> torch.Tensor:
+        qkv, _ = self.qkv_proj(hidden_states)
+        q, k, v = qkv.split([self.q_size, self.kv_size, self.kv_size], dim=-1)
+        output = self.attn(q, k, v, forward_batch)
+        return output
+
+
+class BertSelfOutput(nn.Module):
+
+    def __init__(
+        self,
+        hidden_size: int,
+        layer_norm_eps: float,
+        quant_config: Optional[QuantizationConfig] = None,
+        prefix: str = "",
+    ):
+        super().__init__()
+        self.dense = RowParallelLinear(
+            input_size=hidden_size,
+            output_size=hidden_size,
+            bias=True,
+            quant_config=quant_config,
+            prefix=f"{prefix}.dense",
+        )
+        self.LayerNorm = nn.LayerNorm(hidden_size, eps=layer_norm_eps)
+
+    def forward(
+        self, hidden_states: torch.Tensor, input_tensor: torch.Tensor
+    ) -> torch.Tensor:
+        hidden_states, _ = self.dense(hidden_states)
+        hidden_states = self.LayerNorm(hidden_states + input_tensor)
+        return hidden_states
+
+
+class BertIntermediate(nn.Module):
+
+    def __init__(
+        self,
+        hidden_size: int,
+        intermediate_size: int,
+        hidden_act: str,
+        quant_config: Optional[QuantizationConfig] = None,
+        prefix: str = "",
+    ):
+        super().__init__()
+        self.dense = ColumnParallelLinear(
+            input_size=hidden_size,
+            output_size=intermediate_size,
+            bias=True,
+            quant_config=quant_config,
+            prefix=f"{prefix}.dense",
+        )
+        self.intermediate_act_fn = get_act_fn(hidden_act)
+
+    def forward(self, hidden_states: torch.Tensor) -> torch.Tensor:
+        hidden_states, _ = self.dense(hidden_states)
+        hidden_states = self.intermediate_act_fn(hidden_states)
+        return hidden_states
+
+
+class BertOutput(nn.Module):
+
+    def __init__(
+        self,
+        hidden_size: int,
+        intermediate_size: int,
+        layer_norm_eps: float,
+        quant_config: Optional[QuantizationConfig] = None,
+        prefix: str = "",
+    ):
+        super().__init__()
+
+        self.dense = RowParallelLinear(
+            input_size=intermediate_size,
+            output_size=hidden_size,
+            bias=True,
+            quant_config=quant_config,
+            prefix=f"{prefix}.dense",
+        )
+
+        self.LayerNorm = nn.LayerNorm(hidden_size, eps=layer_norm_eps)
+
+    def forward(
+        self, hidden_states: torch.Tensor, input_tensor: torch.Tensor
+    ) -> torch.Tensor:
+        hidden_states, _ = self.dense(hidden_states)
+        hidden_states = self.LayerNorm(hidden_states + input_tensor)
+        return hidden_states
+
+
+class BertModel(nn.Module):
+
+    def __init__(
+        self,
+        *,
+        config: BertConfig,
+        quant_config: Optional[QuantizationConfig] = None,
+        prefix: str = "",
+    ):
+        super().__init__()
+        self.config = config
+        self.embeddings = BertEmbedding(config)
+        self.encoder = BertEncoder(
+            config=config, quant_config=quant_config, prefix=f"encoder"
+        )
+        self.pooler = Pooler(pooling_type=PoolingType.LAST, normalize=True)
+        # self.pooler = BertPooler(config)
+
+    @torch.no_grad()
+    def forward(
+        self,
+        input_ids: torch.Tensor,
+        positions: torch.Tensor,
+        forward_batch: ForwardBatch,
+        input_embeds: torch.Tensor = None,
+        get_embedding: bool = False,
+    ) -> torch.Tensor:
+        assert get_embedding == True
+        # Your tokenized IDs
+
+        hidden_states = self.embeddings(
+            input_ids=input_ids,
+            position_ids=positions,
+        )
+
+        hidden_states = self.encoder(hidden_states, forward_batch=forward_batch)
+        return self.pooler(hidden_states, forward_batch)
+
+    def load_weights(self, weights: Iterable[Tuple[str, torch.Tensor]]) -> Set[str]:
+        stacked_params_mapping = [
+            # (param_name, shard_name, shard_id)
+            ("qkv_proj", "query", "q"),
+            ("qkv_proj", "key", "k"),
+            ("qkv_proj", "value", "v"),
+        ]
+
+        params_dict = dict(self.named_parameters())
+        for name, loaded_weight in weights:
+            name = name.replace("self", "self_attn")
+            if "pooler" in name:
+                continue
+            for param_name, weight_name, shard_id in stacked_params_mapping:
+
+                if weight_name not in name:
+                    continue
+                name = name.replace(weight_name, param_name)
+                # Skip loading extra bias for GPTQ models.
+                if name.endswith(".bias") and name not in params_dict:
+                    continue
+                param = params_dict[name]
+                weight_loader = param.weight_loader
+                weight_loader(param, loaded_weight, shard_id)
+                break
+            else:
+                # Skip loading extra bias for GPTQ models.
+                if name.endswith(".bias") and name not in params_dict:
+                    continue
+                param = params_dict[name]
+                weight_loader = getattr(param, "weight_loader", default_weight_loader)
+                weight_loader(param, loaded_weight)
+
+
+class Contriever(BertModel):
+    pass
+
+
+EntryClass = [BertModel, Contriever]
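The new `sglang/srt/models/bert.py` above registers `BertModel` and `Contriever` as encoder-only embedding models. For readers less familiar with the packed-QKV pattern that `BertSelfAttention.forward` relies on, here is a standalone sketch in plain PyTorch (tensor-parallel size assumed to be 1, sizes and names chosen for illustration only, not taken from the diff):

```python
import torch
from torch import nn

hidden_size, num_heads = 768, 12
head_dim = hidden_size // num_heads
q_size = kv_size = num_heads * head_dim  # BERT uses no GQA, so kv_size == q_size

# One fused projection produces Q, K, and V in a single matmul.
qkv_proj = nn.Linear(hidden_size, q_size + 2 * kv_size, bias=True)

x = torch.randn(4, hidden_size)  # 4 token embeddings
qkv = qkv_proj(x)
q, k, v = qkv.split([q_size, kv_size, kv_size], dim=-1)  # same split as in the diff
assert q.shape == (4, q_size) and k.shape == v.shape == (4, kv_size)
```

Fusing the three projections into one GEMM is why the checkpoint's separate `query`/`key`/`value` weights are remapped onto `qkv_proj` shards in `load_weights` via `stacked_params_mapping`.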
sglang/srt/models/deepseek.py
@@ -170,7 +170,7 @@ class DeepseekMoE(nn.Module):
         shared_output = self.shared_experts(hidden_states)
         # router_logits: (num_tokens, n_experts)
         router_logits, _ = self.gate(hidden_states)
-        final_hidden_states = fused_moe(
+        final_hidden_states = fused_moe.fused_moe(
             hidden_states,
             self.w1,
             self.w2,
sglang/srt/models/deepseek_nextn.py
@@ -40,7 +40,7 @@ from sglang.srt.managers.schedule_batch import global_server_args_dict
 from sglang.srt.model_executor.forward_batch_info import ForwardBatch
 from sglang.srt.model_loader.weight_utils import default_weight_loader
 from sglang.srt.models.deepseek_v2 import DeepseekV2DecoderLayer, DeepseekV3ForCausalLM
-from sglang.srt.utils import add_prefix, is_cuda, is_hip
+from sglang.srt.utils import BumpAllocator, add_prefix, is_cuda, is_hip
 
 _is_hip = is_hip()
 _is_cuda = is_cuda()
@@ -48,7 +48,7 @@ _is_cuda = is_cuda()
 if _is_cuda:
     from sgl_kernel import awq_dequantize
 else:
-    from vllm import _custom_ops as ops
+    from vllm._custom_ops import awq_dequantize
 
 
 class DeepseekModelNextN(nn.Module):
@@ -91,6 +91,14 @@ class DeepseekModelNextN(nn.Module):
         forward_batch: ForwardBatch,
         input_embeds: torch.Tensor = None,
     ) -> torch.Tensor:
+        zero_allocator = BumpAllocator(
+            buffer_size=2,
+            dtype=torch.float32,
+            device=(
+                input_embeds.device if input_embeds is not None else input_ids.device
+            ),
+        )
+
         if input_embeds is None:
             hidden_states = self.embed_tokens(input_ids)
         else:
@@ -108,7 +116,7 @@ class DeepseekModelNextN(nn.Module):
 
         residual = None
         hidden_states, residual = self.decoder(
-            positions, hidden_states, forward_batch, residual
+            positions, hidden_states, forward_batch, residual, zero_allocator
        )
 
         if not forward_batch.forward_mode.is_idle():
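The two hunks above thread a small `BumpAllocator` through the NextN forward pass so that downstream layers can hand out slices of one preallocated zero buffer instead of calling `torch.zeros` per layer. A minimal sketch of the bump-allocation idea (illustrative only; this is not the actual `sglang.srt.utils.BumpAllocator` implementation, and only the constructor arguments shown in the diff are assumed):

```python
import torch


class TinyBumpAllocator:
    def __init__(self, buffer_size: int, dtype: torch.dtype, device: torch.device):
        # One upfront allocation; callers receive views into it instead of new tensors.
        self._buffer = torch.zeros(buffer_size, dtype=dtype, device=device)
        self._offset = 0

    def allocate(self, size: int) -> torch.Tensor:
        assert self._offset + size <= self._buffer.numel(), "bump buffer exhausted"
        out = self._buffer[self._offset : self._offset + size]
        self._offset += size
        return out


alloc = TinyBumpAllocator(buffer_size=2, dtype=torch.float32, device=torch.device("cpu"))
zero_scale = alloc.allocate(1)  # reusable zero tensor, no per-layer allocation
```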
@@ -262,79 +270,75 @@ class DeepseekV3ForCausalLMNextN(DeepseekV3ForCausalLM):
                 )
                 weight_loader(param, loaded_weight)
 
-        if not global_server_args_dict["disable_mla"]:
-            self_attn = self.model.decoder.self_attn
-            if hasattr(self_attn.kv_b_proj, "qweight"):
-                # AWQ compatible
-                if _is_cuda:
-                    w = awq_dequantize(
-                        self_attn.kv_b_proj.qweight,
-                        self_attn.kv_b_proj.scales,
-                        self_attn.kv_b_proj.qzeros,
-                    ).T
-                else:
-                    w = ops.awq_dequantize(
-                        self_attn.kv_b_proj.qweight,
-                        self_attn.kv_b_proj.scales,
-                        self_attn.kv_b_proj.qzeros,
-                        0,
-                        0,
-                        0,
-                    ).T
+        self_attn = self.model.decoder.self_attn
+        if hasattr(self_attn.kv_b_proj, "qweight"):
+            # AWQ compatible
+            if _is_cuda:
+                w = awq_dequantize(
+                    self_attn.kv_b_proj.qweight,
+                    self_attn.kv_b_proj.scales,
+                    self_attn.kv_b_proj.qzeros,
+                ).T
             else:
-                w = self_attn.kv_b_proj.weight
-            # NOTE(HandH1998): Since `bmm_fp8` only supports per-tensor scale, we have to requantize `self_attn.kv_b_proj`.
-            # This may affect the accuracy of fp8 model.
-            if hasattr(self.quant_config, "weight_block_size") and w.dtype in (
-                torch.float8_e4m3fn,
-                torch.float8_e4m3fnuz,
-            ):
+                w = awq_dequantize(
+                    self_attn.kv_b_proj.qweight,
+                    self_attn.kv_b_proj.scales,
+                    self_attn.kv_b_proj.qzeros,
+                    0,
+                    0,
+                    0,
+                ).T
+        else:
+            w = self_attn.kv_b_proj.weight
+        # NOTE(HandH1998): Since `bmm_fp8` only supports per-tensor scale, we have to requantize `self_attn.kv_b_proj`.
+        # This may affect the accuracy of fp8 model.
+        if hasattr(self.quant_config, "weight_block_size") and w.dtype in (
+            torch.float8_e4m3fn,
+            torch.float8_e4m3fnuz,
+        ):
+            weight_block_size = self.quant_config.weight_block_size
+            if weight_block_size is not None:
+                assert hasattr(self_attn.kv_b_proj, "weight_scale_inv")
+                if _is_hip:
+                    weight, weight_scale, _ = normalize_e4m3fn_to_e4m3fnuz(
+                        weight=w,
+                        weight_scale=self_attn.kv_b_proj.weight_scale_inv,
+                        input_scale=None,
+                    )
+                else:
+                    weight = w
+                    weight_scale = self_attn.kv_b_proj.weight_scale_inv
+
+                w, scale = block_quant_to_tensor_quant(
+                    weight, weight_scale, weight_block_size
+                )
+                self_attn.w_scale = scale
+        if w.dtype == torch.int8:
+            if hasattr(self.quant_config, "weight_block_size"):
+                # block-wise int8 need it
                 weight_block_size = self.quant_config.weight_block_size
                 if weight_block_size is not None:
                     assert hasattr(self_attn.kv_b_proj, "weight_scale_inv")
-                    if _is_hip:
-                        weight, weight_scale, _ = normalize_e4m3fn_to_e4m3fnuz(
-                            weight=w,
-                            weight_scale=self_attn.kv_b_proj.weight_scale_inv,
-                            input_scale=None,
-                        )
-                    else:
-                        weight = w
-                        weight_scale = self_attn.kv_b_proj.weight_scale_inv
-
-                    w, scale = block_quant_to_tensor_quant(
-                        weight, weight_scale, weight_block_size
-                    )
-                    self_attn.w_scale = scale
-        if w.dtype == torch.int8:
-            if hasattr(self.quant_config, "weight_block_size"):
-                # block-wise int8 need it
-                weight_block_size = self.quant_config.weight_block_size
-                if weight_block_size is not None:
-                    assert hasattr(self_attn.kv_b_proj, "weight_scale_inv")
-                    weight = w
-                    weight_scale = self_attn.kv_b_proj.weight_scale_inv
-                    w = int8_block_dequant(
-                        weight, weight_scale, weight_block_size
-                    ).to(torch.bfloat16)
-                else:
-                    # channel-wise int8 need it
-                    assert hasattr(self_attn.kv_b_proj, "weight_scale")
-                    w = w.to(torch.bfloat16) * self_attn.kv_b_proj.weight_scale.to(
+                    weight = w
+                    weight_scale = self_attn.kv_b_proj.weight_scale_inv
+                    w = int8_block_dequant(weight, weight_scale, weight_block_size).to(
                         torch.bfloat16
                     )
-        w_kc, w_vc = w.unflatten(
-            0, (-1, self_attn.qk_nope_head_dim + self_attn.v_head_dim)
-        ).split([self_attn.qk_nope_head_dim, self_attn.v_head_dim], dim=1)
-        self_attn.w_kc = w_kc.transpose(1, 2).contiguous().transpose(1, 2)
-        self_attn.w_vc = w_vc.contiguous().transpose(1, 2)
-        if (
-            hasattr(self_attn.kv_b_proj, "weight_scale")
-            and self_attn.w_scale is None
-        ):
-            self_attn.w_scale = self_attn.kv_b_proj.weight_scale
-            if _is_hip:
-                self_attn.w_scale *= 2.0
+            else:
+                # channel-wise int8 need it
+                assert hasattr(self_attn.kv_b_proj, "weight_scale")
+                w = w.to(torch.bfloat16) * self_attn.kv_b_proj.weight_scale.to(
+                    torch.bfloat16
+                )
+        w_kc, w_vc = w.unflatten(
+            0, (-1, self_attn.qk_nope_head_dim + self_attn.v_head_dim)
+        ).split([self_attn.qk_nope_head_dim, self_attn.v_head_dim], dim=1)
+        self_attn.w_kc = w_kc.transpose(1, 2).contiguous().transpose(1, 2)
+        self_attn.w_vc = w_vc.contiguous().transpose(1, 2)
+        if hasattr(self_attn.kv_b_proj, "weight_scale") and self_attn.w_scale is None:
+            self_attn.w_scale = self_attn.kv_b_proj.weight_scale
+            if _is_hip:
+                self_attn.w_scale *= 2.0
 
 
 EntryClass = [DeepseekV3ForCausalLMNextN]
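The restructured weight post-processing above dequantizes `kv_b_proj` before splitting it into `w_kc`/`w_vc`. As a rough reference for what block-wise dequantization means in this context, here is a minimal sketch (reference math only, with hypothetical 128x128 tiles; it is not the `int8_block_dequant` or fp8 kernels shipped in this release, and the tiling convention is an assumption):

```python
import torch


def block_dequant_ref(w_q: torch.Tensor, scale: torch.Tensor, block: tuple) -> torch.Tensor:
    # w_q: quantized weight [N, K]; scale: one fp32 scale per (block_n x block_k) tile.
    bn, bk = block
    scale_full = scale.repeat_interleave(bn, dim=0).repeat_interleave(bk, dim=1)
    scale_full = scale_full[: w_q.shape[0], : w_q.shape[1]]  # trim padding on edge tiles
    return w_q.to(torch.float32) * scale_full


w_q = torch.randint(-128, 127, (256, 512), dtype=torch.int8)
scale = torch.rand(2, 4)  # 2x4 tiles of 128x128 covering a 256x512 weight
w = block_dequant_ref(w_q, scale, (128, 128)).to(torch.bfloat16)
```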