sglang 0.4.0.post1__py3-none-any.whl → 0.4.1__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (74)
  1. sglang/bench_offline_throughput.py +6 -6
  2. sglang/bench_one_batch.py +1 -0
  3. sglang/bench_serving.py +9 -1
  4. sglang/check_env.py +140 -48
  5. sglang/lang/backend/runtime_endpoint.py +1 -0
  6. sglang/lang/chat_template.py +32 -0
  7. sglang/llama3_eval.py +316 -0
  8. sglang/srt/aio_rwlock.py +100 -0
  9. sglang/srt/configs/model_config.py +8 -1
  10. sglang/srt/constrained/xgrammar_backend.py +4 -1
  11. sglang/srt/layers/attention/flashinfer_backend.py +51 -5
  12. sglang/srt/layers/attention/triton_backend.py +16 -25
  13. sglang/srt/layers/attention/triton_ops/decode_attention.py +305 -350
  14. sglang/srt/layers/linear.py +20 -2
  15. sglang/srt/layers/logits_processor.py +133 -95
  16. sglang/srt/layers/{ep_moe → moe/ep_moe}/layer.py +18 -39
  17. sglang/srt/layers/moe/fused_moe_native.py +46 -0
  18. sglang/srt/layers/{fused_moe_triton → moe/fused_moe_triton}/__init__.py +3 -7
  19. sglang/srt/layers/{fused_moe_triton → moe/fused_moe_triton}/fused_moe.py +174 -119
  20. sglang/srt/layers/{fused_moe_triton → moe/fused_moe_triton}/layer.py +17 -49
  21. sglang/srt/layers/moe/topk.py +191 -0
  22. sglang/srt/layers/quantization/__init__.py +5 -50
  23. sglang/srt/layers/quantization/fp8.py +221 -36
  24. sglang/srt/layers/quantization/fp8_kernel.py +278 -0
  25. sglang/srt/layers/quantization/fp8_utils.py +90 -1
  26. sglang/srt/layers/radix_attention.py +8 -1
  27. sglang/srt/layers/sampler.py +27 -5
  28. sglang/srt/layers/torchao_utils.py +31 -0
  29. sglang/srt/managers/detokenizer_manager.py +37 -17
  30. sglang/srt/managers/io_struct.py +39 -10
  31. sglang/srt/managers/schedule_batch.py +54 -34
  32. sglang/srt/managers/schedule_policy.py +64 -5
  33. sglang/srt/managers/scheduler.py +171 -136
  34. sglang/srt/managers/tokenizer_manager.py +184 -133
  35. sglang/srt/mem_cache/base_prefix_cache.py +2 -2
  36. sglang/srt/mem_cache/chunk_cache.py +2 -2
  37. sglang/srt/mem_cache/memory_pool.py +15 -8
  38. sglang/srt/mem_cache/radix_cache.py +12 -2
  39. sglang/srt/model_executor/cuda_graph_runner.py +25 -11
  40. sglang/srt/model_executor/model_runner.py +28 -14
  41. sglang/srt/model_parallel.py +66 -5
  42. sglang/srt/models/dbrx.py +1 -1
  43. sglang/srt/models/deepseek.py +1 -1
  44. sglang/srt/models/deepseek_v2.py +67 -18
  45. sglang/srt/models/gemma2.py +34 -0
  46. sglang/srt/models/gemma2_reward.py +0 -1
  47. sglang/srt/models/granite.py +517 -0
  48. sglang/srt/models/grok.py +73 -9
  49. sglang/srt/models/llama.py +22 -0
  50. sglang/srt/models/llama_classification.py +11 -23
  51. sglang/srt/models/llama_reward.py +0 -2
  52. sglang/srt/models/llava.py +37 -14
  53. sglang/srt/models/mixtral.py +2 -2
  54. sglang/srt/models/olmoe.py +1 -1
  55. sglang/srt/models/qwen2.py +20 -0
  56. sglang/srt/models/qwen2_moe.py +1 -1
  57. sglang/srt/models/xverse_moe.py +1 -1
  58. sglang/srt/openai_api/adapter.py +8 -0
  59. sglang/srt/openai_api/protocol.py +9 -4
  60. sglang/srt/server.py +2 -1
  61. sglang/srt/server_args.py +19 -9
  62. sglang/srt/utils.py +40 -54
  63. sglang/test/test_block_fp8.py +341 -0
  64. sglang/test/test_utils.py +3 -2
  65. sglang/utils.py +10 -3
  66. sglang/version.py +1 -1
  67. {sglang-0.4.0.post1.dist-info → sglang-0.4.1.dist-info}/METADATA +12 -7
  68. {sglang-0.4.0.post1.dist-info → sglang-0.4.1.dist-info}/RECORD +73 -67
  69. sglang/srt/layers/fused_moe_patch.py +0 -133
  70. /sglang/srt/layers/{ep_moe → moe/ep_moe}/__init__.py +0 -0
  71. /sglang/srt/layers/{ep_moe → moe/ep_moe}/kernels.py +0 -0
  72. {sglang-0.4.0.post1.dist-info → sglang-0.4.1.dist-info}/LICENSE +0 -0
  73. {sglang-0.4.0.post1.dist-info → sglang-0.4.1.dist-info}/WHEEL +0 -0
  74. {sglang-0.4.0.post1.dist-info → sglang-0.4.1.dist-info}/top_level.txt +0 -0
sglang/srt/model_executor/cuda_graph_runner.py CHANGED
@@ -20,15 +20,17 @@ from contextlib import contextmanager
  from typing import TYPE_CHECKING, Callable

  import torch
+ import tqdm
+ from vllm.distributed import get_tensor_model_parallel_rank
  from vllm.distributed.parallel_state import graph_capture
  from vllm.model_executor.custom_op import CustomOp

- from sglang.srt.layers.fused_moe_patch import fused_moe_forward_native
  from sglang.srt.layers.logits_processor import (
      LogitsMetadata,
      LogitsProcessor,
      LogitsProcessorOutput,
  )
+ from sglang.srt.layers.moe.fused_moe_native import fused_moe_forward_native
  from sglang.srt.model_executor.forward_batch_info import ForwardBatch, ForwardMode
  from sglang.srt.utils import maybe_torch_compile, monkey_patch_vllm_all_gather

@@ -127,7 +129,7 @@ class CudaGraphRunner:

          # Batch sizes to capture
          if model_runner.server_args.disable_cuda_graph_padding:
-             self.capture_bs = list(range(1, 32)) + [64, 128]
+             self.capture_bs = list(range(1, 33)) + [64, 128]
          else:
              self.capture_bs = [1, 2, 4] + [i * 8 for i in range(1, 21)]

@@ -255,7 +257,12 @@ class CudaGraphRunner:
      def capture(self):
          with graph_capture() as graph_capture_context:
              self.stream = graph_capture_context.stream
-             for bs in self.capture_bs:
+             capture_bs = (
+                 tqdm.tqdm(self.capture_bs)
+                 if get_tensor_model_parallel_rank() == 0
+                 else self.capture_bs
+             )
+             for bs in capture_bs:
                  with patch_model(
                      self.model_runner.model,
                      bs in self.compile_bs,
@@ -387,8 +394,14 @@

          # Extract logprobs
          if forward_batch.return_logprob:
-             next_token_logprobs = torch.nn.functional.log_softmax(
-                 next_token_logits, dim=-1
+             logits_metadata = LogitsMetadata(
+                 forward_mode=ForwardMode.DECODE,
+                 top_logprobs_nums=forward_batch.top_logprobs_nums,
+             )
+             next_token_logprobs = (
+                 LogitsProcessor.compute_temp_top_p_normalized_logprobs(
+                     next_token_logits, logits_metadata
+                 )
              )
              logits_output = LogitsProcessorOutput(
                  next_token_logits=next_token_logits,
@@ -396,13 +409,14 @@
              )
              return_top_logprob = any(x > 0 for x in forward_batch.top_logprobs_nums)
              if return_top_logprob:
-                 logits_metadata = LogitsMetadata(
-                     forward_mode=ForwardMode.DECODE,
-                     top_logprobs_nums=forward_batch.top_logprobs_nums,
-                 )
-                 logits_output.output_top_logprobs = LogitsProcessor.get_top_logprobs(
+                 (
+                     logits_output.output_top_logprobs_val,
+                     logits_output.output_top_logprobs_idx,
+                 ) = LogitsProcessor.get_top_logprobs(
                      next_token_logprobs, logits_metadata
-                 )[1]
+                 )[
+                     2:4
+                 ]
          else:
              logits_output = LogitsProcessorOutput(
                  next_token_logits=next_token_logits,
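The capture() hunk above wraps the batch-size loop in a tqdm progress bar only on tensor-parallel rank 0. A minimal standalone sketch of that pattern, assuming an initialized vllm distributed environment; the helper name is illustrative and not part of sglang:

import tqdm
from vllm.distributed import get_tensor_model_parallel_rank

def run_with_rank0_progress(items):
    # Show a tqdm progress bar only on tensor-parallel rank 0 so that a
    # multi-GPU launch prints one bar instead of one bar per rank.
    iterator = tqdm.tqdm(items) if get_tensor_model_parallel_rank() == 0 else items
    for item in iterator:
        yield item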
sglang/srt/model_executor/model_runner.py CHANGED
@@ -95,6 +95,12 @@ class ModelRunner:
          ):
              logger.info("MLA optimization is turned on. Use triton backend.")
              self.server_args.attention_backend = "triton"
+         # FIXME(HandH1998)
+         if (
+             "DeepseekV3ForCausalLM" in self.model_config.hf_config.architectures
+             and not self.server_args.disable_cuda_graph
+         ):
+             self.server_args.disable_cuda_graph = True

          if self.server_args.enable_double_sparsity:
              logger.info(
@@ -111,17 +117,20 @@
          )

          if self.is_multimodal:
-             server_args.chunked_prefill_size = -1
              self.mem_fraction_static *= 0.95
-             logger.info(
-                 f"Automatically reduce --mem-fraction-static to {self.mem_fraction_static} "
-                 f"and turn off chunked prefill "
-                 f"because this is a multimodal model."
-             )
+             if self.model_config.hf_config.architectures == [
+                 "MllamaForConditionalGeneration"
+             ]:
+                 logger.info("Automatically turn off --chunked-prefill-size for mllama.")
+                 server_args.chunked_prefill_size = -1
              # TODO: qwen2-vl does not support radix cache now, set disable_radix_cache=True automatically
              if self.model_config.hf_config.architectures == [
                  "Qwen2VLForConditionalGeneration"
              ]:
+                 logger.info(
+                     "Automatically turn off --chunked-prefill-size and disable radix cache for qwen2-vl."
+                 )
+                 server_args.chunked_prefill_size = -1
                  server_args.disable_radix_cache = True

          # Global vars
@@ -154,6 +163,11 @@ class ModelRunner:
          self.sampler = Sampler()
          self.load_model()

+         # Apply torchao quantization
+         apply_torchao_config_to_model(
+             self.model, global_server_args_dict["torchao_config"]
+         )
+
          # Apply torch TP if the model supports it
          supports_torch_tp = getattr(self.model, "supports_torch_tp", False)
          if self.tp_size > 1 and supports_torch_tp:
@@ -162,10 +176,6 @@
          else:
              self.torch_tp_applied = False

-         apply_torchao_config_to_model(
-             self.model, global_server_args_dict["torchao_config"]
-         )
-
          # Init memory pool and attention backends
          if server_args.lora_paths is not None:
              self.init_lora_manager()
@@ -242,20 +252,22 @@
          if torch.cuda.get_device_capability()[1] < 5:
              raise RuntimeError("SGLang only supports sm75 and above.")

-         # Prepare the vllm model config
+         # Prepare the model config
          self.load_config = LoadConfig(
              load_format=self.server_args.load_format,
              download_dir=self.server_args.download_dir,
          )
-
          if self.server_args.load_format == "gguf":
              monkey_patch_vllm_gguf_config()
+
+         # Load the model
          self.model = get_model(
              model_config=self.model_config,
              load_config=self.load_config,
              device_config=DeviceConfig(self.device),
          )

+         # Parse other args
          self.sliding_window_size = (
              self.model.get_attention_sliding_window_size()
              if hasattr(self.model, "get_attention_sliding_window_size")
@@ -270,8 +282,10 @@
              f"avail mem={get_available_gpu_memory(self.device, self.gpu_id):.2f} GB"
          )

-     def update_weights_from_disk(self, model_path: str, load_format: str):
-         """Update engine weights online from disk."""
+     def update_weights_from_disk(
+         self, model_path: str, load_format: str
+     ) -> tuple[bool, str]:
+         """Update engine weights in-place from the disk."""
          from sglang.srt.model_loader.loader import (
              DefaultModelLoader,
              device_loading_context,
sglang/srt/model_parallel.py CHANGED
@@ -2,18 +2,18 @@
  Common utilities for torch model parallelism.
  """

- from typing import Optional
+ from typing import Optional, Sequence

  import torch
+ import torch.nn as nn
  from torch.distributed.device_mesh import DeviceMesh

  try:
-     from torch.distributed.tensor import DTensor, Shard
+     import torch.distributed.tensor as dt
  except ImportError:
      # torch 2.4 or older
-     from torch.distributed._tensor import DTensor, Shard
+     import torch.distributed._tensor as dt

- from torch.distributed._functional_collectives import AsyncCollectiveTensor
  from torch.distributed.tensor.parallel import (
      ColwiseParallel,
      RowwiseParallel,
@@ -21,6 +21,50 @@ from torch.distributed.tensor.parallel import (
  )


+ def _shard_tensor(
+     full_tensor: torch.Tensor,
+     device_mesh: DeviceMesh,
+     placements: Sequence[dt.Shard],
+ ) -> "dt.DTensor":
+     """
+     Locally shards a full tensor based on indicated sharding arrangement, and
+     returns a DTensor containing the local shard.
+
+     .. warning:: This is a private API that is subject to change. It skips the
+         communication otherwise required by `distribute_tensor`. It is only
+         applicable to cases where all ranks have the same `full_tensor`. For
+         example, in distributed inference all ranks load from the same
+         checkpoint. This API will not check for data equality between ranks, it
+         is thus user's responsibility to ensure the `full_tensor` is the same
+         across ranks.
+
+     Args:
+         full_tensor (torch.Tensor): the full tensor to be sharded.
+         device_mesh (:class:`DeviceMesh`): DeviceMesh to place the
+             DTensor. Must have same dimension as the number of placements.
+         placements (Sequence[:class:`Shard`]): the placements that
+             describes how to place the local tensor on DeviceMesh.
+
+     Returns:
+         A :class:`DTensor` object with the shard as its local tensor.
+
+     Examples:
+         >>> # xdoctest: +SKIP("need world_size and rank")
+         >>> device_mesh = dist.init_device_mesh("cuda", (world_size,))
+         >>> full_tensor = torch.arange(world_size, device=f"cuda:{rank}")
+         >>> dtensor = _shard_tensor(full_tensor, device_mesh, [Shard(1)])
+     """
+     shape, offset = dt._utils.compute_local_shape_and_global_offset(
+         full_tensor.shape, device_mesh, placements
+     )
+     slices = [
+         slice(cur_offset, cur_offset + cur_shape)
+         for cur_shape, cur_offset in zip(shape, offset)
+     ]
+     local_tensor = full_tensor[slices]
+     return dt.DTensor.from_local(local_tensor, device_mesh, placements)
+
+
  class ColwiseParallelSharded(ColwiseParallel):
      """
      A version of ColwiseParallel where the local weight has been already
@@ -34,7 +78,7 @@ class ColwiseParallelSharded(ColwiseParallel):
          # means Colwise as Linear is input * weight^T + bias, where
          # weight would become Shard(1)
          for name, param in module.named_parameters():
-             dtensor = DTensor.from_local(param, device_mesh, [Shard(0)])
+             dtensor = dt.DTensor.from_local(param, device_mesh, [dt.Shard(0)])
              dist_param = torch.nn.Parameter(dtensor, requires_grad=False)
              module.register_parameter(name, dist_param)

@@ -47,6 +91,23 @@ class RowwiseParallelMaybeWait(RowwiseParallel):
      AsyncCollectiveTensor and custom ops, such as `class RMSNorm(CustomOp)`.
      """

+     def _partition_linear_fn(self, name, module, device_mesh):
+         # Rowwise shard weight to Shard(1), bias to Replicate(), weight be Shard(1)
+         # means Rowwise as nn.Linear is input * weight^T + bias, where
+         # weight would become Shard(0)
+         module.register_parameter(
+             "weight",
+             nn.Parameter(_shard_tensor(module.weight, device_mesh, [dt.Shard(1)])),
+         )
+         if getattr(module, "bias", None) is not None:
+             # The Linear module has bias
+             module.register_parameter(
+                 "bias",
+                 nn.Parameter(
+                     dt.distribute_tensor(module.bias, device_mesh, [dt.Replicate()])
+                 ),
+             )
+
      @staticmethod
      def _prepare_output_fn(output_layouts, use_local_output, mod, outputs, device_mesh):
          outputs = super(
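For reference, a small single-process illustration of what the Shard(1) placement on a Linear weight amounts to; plain tensor slicing with made-up sizes, no DTensor or distributed setup involved:

import torch

# torch.nn.Linear(in_features=8, out_features=4) stores its weight as (4, 8).
weight = torch.randn(4, 8)

# Row-wise tensor parallelism splits the input dimension, i.e. dim 1 of the
# weight (Shard(1) above), so with tp_size=2 each rank keeps a (4, 4) slice.
tp_size, rank = 2, 0
cols = weight.shape[1] // tp_size
local_weight = weight[:, rank * cols : (rank + 1) * cols]
assert local_weight.shape == (4, 4)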
sglang/srt/models/dbrx.py CHANGED
@@ -27,13 +27,13 @@ from vllm.distributed (
  from vllm.model_executor.layers.rotary_embedding import get_rope
  from vllm.transformers_utils.configs.dbrx import DbrxConfig

- from sglang.srt.layers.fused_moe_triton import fused_moe
  from sglang.srt.layers.linear import (
      QKVParallelLinear,
      ReplicatedLinear,
      RowParallelLinear,
  )
  from sglang.srt.layers.logits_processor import LogitsProcessor
+ from sglang.srt.layers.moe.fused_moe_triton import fused_moe
  from sglang.srt.layers.quantization.base_config import QuantizationConfig
  from sglang.srt.layers.radix_attention import RadixAttention
  from sglang.srt.layers.vocab_parallel_embedding import (
sglang/srt/models/deepseek.py CHANGED
@@ -29,7 +29,6 @@ from vllm.distributed (
  from vllm.model_executor.layers.rotary_embedding import get_rope

  from sglang.srt.layers.activation import SiluAndMul
- from sglang.srt.layers.fused_moe_triton import fused_moe
  from sglang.srt.layers.layernorm import RMSNorm
  from sglang.srt.layers.linear import (
      MergedColumnParallelLinear,
@@ -38,6 +37,7 @@ from sglang.srt.layers.linear (
      RowParallelLinear,
  )
  from sglang.srt.layers.logits_processor import LogitsProcessor
+ from sglang.srt.layers.moe.fused_moe_triton import fused_moe
  from sglang.srt.layers.quantization.base_config import QuantizationConfig
  from sglang.srt.layers.radix_attention import RadixAttention
  from sglang.srt.layers.vocab_parallel_embedding import (
sglang/srt/models/deepseek_v2.py CHANGED
@@ -19,6 +19,7 @@
  from typing import Any, Dict, Iterable, Optional, Tuple

  import torch
+ import torch.nn.functional as F
  from torch import nn
  from transformers import PretrainedConfig
  from vllm import _custom_ops as ops
@@ -31,8 +32,6 @@ from vllm.distributed (
  from vllm.model_executor.layers.rotary_embedding import get_rope

  from sglang.srt.layers.activation import SiluAndMul
- from sglang.srt.layers.ep_moe.layer import EPMoE
- from sglang.srt.layers.fused_moe_triton import FusedMoE
  from sglang.srt.layers.layernorm import RMSNorm
  from sglang.srt.layers.linear import (
      ColumnParallelLinear,
@@ -41,7 +40,13 @@ from sglang.srt.layers.linear (
      RowParallelLinear,
  )
  from sglang.srt.layers.logits_processor import LogitsProcessor
+ from sglang.srt.layers.moe.ep_moe.layer import EPMoE
+ from sglang.srt.layers.moe.fused_moe_triton import FusedMoE
  from sglang.srt.layers.quantization.base_config import QuantizationConfig
+ from sglang.srt.layers.quantization.fp8_utils import (
+     block_quant_to_tensor_quant,
+     input_to_float8,
+ )
  from sglang.srt.layers.radix_attention import RadixAttention
  from sglang.srt.layers.vocab_parallel_embedding import (
      ParallelLMHead,
@@ -90,6 +95,24 @@ class DeepseekV2MLP(nn.Module):
          return x


+ class MoEGate(nn.Module):
+     def __init__(self, config):
+         super().__init__()
+         self.weight = nn.Parameter(
+             torch.empty((config.n_routed_experts, config.hidden_size))
+         )
+         if config.topk_method == "noaux_tc":
+             self.e_score_correction_bias = nn.Parameter(
+                 torch.empty((config.n_routed_experts))
+             )
+         else:
+             self.e_score_correction_bias = None
+
+     def forward(self, hidden_states):
+         logits = F.linear(hidden_states, self.weight, None)
+         return logits
+
+
  class DeepseekV2MoE(nn.Module):

      def __init__(
@@ -114,6 +137,8 @@ class DeepseekV2MoE(nn.Module):
                  "Only silu is supported for now."
              )

+         self.gate = MoEGate(config=config)
+
          MoEImpl = EPMoE if global_server_args_dict["enable_ep_moe"] else FusedMoE
          self.experts = MoEImpl(
              num_experts=config.n_routed_experts,
@@ -125,11 +150,9 @@ class DeepseekV2MoE(nn.Module):
              use_grouped_topk=True,
              num_expert_group=config.n_group,
              topk_group=config.topk_group,
+             correction_bias=self.gate.e_score_correction_bias,
          )

-         self.gate = ReplicatedLinear(
-             config.hidden_size, config.n_routed_experts, bias=False, quant_config=None
-         )
          if config.n_shared_experts is not None:
              intermediate_size = config.moe_intermediate_size * config.n_shared_experts
              self.shared_experts = DeepseekV2MLP(
@@ -146,7 +169,7 @@ class DeepseekV2MoE(nn.Module):
          if self.n_shared_experts is not None:
              shared_output = self.shared_experts(hidden_states)
          # router_logits: (num_tokens, n_experts)
-         router_logits, _ = self.gate(hidden_states)
+         router_logits = self.gate(hidden_states)
          final_hidden_states = (
              self.experts(hidden_states=hidden_states, router_logits=router_logits)
              * self.routed_scaling_factor
@@ -167,15 +190,6 @@ def yarn_get_mscale(scale: float = 1, mscale: float = 1) -> float:
      return 0.1 * mscale * math.log(scale) + 1.0


- def input_to_float8(x, dtype=torch.float8_e4m3fn):
-     finfo = torch.finfo(dtype)
-     min_val, max_val = x.aminmax()
-     amax = torch.maximum(min_val.abs(), max_val.abs()).clamp(min=1e-12)
-     scale = finfo.max / amax
-     x_scl_sat = (x * scale).clamp(min=finfo.min, max=finfo.max)
-     return x_scl_sat.to(dtype).contiguous(), scale.float().reciprocal()
-
-
  class DeepseekV2Attention(nn.Module):

      def __init__(
@@ -439,7 +453,10 @@ class DeepseekV2AttentionMLA(nn.Module):
              quant_config=quant_config,
          )
          self.kv_a_layernorm = RMSNorm(self.kv_lora_rank, eps=config.rms_norm_eps)
-         rope_scaling["rope_type"] = "deepseek_yarn"
+
+         if rope_scaling:
+             rope_scaling["rope_type"] = "deepseek_yarn"
+
          self.rotary_emb = get_rope(
              qk_rope_head_dim,
              rotary_dim=qk_rope_head_dim,
@@ -454,6 +471,8 @@ class DeepseekV2AttentionMLA(nn.Module):
              scaling_factor = rope_scaling["factor"]
              mscale = yarn_get_mscale(scaling_factor, float(mscale_all_dim))
              self.scaling = self.scaling * mscale * mscale
+         else:
+             self.rotary_emb.forward = self.rotary_emb.forward_native

          self.attn_mqa = RadixAttention(
              self.num_local_heads,
@@ -845,6 +864,16 @@ class DeepseekV2ForCausalLM(nn.Module):

          params_dict = dict(self.named_parameters())
          for name, loaded_weight in weights:
+             # TODO(HandH1998): Modify it when nextn is supported.
+             if hasattr(self.config, "num_nextn_predict_layers"):
+                 num_nextn_layers = self.config.num_nextn_predict_layers
+                 if num_nextn_layers > 0 and name.startswith("model.layers"):
+                     name_list = name.split(".")
+                     if (
+                         len(name_list) >= 3
+                         and int(name_list[2]) >= self.config.num_hidden_layers
+                     ):
+                         continue
              if "rotary_emb.inv_freq" in name:
                  continue
              for param_name, weight_name, shard_id in stacked_params_mapping:
@@ -909,13 +938,33 @@ class DeepseekV2ForCausalLM(nn.Module):
                      ).T
                  else:
                      w = self_attn.kv_b_proj.weight
+                 # NOTE(HandH1998): Since `bmm_fp8` only supports per-tensor scale, we have to requantize `self_attn.kv_b_proj`.
+                 # This may affect the accuracy of fp8 model.
+                 if (
+                     hasattr(self.quant_config, "weight_block_size")
+                     and w.dtype == torch.float8_e4m3fn
+                 ):
+                     weight_block_size = self.quant_config.weight_block_size
+                     if weight_block_size is not None:
+                         assert hasattr(self_attn.kv_b_proj, "weight_scale_inv")
+                         w, scale = block_quant_to_tensor_quant(
+                             w, self_attn.kv_b_proj.weight_scale_inv, weight_block_size
+                         )
+                         self_attn.w_scale = scale
                  w_kc, w_vc = w.unflatten(
                      0, (-1, self_attn.qk_nope_head_dim + self_attn.v_head_dim)
                  ).split([self_attn.qk_nope_head_dim, self_attn.v_head_dim], dim=1)
                  self_attn.w_kc = w_kc.transpose(1, 2).contiguous().transpose(1, 2)
                  self_attn.w_vc = w_vc.contiguous().transpose(1, 2)
-                 if hasattr(self_attn.kv_b_proj, "weight_scale"):
+                 if (
+                     hasattr(self_attn.kv_b_proj, "weight_scale")
+                     and self_attn.w_scale is None
+                 ):
                      self_attn.w_scale = self_attn.kv_b_proj.weight_scale


- EntryClass = DeepseekV2ForCausalLM
+ class DeepseekV3ForCausalLM(DeepseekV2ForCausalLM):
+     pass
+
+
+ EntryClass = [DeepseekV2ForCausalLM, DeepseekV3ForCausalLM]
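The input_to_float8 helper shown as removed above now lives in sglang.srt.layers.quantization.fp8_utils (per the new import in the same file). A small round-trip sketch that reuses the function body exactly as it appears in the removed hunk, with made-up input data; requires a PyTorch build with float8 support:

import torch

def input_to_float8(x, dtype=torch.float8_e4m3fn):
    # Per-tensor fp8 quantization, same body as the helper removed above.
    finfo = torch.finfo(dtype)
    min_val, max_val = x.aminmax()
    amax = torch.maximum(min_val.abs(), max_val.abs()).clamp(min=1e-12)
    scale = finfo.max / amax
    x_scl_sat = (x * scale).clamp(min=finfo.min, max=finfo.max)
    return x_scl_sat.to(dtype).contiguous(), scale.float().reciprocal()

x = torch.randn(4, 8)
x_fp8, inv_scale = input_to_float8(x)
# Dequantize by multiplying with the returned reciprocal scale.
x_approx = x_fp8.to(torch.float32) * inv_scale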
sglang/srt/models/gemma2.py CHANGED
@@ -355,6 +355,40 @@ class Gemma2ForCausalLM(nn.Module):
              input_ids, hidden_states, self.model.embed_tokens, forward_batch
          )

+     def get_hidden_dim(self, module_name):
+         # return input_dim, output_dim
+         if module_name in ["q_proj", "qkv_proj"]:
+             return (
+                 self.config.hidden_size,
+                 self.config.head_dim * self.config.num_attention_heads,
+             )
+         elif module_name in ["o_proj"]:
+             return (
+                 self.config.head_dim * self.config.num_attention_heads,
+                 self.config.hidden_size,
+             )
+         elif module_name in ["kv_proj"]:
+             return (
+                 self.config.hidden_size,
+                 self.config.head_dim * self.config.num_key_value_heads,
+             )
+         elif module_name == "gate_up_proj":
+             return self.config.hidden_size, self.config.intermediate_size
+         elif module_name == "down_proj":
+             return self.config.intermediate_size, self.config.hidden_size
+         else:
+             raise NotImplementedError()
+
+     def get_module_name(self, name):
+         params_mapping = {
+             "q_proj": "qkv_proj",
+             "k_proj": "qkv_proj",
+             "v_proj": "qkv_proj",
+             "gate_proj": "gate_up_proj",
+             "up_proj": "gate_up_proj",
+         }
+         return params_mapping.get(name, name)
+
      def get_attention_sliding_window_size(self):
          return get_attention_sliding_window_size(self.config)

sglang/srt/models/gemma2_reward.py CHANGED
@@ -32,7 +32,6 @@ class Gemma2ForSequenceClassification(nn.Module):
      ) -> None:
          super().__init__()
          self.config = config
-         self.torchao_config = None
          self.quant_config = quant_config
          self.num_labels = config.num_labels
          self.model = Gemma2Model(config, quant_config=quant_config)