sglang 0.4.6.post4__py3-none-any.whl → 0.4.7__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. It is provided for informational purposes only and reflects the changes between the package versions as they appear in their respective public registries.
- sglang/bench_offline_throughput.py +16 -10
- sglang/bench_one_batch.py +5 -4
- sglang/bench_one_batch_server.py +86 -22
- sglang/bench_serving.py +197 -110
- sglang/compile_deep_gemm.py +4 -4
- sglang/lang/backend/runtime_endpoint.py +24 -1
- sglang/profiler.py +167 -0
- sglang/srt/_custom_ops.py +34 -0
- sglang/srt/configs/internvl.py +8 -12
- sglang/srt/configs/model_config.py +66 -29
- sglang/srt/constrained/base_grammar_backend.py +5 -2
- sglang/srt/constrained/llguidance_backend.py +9 -8
- sglang/srt/constrained/outlines_backend.py +5 -4
- sglang/srt/constrained/xgrammar_backend.py +18 -18
- sglang/srt/conversation.py +47 -9
- sglang/srt/custom_op.py +38 -3
- sglang/srt/debug_utils.py +74 -0
- sglang/srt/disaggregation/common/__init__.py +1 -0
- sglang/srt/disaggregation/common/conn.py +407 -0
- sglang/srt/disaggregation/decode.py +187 -134
- sglang/srt/disaggregation/decode_schedule_batch_mixin.py +142 -0
- sglang/srt/disaggregation/fake/conn.py +4 -13
- sglang/srt/disaggregation/kv_events.py +412 -0
- sglang/srt/disaggregation/launch_lb.py +140 -0
- sglang/srt/disaggregation/mini_lb.py +84 -70
- sglang/srt/disaggregation/mooncake/conn.py +441 -140
- sglang/srt/disaggregation/mooncake/transfer_engine.py +31 -14
- sglang/srt/disaggregation/nixl/conn.py +124 -442
- sglang/srt/disaggregation/prefill.py +128 -44
- sglang/srt/disaggregation/utils.py +154 -6
- sglang/srt/distributed/device_communicators/pymscclpp.py +315 -0
- sglang/srt/distributed/parallel_state.py +52 -5
- sglang/srt/distributed/utils.py +3 -3
- sglang/srt/entrypoints/EngineBase.py +11 -0
- sglang/srt/entrypoints/engine.py +129 -12
- sglang/srt/entrypoints/http_server.py +21 -6
- sglang/srt/entrypoints/http_server_engine.py +5 -2
- sglang/srt/function_call/base_format_detector.py +302 -0
- sglang/srt/function_call/core_types.py +34 -0
- sglang/srt/function_call/deepseekv3_detector.py +205 -0
- sglang/srt/function_call/ebnf_composer.py +248 -0
- sglang/srt/function_call/function_call_parser.py +202 -0
- sglang/srt/function_call/llama32_detector.py +93 -0
- sglang/srt/function_call/mistral_detector.py +131 -0
- sglang/srt/function_call/pythonic_detector.py +229 -0
- sglang/srt/function_call/qwen25_detector.py +121 -0
- sglang/srt/function_call/utils.py +52 -0
- sglang/srt/hf_transformers_utils.py +50 -7
- sglang/srt/layers/attention/aiter_backend.py +878 -0
- sglang/srt/layers/attention/base_attn_backend.py +4 -0
- sglang/srt/layers/attention/cutlass_mla_backend.py +2 -19
- sglang/srt/layers/attention/flashattention_backend.py +166 -35
- sglang/srt/layers/attention/flashinfer_backend.py +45 -1
- sglang/srt/layers/attention/flashinfer_mla_backend.py +45 -5
- sglang/srt/layers/attention/flashmla_backend.py +340 -78
- sglang/srt/layers/attention/intel_amx_backend.py +128 -0
- sglang/srt/layers/attention/tbo_backend.py +232 -0
- sglang/srt/layers/attention/torch_native_backend.py +3 -0
- sglang/srt/layers/attention/triton_backend.py +247 -5
- sglang/srt/layers/attention/triton_ops/extend_attention.py +12 -4
- sglang/srt/layers/attention/utils.py +2 -2
- sglang/srt/layers/attention/vision.py +1 -1
- sglang/srt/layers/communicator.py +517 -0
- sglang/srt/layers/dp_attention.py +6 -15
- sglang/srt/layers/layernorm.py +30 -19
- sglang/srt/layers/moe/cutlass_moe.py +370 -0
- sglang/srt/layers/moe/cutlass_moe_params.py +169 -0
- sglang/srt/layers/moe/ep_moe/kernels.py +60 -17
- sglang/srt/layers/moe/ep_moe/layer.py +195 -87
- sglang/srt/layers/moe/ep_moe/token_dispatcher.py +88 -8
- sglang/srt/layers/moe/fused_moe_native.py +4 -0
- sglang/srt/layers/moe/fused_moe_triton/configs/triton_3_2_0/E=257,N=128,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8,block_shape=[128, 128].json +146 -0
- sglang/srt/layers/moe/fused_moe_triton/configs/triton_3_2_0/E=257,N=128,device_name=NVIDIA_H20,dtype=fp8_w8a8,block_shape=[128, 128].json +146 -0
- sglang/srt/layers/moe/fused_moe_triton/configs/triton_3_2_0/E=257,N=256,device_name=NVIDIA_B200,dtype=fp8_w8a8,block_shape=[128, 128].json +146 -0
- sglang/srt/layers/moe/fused_moe_triton/configs/triton_3_2_0/E=257,N=256,device_name=NVIDIA_H20,dtype=fp8_w8a8,block_shape=[128, 128].json +146 -0
- sglang/srt/layers/moe/fused_moe_triton/configs/triton_3_2_0/E=257,N=256,device_name=NVIDIA_H200,dtype=fp8_w8a8,block_shape=[128, 128].json +146 -0
- sglang/srt/layers/moe/fused_moe_triton/configs/triton_3_2_0/E=8,N=7168,device_name=NVIDIA_H100_80GB_HBM3.json +146 -0
- sglang/srt/layers/moe/fused_moe_triton/configs/triton_3_3_1/E=257,N=256,device_name=NVIDIA_B200,dtype=fp8_w8a8,block_shape=[128, 128].json +146 -0
- sglang/srt/layers/moe/fused_moe_triton/configs/triton_3_3_1/E=8,N=7168,device_name=NVIDIA_H100_80GB_HBM3.json +146 -0
- sglang/srt/layers/moe/fused_moe_triton/fused_moe.py +220 -25
- sglang/srt/layers/moe/fused_moe_triton/layer.py +48 -4
- sglang/srt/layers/moe/topk.py +107 -24
- sglang/srt/layers/multimodal.py +70 -0
- sglang/srt/layers/quantization/__init__.py +10 -4
- sglang/srt/layers/quantization/blockwise_int8.py +3 -0
- sglang/srt/layers/quantization/compressed_tensors/compressed_tensors_moe.py +5 -0
- sglang/srt/layers/quantization/deep_gemm.py +60 -59
- sglang/srt/layers/quantization/fp8.py +113 -18
- sglang/srt/layers/quantization/fp8_kernel.py +118 -66
- sglang/srt/layers/quantization/fp8_utils.py +165 -43
- sglang/srt/layers/quantization/gptq.py +298 -6
- sglang/srt/layers/quantization/int8_kernel.py +18 -5
- sglang/srt/layers/quantization/modelopt_quant.py +334 -7
- sglang/srt/layers/quantization/moe_wna16.py +3 -0
- sglang/srt/layers/quantization/qoq.py +244 -0
- sglang/srt/layers/quantization/w8a8_fp8.py +3 -0
- sglang/srt/layers/quantization/w8a8_int8.py +3 -0
- sglang/srt/layers/rotary_embedding.py +6 -12
- sglang/srt/layers/sampler.py +80 -79
- sglang/srt/layers/utils.py +6 -0
- sglang/srt/lora/layers.py +12 -15
- sglang/srt/lora/lora.py +49 -5
- sglang/srt/lora/lora_manager.py +20 -8
- sglang/srt/lora/mem_pool.py +24 -16
- sglang/srt/lora/utils.py +17 -13
- sglang/srt/managers/data_parallel_controller.py +13 -5
- sglang/srt/managers/eplb_algorithms/__init__.py +63 -0
- sglang/srt/managers/eplb_algorithms/deepseek.py +223 -0
- sglang/srt/managers/eplb_algorithms/deepseek_vec.py +276 -0
- sglang/srt/managers/eplb_manager.py +96 -0
- sglang/srt/managers/expert_distribution.py +878 -56
- sglang/srt/managers/expert_location.py +448 -0
- sglang/srt/managers/expert_location_dispatch.py +108 -0
- sglang/srt/managers/io_struct.py +29 -5
- sglang/srt/managers/mm_utils.py +355 -151
- sglang/srt/managers/multimodal_processors/base_processor.py +299 -42
- sglang/srt/managers/multimodal_processors/deepseek_vl_v2.py +6 -1
- sglang/srt/managers/multimodal_processors/gemma3.py +15 -17
- sglang/srt/managers/multimodal_processors/internvl.py +18 -5
- sglang/srt/managers/multimodal_processors/janus_pro.py +7 -1
- sglang/srt/managers/multimodal_processors/kimi_vl.py +14 -32
- sglang/srt/managers/multimodal_processors/llava.py +3 -3
- sglang/srt/managers/multimodal_processors/minicpm.py +27 -32
- sglang/srt/managers/multimodal_processors/mllama4.py +6 -0
- sglang/srt/managers/multimodal_processors/phi4mm.py +87 -0
- sglang/srt/managers/multimodal_processors/pixtral.py +9 -9
- sglang/srt/managers/multimodal_processors/qwen_vl.py +35 -35
- sglang/srt/managers/schedule_batch.py +185 -55
- sglang/srt/managers/schedule_policy.py +4 -5
- sglang/srt/managers/scheduler.py +389 -154
- sglang/srt/managers/session_controller.py +1 -1
- sglang/srt/managers/tokenizer_manager.py +231 -39
- sglang/srt/managers/utils.py +0 -4
- sglang/srt/mem_cache/base_prefix_cache.py +3 -0
- sglang/srt/mem_cache/chunk_cache.py +3 -1
- sglang/srt/mem_cache/hiradix_cache.py +4 -4
- sglang/srt/mem_cache/memory_pool.py +74 -52
- sglang/srt/mem_cache/multimodal_cache.py +45 -0
- sglang/srt/mem_cache/radix_cache.py +58 -5
- sglang/srt/metrics/collector.py +11 -2
- sglang/srt/mm_utils.py +10 -0
- sglang/srt/model_executor/cuda_graph_runner.py +87 -65
- sglang/srt/model_executor/expert_location_updater.py +557 -0
- sglang/srt/model_executor/forward_batch_info.py +39 -14
- sglang/srt/model_executor/model_runner.py +231 -101
- sglang/srt/model_loader/loader.py +10 -6
- sglang/srt/model_loader/utils.py +67 -1
- sglang/srt/models/clip.py +5 -1
- sglang/srt/models/deepseek_nextn.py +1 -1
- sglang/srt/models/deepseek_v2.py +732 -403
- sglang/srt/models/exaone.py +8 -3
- sglang/srt/models/gemma3_causal.py +7 -0
- sglang/srt/models/gemma3_mm.py +75 -33
- sglang/srt/models/idefics2.py +342 -0
- sglang/srt/models/kimi_vl.py +4 -4
- sglang/srt/models/llama.py +1 -1
- sglang/srt/models/llama4.py +10 -2
- sglang/srt/models/llava.py +26 -18
- sglang/srt/models/mimo_mtp.py +220 -0
- sglang/srt/models/minicpmo.py +7 -17
- sglang/srt/models/minicpmv.py +3 -295
- sglang/srt/models/mistral.py +71 -1
- sglang/srt/models/mllama.py +3 -3
- sglang/srt/models/phi4mm.py +512 -0
- sglang/srt/models/qwen2.py +133 -35
- sglang/srt/models/qwen2_5_vl.py +5 -3
- sglang/srt/models/qwen2_eagle.py +4 -1
- sglang/srt/models/qwen2_moe.py +206 -69
- sglang/srt/models/qwen2_vl.py +3 -3
- sglang/srt/models/qwen3.py +92 -19
- sglang/srt/models/qwen3_moe.py +457 -55
- sglang/srt/models/registry.py +9 -1
- sglang/srt/models/siglip.py +294 -0
- sglang/srt/models/transformers.py +291 -0
- sglang/srt/openai_api/adapter.py +114 -40
- sglang/srt/openai_api/protocol.py +37 -2
- sglang/srt/openai_api/utils.py +172 -0
- sglang/srt/operations.py +189 -0
- sglang/srt/operations_strategy.py +207 -0
- sglang/srt/sampling/sampling_batch_info.py +13 -1
- sglang/srt/sampling/sampling_params.py +2 -1
- sglang/srt/server_args.py +235 -38
- sglang/srt/speculative/build_eagle_tree.py +8 -8
- sglang/srt/speculative/eagle_draft_cuda_graph_runner.py +8 -11
- sglang/srt/speculative/eagle_draft_extend_cuda_graph_runner.py +253 -0
- sglang/srt/speculative/eagle_utils.py +181 -90
- sglang/srt/speculative/eagle_worker.py +146 -21
- sglang/srt/two_batch_overlap.py +635 -0
- sglang/srt/utils.py +197 -19
- sglang/test/runners.py +16 -7
- sglang/test/send_one.py +4 -0
- sglang/test/test_cutlass_moe.py +278 -0
- sglang/test/test_fp4_moe.py +248 -0
- sglang/test/test_utils.py +81 -42
- sglang/utils.py +2 -2
- sglang/version.py +1 -1
- {sglang-0.4.6.post4.dist-info → sglang-0.4.7.dist-info}/METADATA +31 -19
- sglang-0.4.7.dist-info/RECORD +699 -0
- {sglang-0.4.6.post4.dist-info → sglang-0.4.7.dist-info}/WHEEL +1 -1
- sglang/srt/function_call_parser.py +0 -858
- sglang/srt/platforms/interface.py +0 -371
- sglang-0.4.6.post4.dist-info/RECORD +0 -646
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=1,N=14336,device_name=NVIDIA_A100-SXM4-80GB,dtype=int8_w8a16.json → triton_3_1_0/E=1,N=14336,device_name=NVIDIA_A100-SXM4-80GB,dtype=int8_w8a16.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=1,N=14336,device_name=NVIDIA_A100-SXM4-80GB.json → triton_3_1_0/E=1,N=14336,device_name=NVIDIA_A100-SXM4-80GB.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=1,N=1792,device_name=NVIDIA_A100-SXM4-80GB,dtype=int8_w8a16.json → triton_3_1_0/E=1,N=1792,device_name=NVIDIA_A100-SXM4-80GB,dtype=int8_w8a16.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=1,N=1792,device_name=NVIDIA_A100-SXM4-80GB.json → triton_3_1_0/E=1,N=1792,device_name=NVIDIA_A100-SXM4-80GB.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=1,N=3072,device_name=NVIDIA_A100-SXM4-80GB,dtype=int8_w8a16.json → triton_3_1_0/E=1,N=3072,device_name=NVIDIA_A100-SXM4-80GB,dtype=int8_w8a16.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=1,N=3072,device_name=NVIDIA_H100_80GB_HBM3,dtype=int8_w8a16.json → triton_3_1_0/E=1,N=3072,device_name=NVIDIA_H100_80GB_HBM3,dtype=int8_w8a16.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=1,N=3072,device_name=NVIDIA_H100_80GB_HBM3.json → triton_3_1_0/E=1,N=3072,device_name=NVIDIA_H100_80GB_HBM3.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=1,N=3584,device_name=NVIDIA_A100-SXM4-80GB,dtype=int8_w8a16.json → triton_3_1_0/E=1,N=3584,device_name=NVIDIA_A100-SXM4-80GB,dtype=int8_w8a16.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=1,N=3584,device_name=NVIDIA_A100-SXM4-80GB.json → triton_3_1_0/E=1,N=3584,device_name=NVIDIA_A100-SXM4-80GB.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=1,N=7168,device_name=NVIDIA_A100-SXM4-80GB,dtype=int8_w8a16.json → triton_3_1_0/E=1,N=7168,device_name=NVIDIA_A100-SXM4-80GB,dtype=int8_w8a16.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=1,N=7168,device_name=NVIDIA_A100-SXM4-80GB.json → triton_3_1_0/E=1,N=7168,device_name=NVIDIA_A100-SXM4-80GB.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=144,N=512,device_name=NVIDIA_H100_80GB_HBM3.json → triton_3_1_0/E=144,N=512,device_name=NVIDIA_H100_80GB_HBM3.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=16,N=1024,device_name=NVIDIA_H100_80GB_HBM3.json → triton_3_1_0/E=16,N=1024,device_name=NVIDIA_H100_80GB_HBM3.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=16,N=1024,device_name=NVIDIA_H200.json → triton_3_1_0/E=16,N=1024,device_name=NVIDIA_H200.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=16,N=1344,device_name=NVIDIA_A100-SXM4-40GB.json → triton_3_1_0/E=16,N=1344,device_name=NVIDIA_A100-SXM4-40GB.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=16,N=1344,device_name=NVIDIA_A100-SXM4-80GB.json → triton_3_1_0/E=16,N=1344,device_name=NVIDIA_A100-SXM4-80GB.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=16,N=1344,device_name=NVIDIA_H100_80GB_HBM3.json → triton_3_1_0/E=16,N=1344,device_name=NVIDIA_H100_80GB_HBM3.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=16,N=14336,device_name=NVIDIA_A100-SXM4-80GB,dtype=int8_w8a16.json → triton_3_1_0/E=16,N=14336,device_name=NVIDIA_A100-SXM4-80GB,dtype=int8_w8a16.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=16,N=14336,device_name=NVIDIA_A100-SXM4-80GB.json → triton_3_1_0/E=16,N=14336,device_name=NVIDIA_A100-SXM4-80GB.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=16,N=1792,device_name=NVIDIA_A100-SXM4-80GB,dtype=int8_w8a16.json → triton_3_1_0/E=16,N=1792,device_name=NVIDIA_A100-SXM4-80GB,dtype=int8_w8a16.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=16,N=1792,device_name=NVIDIA_A100-SXM4-80GB.json → triton_3_1_0/E=16,N=1792,device_name=NVIDIA_A100-SXM4-80GB.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=16,N=2048,device_name=NVIDIA_H100_80GB_HBM3.json → triton_3_1_0/E=16,N=2048,device_name=NVIDIA_H100_80GB_HBM3.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=16,N=2688,device_name=NVIDIA_A100-SXM4-80GB.json → triton_3_1_0/E=16,N=2688,device_name=NVIDIA_A100-SXM4-80GB.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=16,N=2688,device_name=NVIDIA_H100_80GB_HBM3.json → triton_3_1_0/E=16,N=2688,device_name=NVIDIA_H100_80GB_HBM3.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=16,N=3072,device_name=NVIDIA_A100-SXM4-80GB,dtype=int8_w8a16.json → triton_3_1_0/E=16,N=3072,device_name=NVIDIA_A100-SXM4-80GB,dtype=int8_w8a16.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=16,N=3072,device_name=NVIDIA_H100_80GB_HBM3,dtype=int8_w8a16.json → triton_3_1_0/E=16,N=3072,device_name=NVIDIA_H100_80GB_HBM3,dtype=int8_w8a16.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=16,N=3200,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8.json → triton_3_1_0/E=16,N=3200,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=16,N=3584,device_name=NVIDIA_A100-SXM4-80GB,dtype=int8_w8a16.json → triton_3_1_0/E=16,N=3584,device_name=NVIDIA_A100-SXM4-80GB,dtype=int8_w8a16.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=16,N=3584,device_name=NVIDIA_A100-SXM4-80GB.json → triton_3_1_0/E=16,N=3584,device_name=NVIDIA_A100-SXM4-80GB.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=16,N=6400,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8.json → triton_3_1_0/E=16,N=6400,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=16,N=7168,device_name=NVIDIA_A100-SXM4-80GB,dtype=int8_w8a16.json → triton_3_1_0/E=16,N=7168,device_name=NVIDIA_A100-SXM4-80GB,dtype=int8_w8a16.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=16,N=7168,device_name=NVIDIA_A100-SXM4-80GB.json → triton_3_1_0/E=16,N=7168,device_name=NVIDIA_A100-SXM4-80GB.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=16,N=7168,device_name=NVIDIA_H100_80GB_HBM3,dtype=int8_w8a16.json → triton_3_1_0/E=16,N=7168,device_name=NVIDIA_H100_80GB_HBM3,dtype=int8_w8a16.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=16,N=800,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8.json → triton_3_1_0/E=16,N=800,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=160,N=192,device_name=NVIDIA_A800-SXM4-80GB.json → triton_3_1_0/E=160,N=192,device_name=NVIDIA_A800-SXM4-80GB.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=20,N=2048,device_name=NVIDIA_H100_80GB_HBM3.json → triton_3_1_0/E=20,N=2048,device_name=NVIDIA_H100_80GB_HBM3.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=24,N=1024,device_name=NVIDIA_H100_80GB_HBM3.json → triton_3_1_0/E=24,N=1024,device_name=NVIDIA_H100_80GB_HBM3.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=256,N=128,device_name=NVIDIA_A100-SXM4-80GB,dtype=int8_w8a8,block_shape=[128, 128].json → triton_3_1_0/E=256,N=128,device_name=NVIDIA_A100-SXM4-80GB,dtype=int8_w8a8,block_shape=[128, 128].json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=256,N=128,device_name=NVIDIA_A100-SXM4-80GB,dtype=int8_w8a8.json → triton_3_1_0/E=256,N=128,device_name=NVIDIA_A100-SXM4-80GB,dtype=int8_w8a8.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=256,N=128,device_name=NVIDIA_A800-SXM4-80GB,dtype=int8_w8a8,block_shape=[128, 128].json → triton_3_1_0/E=256,N=128,device_name=NVIDIA_A800-SXM4-80GB,dtype=int8_w8a8,block_shape=[128, 128].json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=256,N=128,device_name=NVIDIA_A800-SXM4-80GB,dtype=int8_w8a8.json → triton_3_1_0/E=256,N=128,device_name=NVIDIA_A800-SXM4-80GB,dtype=int8_w8a8.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=256,N=128,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8,block_shape=[128, 128].json → triton_3_1_0/E=256,N=128,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8,block_shape=[128, 128].json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=256,N=128,device_name=NVIDIA_H20,block_shape=[128, 128].json → triton_3_1_0/E=256,N=128,device_name=NVIDIA_H20,block_shape=[128, 128].json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=256,N=128,device_name=NVIDIA_H20,dtype=fp8_w8a8,block_shape=[128, 128].json → triton_3_1_0/E=256,N=128,device_name=NVIDIA_H20,dtype=fp8_w8a8,block_shape=[128, 128].json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=256,N=128,device_name=NVIDIA_L20Y,dtype=fp8_w8a8,block_shape=[128, 128].json → triton_3_1_0/E=256,N=128,device_name=NVIDIA_L20Y,dtype=fp8_w8a8,block_shape=[128, 128].json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=256,N=256,device_name=AMD_Instinct_MI300X,dtype=fp8_w8a8,block_shape=[128, 128].json → triton_3_1_0/E=256,N=256,device_name=AMD_Instinct_MI300X,dtype=fp8_w8a8,block_shape=[128, 128].json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=256,N=256,device_name=AMD_Instinct_MI325X,dtype=fp8_w8a8,block_shape=[128, 128].json → triton_3_1_0/E=256,N=256,device_name=AMD_Instinct_MI325X,dtype=fp8_w8a8,block_shape=[128, 128].json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=256,N=256,device_name=AMD_Radeon_Graphics,dtype=fp8_w8a8,block_shape=[128, 128].json → triton_3_1_0/E=256,N=256,device_name=AMD_Radeon_Graphics,dtype=fp8_w8a8,block_shape=[128, 128].json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=256,N=256,device_name=NVIDIA_B200,dtype=fp8_w8a8,block_shape=[128, 128].json → triton_3_1_0/E=256,N=256,device_name=NVIDIA_B200,dtype=fp8_w8a8,block_shape=[128, 128].json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=256,N=256,device_name=NVIDIA_H20,dtype=int8_w8a8,block_shape=[128, 128].json → triton_3_1_0/E=256,N=256,device_name=NVIDIA_H20,dtype=int8_w8a8,block_shape=[128, 128].json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=256,N=256,device_name=NVIDIA_H200,dtype=fp8_w8a8,block_shape=[128, 128].json → triton_3_1_0/E=256,N=256,device_name=NVIDIA_H200,dtype=fp8_w8a8,block_shape=[128, 128].json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=256,N=64,device_name=NVIDIA_A800-SXM4-80GB.json → triton_3_1_0/E=256,N=64,device_name=NVIDIA_A800-SXM4-80GB.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=256,N=64,device_name=NVIDIA_L20,dtype=int8_w8a8.json → triton_3_1_0/E=256,N=64,device_name=NVIDIA_L20,dtype=int8_w8a8.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=256,N=64,device_name=NVIDIA_L40S,dtype=int8_w8a8.json → triton_3_1_0/E=256,N=64,device_name=NVIDIA_L40S,dtype=int8_w8a8.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=64,N=1024,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8,block_shape=[128, 128].json → triton_3_1_0/E=64,N=1024,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8,block_shape=[128, 128].json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=64,N=1280,device_name=NVIDIA_A100-SXM4-80GB.json → triton_3_1_0/E=64,N=1280,device_name=NVIDIA_A100-SXM4-80GB.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=64,N=1280,device_name=NVIDIA_A800-SXM4-80GB.json → triton_3_1_0/E=64,N=1280,device_name=NVIDIA_A800-SXM4-80GB.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=64,N=1280,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8.json → triton_3_1_0/E=64,N=1280,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=64,N=1280,device_name=NVIDIA_H100_80GB_HBM3.json → triton_3_1_0/E=64,N=1280,device_name=NVIDIA_H100_80GB_HBM3.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=64,N=1280,device_name=NVIDIA_H200,dtype=fp8_w8a8.json → triton_3_1_0/E=64,N=1280,device_name=NVIDIA_H200,dtype=fp8_w8a8.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=64,N=1280,device_name=NVIDIA_H200.json → triton_3_1_0/E=64,N=1280,device_name=NVIDIA_H200.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=64,N=2560,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8.json → triton_3_1_0/E=64,N=2560,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=64,N=2560,device_name=NVIDIA_H200,dtype=fp8_w8a8.json → triton_3_1_0/E=64,N=2560,device_name=NVIDIA_H200,dtype=fp8_w8a8.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=64,N=2560,device_name=NVIDIA_H200.json → triton_3_1_0/E=64,N=2560,device_name=NVIDIA_H200.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=64,N=320,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8.json → triton_3_1_0/E=64,N=320,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=64,N=320,device_name=NVIDIA_H100_80GB_HBM3.json → triton_3_1_0/E=64,N=320,device_name=NVIDIA_H100_80GB_HBM3.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=64,N=320,device_name=NVIDIA_H200,dtype=fp8_w8a8.json → triton_3_1_0/E=64,N=320,device_name=NVIDIA_H200,dtype=fp8_w8a8.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=64,N=320,device_name=NVIDIA_H200.json → triton_3_1_0/E=64,N=320,device_name=NVIDIA_H200.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=64,N=512,device_name=NVIDIA_H200,dtype=fp8_w8a8,block_shape=[128, 128].json → triton_3_1_0/E=64,N=512,device_name=NVIDIA_H200,dtype=fp8_w8a8,block_shape=[128, 128].json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=64,N=640,device_name=NVIDIA_A100-SXM4-80GB.json → triton_3_1_0/E=64,N=640,device_name=NVIDIA_A100-SXM4-80GB.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=64,N=640,device_name=NVIDIA_A800-SXM4-80GB.json → triton_3_1_0/E=64,N=640,device_name=NVIDIA_A800-SXM4-80GB.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=64,N=640,device_name=NVIDIA_GeForce_RTX_4090,dtype=fp8_w8a8.json → triton_3_1_0/E=64,N=640,device_name=NVIDIA_GeForce_RTX_4090,dtype=fp8_w8a8.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=64,N=640,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8.json → triton_3_1_0/E=64,N=640,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=64,N=640,device_name=NVIDIA_H100_80GB_HBM3.json → triton_3_1_0/E=64,N=640,device_name=NVIDIA_H100_80GB_HBM3.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=64,N=640,device_name=NVIDIA_H200,dtype=fp8_w8a8.json → triton_3_1_0/E=64,N=640,device_name=NVIDIA_H200,dtype=fp8_w8a8.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=64,N=640,device_name=NVIDIA_H200.json → triton_3_1_0/E=64,N=640,device_name=NVIDIA_H200.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=8,N=14336,device_name=AMD_Instinct_MI300X.json → triton_3_1_0/E=8,N=14336,device_name=AMD_Instinct_MI300X.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=8,N=14336,device_name=AMD_Instinct_MI325X.json → triton_3_1_0/E=8,N=14336,device_name=AMD_Instinct_MI325X.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=8,N=14336,device_name=AMD_Radeon_Graphics.json → triton_3_1_0/E=8,N=14336,device_name=AMD_Radeon_Graphics.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=8,N=14336,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8.json → triton_3_1_0/E=8,N=14336,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=8,N=14336,device_name=NVIDIA_H200,dtype=fp8_w8a8.json → triton_3_1_0/E=8,N=14336,device_name=NVIDIA_H200,dtype=fp8_w8a8.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=8,N=14336,device_name=NVIDIA_H200.json → triton_3_1_0/E=8,N=14336,device_name=NVIDIA_H200.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=8,N=1792,device_name=AMD_Instinct_MI300X.json → triton_3_1_0/E=8,N=1792,device_name=AMD_Instinct_MI300X.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=8,N=1792,device_name=AMD_Instinct_MI325X.json → triton_3_1_0/E=8,N=1792,device_name=AMD_Instinct_MI325X.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=8,N=1792,device_name=AMD_Radeon_Graphics.json → triton_3_1_0/E=8,N=1792,device_name=AMD_Radeon_Graphics.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=8,N=1792,device_name=NVIDIA_A100-SXM4-40GB.json → triton_3_1_0/E=8,N=1792,device_name=NVIDIA_A100-SXM4-40GB.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=8,N=1792,device_name=NVIDIA_A100-SXM4-80GB.json → triton_3_1_0/E=8,N=1792,device_name=NVIDIA_A100-SXM4-80GB.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=8,N=1792,device_name=NVIDIA_H100_80GB_HBM3.json → triton_3_1_0/E=8,N=1792,device_name=NVIDIA_H100_80GB_HBM3.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=8,N=1792,device_name=NVIDIA_H200,dtype=fp8_w8a8.json → triton_3_1_0/E=8,N=1792,device_name=NVIDIA_H200,dtype=fp8_w8a8.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=8,N=1792,device_name=NVIDIA_H200.json → triton_3_1_0/E=8,N=1792,device_name=NVIDIA_H200.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=8,N=2048,device_name=NVIDIA_A100-SXM4-80GB.json → triton_3_1_0/E=8,N=2048,device_name=NVIDIA_A100-SXM4-80GB.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=8,N=2048,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8.json → triton_3_1_0/E=8,N=2048,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=8,N=2048,device_name=NVIDIA_H100_80GB_HBM3.json → triton_3_1_0/E=8,N=2048,device_name=NVIDIA_H100_80GB_HBM3.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=8,N=2048,device_name=NVIDIA_H200,dtype=fp8_w8a8.json → triton_3_1_0/E=8,N=2048,device_name=NVIDIA_H200,dtype=fp8_w8a8.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=8,N=2048,device_name=NVIDIA_H200.json → triton_3_1_0/E=8,N=2048,device_name=NVIDIA_H200.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=8,N=3584,device_name=AMD_Instinct_MI300X.json → triton_3_1_0/E=8,N=3584,device_name=AMD_Instinct_MI300X.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=8,N=3584,device_name=AMD_Instinct_MI325X.json → triton_3_1_0/E=8,N=3584,device_name=AMD_Instinct_MI325X.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=8,N=3584,device_name=AMD_Radeon_Graphics.json → triton_3_1_0/E=8,N=3584,device_name=AMD_Radeon_Graphics.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=8,N=3584,device_name=NVIDIA_A100-SXM4-40GB.json → triton_3_1_0/E=8,N=3584,device_name=NVIDIA_A100-SXM4-40GB.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=8,N=3584,device_name=NVIDIA_A100-SXM4-80GB.json → triton_3_1_0/E=8,N=3584,device_name=NVIDIA_A100-SXM4-80GB.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=8,N=3584,device_name=NVIDIA_GeForce_RTX_4090,dtype=fp8_w8a8.json → triton_3_1_0/E=8,N=3584,device_name=NVIDIA_GeForce_RTX_4090,dtype=fp8_w8a8.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=8,N=3584,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8.json → triton_3_1_0/E=8,N=3584,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=8,N=3584,device_name=NVIDIA_H100_80GB_HBM3.json → triton_3_1_0/E=8,N=3584,device_name=NVIDIA_H100_80GB_HBM3.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=8,N=3584,device_name=NVIDIA_H200,dtype=fp8_w8a8.json → triton_3_1_0/E=8,N=3584,device_name=NVIDIA_H200,dtype=fp8_w8a8.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=8,N=3584,device_name=NVIDIA_H200.json → triton_3_1_0/E=8,N=3584,device_name=NVIDIA_H200.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=8,N=3584,device_name=NVIDIA_L40S.json → triton_3_1_0/E=8,N=3584,device_name=NVIDIA_L40S.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=8,N=4096,device_name=AMD_Instinct_MI300X,dtype=fp8_w8a8.json → triton_3_1_0/E=8,N=4096,device_name=AMD_Instinct_MI300X,dtype=fp8_w8a8.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=8,N=4096,device_name=AMD_Instinct_MI325X,dtype=fp8_w8a8.json → triton_3_1_0/E=8,N=4096,device_name=AMD_Instinct_MI325X,dtype=fp8_w8a8.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=8,N=4096,device_name=AMD_Radeon_Graphics,dtype=fp8_w8a8.json → triton_3_1_0/E=8,N=4096,device_name=AMD_Radeon_Graphics,dtype=fp8_w8a8.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=8,N=4096,device_name=NVIDIA_A100-SXM4-80GB.json → triton_3_1_0/E=8,N=4096,device_name=NVIDIA_A100-SXM4-80GB.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=8,N=4096,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8.json → triton_3_1_0/E=8,N=4096,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=8,N=4096,device_name=NVIDIA_H100_80GB_HBM3.json → triton_3_1_0/E=8,N=4096,device_name=NVIDIA_H100_80GB_HBM3.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=8,N=4096,device_name=NVIDIA_H200,dtype=fp8_w8a8.json → triton_3_1_0/E=8,N=4096,device_name=NVIDIA_H200,dtype=fp8_w8a8.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=8,N=4096,device_name=NVIDIA_H200.json → triton_3_1_0/E=8,N=4096,device_name=NVIDIA_H200.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=8,N=7168,device_name=AMD_Instinct_MI300X.json → triton_3_1_0/E=8,N=7168,device_name=AMD_Instinct_MI300X.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=8,N=7168,device_name=AMD_Instinct_MI325X.json → triton_3_1_0/E=8,N=7168,device_name=AMD_Instinct_MI325X.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=8,N=7168,device_name=AMD_Radeon_Graphics.json → triton_3_1_0/E=8,N=7168,device_name=AMD_Radeon_Graphics.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=8,N=7168,device_name=NVIDIA_A100-SXM4-80GB.json → triton_3_1_0/E=8,N=7168,device_name=NVIDIA_A100-SXM4-80GB.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=8,N=7168,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8.json → triton_3_1_0/E=8,N=7168,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=8,N=7168,device_name=NVIDIA_H100_80GB_HBM3.json → triton_3_1_0/E=8,N=7168,device_name=NVIDIA_H100_80GB_HBM3.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=8,N=7168,device_name=NVIDIA_H200,dtype=fp8_w8a8.json → triton_3_1_0/E=8,N=7168,device_name=NVIDIA_H200,dtype=fp8_w8a8.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=8,N=7168,device_name=NVIDIA_H200.json → triton_3_1_0/E=8,N=7168,device_name=NVIDIA_H200.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=8,N=8192,device_name=AMD_Instinct_MI300X,dtype=fp8_w8a8.json → triton_3_1_0/E=8,N=8192,device_name=AMD_Instinct_MI300X,dtype=fp8_w8a8.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=8,N=8192,device_name=AMD_Instinct_MI325X,dtype=fp8_w8a8.json → triton_3_1_0/E=8,N=8192,device_name=AMD_Instinct_MI325X,dtype=fp8_w8a8.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=8,N=8192,device_name=AMD_Radeon_Graphics,dtype=fp8_w8a8.json → triton_3_1_0/E=8,N=8192,device_name=AMD_Radeon_Graphics,dtype=fp8_w8a8.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=8,N=8192,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8.json → triton_3_1_0/E=8,N=8192,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=8,N=8192,device_name=NVIDIA_H200,dtype=fp8_w8a8.json → triton_3_1_0/E=8,N=8192,device_name=NVIDIA_H200,dtype=fp8_w8a8.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=128,N=192,device_name=NVIDIA_A800-SXM4-80GB.json → triton_3_2_0/E=128,N=192,device_name=NVIDIA_A800-SXM4-80GB.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=128,N=192,device_name=NVIDIA_H100_80GB_HBM3.json → triton_3_2_0/E=128,N=192,device_name=NVIDIA_H100_80GB_HBM3.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=128,N=192,device_name=NVIDIA_H20.json → triton_3_2_0/E=128,N=192,device_name=NVIDIA_H20.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=128,N=192,device_name=NVIDIA_H200.json → triton_3_2_0/E=128,N=192,device_name=NVIDIA_H200.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=128,N=384,device_name=NVIDIA_H100_80GB_HBM3.json → triton_3_2_0/E=128,N=384,device_name=NVIDIA_H100_80GB_HBM3.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=128,N=384,device_name=NVIDIA_H20,dtype=fp8_w8a8,block_shape=[128, 128].json → triton_3_2_0/E=128,N=384,device_name=NVIDIA_H20,dtype=fp8_w8a8,block_shape=[128, 128].json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=128,N=384,device_name=NVIDIA_H20.json → triton_3_2_0/E=128,N=384,device_name=NVIDIA_H20.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=128,N=384,device_name=NVIDIA_H200,dtype=fp8_w8a8,block_shape=[128, 128].json → triton_3_2_0/E=128,N=384,device_name=NVIDIA_H200,dtype=fp8_w8a8,block_shape=[128, 128].json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=128,N=384,device_name=NVIDIA_H200.json → triton_3_2_0/E=128,N=384,device_name=NVIDIA_H200.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=128,N=512,device_name=NVIDIA_H100_80GB_HBM3.json → triton_3_2_0/E=128,N=512,device_name=NVIDIA_H100_80GB_HBM3.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=128,N=768,device_name=NVIDIA_A800-SXM4-80GB.json → triton_3_2_0/E=128,N=768,device_name=NVIDIA_A800-SXM4-80GB.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=128,N=768,device_name=NVIDIA_H100_80GB_HBM3.json → triton_3_2_0/E=128,N=768,device_name=NVIDIA_H100_80GB_HBM3.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=128,N=768,device_name=NVIDIA_H20,dtype=fp8_w8a8,block_shape=[128, 128].json → triton_3_2_0/E=128,N=768,device_name=NVIDIA_H20,dtype=fp8_w8a8,block_shape=[128, 128].json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=128,N=768,device_name=NVIDIA_H20.json → triton_3_2_0/E=128,N=768,device_name=NVIDIA_H20.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=128,N=768,device_name=NVIDIA_H200,dtype=fp8_w8a8,block_shape=[128, 128].json → triton_3_2_0/E=128,N=768,device_name=NVIDIA_H200,dtype=fp8_w8a8,block_shape=[128, 128].json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=128,N=768,device_name=NVIDIA_H200.json → triton_3_2_0/E=128,N=768,device_name=NVIDIA_H200.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=128,N=96,device_name=NVIDIA_H20.json → triton_3_2_0/E=128,N=96,device_name=NVIDIA_H20.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=264,N=128,device_name=NVIDIA_A800-SXM4-80GB,dtype=int8_w8a8.json → triton_3_2_0/E=264,N=128,device_name=NVIDIA_A800-SXM4-80GB,dtype=int8_w8a8.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=264,N=256,device_name=NVIDIA_B200,dtype=fp8_w8a8,block_shape=[128, 128].json → triton_3_2_0/E=264,N=256,device_name=NVIDIA_B200,dtype=fp8_w8a8,block_shape=[128, 128].json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=264,N=256,device_name=NVIDIA_H20,dtype=fp8_w8a8,block_shape=[128, 128].json → triton_3_2_0/E=264,N=256,device_name=NVIDIA_H20,dtype=fp8_w8a8,block_shape=[128, 128].json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=264,N=256,device_name=NVIDIA_H200,dtype=fp8_w8a8,block_shape=[128, 128].json → triton_3_2_0/E=264,N=256,device_name=NVIDIA_H200,dtype=fp8_w8a8,block_shape=[128, 128].json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=272,N=128,device_name=NVIDIA_A800-SXM4-80GB,dtype=int8_w8a8.json → triton_3_2_0/E=272,N=128,device_name=NVIDIA_A800-SXM4-80GB,dtype=int8_w8a8.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=272,N=128,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8,block_shape=[128, 128].json → triton_3_2_0/E=272,N=128,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8,block_shape=[128, 128].json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=272,N=128,device_name=NVIDIA_H20,dtype=fp8_w8a8,block_shape=[128, 128].json → triton_3_2_0/E=272,N=128,device_name=NVIDIA_H20,dtype=fp8_w8a8,block_shape=[128, 128].json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=272,N=64,device_name=NVIDIA_A800-SXM4-80GB.json → triton_3_2_0/E=272,N=64,device_name=NVIDIA_A800-SXM4-80GB.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=288,N=64,device_name=NVIDIA_A800-SXM4-80GB.json → triton_3_2_0/E=288,N=64,device_name=NVIDIA_A800-SXM4-80GB.json} +0 -0
- /sglang/srt/models/{xiaomi_mimo.py → mimo.py} +0 -0
- {sglang-0.4.6.post4.dist-info → sglang-0.4.7.dist-info}/licenses/LICENSE +0 -0
- {sglang-0.4.6.post4.dist-info → sglang-0.4.7.dist-info}/top_level.txt +0 -0
```diff
@@ -28,8 +28,9 @@ else:
 import logging
 
 _is_hip = is_hip()
+_use_aiter = get_bool_env_var("SGLANG_USE_AITER") and _is_hip
 
-if
+if _use_aiter:
     from aiter import ActivationType
     from aiter.fused_moe_bf16_asm import ck_moe_2stages
     from aiter.ops.shuffle import shuffle_weight
@@ -104,7 +105,7 @@ class UnquantizedFusedMoEMethod(FusedMoEMethodBase, CustomOp):
         set_weight_attrs(w2_weight, extra_weight_attrs)
 
     def process_weights_after_loading(self, layer: torch.nn.Module) -> None:
-        if
+        if _use_aiter:
             layer.w13_weight = torch.nn.Parameter(
                 shuffle_weight(layer.w13_weight.data, (16, 16)),
                 requires_grad=False,
@@ -127,6 +128,7 @@ class UnquantizedFusedMoEMethod(FusedMoEMethodBase, CustomOp):
         use_grouped_topk: bool,
         topk_group: Optional[int] = None,
         num_expert_group: Optional[int] = None,
+        num_fused_shared_experts: int = 0,
         custom_routing_function: Optional[Callable] = None,
         correction_bias: Optional[torch.Tensor] = None,
         activation: str = "silu",
@@ -144,6 +146,7 @@ class UnquantizedFusedMoEMethod(FusedMoEMethodBase, CustomOp):
             use_grouped_topk=use_grouped_topk,
             topk_group=topk_group,
             num_expert_group=num_expert_group,
+            num_fused_shared_experts=num_fused_shared_experts,
             custom_routing_function=custom_routing_function,
             correction_bias=correction_bias,
             activation=activation,
@@ -163,6 +166,7 @@ class UnquantizedFusedMoEMethod(FusedMoEMethodBase, CustomOp):
         renormalize: bool,
         topk_group: Optional[int] = None,
         num_expert_group: Optional[int] = None,
+        num_fused_shared_experts: int = 0,
         custom_routing_function: Optional[Callable] = None,
         correction_bias: Optional[torch.Tensor] = None,
         activation: str = "silu",
@@ -179,13 +183,27 @@ class UnquantizedFusedMoEMethod(FusedMoEMethodBase, CustomOp):
             renormalize=renormalize,
             topk_group=topk_group,
             num_expert_group=num_expert_group,
+            num_fused_shared_experts=num_fused_shared_experts,
             custom_routing_function=custom_routing_function,
             correction_bias=correction_bias,
             routed_scaling_factor=routed_scaling_factor,
         )
 
-        if
+        if _use_aiter:
             assert not no_combine, "unsupported"
+            if apply_router_weight_on_input:
+                assert (
+                    topk_weights.dim() == 2
+                ), "`topk_weights` should be in shape (num_tokens, topk)"
+                _, topk = topk_weights.shape
+                assert (
+                    topk == 1
+                ), "Only support topk=1 when `apply_router_weight_on_input` is True"
+                x = x * topk_weights.to(x.dtype)
+                topk_weights = torch.ones_like(
+                    topk_weights, dtype=torch.float32
+                )  # topk_weights must be FP32 (float32)
+
             return ck_moe_2stages(
                 x,
                 layer.w13_weight,
@@ -207,6 +225,7 @@ class UnquantizedFusedMoEMethod(FusedMoEMethodBase, CustomOp):
                 activation=activation,
                 apply_router_weight_on_input=apply_router_weight_on_input,
                 no_combine=no_combine,
+                routed_scaling_factor=routed_scaling_factor,
             )
 
     def forward_cpu(
@@ -219,6 +238,7 @@ class UnquantizedFusedMoEMethod(FusedMoEMethodBase, CustomOp):
         renormalize: bool,
         topk_group: Optional[int] = None,
         num_expert_group: Optional[int] = None,
+        num_fused_shared_experts: int = 0,
         custom_routing_function: Optional[Callable] = None,
         correction_bias: Optional[torch.Tensor] = None,
         inplace: bool = True,
@@ -232,6 +252,7 @@ class UnquantizedFusedMoEMethod(FusedMoEMethodBase, CustomOp):
             renormalize,
             topk_group,
             num_expert_group,
+            num_fused_shared_experts,
             custom_routing_function,
             correction_bias,
         )
@@ -270,11 +291,13 @@ class FusedMoE(torch.nn.Module):
         top_k: int,
         hidden_size: int,
         intermediate_size: int,
+        layer_id: Optional[int] = None,
         params_dtype: Optional[torch.dtype] = None,
         reduce_results: bool = False,
         renormalize: bool = True,
         use_grouped_topk: bool = False,
         num_expert_group: Optional[int] = None,
+        num_fused_shared_experts: int = 0,
         topk_group: Optional[int] = None,
         quant_config: Optional[QuantizationConfig] = None,
         tp_size: Optional[int] = None,
@@ -293,6 +316,7 @@ class FusedMoE(torch.nn.Module):
         if params_dtype is None:
             params_dtype = torch.get_default_dtype()
 
+        self.hidden_size = hidden_size
         self.tp_size = (
             tp_size if tp_size is not None else get_tensor_model_parallel_world_size()
         )
@@ -307,6 +331,7 @@ class FusedMoE(torch.nn.Module):
         if self.use_grouped_topk:
             assert num_expert_group is not None and topk_group is not None
         self.num_expert_group = num_expert_group
+        self.num_fused_shared_experts = num_fused_shared_experts
         self.topk_group = topk_group
         self.custom_routing_function = custom_routing_function
         self.correction_bias = correction_bias
@@ -532,7 +557,8 @@ class FusedMoE(torch.nn.Module):
         loaded_weight = loaded_weight.to(param.data.device)
 
         if (
-
+            "compressed" in self.quant_method.__class__.__name__.lower()
+            and param.data[expert_id] != 1
             and (param.data[expert_id] - loaded_weight).abs() > 1e-5
         ):
             raise ValueError(
@@ -556,6 +582,23 @@ class FusedMoE(torch.nn.Module):
                 tp_rank=tp_rank,
             )
             return
+        if "ModelOpt" in self.quant_method.__class__.__name__:
+            if "weight_scale_2" in weight_name or "input_scale" in weight_name:
+                self._load_per_tensor_weight_scale(
+                    shard_id=shard_id,
+                    param=param,
+                    loaded_weight=loaded_weight,
+                    expert_id=expert_id,
+                )
+            elif "weight" in weight_name:
+                self._load_model_weight_or_group_weight_scale(
+                    shard_id=shard_id,
+                    shard_dim=shard_dim,
+                    loaded_weight=loaded_weight,
+                    expert_data=expert_data,
+                    tp_rank=tp_rank,
+                )
+            return
 
         # Case weight scales and zero_points
         if "scale" in weight_name or "zero" in weight_name:
@@ -637,6 +680,7 @@ class FusedMoE(torch.nn.Module):
             use_grouped_topk=self.use_grouped_topk,
             topk_group=self.topk_group,
             num_expert_group=self.num_expert_group,
+            num_fused_shared_experts=self.num_fused_shared_experts,
             custom_routing_function=self.custom_routing_function,
             correction_bias=self.correction_bias,
             activation=self.activation,
```
sglang/srt/layers/moe/topk.py CHANGED
```diff
@@ -18,7 +18,15 @@ from typing import Callable, Optional
 import torch
 import torch.nn.functional as F
 
-from sglang.srt.managers
+from sglang.srt.managers import expert_location_dispatch
+from sglang.srt.managers.expert_distribution import (
+    ExpertDistributionRecorder,
+    get_global_expert_distribution_recorder,
+)
+from sglang.srt.managers.expert_location_dispatch import (
+    ExpertLocationDispatchInfo,
+    topk_ids_logical_to_physical,
+)
 from sglang.srt.managers.schedule_batch import global_server_args_dict
 from sglang.srt.utils import get_compiler_backend, is_cuda, is_hip
 
@@ -32,9 +40,6 @@ if _is_cuda or _is_hip:
     from sgl_kernel import topk_softmax
 
 
-expert_distribution_recorder = ExpertDistributionRecorder()
-
-
 def fused_topk_native(
     hidden_states: torch.Tensor,
     gating_output: torch.Tensor,
@@ -61,6 +66,8 @@ def fused_topk(
     gating_output: torch.Tensor,
     topk: int,
     renormalize: bool,
+    num_token_non_padded: Optional[torch.Tensor] = None,
+    expert_location_dispatch_info: Optional[ExpertLocationDispatchInfo] = None,
 ):
     assert hidden_states.shape[0] == gating_output.shape[0], "Number of tokens mismatch"
 
@@ -82,9 +89,27 @@ def fused_topk(
     )
     del token_expert_indicies
 
+    return _fused_topk_postprocess(
+        topk_weights=topk_weights,
+        topk_ids=topk_ids,
+        renormalize=renormalize,
+        expert_location_dispatch_info=expert_location_dispatch_info,
+        num_token_non_padded=num_token_non_padded,
+    )
+
+
+@torch.compile(dynamic=True, backend=get_compiler_backend())
+def _fused_topk_postprocess(
+    topk_weights,
+    topk_ids,
+    renormalize,
+    expert_location_dispatch_info,
+    num_token_non_padded,
+):
     if renormalize:
         topk_weights = topk_weights / topk_weights.sum(dim=-1, keepdim=True)
-
+    topk_ids = topk_ids_logical_to_physical(topk_ids, expert_location_dispatch_info)
+    _mask_topk_ids_padded_region(topk_ids, num_token_non_padded)
     return topk_weights, topk_ids
 
 
@@ -97,8 +122,10 @@ def grouped_topk(
     renormalize: bool,
     num_expert_group: int = 0,
     topk_group: int = 0,
-
+    num_fused_shared_experts: int = 0,
     routed_scaling_factor: Optional[float] = None,
+    num_token_non_padded: Optional[torch.Tensor] = None,
+    expert_location_dispatch_info: Optional[ExpertLocationDispatchInfo] = None,
 ):
     assert hidden_states.shape[0] == gating_output.shape[0], "Number of tokens mismatch"
 
@@ -120,10 +147,10 @@ def grouped_topk(
     )  # [n, e]
     tmp_scores = scores.masked_fill(~score_mask.bool(), 0.0)  # [n, e]
     topk_weights, topk_ids = torch.topk(tmp_scores, k=topk, dim=-1, sorted=False)
-    if
+    if num_fused_shared_experts:
         topk_ids[:, -1] = torch.randint(
             low=num_experts,
-            high=num_experts +
+            high=num_experts + num_fused_shared_experts,
             size=(topk_ids.size(0),),
             dtype=topk_ids.dtype,
             device=topk_ids.device,
@@ -133,12 +160,15 @@ def grouped_topk(
     if renormalize:
         topk_weights_sum = (
             topk_weights.sum(dim=-1, keepdim=True)
-            if
+            if num_fused_shared_experts == 0
             else topk_weights[:, :-1].sum(dim=-1, keepdim=True)
         )
         topk_weights = topk_weights / topk_weights_sum
 
-
+    topk_weights, topk_ids = topk_weights.to(torch.float32), topk_ids.to(torch.int32)
+    topk_ids = topk_ids_logical_to_physical(topk_ids, expert_location_dispatch_info)
+    _mask_topk_ids_padded_region(topk_ids, num_token_non_padded)
+    return topk_weights, topk_ids
 
 
 def biased_grouped_topk_impl(
@@ -149,8 +179,10 @@ def biased_grouped_topk_impl(
     renormalize: bool,
     num_expert_group: int = 0,
     topk_group: int = 0,
-
+    num_fused_shared_experts: int = 0,
     routed_scaling_factor: Optional[float] = None,
+    num_token_non_padded: Optional[torch.Tensor] = None,
+    expert_location_dispatch_info: Optional[ExpertLocationDispatchInfo] = None,
 ):
     assert hidden_states.shape[0] == gating_output.shape[0], "Number of tokens mismatch"
 
@@ -179,10 +211,10 @@ def biased_grouped_topk_impl(
     _, topk_ids = torch.topk(tmp_scores, k=topk, dim=-1, sorted=False)
     topk_weights = scores.gather(1, topk_ids)
 
-    if
+    if num_fused_shared_experts:
         topk_ids[:, -1] = torch.randint(
             low=num_experts,
-            high=num_experts +
+            high=num_experts + num_fused_shared_experts,
             size=(topk_ids.size(0),),
             dtype=topk_ids.dtype,
             device=topk_ids.device,
@@ -192,18 +224,31 @@ def biased_grouped_topk_impl(
     if renormalize:
         topk_weights_sum = (
             topk_weights.sum(dim=-1, keepdim=True)
-            if
+            if num_fused_shared_experts == 0
             else topk_weights[:, :-1].sum(dim=-1, keepdim=True)
         )
         topk_weights = topk_weights / topk_weights_sum
 
-
+    topk_weights, topk_ids = topk_weights.to(torch.float32), topk_ids.to(torch.int32)
+    topk_ids = topk_ids_logical_to_physical(topk_ids, expert_location_dispatch_info)
+    _mask_topk_ids_padded_region(topk_ids, num_token_non_padded)
+    return topk_weights, topk_ids
 
 
 def is_power_of_two(n):
     return n > 0 and math.log2(n).is_integer()
 
 
+def _mask_topk_ids_padded_region(
+    topk_ids: torch.Tensor,
+    num_token_non_padded: Optional[torch.Tensor] = None,
+):
+    if num_token_non_padded is None:
+        return
+    indices = torch.arange(0, topk_ids.shape[0], device=topk_ids.device)
+    topk_ids[indices >= num_token_non_padded, :] = -1
+
+
 def biased_grouped_topk(
     hidden_states: torch.Tensor,
     gating_output: torch.Tensor,
@@ -213,28 +258,39 @@ def biased_grouped_topk(
     num_expert_group: int = 0,
     topk_group: int = 0,
     compiled: bool = True,
-
+    num_fused_shared_experts: int = 0,
     routed_scaling_factor: Optional[float] = None,
+    num_token_non_padded: Optional[torch.Tensor] = None,
+    expert_location_dispatch_info: Optional[ExpertLocationDispatchInfo] = None,
 ):
     assert (
         routed_scaling_factor is not None
     ), "routed_scaling_factor is required for biased_grouped_topk"
-    # TODO: moe_fused_gate kernel is not supported for
+    # TODO: moe_fused_gate kernel is not supported for num_fused_shared_experts > 0 now.
     if (
         _is_cuda
         and gating_output.shape[1] // num_expert_group
        <= 32  # moe_fused_gate kernel ensure that num_experts/num_expert_group does not exceed MAX_VPT=32 now. And when kernel can handle MAX_VPT > 32, we can remove this assertion.
         and is_power_of_two(correction_bias.shape[0])
     ):
-
+        topk_weights, topk_ids = moe_fused_gate(
             gating_output,
             correction_bias,
             num_expert_group,
             topk_group,
             topk,
-
+            num_fused_shared_experts,
             routed_scaling_factor,
         )
+        # TODO merge into kernel for this branch
+        topk_ids = topk_ids_logical_to_physical(topk_ids, expert_location_dispatch_info)
+        # TODO will fuse this into kernel, thus use slow manual operation now
+        if num_token_non_padded is None:
+            return topk_weights, topk_ids
+        torch.compile(
+            _mask_topk_ids_padded_region, dynamic=True, backend=get_compiler_backend()
+        )(topk_ids, num_token_non_padded)
+        return topk_weights, topk_ids
     else:
         biased_grouped_topk_fn = (
             torch.compile(
@@ -251,8 +307,10 @@ def biased_grouped_topk(
             renormalize,
             num_expert_group,
             topk_group,
-
+            num_fused_shared_experts=num_fused_shared_experts,
             routed_scaling_factor=routed_scaling_factor,
+            num_token_non_padded=num_token_non_padded,
+            expert_location_dispatch_info=expert_location_dispatch_info,
         )
 
 
@@ -264,12 +322,22 @@ def select_experts(
     renormalize: bool,
     topk_group: Optional[int] = None,
     num_expert_group: Optional[int] = None,
+    num_fused_shared_experts: int = 0,
     custom_routing_function: Optional[Callable] = None,
    correction_bias: Optional[torch.Tensor] = None,
     torch_native: bool = False,
     routed_scaling_factor: Optional[float] = None,
+    num_token_non_padded: Optional[torch.Tensor] = None,
+    expert_location_dispatch_info: Optional[ExpertLocationDispatchInfo] = None,
 ):
-
+    router_logits, correction_bias = (
+        expert_location_dispatch.transform_select_experts_inputs(
+            router_logits=router_logits,
+            correction_bias=correction_bias,
+            info=expert_location_dispatch_info,
+        )
+    )
+
     # DeepSeek V2/V3/R1 series models use grouped_top_k
     if use_grouped_topk:
         assert topk_group is not None
@@ -282,8 +350,10 @@ def select_experts(
                 renormalize=renormalize,
                 num_expert_group=num_expert_group,
                 topk_group=topk_group,
-
+                num_fused_shared_experts=num_fused_shared_experts,
                 routed_scaling_factor=routed_scaling_factor,
+                num_token_non_padded=num_token_non_padded,
+                expert_location_dispatch_info=expert_location_dispatch_info,
             )
         else:
             topk_weights, topk_ids = biased_grouped_topk(
@@ -294,10 +364,16 @@ def select_experts(
                 renormalize=renormalize,
                 num_expert_group=num_expert_group,
                 topk_group=topk_group,
-
+                num_fused_shared_experts=num_fused_shared_experts,
                 routed_scaling_factor=routed_scaling_factor,
+                num_token_non_padded=num_token_non_padded,
+                expert_location_dispatch_info=expert_location_dispatch_info,
             )
     elif torch_native and custom_routing_function is None:
+        assert (
+            num_token_non_padded is None
+        ), "num_token_non_padded is not yet supported in fused_topk_native"
+        assert expert_location_dispatch_info is None
         topk_weights, topk_ids = fused_topk_native(
             hidden_states=hidden_states,
             gating_output=router_logits,
@@ -305,13 +381,20 @@ def select_experts(
             renormalize=renormalize,
         )
     elif custom_routing_function is None:
+        # Qwen3MOE uses fused_topk
         topk_weights, topk_ids = fused_topk(
             hidden_states=hidden_states,
             gating_output=router_logits,
             topk=top_k,
             renormalize=renormalize,
+            num_token_non_padded=num_token_non_padded,
+            expert_location_dispatch_info=expert_location_dispatch_info,
         )
     else:
+        assert (
+            num_token_non_padded is None
+        ), "num_token_non_padded is not yet supported in custom_routing_function"
+        assert expert_location_dispatch_info is None
        topk_weights, topk_ids = custom_routing_function(
             hidden_states=hidden_states,
             gating_output=router_logits,
@@ -319,6 +402,6 @@ def select_experts(
             renormalize=renormalize,
         )
 
-
+    get_global_expert_distribution_recorder().on_select_experts(topk_ids=topk_ids)
 
     return topk_weights, topk_ids
```
@@ -0,0 +1,70 @@
+# Copyright 2023-2024 SGLang Team
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+# ==============================================================================
+"""Logits processing."""
+
+import torch
+import triton
+import triton.language as tl
+
+
+@triton.jit
+def hash_kernel(
+    input_ptr,
+    output_ptr,
+    n_elements,
+    BLOCK_SIZE: tl.constexpr,
+    PRIME: tl.constexpr,
+    XCONST: tl.constexpr,
+):
+    pid = tl.program_id(axis=0)
+    block_start = pid * BLOCK_SIZE
+    offsets = block_start + tl.arange(0, BLOCK_SIZE)
+    mask = offsets < n_elements
+
+    data = tl.load(input_ptr + offsets, mask=mask, other=0).to(tl.int64)
+    mixed = data ^ (offsets.to(tl.int64) + XCONST)
+    hash_val = mixed * PRIME
+    hash_val = hash_val ^ (hash_val >> 16)
+    hash_val = hash_val * (PRIME ^ XCONST)
+    hash_val = hash_val ^ (hash_val >> 13)
+
+    tl.store(output_ptr + offsets, hash_val, mask=mask)
+
+
+PRIME_1 = -(11400714785074694791 ^ 0xFFFFFFFFFFFFFFFF) - 1
+PRIME_2 = -(14029467366897019727 ^ 0xFFFFFFFFFFFFFFFF) - 1
+
+
+def gpu_tensor_hash(tensor: torch.Tensor) -> int:
+    assert tensor.is_cuda
+    tensor = tensor.contiguous().view(torch.int32)
+    n = tensor.numel()
+    BLOCK_SIZE = 1024
+    grid = (triton.cdiv(n, BLOCK_SIZE),)
+
+    intermediate_hashes = torch.empty(n, dtype=torch.int64, device=tensor.device)
+
+    hash_kernel[grid](
+        tensor,
+        intermediate_hashes,
+        n,
+        BLOCK_SIZE=BLOCK_SIZE,
+        PRIME=PRIME_1,
+        XCONST=PRIME_2,
+    )
+
+    # TODO: threads can't be synced on triton kernel
+    final_hash = intermediate_hashes.sum().item()
+
+    return final_hash
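gpu_tensor_hash reinterprets the tensor as 32-bit words, mixes each word with its index using two 64-bit constants (PRIME_1 and PRIME_2 are the signed-int64 reinterpretations of the xxHash64 primes 0x9E3779B185EBCA87 and 0xC2B2AE3D27D4EB4F), and sums the per-element hashes. Below is a CPU-only PyTorch sketch of the same mixing, for illustration; it is not guaranteed to be bit-identical to the Triton kernel:

```python
import torch

# Signed-int64 reinterpretations of the two 64-bit primes used by hash_kernel.
PRIME_1 = -(11400714785074694791 ^ 0xFFFFFFFFFFFFFFFF) - 1
PRIME_2 = -(14029467366897019727 ^ 0xFFFFFFFFFFFFFFFF) - 1


def tensor_hash_reference(tensor: torch.Tensor) -> int:
    # Reinterpret as int32 words, then apply the same element-wise mix as hash_kernel.
    data = tensor.contiguous().view(torch.int32).to(torch.int64)
    offsets = torch.arange(data.numel(), dtype=torch.int64)

    mixed = data ^ (offsets + PRIME_2)      # gpu_tensor_hash passes XCONST=PRIME_2
    h = mixed * PRIME_1                     # and PRIME=PRIME_1
    h = h ^ (h >> 16)
    h = h * (PRIME_1 ^ PRIME_2)
    h = h ^ (h >> 13)
    return int(h.sum().item())


print(tensor_hash_reference(torch.randn(16, dtype=torch.float32)))
```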
@@ -25,7 +25,6 @@ try:
     from vllm.model_executor.layers.quantization.gptq import GPTQLinearMethod
     from vllm.model_executor.layers.quantization.gptq_marlin import (
         GPTQMarlinLinearMethod,
-        GPTQMarlinMoEMethod,
     )
     from vllm.model_executor.layers.quantization.gptq_marlin_24 import (
         GPTQMarlin24Config,
@@ -58,12 +57,17 @@ from sglang.srt.layers.quantization.compressed_tensors.compressed_tensors import
     CompressedTensorsConfig,
 )
 from sglang.srt.layers.quantization.fp8 import Fp8Config
-from sglang.srt.layers.quantization.gptq import
+from sglang.srt.layers.quantization.gptq import (
+    GPTQConfig,
+    GPTQMarlinConfig,
+    GPTQMarlinMoEMethod,
+)
 from sglang.srt.layers.quantization.modelopt_quant import (
     ModelOptFp4Config,
     ModelOptFp8Config,
 )
 from sglang.srt.layers.quantization.moe_wna16 import MoeWNA16Config
+from sglang.srt.layers.quantization.qoq import QoQConfig
 from sglang.srt.layers.quantization.w8a8_fp8 import W8A8Fp8Config
 from sglang.srt.layers.quantization.w8a8_int8 import W8A8Int8Config
 
@@ -77,6 +81,7 @@ BASE_QUANTIZATION_METHODS: Dict[str, Type[QuantizationConfig]] = {
     "w8a8_fp8": W8A8Fp8Config,
     "moe_wna16": MoeWNA16Config,
     "compressed-tensors": CompressedTensorsConfig,
+    "qoq": QoQConfig,
 }
 
 # VLLM-dependent quantization methods
@@ -109,7 +114,7 @@ def get_quantization_config(quantization: str) -> Type[QuantizationConfig]:
     if quantization in VLLM_QUANTIZATION_METHODS and not VLLM_AVAILABLE:
         raise ValueError(
             f"{quantization} quantization requires some operators from vllm. "
-            "Please install vllm by `pip install vllm==0.
+            "Please install vllm by `pip install vllm==0.9.0.1`"
         )
 
     return QUANTIZATION_METHODS[quantization]
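With QoQConfig registered under the "qoq" key, the lookup path shown above resolves it like any other built-in method. A minimal sketch, assuming these hunks live in sglang.srt.layers.quantization.__init__ (the file name is not shown in this diff):

```python
# Sketch only; the import path is an assumption based on the import lines in this hunk.
from sglang.srt.layers.quantization import get_quantization_config

quant_cls = get_quantization_config("qoq")
print(quant_cls)  # expected to be the newly registered QoQConfig
```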
@@ -284,6 +289,7 @@ def monkey_patch_moe_apply(class_obj: "FusedMoEMethodBase"):
         use_grouped_topk: bool,
         topk_group: Optional[int] = None,
         num_expert_group: Optional[int] = None,
+        num_fused_shared_experts: int = 0,
         custom_routing_function: Optional[Callable] = None,
         correction_bias: Optional[torch.Tensor] = None,
         activation: str = "silu",
@@ -310,7 +316,7 @@ def monkey_patch_moe_apply(class_obj: "FusedMoEMethodBase"):
         if correction_bias is not None:
             if not has_correction_bias:
                 raise ValueError(
-                    "Please increase the version of your vllm. Try `pip install vllm==0.
+                    "Please increase the version of your vllm. Try `pip install vllm==0.9.0.1`"
                 )
             kwargs["e_score_correction_bias"] = correction_bias
         return original_apply(**kwargs)
@@ -367,6 +367,7 @@ class BlockInt8MoEMethod:
         use_grouped_topk: bool,
         topk_group: Optional[int] = None,
         num_expert_group: Optional[int] = None,
+        num_fused_shared_experts: int = 0,
         custom_routing_function: Optional[Callable] = None,
         correction_bias: Optional[torch.Tensor] = None,
         activation: str = "silu",
@@ -387,6 +388,7 @@ class BlockInt8MoEMethod:
             renormalize=renormalize,
             topk_group=topk_group,
             num_expert_group=num_expert_group,
+            num_fused_shared_experts=num_fused_shared_experts,
             custom_routing_function=custom_routing_function,
             correction_bias=correction_bias,
             routed_scaling_factor=routed_scaling_factor,
@@ -409,4 +411,5 @@ class BlockInt8MoEMethod:
             a2_scale=layer.w2_input_scale,
             block_shape=self.quant_config.weight_block_size,
             no_combine=no_combine,
+            routed_scaling_factor=routed_scaling_factor,
         )
@@ -272,6 +272,7 @@ class CompressedTensorsW8A8Fp8MoEMethod(CompressedTensorsMoEMethod):
         use_grouped_topk: bool = False,
         topk_group: Optional[int] = None,
         num_expert_group: Optional[int] = None,
+        num_fused_shared_experts: int = 0,
         global_num_experts: int = -1,
         expert_map: Optional[torch.Tensor] = None,
         custom_routing_function: Optional[Callable] = None,
@@ -294,6 +295,7 @@ class CompressedTensorsW8A8Fp8MoEMethod(CompressedTensorsMoEMethod):
             renormalize=renormalize,
             topk_group=topk_group,
             num_expert_group=num_expert_group,
+            num_fused_shared_experts=num_fused_shared_experts,
             custom_routing_function=custom_routing_function,
             correction_bias=correction_bias,
             routed_scaling_factor=routed_scaling_factor,
@@ -315,6 +317,7 @@ class CompressedTensorsW8A8Fp8MoEMethod(CompressedTensorsMoEMethod):
             a1_scale=layer.w13_input_scale,
             a2_scale=layer.w2_input_scale,
             apply_router_weight_on_input=apply_router_weight_on_input,
+            routed_scaling_factor=routed_scaling_factor,
         )
 
 
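Both BlockInt8MoEMethod and CompressedTensorsW8A8Fp8MoEMethod now forward routed_scaling_factor into what appears to be the fused experts invocation as well. Conceptually, that factor scales the combined output of the routed experts; how the fused kernel applies it internally is not shown here. A toy dense (unquantized, unfused) illustration of that scaling, with made-up sizes:

```python
import torch

num_tokens, hidden, num_experts, top_k = 4, 16, 4, 2
x = torch.randn(num_tokens, hidden)
experts = [torch.nn.Linear(hidden, hidden) for _ in range(num_experts)]

topk_weights = torch.softmax(torch.randn(num_tokens, top_k), dim=-1)
topk_ids = torch.randint(0, num_experts, (num_tokens, top_k))
routed_scaling_factor = 2.5  # hypothetical value

out = torch.zeros_like(x)
for t in range(num_tokens):
    for k in range(top_k):
        out[t] += topk_weights[t, k] * experts[topk_ids[t, k]](x[t])
out = out * routed_scaling_factor  # scaling applied to the routed combination
```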
@@ -627,6 +630,7 @@ class CompressedTensorsWNA16MoEMethod(CompressedTensorsMoEMethod):
         use_grouped_topk: bool = False,
         topk_group: Optional[int] = None,
         num_expert_group: Optional[int] = None,
+        num_fused_shared_experts: int = 0,
         global_num_experts: int = -1,
         expert_map: Optional[torch.Tensor] = None,
         custom_routing_function: Optional[Callable] = None,
@@ -651,6 +655,7 @@ class CompressedTensorsWNA16MoEMethod(CompressedTensorsMoEMethod):
             renormalize=renormalize,
             topk_group=topk_group,
             num_expert_group=num_expert_group,
+            num_fused_shared_experts=num_fused_shared_experts,
             custom_routing_function=custom_routing_function,
             scoring_func=scoring_func,
             correction_bias=correction_bias,