sglang 0.4.9__py3-none-any.whl → 0.4.9.post2__py3-none-any.whl
This diff compares the contents of two publicly released versions of the package as they appear in their public registry. It is provided for informational purposes only.
- sglang/bench_serving.py +2 -2
- sglang/srt/configs/model_config.py +36 -2
- sglang/srt/conversation.py +56 -3
- sglang/srt/disaggregation/ascend/__init__.py +6 -0
- sglang/srt/disaggregation/ascend/conn.py +44 -0
- sglang/srt/disaggregation/ascend/transfer_engine.py +58 -0
- sglang/srt/disaggregation/mooncake/conn.py +50 -18
- sglang/srt/disaggregation/mooncake/transfer_engine.py +17 -8
- sglang/srt/disaggregation/utils.py +25 -3
- sglang/srt/entrypoints/engine.py +1 -1
- sglang/srt/entrypoints/http_server.py +1 -0
- sglang/srt/entrypoints/http_server_engine.py +1 -1
- sglang/srt/entrypoints/openai/protocol.py +11 -0
- sglang/srt/entrypoints/openai/serving_chat.py +7 -0
- sglang/srt/function_call/function_call_parser.py +2 -0
- sglang/srt/function_call/kimik2_detector.py +220 -0
- sglang/srt/hf_transformers_utils.py +18 -0
- sglang/srt/jinja_template_utils.py +8 -0
- sglang/srt/layers/communicator.py +20 -5
- sglang/srt/layers/flashinfer_comm_fusion.py +3 -3
- sglang/srt/layers/layernorm.py +2 -2
- sglang/srt/layers/linear.py +12 -2
- sglang/srt/layers/moe/cutlass_w4a8_moe.py +215 -0
- sglang/srt/layers/moe/ep_moe/kernels.py +60 -1
- sglang/srt/layers/moe/ep_moe/layer.py +141 -2
- sglang/srt/layers/moe/fused_moe_triton/fused_moe.py +2 -0
- sglang/srt/layers/moe/fused_moe_triton/layer.py +141 -59
- sglang/srt/layers/moe/fused_moe_triton/triton_kernels_moe.py +176 -0
- sglang/srt/layers/moe/topk.py +8 -2
- sglang/srt/layers/parameter.py +19 -3
- sglang/srt/layers/quantization/__init__.py +2 -0
- sglang/srt/layers/quantization/fp8.py +28 -7
- sglang/srt/layers/quantization/fp8_kernel.py +2 -2
- sglang/srt/layers/quantization/modelopt_quant.py +244 -1
- sglang/srt/layers/quantization/moe_wna16.py +1 -2
- sglang/srt/layers/quantization/w4afp8.py +264 -0
- sglang/srt/layers/quantization/w8a8_int8.py +738 -14
- sglang/srt/layers/vocab_parallel_embedding.py +9 -3
- sglang/srt/lora/triton_ops/gate_up_lora_b.py +30 -19
- sglang/srt/lora/triton_ops/qkv_lora_b.py +30 -19
- sglang/srt/lora/triton_ops/sgemm_lora_a.py +27 -11
- sglang/srt/lora/triton_ops/sgemm_lora_b.py +27 -15
- sglang/srt/managers/cache_controller.py +41 -195
- sglang/srt/managers/io_struct.py +35 -3
- sglang/srt/managers/mm_utils.py +59 -96
- sglang/srt/managers/schedule_batch.py +17 -6
- sglang/srt/managers/scheduler.py +38 -6
- sglang/srt/managers/tokenizer_manager.py +16 -0
- sglang/srt/mem_cache/hiradix_cache.py +2 -0
- sglang/srt/mem_cache/memory_pool.py +176 -101
- sglang/srt/mem_cache/memory_pool_host.py +6 -109
- sglang/srt/mem_cache/radix_cache.py +8 -4
- sglang/srt/model_executor/forward_batch_info.py +13 -1
- sglang/srt/model_loader/loader.py +23 -12
- sglang/srt/models/deepseek_janus_pro.py +1 -1
- sglang/srt/models/deepseek_v2.py +78 -19
- sglang/srt/models/deepseek_vl2.py +1 -1
- sglang/srt/models/gemma3_mm.py +1 -1
- sglang/srt/models/gemma3n_mm.py +6 -3
- sglang/srt/models/internvl.py +8 -2
- sglang/srt/models/kimi_vl.py +8 -2
- sglang/srt/models/llama.py +2 -0
- sglang/srt/models/llava.py +3 -1
- sglang/srt/models/llavavid.py +1 -1
- sglang/srt/models/minicpmo.py +1 -2
- sglang/srt/models/minicpmv.py +1 -1
- sglang/srt/models/mixtral_quant.py +4 -0
- sglang/srt/models/mllama4.py +372 -82
- sglang/srt/models/phi4mm.py +8 -2
- sglang/srt/models/phimoe.py +553 -0
- sglang/srt/models/qwen2.py +2 -0
- sglang/srt/models/qwen2_5_vl.py +10 -7
- sglang/srt/models/qwen2_vl.py +12 -1
- sglang/srt/models/vila.py +8 -2
- sglang/srt/multimodal/mm_utils.py +2 -2
- sglang/srt/multimodal/processors/base_processor.py +197 -137
- sglang/srt/multimodal/processors/deepseek_vl_v2.py +1 -1
- sglang/srt/multimodal/processors/gemma3.py +4 -2
- sglang/srt/multimodal/processors/gemma3n.py +1 -1
- sglang/srt/multimodal/processors/internvl.py +1 -1
- sglang/srt/multimodal/processors/janus_pro.py +1 -1
- sglang/srt/multimodal/processors/kimi_vl.py +1 -1
- sglang/srt/multimodal/processors/minicpm.py +4 -3
- sglang/srt/multimodal/processors/mllama4.py +63 -61
- sglang/srt/multimodal/processors/phi4mm.py +1 -1
- sglang/srt/multimodal/processors/pixtral.py +1 -1
- sglang/srt/multimodal/processors/qwen_vl.py +203 -80
- sglang/srt/multimodal/processors/vila.py +1 -1
- sglang/srt/server_args.py +26 -4
- sglang/srt/two_batch_overlap.py +3 -0
- sglang/srt/utils.py +191 -48
- sglang/test/test_cutlass_w4a8_moe.py +281 -0
- sglang/utils.py +5 -5
- sglang/version.py +1 -1
- {sglang-0.4.9.dist-info → sglang-0.4.9.post2.dist-info}/METADATA +6 -4
- {sglang-0.4.9.dist-info → sglang-0.4.9.post2.dist-info}/RECORD +99 -90
- {sglang-0.4.9.dist-info → sglang-0.4.9.post2.dist-info}/WHEEL +0 -0
- {sglang-0.4.9.dist-info → sglang-0.4.9.post2.dist-info}/licenses/LICENSE +0 -0
- {sglang-0.4.9.dist-info → sglang-0.4.9.post2.dist-info}/top_level.txt +0 -0
sglang/srt/layers/moe/fused_moe_triton/layer.py
CHANGED
@@ -1,5 +1,6 @@
 # Adapted from https://github.com/vllm-project/vllm/blob/a6221a144af772fd1a68fe7e627935dc53e81738/vllm/model_executor/layers/fused_moe/layer.py
 
+import importlib
 from abc import abstractmethod
 from enum import Enum
 from typing import Callable, List, Optional, Tuple
@@ -19,6 +20,7 @@ from sglang.srt.layers.quantization.base_config import (
     QuantizationConfig,
     QuantizeMethodBase,
 )
+from sglang.srt.managers.schedule_batch import global_server_args_dict
 from sglang.srt.model_loader.weight_utils import narrow_padded_param_and_loaded_weight
 from sglang.srt.utils import (
     cpu_has_amx_support,
@@ -29,8 +31,15 @@ from sglang.srt.utils import (
     use_intel_amx_backend,
 )
 
+has_triton_kernels = importlib.util.find_spec("triton_kernels") is not None
+
 if torch.cuda.is_available():
     from sglang.srt.layers.moe.fused_moe_triton.fused_moe import fused_experts
+
+    if has_triton_kernels:
+        from sglang.srt.layers.moe.fused_moe_triton.triton_kernels_moe import (
+            triton_kernel_moe_forward,
+        )
 else:
     fused_experts = None  # type: ignore
 
@@ -87,6 +96,10 @@ class FusedMoEMethodBase(QuantizeMethodBase):
 class UnquantizedFusedMoEMethod(FusedMoEMethodBase, CustomOp):
     """MoE method without quantization."""
 
+    def __init__(self, use_triton_kernels: bool = False):
+        super().__init__()
+        self.use_triton_kernels = use_triton_kernels
+
     def create_weights(
         self,
         layer: torch.nn.Module,
@@ -97,20 +110,25 @@ class UnquantizedFusedMoEMethod(FusedMoEMethodBase, CustomOp):
         **extra_weight_attrs,
     ):
         # Fused gate_up_proj (column parallel)
+        w13_weight_n, w13_weight_k = 2 * intermediate_size, hidden_size
+        if self.use_triton_kernels:
+            w13_weight_n, w13_weight_k = w13_weight_k, w13_weight_n
         w13_weight = torch.nn.Parameter(
-            torch.empty(
-                num_experts, 2 * intermediate_size, hidden_size, dtype=params_dtype
-            ),
+            torch.empty(num_experts, w13_weight_n, w13_weight_k, dtype=params_dtype),
             requires_grad=False,
         )
         layer.register_parameter("w13_weight", w13_weight)
         set_weight_attrs(w13_weight, extra_weight_attrs)
 
         # down_proj (row parallel)
+        w2_weight_n, w2_weight_k = (
+            hidden_size,
+            intermediate_size,
+        )
+        if self.use_triton_kernels:
+            w2_weight_n, w2_weight_k = w2_weight_k, w2_weight_n
         w2_weight = torch.nn.Parameter(
-            torch.empty(
-                num_experts, hidden_size, intermediate_size, dtype=params_dtype
-            ),
+            torch.empty(num_experts, w2_weight_n, w2_weight_k, dtype=params_dtype),
            requires_grad=False,
         )
         layer.register_parameter("w2_weight", w2_weight)
@@ -192,59 +210,72 @@ class UnquantizedFusedMoEMethod(FusedMoEMethodBase, CustomOp):
         no_combine: bool = False,
         routed_scaling_factor: Optional[float] = None,
     ) -> torch.Tensor:
-        topk_weights, topk_ids = select_experts(
-            hidden_states=x,
-            router_logits=router_logits,
-            use_grouped_topk=use_grouped_topk,
-            top_k=top_k,
-            renormalize=renormalize,
-            topk_group=topk_group,
-            num_expert_group=num_expert_group,
-            num_fused_shared_experts=num_fused_shared_experts,
-            custom_routing_function=custom_routing_function,
-            correction_bias=correction_bias,
-            routed_scaling_factor=routed_scaling_factor,
-        )
 
-        if _use_aiter:
-            assert not no_combine, "unsupported"
-            if apply_router_weight_on_input:
-                assert (
-                    topk_weights.dim() == 2
-                ), "`topk_weights` should be in shape (num_tokens, topk)"
-                _, topk = topk_weights.shape
-                assert (
-                    topk == 1
-                ), "Only support topk=1 when `apply_router_weight_on_input` is True"
-                x = x * topk_weights.to(x.dtype)
-                topk_weights = torch.ones_like(
-                    topk_weights, dtype=torch.float32
-                )  # topk_weights must be FP32 (float32)
-
-            return fused_moe(
-                x,
-                layer.w13_weight,
-                layer.w2_weight,
-                topk_weights,
-                topk_ids,
-                activation=(
-                    ActivationType.Silu if activation == "silu" else ActivationType.Gelu
-                ),
-            )
-        else:
-            return fused_experts(
+        if self.use_triton_kernels:
+            return triton_kernel_moe_forward(
                 hidden_states=x,
                 w1=layer.w13_weight,
                 w2=layer.w2_weight,
-                topk_weights=topk_weights,
-                topk_ids=topk_ids,
-                inplace=inplace and not no_combine,
-                activation=activation,
-                apply_router_weight_on_input=apply_router_weight_on_input,
-                no_combine=no_combine,
+                gating_output=router_logits,
+                topk=top_k,
+                renormalize=renormalize,
+            )
+        else:
+            topk_weights, topk_ids = select_experts(
+                hidden_states=x,
+                router_logits=router_logits,
+                use_grouped_topk=use_grouped_topk,
+                top_k=top_k,
+                renormalize=renormalize,
+                topk_group=topk_group,
+                num_expert_group=num_expert_group,
+                num_fused_shared_experts=num_fused_shared_experts,
+                custom_routing_function=custom_routing_function,
+                correction_bias=correction_bias,
                 routed_scaling_factor=routed_scaling_factor,
             )
 
+            if _use_aiter:
+                assert not no_combine, "unsupported"
+                if apply_router_weight_on_input:
+                    assert (
+                        topk_weights.dim() == 2
+                    ), "`topk_weights` should be in shape (num_tokens, topk)"
+                    _, topk = topk_weights.shape
+                    assert (
+                        topk == 1
+                    ), "Only support topk=1 when `apply_router_weight_on_input` is True"
+                    x = x * topk_weights.to(x.dtype)
+                    topk_weights = torch.ones_like(
+                        topk_weights, dtype=torch.float32
+                    )  # topk_weights must be FP32 (float32)
+
+                return fused_moe(
+                    x,
+                    layer.w13_weight,
+                    layer.w2_weight,
+                    topk_weights,
+                    topk_ids,
+                    activation=(
+                        ActivationType.Silu
+                        if activation == "silu"
+                        else ActivationType.Gelu
+                    ),
+                )
+            else:
+                return fused_experts(
+                    hidden_states=x,
+                    w1=layer.w13_weight,
+                    w2=layer.w2_weight,
+                    topk_weights=topk_weights,
+                    topk_ids=topk_ids,
+                    inplace=inplace and not no_combine,
+                    activation=activation,
+                    apply_router_weight_on_input=apply_router_weight_on_input,
+                    no_combine=no_combine,
+                    routed_scaling_factor=routed_scaling_factor,
+                )
+
     def forward_cpu(
         self,
         layer: torch.nn.Module,
@@ -286,9 +317,7 @@ class UnquantizedFusedMoEMethod(FusedMoEMethodBase, CustomOp):
             x,
             layer.w13_weight,
            layer.w2_weight,
-            topk_weights.to(
-                torch.float
-            ),  # TODO: the topk_weights of llama4 is computed via Llama4MoE:custom_routing_function and is bfloat16 while the kernel requires it to be float32
+            topk_weights,
             topk_ids,
            False,  # inplace # See [Note] inplace should be False in fused_experts.
             False,  # use_int8_w8a8
@@ -475,9 +504,13 @@ class FusedMoE(torch.nn.Module):
         self.inplace = inplace
         self.no_combine = no_combine
 
+        self.use_triton_kernels = (
+            not _is_cpu and global_server_args_dict["enable_triton_kernel_moe"]
+        )
+
         if quant_config is None:
-            self.quant_method: Optional[QuantizeMethodBase] = (
-                UnquantizedFusedMoEMethod()
-            )
+            self.quant_method: Optional[QuantizeMethodBase] = UnquantizedFusedMoEMethod(
+                self.use_triton_kernels
+            )
         else:
             self.quant_method = quant_config.get_quant_method(self, prefix)
@@ -485,6 +518,7 @@ class FusedMoE(torch.nn.Module):
             self.quant_method.enable_flashinfer_moe = self.enable_flashinfer_moe
         assert self.quant_method is not None
 
+        self.quant_config = quant_config
         self.quant_method.create_weights(
             layer=self,
             num_experts=self.local_num_experts,
@@ -597,6 +631,8 @@ class FusedMoE(torch.nn.Module):
             )
         else:
             if not self.use_presharded_weights:
+                if self.use_triton_kernels:
+                    loaded_weight = loaded_weight.transpose(-2, -1)
                 loaded_weight = loaded_weight.narrow(
                     shard_dim, shard_size * tp_rank, shard_size
                 )
@@ -612,6 +648,31 @@ class FusedMoE(torch.nn.Module):
         loaded_weight: torch.tensor,
         tp_rank: int,
     ):
+        """Load w2 weights for down projection.
+
+        Args:
+            expert_data: The expert data tensor to load into
+            shard_dim: The dimension to shard along
+            shard_id: The shard ID (must be "w2")
+            loaded_weight: The weight tensor to load from
+            tp_rank: The tensor parallel rank
+        """
+        if not isinstance(expert_data, torch.Tensor) or not isinstance(
+            loaded_weight, torch.Tensor
+        ):
+            raise ValueError("expert_data and loaded_weight must be torch.Tensor")
+
+        if (
+            self.quant_config is not None
+            and "modelopt" in self.quant_config.get_name()
+            and (expert_data.dim() != 2 or loaded_weight.dim() != 2)
+        ):
+            raise ValueError(
+                f"Expected 2D tensors, got expert_data shape {expert_data.shape} and loaded_weight shape {loaded_weight.shape}"
+            )
+
+        if shard_id != "w2":
+            raise ValueError(f"shard_id must be 'w2', got {shard_id}")
 
         # Index the loaded weight for tp sharding.
         # down_proj: "RowParallel" so tp sharding on input_dim
@@ -630,6 +691,12 @@ class FusedMoE(torch.nn.Module):
             )
         else:
             if not self.use_presharded_weights:
+                if self.use_triton_kernels:
+                    loaded_weight = loaded_weight.transpose(-2, -1)
+                if shard_size * tp_rank + shard_size > loaded_weight.shape[shard_dim]:
+                    raise ValueError(
+                        f"Shard size {shard_size} at rank {tp_rank} exceeds loaded_weight dimension {loaded_weight.shape[shard_dim]}"
+                    )
                 loaded_weight = loaded_weight.narrow(
                     shard_dim, shard_size * tp_rank, shard_size
                 )
@@ -716,6 +783,8 @@ class FusedMoE(torch.nn.Module):
         # should be whatever dimension intermediate_size is
         is_transposed = getattr(param, "is_transposed", False)
         shard_dim = SHARD_ID_TO_SHARDED_DIM[shard_id]
+        if self.use_triton_kernels:
+            is_transposed = True
         if is_transposed:
             shard_dim = int(not shard_dim)
 
@@ -754,8 +823,21 @@ class FusedMoE(torch.nn.Module):
                 tp_rank=tp_rank,
             )
             return
+
         if "ModelOpt" in self.quant_method.__class__.__name__:
-            if "weight_scale_2" in weight_name or "input_scale" in weight_name:
+            # Determine per-tensor weight scale patterns based on variant
+            is_fp4_variant = (
+                "ModelOptNvFp4FusedMoEMethod" in self.quant_method.__class__.__name__
+            )
+
+            # FP4 uses "weight_scale_2" for per-tensor, FP8 uses "weight_scale" for per-tensor
+            per_tensor_conditions = (
+                "weight_scale_2" in weight_name
+                if is_fp4_variant
+                else "weight_scale" in weight_name
+            ) or "input_scale" in weight_name
+
+            if per_tensor_conditions:
                 self._load_per_tensor_weight_scale(
                     shard_id=shard_id,
                     param=param,
@@ -773,7 +855,7 @@ class FusedMoE(torch.nn.Module):
             return
 
         # Case weight scales and zero_points
-        if "scale" in weight_name or "zero" in weight_name:
+        if "scale" in weight_name or "zero" in weight_name or "offset" in weight_name:
             # load the weight scales and zp based on the quantization scheme
             # supported weight scales/zp can be found in
             # FusedMoeWeightScaleSupported
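For orientation, the functional core of the layer.py change is a layout swap: when the optional triton_kernels backend is enabled, the expert weights are allocated with their last two dimensions transposed so they match what the external kernels expect. A minimal sketch of that selection logic, with made-up sizes and a plain boolean standing in for the `global_server_args_dict["enable_triton_kernel_moe"]` lookup:

```python
import torch

# Sketch only: mirrors the new create_weights() shape selection above.
# Sizes are made-up example values.
num_experts, intermediate_size, hidden_size = 8, 1408, 2048
use_triton_kernels = True

# Default layout: w13 is [E, 2 * intermediate, hidden]; with triton kernels
# the last two dims are swapped to [E, hidden, 2 * intermediate].
w13_n, w13_k = 2 * intermediate_size, hidden_size
if use_triton_kernels:
    w13_n, w13_k = w13_k, w13_n

# Default layout: w2 is [E, hidden, intermediate]; swapped to [E, intermediate, hidden].
w2_n, w2_k = hidden_size, intermediate_size
if use_triton_kernels:
    w2_n, w2_k = w2_k, w2_n

w13_weight = torch.empty(num_experts, w13_n, w13_k, dtype=torch.bfloat16)
w2_weight = torch.empty(num_experts, w2_n, w2_k, dtype=torch.bfloat16)
print(w13_weight.shape, w2_weight.shape)
# torch.Size([8, 2048, 2816]) torch.Size([8, 1408, 2048])
```

The weight loaders above compensate for the same swap by transposing checkpoint shards and flipping `is_transposed` when the triton path is active.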
|
@@ -0,0 +1,176 @@
|
|
1
|
+
# Adapted from https://github.com/vllm-project/vllm/pull/18595/files#diff-f426a6de78c82ffec568eff6811bfbf0043dab5f87f1a8c0cffdbdcb8a81e035
|
2
|
+
from typing import Optional
|
3
|
+
|
4
|
+
import torch
|
5
|
+
from sgl_kernel import gelu_and_mul, silu_and_mul
|
6
|
+
from triton_kernels.matmul_ogs import matmul_ogs
|
7
|
+
from triton_kernels.routing import GatherIndx, RoutingData, ScatterIndx, routing
|
8
|
+
|
9
|
+
from sglang.srt.utils import direct_register_custom_op
|
10
|
+
|
11
|
+
|
12
|
+
def triton_kernel_moe_forward(
|
13
|
+
hidden_states: torch.Tensor,
|
14
|
+
w1: torch.Tensor,
|
15
|
+
w2: torch.Tensor,
|
16
|
+
gating_output: torch.Tensor,
|
17
|
+
topk: int,
|
18
|
+
renormalize: bool,
|
19
|
+
inplace: bool = False,
|
20
|
+
activation: str = "silu",
|
21
|
+
apply_router_weight_on_input: bool = False,
|
22
|
+
use_fp8_w8a8: bool = False,
|
23
|
+
per_channel_quant: bool = False,
|
24
|
+
global_num_experts: int = -1,
|
25
|
+
expert_map: Optional[torch.Tensor] = None,
|
26
|
+
w1_scale: Optional[torch.Tensor] = None,
|
27
|
+
w2_scale: Optional[torch.Tensor] = None,
|
28
|
+
a1_scale: Optional[torch.Tensor] = None,
|
29
|
+
a2_scale: Optional[torch.Tensor] = None,
|
30
|
+
block_shape: Optional[list[int]] = None,
|
31
|
+
) -> torch.Tensor:
|
32
|
+
|
33
|
+
if not renormalize:
|
34
|
+
gating_output = torch.softmax(gating_output, dim=-1)
|
35
|
+
routing_data, gather_idx, scatter_idx = routing(gating_output, topk, renormalize)
|
36
|
+
|
37
|
+
return triton_kernel_fused_experts(
|
38
|
+
hidden_states,
|
39
|
+
w1,
|
40
|
+
w2,
|
41
|
+
routing_data,
|
42
|
+
gather_idx,
|
43
|
+
scatter_idx,
|
44
|
+
inplace=inplace,
|
45
|
+
activation=activation,
|
46
|
+
apply_router_weight_on_input=apply_router_weight_on_input,
|
47
|
+
use_fp8_w8a8=use_fp8_w8a8,
|
48
|
+
per_channel_quant=per_channel_quant,
|
49
|
+
global_num_experts=global_num_experts,
|
50
|
+
expert_map=expert_map,
|
51
|
+
w1_scale=w1_scale,
|
52
|
+
w2_scale=w2_scale,
|
53
|
+
a1_scale=a1_scale,
|
54
|
+
a2_scale=a2_scale,
|
55
|
+
block_shape=block_shape,
|
56
|
+
)
|
57
|
+
|
58
|
+
|
59
|
+
# This is a triton implementation of the fused_experts function
|
60
|
+
def triton_kernel_fused_experts(
|
61
|
+
hidden_states: torch.Tensor,
|
62
|
+
w1: torch.Tensor,
|
63
|
+
w2: torch.Tensor,
|
64
|
+
routing_data: RoutingData,
|
65
|
+
gather_indx: GatherIndx,
|
66
|
+
scatter_indx: ScatterIndx,
|
67
|
+
inplace: bool = False,
|
68
|
+
activation: str = "silu",
|
69
|
+
apply_router_weight_on_input: bool = False,
|
70
|
+
use_fp8_w8a8: bool = False,
|
71
|
+
per_channel_quant: bool = False,
|
72
|
+
global_num_experts: int = -1,
|
73
|
+
expert_map: Optional[torch.Tensor] = None,
|
74
|
+
w1_scale: Optional[torch.Tensor] = None,
|
75
|
+
w2_scale: Optional[torch.Tensor] = None,
|
76
|
+
a1_scale: Optional[torch.Tensor] = None,
|
77
|
+
a2_scale: Optional[torch.Tensor] = None,
|
78
|
+
block_shape: Optional[list[int]] = None,
|
79
|
+
) -> torch.Tensor:
|
80
|
+
|
81
|
+
assert use_fp8_w8a8 == False, "use_fp8_w8a8 is not supported"
|
82
|
+
assert per_channel_quant == False, "per_channel_quant is not supported"
|
83
|
+
assert expert_map == None, "expert_map is not supported"
|
84
|
+
assert w1_scale == None, "w1_scale is not supported"
|
85
|
+
assert w2_scale == None, "w2_scale is not supported"
|
86
|
+
assert a1_scale == None, "a1_scale is not supported"
|
87
|
+
assert a2_scale == None, "a2_scale is not supported"
|
88
|
+
assert block_shape == None, "block_shape is not supported"
|
89
|
+
|
90
|
+
# type check
|
91
|
+
assert hidden_states.dtype == torch.bfloat16, "hidden_states must be bfloat16"
|
92
|
+
assert w1.dtype == torch.bfloat16, "w1 must be bfloat16"
|
93
|
+
assert w2.dtype == torch.bfloat16, "w2 must be bfloat16"
|
94
|
+
|
95
|
+
# Shape check
|
96
|
+
assert hidden_states.ndim == 2, "hidden_states must be 2D"
|
97
|
+
assert (
|
98
|
+
hidden_states.shape[-1] == w1.shape[-2]
|
99
|
+
), f"hidden_states shape[-1] {hidden_states.shape} must be equal to w1 shape[-2] {w1.shape}"
|
100
|
+
assert (
|
101
|
+
w2.shape[-1] == w1.shape[1]
|
102
|
+
), f"w2 shape[-1] {w2.shape[-1]} must be equal to w1 shape[1] {w1.shape[1]}"
|
103
|
+
|
104
|
+
# feature check
|
105
|
+
assert inplace == False, "Inplace is not supported in new triton MoE kernel"
|
106
|
+
|
107
|
+
M, K = hidden_states.shape
|
108
|
+
E, _, N = w1.shape
|
109
|
+
n_expts_act = routing_data.n_expts_act
|
110
|
+
dtype = hidden_states.dtype
|
111
|
+
|
112
|
+
if global_num_experts == -1:
|
113
|
+
global_num_experts = E
|
114
|
+
|
115
|
+
# consistent with default implementation
|
116
|
+
intermediate_cache2 = torch.empty(
|
117
|
+
(M * n_expts_act, N // 2), device="cuda", dtype=dtype
|
118
|
+
)
|
119
|
+
|
120
|
+
intermediate_cache1 = matmul_ogs(
|
121
|
+
hidden_states,
|
122
|
+
w1,
|
123
|
+
None,
|
124
|
+
routing_data,
|
125
|
+
gather_indx=gather_indx,
|
126
|
+
gammas=routing_data.gate_scal if apply_router_weight_on_input else None,
|
127
|
+
)
|
128
|
+
|
129
|
+
if activation == "silu":
|
130
|
+
silu_and_mul(intermediate_cache1.view(-1, N), intermediate_cache2)
|
131
|
+
elif activation == "gelu":
|
132
|
+
gelu_and_mul(intermediate_cache1.view(-1, N), intermediate_cache2)
|
133
|
+
else:
|
134
|
+
raise ValueError(f"Unsupported FusedMoe activation: {activation}")
|
135
|
+
|
136
|
+
intermediate_cache3 = matmul_ogs(
|
137
|
+
intermediate_cache2,
|
138
|
+
w2,
|
139
|
+
None,
|
140
|
+
routing_data,
|
141
|
+
scatter_indx=scatter_indx,
|
142
|
+
gammas=None if apply_router_weight_on_input else routing_data.gate_scal,
|
143
|
+
)
|
144
|
+
|
145
|
+
return intermediate_cache3
|
146
|
+
|
147
|
+
|
148
|
+
def triton_kernel_moe_forward_fake(
|
149
|
+
hidden_states: torch.Tensor,
|
150
|
+
w1: torch.Tensor,
|
151
|
+
w2: torch.Tensor,
|
152
|
+
gating_output: torch.Tensor,
|
153
|
+
topk: int,
|
154
|
+
renormalize: bool,
|
155
|
+
inplace: bool = False,
|
156
|
+
activation: str = "silu",
|
157
|
+
apply_router_weight_on_input: bool = False,
|
158
|
+
use_fp8_w8a8: bool = False,
|
159
|
+
per_channel_quant: bool = False,
|
160
|
+
global_num_experts: int = -1,
|
161
|
+
expert_map: Optional[torch.Tensor] = None,
|
162
|
+
w1_scale: Optional[torch.Tensor] = None,
|
163
|
+
w2_scale: Optional[torch.Tensor] = None,
|
164
|
+
a1_scale: Optional[torch.Tensor] = None,
|
165
|
+
a2_scale: Optional[torch.Tensor] = None,
|
166
|
+
block_shape: Optional[list[int]] = None,
|
167
|
+
) -> torch.Tensor:
|
168
|
+
return torch.empty_like(hidden_states)
|
169
|
+
|
170
|
+
|
171
|
+
direct_register_custom_op(
|
172
|
+
op_name="forward_cuda_triton",
|
173
|
+
op_func=triton_kernel_moe_forward,
|
174
|
+
mutates_args=[],
|
175
|
+
fake_impl=triton_kernel_moe_forward_fake,
|
176
|
+
)
|
sglang/srt/layers/moe/topk.py
CHANGED
@@ -83,13 +83,18 @@ def fused_topk_cpu(
     gating_output: torch.Tensor,
     topk: int,
     renormalize: bool,
+    num_token_non_padded: Optional[torch.Tensor] = None,
+    expert_location_dispatch_info: Optional[ExpertLocationDispatchInfo] = None,
 ):
-    return torch.ops.sgl_kernel.topk_softmax_cpu(
+    topk_weights, topk_ids = torch.ops.sgl_kernel.topk_softmax_cpu(
         hidden_states=hidden_states,
         gating_output=gating_output,
         topk=topk,
         renormalize=renormalize,
     )
+    topk_ids = topk_ids_logical_to_physical(topk_ids, expert_location_dispatch_info)
+    _mask_topk_ids_padded_region(topk_ids, num_token_non_padded)
+    return topk_weights, topk_ids
 
 
 def fused_topk(
@@ -303,7 +308,7 @@ def biased_grouped_topk_gpu(
     renormalize: bool,
     num_expert_group: int = 0,
     topk_group: int = 0,
-    compiled: bool = True,
+    compiled: bool = not _is_npu,
     num_fused_shared_experts: int = 0,
     routed_scaling_factor: Optional[float] = None,
     num_token_non_padded: Optional[torch.Tensor] = None,
@@ -411,6 +416,7 @@ if _is_cpu and _is_cpu_amx_available:
     biased_grouped_topk = biased_grouped_topk_cpu
     grouped_topk = grouped_topk_cpu
     fused_topk_native = fused_topk_cpu
+    fused_topk = fused_topk_cpu
 else:
     biased_grouped_topk = biased_grouped_topk_gpu
     grouped_topk = grouped_topk_gpu
sglang/srt/layers/parameter.py
CHANGED
@@ -187,10 +187,26 @@ class _ColumnvLLMParameter(BasevLLMParameter):
         param_data = self.data
         shard_id = tp_rank if shard_id == "q" else tp_rank // num_heads
         param_data = param_data.narrow(self.output_dim, shard_offset, shard_size)
-        if not use_presharded_weights:
-            loaded_weight = loaded_weight.narrow(
-                self.output_dim, shard_id * shard_size, shard_size
+
+        if _is_cpu:
+            from sglang.srt.model_loader.weight_utils import (
+                narrow_padded_param_and_loaded_weight,
+            )
+
+            param_data, loaded_weight = narrow_padded_param_and_loaded_weight(
+                param_data,
+                loaded_weight,
+                0,  # param_data_start
+                shard_id * shard_size,
+                self.output_dim,
+                shard_size,
+                not use_presharded_weights,
             )
+        else:
+            if not use_presharded_weights:
+                loaded_weight = loaded_weight.narrow(
+                    self.output_dim, shard_id * shard_size, shard_size
+                )
 
         assert (
             param_data.shape == loaded_weight.shape
sglang/srt/layers/quantization/__init__.py
CHANGED
@@ -68,6 +68,7 @@ from sglang.srt.layers.quantization.modelopt_quant import (
 )
 from sglang.srt.layers.quantization.moe_wna16 import MoeWNA16Config
 from sglang.srt.layers.quantization.qoq import QoQConfig
+from sglang.srt.layers.quantization.w4afp8 import W4AFp8Config
 from sglang.srt.layers.quantization.w8a8_fp8 import W8A8Fp8Config
 from sglang.srt.layers.quantization.w8a8_int8 import W8A8Int8Config
 
@@ -82,6 +83,7 @@ BASE_QUANTIZATION_METHODS: Dict[str, Type[QuantizationConfig]] = {
     "moe_wna16": MoeWNA16Config,
     "compressed-tensors": CompressedTensorsConfig,
     "qoq": QoQConfig,
+    "w4afp8": W4AFp8Config,
 }
 
 # VLLM-dependent quantization methods
sglang/srt/layers/quantization/fp8.py
CHANGED
@@ -1,7 +1,7 @@
 # Adapted from https://github.com/vllm-project/vllm/blob/v0.6.4.post1/vllm/model_executor/layers/quantization/fp8.py
 
 import logging
-from typing import Any, Callable, Dict, List, Optional
+from typing import Any, Callable, Dict, List, Optional, Union
 
 import torch
 import torch.nn.functional as F
@@ -88,7 +88,7 @@ _is_fp8_fnuz = is_fp8_fnuz()
 _use_hip_int4 = get_bool_env_var("SGLANG_INT4_WEIGHT")
 _use_aiter = get_bool_env_var("SGLANG_USE_AITER") and _is_hip
 
-if _is_hip:
+if _is_hip and (_use_aiter or _use_hip_int4):
     from aiter import ActivationType, QuantType
     from aiter.fused_moe import fused_moe
     from aiter.fused_moe_bf16_asm import asm_moe, ck_moe_2stages
@@ -200,7 +200,7 @@ class Fp8LinearMethod(LinearMethodBase):
         quant_config: The quantization config.
     """
 
-    def __init__(self, quant_config: Fp8Config):
+    def __init__(self, quant_config: Union["Fp8Config", "W4AFp8Config"]):
         self.quant_config = quant_config
         self.cutlass_fp8_supported = cutlass_fp8_supported()
 
@@ -286,7 +286,10 @@ class Fp8LinearMethod(LinearMethodBase):
         if self.quant_config.is_checkpoint_fp8_serialized:
             # WEIGHT SCALE
             if self.block_quant:
-                assert self.quant_config.activation_scheme == "dynamic"
+                if hasattr(self.quant_config, "activation_scheme"):
+                    assert self.quant_config.activation_scheme == "dynamic"
+                elif hasattr(self.quant_config, "linear_activation_scheme"):
+                    assert self.quant_config.linear_activation_scheme == "dynamic"
                 scale = BlockQuantScaleParameter(
                     data=torch.empty(
                         (output_size_per_partition + block_n - 1) // block_n,
@@ -308,7 +311,13 @@ class Fp8LinearMethod(LinearMethodBase):
             layer.register_parameter("weight_scale", scale)
 
             # INPUT ACTIVATION SCALE
-            if self.quant_config.activation_scheme == "static":
+            if (
+                hasattr(self.quant_config, "activation_scheme")
+                and self.quant_config.activation_scheme == "static"
+            ) or (
+                hasattr(self.quant_config, "linear_activation_scheme")
+                and self.quant_config.linear_activation_scheme == "static"
+            ):
                 scale = PerTensorScaleParameter(
                     data=torch.empty(len(output_partition_sizes), dtype=torch.float32),
                     weight_loader=weight_loader,
@@ -371,7 +380,13 @@ class Fp8LinearMethod(LinearMethodBase):
             layer.weight_scale = torch.nn.Parameter(
                 layer.weight_scale.data, requires_grad=False
             )
-            if self.quant_config.activation_scheme == "static":
+            if (
+                hasattr(self.quant_config, "activation_scheme")
+                and self.quant_config.activation_scheme == "static"
+            ) or (
+                hasattr(self.quant_config, "linear_activation_scheme")
+                and self.quant_config.linear_activation_scheme == "static"
+            ):
                 layer.input_scale = torch.nn.Parameter(
                     layer.input_scale.data, requires_grad=False
                 )
@@ -405,7 +420,13 @@ class Fp8LinearMethod(LinearMethodBase):
             # Update layer with new values.
             layer.weight = Parameter(weight.t(), requires_grad=False)
             layer.weight_scale = Parameter(weight_scale, requires_grad=False)
-            if self.quant_config.activation_scheme == "static":
+            if (
+                hasattr(self.quant_config, "activation_scheme")
+                and self.quant_config.activation_scheme == "static"
+            ) or (
+                hasattr(self.quant_config, "linear_activation_scheme")
+                and self.quant_config.linear_activation_scheme == "static"
+            ):
                 layer.input_scale = Parameter(
                     layer.input_scale.max(), requires_grad=False
                 )
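The condition repeated three times in fp8.py probes two attribute names because Fp8Config exposes activation_scheme while the new W4AFp8 config presumably names its linear-layer scheme linear_activation_scheme. A small sketch of an equivalent check; the helper name and the SimpleNamespace stand-ins are made up for illustration:

```python
from types import SimpleNamespace


def is_static_activation(quant_config) -> bool:
    # Equivalent to the inline hasattr(...) and ... == "static" checks above:
    # true if either attribute exists and is set to "static".
    return (
        getattr(quant_config, "activation_scheme", None) == "static"
        or getattr(quant_config, "linear_activation_scheme", None) == "static"
    )


print(is_static_activation(SimpleNamespace(activation_scheme="dynamic")))        # False
print(is_static_activation(SimpleNamespace(linear_activation_scheme="static")))  # True
```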
sglang/srt/layers/quantization/fp8_kernel.py
CHANGED
@@ -160,8 +160,8 @@ def _per_token_group_quant_fp8_colmajor(
     """
     # Map the program id to the row of X and Y it should compute.
     g_id = tl.program_id(0)
-    y_ptr += g_id * group_size
-    y_q_ptr += g_id * group_size
+    y_ptr += g_id.to(tl.int64) * group_size
+    y_q_ptr += g_id.to(tl.int64) * group_size
 
     # Convert g_id the flattened block coordinate to 2D so we can index
     # into the output y_scales matrix
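The fp8_kernel.py change is a 64-bit indexing fix: inside the Triton kernel the per-group element offset g_id * group_size is derived from a 32-bit program id, so for very large inputs the product can exceed the int32 range and wrap unless it is widened first. Rough arithmetic on where that would happen, assuming a typical group_size of 128 (an assumption, not a value taken from the diff):

```python
# Back-of-the-envelope only; group_size = 128 is an assumed typical value.
group_size = 128
int32_max = 2**31 - 1

# Smallest group id whose element offset no longer fits in a signed 32-bit int.
first_bad_group = int32_max // group_size + 1
print(first_bad_group)                   # 16777216
print(first_bad_group * group_size)      # 2147483648 elements into y_ptr
```

Casting g_id to tl.int64 before the multiplication keeps the pointer arithmetic correct well past that point.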