sglang 0.4.1.post7__py3-none-any.whl → 0.4.2.post1__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (78)
  1. sglang/bench_offline_throughput.py +17 -11
  2. sglang/bench_one_batch.py +14 -6
  3. sglang/bench_serving.py +47 -44
  4. sglang/lang/chat_template.py +31 -0
  5. sglang/srt/configs/load_config.py +1 -0
  6. sglang/srt/distributed/device_communicators/custom_all_reduce.py +5 -2
  7. sglang/srt/entrypoints/engine.py +5 -2
  8. sglang/srt/entrypoints/http_server.py +24 -0
  9. sglang/srt/function_call_parser.py +494 -0
  10. sglang/srt/layers/activation.py +5 -5
  11. sglang/srt/layers/attention/triton_ops/prefill_attention.py +6 -0
  12. sglang/srt/layers/attention/vision.py +243 -40
  13. sglang/srt/layers/dp_attention.py +3 -1
  14. sglang/srt/layers/layernorm.py +5 -5
  15. sglang/srt/layers/linear.py +24 -9
  16. sglang/srt/layers/logits_processor.py +1 -1
  17. sglang/srt/layers/moe/ep_moe/layer.py +20 -12
  18. sglang/srt/layers/moe/fused_moe_native.py +17 -3
  19. sglang/srt/layers/moe/fused_moe_triton/configs/E=256,N=256,device_name=AMD_Instinct_MI300X,dtype=fp8_w8a8,block_shape=[128, 128].json +164 -0
  20. sglang/srt/layers/moe/fused_moe_triton/fused_moe.py +18 -1
  21. sglang/srt/layers/moe/fused_moe_triton/layer.py +9 -0
  22. sglang/srt/layers/parameter.py +16 -7
  23. sglang/srt/layers/quantization/configs/N=1536,K=7168,device_name=AMD_Instinct_MI300X,dtype=fp8_w8a8,block_shape=[128, 128].json +164 -0
  24. sglang/srt/layers/quantization/configs/N=3072,K=1536,device_name=AMD_Instinct_MI300X,dtype=fp8_w8a8,block_shape=[128, 128].json +164 -0
  25. sglang/srt/layers/quantization/configs/N=4096,K=512,device_name=AMD_Instinct_MI300X,dtype=fp8_w8a8,block_shape=[128, 128].json +164 -0
  26. sglang/srt/layers/quantization/configs/N=4608,K=7168,device_name=AMD_Instinct_MI300X,dtype=fp8_w8a8,block_shape=[128, 128].json +164 -0
  27. sglang/srt/layers/quantization/configs/N=512,K=7168,device_name=AMD_Instinct_MI300X,dtype=fp8_w8a8,block_shape=[128, 128].json +164 -0
  28. sglang/srt/layers/quantization/configs/N=576,K=7168,device_name=AMD_Instinct_MI300X,dtype=fp8_w8a8,block_shape=[128, 128].json +164 -0
  29. sglang/srt/layers/quantization/configs/N=7168,K=2048,device_name=AMD_Instinct_MI300X,dtype=fp8_w8a8,block_shape=[128, 128].json +164 -0
  30. sglang/srt/layers/quantization/configs/N=7168,K=2304,device_name=AMD_Instinct_MI300X,dtype=fp8_w8a8,block_shape=[128, 128].json +164 -0
  31. sglang/srt/layers/quantization/configs/N=7168,K=256,device_name=AMD_Instinct_MI300X,dtype=fp8_w8a8,block_shape=[128, 128].json +164 -0
  32. sglang/srt/layers/quantization/fp8.py +11 -1
  33. sglang/srt/layers/rotary_embedding.py +34 -13
  34. sglang/srt/layers/sampler.py +33 -10
  35. sglang/srt/layers/torchao_utils.py +12 -6
  36. sglang/srt/managers/detokenizer_manager.py +1 -0
  37. sglang/srt/managers/image_processor.py +77 -38
  38. sglang/srt/managers/io_struct.py +36 -5
  39. sglang/srt/managers/schedule_batch.py +31 -25
  40. sglang/srt/managers/scheduler.py +78 -38
  41. sglang/srt/managers/tokenizer_manager.py +4 -0
  42. sglang/srt/mem_cache/base_prefix_cache.py +4 -0
  43. sglang/srt/mem_cache/chunk_cache.py +3 -0
  44. sglang/srt/mem_cache/radix_cache.py +30 -1
  45. sglang/srt/model_executor/cuda_graph_runner.py +23 -25
  46. sglang/srt/model_executor/forward_batch_info.py +5 -7
  47. sglang/srt/model_executor/model_runner.py +7 -4
  48. sglang/srt/model_loader/loader.py +75 -0
  49. sglang/srt/model_loader/weight_utils.py +91 -5
  50. sglang/srt/models/commandr.py +14 -2
  51. sglang/srt/models/dbrx.py +9 -1
  52. sglang/srt/models/deepseek_v2.py +3 -3
  53. sglang/srt/models/gemma2.py +9 -1
  54. sglang/srt/models/grok.py +1 -0
  55. sglang/srt/models/minicpm3.py +3 -3
  56. sglang/srt/models/minicpmv.py +129 -76
  57. sglang/srt/models/mllama.py +16 -56
  58. sglang/srt/models/qwen2.py +4 -1
  59. sglang/srt/models/qwen2_vl.py +18 -8
  60. sglang/srt/models/torch_native_llama.py +17 -4
  61. sglang/srt/openai_api/adapter.py +139 -37
  62. sglang/srt/openai_api/protocol.py +5 -4
  63. sglang/srt/sampling/penaltylib/penalizers/repetition_penalty.py +11 -14
  64. sglang/srt/sampling/sampling_batch_info.py +4 -14
  65. sglang/srt/server.py +2 -2
  66. sglang/srt/server_args.py +26 -1
  67. sglang/srt/speculative/eagle_utils.py +37 -15
  68. sglang/srt/speculative/eagle_worker.py +11 -13
  69. sglang/srt/utils.py +62 -67
  70. sglang/test/test_programs.py +1 -0
  71. sglang/test/test_utils.py +81 -22
  72. sglang/utils.py +42 -0
  73. sglang/version.py +1 -1
  74. {sglang-0.4.1.post7.dist-info → sglang-0.4.2.post1.dist-info}/METADATA +8 -8
  75. {sglang-0.4.1.post7.dist-info → sglang-0.4.2.post1.dist-info}/RECORD +78 -67
  76. {sglang-0.4.1.post7.dist-info → sglang-0.4.2.post1.dist-info}/LICENSE +0 -0
  77. {sglang-0.4.1.post7.dist-info → sglang-0.4.2.post1.dist-info}/WHEEL +0 -0
  78. {sglang-0.4.1.post7.dist-info → sglang-0.4.2.post1.dist-info}/top_level.txt +0 -0
@@ -4,6 +4,7 @@ from typing import Optional
 
 import torch
 import torch.nn as nn
+import torch.nn.functional as F
 from einops import rearrange, repeat
 
 from sglang.srt.distributed import parallel_state
@@ -63,7 +64,20 @@ def apply_rotary_pos_emb_vision(t: torch.Tensor, freqs: torch.Tensor) -> torch.T
 
 
 class VisionAttention(nn.Module):
-    """Multi-headed attention without any cache, mostly used for ViT."""
+    r"""
+    Multi-headed attention without any cache, mostly used for ViT.
+
+
+    Args:
+        use_qkv_parallel (bool, optional): If True, use QKV-parallel attention.
+        use_context_forward (bool, default to True):
+            if ``True``, a flash_attn style attention will be applied
+            Otherwise, a full-sequence attention will be applied.
+        use_full_precision_softmax (bool, default to False):
+            if ``True``, the softmax will be performed in full-precision
+            Otherwise, it will be performed in half-precision
+
+    """
 
     def __init__(
         self,
@@ -72,25 +86,39 @@ class VisionAttention(nn.Module):
         projection_size: int,
         use_qkv_parallel: bool,
         quant_config: Optional[QuantizationConfig] = None,
+        dropout: float = 0.0,
+        use_context_forward: bool = True,
+        use_full_precision_softmax: bool = False,
+        flatten_batch: bool = False,
         prefix: str = "",
     ):
         super().__init__()
+        self.use_context_forward = use_context_forward
         world_size = parallel_state.get_tensor_model_parallel_world_size()
-
+        self.dropout = dropout
+        self.head_size = embed_dim // num_heads
         self.hidden_size_per_attention_head = dist_utils.divide(
             projection_size, num_heads
         )
         self.num_attention_heads_per_partition = dist_utils.divide(
             num_heads, world_size
         )
-        # self.tp_size = get_tensor_model_parallel_world_size()
-        # num_heads = self.num_heads_per_partition
+
+        if self.use_context_forward:
+            self.qkv_backend = VisionTritonAttention()
+        else:
+            self.qkv_backend = VisionSdpaAttention(
+                head_size=self.head_size,
+                dropout=dropout,
+                flatten_batch=flatten_batch,
+                use_full_precision_softmax=use_full_precision_softmax,
+            )
+
         self.use_qkv_parallel = use_qkv_parallel
         if use_qkv_parallel:
-            self.head_dim = embed_dim // num_heads
             self.qkv_proj = QKVParallelLinear(
                 hidden_size=embed_dim,
-                head_size=self.head_dim,
+                head_size=self.head_size,
                 total_num_heads=num_heads,
                 quant_config=quant_config,
                 prefix=f"{prefix}.qkv_proj",
@@ -114,12 +142,15 @@ class VisionAttention(nn.Module):
         x: torch.Tensor,
         cu_seqlens: Optional[torch.Tensor] = None,
         rotary_pos_emb: torch.Tensor = None,
+        attention_mask: Optional[torch.Tensor] = None,
     ) -> torch.Tensor:
+        r"""
+        Args:
+            x: [b, s, embed_dim]
+            cu_seqlens: [b]
+        Returns:
+            [s, b, num_heads * head]
         """
-        Input shape: [b, s, embed_dim]
-        Output shape: [s, b, num_heads * head_size]
-        """
-
         bsz, s, _ = x.shape
         if self.use_qkv_parallel:
             # [b, s, embed_dim] --> [b, s, embed_dim]
@@ -136,19 +167,19 @@ class VisionAttention(nn.Module):
         else:
             # [b, s, embed_dim] --> [s, b, embed_dim]
             x = rearrange(x, "b s ... -> s b ...")
-            # [s, b, embed_dim] --> [s, b, head * 3 * head_dim]
+            # [s, b, embed_dim] --> [s, b, head * 3 * head_size]
             qkv, _ = self.qkv_proj(x)
-            # [s, b, head * 3 * head_dim] --> [s, b, head, 3 * head_dim]
+            # [s, b, head * 3 * head_size] --> [s, b, head, 3 * head_size]
             new_x_shape = qkv.size()[:-1] + (
                 self.num_attention_heads_per_partition,
                 3 * self.hidden_size_per_attention_head,
            )
             qkv = qkv.view(*new_x_shape)
 
-            # [s, b, head, 3 * head_dim] --> 3 [s, b, head, head_dim]
+            # [s, b, head, 3 * head_size] --> 3 [s, b, head, head_size]
             q, k, v = dist_utils.split_tensor_along_last_dim(qkv, 3)
 
-            # [s, b, head, head_dim] --> [b, s, head, head_dim]
+            # [s, b, head, head_size] --> [b, s, head, head_size]
             q, k, v = [
                 rearrange(x, "s b ... -> b s ...").contiguous() for x in (q, k, v)
             ]
@@ -160,45 +191,217 @@
         if self.use_qkv_parallel:
             pass
         else:
-            # [b, s, head, head_dim] --> [b * s, head, head_dim]
+            # [b, s, head, head_size] --> [b * s, head, head_size]
             q, k, v = [rearrange(x, "b s ... -> (b s) ...") for x in [q, k, v]]
 
-        # [b * s, num_heads, head_size]
-        output = torch.empty_like(q)
-
-        seq_lens = (cu_seqlens[1:] - cu_seqlens[:-1]).cuda()
-        max_seqlen = seq_lens.max().item()
-
-        context_attention_fwd(
-            q,
-            k,
-            v,
-            output,
-            cu_seqlens.cuda(),
-            seq_lens,
-            max_seqlen,
-            is_causal=False,
-        )
+        output = self.qkv_backend.forward(q, k, v, bsz, cu_seqlens, attention_mask)
 
         if self.use_qkv_parallel:
-
-            # [b * s, head, head_dim] --> [b, s, head * head_dim]
+            # [b * s, h, head_size] --> [b, s, h * head_size]
             output = rearrange(output, "(b s) ... h d -> b s ... (h d)", b=bsz)
 
-            # [b, s, head, head_dim] --> [b, s, head, head_dim]
+            # [b, s, h * head_size] --> [b, s, h * head_size]
             output, _ = self.proj(output)
         else:
-            # [b * s, head, head_dim] --> [b, s, head, head_dim]
-            context_layer = rearrange(output, "(b s) ... -> b s ...", b=bsz)
-
-            # [s, b, num_heads * head_size]
+            # [b * s, h, head_size] --> [s, b, h * head_size]
             context_layer = rearrange(
-                context_layer, "b s h d -> s b (h d)"
+                output, "(b s) h d -> s b (h d)", b=bsz, s=s
             ).contiguous()
 
-            # [s, b, num_heads * head_size] --> [s, b, num_heads * head_size]
+            # [s, b, h * head_size] --> [s, b, h * head_size]
             output, _ = self.proj(context_layer)
 
+        # [s, b, h * head_size] --> [b, s, h * head_size]
         output = output.view(bsz, s, -1)
 
         return output
+
+
+class VisionSdpaAttention(nn.Module):
+    r"""
+    Scaled Dot Product Attention inner product
+
+    """
+
+    # TODO: Should it be released after used?
+    _mask_cache = {}
+
+    def __init__(
+        self,
+        head_size: int,
+        dropout: float = 0.0,
+        flatten_batch: bool = False,
+        use_full_precision_softmax: bool = False,
+    ):
+        super().__init__()
+        self.head_size = head_size
+        self.flatten_batch = flatten_batch
+        self.use_full_precision_softmax = use_full_precision_softmax
+        self.dropout = dropout
+
+    def generate_patch_attention_mask(
+        self,
+        s: int,
+        bsz: int,
+        device,
+        cu_seqlens: Optional[torch.Tensor],
+        flatten_batch: bool = False,
+        dtype=torch.bfloat16,
+    ) -> torch.Tensor:
+        r"""
+        Creates a non-causal 4D mask of shape `(b, 1, s, s)` or `(1, 1, s, s)`.
+
+        When `flatten_batch` is True:
+            - All sequences in the batch are flattened into a single dimension
+            - `s` represents the total number of tokens across all sequences in the batch
+            - Returns a unified mask of shape `(1, 1, s, s)`
+
+        When `flatten_batch` is False:
+            - Each sequence has its own attention mask
+            - `s` represents the maximum sequence length in the batch
+            - Returns separate masks of shape `(b, 1, s, s)`
+
+        Args:
+            flatten_batch: (bool):
+                If True, treats all sequences in the batch as a single flattened sequence
+                If False, generates separate masks for each sequence
+
+        Returns:
+            Tensor of shape `(b, 1, s, s)` or `(1, 1, s, s)`.
+        """
+
+        cache_key = (s, bsz, flatten_batch, tuple(cu_seqlens.cpu().tolist()))
+
+        if cache_key in VisionSdpaAttention._mask_cache:
+            cached_mask = VisionSdpaAttention._mask_cache[cache_key]
+            # print(f"cache hit for key: {cache_key}")
+            return cached_mask.to(device=device, dtype=dtype)
+
+        if cu_seqlens is None:
+            raise ValueError("Internal Error: cu_seqlens cannot be None")
+
+        if flatten_batch:
+            mask = torch.zeros([1, s, s], device=device, dtype=torch.bool)
+            for i in range(1, len(cu_seqlens)):
+                start = cu_seqlens[i - 1]
+                end = cu_seqlens[i]
+                mask[
+                    ...,
+                    start:end,
+                    start:end,
+                ] = True
+        else:
+            # [1, 1, 1, s]
+            row_indices = torch.arange(s, device=device).view(1, 1, 1, s)
+            # [1, 1, s, 1]
+            col_indices = torch.arange(s, device=device).view(1, 1, s, 1)
+            # [b, 1, 1, 1]
+            seq_lens = (
+                (cu_seqlens[1:] - cu_seqlens[:-1]).to(device=device).view(-1, 1, 1, 1)
+            )
+
+            mask = (row_indices < seq_lens) & (col_indices < seq_lens)
+
+        # Convert to attention mask format (False -> 0, True -> -inf)
+        mask = (~mask).to(dtype) * torch.finfo(dtype).min
+
+        VisionSdpaAttention._mask_cache[cache_key] = mask
+
+        return mask
+
+    def forward(
+        self,
+        q: torch.Tensor,
+        k: torch.Tensor,
+        v: torch.Tensor,
+        bsz: int,
+        cu_seqlens: Optional[torch.Tensor] = None,
+        attention_mask: Optional[torch.Tensor] = None,
+    ) -> torch.Tensor:
+        r"""
+        Args:
+            cu_seqlens: [b]
+        Returns:
+            [b * s, h, head_size]
+        """
+
+        s = q.shape[0] // bsz
+
+        # [b, 1, s, s]
+        if attention_mask is None:
+            attention_mask = self.generate_patch_attention_mask(
+                s, bsz, q.device, cu_seqlens, self.flatten_batch, q.dtype
+            )
+        q, k, v = [rearrange(x, "(b s) h d -> b h s d", b=bsz) for x in [q, k, v]]
+        # [b, 1, s]
+        if self.use_full_precision_softmax:
+            scale = self.head_size**-0.5
+            k_transposed = rearrange(k, "b h s d -> b h d s")
+            attn_weights = torch.matmul(q, k_transposed) * scale
+            del k, k_transposed
+            attn_weights = attn_weights + attention_mask
+            del attention_mask
+            # full-precision
+            attn_weights = nn.functional.softmax(
+                attn_weights, dim=-1, dtype=torch.float32
+            ).to(q.dtype)
+            attn_weights = nn.functional.dropout(
+                attn_weights, p=self.dropout, training=False
+            )
+            output = torch.matmul(attn_weights, v)
+            del attn_weights, v
+        else:
+            # SDPA
+            # [b, h, s, head_size]
+            output = F.scaled_dot_product_attention(
+                q, k, v, attention_mask, dropout_p=self.dropout
+            )
+
+        # [b, h, s, head_size] --> [b * s, h, head_size]
+        output = rearrange(output, "b h s d -> (b s) h d")
+
+        return output
+
+
+class VisionTritonAttention(nn.Module):
+    """
+    Triton-implemented attention without a causal mask
+    """
+
+    def __init__(
+        self,
+    ):
+        super().__init__()
+
+    def forward(
+        self,
+        q: torch.Tensor,
+        k: torch.Tensor,
+        v: torch.Tensor,
+        _bsz: int,
+        cu_seqlens: Optional[torch.Tensor],
+        **kwargs,
+    ) -> torch.Tensor:
+        r"""
+        Args:
+            cu_seqlens: [b]
+        Returns:
+            [b * s, h, head_size]
+        """
+
+        # [b * s, head, head_size]
+        output = torch.empty_like(q)
+        seq_lens = cu_seqlens[1:] - cu_seqlens[:-1]
+        max_seqlen = seq_lens.max().item()
+        context_attention_fwd(
+            q,
+            k,
+            v,
+            output,
+            cu_seqlens.cuda(),
+            seq_lens.cuda(),
+            max_seqlen,
+            is_causal=False,
+        )
+
+        return output
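
For orientation, the following standalone sketch (not part of the diff) exercises the new VisionSdpaAttention backend added above. It assumes the module path sglang/srt/layers/attention/vision.py from the file list and the constructor/forward signatures shown in the hunks; cu_seqlens holds cumulative sequence lengths with a leading zero, so the per-sequence lengths are its consecutive differences.

    # Hypothetical usage sketch for the new SDPA vision-attention backend
    # (assumes sglang 0.4.2.post1 and the signatures shown in the hunks above).
    import torch
    from sglang.srt.layers.attention.vision import VisionSdpaAttention

    head_size, num_heads, bsz = 64, 4, 2
    cu_seqlens = torch.tensor([0, 5, 12], dtype=torch.int32)  # two sequences: lengths 5 and 7
    s = 7                                                     # padded (max) length per sequence

    # q/k/v use the [b * s, h, head_size] layout that VisionAttention.forward passes in
    q = torch.randn(bsz * s, num_heads, head_size)
    k = torch.randn_like(q)
    v = torch.randn_like(q)

    attn = VisionSdpaAttention(head_size=head_size, use_full_precision_softmax=True)
    out = attn.forward(q, k, v, bsz, cu_seqlens=cu_seqlens)
    print(out.shape)  # torch.Size([14, 4, 64]), i.e. [b * s, h, head_size]

As the code above shows, the generated patch mask is cached per (s, bsz, flatten_batch, cu_seqlens) key, so repeated calls with the same image grid reuse the same mask tensor.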
@@ -22,6 +22,8 @@ def compute_dp_attention_world_info(enable_dp_attention, tp_rank, tp_size, dp_si
 def initialize_dp_attention(enable_dp_attention, tp_rank, tp_size, dp_size):
     global _ATTN_TP_GROUP, _ATTN_TP_RANK, _ATTN_TP_SIZE, _DP_RANK, _DP_SIZE
 
+    from sglang.srt.layers.sampler import SYNC_TOKEN_IDS_ACROSS_TP
+
     _ATTN_TP_RANK, _ATTN_TP_SIZE, _DP_RANK = compute_dp_attention_world_info(
         enable_dp_attention, tp_rank, tp_size, dp_size
     )
@@ -35,7 +37,7 @@ def initialize_dp_attention(enable_dp_attention, tp_rank, tp_size, dp_size):
         ],
         tp_rank,
         torch.distributed.get_backend(tp_group.device_group),
-        False,
+        SYNC_TOKEN_IDS_ACROSS_TP,
         False,
         False,
         False,
@@ -19,10 +19,10 @@ from typing import Optional, Tuple, Union
 import torch
 import torch.nn as nn
 
-from sglang.srt.utils import is_flashinfer_available
+from sglang.srt.utils import is_cuda_available
 
-if is_flashinfer_available():
-    from flashinfer.norm import (
+if is_cuda_available():
+    from sgl_kernel import (
         fused_add_rmsnorm,
         gemma_fused_add_rmsnorm,
         gemma_rmsnorm,
@@ -121,8 +121,8 @@ class GemmaRMSNorm(CustomOp):
         return out
 
 
-if not is_flashinfer_available():
+if not is_cuda_available():
     logger.info(
-        "FlashInfer is not available on Non-NV platforms. Fallback to other kernel libraries."
+        "sgl-kernel is not available on Non-NV platforms. Fallback to other kernel libraries."
     )
     from vllm.model_executor.layers.layernorm import GemmaRMSNorm, RMSNorm
@@ -329,12 +329,14 @@ class ColumnParallelLinear(LinearBase):
         prefix: str = "",
         tp_rank: Optional[int] = None,
         tp_size: Optional[int] = None,
+        use_presharded_weights: bool = False,
     ):
         super().__init__(
             input_size, output_size, skip_bias_add, params_dtype, quant_config, prefix
         )
 
         self.gather_output = gather_output
+        self.use_presharded_weights = use_presharded_weights
 
         # Divide the weight matrix along the last dimension.
         if tp_rank is None:
@@ -402,7 +404,8 @@
         if output_dim is not None and not use_bitsandbytes_4bit:
             shard_size = param_data.shape[output_dim]
             start_idx = self.tp_rank * shard_size
-            loaded_weight = loaded_weight.narrow(output_dim, start_idx, shard_size)
+            if not self.use_presharded_weights:
+                loaded_weight = loaded_weight.narrow(output_dim, start_idx, shard_size)
 
         # Special case for loading scales off disk, which often do not
         # have a shape (such as in the case of AutoFP8).
@@ -418,7 +421,11 @@
         if len(loaded_weight.shape) == 0:
             assert loaded_weight.numel() == 1
             loaded_weight = loaded_weight.reshape(1)
-        param.load_column_parallel_weight(loaded_weight, tp_rank=self.tp_rank)
+        param.load_column_parallel_weight(
+            loaded_weight,
+            tp_rank=self.tp_rank,
+            use_presharded_weights=self.use_presharded_weights,
+        )
 
     def forward(self, input_):
         bias = self.bias if not self.skip_bias_add else None
@@ -499,7 +506,9 @@ class MergedColumnParallelLinear(ColumnParallelLinear):
             prefix=prefix,
             tp_rank=tp_rank,
             tp_size=tp_size,
+            use_presharded_weights=use_presharded_weights,
         )
+        self.prefix = prefix
 
     def weight_loader(
         self,
@@ -743,6 +752,7 @@ class QKVParallelLinear(ColumnParallelLinear):
         prefix: str = "",
         tp_rank: Optional[int] = None,
         tp_size: Optional[int] = None,
+        load_presharded_attn: bool = False,
     ):
         self.hidden_size = hidden_size
         self.head_size = head_size
@@ -772,6 +782,7 @@ class QKVParallelLinear(ColumnParallelLinear):
             self.num_kv_heads * self.head_size * tp_size,  # k_proj
             self.num_kv_heads * self.head_size * tp_size,  # v_proj
         ]
+        self.use_presharded_weights = load_presharded_attn
 
         super().__init__(
             input_size=input_size,
@@ -784,6 +795,7 @@ class QKVParallelLinear(ColumnParallelLinear):
             prefix=prefix,
             tp_rank=tp_rank,
             tp_size=tp_size,
+            use_presharded_weights=self.use_presharded_weights,
         )
 
     def _get_shard_offset_mapping(self, loaded_shard_id: str):
@@ -842,9 +854,10 @@
                    shard_size=shard_size, shard_offset=shard_offset
                )
 
-            loaded_weight_shard = loaded_weight.narrow(
-                param.output_dim, shard_offset, shard_size
-            )
+            if not self.use_presharded_weights:
+                loaded_weight_shard = loaded_weight.narrow(
+                    param.output_dim, shard_offset, shard_size
+                )
             self.weight_loader_v2(param, loaded_weight_shard, shard_id)
 
     def weight_loader_v2(
@@ -882,6 +895,7 @@ class QKVParallelLinear(ColumnParallelLinear):
             shard_offset=shard_offset,
             shard_size=shard_size,
             tp_rank=self.tp_rank,
+            use_presharded_weights=self.use_presharded_weights,
         )
 
     def weight_loader(
@@ -987,9 +1001,10 @@
                        param, orig_qkv_offsets, shard_id
                    )
 
-                loaded_weight_shard = loaded_weight.narrow(
-                    output_dim, shard_offset, shard_size
-                )
+                if not self.use_presharded_weights:
+                    loaded_weight_shard = loaded_weight.narrow(
+                        output_dim, shard_offset, shard_size
+                    )
                 self.weight_loader(param, loaded_weight_shard, shard_id)
             return
 
@@ -1049,7 +1064,7 @@
 
             # bitsandbytes loads the weights of the specific portion
             # no need to narrow here
-            if not use_bitsandbytes_4bit:
+            if not use_bitsandbytes_4bit and not self.use_presharded_weights:
                 loaded_weight = loaded_weight.narrow(output_dim, start_idx, shard_size)
 
         # Special case for for AQLM codebooks.
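
The use_presharded_weights / load_presharded_attn flags threaded through the hunks above let the loaders accept checkpoints that already store one tensor-parallel shard per rank. The toy sketch below (generic names, not sglang APIs) illustrates the pattern: the narrow() slice is applied only when the checkpoint holds the full, unsharded weight.

    # Illustrative only: what skipping narrow() for presharded weights means.
    import torch

    def load_column_shard(param: torch.Tensor, loaded: torch.Tensor,
                          tp_rank: int, use_presharded_weights: bool) -> None:
        output_dim = 0
        shard_size = param.shape[output_dim]
        if not use_presharded_weights:
            # Checkpoint stores the full matrix: slice out this rank's rows.
            loaded = loaded.narrow(output_dim, tp_rank * shard_size, shard_size)
        # A presharded checkpoint already stores exactly this rank's slice.
        assert loaded.shape[output_dim] == shard_size
        param.copy_(loaded)

    full_weight = torch.arange(8.0).reshape(4, 2)   # full [out=4, in=2] weight
    param = torch.empty(2, 2)                       # rank-local parameter, tp_size=2
    load_column_shard(param, full_weight, tp_rank=1, use_presharded_weights=False)
    print(param)                                    # rows 2..3 of the full weight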
@@ -296,7 +296,7 @@ def fused_softcap_kernel(
     n_elements,
     BLOCK_SIZE: tl.constexpr,
 ):
-    pid = tl.program_id(0)
+    pid = tl.program_id(0).to(tl.int64)
     block_start = pid * BLOCK_SIZE
     offsets = block_start + tl.arange(0, BLOCK_SIZE)
     mask = offsets < n_elements
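
The single-line change above casts the Triton program id to 64-bit, presumably so that block offsets stay correct once the flattened tensor exceeds the 32-bit range; a quick back-of-the-envelope check:

    # With BLOCK_SIZE = 512, any tensor with more than 2**31 elements (~2.1e9)
    # yields block_start values that no longer fit in a signed 32-bit integer.
    BLOCK_SIZE = 512
    n_elements = 3 * 10**9
    last_pid = (n_elements + BLOCK_SIZE - 1) // BLOCK_SIZE - 1
    print(last_pid * BLOCK_SIZE > 2**31 - 1)  # True -> offsets need 64-bit indexing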
@@ -114,6 +114,8 @@ class EPMoE(torch.nn.Module):
         tp_size: Optional[int] = None,
         prefix: str = "",
         correction_bias: Optional[torch.Tensor] = None,
+        custom_routing_function: Optional[Callable] = None,
+        activation: str = "silu",
     ):
         super().__init__()
 
@@ -140,6 +142,8 @@ class EPMoE(torch.nn.Module):
         self.num_expert_group = num_expert_group
         self.topk_group = topk_group
         self.correction_bias = correction_bias
+        self.custom_routing_function = custom_routing_function
+        self.activation = activation
 
         if quant_config is None:
             self.quant_method: Optional[QuantizeMethodBase] = UnquantizedEPMoEMethod()
@@ -166,6 +170,7 @@
 
     def forward(self, hidden_states: torch.Tensor, router_logits: torch.Tensor):
         assert self.quant_method is not None
+        assert self.activation == "silu"
 
         if self.grouped_gemm_runner is None:
             self.grouped_gemm_runner = GroupedGemmRunner(
@@ -181,6 +186,7 @@
             topk_group=self.topk_group,
             num_expert_group=self.num_expert_group,
             correction_bias=self.correction_bias,
+            custom_routing_function=self.custom_routing_function,
         )
 
         reorder_topk_ids, src2dst, seg_indptr = run_moe_ep_preproess(
@@ -254,16 +260,20 @@
             dtype=torch.float32,
             device=hidden_states.device,
         )
-        silu_and_mul_triton_kernel[(gateup_output.shape[0],)](
-            gateup_output,
-            down_input,
-            gateup_output.shape[1],
-            reorder_topk_ids,
-            self.w2_input_scale,
-            self.start_expert_id,
-            self.end_expert_id,
-            BLOCK_SIZE=512,
-        )
+
+        if self.activation == "silu":
+            silu_and_mul_triton_kernel[(gateup_output.shape[0],)](
+                gateup_output,
+                down_input,
+                gateup_output.shape[1],
+                reorder_topk_ids,
+                self.w2_input_scale,
+                self.start_expert_id,
+                self.end_expert_id,
+                BLOCK_SIZE=512,
+            )
+        else:
+            raise ValueError(f"Unsupported activation: {self.activation=}")
 
         # GroupGemm-1
         down_output = torch.empty(
@@ -309,7 +319,6 @@
         ckpt_up_proj_name: str,
         num_experts: int,
     ) -> List[Tuple[str, str, int, str]]:
-
         return [
             # (param_name, weight_name, expert_id, shard_id)
             (
@@ -354,7 +363,6 @@
             )
             return
 
-        expert_data = param.data[expert_id]
         if shard_id == "w2":
             param.data[expert_id] = loaded_weight
         elif shard_id == "w1":
@@ -8,7 +8,7 @@ from typing import Callable, Optional
 import torch
 from torch.nn import functional as F
 
-from sglang.srt.layers.activation import SiluAndMul
+from sglang.srt.layers.activation import GeluAndMul, SiluAndMul
 from sglang.srt.layers.moe.topk import select_experts
 
 
@@ -23,6 +23,7 @@ def fused_moe_forward_native(
     num_expert_group: Optional[int] = None,
     custom_routing_function: Optional[Callable] = None,
     correction_bias: Optional[torch.Tensor] = None,
+    activation: str = "silu",
 ) -> torch.Tensor:
     topk_weights, topk_ids = select_experts(
         hidden_states=x,
@@ -41,7 +42,12 @@
     w1_weights, w3_weights = torch.chunk(w13_weights, 2, dim=2)
     w2_weights = layer.w2_weight[topk_ids]
     x1 = torch.einsum("ti,taoi -> tao", x, w1_weights)
-    x1 = F.silu(x1)
+    if activation == "silu":
+        x1 = F.silu(x1)
+    elif activation == "gelu":
+        x1 = F.gelu(x1)
+    else:
+        raise ValueError(f"Unsupported activation: {activation=}")
     x3 = torch.einsum("ti, taoi -> tao", x, w3_weights)
     expert_outs = torch.einsum("tao, taio -> tai", (x1 * x3), w2_weights)
     return torch.einsum("tai,ta -> ti", expert_outs, topk_weights.to(expert_outs.dtype))
@@ -58,6 +64,7 @@ def moe_forward_native(
     num_expert_group: Optional[int] = None,
     custom_routing_function: Optional[Callable] = None,
     correction_bias: Optional[torch.Tensor] = None,
+    activation: str = "silu",
 ) -> torch.Tensor:
 
     topk_weights, topk_ids = select_experts(
@@ -84,6 +91,13 @@
     sorted_tokens = x[idxs // topk_ids.shape[1]]
     tokens_per_expert = tokens_per_expert.cpu().numpy()
 
+    if activation == "silu":
+        act = SiluAndMul()
+    elif activation == "gelu":
+        act = GeluAndMul()
+    else:
+        raise ValueError(f"Unsupported activation: {activation=}")
+
     outputs = []
     start_idx = 0
     for i, num_tokens in enumerate(tokens_per_expert):
@@ -96,7 +110,7 @@
         layer_w2_weight = layer.w2_weight[i]
 
         gate_up = F.linear(tokens_for_this_expert, layer_w13_weight)
-        gate_up = SiluAndMul()(gate_up)
+        gate_up = act(gate_up)
         expert_out = F.linear(gate_up, layer_w2_weight)
         outputs.append(expert_out)
         start_idx = end_idx
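
The changes above let the native MoE path select between SiluAndMul and GeluAndMul instead of hard-coding SiLU. A minimal sketch of that act-and-mul pattern in plain PyTorch (not the sglang classes, which may differ in GELU approximation):

    # Gate/up halves of the fused w13 projection; activation on the gate half only.
    import torch
    import torch.nn.functional as F

    def act_and_mul(gate_up: torch.Tensor, activation: str = "silu") -> torch.Tensor:
        gate, up = gate_up.chunk(2, dim=-1)
        if activation == "silu":
            return F.silu(gate) * up
        elif activation == "gelu":
            return F.gelu(gate) * up
        raise ValueError(f"Unsupported activation: {activation=}")

    x = torch.randn(4, 16)                 # 4 tokens, 2 * intermediate_size = 16
    print(act_and_mul(x, "gelu").shape)    # torch.Size([4, 8])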