PyPI - sglang - Versions diffs - 0.4.3.post4__py3-none-any.whl → 0.4.4.post1__py3-none-any.whl - Mend

sglang 0.4.3.post4__py3-none-any.whl → 0.4.4.post1__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Files changed (131) hide show

sglang/srt/layers/attention/vision.py CHANGED Viewed

@@ -1,12 +1,12 @@
 from __future__ import annotations
 from functools import lru_cache
-from typing import Optional
+from typing import Optional, Tuple
 import torch
 import torch.nn as nn
 import torch.nn.functional as F
-from einops import rearrange, repeat
+from einops import rearrange
 from sglang.srt.distributed import parallel_state
 from sglang.srt.distributed import utils as dist_utils
@@ -22,47 +22,29 @@ from sglang.srt.layers.quantization import QuantizationConfig
 from sglang.srt.utils import add_prefix
-def rotate_half(x: torch.Tensor, interleaved: bool = False) -> torch.Tensor:
-    if not interleaved:
-        x1, x2 = x.chunk(2, dim=-1)
-        return torch.cat((-x2, x1), dim=-1)
-    else:
-        x1, x2 = x[..., ::2], x[..., 1::2]
-        return rearrange(
-            torch.stack((-x2, x1), dim=-1), "... d two -> ... (d two)", two=2
-        )
+# Copied from transformers, modeling_qwen2_vl.py
+def rotate_half(x):
+    """Rotates half the hidden dims of the input.
+    The last dimension is split at ``x.shape[-1] // 2`` into ``x1`` (front)
+    and ``x2`` (back); the result is ``cat((-x2, x1))`` along that dimension,
+    so it has the same shape as ``x``.
+    """
+    x1 = x[..., : x.shape[-1] // 2]
+    x2 = x[..., x.shape[-1] // 2 :]
+    return torch.cat((-x2, x1), dim=-1)
-def apply_rotary_emb_torch(
-    x: torch.Tensor, cos: torch.Tensor, sin: torch.Tensor, interleaved: bool = False
-) -> torch.Tensor:
-    """
-    x: (batch_size, seqlen, nheads, headdim)
-    cos, sin: (seqlen, rotary_dim / 2) or (batch_size, seqlen, rotary_dim / 2)
-    """
-    ro_dim = cos.shape[-1] * 2
-    assert ro_dim <= x.shape[-1]
-    cos = repeat(
-        cos, "... d -> ... 1 (2 d)" if not interleaved else "... d -> ... 1 (d 2)"
-    )
-    sin = repeat(
-        sin, "... d -> ... 1 (2 d)" if not interleaved else "... d -> ... 1 (d 2)"
-    )
-    return torch.cat(
-        [
-            x[..., :ro_dim] * cos + rotate_half(x[..., :ro_dim], interleaved) * sin,
-            x[..., ro_dim:],
-        ],
-        dim=-1,
-    )
-def apply_rotary_pos_emb_vision(t: torch.Tensor, freqs: torch.Tensor) -> torch.Tensor:
-    t_ = t.float()
-    cos = freqs.cos()
-    sin = freqs.sin()
-    output = apply_rotary_emb_torch(t_, cos, sin).type_as(t)
-    return output
+def apply_rotary_pos_emb_vision(
+    q: torch.Tensor, k: torch.Tensor, cos: torch.Tensor, sin: torch.Tensor
+) -> Tuple[torch.Tensor, torch.Tensor]:
+    """Apply rotary position embedding to ``q`` and ``k`` using ``cos``/``sin``.
+    The arithmetic is done in float32 and each result is cast back to the
+    dtype its input had on entry.
+    Returns:
+        ``(q_embed, k_embed)`` where each is ``t * cos + rotate_half(t) * sin``.
+    """
+    # Remember the incoming dtypes so the outputs can be cast back at the end.
+    orig_q_dtype = q.dtype
+    orig_k_dtype = k.dtype
+    q, k = q.float(), k.float()
+    # unsqueeze(-2) inserts a size-1 axis before the last dim so cos/sin
+    # broadcast over q/k's second-to-last axis
+    # (presumably the head axis; the caller in this diff passes (s, head, -1) - TODO confirm).
+    cos, sin = cos.unsqueeze(-2).float(), sin.unsqueeze(-2).float()
+    q_embed = (q * cos) + (rotate_half(q) * sin)
+    k_embed = (k * cos) + (rotate_half(k) * sin)
+    q_embed = q_embed.to(orig_q_dtype)
+    k_embed = k_embed.to(orig_k_dtype)
+    return q_embed, k_embed
 class VisionAttention(nn.Module):
@@ -75,8 +57,8 @@ class VisionAttention(nn.Module):
         use_context_forward (bool, default to True):
             if ``True``, a flash_attn style attention will be applied
             Otherwise, a full-sequence attention will be applied.
-        use_full_precision_softmax (bool, default to False):
-            if ``True``, the softmax will be performed in full-precision
+        softmax_in_single_precision (bool, default to False):
+            if ``True``, the softmax will be performed in single-precision
             Otherwise, it will be performed in half-precision
     """
@@ -90,7 +72,7 @@ class VisionAttention(nn.Module):
         quant_config: Optional[QuantizationConfig] = None,
         dropout: float = 0.0,
         use_context_forward: bool = True,
-        use_full_precision_softmax: bool = False,
+        softmax_in_single_precision: bool = False,
         flatten_batch: bool = False,
         prefix: str = "",
     ):
@@ -113,7 +95,7 @@ class VisionAttention(nn.Module):
                 head_size=self.head_size,
                 dropout=dropout,
                 flatten_batch=flatten_batch,
-                use_full_precision_softmax=use_full_precision_softmax,
+                softmax_in_single_precision=softmax_in_single_precision,
             )
         self.use_qkv_parallel = use_qkv_parallel
@@ -143,7 +125,7 @@ class VisionAttention(nn.Module):
         self,
         x: torch.Tensor,
         cu_seqlens: Optional[torch.Tensor] = None,
-        rotary_pos_emb: torch.Tensor = None,
+        position_embeddings: Optional[Tuple[torch.Tensor, torch.Tensor]] = None,
         attention_mask: Optional[torch.Tensor] = None,
     ) -> torch.Tensor:
         r"""
@@ -151,21 +133,17 @@ class VisionAttention(nn.Module):
             x: [b, s, embed_dim]
             cu_seqlens: [b]
         Returns:
-             [s, b, num_heads * head]
+             [s, b, head * head_size]
         """
         bsz, s, _ = x.shape
+        head = self.num_attention_heads_per_partition
         if self.use_qkv_parallel:
             # [b, s, embed_dim] --> [b, s, embed_dim]
             qkv, _ = self.qkv_proj(x)
             q, k, v = qkv.chunk(3, dim=-1)
-            # [b, s, embed_dim] --> [b * s, num_heads, head_size]
-            q, k, v = [
-                x.reshape(
-                    bsz * s, self.num_attention_heads_per_partition, -1
-                ).contiguous()
-                for x in (q, k, v)
-            ]
+            # [b, s, embed_dim] --> [b * s, head, head_size]
+            q, k, v = [x.reshape(bsz * s, head, -1).contiguous() for x in (q, k, v)]
         else:
             # [b, s, embed_dim] --> [s, b, embed_dim]
             x = rearrange(x, "b s ... -> s b ...")
@@ -173,7 +151,7 @@ class VisionAttention(nn.Module):
             qkv, _ = self.qkv_proj(x)
             # [s, b, head * 3 * head_size] --> [s, b, head, 3 * head_size]
             new_x_shape = qkv.size()[:-1] + (
-                self.num_attention_heads_per_partition,
+                head,
                 3 * self.hidden_size_per_attention_head,
             )
             qkv = qkv.view(*new_x_shape)
@@ -186,9 +164,12 @@ class VisionAttention(nn.Module):
                 rearrange(x, "s b ... -> b s ...").contiguous() for x in (q, k, v)
             ]
-        if rotary_pos_emb is not None:
-            q = apply_rotary_pos_emb_vision(q, rotary_pos_emb)
-            k = apply_rotary_pos_emb_vision(k, rotary_pos_emb)
+        if position_embeddings is not None:
+            cos, sin = position_embeddings
+            original_shape = q.shape
+            q, k = q.view(s, head, -1), k.view(s, head, -1)
+            q, k = apply_rotary_pos_emb_vision(q, k, cos, sin)
+            q, k = q.reshape(original_shape), k.reshape(original_shape)
         if self.use_qkv_parallel:
             pass
@@ -230,12 +211,12 @@ class VisionSdpaAttention(nn.Module):
         head_size: int,
         dropout: float = 0.0,
         flatten_batch: bool = False,
-        use_full_precision_softmax: bool = False,
+        softmax_in_single_precision: bool = False,
     ):
         super().__init__()
         self.head_size = head_size
         self.flatten_batch = flatten_batch
-        self.use_full_precision_softmax = use_full_precision_softmax
+        self.softmax_in_single_precision = softmax_in_single_precision
         self.dropout = dropout
     @staticmethod
@@ -319,14 +300,14 @@ class VisionSdpaAttention(nn.Module):
             )
         if attention_mask is None:
-            if self.use_full_precision_softmax:
+            if self.softmax_in_single_precision:
                 raise RuntimeError("Empty attention mask")
         else:
             attention_mask = attention_mask.to(device=q.device)
         q, k, v = [rearrange(x, "(b s) h d -> b h s d", b=bsz) for x in [q, k, v]]
-        if self.use_full_precision_softmax:
+        if self.softmax_in_single_precision:
             scale = self.head_size**-0.5
             k_transposed = rearrange(k, "b h s d -> b h d s")
             attn_weights = torch.matmul(q, k_transposed) * scale

sglang/srt/layers/dp_attention.py CHANGED Viewed

@@ -1,6 +1,8 @@
 from __future__ import annotations
 import functools
+import logging
+from contextlib import contextmanager
 from typing import TYPE_CHECKING, Union
 import torch
@@ -14,6 +16,8 @@ from sglang.srt.distributed import (
     tensor_model_parallel_all_reduce,
 )
+logger = logging.getLogger(__name__)
 if TYPE_CHECKING:
     from sglang.srt.model_executor.forward_batch_info import ForwardBatch
@@ -86,6 +90,27 @@ def get_attention_dp_size():
     return _DP_SIZE
+@contextmanager
+def disable_dp_size():
+    """Temporarily set the module-level ``_DP_SIZE`` to 1 inside a ``with`` block.
+    The previous value is restored when the block exits, including when it
+    exits with an exception. Takes no arguments and yields nothing.
+    This method is for draft workers of speculative decoding to run draft model
+    with a different parallel configuration from that of target model workers.
+    Raises:
+        AssertionError: if ``_DP_SIZE`` is still ``None`` (dp attention not
+            initialized).
+    """
+    global _DP_SIZE
+    assert _DP_SIZE is not None, "dp attention not initialized!"
+    old_dp_size = _DP_SIZE
+    _DP_SIZE = 1
+    try:
+        yield
+    finally:
+        # Always restore, even if the body of the with-block raised.
+        _DP_SIZE = old_dp_size
 def get_dp_local_info(forward_batch: ForwardBatch):
     dp_rank = get_attention_dp_rank()
@@ -159,7 +184,8 @@ def dp_gather(
         layer_id != "embedding" or get_attention_tp_rank() == 0
     ):
         assert (
-            global_tokens.storage().data_ptr() != local_tokens.storage().data_ptr()
+            global_tokens.untyped_storage().data_ptr()
+            != local_tokens.untyped_storage().data_ptr()
         ), "aliasing between global_tokens and local_tokens not allowed"
         memcpy_triton(
             global_tokens, local_tokens, 0, local_start_pos, local_num_tokens, False
@@ -174,8 +200,9 @@ def dp_gather(
         torch.ops.sglang.inplace_all_reduce(
             global_tokens, group_name=get_tp_group().unique_name
         )
     else:
-        global_tokens = tensor_model_parallel_all_reduce(global_tokens)
+        global_tokens[:] = tensor_model_parallel_all_reduce(global_tokens)
 def dp_scatter(
@@ -186,6 +213,7 @@ def dp_scatter(
     # local_num_tokens is not necessarily the same as local_tokens.shape[0],
     # since local_tokens may be padded for cuda graph
     local_start_pos, local_num_tokens = get_dp_local_info(forward_batch)
     local_tokens.fill_(0)
     assert local_tokens.is_contiguous()
     assert global_tokens.is_contiguous()

sglang/srt/layers/elementwise.py ADDED Viewed

@@ -0,0 +1,411 @@
+from typing import Tuple
+import torch
+import triton
+import triton.language as tl
+# Autotune decorator for the softcap kernel: candidate (BLOCK_SIZE, num_warps)
+# pairs, with the tuning result keyed on the ``n_ele`` kernel argument.
+fused_softcap_autotune = triton.autotune(
+    configs=[
+        triton.Config(kwargs={"BLOCK_SIZE": 128}, num_warps=4),
+        triton.Config(kwargs={"BLOCK_SIZE": 128}, num_warps=8),
+        triton.Config(kwargs={"BLOCK_SIZE": 128}, num_warps=16),
+        triton.Config(kwargs={"BLOCK_SIZE": 256}, num_warps=4),
+        triton.Config(kwargs={"BLOCK_SIZE": 256}, num_warps=8),
+        triton.Config(kwargs={"BLOCK_SIZE": 512}, num_warps=4),
+        triton.Config(kwargs={"BLOCK_SIZE": 512}, num_warps=8),
+        triton.Config(kwargs={"BLOCK_SIZE": 512}, num_warps=16),
+        triton.Config(kwargs={"BLOCK_SIZE": 1024}, num_warps=4),
+        triton.Config(kwargs={"BLOCK_SIZE": 1024}, num_warps=8),
+        triton.Config(kwargs={"BLOCK_SIZE": 1024}, num_warps=16),
+        triton.Config(kwargs={"BLOCK_SIZE": 1024}, num_warps=32),
+        triton.Config(kwargs={"BLOCK_SIZE": 2048}, num_warps=32),
+        triton.Config(kwargs={"BLOCK_SIZE": 4096}, num_warps=32),
+        triton.Config(kwargs={"BLOCK_SIZE": 8192}, num_warps=32),
+        triton.Config(kwargs={"BLOCK_SIZE": 16384}, num_warps=32),
+        triton.Config(kwargs={"BLOCK_SIZE": 32768}, num_warps=32),
+    ],
+    key=["n_ele"],
+)
+# Elementwise softcap: out = tanh(x / softcap_const) * softcap_const, computed
+# in float32. Each program instance handles one BLOCK_SIZE-wide slice of the
+# flat element range [0, n_ele).
+@triton.jit
+def fused_softcap_kernel(
+    output_ptr,
+    input_ptr,
+    n_ele,
+    softcap_const: tl.constexpr,
+    BLOCK_SIZE: tl.constexpr,
+):
+    pid = tl.program_id(axis=0)
+    block_start = pid * BLOCK_SIZE
+    offsets = block_start + tl.arange(0, BLOCK_SIZE)
+    # Mask off the tail of the last block, which may run past n_ele.
+    mask = offsets < n_ele
+    x = tl.load(input_ptr + offsets, mask=mask)
+    fx = x.to(tl.float32)
+    fxs = fx / softcap_const
+    # tanh(y) written out as (exp(2y) - 1) / (exp(2y) + 1).
+    exped = tl.exp(2 * fxs)
+    top = exped - 1
+    bottom = exped + 1
+    output = top / bottom * softcap_const
+    tl.store(output_ptr + offsets, output, mask=mask)
+# Same kernel wrapped with the autotune configs defined above.
+fused_softcap_kernel_autotuned = fused_softcap_autotune(fused_softcap_kernel)
+def fused_softcap(x, softcap_const, autotune=False):
+    """Launch the softcap kernel on ``x`` and return a new float32 tensor.
+    Args:
+        x: input tensor; the output is allocated with ``torch.empty_like`` so
+            it has the same shape, but dtype float32.
+        softcap_const: the cap value passed to the kernel as a constexpr.
+        autotune: if true, use the autotuned kernel and a grid derived from
+            the selected ``BLOCK_SIZE``; otherwise launch with the fixed
+            ``BLOCK_SIZE=128, num_warps=8``.
+    Returns:
+        The float32 output tensor.
+    """
+    # NOTE(review): the kernel addresses elements by flat offset, so this
+    # presumably expects a contiguous ``x`` - confirm against callers.
+    output = torch.empty_like(x, dtype=torch.float32)
+    n_elements = output.numel()
+    if autotune:
+        grid = lambda meta: (triton.cdiv(n_elements, meta["BLOCK_SIZE"]),)
+        fused_softcap_kernel_autotuned[grid](output, x, n_elements, softcap_const)
+    else:
+        fused_softcap_kernel[(triton.cdiv(n_elements, 128),)](
+            output, x, n_elements, softcap_const, BLOCK_SIZE=128, num_warps=8
+        )
+    return output
+# cast to float + softcap
+class Softcap:
+    """Callable that applies ``tanh(x / softcap_const) * softcap_const``.
+    CUDA tensors go through the Triton kernel (``fused_softcap``); all other
+    tensors use the plain PyTorch expression. Both paths produce float output.
+    """
+    def __init__(self, softcap_const: float):
+        self.softcap_const = softcap_const
+    def __call__(self, *args, **kwargs):
+        # Makes an instance callable like a module: delegates to forward().
+        return self.forward(*args, **kwargs)
+    def forward(self, x: torch.Tensor) -> torch.Tensor:
+        # Dispatch on device: Triton path for CUDA, PyTorch path otherwise.
+        if x.is_cuda:
+            return self.forward_cuda(x)
+        else:
+            return self.forward_native(x)
+    def forward_native(self, x: torch.Tensor) -> torch.Tensor:
+        # Pure PyTorch version; x is cast to float32 first.
+        return torch.tanh(x.float() / self.softcap_const) * self.softcap_const
+    def forward_cuda(self, x: torch.Tensor, autotune=False) -> torch.Tensor:
+        # Triton version; ``autotune`` is forwarded to fused_softcap.
+        return fused_softcap(x, self.softcap_const, autotune=autotune)
+# Autotune decorator for the RMSNorm kernels: candidate
+# (BLOCK_SIZE, num_warps, num_stages) combinations, keyed on ``hidden_dim``.
+# Configs without an explicit num_stages use Triton's default.
+rmsnorm_autotune = triton.autotune(
+    configs=[
+        triton.Config(kwargs={"BLOCK_SIZE": 1024}, num_warps=4, num_stages=1),
+        triton.Config(kwargs={"BLOCK_SIZE": 1024}, num_warps=8, num_stages=1),
+        triton.Config(kwargs={"BLOCK_SIZE": 1024}, num_warps=16, num_stages=1),
+        triton.Config(kwargs={"BLOCK_SIZE": 1024}, num_warps=4),
+        triton.Config(kwargs={"BLOCK_SIZE": 1024}, num_warps=8),
+        triton.Config(kwargs={"BLOCK_SIZE": 1024}, num_warps=16),
+        triton.Config(kwargs={"BLOCK_SIZE": 1024}, num_warps=4, num_stages=4),
+        triton.Config(kwargs={"BLOCK_SIZE": 1024}, num_warps=8, num_stages=4),
+        triton.Config(kwargs={"BLOCK_SIZE": 1024}, num_warps=16, num_stages=4),
+        triton.Config(kwargs={"BLOCK_SIZE": 1024}, num_warps=8, num_stages=8),
+        triton.Config(kwargs={"BLOCK_SIZE": 1024}, num_warps=16, num_stages=8),
+        triton.Config(kwargs={"BLOCK_SIZE": 2048}, num_warps=8),
+        triton.Config(kwargs={"BLOCK_SIZE": 2048}, num_warps=16),
+        triton.Config(kwargs={"BLOCK_SIZE": 2048}, num_warps=8, num_stages=4),
+        triton.Config(kwargs={"BLOCK_SIZE": 2048}, num_warps=16, num_stages=4),
+        triton.Config(kwargs={"BLOCK_SIZE": 4096}, num_warps=8),
+        triton.Config(kwargs={"BLOCK_SIZE": 4096}, num_warps=16),
+        triton.Config(kwargs={"BLOCK_SIZE": 8192}, num_warps=8),
+        triton.Config(kwargs={"BLOCK_SIZE": 8192}, num_warps=16),
+        triton.Config(kwargs={"BLOCK_SIZE": 8192}, num_warps=32),
+        triton.Config(kwargs={"BLOCK_SIZE": 8192}, num_warps=8, num_stages=1),
+        triton.Config(kwargs={"BLOCK_SIZE": 8192}, num_warps=16, num_stages=1),
+        triton.Config(kwargs={"BLOCK_SIZE": 8192}, num_warps=32, num_stages=1),
+        triton.Config(kwargs={"BLOCK_SIZE": 8192}, num_warps=8, num_stages=4),
+        triton.Config(kwargs={"BLOCK_SIZE": 8192}, num_warps=16, num_stages=4),
+        triton.Config(kwargs={"BLOCK_SIZE": 8192}, num_warps=32, num_stages=4),
+        triton.Config(kwargs={"BLOCK_SIZE": 16384}, num_warps=8),
+        triton.Config(kwargs={"BLOCK_SIZE": 16384}, num_warps=16),
+        triton.Config(kwargs={"BLOCK_SIZE": 16384}, num_warps=32),
+        triton.Config(kwargs={"BLOCK_SIZE": 16384}, num_warps=8, num_stages=1),
+        triton.Config(kwargs={"BLOCK_SIZE": 16384}, num_warps=16, num_stages=1),
+        triton.Config(kwargs={"BLOCK_SIZE": 16384}, num_warps=32, num_stages=1),
+        triton.Config(kwargs={"BLOCK_SIZE": 16384}, num_warps=8, num_stages=4),
+        triton.Config(kwargs={"BLOCK_SIZE": 16384}, num_warps=16, num_stages=4),
+        triton.Config(kwargs={"BLOCK_SIZE": 16384}, num_warps=32, num_stages=4),
+    ],
+    key=["hidden_dim"],
+)
+# Fused "RMSNorm -> add residual -> RMSNorm" over one row per program:
+#   mid    = residual + RMSNorm(activ; weight1)
+#   output = RMSNorm(mid; weight2)
+# Row ``pid`` starts at flat offset ``pid * hidden_dim`` in activ, residual,
+# mid and output alike.
+# NOTE(review): there is no loop over the row, so only the first BLOCK_SIZE
+# elements are touched; results are only complete when
+# BLOCK_SIZE >= hidden_dim - confirm for the autotuned configs.
+@triton.jit
+def fused_dual_residual_rmsnorm_kernel(
+    output_ptr,
+    mid_ptr,
+    activ_ptr,
+    residual_ptr,
+    weight1_ptr,
+    weight2_ptr,
+    eps: tl.constexpr,
+    hidden_dim: tl.constexpr,
+    BLOCK_SIZE: tl.constexpr,
+):
+    pid = tl.program_id(axis=0)
+    input_start = pid * hidden_dim
+    offsets = tl.arange(0, BLOCK_SIZE)
+    # Lanes past hidden_dim load 0.0, so they do not contribute to the sums.
+    mask = offsets < hidden_dim
+    a_ = tl.load(activ_ptr + input_start + offsets, mask=mask, other=0.0)
+    a = a_.to(tl.float32)
+    # First norm: sqrt(mean(a^2) + eps), accumulated in float32.
+    rms = tl.sqrt(tl.sum(a * a, axis=0) / hidden_dim + eps)
+    r = tl.load(residual_ptr + input_start + offsets, mask=mask, other=0.0)
+    w1_ = tl.load(weight1_ptr + offsets, mask=mask, other=0.0)
+    w1 = w1_.to(tl.float32)
+    # The normalized value is cast to the residual's dtype before the add,
+    # so the sum is formed (and stored to mid) in that dtype.
+    a2r = r + (a / rms * w1).to(r.dtype)
+    tl.store(
+        mid_ptr + input_start + offsets,
+        a2r,
+        mask=mask,
+    )
+    # Second norm runs on the already-rounded sum, promoted back to float32.
+    a2r = a2r.to(tl.float32)
+    rms2 = tl.sqrt(tl.sum(a2r * a2r, axis=0) / hidden_dim + eps)
+    w2_ = tl.load(weight2_ptr + offsets, mask=mask, other=0.0)
+    w2 = w2_.to(tl.float32)
+    tl.store(
+        output_ptr + input_start + offsets,
+        a2r / rms2 * w2,  # implicitly casts to output dtype here
+        mask=mask,
+    )
+# Same kernel wrapped with the rmsnorm autotune configs defined above.
+fused_dual_residual_rmsnorm_kernel_autotune = rmsnorm_autotune(
+    fused_dual_residual_rmsnorm_kernel
+)
+def fused_dual_residual_rmsnorm(x, residual, weight1, weight2, eps, autotune=False):
+    """Launch the fused dual-RMSNorm kernel with one program per row of ``x``.
+    Args:
+        x: 2-D tensor ``(bs, hidden_dim)``.
+        residual: tensor with the same shape and dtype as ``x``.
+        weight1: weight used by the first normalization.
+        weight2: weight used by the second normalization.
+        eps: epsilon added under the square root in both normalizations.
+        autotune: if true, use the autotuned kernel; otherwise derive
+            ``BLOCK_SIZE`` and ``num_warps`` from ``hidden_dim``.
+    Returns:
+        ``(output, mid)``: two new tensors shaped like ``x``; ``mid`` is the
+        residual sum written by the kernel and ``output`` its second norm.
+    Raises:
+        AssertionError: if ``x`` is not 2-D or does not match ``residual`` in
+            shape and dtype.
+    """
+    assert len(x.shape) == 2
+    assert x.shape == residual.shape and x.dtype == residual.dtype
+    output, mid = torch.empty_like(x), torch.empty_like(x)
+    bs, hidden_dim = x.shape
+    if autotune:
+        fused_dual_residual_rmsnorm_kernel_autotune[(bs,)](
+            output, mid, x, residual, weight1, weight2, eps=eps, hidden_dim=hidden_dim
+        )
+    else:
+        config = {
+            # One block spans the whole row (rounded up to a power of two).
+            "BLOCK_SIZE": triton.next_power_of_2(hidden_dim),
+            # About one warp per 256 elements, clamped to the range [4, 32].
+            "num_warps": max(
+                min(triton.next_power_of_2(triton.cdiv(hidden_dim, 256)), 32), 4
+            ),
+        }
+        fused_dual_residual_rmsnorm_kernel[(bs,)](
+            output,
+            mid,
+            x,
+            residual,
+            weight1,
+            weight2,
+            eps=eps,
+            hidden_dim=hidden_dim,
+            **config,
+        )
+    return output, mid
+# Single RMSNorm over one row per program:
+#   output = activ / sqrt(mean(activ^2) + eps) * weight
+# Row ``pid`` starts at flat offset ``pid * hidden_dim``; there is no loop over
+# the row, so only the first BLOCK_SIZE elements of each row are covered.
+@triton.jit
+def fused_rmsnorm_kernel(
+    output_ptr,
+    activ_ptr,
+    weight_ptr,
+    eps: tl.constexpr,
+    hidden_dim: tl.constexpr,
+    BLOCK_SIZE: tl.constexpr,
+):
+    pid = tl.program_id(axis=0)
+    input_start = pid * hidden_dim
+    offsets = tl.arange(0, BLOCK_SIZE)
+    # Lanes past hidden_dim load 0.0, so they do not contribute to the sum.
+    mask = offsets < hidden_dim
+    a_ = tl.load(activ_ptr + input_start + offsets, mask=mask, other=0.0)
+    a = a_.to(tl.float32)
+    rms = tl.sqrt(tl.sum(a * a, axis=0) / hidden_dim + eps)
+    w1_ = tl.load(weight_ptr + offsets, mask=mask, other=0.0)
+    w1 = w1_.to(tl.float32)
+    a_rms = a / rms * w1
+    tl.store(
+        output_ptr + input_start + offsets,
+        a_rms,  # implicitly casts to output dtype here
+        mask=mask,
+    )
+def fused_rmsnorm(x, weight, eps, autotune=False, inplace=False):
+    """Launch the RMSNorm kernel with one program per row of ``x``.
+    Args:
+        x: 2-D tensor ``(bs, hidden_dim)``.
+        weight: weight applied after normalization.
+        eps: epsilon added under the square root.
+        autotune: accepted but not used by this function.
+        inplace: if true, results are written back into ``x`` and ``x`` is
+            returned; otherwise a new tensor shaped like ``x`` is allocated.
+    Returns:
+        The normalized tensor (``x`` itself when ``inplace`` is true).
+    Raises:
+        AssertionError: if ``x`` is not 2-D.
+    """
+    assert len(x.shape) == 2
+    if inplace:
+        output = x
+    else:
+        output = torch.empty_like(x)
+    bs, hidden_dim = x.shape
+    config = {
+        # One block spans the whole row (rounded up to a power of two).
+        "BLOCK_SIZE": triton.next_power_of_2(hidden_dim),
+        # About one warp per 256 elements, clamped to the range [4, 32].
+        "num_warps": max(
+            min(triton.next_power_of_2(triton.cdiv(hidden_dim, 256)), 32), 4
+        ),
+    }
+    fused_rmsnorm_kernel[(bs,)](
+        output, x, weight, eps=eps, hidden_dim=hidden_dim, **config
+    )
+    return output
+class FusedDualResidualRMSNorm:
+    """
+    Fused implementation of
+    y = RMSNorm2(RMSNorm1(x) + residual)
+    Every forward variant returns ``(y, RMSNorm1(x) + residual)``.
+    Both wrapped norms must share the same ``variance_epsilon`` and weight
+    shape; this is asserted in ``__init__``.
+    """
+    def __init__(self, rmsnorm1, rmsnorm2) -> None:  # the one after rmsnorm1
+        self.rmsnorm1 = rmsnorm1
+        self.rmsnorm2 = rmsnorm2
+        self.variance_epsilon = self.rmsnorm1.variance_epsilon
+        assert self.rmsnorm1.variance_epsilon == self.rmsnorm2.variance_epsilon
+        assert self.rmsnorm1.weight.shape == self.rmsnorm2.weight.shape
+    def __call__(self, *args, **kwargs):
+        # Makes an instance callable like a module: delegates to forward().
+        return self.forward(*args, **kwargs)
+    def forward(
+        self, x: torch.Tensor, residual: torch.Tensor
+    ) -> Tuple[torch.Tensor, torch.Tensor]:
+        # CUDA tensors use the fused Triton kernel; anything else falls back
+        # to calling the two wrapped norm objects one after the other.
+        if x.is_cuda:
+            return self.forward_cuda(x, residual)
+        else:
+            return self.forward_flashinfer(x, residual)
+    def forward_cuda(
+        self, x: torch.Tensor, residual: torch.Tensor, autotune=False
+    ) -> Tuple[torch.Tensor, torch.Tensor]:
+        # Single fused kernel launch using both norms' weights.
+        return fused_dual_residual_rmsnorm(
+            x,
+            residual,
+            self.rmsnorm1.weight,
+            self.rmsnorm2.weight,
+            self.variance_epsilon,
+            autotune=autotune,
+        )
+    def forward_flashinfer(
+        self,
+        x: torch.Tensor,
+        residual: torch.Tensor,
+    ) -> Tuple[torch.Tensor, torch.Tensor]:
+        # Unfused path: invokes the wrapped norms through their own __call__.
+        normed1 = self.rmsnorm1(x)
+        residual = normed1 + residual
+        return self.rmsnorm2(residual), residual
+    def forward_native(
+        self,
+        x: torch.Tensor,
+        residual: torch.Tensor,
+    ) -> Tuple[torch.Tensor, torch.Tensor]:
+        # Unfused path that explicitly uses each norm's forward_native().
+        normed1 = self.rmsnorm1.forward_native(x)
+        residual = normed1 + residual
+        return self.rmsnorm2.forward_native(residual), residual
+# gelu on first half of vector
+# Per row: out = x3 * gelu(x1), where x1 is the first ``hidden_dim`` inputs of
+# the row and x3 the following ``hidden_dim`` inputs. One program per row.
+# ``out_scales_ptr`` and ``static_scale`` are accepted but not referenced in
+# the body; a non-None ``quant_max`` hits the NotImplementedError branch.
+@triton.jit
+def gelu_and_mul_kernel(
+    out_hidden_states_ptr,  # (bs, hidden_dim)
+    out_scales_ptr,  # (bs,)
+    hidden_states_ptr,  # (bs, hidden_dim * 2)
+    quant_max: tl.constexpr,
+    static_scale: tl.constexpr,
+    hidden_dim: tl.constexpr,  # the output hidden_dim
+    BLOCK_SIZE: tl.constexpr,
+):
+    pid = tl.program_id(axis=0)
+    # Input rows are twice as wide as output rows.
+    input_start = pid * hidden_dim * 2
+    output_start = pid * hidden_dim
+    input1_offs = tl.arange(0, BLOCK_SIZE)
+    mask = tl.arange(0, BLOCK_SIZE) < hidden_dim  # shared for input1, input3, output
+    input3_offs = hidden_dim + tl.arange(0, BLOCK_SIZE)
+    output_offs = tl.arange(0, BLOCK_SIZE)
+    x1 = tl.load(
+        hidden_states_ptr + input_start + input1_offs, mask=mask, other=0.0
+    ).to(tl.float32)
+    x3 = tl.load(
+        hidden_states_ptr + input_start + input3_offs, mask=mask, other=0.0
+    ).to(tl.float32)
+    # gelu
+    # cast down before mul to better match training?
+    # erf-based GELU: 0.5 * (1 + erf(x / sqrt(2))) * x; 0.7071... is 1/sqrt(2).
+    gelu_x1 = 0.5 * (1.0 + tl.erf(x1 * 0.7071067811865475)) * x1
+    out = x3 * gelu_x1.to(hidden_states_ptr.dtype.element_ty)
+    if quant_max is not None:
+        raise NotImplementedError()
+    tl.store(out_hidden_states_ptr + output_start + output_offs, out, mask=mask)
+def gelu_and_mul_triton(
+    hidden_states,
+    scales=None,
+    quantize=None,  # dtype to quantize to
+    out=None,
+):
+    """Launch ``gelu_and_mul_kernel`` with one program per row.
+    Args:
+        hidden_states: 2-D tensor ``(bs, in_hidden_dim)``; the output width is
+            ``in_hidden_dim // 2``.
+        scales: optional scales tensor; when given together with ``quantize``
+            it is used as ``out_scales`` and ``static_scale`` is set.
+        quantize: optional output dtype; when set, ``torch.finfo(quantize).max``
+            is passed to the kernel as ``quant_max``.
+        out: optional preallocated output of shape ``(bs, hidden_dim)`` and
+            dtype ``quantize or hidden_states.dtype``.
+    Returns:
+        ``(out_hidden_states, out_scales)``; the second item is ``None`` when
+        ``quantize`` is ``None``.
+    Raises:
+        AssertionError: if ``out`` is given with the wrong shape or dtype.
+    """
+    # NOTE(review): the kernel raises NotImplementedError whenever quant_max
+    # is not None, so the quantize path looks unfinished - confirm before use.
+    bs, in_hidden_dim = hidden_states.shape
+    hidden_dim = in_hidden_dim // 2
+    if out is None:
+        out_hidden_states = torch.empty(
+            (bs, hidden_dim),
+            dtype=quantize or hidden_states.dtype,
+            device=hidden_states.device,
+        )
+    else:
+        assert out.shape == (bs, hidden_dim)
+        assert out.dtype == (quantize or hidden_states.dtype)
+        out_hidden_states = out
+    out_scales = None
+    static_scale = False
+    if quantize is not None:
+        if scales is None:
+            # No scales supplied: allocate one float32 slot per row.
+            out_scales = torch.empty(
+                (bs,), dtype=torch.float32, device=hidden_states.device
+            )
+        else:
+            out_scales = scales
+            static_scale = True
+    config = {
+        # 8 ele per thread (not tuned)
+        "num_warps": max(
+            min(triton.next_power_of_2(triton.cdiv(hidden_dim, 8 * 32)), 32), 4
+        ),
+    }
+    gelu_and_mul_kernel[(bs,)](
+        out_hidden_states,
+        out_scales,
+        hidden_states,
+        quant_max=torch.finfo(quantize).max if quantize is not None else None,
+        static_scale=static_scale,
+        hidden_dim=hidden_dim,
+        BLOCK_SIZE=triton.next_power_of_2(hidden_dim),
+        **config,
+    )
+    if quantize is not None:
+        return out_hidden_states, out_scales
+    else:
+        return out_hidden_states, None

sglang/srt/layers/linear.py CHANGED Viewed

@@ -18,6 +18,7 @@ from sglang.srt.distributed import (
 )
 from sglang.srt.layers.parameter import (
     BasevLLMParameter,
+    BlockQuantScaleParameter,
     PackedColumnParameter,
     PackedvLLMParameter,
     PerTensorScaleParameter,
@@ -27,7 +28,6 @@ from sglang.srt.layers.quantization.base_config import (
     QuantizationConfig,
     QuantizeMethodBase,
 )
-from sglang.srt.layers.quantization.fp8_utils import BlockQuantScaleParameter
 from sglang.srt.utils import set_weight_attrs
 logger = logging.getLogger(__name__)

sglang 0.4.3.post4__py3-none-any.whl → 0.4.4.post1__py3-none-any.whl

sglang 0.4.3.post4__py3-none-any.whl → 0.4.4.post1__py3-none-any.whl