sglang 0.4.2__py3-none-any.whl → 0.4.2.post2__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (85)
  1. sglang/srt/constrained/outlines_backend.py +9 -1
  2. sglang/srt/custom_op.py +40 -0
  3. sglang/srt/entrypoints/engine.py +2 -2
  4. sglang/srt/layers/activation.py +10 -5
  5. sglang/srt/layers/attention/flashinfer_backend.py +284 -39
  6. sglang/srt/layers/attention/triton_backend.py +71 -7
  7. sglang/srt/layers/attention/triton_ops/decode_attention.py +53 -59
  8. sglang/srt/layers/attention/triton_ops/prefill_attention.py +6 -0
  9. sglang/srt/layers/attention/vision.py +243 -40
  10. sglang/srt/layers/layernorm.py +1 -5
  11. sglang/srt/layers/moe/ep_moe/layer.py +1 -3
  12. sglang/srt/layers/moe/fused_moe_triton/configs/E=256,N=256,device_name=AMD_Radeon_Graphics,dtype=fp8_w8a8,block_shape=[128, 128].json +164 -0
  13. sglang/srt/layers/moe/fused_moe_triton/configs/E=256,N=256,device_name=NVIDIA_B200,dtype=fp8_w8a8,block_shape=[128, 128].json +146 -0
  14. sglang/srt/layers/moe/fused_moe_triton/configs/E=8,N=14336,device_name=AMD_Radeon_Graphics.json +200 -0
  15. sglang/srt/layers/moe/fused_moe_triton/configs/E=8,N=1792,device_name=AMD_Radeon_Graphics.json +200 -0
  16. sglang/srt/layers/moe/fused_moe_triton/configs/E=8,N=3584,device_name=AMD_Radeon_Graphics.json +200 -0
  17. sglang/srt/layers/moe/fused_moe_triton/configs/E=8,N=4096,device_name=AMD_Radeon_Graphics,dtype=fp8_w8a8.json +178 -0
  18. sglang/srt/layers/moe/fused_moe_triton/configs/E=8,N=7168,device_name=AMD_Radeon_Graphics.json +200 -0
  19. sglang/srt/layers/moe/fused_moe_triton/configs/E=8,N=8192,device_name=AMD_Radeon_Graphics,dtype=fp8_w8a8.json +175 -0
  20. sglang/srt/layers/moe/fused_moe_triton/fused_moe.py +3 -11
  21. sglang/srt/layers/moe/fused_moe_triton/layer.py +1 -3
  22. sglang/srt/layers/moe/topk.py +4 -0
  23. sglang/srt/layers/quantization/configs/N=1536,K=7168,device_name=AMD_Radeon_Graphics,dtype=fp8_w8a8,block_shape=[128, 128].json +164 -0
  24. sglang/srt/layers/quantization/configs/N=24576,K=7168,device_name=NVIDIA_B200,dtype=fp8_w8a8,block_shape=[128, 128].json +146 -0
  25. sglang/srt/layers/quantization/configs/N=3072,K=1536,device_name=AMD_Radeon_Graphics,dtype=fp8_w8a8,block_shape=[128, 128].json +164 -0
  26. sglang/srt/layers/quantization/configs/N=3072,K=1536,device_name=NVIDIA_B200,dtype=fp8_w8a8,block_shape=[128, 128].json +146 -0
  27. sglang/srt/layers/quantization/configs/N=3072,K=7168,device_name=NVIDIA_B200,dtype=fp8_w8a8,block_shape=[128, 128].json +146 -0
  28. sglang/srt/layers/quantization/configs/N=32768,K=512,device_name=NVIDIA_B200,dtype=fp8_w8a8,block_shape=[128, 128].json +146 -0
  29. sglang/srt/layers/quantization/configs/N=4096,K=512,device_name=AMD_Radeon_Graphics,dtype=fp8_w8a8,block_shape=[128, 128].json +164 -0
  30. sglang/srt/layers/quantization/configs/N=4096,K=512,device_name=NVIDIA_B200,dtype=fp8_w8a8,block_shape=[128, 128].json +146 -0
  31. sglang/srt/layers/quantization/configs/N=4608,K=7168,device_name=AMD_Radeon_Graphics,dtype=fp8_w8a8,block_shape=[128, 128].json +164 -0
  32. sglang/srt/layers/quantization/configs/N=4608,K=7168,device_name=NVIDIA_B200,dtype=fp8_w8a8,block_shape=[128, 128].json +146 -0
  33. sglang/srt/layers/quantization/configs/N=512,K=7168,device_name=AMD_Radeon_Graphics,dtype=fp8_w8a8,block_shape=[128, 128].json +164 -0
  34. sglang/srt/layers/quantization/configs/N=512,K=7168,device_name=NVIDIA_B200,dtype=fp8_w8a8,block_shape=[128, 128].json +146 -0
  35. sglang/srt/layers/quantization/configs/N=576,K=7168,device_name=AMD_Radeon_Graphics,dtype=fp8_w8a8,block_shape=[128, 128].json +164 -0
  36. sglang/srt/layers/quantization/configs/N=576,K=7168,device_name=NVIDIA_B200,dtype=fp8_w8a8,block_shape=[128, 128].json +146 -0
  37. sglang/srt/layers/quantization/configs/N=7168,K=16384,device_name=NVIDIA_B200,dtype=fp8_w8a8,block_shape=[128, 128].json +146 -0
  38. sglang/srt/layers/quantization/configs/N=7168,K=18432,device_name=NVIDIA_B200,dtype=fp8_w8a8,block_shape=[128, 128].json +146 -0
  39. sglang/srt/layers/quantization/configs/N=7168,K=2048,device_name=AMD_Radeon_Graphics,dtype=fp8_w8a8,block_shape=[128, 128].json +164 -0
  40. sglang/srt/layers/quantization/configs/N=7168,K=2048,device_name=NVIDIA_B200,dtype=fp8_w8a8,block_shape=[128, 128].json +146 -0
  41. sglang/srt/layers/quantization/configs/N=7168,K=2304,device_name=AMD_Radeon_Graphics,dtype=fp8_w8a8,block_shape=[128, 128].json +164 -0
  42. sglang/srt/layers/quantization/configs/N=7168,K=2304,device_name=NVIDIA_B200,dtype=fp8_w8a8,block_shape=[128, 128].json +146 -0
  43. sglang/srt/layers/quantization/configs/N=7168,K=256,device_name=AMD_Radeon_Graphics,dtype=fp8_w8a8,block_shape=[128, 128].json +164 -0
  44. sglang/srt/layers/quantization/configs/N=7168,K=256,device_name=NVIDIA_B200,dtype=fp8_w8a8,block_shape=[128, 128].json +146 -0
  45. sglang/srt/layers/quantization/fp8.py +7 -0
  46. sglang/srt/layers/quantization/fp8_kernel.py +140 -2
  47. sglang/srt/layers/rotary_embedding.py +29 -15
  48. sglang/srt/layers/sampler.py +9 -6
  49. sglang/srt/lora/backend/__init__.py +8 -0
  50. sglang/srt/lora/backend/base_backend.py +95 -0
  51. sglang/srt/lora/backend/flashinfer_backend.py +91 -0
  52. sglang/srt/lora/backend/triton_backend.py +61 -0
  53. sglang/srt/lora/lora.py +127 -112
  54. sglang/srt/lora/lora_manager.py +50 -18
  55. sglang/srt/lora/triton_ops/__init__.py +5 -0
  56. sglang/srt/lora/triton_ops/qkv_lora_b.py +182 -0
  57. sglang/srt/lora/triton_ops/sgemm_lora_a.py +143 -0
  58. sglang/srt/lora/triton_ops/sgemm_lora_b.py +159 -0
  59. sglang/srt/managers/image_processor.py +77 -38
  60. sglang/srt/managers/scheduler.py +17 -3
  61. sglang/srt/mem_cache/base_prefix_cache.py +4 -0
  62. sglang/srt/mem_cache/chunk_cache.py +3 -0
  63. sglang/srt/mem_cache/radix_cache.py +30 -1
  64. sglang/srt/model_executor/cuda_graph_runner.py +77 -80
  65. sglang/srt/model_executor/forward_batch_info.py +58 -59
  66. sglang/srt/model_executor/model_runner.py +2 -2
  67. sglang/srt/models/minicpmv.py +129 -76
  68. sglang/srt/models/mllama.py +16 -56
  69. sglang/srt/models/qwen2.py +4 -1
  70. sglang/srt/models/qwen2_vl.py +19 -9
  71. sglang/srt/server_args.py +19 -2
  72. sglang/srt/speculative/build_eagle_tree.py +4 -2
  73. sglang/srt/speculative/eagle_draft_cuda_graph_runner.py +213 -0
  74. sglang/srt/speculative/eagle_utils.py +361 -372
  75. sglang/srt/speculative/eagle_worker.py +177 -45
  76. sglang/srt/utils.py +7 -2
  77. sglang/test/runners.py +2 -0
  78. sglang/utils.py +42 -0
  79. sglang/version.py +1 -1
  80. {sglang-0.4.2.dist-info → sglang-0.4.2.post2.dist-info}/METADATA +16 -7
  81. {sglang-0.4.2.dist-info → sglang-0.4.2.post2.dist-info}/RECORD +84 -45
  82. sglang/srt/layers/custom_op_util.py +0 -25
  83. {sglang-0.4.2.dist-info → sglang-0.4.2.post2.dist-info}/LICENSE +0 -0
  84. {sglang-0.4.2.dist-info → sglang-0.4.2.post2.dist-info}/WHEEL +0 -0
  85. {sglang-0.4.2.dist-info → sglang-0.4.2.post2.dist-info}/top_level.txt +0 -0
sglang/srt/layers/attention/triton_ops/decode_attention.py

@@ -49,11 +49,9 @@ def _fwd_kernel_stage1(
  K_Buffer,
  V_Buffer,
  sm_scale,
- Req_to_tokens,
- B_req_idx,
- B_Seqlen,
+ kv_indptr,
+ kv_indices,
  Att_Out,
- stride_req_to_tokens_b,
  stride_qbs,
  stride_qh,
  stride_buf_kbs,
@@ -82,8 +80,9 @@ def _fwd_kernel_stage1(
  offs_dv = tl.arange(0, BLOCK_DV)
  mask_d = offs_d < Lk
  mask_dv = offs_dv < Lv
- cur_batch_seq_len = tl.load(B_Seqlen + cur_batch)
- cur_batch_req_idx = tl.load(B_req_idx + cur_batch)
+
+ cur_batch_kv_start_idx = tl.load(kv_indptr + cur_batch)
+ cur_batch_seq_len = tl.load(kv_indptr + cur_batch + 1) - cur_batch_kv_start_idx

  off_q = cur_batch * stride_qbs + cur_head * stride_qh + offs_d
  q = tl.load(Q + off_q, mask=mask_d, other=0.0)
@@ -100,7 +99,7 @@ def _fwd_kernel_stage1(
  for start_n in range(split_kv_start, split_kv_end, BLOCK_N):
  offs_n = start_n + tl.arange(0, BLOCK_N)
  kv_loc = tl.load(
- Req_to_tokens + stride_req_to_tokens_b * cur_batch_req_idx + offs_n,
+ kv_indices + cur_batch_kv_start_idx + offs_n,
  mask=offs_n < split_kv_end,
  other=0,
  )
@@ -173,19 +172,21 @@ def _decode_att_m_fwd(
  k_buffer,
  v_buffer,
  att_out,
- Req_to_tokens,
- B_req_idx,
- B_Seqlen,
+ kv_indptr,
+ kv_indices,
  num_kv_splits,
  sm_scale,
  logit_cap,
  ):
  BLOCK = 64
+ # [TODO] work around SGPR limit on MI3xx
+ if is_hip_:
+ BLOCK = 8
  NUM_KV_SPLITS = num_kv_splits
  Lk = k_buffer.shape[-1]
  Lv = v_buffer.shape[-1]

- batch, head_num = B_req_idx.shape[0], q.shape[1]
+ batch, head_num = kv_indptr.shape[0] - 1, q.shape[1]

  grid = (batch, head_num, NUM_KV_SPLITS)
  kv_group_num = q.shape[1] // k_buffer.shape[1]
@@ -194,6 +195,8 @@ def _decode_att_m_fwd(
  num_warps = 4
  else:
  num_warps = 2
+ if is_hip_:
+ num_warps = 1

  BLOCK_DMODEL = triton.next_power_of_2(Lk)
  BLOCK_DV = triton.next_power_of_2(Lv)
@@ -203,11 +206,9 @@ def _decode_att_m_fwd(
  k_buffer,
  v_buffer,
  sm_scale,
- Req_to_tokens,
- B_req_idx,
- B_Seqlen,
+ kv_indptr,
+ kv_indices,
  att_out,
- Req_to_tokens.stride(0),
  q.stride(0),
  q.stride(1),
  k_buffer.stride(0),
@@ -236,11 +237,9 @@ def _fwd_grouped_kernel_stage1(
  K_Buffer,
  V_Buffer,
  sm_scale,
- Req_to_tokens,
- B_req_idx,
- B_Seqlen,
+ kv_indptr,
+ kv_indices,
  Att_Out,
- stride_req_to_tokens_b,
  stride_qbs,
  stride_qh,
  stride_buf_kbs,
@@ -279,8 +278,9 @@ def _fwd_grouped_kernel_stage1(
  offs_dv = tl.arange(0, BLOCK_DV)
  mask_d = offs_d < Lk
  mask_dv = offs_dv < Lv
- cur_batch_seq_len = tl.load(B_Seqlen + cur_batch)
- cur_batch_req_idx = tl.load(B_req_idx + cur_batch)
+
+ cur_batch_kv_start_idx = tl.load(kv_indptr + cur_batch)
+ cur_batch_seq_len = tl.load(kv_indptr + cur_batch + 1) - cur_batch_kv_start_idx

  offs_q = cur_batch * stride_qbs + cur_head[:, None] * stride_qh + offs_d[None, :]
  q = tl.load(Q + offs_q, mask=(mask_h[:, None]) & (mask_d[None, :]), other=0.0)
@@ -307,7 +307,7 @@ def _fwd_grouped_kernel_stage1(
  for start_n in range(split_kv_start, split_kv_end, BLOCK_N):
  offs_n = start_n + tl.arange(0, BLOCK_N)
  kv_loc = tl.load(
- Req_to_tokens + stride_req_to_tokens_b * cur_batch_req_idx + offs_n,
+ kv_indices + cur_batch_kv_start_idx + offs_n,
  mask=offs_n < split_kv_end,
  other=0,
  )
@@ -395,9 +395,8 @@ def _decode_grouped_att_m_fwd(
  k_buffer,
  v_buffer,
  att_out,
- Req_to_tokens,
- B_req_idx,
- B_Seqlen,
+ kv_indptr,
+ kv_indices,
  num_kv_splits,
  sm_scale,
  logit_cap,
@@ -421,7 +420,7 @@ def _decode_grouped_att_m_fwd(
  BLOCK_DPE = 0
  BLOCK_DV = triton.next_power_of_2(Lv)

- batch, head_num = B_req_idx.shape[0], q.shape[1]
+ batch, head_num = kv_indptr.shape[0] - 1, q.shape[1]
  kv_group_num = q.shape[1] // k_buffer.shape[1]

  BLOCK_H = 16
@@ -433,21 +432,21 @@ def _decode_grouped_att_m_fwd(
  )

  extra_kargs = {}
+ num_stages = 2
  if is_hip_:
  # https://rocm.docs.amd.com/en/docs-6.2.0/how-to/llm-fine-tuning-optimization/optimizing-triton-kernel.html
  # https://github.com/triton-lang/triton/blob/main/third_party/amd/backend/compiler.py
- extra_kargs = {"waves_per_eu": 4, "matrix_instr_nonkdim": 16, "kpack": 2}
+ extra_kargs = {"waves_per_eu": 1, "matrix_instr_nonkdim": 16, "kpack": 2}
+ num_stages = 1

  _fwd_grouped_kernel_stage1[grid](
  q,
  k_buffer,
  v_buffer,
  sm_scale,
- Req_to_tokens,
- B_req_idx,
- B_Seqlen,
+ kv_indptr,
+ kv_indices,
  att_out,
- Req_to_tokens.stride(0),
  q.stride(0),
  q.stride(1),
  k_buffer.stride(0),
@@ -467,7 +466,7 @@ def _decode_grouped_att_m_fwd(
  NUM_KV_SPLITS=NUM_KV_SPLITS,
  logit_cap=logit_cap,
  num_warps=4,
- num_stages=2,
+ num_stages=num_stages,
  Lk=Lk,
  Lv=Lv,
  **extra_kargs,
@@ -478,7 +477,7 @@ def _decode_grouped_att_m_fwd(
  def _fwd_kernel_stage2(
  Mid_O,
  O,
- B_Seqlen,
+ kv_indptr,
  stride_mid_ob,
  stride_mid_oh,
  stride_mid_os,
@@ -491,7 +490,9 @@ def _fwd_kernel_stage2(
  cur_batch = tl.program_id(0)
  cur_head = tl.program_id(1)

- cur_batch_seq_len = tl.load(B_Seqlen + cur_batch)
+ cur_batch_seq_len = tl.load(kv_indptr + cur_batch + 1) - tl.load(
+ kv_indptr + cur_batch
+ )

  offs_d = tl.arange(0, BLOCK_DV)
  mask_d = offs_d < Lv
@@ -535,7 +536,7 @@ def _decode_softmax_reducev_fwd(
  q,
  o,
  v_buffer,
- b_seq_len,
+ kv_indptr,
  num_kv_splits,
  ):
  batch, head_num = q.shape[0], q.shape[1]
@@ -554,7 +555,7 @@ def _decode_softmax_reducev_fwd(
  _fwd_kernel_stage2[grid](
  logits,
  o,
- b_seq_len,
+ kv_indptr,
  logits.stride(0),
  logits.stride(1),
  logits.stride(2),
@@ -574,9 +575,8 @@ def decode_attention_fwd_normal(
  k_buffer,
  v_buffer,
  o,
- req_to_token,
- b_req_idx,
- b_seq_len,
+ kv_indptr,
+ kv_indices,
  attn_logits,
  num_kv_splits,
  sm_scale,
@@ -587,14 +587,13 @@ def decode_attention_fwd_normal(
  k_buffer,
  v_buffer,
  attn_logits,
- req_to_token,
- b_req_idx,
- b_seq_len,
+ kv_indptr,
+ kv_indices,
  num_kv_splits,
  sm_scale,
  logit_cap,
  )
- _decode_softmax_reducev_fwd(attn_logits, q, o, v_buffer, b_seq_len, num_kv_splits)
+ _decode_softmax_reducev_fwd(attn_logits, q, o, v_buffer, kv_indptr, num_kv_splits)


  def decode_attention_fwd_grouped(
@@ -602,9 +601,8 @@ def decode_attention_fwd_grouped(
  k_buffer,
  v_buffer,
  o,
- req_to_token,
- b_req_idx,
- b_seq_len,
+ kv_indptr,
+ kv_indices,
  attn_logits,
  num_kv_splits,
  sm_scale,
@@ -615,14 +613,13 @@ def decode_attention_fwd_grouped(
  k_buffer,
  v_buffer,
  attn_logits,
- req_to_token,
- b_req_idx,
- b_seq_len,
+ kv_indptr,
+ kv_indices,
  num_kv_splits,
  sm_scale,
  logit_cap,
  )
- _decode_softmax_reducev_fwd(attn_logits, q, o, v_buffer, b_seq_len, num_kv_splits)
+ _decode_softmax_reducev_fwd(attn_logits, q, o, v_buffer, kv_indptr, num_kv_splits)


  def decode_attention_fwd(
@@ -630,9 +627,8 @@ def decode_attention_fwd(
  k_buffer,
  v_buffer,
  o,
- req_to_token,
- b_req_idx,
- b_seq_len,
+ kv_indptr,
+ kv_indices,
  attn_logits,
  num_kv_splits,
  sm_scale,
@@ -648,9 +644,8 @@ def decode_attention_fwd(
  k_buffer,
  v_buffer,
  o,
- req_to_token,
- b_req_idx,
- b_seq_len,
+ kv_indptr,
+ kv_indices,
  attn_logits,
  num_kv_splits,
  sm_scale,
@@ -663,9 +658,8 @@ def decode_attention_fwd(
  k_buffer,
  v_buffer,
  o,
- req_to_token,
- b_req_idx,
- b_seq_len,
+ kv_indptr,
+ kv_indices,
  attn_logits,
  num_kv_splits,
  sm_scale,
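Taken together, these hunks switch the Triton decode path from request-table indexing (Req_to_tokens, B_req_idx, B_Seqlen) to a flattened CSR-style pair: kv_indptr stores per-request offsets, so the batch size falls out as kv_indptr.shape[0] - 1 and each request's KV length as the difference of adjacent entries, while kv_indices stores the actual KV-cache slot of every token; this is also why stride_req_to_tokens_b disappears from the kernel signatures. A minimal sketch of how such a pair can be assembled from the old inputs; tensor names and values are illustrative, not part of the sglang API:

import torch

# Toy stand-ins for the old per-request indexing (values are illustrative).
req_to_token = torch.tensor([[10, 11, 12, 13],   # KV slots of request row 0
                             [20, 21, 22, 23]])  # KV slots of request row 1
b_req_idx = torch.tensor([0, 1])                 # which table rows are in the batch
b_seq_len = torch.tensor([3, 4])                 # current KV length per request

# Flattened CSR-style layout the updated kernels consume.
kv_indptr = torch.zeros(b_seq_len.numel() + 1, dtype=torch.int64)
kv_indptr[1:] = torch.cumsum(b_seq_len, dim=0)   # -> [0, 3, 7]
kv_indices = torch.cat(
    [req_to_token[r, :l] for r, l in zip(b_req_idx.tolist(), b_seq_len.tolist())]
)                                                # -> [10, 11, 12, 20, 21, 22, 23]

assert kv_indptr.shape[0] - 1 == b_seq_len.numel()   # batch recovered from indptr alone

This mirrors how the wrappers above now pass kv_indptr both to the stage-1 kernels and to _decode_softmax_reducev_fwd, which recomputes each request's length from adjacent indptr entries.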
sglang/srt/layers/attention/triton_ops/prefill_attention.py

@@ -166,6 +166,12 @@ def _fwd_kernel(
  def context_attention_fwd(
  q, k, v, o, b_start_loc, b_seq_len, max_input_len, is_causal=True
  ):
+ """
+ q, k, v: [b * s, head, head_dim]
+ b_start_loc: [b]
+ b_seq_len: [b]
+ out: [b * s, head, head_dim]
+ """
  if is_cuda_available and CUDA_CAPABILITY[0] > 8:
  BLOCK = 128
  else:
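The added docstring pins down the packed varlen layout context_attention_fwd expects. A hedged call sketch with toy shapes (sizes and dtypes chosen here purely for illustration):

import torch

heads, head_dim = 8, 64
b_seq_len = torch.tensor([5, 7], device="cuda")    # per-sequence lengths, shape [b]
b_start_loc = torch.tensor([0, 5], device="cuda")  # start offset of each sequence, shape [b]
total_tokens = int(b_seq_len.sum())                # the packed b * s dimension

q = torch.randn(total_tokens, heads, head_dim, device="cuda", dtype=torch.float16)
k, v = torch.randn_like(q), torch.randn_like(q)
o = torch.empty_like(q)

context_attention_fwd(
    q, k, v, o,
    b_start_loc, b_seq_len,
    max_input_len=int(b_seq_len.max()),
    is_causal=False,   # the ViT callers below use non-causal attention
)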
sglang/srt/layers/attention/vision.py

@@ -4,6 +4,7 @@ from typing import Optional

  import torch
  import torch.nn as nn
+ import torch.nn.functional as F
  from einops import rearrange, repeat

  from sglang.srt.distributed import parallel_state
@@ -63,7 +64,20 @@ def apply_rotary_pos_emb_vision(t: torch.Tensor, freqs: torch.Tensor) -> torch.T


  class VisionAttention(nn.Module):
- """Multi-headed attention without any cache, mostly used for ViT."""
+ r"""
+ Multi-headed attention without any cache, mostly used for ViT.
+
+
+ Args:
+ use_qkv_parallel (bool, optional): If True, use QKV-parallel attention.
+ use_context_forward (bool, default to True):
+ if ``True``, a flash_attn style attention will be applied
+ Otherwise, a full-sequence attention will be applied.
+ use_full_precision_softmax (bool, default to False):
+ if ``True``, the softmax will be performed in full-precision
+ Otherwise, it will be performed in half-precision
+
+ """

  def __init__(
  self,
@@ -72,25 +86,39 @@ class VisionAttention(nn.Module):
  projection_size: int,
  use_qkv_parallel: bool,
  quant_config: Optional[QuantizationConfig] = None,
+ dropout: float = 0.0,
+ use_context_forward: bool = True,
+ use_full_precision_softmax: bool = False,
+ flatten_batch: bool = False,
  prefix: str = "",
  ):
  super().__init__()
+ self.use_context_forward = use_context_forward
  world_size = parallel_state.get_tensor_model_parallel_world_size()
-
+ self.dropout = dropout
+ self.head_size = embed_dim // num_heads
  self.hidden_size_per_attention_head = dist_utils.divide(
  projection_size, num_heads
  )
  self.num_attention_heads_per_partition = dist_utils.divide(
  num_heads, world_size
  )
- # self.tp_size = get_tensor_model_parallel_world_size()
- # num_heads = self.num_heads_per_partition
+
+ if self.use_context_forward:
+ self.qkv_backend = VisionTritonAttention()
+ else:
+ self.qkv_backend = VisionSdpaAttention(
+ head_size=self.head_size,
+ dropout=dropout,
+ flatten_batch=flatten_batch,
+ use_full_precision_softmax=use_full_precision_softmax,
+ )
+
  self.use_qkv_parallel = use_qkv_parallel
  if use_qkv_parallel:
- self.head_dim = embed_dim // num_heads
  self.qkv_proj = QKVParallelLinear(
  hidden_size=embed_dim,
- head_size=self.head_dim,
+ head_size=self.head_size,
  total_num_heads=num_heads,
  quant_config=quant_config,
  prefix=f"{prefix}.qkv_proj",
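The constructor now picks its backend from these flags: use_context_forward=True routes q/k/v through the varlen Triton kernel (VisionTritonAttention), while use_context_forward=False selects the masked SDPA path, optionally with an fp32 softmax. A hedged construction sketch, assuming sglang's tensor-parallel state is already initialized; the sizes are placeholders rather than values from any particular vision tower:

# Placeholder sizes; a real caller takes these from the vision tower config.
attn = VisionAttention(
    embed_dim=1280,
    num_heads=16,
    projection_size=1280,
    use_qkv_parallel=True,
    dropout=0.0,
    use_context_forward=False,        # pick VisionSdpaAttention instead of the Triton path
    use_full_precision_softmax=True,  # do the softmax in fp32 before casting back
    flatten_batch=True,               # one (1, 1, s, s) block-diagonal mask over packed patches
)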
@@ -114,12 +142,15 @@ class VisionAttention(nn.Module):
  x: torch.Tensor,
  cu_seqlens: Optional[torch.Tensor] = None,
  rotary_pos_emb: torch.Tensor = None,
+ attention_mask: Optional[torch.Tensor] = None,
  ) -> torch.Tensor:
+ r"""
+ Args:
+ x: [b, s, embed_dim]
+ cu_seqlens: [b]
+ Returns:
+ [s, b, num_heads * head]
  """
- Input shape: [b, s, embed_dim]
- Output shape: [s, b, num_heads * head_size]
- """
-
  bsz, s, _ = x.shape
  if self.use_qkv_parallel:
  # [b, s, embed_dim] --> [b, s, embed_dim]
@@ -136,19 +167,19 @@ class VisionAttention(nn.Module):
  else:
  # [b, s, embed_dim] --> [s, b, embed_dim]
  x = rearrange(x, "b s ... -> s b ...")
- # [s, b, embed_dim] --> [s, b, head * 3 * head_dim]
+ # [s, b, embed_dim] --> [s, b, head * 3 * head_size]
  qkv, _ = self.qkv_proj(x)
- # [s, b, head * 3 * head_dim] --> [s, b, head, 3 * head_dim]
+ # [s, b, head * 3 * head_size] --> [s, b, head, 3 * head_size]
  new_x_shape = qkv.size()[:-1] + (
  self.num_attention_heads_per_partition,
  3 * self.hidden_size_per_attention_head,
  )
  qkv = qkv.view(*new_x_shape)

- # [s, b, head, 3 * head_dim] --> 3 [s, b, head, head_dim]
+ # [s, b, head, 3 * head_size] --> 3 [s, b, head, head_size]
  q, k, v = dist_utils.split_tensor_along_last_dim(qkv, 3)

- # [s, b, head, head_dim] --> [b, s, head, head_dim]
+ # [s, b, head, head_size] --> [b, s, head, head_size]
  q, k, v = [
  rearrange(x, "s b ... -> b s ...").contiguous() for x in (q, k, v)
  ]
@@ -160,45 +191,217 @@ class VisionAttention(nn.Module):
  if self.use_qkv_parallel:
  pass
  else:
- # [b, s, head, head_dim] --> [b * s, head, head_dim]
+ # [b, s, head, head_size] --> [b * s, head, head_size]
  q, k, v = [rearrange(x, "b s ... -> (b s) ...") for x in [q, k, v]]

- # [b * s, num_heads, head_size]
- output = torch.empty_like(q)
-
- seq_lens = (cu_seqlens[1:] - cu_seqlens[:-1]).cuda()
- max_seqlen = seq_lens.max().item()
-
- context_attention_fwd(
- q,
- k,
- v,
- output,
- cu_seqlens.cuda(),
- seq_lens,
- max_seqlen,
- is_causal=False,
- )
+ output = self.qkv_backend.forward(q, k, v, bsz, cu_seqlens, attention_mask)

  if self.use_qkv_parallel:
-
- # [b * s, head, head_dim] --> [b, s, head * head_dim]
+ # [b * s, h, head_size] --> [b, s, h * head_size]
  output = rearrange(output, "(b s) ... h d -> b s ... (h d)", b=bsz)

- # [b, s, head, head_dim] --> [b, s, head, head_dim]
+ # [b, s, h * head_size] --> [b, s, h * head_size]
  output, _ = self.proj(output)
  else:
- # [b * s, head, head_dim] --> [b, s, head, head_dim]
- context_layer = rearrange(output, "(b s) ... -> b s ...", b=bsz)
-
- # [s, b, num_heads * head_size]
+ # [b * s, h, head_size] --> [s, b, h * head_size]
  context_layer = rearrange(
- context_layer, "b s h d -> s b (h d)"
+ output, "(b s) h d -> s b (h d)", b=bsz, s=s
  ).contiguous()

- # [s, b, num_heads * head_size] --> [s, b, num_heads * head_size]
+ # [s, b, h * head_size] --> [s, b, h * head_size]
  output, _ = self.proj(context_layer)

+ # [s, b, h * head_size] --> [b, s, h * head_size]
  output = output.view(bsz, s, -1)

  return output
+
+
+ class VisionSdpaAttention(nn.Module):
+ r"""
+ Scaled Dot Product Attention inner product
+
+ """
+
+ # TODO: Should it be released after used?
+ _mask_cache = {}
+
+ def __init__(
+ self,
+ head_size: int,
+ dropout: float = 0.0,
+ flatten_batch: bool = False,
+ use_full_precision_softmax: bool = False,
+ ):
+ super().__init__()
+ self.head_size = head_size
+ self.flatten_batch = flatten_batch
+ self.use_full_precision_softmax = use_full_precision_softmax
+ self.dropout = dropout
+
+ def generate_patch_attention_mask(
+ self,
+ s: int,
+ bsz: int,
+ device,
+ cu_seqlens: Optional[torch.Tensor],
+ flatten_batch: bool = False,
+ dtype=torch.bfloat16,
+ ) -> torch.Tensor:
+ r"""
+ Creates a non-causal 4D mask of shape `(b, 1, s, s)` or `(1, 1, s, s)`.
+
+ When `flatten_batch` is True:
+ - All sequences in the batch are flattened into a single dimension
+ - `s` represents the total number of tokens across all sequences in the batch
+ - Returns a unified mask of shape `(1, 1, s, s)`
+
+ When `flatten_batch` is False:
+ - Each sequence has its own attention mask
+ - `s` represents the maximum sequence length in the batch
+ - Returns separate masks of shape `(b, 1, s, s)`
+
+ Args:
+ flatten_batch: (bool):
+ If True, treats all sequences in the batch as a single flattened sequence
+ If False, generates separate masks for each sequence
+
+ Returns:
+ Tensor of shape `(b, 1, s, s)` or `(1, 1, s, s)`.
+ """
+
+ cache_key = (s, bsz, flatten_batch, tuple(cu_seqlens.cpu().tolist()))
+
+ if cache_key in VisionSdpaAttention._mask_cache:
+ cached_mask = VisionSdpaAttention._mask_cache[cache_key]
+ # print(f"cache hit for key: {cache_key}")
+ return cached_mask.to(device=device, dtype=dtype)
+
+ if cu_seqlens is None:
+ raise ValueError("Internal Error: cu_seqlens cannot be None")
+
+ if flatten_batch:
+ mask = torch.zeros([1, s, s], device=device, dtype=torch.bool)
+ for i in range(1, len(cu_seqlens)):
+ start = cu_seqlens[i - 1]
+ end = cu_seqlens[i]
+ mask[
+ ...,
+ start:end,
+ start:end,
+ ] = True
+ else:
+ # [1, 1, 1, s]
+ row_indices = torch.arange(s, device=device).view(1, 1, 1, s)
+ # [1, 1, s, 1]
+ col_indices = torch.arange(s, device=device).view(1, 1, s, 1)
+ # [b, 1, 1, 1]
+ seq_lens = (
+ (cu_seqlens[1:] - cu_seqlens[:-1]).to(device=device).view(-1, 1, 1, 1)
+ )
+
+ mask = (row_indices < seq_lens) & (col_indices < seq_lens)
+
+ # Convert to attention mask format (False -> 0, True -> -inf)
+ mask = (~mask).to(dtype) * torch.finfo(dtype).min
+
+ VisionSdpaAttention._mask_cache[cache_key] = mask
+
+ return mask
+
+ def forward(
+ self,
+ q: torch.Tensor,
+ k: torch.Tensor,
+ v: torch.Tensor,
+ bsz: int,
+ cu_seqlens: Optional[torch.Tensor] = None,
+ attention_mask: Optional[torch.Tensor] = None,
+ ) -> torch.Tensor:
+ r"""
+ Args:
+ cu_seqlens: [b]
+ Returns:
+ [b * s, h, head_size]
+ """
+
+ s = q.shape[0] // bsz
+
+ # [b, 1, s, s]
+ if attention_mask is None:
+ attention_mask = self.generate_patch_attention_mask(
+ s, bsz, q.device, cu_seqlens, self.flatten_batch, q.dtype
+ )
+ q, k, v = [rearrange(x, "(b s) h d -> b h s d", b=bsz) for x in [q, k, v]]
+ # [b, 1, s]
+ if self.use_full_precision_softmax:
+ scale = self.head_size**-0.5
+ k_transposed = rearrange(k, "b h s d -> b h d s")
+ attn_weights = torch.matmul(q, k_transposed) * scale
+ del k, k_transposed
+ attn_weights = attn_weights + attention_mask
+ del attention_mask
+ # full-precision
+ attn_weights = nn.functional.softmax(
+ attn_weights, dim=-1, dtype=torch.float32
+ ).to(q.dtype)
+ attn_weights = nn.functional.dropout(
+ attn_weights, p=self.dropout, training=False
+ )
+ output = torch.matmul(attn_weights, v)
+ del attn_weights, v
+ else:
+ # SDPA
+ # [b, h, s, head_size]
+ output = F.scaled_dot_product_attention(
+ q, k, v, attention_mask, dropout_p=self.dropout
+ )
+
+ # [b, h, s, head_size] --> [b * s, h, head_size]
+ output = rearrange(output, "b h s d -> (b s) h d")
+
+ return output
+
+
+ class VisionTritonAttention(nn.Module):
+ """
+ Triton-implemented attention without a causal mask
+ """
+
+ def __init__(
+ self,
+ ):
+ super().__init__()
+
+ def forward(
+ self,
+ q: torch.Tensor,
+ k: torch.Tensor,
+ v: torch.Tensor,
+ _bsz: int,
+ cu_seqlens: Optional[torch.Tensor],
+ **kwargs,
+ ) -> torch.Tensor:
+ r"""
+ Args:
+ cu_seqlens: [b]
+ Returns:
+ [b * s, h, head_size]
+ """
+
+ # [b * s, head, head_size]
+ output = torch.empty_like(q)
+ seq_lens = cu_seqlens[1:] - cu_seqlens[:-1]
+ max_seqlen = seq_lens.max().item()
+ context_attention_fwd(
+ q,
+ k,
+ v,
+ output,
+ cu_seqlens.cuda(),
+ seq_lens.cuda(),
+ max_seqlen,
+ is_causal=False,
+ )
+
+ return output
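The SDPA path boils down to block-diagonal masking: each patch may only attend to patches from the same image. A standalone toy reproduction of the non-flattened mask logic above (values are illustrative):

import torch

cu_seqlens = torch.tensor([0, 2, 5])     # two images with 2 and 3 patches
s = 3                                    # max sequence length in this toy batch
seq_lens = (cu_seqlens[1:] - cu_seqlens[:-1]).view(-1, 1, 1, 1)   # [b, 1, 1, 1]

row = torch.arange(s).view(1, 1, 1, s)   # key positions
col = torch.arange(s).view(1, 1, s, 1)   # query positions
keep = (row < seq_lens) & (col < seq_lens)                # True inside each image's own block
mask = (~keep).float() * torch.finfo(torch.float32).min   # additive mask fed to SDPA

print(keep[0, 0])   # image 0 (2 patches): only the top-left 2x2 block is True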
sglang/srt/layers/layernorm.py

@@ -29,14 +29,11 @@ if is_cuda_available():
  rmsnorm,
  )

- from vllm.model_executor.custom_op import CustomOp
-
- from sglang.srt.layers.custom_op_util import register_custom_op
+ from sglang.srt.custom_op import CustomOp

  logger = logging.getLogger(__name__)


- @register_custom_op("sglang_rmsnorm")
  class RMSNorm(CustomOp):
  def __init__(
  self,
@@ -79,7 +76,6 @@ class RMSNorm(CustomOp):
  return x, residual


- @register_custom_op("sglang_gemma_rmsnorm")
  class GemmaRMSNorm(CustomOp):
  def __init__(
  self,
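layernorm.py now takes CustomOp from the new sglang.srt.custom_op module (file 2 in the list above) instead of vLLM, and the register_custom_op decorators go away. A rough sketch of the dispatch pattern such a base class typically provides, following the upstream vLLM convention; the method names are illustrative, not a verbatim copy of sglang/srt/custom_op.py:

import torch
from torch import nn


class CustomOp(nn.Module):
    """Picks a backend-specific forward implementation once, at construction time."""

    def __init__(self):
        super().__init__()
        self._forward_method = self.dispatch_forward()

    def forward(self, *args, **kwargs):
        return self._forward_method(*args, **kwargs)

    def forward_native(self, *args, **kwargs):
        # Pure-PyTorch reference path.
        raise NotImplementedError

    def forward_cuda(self, *args, **kwargs):
        # Fused-kernel path (e.g. the rmsnorm import guarded by is_cuda_available above).
        raise NotImplementedError

    def forward_hip(self, *args, **kwargs):
        # ROCm builds reuse the CUDA path unless a HIP-specific kernel is provided.
        return self.forward_cuda(*args, **kwargs)

    def dispatch_forward(self):
        if torch.cuda.is_available():
            return self.forward_hip if torch.version.hip else self.forward_cuda
        return self.forward_native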