sglang 0.4.6.post2__py3-none-any.whl → 0.4.6.post3__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (90)
  1. sglang/bench_one_batch.py +1 -11
  2. sglang/bench_serving.py +149 -1
  3. sglang/lang/chat_template.py +44 -0
  4. sglang/srt/configs/deepseekvl2.py +3 -0
  5. sglang/srt/configs/device_config.py +1 -1
  6. sglang/srt/configs/internvl.py +696 -0
  7. sglang/srt/configs/janus_pro.py +3 -0
  8. sglang/srt/configs/model_config.py +17 -0
  9. sglang/srt/constrained/xgrammar_backend.py +11 -19
  10. sglang/srt/conversation.py +30 -3
  11. sglang/srt/disaggregation/decode.py +4 -1
  12. sglang/srt/disaggregation/mini_lb.py +74 -23
  13. sglang/srt/disaggregation/mooncake/conn.py +9 -18
  14. sglang/srt/disaggregation/nixl/conn.py +241 -71
  15. sglang/srt/disaggregation/utils.py +44 -1
  16. sglang/srt/distributed/device_communicators/custom_all_reduce.py +1 -8
  17. sglang/srt/distributed/device_communicators/npu_communicator.py +39 -0
  18. sglang/srt/distributed/device_communicators/pynccl.py +2 -1
  19. sglang/srt/distributed/device_communicators/shm_broadcast.py +2 -1
  20. sglang/srt/distributed/parallel_state.py +22 -1
  21. sglang/srt/entrypoints/engine.py +14 -2
  22. sglang/srt/entrypoints/http_server.py +28 -1
  23. sglang/srt/entrypoints/verl_engine.py +3 -2
  24. sglang/srt/hf_transformers_utils.py +20 -1
  25. sglang/srt/layers/attention/flashattention_backend.py +146 -50
  26. sglang/srt/layers/attention/flashinfer_backend.py +23 -13
  27. sglang/srt/layers/attention/flashinfer_mla_backend.py +62 -15
  28. sglang/srt/layers/attention/merge_state.py +46 -0
  29. sglang/srt/layers/attention/triton_ops/merge_state.py +96 -0
  30. sglang/srt/layers/attention/vision.py +290 -163
  31. sglang/srt/layers/moe/ep_moe/kernels.py +342 -7
  32. sglang/srt/layers/moe/ep_moe/layer.py +120 -1
  33. sglang/srt/layers/moe/ep_moe/token_dispatcher.py +97 -54
  34. sglang/srt/layers/moe/fused_moe_triton/configs/E=264,N=256,device_name=NVIDIA_B200,dtype=fp8_w8a8,block_shape=[128, 128].json +146 -0
  35. sglang/srt/layers/moe/fused_moe_triton/configs/E=272,N=128,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8,block_shape=[128, 128].json +146 -0
  36. sglang/srt/layers/moe/fused_moe_triton/fused_moe.py +4 -1
  37. sglang/srt/layers/quantization/compressed_tensors/compressed_tensors_moe.py +2 -4
  38. sglang/srt/layers/quantization/compressed_tensors/schemes/compressed_tensors_w8a8_fp8.py +2 -1
  39. sglang/srt/layers/quantization/deep_gemm.py +5 -0
  40. sglang/srt/layers/quantization/fp8.py +108 -95
  41. sglang/srt/layers/quantization/fp8_kernel.py +79 -60
  42. sglang/srt/layers/quantization/fp8_utils.py +71 -23
  43. sglang/srt/layers/quantization/kv_cache.py +3 -10
  44. sglang/srt/layers/quantization/utils.py +0 -5
  45. sglang/srt/layers/quantization/w8a8_fp8.py +8 -10
  46. sglang/srt/lora/lora_manager.py +10 -13
  47. sglang/srt/managers/cache_controller.py +115 -119
  48. sglang/srt/managers/io_struct.py +10 -0
  49. sglang/srt/managers/multimodal_processors/base_processor.py +5 -0
  50. sglang/srt/managers/multimodal_processors/internvl.py +232 -0
  51. sglang/srt/managers/schedule_batch.py +19 -1
  52. sglang/srt/managers/schedule_policy.py +11 -5
  53. sglang/srt/managers/scheduler.py +28 -13
  54. sglang/srt/managers/tokenizer_manager.py +24 -13
  55. sglang/srt/managers/tp_worker.py +9 -12
  56. sglang/srt/mem_cache/chunk_cache.py +2 -0
  57. sglang/srt/mem_cache/memory_pool.py +2 -2
  58. sglang/srt/model_executor/model_runner.py +44 -33
  59. sglang/srt/model_loader/loader.py +18 -11
  60. sglang/srt/models/clip.py +4 -4
  61. sglang/srt/models/deepseek_janus_pro.py +1 -1
  62. sglang/srt/models/deepseek_nextn.py +1 -20
  63. sglang/srt/models/deepseek_v2.py +55 -20
  64. sglang/srt/models/gemma3_mm.py +1 -1
  65. sglang/srt/models/internlm2.py +3 -0
  66. sglang/srt/models/internvl.py +670 -0
  67. sglang/srt/models/llama.py +1 -1
  68. sglang/srt/models/llama4.py +53 -7
  69. sglang/srt/models/minicpmv.py +1 -1
  70. sglang/srt/models/mllama.py +1 -1
  71. sglang/srt/models/phi3_small.py +16 -2
  72. sglang/srt/models/qwen2_5_vl.py +8 -4
  73. sglang/srt/models/qwen2_vl.py +4 -4
  74. sglang/srt/models/xiaomi_mimo.py +171 -0
  75. sglang/srt/openai_api/adapter.py +24 -40
  76. sglang/srt/openai_api/protocol.py +28 -16
  77. sglang/srt/reasoning_parser.py +2 -2
  78. sglang/srt/sampling/sampling_batch_info.py +54 -2
  79. sglang/srt/sampling/sampling_params.py +2 -0
  80. sglang/srt/server_args.py +30 -6
  81. sglang/srt/utils.py +35 -1
  82. sglang/test/test_block_fp8.py +2 -2
  83. sglang/test/test_deepep_utils.py +219 -0
  84. sglang/test/test_utils.py +3 -1
  85. sglang/version.py +1 -1
  86. {sglang-0.4.6.post2.dist-info → sglang-0.4.6.post3.dist-info}/METADATA +14 -6
  87. {sglang-0.4.6.post2.dist-info → sglang-0.4.6.post3.dist-info}/RECORD +90 -80
  88. {sglang-0.4.6.post2.dist-info → sglang-0.4.6.post3.dist-info}/WHEEL +1 -1
  89. {sglang-0.4.6.post2.dist-info → sglang-0.4.6.post3.dist-info}/licenses/LICENSE +0 -0
  90. {sglang-0.4.6.post2.dist-info → sglang-0.4.6.post3.dist-info}/top_level.txt +0 -0
sglang/srt/layers/attention/vision.py
@@ -1,6 +1,7 @@
  from __future__ import annotations
 
- from functools import lru_cache
+ import math
+ from functools import lru_cache, wraps
  from typing import Optional, Tuple
 
  import torch
@@ -8,6 +9,13 @@ import torch.nn as nn
  import torch.nn.functional as F
  from einops import rearrange
 
+ from sglang.srt.utils import is_cuda
+
+ _is_cuda = is_cuda()
+
+ if _is_cuda:
+     from sgl_kernel.flash_attn import flash_attn_varlen_func
+
  from sglang.srt.distributed import parallel_state
  from sglang.srt.distributed import utils as dist_utils
  from sglang.srt.layers.attention.triton_ops.prefill_attention import (
@@ -19,166 +27,31 @@ from sglang.srt.layers.linear import (
      RowParallelLinear,
  )
  from sglang.srt.layers.quantization import QuantizationConfig
- from sglang.srt.layers.rotary_embedding import apply_rotary_pos_emb, rotate_half
- from sglang.srt.utils import add_prefix
+ from sglang.srt.layers.rotary_embedding import apply_rotary_pos_emb
+ from sglang.srt.managers.schedule_batch import global_server_args_dict
+ from sglang.srt.utils import add_prefix, logger
 
-
- class VisionAttention(nn.Module):
-     r"""
-     Multi-headed attention without any cache, mostly used for ViT.
+ ROTARY_EMBED_CLASSES = {
+     "normal": apply_rotary_pos_emb,
+ }
 
 
-     Args:
-         use_qkv_parallel (bool, optional): If True, use QKV-parallel attention.
-         use_context_forward (bool, default to True):
-             if ``True``, a flash_attn style attention will be applied
-             Otherwise, a full-sequence attention will be applied.
-         softmax_in_single_precision (bool, default to False):
-             if ``True``, the softmax will be performed in single-precision
-             Otherwise, it will be performed in half-precision
+ def execute_once(func):
+     has_run = None
 
-     """
+     @wraps(func)
+     def wrapper(*args, **kwargs):
+         nonlocal has_run
+         if not has_run:
+             func(*args, **kwargs)
+             has_run = True
 
-     def __init__(
-         self,
-         embed_dim: int,
-         num_heads: int,
-         projection_size: int,
-         use_qkv_parallel: bool,
-         quant_config: Optional[QuantizationConfig] = None,
-         dropout: float = 0.0,
-         use_context_forward: bool = True,
-         softmax_in_single_precision: bool = False,
-         flatten_batch: bool = False,
-         prefix: str = "",
-     ):
-         super().__init__()
-         self.use_context_forward = use_context_forward
-         world_size = parallel_state.get_tensor_model_parallel_world_size()
-         self.dropout = dropout
-         self.head_size = embed_dim // num_heads
-         self.hidden_size_per_attention_head = dist_utils.divide(
-             projection_size, num_heads
-         )
-         self.num_attention_heads_per_partition = dist_utils.divide(
-             num_heads, world_size
-         )
+     return wrapper
 
-         if self.use_context_forward:
-             self.qkv_backend = VisionTritonAttention()
-         else:
-             self.qkv_backend = VisionSdpaAttention(
-                 head_size=self.head_size,
-                 dropout=dropout,
-                 flatten_batch=flatten_batch,
-                 softmax_in_single_precision=softmax_in_single_precision,
-             )
 
-         self.use_qkv_parallel = use_qkv_parallel
-         if use_qkv_parallel:
-             self.qkv_proj = QKVParallelLinear(
-                 hidden_size=embed_dim,
-                 head_size=self.head_size,
-                 total_num_heads=num_heads,
-                 quant_config=quant_config,
-                 prefix=add_prefix("qkv_proj", prefix),
-             )
-         else:
-             self.qkv_proj = ColumnParallelLinear(
-                 input_size=embed_dim,
-                 output_size=3 * projection_size,
-                 quant_config=quant_config,
-                 prefix=add_prefix("qkv_proj", prefix),
-             )
-         self.proj = RowParallelLinear(
-             input_size=embed_dim,
-             output_size=embed_dim,
-             quant_config=quant_config,
-             prefix=add_prefix("proj", prefix),
-         )
-
-     def forward(
-         self,
-         x: torch.Tensor,
-         cu_seqlens: Optional[torch.Tensor] = None,
-         position_embeddings: Optional[Tuple[torch.Tensor, torch.Tensor]] = None,
-         attention_mask: Optional[torch.Tensor] = None,
-     ) -> torch.Tensor:
-         r"""
-         Args:
-             x: [b, s, embed_dim]
-             cu_seqlens: [b]
-         Returns:
-             [s, b, head * head_size]
-         """
-         bsz, s, _ = x.shape
-         head = self.num_attention_heads_per_partition
-         if self.use_qkv_parallel:
-             # [b, s, embed_dim] --> [b, s, embed_dim]
-             qkv, _ = self.qkv_proj(x)
-             q, k, v = qkv.chunk(3, dim=-1)
-
-             # [b, s, embed_dim] --> [b * s, head, head_size]
-             q, k, v = [x.reshape(bsz * s, head, -1).contiguous() for x in (q, k, v)]
-         else:
-             # [b, s, embed_dim] --> [s, b, embed_dim]
-             x = rearrange(x, "b s ... -> s b ...")
-             # [s, b, embed_dim] --> [s, b, head * 3 * head_size]
-             qkv, _ = self.qkv_proj(x)
-             # [s, b, head * 3 * head_size] --> [s, b, head, 3 * head_size]
-             new_x_shape = qkv.size()[:-1] + (
-                 head,
-                 3 * self.hidden_size_per_attention_head,
-             )
-             qkv = qkv.view(*new_x_shape)
-
-             # [s, b, head, 3 * head_size] --> 3 [s, b, head, head_size]
-             q, k, v = dist_utils.split_tensor_along_last_dim(qkv, 3)
-
-             # [s, b, head, head_size] --> [b, s, head, head_size]
-             q, k, v = [
-                 rearrange(x, "s b ... -> b s ...").contiguous() for x in (q, k, v)
-             ]
-
-         if position_embeddings is not None:
-             cos, sin = position_embeddings
-             original_shape = q.shape
-             # [total_tokens, head, head_size]
-             q = q.view(-1, head, self.head_size)
-             k = k.view(-1, head, self.head_size)
-
-             q, k = apply_rotary_pos_emb(q, k, cos, sin)
-
-             q = q.view(original_shape)
-             k = k.view(original_shape)
-
-         if self.use_qkv_parallel:
-             pass
-         else:
-             # [b, s, head, head_size] --> [b * s, head, head_size]
-             q, k, v = [rearrange(x, "b s ... -> (b s) ...") for x in [q, k, v]]
-
-         output = self.qkv_backend.forward(q, k, v, bsz, cu_seqlens, attention_mask)
-
-         if self.use_qkv_parallel:
-             # [b * s, h, head_size] --> [b, s, h * head_size]
-             output = rearrange(output, "(b s) ... h d -> b s ... (h d)", b=bsz)
-
-             # [b, s, h * head_size] --> [b, s, h * head_size]
-             output, _ = self.proj(output)
-         else:
-             # [b * s, h, head_size] --> [s, b, h * head_size]
-             context_layer = rearrange(
-                 output, "(b s) h d -> s b (h d)", b=bsz, s=s
-             ).contiguous()
-
-             # [s, b, h * head_size] --> [s, b, h * head_size]
-             output, _ = self.proj(context_layer)
-
-             # [s, b, h * head_size] --> [b, s, h * head_size]
-             output = output.view(bsz, s, -1)
-
-         return output
+ @execute_once
+ def info_once(message: str):
+     logger.info(message)
 
 
  class VisionSdpaAttention(nn.Module):
@@ -189,16 +62,22 @@ class VisionSdpaAttention(nn.Module):
 
      def __init__(
          self,
-         head_size: int,
+         head_dim: int,
+         num_heads: int,
+         num_kv_heads: int,
          dropout: float = 0.0,
          flatten_batch: bool = False,
          softmax_in_single_precision: bool = False,
+         **kwargs,
      ):
          super().__init__()
-         self.head_size = head_size
+         self.head_size = head_dim
+         self.num_heads = num_heads
+         self.num_kv_heads = num_kv_heads
          self.flatten_batch = flatten_batch
          self.softmax_in_single_precision = softmax_in_single_precision
          self.dropout = dropout
+         self.scale = 1.0 / math.sqrt(self.head_size)
 
      @staticmethod
      @lru_cache(maxsize=128)
@@ -212,7 +91,7 @@ class VisionSdpaAttention(nn.Module):
              flatten_batch: whether to flatten batch dimension
              cu_seqlens: tuple of cumulative sequence lengths
          Returns:
-             attention mask tensor
+             attention mask tensor of shape [b, 1, s, s] or [1, s, s]
          """
          if flatten_batch:
              mask = torch.zeros([1, s, s], dtype=torch.bool)
@@ -241,7 +120,7 @@ class VisionSdpaAttention(nn.Module):
          flatten_batch: bool = False,
      ) -> Optional[torch.Tensor]:
          r"""
-         Creates a non-causal 4D mask of shape `(b, 1, s, s)` or `(1, 1, s, s)`.
+         Creates a non-causal 4D mask of shape `(b, 1, s, s)` or `(1, s, s)`.
          Args:
              s: sequence length
              cu_seqlens: cumulative sequence lengths tensor. If not, returns an empty mask
@@ -264,6 +143,7 @@ class VisionSdpaAttention(nn.Module):
          bsz: int,
          cu_seqlens: Optional[torch.Tensor] = None,
          attention_mask: Optional[torch.Tensor] = None,
+         **kwargs,
      ) -> torch.Tensor:
          r"""
          Args:
@@ -274,6 +154,8 @@ class VisionSdpaAttention(nn.Module):
          if self.flatten_batch:
              assert bsz == 1, "flatten_batch is True, bsz must be 1"
 
+         assert q.dim() == 3, q.shape
+
          s = q.shape[0] // bsz
 
          # [b, 1, s, s]
@@ -291,10 +173,10 @@ class VisionSdpaAttention(nn.Module):
          q, k, v = [rearrange(x, "(b s) h d -> b h s d", b=bsz) for x in [q, k, v]]
 
          if self.softmax_in_single_precision:
-             scale = self.head_size**-0.5
-             k_transposed = rearrange(k, "b h s d -> b h d s")
-             attn_weights = torch.matmul(q, k_transposed) * scale
-             del k, k_transposed
+             k = rearrange(k, "b h s d -> b h d s")
+             attn_weights = torch.matmul(q, k) * self.scale
+             del k
+             # masking
              attention_mask = (~attention_mask) * torch.finfo(q.dtype).min
              attn_weights = attn_weights + attention_mask
              del attention_mask
@@ -332,6 +214,7 @@ class VisionTritonAttention(nn.Module):
 
      def __init__(
          self,
+         **kwargs,
      ):
          super().__init__()
 
@@ -340,8 +223,8 @@ class VisionTritonAttention(nn.Module):
          q: torch.Tensor,
          k: torch.Tensor,
          v: torch.Tensor,
-         _bsz: int,
          cu_seqlens: Optional[torch.Tensor],
+         **kwargs,
      ) -> torch.Tensor:
          r"""
          Args:
@@ -366,3 +249,247 @@ class VisionTritonAttention(nn.Module):
          )
 
          return output
+
+
+ class VisionFlash3Attention(nn.Module):
+     def __init__(
+         self,
+         **kwargs,
+     ):
+         if not _is_cuda:
+             raise Exception("VisionFlash3Attention is only available for cuda")
+         super().__init__()
+
+     def forward(
+         self,
+         q: torch.Tensor,
+         k: torch.Tensor,
+         v: torch.Tensor,
+         cu_seqlens: Optional[torch.Tensor],
+         attention_mask: Optional[torch.Tensor] = None,
+         **kwargs,
+     ) -> torch.Tensor:
+         r"""
+         Args:
+             cu_seqlens: [b]
+         Returns:
+             [b * s, h, head_size]
+         """
+         cu_seqlens = cu_seqlens.to(dtype=torch.int32).cuda()
+         seq_lens = cu_seqlens[1:] - cu_seqlens[:-1]
+         max_seqlen = seq_lens.max().item()
+         output = flash_attn_varlen_func(
+             q,
+             k,
+             v,
+             cu_seqlens_q=cu_seqlens,
+             cu_seqlens_k=cu_seqlens,
+             max_seqlen_q=max_seqlen,
+             max_seqlen_k=max_seqlen,
+         )
+
+         return output
+
+
+ QKV_BACKEND_IMPL = {
+     "triton_attn": VisionTritonAttention,
+     "sdpa": VisionSdpaAttention,
+     "fa3": VisionFlash3Attention,
+ }
+
+
+ class VisionAttention(nn.Module):
+     r"""
+     Multi-headed attention without any cache, mostly used for multimodal transformers.
+
+
+     Args:
+         use_qkv_parallel (bool, optional): If True, use QKV-parallel attention.
+         softmax_in_single_precision (bool, default to False):
+             if ``True``, the softmax will be performed in single-precision
+             Otherwise, it will be performed in half-precision
+
+     """
+
+     def __init__(
+         self,
+         embed_dim: int,
+         num_heads: int,
+         projection_size: int,
+         use_qkv_parallel: bool,
+         qkv_backend: Optional[str] = None,
+         quant_config: Optional[QuantizationConfig] = None,
+         dropout: float = 0.0,
+         softmax_in_single_precision: bool = False,
+         flatten_batch: bool = False,
+         prefix: str = "",
+         proj_bias: bool = True,
+         **kwargs,
+     ):
+         super().__init__()
+         world_size = parallel_state.get_tensor_model_parallel_world_size()
+         self.dropout = dropout
+         self.head_size = embed_dim // num_heads
+         self.hidden_size_per_attention_head = dist_utils.divide(
+             projection_size, num_heads
+         )
+         self.num_attention_heads_per_partition = dist_utils.divide(
+             num_heads, world_size
+         )
+         self.num_attention_kv_heads_per_partition = dist_utils.divide(
+             num_heads, world_size
+         )
+
+         self.q_size = self.num_attention_heads_per_partition * self.head_size
+         self.kv_size = self.num_attention_kv_heads_per_partition * self.head_size
+
+         if global_server_args_dict["mm_attention_backend"] is None:
+             if qkv_backend is None:
+                 qkv_backend = "sdpa"
+             info_once(f"Multimodal attention backend not set. Use {qkv_backend}.")
+         else:
+             qkv_backend = global_server_args_dict["mm_attention_backend"]
+
+         info_once(f"Using {qkv_backend} as multimodal attention backend.")
+
+         self.qkv_backend = QKV_BACKEND_IMPL[qkv_backend](
+             head_dim=self.head_size,
+             num_heads=self.num_attention_heads_per_partition,
+             num_kv_heads=self.num_attention_kv_heads_per_partition,
+             dropout=dropout,
+             flatten_batch=flatten_batch,
+             softmax_in_single_precision=softmax_in_single_precision,
+         )
+
+         self.use_qkv_parallel = use_qkv_parallel
+         if use_qkv_parallel:
+             self.qkv_proj = QKVParallelLinear(
+                 hidden_size=embed_dim,
+                 head_size=self.head_size,
+                 total_num_heads=num_heads,
+                 total_num_kv_heads=num_heads,
+                 quant_config=quant_config,
+                 prefix=add_prefix("qkv_proj", prefix),
+             )
+         else:
+             self.qkv_proj = ColumnParallelLinear(
+                 input_size=embed_dim,
+                 output_size=3 * projection_size,
+                 quant_config=quant_config,
+                 prefix=add_prefix("qkv_proj", prefix),
+             )
+         self.proj = RowParallelLinear(
+             input_size=embed_dim,
+             output_size=embed_dim,
+             bias=proj_bias,
+             quant_config=quant_config,
+             prefix=add_prefix("proj", prefix),
+         )
+
+     def forward(
+         self,
+         x: torch.Tensor,
+         cu_seqlens: Optional[torch.Tensor] = None,
+         position_embeddings: Optional[Tuple[torch.Tensor, torch.Tensor]] = None,
+         attention_mask: Optional[torch.Tensor] = None,
+         **kwargs,
+     ) -> torch.Tensor:
+         r"""
+         Args:
+             x: [b, s, embed_dim]
+             cu_seqlens: [b]
+         Returns:
+             [s, b, head * head_size]
+         """
+         if x.dim() == 2:
+             x = x.unsqueeze(0)
+         assert x.dim() == 3, x.shape
+         bsz, s, _ = x.shape
+         head = self.num_attention_heads_per_partition
+         kv_head = self.num_attention_kv_heads_per_partition
+         if self.use_qkv_parallel:
+             # [b, s, embed_dim] --> [b, s, embed_dim]
+             qkv, _ = self.qkv_proj(x)
+
+             q, k, v = qkv.split([self.q_size, self.kv_size, self.kv_size], dim=-1)
+
+             # [b, s, embed_dim] --> [b * s, head, head_size]
+             q = q.reshape(bsz * s, head, -1).contiguous()
+             k = k.reshape(bsz * s, kv_head, -1).contiguous()
+             v = v.reshape(bsz * s, kv_head, -1).contiguous()
+         else:
+             # [b, s, embed_dim] --> [s, b, embed_dim]
+             x = rearrange(x, "b s ... -> s b ...")
+             # [s, b, embed_dim] --> [s, b, head * 3 * head_size]
+             qkv, _ = self.qkv_proj(x)
+
+             # [s, b, head * 3 * head_size] --> [s, b, head, 3 * head_size]
+             new_x_shape = qkv.size()[:-1] + (
+                 head,
+                 3 * self.hidden_size_per_attention_head,
+             )
+             qkv = qkv.view(*new_x_shape)
+
+             # [s, b, head, 3 * head_size] --> 3 [s, b, head, head_size]
+             q, k, v = dist_utils.split_tensor_along_last_dim(qkv, 3)
+             # [s, b, head, head_size] --> [b, s, head, head_size]
+             q, k, v = [
+                 rearrange(x, "s b ... -> b s ...").contiguous() for x in (q, k, v)
+             ]
+
+         if position_embeddings is not None:
+             cos, sin = position_embeddings
+             original_shape = q.shape
+             # [total_tokens, head, head_size]
+             q = q.view(-1, head, self.head_size)
+             k = k.view(-1, head, self.head_size)
+
+             q, k = apply_rotary_pos_emb(q, k, cos, sin)
+
+             q = q.view(original_shape)
+             k = k.view(original_shape)
+
+         if q.dim() == 4:
+             # [b, s, head, head_size] --> [b * s, head, head_size]
+             q = rearrange(q, "b s ... -> (b s) ...")
+         if k.dim() == 4:
+             # [b, s, head, head_size] --> [b * s, head, head_size]
+             k = rearrange(k, "b s ... -> (b s) ...")
+         if v.dim() == 4:
+             # [b, s, head, head_size] --> [b * s, head, head_size]
+             v = rearrange(v, "b s ... -> (b s) ...")
+
+         assert q.dim() == 3, q.dim()
+         assert k.dim() == 3, k.dim()
+         assert v.dim() == 3, v.dim()
+
+         output = self.qkv_backend.forward(
+             q=q,
+             k=k,
+             v=v,
+             bsz=bsz,
+             cu_seqlens=cu_seqlens,
+             attention_mask=attention_mask,
+         )
+
+         assert output.dim() == 3, output.shape
+
+         if self.use_qkv_parallel:
+             # [b * s, h, head_size] --> [b, s, h * head_size]
+             output = rearrange(output, "(b s) ... h d -> b s ... (h d)", b=bsz)
+
+             # [b, s, h * head_size] --> [b, s, h * head_size]
+             output, _ = self.proj(output)
+         else:
+             # [b * s, h, head_size] --> [s, b, h * head_size]
+             context_layer = rearrange(
+                 output, "(b s) h d -> s b (h d)", b=bsz, s=s
+             ).contiguous()
+
+             # [s, b, h * head_size] --> [s, b, h * head_size]
+             output, _ = self.proj(context_layer)
+
+             # [s, b, h * head_size] --> [b, s, h * head_size]
+             output = output.view(bsz, s, -1)
+
+         return output
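
Note on the vision.py hunk above: the refactor replaces the old use_context_forward switch with a named backend registry (QKV_BACKEND_IMPL, mapping "triton_attn", "sdpa", and "fa3" to implementation classes) plus a log-once helper. Below is a minimal, self-contained sketch of that selection pattern for orientation only. The DummySdpa/DummyFa3 classes, the select_backend helper, and the plain server_args dict are stand-ins invented for illustration, not sglang APIs; the real code dispatches to VisionSdpaAttention, VisionTritonAttention, and VisionFlash3Attention and reads global_server_args_dict["mm_attention_backend"].

# Sketch of the backend-registry pattern added in this diff (stand-in names).
import logging
from functools import wraps
from typing import Optional

logger = logging.getLogger(__name__)


def execute_once(func):
    # Same shape as the decorator added in the diff: run the wrapped
    # function on the first call only, then turn later calls into no-ops.
    has_run = None

    @wraps(func)
    def wrapper(*args, **kwargs):
        nonlocal has_run
        if not has_run:
            func(*args, **kwargs)
            has_run = True

    return wrapper


@execute_once
def info_once(message: str) -> None:
    logger.info(message)


class DummySdpa:  # stand-in for VisionSdpaAttention
    def __init__(self, **kwargs):
        self.kwargs = kwargs


class DummyFa3:  # stand-in for VisionFlash3Attention
    def __init__(self, **kwargs):
        self.kwargs = kwargs


BACKENDS = {"sdpa": DummySdpa, "fa3": DummyFa3}  # stand-in for QKV_BACKEND_IMPL


def select_backend(server_args: dict, qkv_backend: Optional[str] = None, **impl_kwargs):
    # Mirrors VisionAttention.__init__: the server-level setting wins,
    # then the per-module argument, then an "sdpa" fallback, with a
    # one-time log message either way.
    if server_args.get("mm_attention_backend") is None:
        if qkv_backend is None:
            qkv_backend = "sdpa"
        info_once(f"Multimodal attention backend not set. Use {qkv_backend}.")
    else:
        qkv_backend = server_args["mm_attention_backend"]

    info_once(f"Using {qkv_backend} as multimodal attention backend.")
    return BACKENDS[qkv_backend](**impl_kwargs)


if __name__ == "__main__":
    logging.basicConfig(level=logging.INFO)
    backend = select_backend({}, None, head_dim=64, num_heads=16, num_kv_heads=16)
    print(type(backend).__name__)  # DummySdpa

Running the sketch with an empty server_args falls back to "sdpa" and logs the fallback exactly once, which mirrors how the refactored VisionAttention picks its qkv_backend when neither the server flag nor the caller specifies one.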