sglang 0.4.6.post2__py3-none-any.whl → 0.4.6.post4__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (150)
  1. sglang/bench_offline_throughput.py +4 -2
  2. sglang/bench_one_batch.py +3 -13
  3. sglang/bench_one_batch_server.py +143 -15
  4. sglang/bench_serving.py +158 -8
  5. sglang/compile_deep_gemm.py +1 -1
  6. sglang/eval/loogle_eval.py +157 -0
  7. sglang/lang/chat_template.py +119 -75
  8. sglang/lang/tracer.py +1 -1
  9. sglang/srt/code_completion_parser.py +1 -1
  10. sglang/srt/configs/deepseekvl2.py +5 -2
  11. sglang/srt/configs/device_config.py +1 -1
  12. sglang/srt/configs/internvl.py +696 -0
  13. sglang/srt/configs/janus_pro.py +3 -0
  14. sglang/srt/configs/model_config.py +18 -0
  15. sglang/srt/constrained/base_grammar_backend.py +55 -72
  16. sglang/srt/constrained/llguidance_backend.py +25 -21
  17. sglang/srt/constrained/outlines_backend.py +27 -26
  18. sglang/srt/constrained/reasoner_grammar_backend.py +22 -33
  19. sglang/srt/constrained/xgrammar_backend.py +71 -53
  20. sglang/srt/conversation.py +78 -46
  21. sglang/srt/disaggregation/base/conn.py +1 -0
  22. sglang/srt/disaggregation/decode.py +11 -3
  23. sglang/srt/disaggregation/fake/conn.py +1 -1
  24. sglang/srt/disaggregation/mini_lb.py +74 -23
  25. sglang/srt/disaggregation/mooncake/conn.py +236 -138
  26. sglang/srt/disaggregation/nixl/conn.py +242 -71
  27. sglang/srt/disaggregation/prefill.py +7 -4
  28. sglang/srt/disaggregation/utils.py +51 -2
  29. sglang/srt/distributed/device_communicators/custom_all_reduce.py +1 -8
  30. sglang/srt/distributed/device_communicators/npu_communicator.py +39 -0
  31. sglang/srt/distributed/device_communicators/pynccl.py +2 -1
  32. sglang/srt/distributed/device_communicators/shm_broadcast.py +2 -1
  33. sglang/srt/distributed/parallel_state.py +22 -1
  34. sglang/srt/entrypoints/engine.py +31 -4
  35. sglang/srt/entrypoints/http_server.py +45 -3
  36. sglang/srt/entrypoints/verl_engine.py +3 -2
  37. sglang/srt/function_call_parser.py +2 -2
  38. sglang/srt/hf_transformers_utils.py +20 -1
  39. sglang/srt/layers/attention/flashattention_backend.py +147 -51
  40. sglang/srt/layers/attention/flashinfer_backend.py +23 -13
  41. sglang/srt/layers/attention/flashinfer_mla_backend.py +62 -15
  42. sglang/srt/layers/attention/merge_state.py +46 -0
  43. sglang/srt/layers/attention/triton_ops/double_sparsity_attention.py +1 -1
  44. sglang/srt/layers/attention/triton_ops/merge_state.py +96 -0
  45. sglang/srt/layers/attention/utils.py +4 -2
  46. sglang/srt/layers/attention/vision.py +290 -163
  47. sglang/srt/layers/dp_attention.py +71 -21
  48. sglang/srt/layers/layernorm.py +1 -1
  49. sglang/srt/layers/logits_processor.py +46 -11
  50. sglang/srt/layers/moe/ep_moe/kernels.py +343 -8
  51. sglang/srt/layers/moe/ep_moe/layer.py +121 -2
  52. sglang/srt/layers/moe/ep_moe/token_dispatcher.py +97 -54
  53. sglang/srt/layers/moe/fused_moe_triton/configs/E=264,N=256,device_name=NVIDIA_B200,dtype=fp8_w8a8,block_shape=[128, 128].json +146 -0
  54. sglang/srt/layers/moe/fused_moe_triton/configs/E=272,N=128,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8,block_shape=[128, 128].json +146 -0
  55. sglang/srt/layers/moe/fused_moe_triton/fused_moe.py +5 -2
  56. sglang/srt/layers/moe/topk.py +1 -1
  57. sglang/srt/layers/quantization/__init__.py +1 -1
  58. sglang/srt/layers/quantization/blockwise_int8.py +2 -2
  59. sglang/srt/layers/quantization/compressed_tensors/compressed_tensors_moe.py +2 -4
  60. sglang/srt/layers/quantization/compressed_tensors/schemes/compressed_tensors_w8a8_fp8.py +2 -1
  61. sglang/srt/layers/quantization/deep_gemm.py +77 -71
  62. sglang/srt/layers/quantization/fp8.py +110 -97
  63. sglang/srt/layers/quantization/fp8_kernel.py +81 -62
  64. sglang/srt/layers/quantization/fp8_utils.py +71 -23
  65. sglang/srt/layers/quantization/int8_kernel.py +2 -2
  66. sglang/srt/layers/quantization/kv_cache.py +3 -10
  67. sglang/srt/layers/quantization/utils.py +0 -5
  68. sglang/srt/layers/quantization/w8a8_fp8.py +8 -10
  69. sglang/srt/layers/sampler.py +0 -4
  70. sglang/srt/layers/vocab_parallel_embedding.py +18 -7
  71. sglang/srt/lora/lora_manager.py +11 -14
  72. sglang/srt/lora/mem_pool.py +4 -4
  73. sglang/srt/lora/triton_ops/gate_up_lora_b.py +1 -1
  74. sglang/srt/lora/triton_ops/qkv_lora_b.py +1 -1
  75. sglang/srt/lora/triton_ops/sgemm_lora_a.py +1 -1
  76. sglang/srt/lora/triton_ops/sgemm_lora_b.py +1 -1
  77. sglang/srt/lora/utils.py +1 -1
  78. sglang/srt/managers/cache_controller.py +115 -119
  79. sglang/srt/managers/data_parallel_controller.py +3 -3
  80. sglang/srt/managers/detokenizer_manager.py +21 -8
  81. sglang/srt/managers/io_struct.py +13 -1
  82. sglang/srt/managers/mm_utils.py +1 -1
  83. sglang/srt/managers/multimodal_processors/base_processor.py +5 -0
  84. sglang/srt/managers/multimodal_processors/internvl.py +232 -0
  85. sglang/srt/managers/multimodal_processors/llava.py +46 -0
  86. sglang/srt/managers/multimodal_processors/pixtral.py +127 -0
  87. sglang/srt/managers/schedule_batch.py +93 -23
  88. sglang/srt/managers/schedule_policy.py +11 -8
  89. sglang/srt/managers/scheduler.py +140 -100
  90. sglang/srt/managers/scheduler_output_processor_mixin.py +124 -55
  91. sglang/srt/managers/tokenizer_manager.py +157 -47
  92. sglang/srt/managers/tp_worker.py +21 -21
  93. sglang/srt/managers/tp_worker_overlap_thread.py +22 -11
  94. sglang/srt/mem_cache/chunk_cache.py +2 -0
  95. sglang/srt/mem_cache/memory_pool.py +4 -2
  96. sglang/srt/metrics/collector.py +312 -37
  97. sglang/srt/model_executor/cuda_graph_runner.py +10 -11
  98. sglang/srt/model_executor/forward_batch_info.py +1 -1
  99. sglang/srt/model_executor/model_runner.py +57 -41
  100. sglang/srt/model_loader/loader.py +18 -11
  101. sglang/srt/models/clip.py +4 -4
  102. sglang/srt/models/deepseek_janus_pro.py +3 -3
  103. sglang/srt/models/deepseek_nextn.py +1 -20
  104. sglang/srt/models/deepseek_v2.py +77 -39
  105. sglang/srt/models/gemma3_mm.py +1 -1
  106. sglang/srt/models/internlm2.py +3 -0
  107. sglang/srt/models/internvl.py +670 -0
  108. sglang/srt/models/llama.py +3 -1
  109. sglang/srt/models/llama4.py +58 -13
  110. sglang/srt/models/llava.py +248 -5
  111. sglang/srt/models/minicpmv.py +1 -1
  112. sglang/srt/models/mixtral.py +98 -34
  113. sglang/srt/models/mllama.py +1 -1
  114. sglang/srt/models/phi3_small.py +16 -2
  115. sglang/srt/models/pixtral.py +467 -0
  116. sglang/srt/models/qwen2_5_vl.py +8 -4
  117. sglang/srt/models/qwen2_vl.py +4 -4
  118. sglang/srt/models/roberta.py +1 -1
  119. sglang/srt/models/torch_native_llama.py +1 -1
  120. sglang/srt/models/xiaomi_mimo.py +171 -0
  121. sglang/srt/openai_api/adapter.py +52 -42
  122. sglang/srt/openai_api/protocol.py +20 -16
  123. sglang/srt/reasoning_parser.py +1 -1
  124. sglang/srt/sampling/custom_logit_processor.py +18 -3
  125. sglang/srt/sampling/sampling_batch_info.py +2 -2
  126. sglang/srt/sampling/sampling_params.py +2 -0
  127. sglang/srt/server_args.py +64 -10
  128. sglang/srt/speculative/eagle_draft_cuda_graph_runner.py +3 -3
  129. sglang/srt/speculative/eagle_utils.py +7 -7
  130. sglang/srt/speculative/eagle_worker.py +22 -19
  131. sglang/srt/utils.py +41 -6
  132. sglang/test/few_shot_gsm8k.py +2 -2
  133. sglang/test/few_shot_gsm8k_engine.py +2 -2
  134. sglang/test/run_eval.py +2 -2
  135. sglang/test/runners.py +8 -1
  136. sglang/test/send_one.py +13 -3
  137. sglang/test/simple_eval_common.py +1 -1
  138. sglang/test/simple_eval_humaneval.py +1 -1
  139. sglang/test/test_block_fp8.py +2 -2
  140. sglang/test/test_deepep_utils.py +219 -0
  141. sglang/test/test_programs.py +5 -5
  142. sglang/test/test_utils.py +92 -15
  143. sglang/utils.py +1 -1
  144. sglang/version.py +1 -1
  145. {sglang-0.4.6.post2.dist-info → sglang-0.4.6.post4.dist-info}/METADATA +18 -9
  146. {sglang-0.4.6.post2.dist-info → sglang-0.4.6.post4.dist-info}/RECORD +150 -137
  147. {sglang-0.4.6.post2.dist-info → sglang-0.4.6.post4.dist-info}/WHEEL +1 -1
  148. /sglang/{llama3_eval.py → eval/llama3_eval.py} +0 -0
  149. {sglang-0.4.6.post2.dist-info → sglang-0.4.6.post4.dist-info}/licenses/LICENSE +0 -0
  150. {sglang-0.4.6.post2.dist-info → sglang-0.4.6.post4.dist-info}/top_level.txt +0 -0

sglang/srt/layers/attention/triton_ops/merge_state.py (new file)
@@ -0,0 +1,96 @@
+from typing import Optional, Tuple
+
+import torch
+import triton
+import triton.language as tl
+
+
+@triton.jit
+def merge_state_kernel(
+    output,  # [NUM_TOKENS, NUM_HEADS, HEAD_SIZE] v_merged
+    output_lse,  # [NUM_TOKENS, NUM_HEADS] s_merged
+    prefix_output,  # [NUM_TOKENS, NUM_HEADS, HEAD_SIZE] v_a
+    prefix_lse,  # [NUM_TOKENS, NUM_HEADS] s_a
+    suffix_output,  # [NUM_TOKENS, NUM_HEADS, HEAD_SIZE] v_b
+    suffix_lse,  # [NUM_TOKENS, NUM_HEADS] s_b
+    HEAD_SIZE: tl.constexpr,
+    PADDED_HEAD_SIZE: tl.constexpr,
+    OUTPUT_LSE: tl.constexpr,
+):
+    token_idx = tl.program_id(0)
+    num_tokens = tl.num_programs(0)
+    head_idx = tl.program_id(1)
+    num_heads = tl.num_programs(1)
+
+    p_lse = tl.load(prefix_lse + token_idx * num_heads + head_idx)
+    s_lse = tl.load(suffix_lse + token_idx * num_heads + head_idx)
+    p_lse = float("-inf") if p_lse == float("inf") else p_lse
+    s_lse = float("-inf") if s_lse == float("inf") else s_lse
+
+    max_lse = tl.maximum(p_lse, s_lse)
+    p_lse = p_lse - max_lse
+    s_lse = s_lse - max_lse
+    out_se = tl.exp(p_lse) + tl.exp(s_lse)
+
+    if OUTPUT_LSE:
+        out_lse = tl.log(out_se) + max_lse
+        tl.store(output_lse + token_idx * num_heads + head_idx, out_lse)
+
+    head_arange = tl.arange(0, PADDED_HEAD_SIZE)
+    head_mask = head_arange < HEAD_SIZE
+    p_out = tl.load(
+        prefix_output
+        + token_idx * num_heads * HEAD_SIZE
+        + head_idx * HEAD_SIZE
+        + head_arange,
+        mask=head_mask,
+    )
+    s_out = tl.load(
+        suffix_output
+        + token_idx * num_heads * HEAD_SIZE
+        + head_idx * HEAD_SIZE
+        + head_arange,
+        mask=head_mask,
+    )
+
+    p_scale = tl.exp(p_lse) / out_se
+    s_scale = tl.exp(s_lse) / out_se
+    out = p_out * p_scale + s_out * s_scale
+    tl.store(
+        output + token_idx * num_heads * HEAD_SIZE + head_idx * HEAD_SIZE + head_arange,
+        out,
+        mask=head_mask,
+    )
+
+
+def merge_state_triton(
+    prefix_output: torch.Tensor,
+    prefix_lse: torch.Tensor,
+    suffix_output: torch.Tensor,
+    suffix_lse: torch.Tensor,
+    output: Optional[torch.Tensor] = None,
+    output_lse: Optional[torch.Tensor] = None,
+) -> Tuple[torch.Tensor, Optional[torch.Tensor]]:
+    # Avoid creating new tensors if they are already provided
+    if output is None:
+        output = torch.empty_like(prefix_output)
+    if output_lse is None:
+        output_lse = torch.empty_like(prefix_lse)
+
+    num_tokens = output.shape[0]
+    num_query_heads = output.shape[1]
+    head_size = output.shape[2]
+    padded_head_size = triton.next_power_of_2(head_size)
+
+    merge_state_kernel[(num_tokens, num_query_heads)](
+        output,
+        output_lse,
+        prefix_output,
+        prefix_lse,
+        suffix_output,
+        suffix_lse,
+        head_size,
+        padded_head_size,
+        output_lse is not None,
+    )
+    return output, output_lse
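
For orientation, `merge_state_triton` implements the usual log-sum-exp merge of two partial attention results computed over disjoint key sets (e.g. a prefix and a suffix split of the KV cache): each partial output is weighted by its normalized `exp(lse - max_lse)` term, and the combined LSE is written back when requested. A minimal sketch of how it could be exercised on a CUDA device, with made-up shapes that are not taken from the package's tests:

import torch

from sglang.srt.layers.attention.triton_ops.merge_state import merge_state_triton

# Hypothetical sizes: 8 tokens, 4 heads, head size 64 (a CUDA device is needed for Triton).
num_tokens, num_heads, head_size = 8, 4, 64
prefix_out = torch.randn(num_tokens, num_heads, head_size, device="cuda")
suffix_out = torch.randn(num_tokens, num_heads, head_size, device="cuda")
prefix_lse = torch.randn(num_tokens, num_heads, device="cuda")
suffix_lse = torch.randn(num_tokens, num_heads, device="cuda")

merged, merged_lse = merge_state_triton(prefix_out, prefix_lse, suffix_out, suffix_lse)

# Reference: a convex combination weighted by the normalized exp(lse - max_lse) terms.
max_lse = torch.maximum(prefix_lse, suffix_lse)
p_w = torch.exp(prefix_lse - max_lse)
s_w = torch.exp(suffix_lse - max_lse)
ref = (
    prefix_out * (p_w / (p_w + s_w)).unsqueeze(-1)
    + suffix_out * (s_w / (p_w + s_w)).unsqueeze(-1)
)
torch.testing.assert_close(merged, ref, rtol=1e-3, atol=1e-3)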

sglang/srt/layers/attention/utils.py
@@ -28,7 +28,8 @@ def create_flashinfer_kv_indices_triton(
 
     num_loop = tl.cdiv(kv_end - kv_start, BLOCK_SIZE)
     for i in range(num_loop):
-        offset = tl.arange(0, BLOCK_SIZE) + i * BLOCK_SIZE
+        # index into req_to_token_ptr needs to be int64
+        offset = tl.arange(0, BLOCK_SIZE).to(tl.int64) + i * BLOCK_SIZE
         mask = offset < kv_end - kv_start
         data = tl.load(
             req_to_token_ptr
@@ -70,8 +71,9 @@ def create_flashmla_kv_indices_triton(
     num_pages_loop = tl.cdiv(kv_end - kv_start, BLOCK_SIZE)
 
     for i in range(num_pages_loop):
+        # index into req_to_token_ptr needs to be int64
         paged_offset = (
-            tl.arange(0, NUM_PAGE_PER_BLOCK) + i * NUM_PAGE_PER_BLOCK
+            tl.arange(0, NUM_PAGE_PER_BLOCK).to(tl.int64) + i * NUM_PAGE_PER_BLOCK
        ) * PAGED_SIZE
         paged_offset_out = tl.arange(0, NUM_PAGE_PER_BLOCK) + i * NUM_PAGE_PER_BLOCK
 
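The `.to(tl.int64)` casts above address an overflow hazard: the flattened offset into the request-to-token table (request row times its stride plus the token position) inherits the dtype of the offset vector, and for large batch and context sizes the product no longer fits in 32 bits. A standalone illustration of the wraparound in plain PyTorch, with hypothetical sizes that are not taken from sglang:

import torch

# Hypothetical table: row stride of 2**20 token slots per request.
stride = 1 << 20
row, col = 2100, 12345  # row * stride is ~2.2e9, which exceeds 2**31 - 1

offset32 = torch.tensor([row], dtype=torch.int32) * stride + col
offset64 = torch.tensor([row], dtype=torch.int64) * stride + col

print(offset32.item())  # silently wraps to a negative index
print(offset64.item())  # correct flattened offset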

sglang/srt/layers/attention/vision.py
@@ -1,6 +1,7 @@
 from __future__ import annotations
 
-from functools import lru_cache
+import math
+from functools import lru_cache, wraps
 from typing import Optional, Tuple
 
 import torch
@@ -8,6 +9,13 @@ import torch.nn as nn
 import torch.nn.functional as F
 from einops import rearrange
 
+from sglang.srt.utils import is_cuda
+
+_is_cuda = is_cuda()
+
+if _is_cuda:
+    from sgl_kernel.flash_attn import flash_attn_varlen_func
+
 from sglang.srt.distributed import parallel_state
 from sglang.srt.distributed import utils as dist_utils
 from sglang.srt.layers.attention.triton_ops.prefill_attention import (
@@ -19,166 +27,31 @@ from sglang.srt.layers.linear import (
     RowParallelLinear,
 )
 from sglang.srt.layers.quantization import QuantizationConfig
-from sglang.srt.layers.rotary_embedding import apply_rotary_pos_emb, rotate_half
-from sglang.srt.utils import add_prefix
+from sglang.srt.layers.rotary_embedding import apply_rotary_pos_emb
+from sglang.srt.managers.schedule_batch import global_server_args_dict
+from sglang.srt.utils import add_prefix, logger
 
-
-class VisionAttention(nn.Module):
-    r"""
-    Multi-headed attention without any cache, mostly used for ViT.
+ROTARY_EMBED_CLASSES = {
+    "normal": apply_rotary_pos_emb,
+}
 
 
-    Args:
-        use_qkv_parallel (bool, optional): If True, use QKV-parallel attention.
-        use_context_forward (bool, default to True):
-            if ``True``, a flash_attn style attention will be applied
-            Otherwise, a full-sequence attention will be applied.
-        softmax_in_single_precision (bool, default to False):
-            if ``True``, the softmax will be performed in single-precision
-            Otherwise, it will be performed in half-precision
+def execute_once(func):
+    has_run = None
 
-    """
+    @wraps(func)
+    def wrapper(*args, **kwargs):
+        nonlocal has_run
+        if not has_run:
+            func(*args, **kwargs)
+            has_run = True
 
-    def __init__(
-        self,
-        embed_dim: int,
-        num_heads: int,
-        projection_size: int,
-        use_qkv_parallel: bool,
-        quant_config: Optional[QuantizationConfig] = None,
-        dropout: float = 0.0,
-        use_context_forward: bool = True,
-        softmax_in_single_precision: bool = False,
-        flatten_batch: bool = False,
-        prefix: str = "",
-    ):
-        super().__init__()
-        self.use_context_forward = use_context_forward
-        world_size = parallel_state.get_tensor_model_parallel_world_size()
-        self.dropout = dropout
-        self.head_size = embed_dim // num_heads
-        self.hidden_size_per_attention_head = dist_utils.divide(
-            projection_size, num_heads
-        )
-        self.num_attention_heads_per_partition = dist_utils.divide(
-            num_heads, world_size
-        )
+    return wrapper
 
-        if self.use_context_forward:
-            self.qkv_backend = VisionTritonAttention()
-        else:
-            self.qkv_backend = VisionSdpaAttention(
-                head_size=self.head_size,
-                dropout=dropout,
-                flatten_batch=flatten_batch,
-                softmax_in_single_precision=softmax_in_single_precision,
-            )
 
-        self.use_qkv_parallel = use_qkv_parallel
-        if use_qkv_parallel:
-            self.qkv_proj = QKVParallelLinear(
-                hidden_size=embed_dim,
-                head_size=self.head_size,
-                total_num_heads=num_heads,
-                quant_config=quant_config,
-                prefix=add_prefix("qkv_proj", prefix),
-            )
-        else:
-            self.qkv_proj = ColumnParallelLinear(
-                input_size=embed_dim,
-                output_size=3 * projection_size,
-                quant_config=quant_config,
-                prefix=add_prefix("qkv_proj", prefix),
-            )
-        self.proj = RowParallelLinear(
-            input_size=embed_dim,
-            output_size=embed_dim,
-            quant_config=quant_config,
-            prefix=add_prefix("proj", prefix),
-        )
-
-    def forward(
-        self,
-        x: torch.Tensor,
-        cu_seqlens: Optional[torch.Tensor] = None,
-        position_embeddings: Optional[Tuple[torch.Tensor, torch.Tensor]] = None,
-        attention_mask: Optional[torch.Tensor] = None,
-    ) -> torch.Tensor:
-        r"""
-        Args:
-            x: [b, s, embed_dim]
-            cu_seqlens: [b]
-        Returns:
-            [s, b, head * head_size]
-        """
-        bsz, s, _ = x.shape
-        head = self.num_attention_heads_per_partition
-        if self.use_qkv_parallel:
-            # [b, s, embed_dim] --> [b, s, embed_dim]
-            qkv, _ = self.qkv_proj(x)
-            q, k, v = qkv.chunk(3, dim=-1)
-
-            # [b, s, embed_dim] --> [b * s, head, head_size]
-            q, k, v = [x.reshape(bsz * s, head, -1).contiguous() for x in (q, k, v)]
-        else:
-            # [b, s, embed_dim] --> [s, b, embed_dim]
-            x = rearrange(x, "b s ... -> s b ...")
-            # [s, b, embed_dim] --> [s, b, head * 3 * head_size]
-            qkv, _ = self.qkv_proj(x)
-            # [s, b, head * 3 * head_size] --> [s, b, head, 3 * head_size]
-            new_x_shape = qkv.size()[:-1] + (
-                head,
-                3 * self.hidden_size_per_attention_head,
-            )
-            qkv = qkv.view(*new_x_shape)
-
-            # [s, b, head, 3 * head_size] --> 3 [s, b, head, head_size]
-            q, k, v = dist_utils.split_tensor_along_last_dim(qkv, 3)
-
-            # [s, b, head, head_size] --> [b, s, head, head_size]
-            q, k, v = [
-                rearrange(x, "s b ... -> b s ...").contiguous() for x in (q, k, v)
-            ]
-
-        if position_embeddings is not None:
-            cos, sin = position_embeddings
-            original_shape = q.shape
-            # [total_tokens, head, head_size]
-            q = q.view(-1, head, self.head_size)
-            k = k.view(-1, head, self.head_size)
-
-            q, k = apply_rotary_pos_emb(q, k, cos, sin)
-
-            q = q.view(original_shape)
-            k = k.view(original_shape)
-
-        if self.use_qkv_parallel:
-            pass
-        else:
-            # [b, s, head, head_size] --> [b * s, head, head_size]
-            q, k, v = [rearrange(x, "b s ... -> (b s) ...") for x in [q, k, v]]
-
-        output = self.qkv_backend.forward(q, k, v, bsz, cu_seqlens, attention_mask)
-
-        if self.use_qkv_parallel:
-            # [b * s, h, head_size] --> [b, s, h * head_size]
-            output = rearrange(output, "(b s) ... h d -> b s ... (h d)", b=bsz)
-
-            # [b, s, h * head_size] --> [b, s, h * head_size]
-            output, _ = self.proj(output)
-        else:
-            # [b * s, h, head_size] --> [s, b, h * head_size]
-            context_layer = rearrange(
-                output, "(b s) h d -> s b (h d)", b=bsz, s=s
-            ).contiguous()
-
-            # [s, b, h * head_size] --> [s, b, h * head_size]
-            output, _ = self.proj(context_layer)
-
-            # [s, b, h * head_size] --> [b, s, h * head_size]
-            output = output.view(bsz, s, -1)
-
-        return output
+@execute_once
+def info_once(message: str):
+    logger.info(message)
 
 
 class VisionSdpaAttention(nn.Module):
@@ -189,16 +62,22 @@ class VisionSdpaAttention(nn.Module):
 
     def __init__(
         self,
-        head_size: int,
+        head_dim: int,
+        num_heads: int,
+        num_kv_heads: int,
         dropout: float = 0.0,
         flatten_batch: bool = False,
         softmax_in_single_precision: bool = False,
+        **kwargs,
     ):
         super().__init__()
-        self.head_size = head_size
+        self.head_size = head_dim
+        self.num_heads = num_heads
+        self.num_kv_heads = num_kv_heads
         self.flatten_batch = flatten_batch
         self.softmax_in_single_precision = softmax_in_single_precision
         self.dropout = dropout
+        self.scale = 1.0 / math.sqrt(self.head_size)
 
     @staticmethod
     @lru_cache(maxsize=128)
@@ -212,7 +91,7 @@ class VisionSdpaAttention(nn.Module):
             flatten_batch: whether to flatten batch dimension
             cu_seqlens: tuple of cumulative sequence lengths
         Returns:
-            attention mask tensor
+            attention mask tensor of shape [b, 1, s, s] or [1, s, s]
         """
         if flatten_batch:
             mask = torch.zeros([1, s, s], dtype=torch.bool)
@@ -241,7 +120,7 @@ class VisionSdpaAttention(nn.Module):
         flatten_batch: bool = False,
     ) -> Optional[torch.Tensor]:
         r"""
-        Creates a non-causal 4D mask of shape `(b, 1, s, s)` or `(1, 1, s, s)`.
+        Creates a non-causal 4D mask of shape `(b, 1, s, s)` or `(1, s, s)`.
         Args:
             s: sequence length
             cu_seqlens: cumulative sequence lengths tensor. If not, returns an empty mask
@@ -264,6 +143,7 @@ class VisionSdpaAttention(nn.Module):
         bsz: int,
         cu_seqlens: Optional[torch.Tensor] = None,
         attention_mask: Optional[torch.Tensor] = None,
+        **kwargs,
     ) -> torch.Tensor:
         r"""
         Args:
@@ -274,6 +154,8 @@ class VisionSdpaAttention(nn.Module):
         if self.flatten_batch:
             assert bsz == 1, "flatten_batch is True, bsz must be 1"
 
+        assert q.dim() == 3, q.shape
+
         s = q.shape[0] // bsz
 
         # [b, 1, s, s]
@@ -291,10 +173,10 @@ class VisionSdpaAttention(nn.Module):
         q, k, v = [rearrange(x, "(b s) h d -> b h s d", b=bsz) for x in [q, k, v]]
 
         if self.softmax_in_single_precision:
-            scale = self.head_size**-0.5
-            k_transposed = rearrange(k, "b h s d -> b h d s")
-            attn_weights = torch.matmul(q, k_transposed) * scale
-            del k, k_transposed
+            k = rearrange(k, "b h s d -> b h d s")
+            attn_weights = torch.matmul(q, k) * self.scale
+            del k
+            # masking
             attention_mask = (~attention_mask) * torch.finfo(q.dtype).min
             attn_weights = attn_weights + attention_mask
             del attention_mask
@@ -332,6 +214,7 @@ class VisionTritonAttention(nn.Module):
 
     def __init__(
         self,
+        **kwargs,
     ):
         super().__init__()
 
@@ -340,8 +223,8 @@ class VisionTritonAttention(nn.Module):
         q: torch.Tensor,
         k: torch.Tensor,
         v: torch.Tensor,
-        _bsz: int,
         cu_seqlens: Optional[torch.Tensor],
+        **kwargs,
     ) -> torch.Tensor:
         r"""
         Args:
@@ -366,3 +249,247 @@ class VisionTritonAttention(nn.Module):
         )
 
         return output
+
+
+class VisionFlash3Attention(nn.Module):
+    def __init__(
+        self,
+        **kwargs,
+    ):
+        if not _is_cuda:
+            raise Exception("VisionFlash3Attention is only available for cuda")
+        super().__init__()
+
+    def forward(
+        self,
+        q: torch.Tensor,
+        k: torch.Tensor,
+        v: torch.Tensor,
+        cu_seqlens: Optional[torch.Tensor],
+        attention_mask: Optional[torch.Tensor] = None,
+        **kwargs,
+    ) -> torch.Tensor:
+        r"""
+        Args:
+            cu_seqlens: [b]
+        Returns:
+            [b * s, h, head_size]
+        """
+        cu_seqlens = cu_seqlens.to(dtype=torch.int32).cuda()
+        seq_lens = cu_seqlens[1:] - cu_seqlens[:-1]
+        max_seqlen = seq_lens.max().item()
+        output = flash_attn_varlen_func(
+            q,
+            k,
+            v,
+            cu_seqlens_q=cu_seqlens,
+            cu_seqlens_k=cu_seqlens,
+            max_seqlen_q=max_seqlen,
+            max_seqlen_k=max_seqlen,
+        )
+
+        return output
+
+
+QKV_BACKEND_IMPL = {
+    "triton_attn": VisionTritonAttention,
+    "sdpa": VisionSdpaAttention,
+    "fa3": VisionFlash3Attention,
+}
+
+
+class VisionAttention(nn.Module):
+    r"""
+    Multi-headed attention without any cache, mostly used for multimodal transformers.
+
+
+    Args:
+        use_qkv_parallel (bool, optional): If True, use QKV-parallel attention.
+        softmax_in_single_precision (bool, default to False):
+            if ``True``, the softmax will be performed in single-precision
+            Otherwise, it will be performed in half-precision
+
+    """
+
+    def __init__(
+        self,
+        embed_dim: int,
+        num_heads: int,
+        projection_size: int,
+        use_qkv_parallel: bool,
+        qkv_backend: Optional[str] = None,
+        quant_config: Optional[QuantizationConfig] = None,
+        dropout: float = 0.0,
+        softmax_in_single_precision: bool = False,
+        flatten_batch: bool = False,
+        prefix: str = "",
+        proj_bias: bool = True,
+        **kwargs,
+    ):
+        super().__init__()
+        world_size = parallel_state.get_tensor_model_parallel_world_size()
+        self.dropout = dropout
+        self.head_size = embed_dim // num_heads
+        self.hidden_size_per_attention_head = dist_utils.divide(
+            projection_size, num_heads
+        )
+        self.num_attention_heads_per_partition = dist_utils.divide(
+            num_heads, world_size
+        )
+        self.num_attention_kv_heads_per_partition = dist_utils.divide(
+            num_heads, world_size
+        )
+
+        self.q_size = self.num_attention_heads_per_partition * self.head_size
+        self.kv_size = self.num_attention_kv_heads_per_partition * self.head_size
+
+        if global_server_args_dict["mm_attention_backend"] is None:
+            if qkv_backend is None:
+                qkv_backend = "sdpa"
+            info_once(f"Multimodal attention backend not set. Use {qkv_backend}.")
+        else:
+            qkv_backend = global_server_args_dict["mm_attention_backend"]
+
+        info_once(f"Using {qkv_backend} as multimodal attention backend.")
+
+        self.qkv_backend = QKV_BACKEND_IMPL[qkv_backend](
+            head_dim=self.head_size,
+            num_heads=self.num_attention_heads_per_partition,
+            num_kv_heads=self.num_attention_kv_heads_per_partition,
+            dropout=dropout,
+            flatten_batch=flatten_batch,
+            softmax_in_single_precision=softmax_in_single_precision,
+        )
+
+        self.use_qkv_parallel = use_qkv_parallel
+        if use_qkv_parallel:
+            self.qkv_proj = QKVParallelLinear(
+                hidden_size=embed_dim,
+                head_size=self.head_size,
+                total_num_heads=num_heads,
+                total_num_kv_heads=num_heads,
+                quant_config=quant_config,
+                prefix=add_prefix("qkv_proj", prefix),
+            )
+        else:
+            self.qkv_proj = ColumnParallelLinear(
+                input_size=embed_dim,
+                output_size=3 * projection_size,
+                quant_config=quant_config,
+                prefix=add_prefix("qkv_proj", prefix),
+            )
+        self.proj = RowParallelLinear(
+            input_size=embed_dim,
+            output_size=embed_dim,
+            bias=proj_bias,
+            quant_config=quant_config,
+            prefix=add_prefix("proj", prefix),
+        )
+
+    def forward(
+        self,
+        x: torch.Tensor,
+        cu_seqlens: Optional[torch.Tensor] = None,
+        position_embeddings: Optional[Tuple[torch.Tensor, torch.Tensor]] = None,
+        attention_mask: Optional[torch.Tensor] = None,
+        **kwargs,
+    ) -> torch.Tensor:
+        r"""
+        Args:
+            x: [b, s, embed_dim]
+            cu_seqlens: [b]
+        Returns:
+            [s, b, head * head_size]
+        """
+        if x.dim() == 2:
+            x = x.unsqueeze(0)
+        assert x.dim() == 3, x.shape
+        bsz, s, _ = x.shape
+        head = self.num_attention_heads_per_partition
+        kv_head = self.num_attention_kv_heads_per_partition
+        if self.use_qkv_parallel:
+            # [b, s, embed_dim] --> [b, s, embed_dim]
+            qkv, _ = self.qkv_proj(x)
+
+            q, k, v = qkv.split([self.q_size, self.kv_size, self.kv_size], dim=-1)
+
+            # [b, s, embed_dim] --> [b * s, head, head_size]
+            q = q.reshape(bsz * s, head, -1).contiguous()
+            k = k.reshape(bsz * s, kv_head, -1).contiguous()
+            v = v.reshape(bsz * s, kv_head, -1).contiguous()
+        else:
+            # [b, s, embed_dim] --> [s, b, embed_dim]
+            x = rearrange(x, "b s ... -> s b ...")
+            # [s, b, embed_dim] --> [s, b, head * 3 * head_size]
+            qkv, _ = self.qkv_proj(x)
+
+            # [s, b, head * 3 * head_size] --> [s, b, head, 3 * head_size]
+            new_x_shape = qkv.size()[:-1] + (
+                head,
+                3 * self.hidden_size_per_attention_head,
+            )
+            qkv = qkv.view(*new_x_shape)
+
+            # [s, b, head, 3 * head_size] --> 3 [s, b, head, head_size]
+            q, k, v = dist_utils.split_tensor_along_last_dim(qkv, 3)
+            # [s, b, head, head_size] --> [b, s, head, head_size]
+            q, k, v = [
+                rearrange(x, "s b ... -> b s ...").contiguous() for x in (q, k, v)
+            ]
+
+        if position_embeddings is not None:
+            cos, sin = position_embeddings
+            original_shape = q.shape
+            # [total_tokens, head, head_size]
+            q = q.view(-1, head, self.head_size)
+            k = k.view(-1, head, self.head_size)
+
+            q, k = apply_rotary_pos_emb(q, k, cos, sin)
+
+            q = q.view(original_shape)
+            k = k.view(original_shape)
+
+        if q.dim() == 4:
+            # [b, s, head, head_size] --> [b * s, head, head_size]
+            q = rearrange(q, "b s ... -> (b s) ...")
+        if k.dim() == 4:
+            # [b, s, head, head_size] --> [b * s, head, head_size]
+            k = rearrange(k, "b s ... -> (b s) ...")
+        if v.dim() == 4:
+            # [b, s, head, head_size] --> [b * s, head, head_size]
+            v = rearrange(v, "b s ... -> (b s) ...")
+
+        assert q.dim() == 3, q.dim()
+        assert k.dim() == 3, k.dim()
+        assert v.dim() == 3, v.dim()
+
+        output = self.qkv_backend.forward(
+            q=q,
+            k=k,
+            v=v,
+            bsz=bsz,
+            cu_seqlens=cu_seqlens,
+            attention_mask=attention_mask,
+        )
+
+        assert output.dim() == 3, output.shape
+
+        if self.use_qkv_parallel:
+            # [b * s, h, head_size] --> [b, s, h * head_size]
+            output = rearrange(output, "(b s) ... h d -> b s ... (h d)", b=bsz)
+
+            # [b, s, h * head_size] --> [b, s, h * head_size]
+            output, _ = self.proj(output)
+        else:
+            # [b * s, h, head_size] --> [s, b, h * head_size]
+            context_layer = rearrange(
+                output, "(b s) h d -> s b (h d)", b=bsz, s=s
+            ).contiguous()
+
+            # [s, b, h * head_size] --> [s, b, h * head_size]
+            output, _ = self.proj(context_layer)
+
+            # [s, b, h * head_size] --> [b, s, h * head_size]
+            output = output.view(bsz, s, -1)
+
+        return output
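
To make the refactor above easier to read, here is a rough sketch of how the new `VisionAttention` entry point is meant to be constructed and called. The sizes and the `"sdpa"` backend choice are assumptions, and the snippet presumes sglang's tensor-parallel state and `global_server_args_dict` have already been initialized by the runtime, so treat it as an API-shape sketch rather than a standalone script:

import torch

from sglang.srt.layers.attention.vision import VisionAttention

# Assumed ViT-like dimensions; the backend may be "sdpa", "triton_attn", or "fa3" (CUDA only).
attn = VisionAttention(
    embed_dim=1024,
    num_heads=16,
    projection_size=1024,
    use_qkv_parallel=True,
    qkv_backend="sdpa",
    flatten_batch=True,
    prefix="visual.blocks.0.attn",
)

x = torch.randn(1, 256, 1024, device="cuda", dtype=torch.float16)  # [b, s, embed_dim]
cu_seqlens = torch.tensor([0, 256], dtype=torch.int32, device="cuda")
out = attn(x, cu_seqlens=cu_seqlens)  # [b, s, head * head_size]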