sglang 0.4.3.post1__py3-none-any.whl → 0.4.3.post3__py3-none-any.whl
This diff shows the changes between two publicly released versions of the package, exactly as they were published to their public registry. It is provided for informational purposes only.
- sglang/api.py +1 -1
- sglang/bench_offline_throughput.py +19 -0
- sglang/bench_one_batch.py +2 -2
- sglang/bench_serving.py +123 -79
- sglang/global_config.py +8 -3
- sglang/lang/backend/runtime_endpoint.py +1 -1
- sglang/lang/ir.py +1 -1
- sglang/srt/_custom_ops.py +83 -91
- sglang/srt/configs/load_config.py +4 -1
- sglang/srt/configs/model_config.py +48 -2
- sglang/srt/configs/qwen2_5_vl_config.py +5 -2
- sglang/srt/constrained/base_grammar_backend.py +117 -15
- sglang/srt/constrained/llguidance_backend.py +151 -0
- sglang/srt/constrained/outlines_backend.py +24 -33
- sglang/srt/constrained/xgrammar_backend.py +69 -38
- sglang/srt/distributed/device_communicators/custom_all_reduce.py +225 -80
- sglang/srt/distributed/parallel_state.py +48 -3
- sglang/srt/entrypoints/engine.py +67 -9
- sglang/srt/entrypoints/http_server.py +190 -41
- sglang/srt/entrypoints/verl_engine.py +147 -0
- sglang/srt/function_call_parser.py +0 -1
- sglang/srt/layers/activation.py +11 -0
- sglang/srt/layers/attention/{__init__.py → base_attn_backend.py} +14 -6
- sglang/srt/layers/attention/double_sparsity_backend.py +1 -1
- sglang/srt/layers/attention/flashinfer_backend.py +208 -295
- sglang/srt/layers/attention/flashinfer_mla_backend.py +582 -0
- sglang/srt/layers/attention/torch_native_backend.py +1 -1
- sglang/srt/layers/attention/triton_backend.py +9 -6
- sglang/srt/layers/attention/triton_ops/decode_attention.py +3 -0
- sglang/srt/layers/attention/triton_ops/extend_attention.py +20 -4
- sglang/srt/layers/attention/triton_ops/rocm_mla_decode_rope.py +439 -0
- sglang/srt/layers/attention/utils.py +39 -0
- sglang/srt/layers/attention/vision.py +60 -63
- sglang/srt/layers/dp_attention.py +142 -1
- sglang/srt/layers/layernorm.py +1 -1
- sglang/srt/layers/linear.py +3 -1
- sglang/srt/layers/logits_processor.py +281 -45
- sglang/srt/layers/moe/ep_moe/kernels.py +126 -8
- sglang/srt/layers/moe/ep_moe/layer.py +140 -28
- sglang/srt/layers/moe/fused_moe_native.py +2 -0
- sglang/srt/layers/moe/fused_moe_triton/configs/E=256,N=128,device_name=NVIDIA_A100-SXM4-80GB,dtype=int8_w8a8,block_shape=[128, 128].json +146 -0
- sglang/srt/layers/moe/fused_moe_triton/configs/E=256,N=128,device_name=NVIDIA_L20Y,dtype=fp8_w8a8,block_shape=[128, 128].json +146 -0
- sglang/srt/layers/moe/fused_moe_triton/configs/E=256,N=256,device_name=AMD_Radeon_Graphics,dtype=fp8_w8a8,block_shape=[128, 128].json +50 -50
- sglang/srt/layers/moe/fused_moe_triton/configs/E=8,N=14336,device_name=AMD_Instinct_MI300X.json +18 -18
- sglang/srt/layers/moe/fused_moe_triton/configs/E=8,N=14336,device_name=AMD_Instinct_MI325X.json +18 -18
- sglang/srt/layers/moe/fused_moe_triton/configs/E=8,N=14336,device_name=AMD_Radeon_Graphics.json +18 -18
- sglang/srt/layers/moe/fused_moe_triton/configs/E=8,N=1792,device_name=AMD_Instinct_MI300X.json +18 -18
- sglang/srt/layers/moe/fused_moe_triton/configs/E=8,N=1792,device_name=AMD_Instinct_MI325X.json +18 -18
- sglang/srt/layers/moe/fused_moe_triton/configs/E=8,N=1792,device_name=AMD_Radeon_Graphics.json +18 -18
- sglang/srt/layers/moe/fused_moe_triton/configs/E=8,N=3584,device_name=AMD_Instinct_MI300X.json +18 -18
- sglang/srt/layers/moe/fused_moe_triton/configs/E=8,N=3584,device_name=AMD_Instinct_MI325X.json +18 -18
- sglang/srt/layers/moe/fused_moe_triton/configs/E=8,N=3584,device_name=AMD_Radeon_Graphics.json +18 -18
- sglang/srt/layers/moe/fused_moe_triton/configs/E=8,N=4096,device_name=AMD_Instinct_MI300X,dtype=fp8_w8a8.json +16 -16
- sglang/srt/layers/moe/fused_moe_triton/configs/E=8,N=4096,device_name=AMD_Instinct_MI325X,dtype=fp8_w8a8.json +16 -16
- sglang/srt/layers/moe/fused_moe_triton/configs/E=8,N=4096,device_name=AMD_Radeon_Graphics,dtype=fp8_w8a8.json +16 -16
- sglang/srt/layers/moe/fused_moe_triton/configs/E=8,N=7168,device_name=AMD_Instinct_MI300X.json +18 -18
- sglang/srt/layers/moe/fused_moe_triton/configs/E=8,N=7168,device_name=AMD_Instinct_MI325X.json +18 -18
- sglang/srt/layers/moe/fused_moe_triton/configs/E=8,N=7168,device_name=AMD_Radeon_Graphics.json +18 -18
- sglang/srt/layers/moe/fused_moe_triton/configs/E=8,N=8192,device_name=AMD_Instinct_MI300X,dtype=fp8_w8a8.json +15 -15
- sglang/srt/layers/moe/fused_moe_triton/configs/E=8,N=8192,device_name=AMD_Instinct_MI325X,dtype=fp8_w8a8.json +15 -15
- sglang/srt/layers/moe/fused_moe_triton/configs/E=8,N=8192,device_name=AMD_Radeon_Graphics,dtype=fp8_w8a8.json +15 -15
- sglang/srt/layers/moe/fused_moe_triton/fused_moe.py +88 -20
- sglang/srt/layers/moe/fused_moe_triton/layer.py +34 -13
- sglang/srt/layers/moe/topk.py +13 -4
- sglang/srt/layers/quantization/__init__.py +111 -7
- sglang/srt/layers/quantization/blockwise_int8.py +409 -0
- sglang/srt/layers/quantization/configs/N=1536,K=1536,device_name=NVIDIA_A100-SXM4-80GB,dtype=int8_w8a8,block_shape=[128, 128].json +146 -0
- sglang/srt/layers/quantization/configs/N=1536,K=1536,device_name=NVIDIA_L20Y,dtype=fp8_w8a8,block_shape=[128, 128].json +26 -0
- sglang/srt/layers/quantization/configs/N=1536,K=7168,device_name=NVIDIA_A100-SXM4-80GB,dtype=int8_w8a8,block_shape=[128, 128].json +146 -0
- sglang/srt/layers/quantization/configs/N=1536,K=7168,device_name=NVIDIA_L20Y,dtype=fp8_w8a8,block_shape=[128, 128].json +26 -0
- sglang/srt/layers/quantization/configs/N=2048,K=512,device_name=NVIDIA_A100-SXM4-80GB,dtype=int8_w8a8,block_shape=[128, 128].json +146 -0
- sglang/srt/layers/quantization/configs/N=2048,K=512,device_name=NVIDIA_L20Y,dtype=fp8_w8a8,block_shape=[128, 128].json +26 -0
- sglang/srt/layers/quantization/configs/N=2304,K=7168,device_name=NVIDIA_A100-SXM4-80GB,dtype=int8_w8a8,block_shape=[128, 128].json +146 -0
- sglang/srt/layers/quantization/configs/N=2304,K=7168,device_name=NVIDIA_L20Y,dtype=fp8_w8a8,block_shape=[128, 128].json +26 -0
- sglang/srt/layers/quantization/configs/N=24576,K=1536,device_name=AMD_Instinct_MI300X,dtype=fp8_w8a8,block_shape=[128, 128].json +164 -0
- sglang/srt/layers/quantization/configs/N=24576,K=1536,device_name=AMD_Instinct_MI325X,dtype=fp8_w8a8,block_shape=[128, 128].json +164 -0
- sglang/srt/layers/quantization/configs/N=24576,K=1536,device_name=AMD_Radeon_Graphics,dtype=fp8_w8a8,block_shape=[128, 128].json +164 -0
- sglang/srt/layers/quantization/configs/N=24576,K=7168,device_name=NVIDIA_A100-SXM4-80GB,dtype=int8_w8a8,block_shape=[128, 128].json +146 -0
- sglang/srt/layers/quantization/configs/N=24576,K=7168,device_name=NVIDIA_L20Y,dtype=fp8_w8a8,block_shape=[128, 128].json +26 -0
- sglang/srt/layers/quantization/configs/N=256,K=7168,device_name=NVIDIA_A100-SXM4-80GB,dtype=int8_w8a8,block_shape=[128, 128].json +146 -0
- sglang/srt/layers/quantization/configs/N=256,K=7168,device_name=NVIDIA_L20Y,dtype=fp8_w8a8,block_shape=[128, 128].json +26 -0
- sglang/srt/layers/quantization/configs/N=32768,K=512,device_name=AMD_Instinct_MI300X,dtype=fp8_w8a8,block_shape=[128, 128].json +164 -0
- sglang/srt/layers/quantization/configs/N=32768,K=512,device_name=AMD_Instinct_MI325X,dtype=fp8_w8a8,block_shape=[128, 128].json +164 -0
- sglang/srt/layers/quantization/configs/N=32768,K=512,device_name=AMD_Radeon_Graphics,dtype=fp8_w8a8,block_shape=[128, 128].json +164 -0
- sglang/srt/layers/quantization/configs/N=32768,K=512,device_name=NVIDIA_A100-SXM4-80GB,dtype=int8_w8a8,block_shape=[128, 128].json +146 -0
- sglang/srt/layers/quantization/configs/N=32768,K=512,device_name=NVIDIA_L20Y,dtype=fp8_w8a8,block_shape=[128, 128].json +26 -0
- sglang/srt/layers/quantization/configs/N=576,K=7168,device_name=NVIDIA_A100-SXM4-80GB,dtype=int8_w8a8,block_shape=[128, 128].json +146 -0
- sglang/srt/layers/quantization/configs/N=576,K=7168,device_name=NVIDIA_L20Y,dtype=fp8_w8a8,block_shape=[128, 128].json +26 -0
- sglang/srt/layers/quantization/configs/N=7168,K=1024,device_name=NVIDIA_A100-SXM4-80GB,dtype=int8_w8a8,block_shape=[128, 128].json +146 -0
- sglang/srt/layers/quantization/configs/N=7168,K=1024,device_name=NVIDIA_L20Y,dtype=fp8_w8a8,block_shape=[128, 128].json +26 -0
- sglang/srt/layers/quantization/configs/N=7168,K=1152,device_name=NVIDIA_A100-SXM4-80GB,dtype=int8_w8a8,block_shape=[128, 128].json +146 -0
- sglang/srt/layers/quantization/configs/N=7168,K=1152,device_name=NVIDIA_L20Y,dtype=fp8_w8a8,block_shape=[128, 128].json +26 -0
- sglang/srt/layers/quantization/configs/N=7168,K=128,device_name=NVIDIA_A100-SXM4-80GB,dtype=int8_w8a8,block_shape=[128, 128].json +146 -0
- sglang/srt/layers/quantization/configs/N=7168,K=128,device_name=NVIDIA_L20Y,dtype=fp8_w8a8,block_shape=[128, 128].json +26 -0
- sglang/srt/layers/quantization/configs/N=7168,K=16384,device_name=AMD_Instinct_MI300X,dtype=fp8_w8a8,block_shape=[128, 128].json +164 -0
- sglang/srt/layers/quantization/configs/N=7168,K=16384,device_name=AMD_Instinct_MI325X,dtype=fp8_w8a8,block_shape=[128, 128].json +164 -0
- sglang/srt/layers/quantization/configs/N=7168,K=16384,device_name=AMD_Radeon_Graphics,dtype=fp8_w8a8,block_shape=[128, 128].json +164 -0
- sglang/srt/layers/quantization/configs/N=7168,K=16384,device_name=NVIDIA_A100-SXM4-80GB,dtype=int8_w8a8,block_shape=[128, 128].json +146 -0
- sglang/srt/layers/quantization/configs/N=7168,K=16384,device_name=NVIDIA_L20Y,dtype=fp8_w8a8,block_shape=[128, 128].json +26 -0
- sglang/srt/layers/quantization/configs/N=7168,K=18432,device_name=NVIDIA_A100-SXM4-80GB,dtype=int8_w8a8,block_shape=[128, 128].json +146 -0
- sglang/srt/layers/quantization/configs/N=7168,K=18432,device_name=NVIDIA_L20Y,dtype=fp8_w8a8,block_shape=[128, 128].json +26 -0
- sglang/srt/layers/quantization/fp8.py +69 -28
- sglang/srt/layers/quantization/fp8_utils.py +17 -1
- sglang/srt/layers/quantization/gptq.py +416 -0
- sglang/srt/layers/quantization/int8_kernel.py +327 -0
- sglang/srt/layers/quantization/int8_utils.py +73 -0
- sglang/srt/layers/quantization/modelopt_quant.py +18 -1
- sglang/srt/layers/radix_attention.py +1 -0
- sglang/srt/layers/rotary_embedding.py +0 -1
- sglang/srt/layers/sampler.py +76 -31
- sglang/srt/layers/vocab_parallel_embedding.py +14 -13
- sglang/srt/lora/lora.py +17 -1
- sglang/srt/lora/lora_config.py +5 -0
- sglang/srt/lora/lora_manager.py +1 -3
- sglang/srt/managers/cache_controller.py +193 -62
- sglang/srt/managers/configure_logging.py +2 -1
- sglang/srt/managers/data_parallel_controller.py +6 -2
- sglang/srt/managers/detokenizer_manager.py +124 -102
- sglang/srt/managers/image_processor.py +2 -1
- sglang/srt/managers/io_struct.py +143 -6
- sglang/srt/managers/schedule_batch.py +238 -197
- sglang/srt/managers/schedule_policy.py +29 -29
- sglang/srt/managers/scheduler.py +681 -259
- sglang/srt/managers/session_controller.py +6 -2
- sglang/srt/managers/tokenizer_manager.py +224 -68
- sglang/srt/managers/tp_worker.py +15 -4
- sglang/srt/managers/tp_worker_overlap_thread.py +3 -4
- sglang/srt/mem_cache/chunk_cache.py +18 -11
- sglang/srt/mem_cache/hiradix_cache.py +394 -0
- sglang/srt/mem_cache/memory_pool.py +44 -18
- sglang/srt/mem_cache/radix_cache.py +58 -47
- sglang/srt/metrics/collector.py +94 -36
- sglang/srt/model_executor/cuda_graph_runner.py +55 -24
- sglang/srt/model_executor/forward_batch_info.py +49 -16
- sglang/srt/model_executor/model_runner.py +209 -28
- sglang/srt/model_loader/loader.py +3 -3
- sglang/srt/model_loader/weight_utils.py +36 -14
- sglang/srt/models/baichuan.py +31 -6
- sglang/srt/models/chatglm.py +39 -7
- sglang/srt/models/commandr.py +29 -5
- sglang/srt/models/dbrx.py +31 -5
- sglang/srt/models/deepseek.py +43 -6
- sglang/srt/models/deepseek_nextn.py +32 -19
- sglang/srt/models/deepseek_v2.py +265 -29
- sglang/srt/models/exaone.py +19 -9
- sglang/srt/models/gemma.py +22 -8
- sglang/srt/models/gemma2.py +25 -12
- sglang/srt/models/gemma2_reward.py +5 -1
- sglang/srt/models/gpt2.py +28 -13
- sglang/srt/models/gpt_bigcode.py +27 -5
- sglang/srt/models/granite.py +21 -9
- sglang/srt/models/grok.py +21 -4
- sglang/srt/models/internlm2.py +36 -6
- sglang/srt/models/internlm2_reward.py +5 -1
- sglang/srt/models/llama.py +26 -9
- sglang/srt/models/llama_classification.py +5 -1
- sglang/srt/models/llama_eagle.py +17 -4
- sglang/srt/models/llama_embedding.py +5 -1
- sglang/srt/models/llama_reward.py +7 -2
- sglang/srt/models/llava.py +19 -3
- sglang/srt/models/llavavid.py +10 -1
- sglang/srt/models/minicpm.py +26 -2
- sglang/srt/models/minicpm3.py +39 -3
- sglang/srt/models/minicpmv.py +45 -14
- sglang/srt/models/mixtral.py +20 -9
- sglang/srt/models/mixtral_quant.py +50 -8
- sglang/srt/models/mllama.py +57 -11
- sglang/srt/models/olmo.py +34 -6
- sglang/srt/models/olmo2.py +34 -13
- sglang/srt/models/olmoe.py +26 -4
- sglang/srt/models/phi3_small.py +29 -10
- sglang/srt/models/qwen.py +26 -3
- sglang/srt/models/qwen2.py +26 -4
- sglang/srt/models/qwen2_5_vl.py +46 -8
- sglang/srt/models/qwen2_eagle.py +17 -5
- sglang/srt/models/qwen2_moe.py +44 -6
- sglang/srt/models/qwen2_rm.py +78 -0
- sglang/srt/models/qwen2_vl.py +39 -8
- sglang/srt/models/stablelm.py +32 -5
- sglang/srt/models/torch_native_llama.py +5 -2
- sglang/srt/models/xverse.py +21 -9
- sglang/srt/models/xverse_moe.py +45 -7
- sglang/srt/models/yivl.py +2 -1
- sglang/srt/openai_api/adapter.py +109 -24
- sglang/srt/openai_api/protocol.py +17 -1
- sglang/srt/reasoning_parser.py +154 -0
- sglang/srt/sampling/penaltylib/__init__.py +4 -6
- sglang/srt/sampling/penaltylib/frequency_penalty.py +66 -0
- sglang/srt/sampling/penaltylib/{penalizers/min_new_tokens.py → min_new_tokens.py} +15 -23
- sglang/srt/sampling/penaltylib/orchestrator.py +39 -188
- sglang/srt/sampling/penaltylib/presence_penalty.py +66 -0
- sglang/srt/sampling/sampling_batch_info.py +79 -157
- sglang/srt/sampling/sampling_params.py +16 -13
- sglang/srt/server_args.py +136 -52
- sglang/srt/speculative/build_eagle_tree.py +2 -8
- sglang/srt/speculative/eagle_draft_cuda_graph_runner.py +0 -1
- sglang/srt/speculative/eagle_utils.py +92 -58
- sglang/srt/speculative/eagle_worker.py +186 -94
- sglang/srt/speculative/spec_info.py +1 -13
- sglang/srt/utils.py +43 -17
- sglang/srt/warmup.py +47 -0
- sglang/test/few_shot_gsm8k.py +4 -1
- sglang/test/runners.py +389 -126
- sglang/test/send_one.py +88 -0
- sglang/test/test_block_fp8_ep.py +361 -0
- sglang/test/test_programs.py +1 -1
- sglang/test/test_utils.py +138 -84
- sglang/utils.py +50 -60
- sglang/version.py +1 -1
- {sglang-0.4.3.post1.dist-info → sglang-0.4.3.post3.dist-info}/METADATA +21 -15
- {sglang-0.4.3.post1.dist-info → sglang-0.4.3.post3.dist-info}/RECORD +214 -166
- {sglang-0.4.3.post1.dist-info → sglang-0.4.3.post3.dist-info}/WHEEL +1 -1
- sglang/bench_latency.py +0 -1
- sglang/srt/sampling/penaltylib/penalizers/frequency_penalty.py +0 -75
- sglang/srt/sampling/penaltylib/penalizers/presence_penalty.py +0 -74
- sglang/srt/sampling/penaltylib/penalizers/repetition_penalty.py +0 -85
- sglang/test/srt/sampling/penaltylib/utils.py +0 -344
- {sglang-0.4.3.post1.dist-info → sglang-0.4.3.post3.dist-info}/LICENSE +0 -0
- {sglang-0.4.3.post1.dist-info → sglang-0.4.3.post3.dist-info}/top_level.txt +0 -0
sglang/srt/layers/attention/vision.py
CHANGED
@@ -1,5 +1,6 @@
 from __future__ import annotations
 
+from functools import lru_cache
 from typing import Optional
 
 import torch
@@ -18,6 +19,7 @@ from sglang.srt.layers.linear import (
     RowParallelLinear,
 )
 from sglang.srt.layers.quantization import QuantizationConfig
+from sglang.srt.utils import add_prefix
 
 
 def rotate_half(x: torch.Tensor, interleaved: bool = False) -> torch.Tensor:
@@ -121,20 +123,20 @@ class VisionAttention(nn.Module):
                 head_size=self.head_size,
                 total_num_heads=num_heads,
                 quant_config=quant_config,
-                prefix=
+                prefix=add_prefix("qkv_proj", prefix),
             )
         else:
             self.qkv_proj = ColumnParallelLinear(
                 input_size=embed_dim,
                 output_size=3 * projection_size,
                 quant_config=quant_config,
-                prefix=
+                prefix=add_prefix("qkv_proj", prefix),
             )
         self.proj = RowParallelLinear(
             input_size=embed_dim,
             output_size=embed_dim,
             quant_config=quant_config,
-            prefix=
+            prefix=add_prefix("out_proj", prefix),
         )
 
     def forward(
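The `add_prefix("qkv_proj", prefix)` calls above build dotted module prefixes for the quantization config instead of hand-formatted strings. The helper below is a hypothetical stand-in, not the actual `sglang.srt.utils.add_prefix`, but it sketches the usual shape of such a prefix joiner:

```python
def join_prefix(name: str, prefix: str) -> str:
    """Hypothetical sketch: join a submodule name onto a dotted parent prefix.

    An empty parent prefix yields just the name, so quantization configs can
    match fully qualified weight names such as "visual.blocks.0.attn.qkv_proj".
    """
    return name if not prefix else f"{prefix}.{name}"


# join_prefix("qkv_proj", "visual.blocks.0.attn") -> "visual.blocks.0.attn.qkv_proj"
# join_prefix("qkv_proj", "")                     -> "qkv_proj"
```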
@@ -223,9 +225,6 @@ class VisionSdpaAttention(nn.Module):
 
     """
 
-    # TODO: Should it be released after used?
-    _mask_cache = {}
-
     def __init__(
         self,
         head_size: int,
@@ -239,75 +238,61 @@ class VisionSdpaAttention(nn.Module):
         self.use_full_precision_softmax = use_full_precision_softmax
         self.dropout = dropout
 
-
-
-
-
-
-
-
-        dtype=torch.bfloat16,
-    ) -> torch.Tensor:
-        r"""
-        Creates a non-causal 4D mask of shape `(b, 1, s, s)` or `(1, 1, s, s)`.
-
-        When `flatten_batch` is True:
-            - All sequences in the batch are flattened into a single dimension
-            - `s` represents the total number of tokens across all sequences in the batch
-            - Returns a unified mask of shape `(1, 1, s, s)`
-
-        When `flatten_batch` is False:
-            - Each sequence has its own attention mask
-            - `s` represents the maximum sequence length in the batch
-            - Returns separate masks of shape `(b, 1, s, s)`
-
+    @staticmethod
+    @lru_cache(maxsize=128)
+    def _generate_mask_cache(
+        s: int, flatten_batch: bool, cu_seqlens: tuple
+    ) -> torch.BoolTensor:
+        """
+        Generate a boolean attention mask with caching mechanism.
         Args:
-
-
-
-
+            s: sequence length
+            flatten_batch: whether to flatten batch dimension
+            cu_seqlens: tuple of cumulative sequence lengths
         Returns:
-
+            attention mask tensor
         """
-
-        cache_key = (s, bsz, flatten_batch, tuple(cu_seqlens.cpu().tolist()))
-
-        if cache_key in VisionSdpaAttention._mask_cache:
-            cached_mask = VisionSdpaAttention._mask_cache[cache_key]
-            # print(f"cache hit for key: {cache_key}")
-            return cached_mask.to(device=device, dtype=dtype)
-
-        if cu_seqlens is None:
-            raise ValueError("Internal Error: cu_seqlens cannot be None")
-
         if flatten_batch:
-            mask = torch.zeros([1, s, s],
+            mask = torch.zeros([1, s, s], dtype=torch.bool)
             for i in range(1, len(cu_seqlens)):
                 start = cu_seqlens[i - 1]
                 end = cu_seqlens[i]
-                mask[
-                    ...,
-                    start:end,
-                    start:end,
-                ] = True
+                mask[..., start:end, start:end] = True
         else:
             # [1, 1, 1, s]
-            row_indices = torch.arange(s
+            row_indices = torch.arange(s).view(1, 1, 1, s)
             # [1, 1, s, 1]
-            col_indices = torch.arange(s
+            col_indices = torch.arange(s).view(1, 1, s, 1)
             # [b, 1, 1, 1]
-            seq_lens = (
-
-            )
+            seq_lens = torch.tensor(
+                [end - start for start, end in zip(cu_seqlens[:-1], cu_seqlens[1:])],
+            ).view(-1, 1, 1, 1)
 
             mask = (row_indices < seq_lens) & (col_indices < seq_lens)
 
-
-
+        return mask
+
+    def generate_patch_attention_mask(
+        self,
+        s: int,
+        cu_seqlens: Optional[torch.Tensor],
+        flatten_batch: bool = False,
+    ) -> Optional[torch.Tensor]:
+        r"""
+        Creates a non-causal 4D mask of shape `(b, 1, s, s)` or `(1, 1, s, s)`.
+        Args:
+            s: sequence length
+            cu_seqlens: cumulative sequence lengths tensor. If not, returns an empty mask
+            flatten_batch: whether to flatten batch dimension
+        Returns:
+            attention mask tensor or None
+        """
+        if cu_seqlens is None:
+            return None
 
-
+        cu_seqlens_tuple = tuple(cu_seqlens.cpu().tolist())
 
-        return
+        return self._generate_mask_cache(s, flatten_batch, cu_seqlens_tuple)
 
     def forward(
         self,
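The refactor above replaces the hand-rolled class-level `_mask_cache` dict with `functools.lru_cache` on a static method; because `lru_cache` requires hashable arguments, the `cu_seqlens` tensor is converted to a tuple of ints before the call. A minimal standalone sketch of the same pattern (hypothetical names, not the package code):

```python
from functools import lru_cache

import torch


class MaskBuilder:
    @staticmethod
    @lru_cache(maxsize=128)
    def _build_block_mask(s: int, cu_seqlens: tuple) -> torch.Tensor:
        # Boolean block-diagonal mask: tokens attend only within their own sequence.
        mask = torch.zeros(1, s, s, dtype=torch.bool)
        for start, end in zip(cu_seqlens[:-1], cu_seqlens[1:]):
            mask[..., start:end, start:end] = True
        return mask

    def get_mask(self, s: int, cu_seqlens: torch.Tensor) -> torch.Tensor:
        # Tensors are not hashable, so the cache key is a tuple of Python ints.
        return self._build_block_mask(s, tuple(cu_seqlens.tolist()))


builder = MaskBuilder()
cu = torch.tensor([0, 3, 5])
m1 = builder.get_mask(5, cu)
m2 = builder.get_mask(5, cu)  # second call is served from the lru_cache
assert m1 is m2
```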
@@ -330,15 +315,23 @@ class VisionSdpaAttention(nn.Module):
         # [b, 1, s, s]
         if attention_mask is None:
             attention_mask = self.generate_patch_attention_mask(
-                s,
+                s, cu_seqlens, flatten_batch=self.flatten_batch
             )
+
+        if attention_mask is None:
+            if self.use_full_precision_softmax:
+                raise RuntimeError("Empty attention mask")
+        else:
+            attention_mask = attention_mask.to(device=q.device)
+
         q, k, v = [rearrange(x, "(b s) h d -> b h s d", b=bsz) for x in [q, k, v]]
-
+
         if self.use_full_precision_softmax:
             scale = self.head_size**-0.5
             k_transposed = rearrange(k, "b h s d -> b h d s")
             attn_weights = torch.matmul(q, k_transposed) * scale
             del k, k_transposed
+            attention_mask = (~attention_mask) * torch.finfo(q.dtype).min
             attn_weights = attn_weights + attention_mask
             del attention_mask
             # full-precision
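In the full-precision branch above, the boolean patch mask is converted to an additive mask before being added to the attention scores: allowed positions contribute 0 and masked positions contribute the most negative representable value, which softmax then drives to near-zero probability. A small standalone illustration, not the package code:

```python
import torch

scores = torch.randn(1, 1, 4, 4)                      # [b, h, s, s] attention logits
allowed = torch.tensor([[True, True, False, False]])  # only the first 2 tokens are valid
bool_mask = allowed[:, None, None, :] & allowed[:, None, :, None]  # [b, 1, s, s]

# 0.0 where attention is allowed, a huge negative value where it is masked.
additive = (~bool_mask) * torch.finfo(scores.dtype).min
probs = torch.softmax(scores + additive, dim=-1)

# Masked columns receive ~0 probability for the valid query rows.
print(probs[0, 0, 0])
```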
@@ -354,7 +347,12 @@ class VisionSdpaAttention(nn.Module):
         # SDPA
         # [b, h, s, head_size]
         output = F.scaled_dot_product_attention(
-            q,
+            q,
+            k,
+            v,
+            attn_mask=attention_mask,
+            dropout_p=self.dropout,
+            is_causal=False,
         )
 
         # [b, h, s, head_size] --> [b * s, h, head_size]
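The expanded call above passes the boolean mask and dropout directly to PyTorch's `torch.nn.functional.scaled_dot_product_attention`, which accepts a boolean `attn_mask` (True means the position may be attended to) and handles the masking numerics internally. A minimal usage sketch with made-up shapes:

```python
import torch
import torch.nn.functional as F

b, h, s, d = 2, 8, 16, 64
q = torch.randn(b, h, s, d)
k = torch.randn(b, h, s, d)
v = torch.randn(b, h, s, d)

# Boolean mask of shape [b, 1, s, s]; True = position participates in attention.
attn_mask = torch.ones(b, 1, s, s, dtype=torch.bool)

out = F.scaled_dot_product_attention(
    q, k, v,
    attn_mask=attn_mask,
    dropout_p=0.0,      # the module above passes self.dropout instead
    is_causal=False,
)
print(out.shape)  # torch.Size([2, 8, 16, 64])
```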
@@ -380,7 +378,6 @@ class VisionTritonAttention(nn.Module):
         v: torch.Tensor,
         _bsz: int,
         cu_seqlens: Optional[torch.Tensor],
-        **kwargs,
     ) -> torch.Tensor:
         r"""
         Args:
sglang/srt/layers/dp_attention.py
CHANGED
@@ -1,6 +1,21 @@
+from __future__ import annotations
+
+import functools
+from typing import TYPE_CHECKING, Union
+
 import torch
+import triton
+import triton.language as tl
+
+from sglang.srt.distributed import (
+    GroupCoordinator,
+    get_tensor_model_parallel_world_size,
+    get_tp_group,
+    tensor_model_parallel_all_reduce,
+)
 
-
+if TYPE_CHECKING:
+    from sglang.srt.model_executor.forward_batch_info import ForwardBatch
 
 _ATTN_TP_GROUP = None
 _ATTN_TP_RANK = None
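The `if TYPE_CHECKING:` import above is the standard way to use `ForwardBatch` purely as a type annotation without importing it at runtime, which avoids a circular import; combined with `from __future__ import annotations`, the annotation is never evaluated at runtime. A generic sketch of the pattern with hypothetical module names:

```python
from __future__ import annotations

from typing import TYPE_CHECKING

if TYPE_CHECKING:
    # Seen only by type checkers; never executed at runtime,
    # so it cannot introduce a circular import.
    from my_package.batch import ForwardBatch  # hypothetical module


def count_tokens(forward_batch: ForwardBatch) -> int:
    # Thanks to `from __future__ import annotations`, the annotation above
    # is stored as a plain string and needs no runtime import.
    return forward_batch.input_ids.shape[0]
```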
@@ -69,3 +84,129 @@ def get_attention_dp_rank():
 def get_attention_dp_size():
     assert _DP_SIZE is not None, "dp attention not initialized!"
     return _DP_SIZE
+
+
+def get_dp_local_info(forward_batch: ForwardBatch):
+    dp_rank = get_attention_dp_rank()
+
+    if forward_batch.dp_local_start_pos is None:
+        cumtokens = torch.cumsum(forward_batch.global_num_tokens_gpu, dim=0)
+        if dp_rank == 0:
+            local_start_pos = torch.zeros_like(cumtokens[0])
+        else:
+            local_start_pos = cumtokens[dp_rank - 1]
+        local_num_tokens = forward_batch.global_num_tokens_gpu[dp_rank]
+
+        forward_batch.dp_local_start_pos = local_start_pos
+        forward_batch.dp_local_num_tokens = local_num_tokens
+
+    return forward_batch.dp_local_start_pos, forward_batch.dp_local_num_tokens
+
+
+@triton.jit
+def memcpy_triton_kernel(
+    dst_ptr,
+    src_ptr,
+    offset_ptr,
+    sz_ptr,
+    offset_src,
+    chunk_size,  # multiplied for offset and sz
+    BLOCK_SIZE: tl.constexpr,
+):
+    pid = tl.program_id(axis=0).to(tl.int64)
+    offset = tl.load(offset_ptr).to(tl.int64) * chunk_size
+    sz = tl.load(sz_ptr).to(tl.int64) * chunk_size
+
+    start_index = pid * BLOCK_SIZE
+    offs = tl.arange(0, BLOCK_SIZE)
+    mask = start_index + offs < sz
+
+    if offset_src:
+        data = tl.load(src_ptr + offset + start_index + offs, mask=mask)
+        tl.store(dst_ptr + start_index + offs, data, mask=mask)
+    else:
+        data = tl.load(src_ptr + start_index + offs, mask=mask)
+        tl.store(dst_ptr + offset + start_index + offs, data, mask=mask)
+
+
+def prod(x):
+    return functools.reduce(lambda a, b: a * b, x, 1)
+
+
+def memcpy_triton(dst, src, dim, offset, sz, offset_src):
+    max_size = min(src.numel(), dst.numel())
+    assert dim == 0, "dim != 0 unsupported"
+    assert src.shape[1:] == dst.shape[1:], "src and dst must have same shape"
+    chunk_size = prod(src.shape[1:])
+    BLOCK_SIZE = 8192
+    grid = (triton.cdiv(max_size, BLOCK_SIZE),)
+
+    memcpy_triton_kernel[grid](dst, src, offset, sz, offset_src, chunk_size, BLOCK_SIZE)
+
+
+def dp_gather(
+    global_tokens: torch.Tensor,
+    local_tokens: torch.Tensor,
+    forward_batch: ForwardBatch,
+    layer_id: Union[str, int],
+):
+    local_start_pos, local_num_tokens = get_dp_local_info(forward_batch)
+
+    global_tokens.fill_(0)
+    assert local_tokens.is_contiguous()
+    assert global_tokens.is_contiguous()
+    if local_tokens.shape[0] > 0 and (
+        layer_id != "embedding" or get_attention_tp_rank() == 0
+    ):
+        assert (
+            global_tokens.storage().data_ptr() != local_tokens.storage().data_ptr()
+        ), "aliasing between global_tokens and local_tokens not allowed"
+        memcpy_triton(
+            global_tokens, local_tokens, 0, local_start_pos, local_num_tokens, False
+        )
+
+    # Input IDs are in int 32. We should use inplace_all_reduce for local case becaues of custom all reduce.
+    NUM_GPUS_PER_NODE = 8
+    if (
+        not local_tokens.dtype.is_floating_point
+        and get_tensor_model_parallel_world_size() <= NUM_GPUS_PER_NODE
+    ):
+        torch.ops.sglang.inplace_all_reduce(
+            global_tokens, group_name=get_tp_group().unique_name
+        )
+    else:
+        global_tokens = tensor_model_parallel_all_reduce(global_tokens)
+
+
+def dp_scatter(
+    local_tokens: torch.Tensor,  # output
+    global_tokens: torch.Tensor,  # input
+    forward_batch: ForwardBatch,
+):
+    # local_num_tokens is not necessarily the same as local_tokens.shape[0],
+    # since local_tokens may be padded for cuda graph
+    local_start_pos, local_num_tokens = get_dp_local_info(forward_batch)
+    local_tokens.fill_(0)
+    assert local_tokens.is_contiguous()
+    assert global_tokens.is_contiguous()
+    if local_tokens.shape[0] > 0:
+        assert (
+            local_tokens.untyped_storage().data_ptr()
+            != global_tokens.untyped_storage().data_ptr()
+        ), "aliasing between local_tokens and global_tokens not allowed"
+        memcpy_triton(
+            local_tokens, global_tokens, 0, local_start_pos, local_num_tokens, True
+        )
+
+
+def get_do_logits_dp_scatter(forward_batch: ForwardBatch):
+    def do_logits_dp_scatter(logits: torch.Tensor):
+        local_logits = torch.empty(
+            (forward_batch.input_ids.shape[0], *logits.shape[1:]),
+            dtype=logits.dtype,
+            device=logits.device,
+        )
+        dp_scatter(local_logits, logits, forward_batch)
+        return local_logits
+
+    return do_logits_dp_scatter
sglang/srt/layers/layernorm.py
CHANGED
sglang/srt/layers/linear.py
CHANGED
@@ -38,6 +38,7 @@ WEIGHT_LOADER_V2_SUPPORTED = [
     "AWQLinearMethod",
     "GPTQMarlinLinearMethod",
     "Fp8LinearMethod",
+    "BlockInt8LinearMethod",
     "MarlinLinearMethod",
     "QQQLinearMethod",
     "GPTQMarlin24LinearMethod",
@@ -425,13 +426,14 @@ class ColumnParallelLinear(LinearBase):
         from sglang.srt.layers.parameter import _ColumnvLLMParameter
 
         if isinstance(param, _ColumnvLLMParameter):
-            # FIXME: why would we need this special case?
             param.load_column_parallel_weight(
                 loaded_weight,
                 tp_rank=self.tp_rank,
                 use_presharded_weights=self.use_presharded_weights,
            )
         else:
+            # FIXME: This branch is needed to load deepseek v3 awq.
+            # However, we should fix this and avoid the branching here.
             param.load_column_parallel_weight(loaded_weight)
 
     def forward(self, input_):