sglang 0.4.1.post3__py3-none-any.whl → 0.4.1.post4__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- sglang/bench_one_batch.py +2 -0
- sglang/srt/layers/attention/__init__.py +14 -5
- sglang/srt/layers/attention/double_sparsity_backend.py +0 -52
- sglang/srt/layers/attention/flashinfer_backend.py +211 -81
- sglang/srt/layers/attention/torch_native_backend.py +1 -38
- sglang/srt/layers/attention/triton_backend.py +20 -11
- sglang/srt/layers/attention/triton_ops/decode_attention.py +4 -0
- sglang/srt/layers/logits_processor.py +167 -212
- sglang/srt/layers/moe/fused_moe_triton/configs/E=64,N=1280,device_name=NVIDIA_H200,dtype=fp8_w8a8.json +146 -0
- sglang/srt/layers/moe/fused_moe_triton/configs/E=64,N=1280,device_name=NVIDIA_H200.json +146 -0
- sglang/srt/layers/moe/fused_moe_triton/configs/E=64,N=2560,device_name=NVIDIA_H200,dtype=fp8_w8a8.json +146 -0
- sglang/srt/layers/moe/fused_moe_triton/configs/E=64,N=2560,device_name=NVIDIA_H200.json +146 -0
- sglang/srt/layers/moe/fused_moe_triton/configs/E=64,N=320,device_name=NVIDIA_H200,dtype=fp8_w8a8.json +146 -0
- sglang/srt/layers/moe/fused_moe_triton/configs/E=64,N=320,device_name=NVIDIA_H200.json +146 -0
- sglang/srt/layers/moe/fused_moe_triton/configs/E=64,N=640,device_name=NVIDIA_H200,dtype=fp8_w8a8.json +146 -0
- sglang/srt/layers/moe/fused_moe_triton/configs/E=64,N=640,device_name=NVIDIA_H200.json +146 -0
- sglang/srt/layers/moe/fused_moe_triton/configs/E=8,N=14336,device_name=NVIDIA_H200,dtype=fp8_w8a8.json +146 -0
- sglang/srt/layers/moe/fused_moe_triton/configs/E=8,N=14336,device_name=NVIDIA_H200.json +146 -0
- sglang/srt/layers/moe/fused_moe_triton/configs/E=8,N=1792,device_name=NVIDIA_H200,dtype=fp8_w8a8.json +146 -0
- sglang/srt/layers/moe/fused_moe_triton/configs/E=8,N=1792,device_name=NVIDIA_H200.json +146 -0
- sglang/srt/layers/moe/fused_moe_triton/configs/E=8,N=2048,device_name=NVIDIA_H200,dtype=fp8_w8a8.json +146 -0
- sglang/srt/layers/moe/fused_moe_triton/configs/E=8,N=2048,device_name=NVIDIA_H200.json +146 -0
- sglang/srt/layers/moe/fused_moe_triton/configs/E=8,N=3584,device_name=NVIDIA_H200,dtype=fp8_w8a8.json +146 -0
- sglang/srt/layers/moe/fused_moe_triton/configs/E=8,N=3584,device_name=NVIDIA_H200.json +146 -0
- sglang/srt/layers/moe/fused_moe_triton/configs/E=8,N=4096,device_name=NVIDIA_H200,dtype=fp8_w8a8.json +146 -0
- sglang/srt/layers/moe/fused_moe_triton/configs/E=8,N=4096,device_name=NVIDIA_H200.json +146 -0
- sglang/srt/layers/moe/fused_moe_triton/configs/E=8,N=7168,device_name=NVIDIA_H200,dtype=fp8_w8a8.json +146 -0
- sglang/srt/layers/moe/fused_moe_triton/configs/E=8,N=7168,device_name=NVIDIA_H200.json +146 -0
- sglang/srt/layers/moe/fused_moe_triton/configs/E=8,N=8192,device_name=NVIDIA_H200,dtype=fp8_w8a8.json +146 -0
- sglang/srt/layers/moe/fused_moe_triton/fused_moe.py +187 -29
- sglang/srt/layers/moe/fused_moe_triton/layer.py +14 -6
- sglang/srt/layers/quantization/fp8.py +2 -2
- sglang/srt/layers/sampler.py +57 -21
- sglang/srt/layers/torchao_utils.py +17 -3
- sglang/srt/managers/io_struct.py +1 -2
- sglang/srt/managers/schedule_batch.py +26 -2
- sglang/srt/managers/schedule_policy.py +159 -90
- sglang/srt/managers/scheduler.py +62 -26
- sglang/srt/managers/tokenizer_manager.py +22 -20
- sglang/srt/managers/tp_worker.py +16 -4
- sglang/srt/managers/tp_worker_overlap_thread.py +3 -4
- sglang/srt/model_executor/cuda_graph_runner.py +118 -73
- sglang/srt/model_executor/forward_batch_info.py +33 -8
- sglang/srt/model_executor/model_runner.py +63 -61
- sglang/srt/models/deepseek_v2.py +34 -7
- sglang/srt/models/grok.py +97 -26
- sglang/srt/openai_api/adapter.py +0 -17
- sglang/srt/openai_api/protocol.py +3 -3
- sglang/srt/sampling/sampling_batch_info.py +21 -0
- sglang/srt/sampling/sampling_params.py +9 -1
- sglang/srt/server.py +9 -5
- sglang/srt/server_args.py +108 -57
- sglang/srt/speculative/build_eagle_tree.py +347 -0
- sglang/srt/speculative/eagle_utils.py +618 -0
- sglang/srt/speculative/eagle_worker.py +170 -0
- sglang/srt/speculative/spec_info.py +5 -0
- sglang/srt/utils.py +15 -2
- sglang/version.py +1 -1
- {sglang-0.4.1.post3.dist-info → sglang-0.4.1.post4.dist-info}/METADATA +9 -8
- {sglang-0.4.1.post3.dist-info → sglang-0.4.1.post4.dist-info}/RECORD +63 -39
- {sglang-0.4.1.post3.dist-info → sglang-0.4.1.post4.dist-info}/WHEEL +1 -1
- {sglang-0.4.1.post3.dist-info → sglang-0.4.1.post4.dist-info}/LICENSE +0 -0
- {sglang-0.4.1.post3.dist-info → sglang-0.4.1.post4.dist-info}/top_level.txt +0 -0
sglang/srt/layers/attention/torch_native_backend.py

```diff
@@ -1,6 +1,6 @@
 from __future__ import annotations
 
-from typing import TYPE_CHECKING
+from typing import TYPE_CHECKING
 
 import torch
 from torch.nn.functional import scaled_dot_product_attention
@@ -23,43 +23,6 @@ class TorchNativeAttnBackend(AttentionBackend):
         """Init the metadata for a forward pass."""
         pass
 
-    def init_cuda_graph_state(self, max_bs: int):
-        # TODO: Support CUDA graph
-        raise ValueError(
-            "Torch native attention does not support CUDA graph for now. Please --disable-cuda-graph"
-        )
-
-    def init_forward_metadata_capture_cuda_graph(
-        self,
-        bs: int,
-        req_pool_indices: torch.Tensor,
-        seq_lens: torch.Tensor,
-        encoder_lens: Optional[torch.Tensor] = None,
-    ):
-        # TODO: Support CUDA graph
-        raise ValueError(
-            "Torch native attention does not support CUDA graph for now. Please --disable-cuda-graph"
-        )
-
-    def init_forward_metadata_replay_cuda_graph(
-        self,
-        bs: int,
-        req_pool_indices: torch.Tensor,
-        seq_lens: torch.Tensor,
-        seq_lens_sum: int,
-        encoder_lens: Optional[torch.Tensor] = None,
-    ):
-        # TODO: Support CUDA graph
-        raise ValueError(
-            "Torch native attention does not support CUDA graph for now. Please --disable-cuda-graph"
-        )
-
-    def get_cuda_graph_seq_len_fill_value(self):
-        # TODO: Support CUDA graph
-        raise ValueError(
-            "Torch native attention does not support CUDA graph for now. Please --disable-cuda-graph"
-        )
-
     def _run_sdpa_forward_extend(
         self,
         query: torch.Tensor,
```
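The four CUDA-graph stubs deleted here all raised the same ValueError. Judging by the `sglang/srt/layers/attention/__init__.py +14 -5` entry in the file list, a shared default on the `AttentionBackend` base class presumably takes their place; the sketch below is a hypothetical illustration of that pattern, not the actual sglang source.

```python
# Hypothetical sketch of the "fail loudly in the base class" pattern; class and
# method names mirror this diff, but the bodies are assumptions.


class AttentionBackend:
    """Stand-in for sglang.srt.layers.attention.AttentionBackend."""

    def init_cuda_graph_state(self, max_bs: int):
        # Backends that cannot run under CUDA graphs no longer need their own
        # stubs; one shared default raises with the same actionable hint.
        raise ValueError(
            "This attention backend does not support CUDA graph for now. "
            "Please add --disable-cuda-graph when launching the server."
        )


class TorchNativeAttnBackend(AttentionBackend):
    # No CUDA-graph overrides anymore; the base-class default applies.
    def init_forward_metadata(self, forward_batch):
        pass
```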
sglang/srt/layers/attention/triton_backend.py

```diff
@@ -1,15 +1,16 @@
 from __future__ import annotations
 
-from typing import TYPE_CHECKING
+from typing import TYPE_CHECKING, Optional
 
 import torch
 
 from sglang.srt.layers.attention import AttentionBackend
-from sglang.srt.model_executor.forward_batch_info import ForwardBatch
+from sglang.srt.model_executor.forward_batch_info import ForwardBatch, ForwardMode
 
 if TYPE_CHECKING:
     from sglang.srt.layers.radix_attention import RadixAttention
     from sglang.srt.model_executor.model_runner import ModelRunner
+    from sglang.srt.speculative.spec_info import SpecInfo
 
 
 class TritonAttnBackend(AttentionBackend):
@@ -80,11 +81,17 @@ class TritonAttnBackend(AttentionBackend):
     def init_forward_metadata_capture_cuda_graph(
         self,
         bs: int,
+        num_tokens: int,
         req_pool_indices: torch.Tensor,
         seq_lens: torch.Tensor,
-        encoder_lens
+        encoder_lens: Optional[torch.Tensor],
+        forward_mode: ForwardMode,
+        spec_info: Optional[SpecInfo],
     ):
-
+        assert encoder_lens is None, "Not supported"
+        assert forward_mode.is_decode(), "Not supported"
+        assert spec_info is None, "Not supported"
+
         self.forward_metadata = (
             self.cuda_graph_attn_logits,
             None,
@@ -96,7 +103,9 @@ class TritonAttnBackend(AttentionBackend):
         req_pool_indices: torch.Tensor,
         seq_lens: torch.Tensor,
         seq_lens_sum: int,
-        encoder_lens
+        encoder_lens: Optional[torch.Tensor],
+        forward_mode: ForwardMode,
+        spec_info: Optional[SpecInfo],
     ):
         # NOTE: encoder_lens expected to be zeros or None
         self.cuda_graph_start_loc.zero_()
@@ -107,9 +116,9 @@ class TritonAttnBackend(AttentionBackend):
 
     def forward_extend(
         self,
-        q,
-        k,
-        v,
+        q: torch.Tensor,
+        k: torch.Tensor,
+        v: torch.Tensor,
         layer: RadixAttention,
         forward_batch: ForwardBatch,
         save_kv_cache=True,
@@ -146,9 +155,9 @@ class TritonAttnBackend(AttentionBackend):
 
     def forward_decode(
         self,
-        q,
-        k,
-        v,
+        q: torch.Tensor,
+        k: torch.Tensor,
+        v: torch.Tensor,
         layer: RadixAttention,
         forward_batch: ForwardBatch,
         save_kv_cache=True,
```
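The CUDA-graph capture/replay hooks now receive `num_tokens`, a typed `encoder_lens`, a `forward_mode`, and an optional `spec_info` in addition to the old arguments. Below is a hedged sketch of how a caller such as the CUDA graph runner might exercise the new capture signature; the keyword names come from the hunk above, while the dtypes and toy batch values are assumptions, not taken from sglang.

```python
# Hedged sketch of a call into the extended capture hook; not the real call site.

import torch

from sglang.srt.model_executor.forward_batch_info import ForwardMode


def capture_decode_metadata(attn_backend, bs: int, device: str = "cuda"):
    req_pool_indices = torch.arange(bs, dtype=torch.int64, device=device)
    seq_lens = torch.ones(bs, dtype=torch.int64, device=device)

    # The asserts added in this hunk require plain decode batches with no
    # encoder lengths and no speculative-decoding info.
    attn_backend.init_forward_metadata_capture_cuda_graph(
        bs=bs,
        num_tokens=bs,  # one token per sequence during decode capture
        req_pool_indices=req_pool_indices,
        seq_lens=seq_lens,
        encoder_lens=None,
        forward_mode=ForwardMode.DECODE,
        spec_info=None,
    )
```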
sglang/srt/layers/logits_processor.py

```diff
@@ -17,6 +17,8 @@ import dataclasses
 from typing import List, Optional, Union
 
 import torch
+import triton
+import triton.language as tl
 from torch import nn
 from vllm.distributed import (
     get_tensor_model_parallel_world_size,
@@ -33,76 +35,77 @@ from sglang.srt.model_executor.forward_batch_info import (
 
 @dataclasses.dataclass
 class LogitsProcessorOutput:
+    ## Part 1: This part will be assigned in python/sglang/srt/layers/logits_processor.py::LogitsProcessor
     # The logits of the next tokens. shape: [#seq, vocab_size]
     next_token_logits: torch.Tensor
-    #
-
+    # Used by speculative decoding (EAGLE)
+    # The last hidden layers
+    hidden_states: Optional[torch.Tensor] = None
+
+    ## Part 2: This part will be assigned in python/sglang/srt/layers/sampler.py::Sampler
+    # The logprobs of the next tokens. shape: [#seq]
+    next_token_logprobs: Optional[torch.Tensor] = None
+    # The logprobs and ids of the top-k tokens in output positions. shape: [#seq, k]
+    next_token_top_logprobs_val: Optional[List] = None
+    next_token_top_logprobs_idx: Optional[List] = None
 
+    ## Part 3: Prefill-only. This part will be assigned in python/sglang/srt/layers/logits_processor.py::LogitsProcessor
     # The normlaized logprobs of prompts. shape: [#seq]
     normalized_prompt_logprobs: torch.Tensor = None
-    # The logprobs of input tokens. shape: [#token
+    # The logprobs of input tokens. shape: [#token]
     input_token_logprobs: torch.Tensor = None
-
-    # The logprob and id of the top-k tokens in input positions. shape [#seq, #token, k]
+    # The logprobs and ids of the top-k tokens in input positions. shape: [#seq, #token, k]
     input_top_logprobs_val: List = None
     input_top_logprobs_idx: List = None
-    # The logprob and id of the top-k tokens in output positions. shape [#seq, #token, k]
-    output_top_logprobs_val: List = None
-    output_top_logprobs_idx: List = None
-
-    # Used by speculative decoding (EAGLE)
-    # The output of transformer layers
-    hidden_states: Optional[torch.Tensor] = None
 
 
 @dataclasses.dataclass
 class LogitsMetadata:
     forward_mode: ForwardMode
-
-
-    return_logprob: bool = False
-    return_top_logprob: bool = False
+    capture_hidden_mode: CaptureHiddenMode = CaptureHiddenMode.NULL
 
+    extend_return_logprob: bool = False
+    extend_return_top_logprob: bool = False
     extend_seq_lens: Optional[torch.Tensor] = None
     extend_seq_lens_cpu: Optional[List[int]] = None
-
     extend_logprob_start_lens_cpu: Optional[List[int]] = None
     extend_logprob_pruned_lens_cpu: Optional[List[int]] = None
-
-    capture_hidden_mode: CaptureHiddenMode = CaptureHiddenMode.NULL
+    top_logprobs_nums: Optional[List[int]] = None
 
     @classmethod
     def from_forward_batch(cls, forward_batch: ForwardBatch):
-        extend_logprob_pruned_lens_cpu = None
-
-        if forward_batch.return_logprob:
-            return_top_logprob = any(x > 0 for x in forward_batch.top_logprobs_nums)
-            if forward_batch.forward_mode.is_extend():
-                extend_logprob_pruned_lens_cpu = [
-                    extend_len - start_len
-                    for extend_len, start_len in zip(
-                        forward_batch.extend_seq_lens_cpu,
-                        forward_batch.extend_logprob_start_lens_cpu,
-                    )
-                ]
-        else:
-            return_top_logprob = False
-
         if forward_batch.spec_info:
             capture_hidden_mode = forward_batch.spec_info.capture_hidden_mode
         else:
             capture_hidden_mode = CaptureHiddenMode.NULL
 
+        if forward_batch.forward_mode.is_extend() and forward_batch.return_logprob:
+            extend_return_logprob = True
+            extend_return_top_logprob = any(
+                x > 0 for x in forward_batch.top_logprobs_nums
+            )
+            extend_logprob_pruned_lens_cpu = [
+                extend_len - start_len
+                for extend_len, start_len in zip(
+                    forward_batch.extend_seq_lens_cpu,
+                    forward_batch.extend_logprob_start_lens_cpu,
+                )
+            ]
+        else:
+            extend_return_logprob = extend_return_top_logprob = (
+                extend_logprob_pruned_lens_cpu
+            ) = False
+
         return cls(
             forward_mode=forward_batch.forward_mode,
-
-
-
+            capture_hidden_mode=capture_hidden_mode,
+            extend_return_logprob=extend_return_logprob,
+            extend_return_top_logprob=extend_return_top_logprob,
             extend_seq_lens=forward_batch.extend_seq_lens,
             extend_seq_lens_cpu=forward_batch.extend_seq_lens_cpu,
             extend_logprob_start_lens_cpu=forward_batch.extend_logprob_start_lens_cpu,
             extend_logprob_pruned_lens_cpu=extend_logprob_pruned_lens_cpu,
-
+            top_logprobs_nums=forward_batch.top_logprobs_nums,
        )
 
 
```
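`from_forward_batch` now folds the logprob bookkeeping into two extend-only flags plus a pruned-length list. The pruned length is simply `extend_len - logprob_start_len` per request; a small worked example with made-up lengths:

```python
# Worked example of the pruned-length computation above (the numbers are made up).
extend_seq_lens_cpu = [5, 3, 8]
extend_logprob_start_lens_cpu = [2, 0, 8]

extend_logprob_pruned_lens_cpu = [
    extend_len - start_len
    for extend_len, start_len in zip(extend_seq_lens_cpu, extend_logprob_start_lens_cpu)
]

# A request whose start length equals its extend length prunes down to zero tokens,
# which is the `pruned_len <= 0` case handled later in get_top_logprobs.
assert extend_logprob_pruned_lens_cpu == [3, 3, 0]
```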
```diff
@@ -129,7 +132,6 @@ class LogitsProcessor(nn.Module):
     ):
         if isinstance(logits_metadata, ForwardBatch):
             logits_metadata = LogitsMetadata.from_forward_batch(logits_metadata)
-        assert isinstance(logits_metadata, LogitsMetadata)
 
         # Get the last hidden states and last logits for the next token prediction
         if (
@@ -142,18 +144,13 @@ class LogitsProcessor(nn.Module):
             last_index = torch.cumsum(logits_metadata.extend_seq_lens, dim=0) - 1
             last_hidden = hidden_states[last_index]
 
+        # Compute logits
         last_logits = self._get_logits(last_hidden, lm_head)
-        if
-
-
-
-
-            last_logits.div_(self.final_logit_softcapping)
-            torch.tanh(last_logits, out=last_logits)
-            last_logits.mul_(self.final_logit_softcapping)
-
-        # Return only last_logits if logprob is not requested
-        if not logits_metadata.return_logprob:
+        if (
+            not logits_metadata.extend_return_logprob
+            or logits_metadata.capture_hidden_mode.need_capture()
+        ):
+            # Decode mode or extend mode without return_logprob.
             return LogitsProcessorOutput(
                 next_token_logits=last_logits,
                 hidden_states=(
@@ -167,95 +164,60 @@ class LogitsProcessor(nn.Module):
                 ),
             )
         else:
-
-
+            # Slice the requested tokens to compute logprob
+            pt, pruned_states, pruned_input_ids = 0, [], []
+            for start_len, extend_len in zip(
+                logits_metadata.extend_logprob_start_lens_cpu,
+                logits_metadata.extend_seq_lens_cpu,
+            ):
+                pruned_states.append(hidden_states[pt + start_len : pt + extend_len])
+                pruned_input_ids.append(input_ids[pt + start_len : pt + extend_len])
+                pt += extend_len
+
+            # Compute the logits of all required tokens
+            pruned_states = torch.cat(pruned_states)
+            del hidden_states
+            input_token_logits = self._get_logits(pruned_states, lm_head)
+            del pruned_states
+
+            # Normalize the logprob w/o temperature, top-p
+            input_logprobs = input_token_logits
+            input_logprobs = self.compute_temp_top_p_normalized_logprobs(
+                input_logprobs, logits_metadata
             )
 
-
-
-
-
-
-
-            output_top_logprobs_val = output_top_logprobs_idx = None
-            return LogitsProcessorOutput(
-                next_token_logits=last_logits,
-                next_token_logprobs=last_logprobs,
-                output_top_logprobs_val=output_top_logprobs_val,
-                output_top_logprobs_idx=output_top_logprobs_idx,
-            )
+            # Get the logprob of top-k tokens
+            if logits_metadata.extend_return_top_logprob:
+                (
+                    input_top_logprobs_val,
+                    input_top_logprobs_idx,
+                ) = self.get_top_logprobs(input_logprobs, logits_metadata)
             else:
-
-
-
-
-
-            )
-
-
-
-
-
-
-
-
-
-
-
-            # extra logits that this padding may have produced.
-            all_logits = all_logits[:, : self.config.vocab_size].float()
-
-            if self.final_logit_softcapping:
-                all_logits.div_(self.final_logit_softcapping)
-                torch.tanh(all_logits, out=all_logits)
-                all_logits.mul_(self.final_logit_softcapping)
-
-            all_logprobs = all_logits
-            del all_logits, hidden_states
-
-            all_logprobs = self.compute_temp_top_p_normalized_logprobs(
-                all_logprobs, logits_metadata
-            )
-
-            # Get the logprob of top-k tokens
-            if logits_metadata.return_top_logprob:
-                (
-                    input_top_logprobs_val,
-                    input_top_logprobs_idx,
-                    output_top_logprobs_val,
-                    output_top_logprobs_idx,
-                ) = self.get_top_logprobs(all_logprobs, logits_metadata)
-            else:
-                input_top_logprobs_val = input_top_logprobs_idx = (
-                    output_top_logprobs_val
-                ) = output_top_logprobs_idx = None
-
-            # Compute the normalized logprobs for the requested tokens.
-            # Note that we pad a zero at the end for easy batching.
-            input_token_logprobs = all_logprobs[
-                torch.arange(all_logprobs.shape[0], device="cuda"),
-                torch.cat(
-                    [
-                        torch.cat(pruned_input_ids)[1:],
-                        torch.tensor([0], device="cuda"),
-                    ]
-                ),
-            ]
-            normalized_prompt_logprobs = self._get_normalized_prompt_logprobs(
-                input_token_logprobs,
-                logits_metadata,
-            )
+                input_top_logprobs_val = input_top_logprobs_idx = None
+
+            # Compute the normalized logprobs for the requested tokens.
+            # Note that we pad a zero at the end for easy batching.
+            input_token_logprobs = input_logprobs[
+                torch.arange(input_logprobs.shape[0], device="cuda"),
+                torch.cat(
+                    [
+                        torch.cat(pruned_input_ids)[1:],
+                        torch.tensor([0], device="cuda"),
+                    ]
+                ),
+            ]
+            normalized_prompt_logprobs = self._get_normalized_prompt_logprobs(
+                input_token_logprobs,
+                logits_metadata,
+            )
 
-
-
-
-
-
-
-
-                output_top_logprobs_val=output_top_logprobs_val,
-                output_top_logprobs_idx=output_top_logprobs_idx,
-            )
+            return LogitsProcessorOutput(
+                next_token_logits=last_logits,
+                normalized_prompt_logprobs=normalized_prompt_logprobs,
+                input_token_logprobs=input_token_logprobs,
+                input_top_logprobs_val=input_top_logprobs_val,
+                input_top_logprobs_idx=input_top_logprobs_idx,
+            )
 
     def _get_logits(
         self,
```
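The `input_token_logprobs` indexing above gathers, at each position, the logprob of the *next* input token, padding a dummy index 0 at the end so every row has something to gather (the module-level `test()` removed at the bottom of the old file exercised the same trick). A self-contained toy version, on CPU instead of "cuda" and with arbitrary values:

```python
# Toy reproduction of the shift-by-one gather; shapes and values are arbitrary.

import torch

vocab_size = 6
input_ids = torch.tensor([1, 2, 3, 0, 1])

logprobs = torch.log_softmax(torch.randn(len(input_ids), vocab_size), dim=-1)

# Row i is indexed by token i+1; the appended 0 is padding for the last row.
gather_ids = torch.cat([input_ids[1:], torch.tensor([0])])
input_token_logprobs = logprobs[torch.arange(len(input_ids)), gather_ids]

# input_token_logprobs[i] == log P(input_ids[i + 1] | prefix up to i); the final
# entry is padding and is excluded when normalizing prompt logprobs.
print(input_token_logprobs)
```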
```diff
@@ -269,9 +231,19 @@ class LogitsProcessor(nn.Module):
             # GGUF models
             logits = lm_head.linear_method.apply(lm_head, hidden_states, embedding_bias)
 
-        # Optional scaling factor
         if self.logit_scale is not None:
-            logits.mul_(self.logit_scale)
+            logits.mul_(self.logit_scale)
+
+        if self.do_tensor_parallel_all_gather:
+            logits = tensor_model_parallel_all_gather(logits)
+
+        # Compute the normalized logprobs for the requested tokens.
+        # Note that we pad a zero at the end for easy batching.
+        logits = logits[:, : self.config.vocab_size].float()
+
+        if self.final_logit_softcapping:
+            fused_softcap(logits, self.final_logit_softcapping)
+
         return logits
 
     @staticmethod
@@ -302,90 +274,73 @@ class LogitsProcessor(nn.Module):
         values = ret.values.tolist()
         indices = ret.indices.tolist()
 
-
-            output_top_logprobs_val = []
-            output_top_logprobs_idx = []
-            for i, k in enumerate(logits_metadata.top_logprobs_nums):
-                output_top_logprobs_val.append(values[i][:k])
-                output_top_logprobs_idx.append(indices[i][:k])
-            return None, None, output_top_logprobs_val, output_top_logprobs_idx
-        else:
-            input_top_logprobs_val, input_top_logprobs_idx = [], []
-            output_top_logprobs_val, output_top_logprobs_idx = [], []
+        input_top_logprobs_val, input_top_logprobs_idx = [], []
 
-
-
-
-
-
-
-
-
-
-                output_top_logprobs_idx.append([])
-                continue
-
-            input_top_logprobs_val.append(
-                [values[pt + j][:k] for j in range(pruned_len - 1)]
-            )
-            input_top_logprobs_idx.append(
-                [indices[pt + j][:k] for j in range(pruned_len - 1)]
-            )
-            output_top_logprobs_val.append(
-                list(
-                    values[pt + pruned_len - 1][:k],
-                )
-            )
-            output_top_logprobs_idx.append(
-                list(
-                    indices[pt + pruned_len - 1][:k],
-                )
-            )
-            pt += pruned_len
+        pt = 0
+        for k, pruned_len in zip(
+            logits_metadata.top_logprobs_nums,
+            logits_metadata.extend_logprob_pruned_lens_cpu,
+        ):
+            if pruned_len <= 0:
+                input_top_logprobs_val.append([])
+                input_top_logprobs_idx.append([])
+                continue
 
-
-
-                input_top_logprobs_idx,
-                output_top_logprobs_val,
-                output_top_logprobs_idx,
+            input_top_logprobs_val.append(
+                [values[pt + j][:k] for j in range(pruned_len - 1)]
             )
+            input_top_logprobs_idx.append(
+                [indices[pt + j][:k] for j in range(pruned_len - 1)]
+            )
+            pt += pruned_len
+
+        return input_top_logprobs_val, input_top_logprobs_idx
 
     @staticmethod
     def compute_temp_top_p_normalized_logprobs(
         last_logits: torch.Tensor, logits_metadata: LogitsMetadata
     ) -> torch.Tensor:
+        # TODO: Implement the temp and top-p normalization
         return torch.nn.functional.log_softmax(last_logits, dim=-1)
 
 
-
-
-
-
-
+@triton.jit
+def fused_softcap_kernel(
+    full_logits_ptr,
+    softcapping_value,
+    n_elements,
+    BLOCK_SIZE: tl.constexpr,
+):
+    pid = tl.program_id(0)
+    block_start = pid * BLOCK_SIZE
+    offsets = block_start + tl.arange(0, BLOCK_SIZE)
+    mask = offsets < n_elements
+
+    # Load values
+    x = tl.load(full_logits_ptr + offsets, mask=mask)
+
+    # Perform operations in-place
+    x = x / softcapping_value
+
+    # Manual tanh implementation using exp
+    exp2x = tl.exp(2 * x)
+    x = (exp2x - 1) / (exp2x + 1)
+
+    x = x * softcapping_value
+
+    # Store result
+    tl.store(full_logits_ptr + offsets, x, mask=mask)
+
+
+def fused_softcap(full_logits, final_logit_softcapping):
+    n_elements = full_logits.numel()
+    BLOCK_SIZE = 1024
+    grid = ((n_elements + BLOCK_SIZE - 1) // BLOCK_SIZE, 1, 1)
+
+    fused_softcap_kernel[grid](
+        full_logits_ptr=full_logits,
+        softcapping_value=final_logit_softcapping,
+        n_elements=n_elements,
+        BLOCK_SIZE=BLOCK_SIZE,
     )
-
-    input_ids = torch.tensor([1, 2, 3, 0, 1], dtype=torch.int32, device="cuda")
-
-    token_logprobs = all_logprobs[
-        torch.arange(all_logprobs.shape[0], device="cuda"),
-        torch.cat([input_ids[1:], torch.tensor([0], device="cuda")]),
-    ]
-    logprobs_cumsum = torch.cumsum(token_logprobs, dim=0, dtype=torch.float32)
-
-    len_cumsum = torch.cumsum(seq_lens, dim=0)
-    start = torch.cat((torch.tensor([0], device="cuda"), len_cumsum[:-1]), 0)
-    end = start + seq_lens - 2
-    start.clamp_(min=0, max=token_logprobs.shape[0] - 1)
-    end.clamp_(min=0, max=token_logprobs.shape[0] - 1)
-    sum_logp = logprobs_cumsum[end] - logprobs_cumsum[start] + token_logprobs[start]
-
-    # assert logprobs == [2, _, 2, 4, _]
-    print("token logprobs", token_logprobs)
-    print("start", start)
-    print("end", end)
-    print("sum_logp", sum_logp)
-
-
-if __name__ == "__main__":
-    test()
+    return full_logits
```
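The new `fused_softcap` helper applies the softcapping `cap * tanh(logits / cap)` in place with a Triton kernel, replacing the three-op eager sequence (`div_`, `tanh`, `mul_`) removed earlier in this diff. Below is a hedged sanity check against that eager formulation; it assumes a CUDA device with Triton available, and the shapes and cap value are arbitrary.

```python
# Sanity check for the in-place Triton softcap versus the eager formulation the
# old code used. Requires CUDA + Triton.

import torch

from sglang.srt.layers.logits_processor import fused_softcap

cap = 30.0
logits = torch.randn(4, 32000, device="cuda", dtype=torch.float32)

# Eager reference: cap * tanh(logits / cap)
reference = torch.tanh(logits / cap) * cap

# fused_softcap mutates its argument and returns it, so pass a clone.
fused = fused_softcap(logits.clone(), cap)

torch.testing.assert_close(fused, reference, rtol=1e-4, atol=1e-5)
```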