sglang-0.1.18-py3-none-any.whl → sglang-0.1.19-py3-none-any.whl

This diff shows the content of publicly available package versions that have been released to one of the supported registries. It is provided for informational purposes only and reflects the changes between the two versions as they appear in the public registry.
Files changed (38)
  1. sglang/__init__.py +1 -1
  2. sglang/api.py +26 -0
  3. sglang/backend/runtime_endpoint.py +18 -14
  4. sglang/bench_latency.py +34 -16
  5. sglang/global_config.py +1 -0
  6. sglang/lang/chat_template.py +41 -6
  7. sglang/lang/interpreter.py +5 -1
  8. sglang/lang/ir.py +61 -25
  9. sglang/srt/constrained/__init__.py +3 -2
  10. sglang/srt/hf_transformers_utils.py +7 -3
  11. sglang/srt/layers/extend_attention.py +2 -1
  12. sglang/srt/layers/fused_moe.py +181 -167
  13. sglang/srt/layers/logits_processor.py +55 -19
  14. sglang/srt/layers/radix_attention.py +24 -27
  15. sglang/srt/layers/token_attention.py +4 -1
  16. sglang/srt/managers/controller/infer_batch.py +2 -2
  17. sglang/srt/managers/controller/manager_single.py +1 -1
  18. sglang/srt/managers/controller/model_runner.py +27 -15
  19. sglang/srt/managers/controller/tp_worker.py +31 -14
  20. sglang/srt/managers/detokenizer_manager.py +4 -2
  21. sglang/srt/managers/io_struct.py +1 -1
  22. sglang/srt/managers/tokenizer_manager.py +14 -13
  23. sglang/srt/model_config.py +6 -0
  24. sglang/srt/models/gemma2.py +436 -0
  25. sglang/srt/models/llama2.py +3 -3
  26. sglang/srt/models/llama_classification.py +10 -7
  27. sglang/srt/models/minicpm.py +373 -0
  28. sglang/srt/models/qwen2_moe.py +454 -0
  29. sglang/srt/openai_api_adapter.py +2 -2
  30. sglang/srt/openai_protocol.py +1 -1
  31. sglang/srt/server.py +17 -8
  32. sglang/srt/server_args.py +14 -16
  33. sglang/srt/utils.py +68 -35
  34. {sglang-0.1.18.dist-info → sglang-0.1.19.dist-info}/METADATA +19 -13
  35. {sglang-0.1.18.dist-info → sglang-0.1.19.dist-info}/RECORD +38 -35
  36. {sglang-0.1.18.dist-info → sglang-0.1.19.dist-info}/LICENSE +0 -0
  37. {sglang-0.1.18.dist-info → sglang-0.1.19.dist-info}/WHEEL +0 -0
  38. {sglang-0.1.18.dist-info → sglang-0.1.19.dist-info}/top_level.txt +0 -0
sglang/srt/layers/fused_moe.py
@@ -9,7 +9,6 @@ from typing import Any, Dict, Optional, Tuple
 import torch
 import triton
 import triton.language as tl
-
 from vllm import _custom_ops as ops
 from vllm.logger import init_logger
 
@@ -108,12 +107,16 @@ def fused_moe_kernel(
 
     offs_bn = (pid_n * BLOCK_SIZE_N + tl.arange(0, BLOCK_SIZE_N)) % N
     offs_k = tl.arange(0, BLOCK_SIZE_K)
-    a_ptrs = a_ptr + (offs_token[:, None] // top_k * stride_am +
-                      offs_k[None, :] * stride_ak)
+    a_ptrs = a_ptr + (
+        offs_token[:, None] // top_k * stride_am + offs_k[None, :] * stride_ak
+    )
 
     off_experts = tl.load(expert_ids_ptr + pid_m)
-    b_ptrs = b_ptr + off_experts * stride_be + (offs_k[:, None] * stride_bk +
-                                                offs_bn[None, :] * stride_bn)
+    b_ptrs = (
+        b_ptr
+        + off_experts * stride_be
+        + (offs_k[:, None] * stride_bk + offs_bn[None, :] * stride_bn)
+    )
 
     if use_fp8:
         a_scale = tl.load(a_scale_ptr)
@@ -129,13 +132,12 @@ def fused_moe_kernel(
     for k in range(0, tl.cdiv(K, BLOCK_SIZE_K)):
         # Load the next block of A and B, generate a mask by checking the
        # K dimension.
-        a = tl.load(a_ptrs,
-                    mask=token_mask[:, None] &
-                    (offs_k[None, :] < K - k * BLOCK_SIZE_K),
-                    other=0.0)
-        b = tl.load(b_ptrs,
-                    mask=offs_k[:, None] < K - k * BLOCK_SIZE_K,
-                    other=0.0)
+        a = tl.load(
+            a_ptrs,
+            mask=token_mask[:, None] & (offs_k[None, :] < K - k * BLOCK_SIZE_K),
+            other=0.0,
+        )
+        b = tl.load(b_ptrs, mask=offs_k[:, None] < K - k * BLOCK_SIZE_K, other=0.0)
         # We accumulate along the K dimension.
         if use_fp8:
             accumulator = tl.dot(a, b, acc=accumulator)
@@ -146,9 +148,7 @@ def fused_moe_kernel(
         b_ptrs += BLOCK_SIZE_K * stride_bk
 
     if MUL_ROUTED_WEIGHT:
-        moe_weight = tl.load(topk_weights_ptr + offs_token,
-                             mask=token_mask,
-                             other=0)
+        moe_weight = tl.load(topk_weights_ptr + offs_token, mask=token_mask, other=0)
         accumulator = accumulator * moe_weight[:, None]
 
     if use_fp8:
@@ -158,15 +158,14 @@ def fused_moe_kernel(
     # -----------------------------------------------------------
     # Write back the block of the output
     offs_cn = pid_n * BLOCK_SIZE_N + tl.arange(0, BLOCK_SIZE_N)
-    c_ptrs = c_ptr + stride_cm * offs_token[:, None] + stride_cn * offs_cn[
-        None, :]
+    c_ptrs = c_ptr + stride_cm * offs_token[:, None] + stride_cn * offs_cn[None, :]
     c_mask = token_mask[:, None] & (offs_cn[None, :] < N)
     tl.store(c_ptrs, accumulator, mask=c_mask)
 
 
 def moe_align_block_size(
-        topk_ids: torch.Tensor, block_size: int,
-        num_experts: int) -> Tuple[torch.Tensor, torch.Tensor, torch.Tensor]:
+    topk_ids: torch.Tensor, block_size: int, num_experts: int
+) -> Tuple[torch.Tensor, torch.Tensor, torch.Tensor]:
     """
     Aligns the token distribution across experts to be compatible with block
     size for matrix multiplication.
@@ -205,32 +204,38 @@ def moe_align_block_size(
         by block_size for proper block matrix operations.
     """
     max_num_tokens_padded = topk_ids.numel() + num_experts * (block_size - 1)
-    sorted_ids = torch.empty((max_num_tokens_padded, ),
-                             dtype=torch.int32,
-                             device=topk_ids.device)
+    sorted_ids = torch.empty(
+        (max_num_tokens_padded,), dtype=torch.int32, device=topk_ids.device
+    )
     sorted_ids.fill_(topk_ids.numel())
     max_num_m_blocks = triton.cdiv(max_num_tokens_padded, block_size)
-    expert_ids = torch.empty((max_num_m_blocks, ),
-                             dtype=torch.int32,
-                             device=topk_ids.device)
-    num_tokens_post_pad = torch.empty((1),
-                                      dtype=torch.int32,
-                                      device=topk_ids.device)
-    ops.moe_align_block_size(topk_ids, num_experts, block_size, sorted_ids,
-                             expert_ids, num_tokens_post_pad)
+    expert_ids = torch.empty(
+        (max_num_m_blocks,), dtype=torch.int32, device=topk_ids.device
+    )
+    num_tokens_post_pad = torch.empty((1), dtype=torch.int32, device=topk_ids.device)
+    ops.moe_align_block_size(
+        topk_ids, num_experts, block_size, sorted_ids, expert_ids, num_tokens_post_pad
+    )
     return sorted_ids, expert_ids, num_tokens_post_pad
 
 
-def invoke_fused_moe_kernel(A: torch.Tensor, B: torch.Tensor, C: torch.Tensor,
-                            A_scale: Optional[torch.Tensor],
-                            B_scale: Optional[torch.Tensor],
-                            topk_weights: torch.Tensor, topk_ids: torch.Tensor,
-                            sorted_token_ids: torch.Tensor,
-                            expert_ids: torch.Tensor,
-                            num_tokens_post_padded: torch.Tensor,
-                            mul_routed_weight: bool, top_k: int,
-                            config: Dict[str, Any], compute_type: tl.dtype,
-                            use_fp8: bool) -> None:
+def invoke_fused_moe_kernel(
+    A: torch.Tensor,
+    B: torch.Tensor,
+    C: torch.Tensor,
+    A_scale: Optional[torch.Tensor],
+    B_scale: Optional[torch.Tensor],
+    topk_weights: torch.Tensor,
+    topk_ids: torch.Tensor,
+    sorted_token_ids: torch.Tensor,
+    expert_ids: torch.Tensor,
+    num_tokens_post_padded: torch.Tensor,
+    mul_routed_weight: bool,
+    top_k: int,
+    config: Dict[str, Any],
+    compute_type: tl.dtype,
+    use_fp8: bool,
+) -> None:
     assert topk_weights.stride(1) == 1
     assert sorted_token_ids.stride(0) == 1
 
@@ -241,8 +246,10 @@ def invoke_fused_moe_kernel(A: torch.Tensor, B: torch.Tensor, C: torch.Tensor,
         A, A_scale = ops.scaled_fp8_quant(A, A_scale)
         assert B_scale is not None
 
-    grid = lambda META: (triton.cdiv(sorted_token_ids.shape[0], META[
-        'BLOCK_SIZE_M']) * triton.cdiv(B.shape[1], META['BLOCK_SIZE_N']), )
+    grid = lambda META: (
+        triton.cdiv(sorted_token_ids.shape[0], META["BLOCK_SIZE_M"])
+        * triton.cdiv(B.shape[1], META["BLOCK_SIZE_N"]),
+    )
 
     fused_moe_kernel[grid](
         A,
@@ -280,8 +287,7 @@ def get_config_file_name(E: int, N: int, dtype: Optional[str]) -> str:
 
 
 @functools.lru_cache
-def get_moe_configs(E: int, N: int,
-                    dtype: Optional[str]) -> Optional[Dict[int, Any]]:
+def get_moe_configs(E: int, N: int, dtype: Optional[str]) -> Optional[Dict[int, Any]]:
     """
     Return optimized configurations for the fused MoE kernel.
 
@@ -296,11 +302,11 @@ def get_moe_configs(E: int, N: int,
     json_file_name = get_config_file_name(E, N, dtype)
 
     config_file_path = os.path.join(
-        os.path.dirname(os.path.realpath(__file__)), "configs", json_file_name)
+        os.path.dirname(os.path.realpath(__file__)), "configs", json_file_name
+    )
     if os.path.exists(config_file_path):
         with open(config_file_path) as f:
-            logger.info("Using configuration from %s for MoE layer.",
-                        config_file_path)
+            logger.info("Using configuration from %s for MoE layer.", config_file_path)
             # If a configuration has been found, return it
             return {int(key): val for key, val in json.load(f).items()}
 
@@ -319,35 +325,35 @@ def get_default_config(
 ) -> Dict[str, int]:
     if dtype == "float8":
         config = {
-            'BLOCK_SIZE_M': 128,
-            'BLOCK_SIZE_N': 256,
-            'BLOCK_SIZE_K': 128,
-            'GROUP_SIZE_M': 32,
+            "BLOCK_SIZE_M": 128,
+            "BLOCK_SIZE_N": 256,
+            "BLOCK_SIZE_K": 128,
+            "GROUP_SIZE_M": 32,
             "num_warps": 8,
-            "num_stages": 4
+            "num_stages": 4,
         }
         if M <= E:
             config = {
-                'BLOCK_SIZE_M': 64,
-                'BLOCK_SIZE_N': 128,
-                'BLOCK_SIZE_K': 128,
-                'GROUP_SIZE_M': 1,
+                "BLOCK_SIZE_M": 64,
+                "BLOCK_SIZE_N": 128,
+                "BLOCK_SIZE_K": 128,
+                "GROUP_SIZE_M": 1,
                 "num_warps": 4,
-                "num_stages": 4
+                "num_stages": 4,
             }
     else:
         config = {
-            'BLOCK_SIZE_M': 64,
-            'BLOCK_SIZE_N': 64,
-            'BLOCK_SIZE_K': 32,
-            'GROUP_SIZE_M': 8
+            "BLOCK_SIZE_M": 64,
+            "BLOCK_SIZE_N": 64,
+            "BLOCK_SIZE_K": 32,
+            "GROUP_SIZE_M": 8,
         }
         if M <= E:
             config = {
-                'BLOCK_SIZE_M': 16,
-                'BLOCK_SIZE_N': 32,
-                'BLOCK_SIZE_K': 64,
-                'GROUP_SIZE_M': 1
+                "BLOCK_SIZE_M": 16,
+                "BLOCK_SIZE_N": 32,
+                "BLOCK_SIZE_K": 64,
+                "GROUP_SIZE_M": 1,
             }
     return config
 
@@ -358,23 +364,17 @@ def fused_topk(
     topk: int,
     renormalize: bool,
 ):
-    assert hidden_states.shape[0] == gating_output.shape[0], (
-        "Number of tokens mismatch")
+    assert hidden_states.shape[0] == gating_output.shape[0], "Number of tokens mismatch"
 
     M, _ = hidden_states.shape
 
-    topk_weights = torch.empty(M,
-                               topk,
-                               dtype=torch.float32,
-                               device=hidden_states.device)
-    topk_ids = torch.empty(M,
-                           topk,
-                           dtype=torch.int32,
-                           device=hidden_states.device)
-    token_expert_indicies = torch.empty(M,
-                                        topk,
-                                        dtype=torch.int32,
-                                        device=hidden_states.device)
+    topk_weights = torch.empty(
+        M, topk, dtype=torch.float32, device=hidden_states.device
+    )
+    topk_ids = torch.empty(M, topk, dtype=torch.int32, device=hidden_states.device)
+    token_expert_indicies = torch.empty(
+        M, topk, dtype=torch.int32, device=hidden_states.device
+    )
     ops.topk_softmax(
         topk_weights,
         topk_ids,
@@ -388,27 +388,27 @@ def fused_topk(
     return topk_weights, topk_ids
 
 
-def fused_experts(hidden_states: torch.Tensor,
-                  w1: torch.Tensor,
-                  w2: torch.Tensor,
-                  topk_weights: torch.Tensor,
-                  topk_ids: torch.Tensor,
-                  inplace: bool = False,
-                  override_config: Optional[Dict[str, Any]] = None,
-                  use_fp8: bool = False,
-                  w1_scale: Optional[torch.Tensor] = None,
-                  w2_scale: Optional[torch.Tensor] = None,
-                  a1_scale: Optional[torch.Tensor] = None,
-                  a2_scale: Optional[torch.Tensor] = None):
+def fused_experts(
+    hidden_states: torch.Tensor,
+    w1: torch.Tensor,
+    w2: torch.Tensor,
+    topk_weights: torch.Tensor,
+    topk_ids: torch.Tensor,
+    inplace: bool = False,
+    override_config: Optional[Dict[str, Any]] = None,
+    use_fp8: bool = False,
+    w1_scale: Optional[torch.Tensor] = None,
+    w2_scale: Optional[torch.Tensor] = None,
+    a1_scale: Optional[torch.Tensor] = None,
+    a2_scale: Optional[torch.Tensor] = None,
+):
     # Check constraints.
     assert hidden_states.shape[1] == w1.shape[2], "Hidden size mismatch"
     assert topk_weights.shape == topk_ids.shape, "topk shape mismatch"
     assert hidden_states.is_contiguous(), "Hidden_states must be contiguous"
     assert w1.is_contiguous(), "Expert weights1 must be contiguous"
     assert w2.is_contiguous(), "Expert weights2 must be contiguous"
-    assert hidden_states.dtype in [
-        torch.float32, torch.float16, torch.bfloat16
-    ]
+    assert hidden_states.dtype in [torch.float32, torch.float16, torch.bfloat16]
 
     M, _ = hidden_states.shape
     E, N, _ = w1.shape
@@ -417,8 +417,7 @@ def fused_experts(hidden_states: torch.Tensor,
         config = override_config
     else:
         # First try to load optimal config from the file
-        configs = get_moe_configs(E, w2.shape[2],
-                                  "float8" if use_fp8 else None)
+        configs = get_moe_configs(E, w2.shape[2], "float8" if use_fp8 else None)
 
         if configs:
             # If an optimal configuration map has been found, look up the
@@ -426,65 +425,76 @@ def fused_experts(hidden_states: torch.Tensor,
             config = configs[min(configs.keys(), key=lambda x: abs(x - M))]
         else:
             # Else use the default config
-            config = get_default_config(M, E, N, w1.shape[2],
-                                        topk_ids.shape[1],
-                                        "float8" if use_fp8 else None)
-
-    intermediate_cache1 = torch.empty((M, topk_ids.shape[1], N),
-                                      device=hidden_states.device,
-                                      dtype=hidden_states.dtype)
-    intermediate_cache2 = torch.empty((M * topk_ids.shape[1], N // 2),
-                                      device=hidden_states.device,
-                                      dtype=hidden_states.dtype)
-    intermediate_cache3 = torch.empty((M, topk_ids.shape[1], w2.shape[1]),
-                                      device=hidden_states.device,
-                                      dtype=hidden_states.dtype)
+            config = get_default_config(
+                M, E, N, w1.shape[2], topk_ids.shape[1], "float8" if use_fp8 else None
+            )
+
+    intermediate_cache1 = torch.empty(
+        (M, topk_ids.shape[1], N),
+        device=hidden_states.device,
+        dtype=hidden_states.dtype,
+    )
+    intermediate_cache2 = torch.empty(
+        (M * topk_ids.shape[1], N // 2),
+        device=hidden_states.device,
+        dtype=hidden_states.dtype,
+    )
+    intermediate_cache3 = torch.empty(
+        (M, topk_ids.shape[1], w2.shape[1]),
+        device=hidden_states.device,
+        dtype=hidden_states.dtype,
+    )
 
     sorted_token_ids, expert_ids, num_tokens_post_padded = moe_align_block_size(
-        topk_ids, config['BLOCK_SIZE_M'], E)
-    compute_type = (tl.bfloat16
-                    if hidden_states.dtype == torch.bfloat16 else tl.float16)
-
-    invoke_fused_moe_kernel(hidden_states,
-                            w1,
-                            intermediate_cache1,
-                            a1_scale,
-                            w1_scale,
-                            topk_weights,
-                            topk_ids,
-                            sorted_token_ids,
-                            expert_ids,
-                            num_tokens_post_padded,
-                            False,
-                            topk_ids.shape[1],
-                            config,
-                            compute_type=compute_type,
-                            use_fp8=use_fp8)
+        topk_ids, config["BLOCK_SIZE_M"], E
+    )
+    compute_type = tl.bfloat16 if hidden_states.dtype == torch.bfloat16 else tl.float16
+
+    invoke_fused_moe_kernel(
+        hidden_states,
+        w1,
+        intermediate_cache1,
+        a1_scale,
+        w1_scale,
+        topk_weights,
+        topk_ids,
+        sorted_token_ids,
+        expert_ids,
+        num_tokens_post_padded,
+        False,
+        topk_ids.shape[1],
+        config,
+        compute_type=compute_type,
+        use_fp8=use_fp8,
+    )
 
     ops.gelu_and_mul(intermediate_cache2, intermediate_cache1.view(-1, N))
 
-    invoke_fused_moe_kernel(intermediate_cache2,
-                            w2,
-                            intermediate_cache3,
-                            a2_scale,
-                            w2_scale,
-                            topk_weights,
-                            topk_ids,
-                            sorted_token_ids,
-                            expert_ids,
-                            num_tokens_post_padded,
-                            True,
-                            1,
-                            config,
-                            compute_type=compute_type,
-                            use_fp8=use_fp8)
+    invoke_fused_moe_kernel(
+        intermediate_cache2,
+        w2,
+        intermediate_cache3,
+        a2_scale,
+        w2_scale,
+        topk_weights,
+        topk_ids,
+        sorted_token_ids,
+        expert_ids,
+        num_tokens_post_padded,
+        True,
+        1,
+        config,
+        compute_type=compute_type,
+        use_fp8=use_fp8,
+    )
 
     if inplace:
-        return torch.sum(intermediate_cache3.view(*intermediate_cache3.shape),
-                         dim=1,
-                         out=hidden_states)
-    return torch.sum(intermediate_cache3.view(*intermediate_cache3.shape),
-                     dim=1)
+        return torch.sum(
+            intermediate_cache3.view(*intermediate_cache3.shape),
+            dim=1,
+            out=hidden_states,
+        )
+    return torch.sum(intermediate_cache3.view(*intermediate_cache3.shape), dim=1)
 
 
 def fused_moe(
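Note: the tuned MoE configurations returned by get_moe_configs are keyed by the token count M, and fused_experts picks the entry whose key is nearest to the current M via `configs[min(configs.keys(), key=lambda x: abs(x - M))]`. A minimal standalone sketch of that lookup; the keys and block sizes below are invented for illustration, not taken from any shipped config file:

```python
# Hypothetical per-M entries, mimicking the shape of a MoE config JSON file.
configs = {
    1: {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 32},
    16: {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 64},
    64: {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 64},
    1024: {"BLOCK_SIZE_M": 128, "BLOCK_SIZE_N": 128},
}

M = 48  # number of tokens in the current batch
# Same nearest-key lookup as in fused_experts above.
config = configs[min(configs.keys(), key=lambda x: abs(x - M))]
print(config)  # the key closest to 48 is 64, so its entry is chosen
```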
@@ -532,25 +542,28 @@ def fused_moe(
     assert gating_output.shape[1] == w1.shape[0], "Number of experts mismatch"
 
     if hasattr(ops, "topk_softmax"):
-        topk_weights, topk_ids = fused_topk(hidden_states, gating_output, topk,
-                                            renormalize)
+        topk_weights, topk_ids = fused_topk(
+            hidden_states, gating_output, topk, renormalize
+        )
     else:
-        topk_weights, topk_ids = fused_topk_v0_4_3(hidden_states, gating_output, topk,
-                                                   renormalize)
-
-    return fused_experts(hidden_states,
-                         w1,
-                         w2,
-                         topk_weights,
-                         topk_ids,
-                         inplace=inplace,
-                         override_config=override_config,
-                         use_fp8=use_fp8,
-                         w1_scale=w1_scale,
-                         w2_scale=w2_scale,
-                         a1_scale=a1_scale,
-                         a2_scale=a2_scale)
-
+        topk_weights, topk_ids = fused_topk_v0_4_3(
+            hidden_states, gating_output, topk, renormalize
+        )
+
+    return fused_experts(
+        hidden_states,
+        w1,
+        w2,
+        topk_weights,
+        topk_ids,
+        inplace=inplace,
+        override_config=override_config,
+        use_fp8=use_fp8,
+        w1_scale=w1_scale,
+        w2_scale=w2_scale,
+        a1_scale=a1_scale,
+        a2_scale=a2_scale,
+    )
 
 
 def fused_topk_v0_4_3(
@@ -560,6 +573,7 @@ def fused_topk_v0_4_3(
     renormalize: bool,
 ):
     import vllm._moe_C as moe_kernels
+
     M, _ = hidden_states.shape
 
     topk_weights = torch.empty(
@@ -579,4 +593,4 @@ def fused_topk_v0_4_3(
     if renormalize:
        topk_weights = topk_weights / topk_weights.sum(dim=-1, keepdim=True)
 
-    return topk_weights, topk_ids
+    return topk_weights, topk_ids
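For reference, the routing step that fused_topk / fused_topk_v0_4_3 perform with fused CUDA ops (softmax over the gating logits, top-k selection, optional renormalization of the kept weights) can be approximated in plain PyTorch. This is an illustrative, unfused sketch only; it omits the token_expert_indicies buffer and makes no claim about matching the fused kernels exactly:

```python
import torch


def naive_topk_routing(gating_output: torch.Tensor, topk: int, renormalize: bool):
    # Softmax over the expert logits, then keep the top-k experts per token.
    scores = torch.softmax(gating_output, dim=-1, dtype=torch.float32)
    topk_weights, topk_ids = torch.topk(scores, topk, dim=-1)
    if renormalize:
        # Same renormalization as above: the kept weights sum to 1 per token.
        topk_weights = topk_weights / topk_weights.sum(dim=-1, keepdim=True)
    return topk_weights, topk_ids.to(torch.int32)


gating_output = torch.randn(4, 8)  # 4 tokens, 8 experts
weights, ids = naive_topk_routing(gating_output, topk=2, renormalize=True)
```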
sglang/srt/layers/logits_processor.py
@@ -1,7 +1,7 @@
 """Logits processing."""
 
 import dataclasses
-from typing import List
+from typing import List, Union
 
 import torch
 from torch import nn
@@ -31,6 +31,27 @@ class LogitProcessorOutput:
     decode_top_logprobs: List
 
 
+@dataclasses.dataclass
+class LogitsMetadata:
+    forward_mode: ForwardMode
+    extend_seq_lens: torch.Tensor
+    extend_start_loc: torch.Tensor
+
+    # For logprobs
+    return_logprob: bool
+    top_logprobs_nums: List[int]
+
+    @classmethod
+    def from_input_metadata(cls, input_metadata: InputMetadata):
+        return cls(
+            forward_mode=input_metadata.forward_mode,
+            extend_seq_lens=input_metadata.extend_seq_lens,
+            extend_start_loc=input_metadata.extend_start_loc,
+            return_logprob=input_metadata.return_logprob,
+            top_logprobs_nums=input_metadata.top_logprobs_nums,
+        )
+
+
 class LogitsProcessor(nn.Module):
     def __init__(self, config):
         super().__init__()
@@ -38,14 +59,14 @@ class LogitsProcessor(nn.Module):
         self.tp_size = get_tensor_model_parallel_world_size()
 
     def _get_normalized_prompt_logprobs(
-        self, prefill_token_logprobs, input_metadata: InputMetadata
+        self, prefill_token_logprobs, logits_metadata: LogitsMetadata
     ):
         logprobs_cumsum = torch.cumsum(
             prefill_token_logprobs, dim=0, dtype=torch.float32
         )
 
-        start = input_metadata.extend_start_loc.clone()
-        end = start + input_metadata.extend_seq_lens - 2
+        start = logits_metadata.extend_start_loc.clone()
+        end = start + logits_metadata.extend_seq_lens - 2
         start.clamp_(min=0, max=prefill_token_logprobs.shape[0] - 1)
         end.clamp_(min=0, max=prefill_token_logprobs.shape[0] - 1)
         sum_logp = (
@@ -54,17 +75,17 @@ class LogitsProcessor(nn.Module):
             + prefill_token_logprobs[start]
         )
         normalized_prompt_logprobs = sum_logp / (
-            (input_metadata.extend_seq_lens - 1).clamp(min=1)
+            (logits_metadata.extend_seq_lens - 1).clamp(min=1)
         )
 
         return normalized_prompt_logprobs
 
-    def _get_top_logprobs(self, all_logprobs, input_metadata: InputMetadata):
+    def _get_top_logprobs(self, all_logprobs, logits_metadata: LogitsMetadata):
         # TODO: vectorize the code below
-        if input_metadata.forward_mode == ForwardMode.DECODE:
+        if logits_metadata.forward_mode == ForwardMode.DECODE:
             decode_top_logprobs = []
             for i in range(all_logprobs.shape[0]):
-                k = input_metadata.top_logprobs_nums[i]
+                k = logits_metadata.top_logprobs_nums[i]
                 t = all_logprobs[i].topk(k)
                 v_cpu = t.values.tolist()
                 p_cpu = t.indices.tolist()
@@ -73,13 +94,13 @@ class LogitsProcessor(nn.Module):
         else:
             prefill_top_logprobs, decode_top_logprobs = [], []
             pt = 0
-            extend_seq_lens_cpu = input_metadata.extend_seq_lens.tolist()
+            extend_seq_lens_cpu = logits_metadata.extend_seq_lens.tolist()
             for i, extend_seq_len in enumerate(extend_seq_lens_cpu):
                 if extend_seq_len == 0:
                     prefill_top_logprobs.append([])
                     decode_top_logprobs.append([])
                     continue
-                k = input_metadata.top_logprobs_nums[i]
+                k = logits_metadata.top_logprobs_nums[i]
                 t = all_logprobs[pt : pt + extend_seq_len].topk(k)
                 vs_cpu = t.values.tolist()
                 ps_cpu = t.indices.tolist()
@@ -91,14 +112,24 @@ class LogitsProcessor(nn.Module):
 
         return prefill_top_logprobs, decode_top_logprobs
 
-    def forward(self, input_ids, hidden_states, weight, input_metadata: InputMetadata):
+    def forward(
+        self,
+        input_ids,
+        hidden_states,
+        weight,
+        logits_metadata: Union[LogitsMetadata, InputMetadata],
+    ):
+        if isinstance(logits_metadata, InputMetadata):
+            logits_metadata = LogitsMetadata.from_input_metadata(logits_metadata)
+        assert isinstance(logits_metadata, LogitsMetadata)
+
         # Get the last hidden states and last logits for the next token prediction
-        if input_metadata.forward_mode == ForwardMode.DECODE:
+        if logits_metadata.forward_mode == ForwardMode.DECODE:
             last_index = None
             last_hidden = hidden_states
         else:
             last_index = (
-                torch.cumsum(input_metadata.extend_seq_lens, dim=0, dtype=torch.long)
+                torch.cumsum(logits_metadata.extend_seq_lens, dim=0, dtype=torch.long)
                 - 1
             )
             last_hidden = hidden_states[last_index]
@@ -108,8 +139,13 @@ class LogitsProcessor(nn.Module):
             last_logits = tensor_model_parallel_all_gather(last_logits)
         last_logits = last_logits[:, : self.config.vocab_size]
 
+        if hasattr(self.config, "final_logit_softcapping"):
+            last_logits /= self.config.final_logit_softcapping
+            last_logits = torch.tanh(last_logits)
+            last_logits *= self.config.final_logit_softcapping
+
         # Return only last_logits if logprob is not requested
-        if not input_metadata.return_logprob:
+        if not logits_metadata.return_logprob:
             return LogitProcessorOutput(
                 next_token_logits=last_logits,
                 next_token_logprobs=None,
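The final_logit_softcapping branch above (defined by some model configs, e.g. Gemma 2, whose support is added in this release) bounds the logits to (-cap, cap) with a scaled tanh. A minimal standalone sketch of the same divide/tanh/multiply sequence, not taken from the package:

```python
import torch


def soft_cap(logits: torch.Tensor, cap: float) -> torch.Tensor:
    # Equivalent to the in-place sequence above: divide by the cap, tanh, rescale.
    return cap * torch.tanh(logits / cap)


logits = torch.tensor([-100.0, -5.0, 0.0, 5.0, 100.0])
print(soft_cap(logits, cap=30.0))  # all values now lie strictly within (-30, 30)
```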
@@ -120,7 +156,7 @@ class LogitsProcessor(nn.Module):
             )
         else:
             # When logprob is requested, compute the logits for all tokens.
-            if input_metadata.forward_mode == ForwardMode.DECODE:
+            if logits_metadata.forward_mode == ForwardMode.DECODE:
                 all_logits = last_logits
             else:
                 all_logits = torch.matmul(hidden_states, weight.T)
@@ -133,15 +169,15 @@ class LogitsProcessor(nn.Module):
             all_logprobs[:] = torch.nn.functional.log_softmax(all_logprobs, dim=-1)
 
             # Get the logprob of top-k tokens
-            return_top_logprob = any(x > 0 for x in input_metadata.top_logprobs_nums)
+            return_top_logprob = any(x > 0 for x in logits_metadata.top_logprobs_nums)
             if return_top_logprob:
                 prefill_top_logprobs, decode_top_logprobs = self._get_top_logprobs(
-                    all_logprobs, input_metadata
+                    all_logprobs, logits_metadata
                 )
             else:
                 prefill_top_logprobs = decode_top_logprobs = None
 
-            if input_metadata.forward_mode == ForwardMode.DECODE:
+            if logits_metadata.forward_mode == ForwardMode.DECODE:
                 return LogitProcessorOutput(
                     next_token_logits=last_logits,
                     next_token_logprobs=all_logprobs,
@@ -161,7 +197,7 @@ class LogitsProcessor(nn.Module):
             ]
 
             normalized_prompt_logprobs = self._get_normalized_prompt_logprobs(
-                prefill_token_logprobs, input_metadata
+                prefill_token_logprobs, logits_metadata
            )
 
             return LogitProcessorOutput(