sglang 0.4.1__py3-none-any.whl → 0.4.1.post2__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (41)
  1. sglang/bench_offline_throughput.py +1 -0
  2. sglang/bench_serving.py +11 -3
  3. sglang/lang/backend/openai.py +10 -0
  4. sglang/srt/configs/model_config.py +11 -2
  5. sglang/srt/constrained/xgrammar_backend.py +6 -0
  6. sglang/srt/layers/attention/__init__.py +0 -1
  7. sglang/srt/layers/attention/flashinfer_backend.py +54 -41
  8. sglang/srt/layers/attention/triton_ops/extend_attention.py +20 -14
  9. sglang/srt/layers/logits_processor.py +30 -2
  10. sglang/srt/layers/moe/fused_moe_triton/fused_moe.py +63 -30
  11. sglang/srt/layers/moe/topk.py +14 -0
  12. sglang/srt/layers/quantization/fp8.py +42 -2
  13. sglang/srt/layers/quantization/fp8_kernel.py +91 -18
  14. sglang/srt/layers/quantization/fp8_utils.py +8 -2
  15. sglang/srt/managers/io_struct.py +29 -8
  16. sglang/srt/managers/schedule_batch.py +22 -15
  17. sglang/srt/managers/schedule_policy.py +1 -1
  18. sglang/srt/managers/scheduler.py +71 -34
  19. sglang/srt/managers/session_controller.py +102 -27
  20. sglang/srt/managers/tokenizer_manager.py +95 -55
  21. sglang/srt/managers/tp_worker.py +7 -0
  22. sglang/srt/managers/tp_worker_overlap_thread.py +5 -0
  23. sglang/srt/model_executor/forward_batch_info.py +42 -3
  24. sglang/srt/model_executor/model_runner.py +4 -6
  25. sglang/srt/model_loader/loader.py +22 -11
  26. sglang/srt/models/gemma2.py +19 -0
  27. sglang/srt/models/llama.py +13 -2
  28. sglang/srt/models/llama_eagle.py +132 -0
  29. sglang/srt/openai_api/adapter.py +79 -2
  30. sglang/srt/openai_api/protocol.py +50 -0
  31. sglang/srt/sampling/sampling_params.py +9 -2
  32. sglang/srt/server.py +45 -39
  33. sglang/srt/server_args.py +17 -30
  34. sglang/srt/speculative/spec_info.py +19 -0
  35. sglang/srt/utils.py +62 -0
  36. sglang/version.py +1 -1
  37. {sglang-0.4.1.dist-info → sglang-0.4.1.post2.dist-info}/METADATA +5 -5
  38. {sglang-0.4.1.dist-info → sglang-0.4.1.post2.dist-info}/RECORD +41 -39
  39. {sglang-0.4.1.dist-info → sglang-0.4.1.post2.dist-info}/LICENSE +0 -0
  40. {sglang-0.4.1.dist-info → sglang-0.4.1.post2.dist-info}/WHEEL +0 -0
  41. {sglang-0.4.1.dist-info → sglang-0.4.1.post2.dist-info}/top_level.txt +0 -0
sglang/srt/layers/moe/fused_moe_triton/fused_moe.py

@@ -11,12 +11,17 @@ from typing import Any, Callable, Dict, List, Optional, Tuple
 import torch
 import triton
 import triton.language as tl
-from sgl_kernel import moe_align_block_size as sgl_moe_align_block_size
 from vllm import _custom_ops as ops
 
 from sglang.srt.layers.moe.topk import select_experts
 from sglang.srt.layers.quantization.fp8_kernel import per_token_group_quant_fp8
-from sglang.srt.utils import direct_register_custom_op, get_device_name
+from sglang.srt.utils import direct_register_custom_op, get_device_name, is_hip
+
+not_hip = False
+if not is_hip():
+    from sgl_kernel import moe_align_block_size as sgl_moe_align_block_size
+
+    not_hip = True
 
 logger = logging.getLogger(__name__)
 padding_size = 128 if bool(int(os.getenv("MOE_PADDING", "0"))) else 0
@@ -267,8 +272,14 @@ def moe_align_block_size(
         (max_num_m_blocks,), dtype=torch.int32, device=topk_ids.device
     )
     num_tokens_post_pad = torch.empty((1), dtype=torch.int32, device=topk_ids.device)
-    # FIXME(zhyncs)
-    if num_experts >= 256:
+    if not_hip and num_experts >= 224:
+        token_cnts_buffer = torch.empty(
+            (num_experts + 1) * num_experts, dtype=torch.int32, device=topk_ids.device
+        )
+        cumsum_buffer = torch.empty(
+            num_experts + 1, dtype=torch.int32, device=topk_ids.device
+        )
+
         sgl_moe_align_block_size(
             topk_ids,
             num_experts,
@@ -276,6 +287,8 @@ def moe_align_block_size(
             sorted_ids,
             expert_ids,
             num_tokens_post_pad,
+            token_cnts_buffer,
+            cumsum_buffer,
         )
     else:
         ops.moe_align_block_size(
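The sgl-kernel path is now taken only when the CUDA-only import succeeded (not_hip) and the layer is wide enough, and it passes two explicit scratch buffers. A rough sketch of the branch and the buffer shapes, with a made-up num_experts value:

import torch

def uses_sgl_kernel(num_experts: int, not_hip: bool) -> bool:
    # Mirrors the condition in the hunk above: CUDA build and >= 224 experts.
    return not_hip and num_experts >= 224

num_experts = 256  # illustrative value
if uses_sgl_kernel(num_experts, not_hip=True):
    # Scratch buffers handed to sgl_moe_align_block_size, sized as in the diff.
    token_cnts_buffer = torch.empty((num_experts + 1) * num_experts, dtype=torch.int32)
    cumsum_buffer = torch.empty(num_experts + 1, dtype=torch.int32)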
@@ -379,14 +392,25 @@ def invoke_fused_moe_kernel(
     )
 
 
-def get_config_file_name(E: int, N: int, dtype: Optional[str]) -> str:
+def get_config_file_name(
+    E: int, N: int, dtype: Optional[str], block_shape: Optional[int] = None
+) -> str:
     device_name = get_device_name().replace(" ", "_")
     dtype_selector = "" if not dtype else f",dtype={dtype}"
-    return f"E={E},N={N},device_name={device_name}{dtype_selector}.json"
+    block_shape_selector = (
+        "" if not block_shape or not all(block_shape) else f",block_shape={block_shape}"
+    )
+    return f"E={E},N={N},device_name={device_name}{dtype_selector}{block_shape_selector}.json"
 
 
 @functools.lru_cache
-def get_moe_configs(E: int, N: int, dtype: Optional[str]) -> Optional[Dict[int, Any]]:
+def get_moe_configs(
+    E: int,
+    N: int,
+    dtype: Optional[str],
+    block_n: Optional[int] = 0,
+    block_k: Optional[int] = 0,
+) -> Optional[Dict[int, Any]]:
     """
     Return optimized configurations for the fused MoE kernel.
 
@@ -398,7 +422,7 @@ get_moe_configs(E: int, N: int, dtype: Optional[str]) -> Optional[Dict[int,
 
     # First look up if an optimized configuration is available in the configs
    # directory
-    json_file_name = get_config_file_name(E, N, dtype)
+    json_file_name = get_config_file_name(E, N, dtype, [block_n, block_k])
 
     config_file_path = os.path.join(
         os.path.dirname(os.path.realpath(__file__)), "configs", json_file_name
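For reference, composing the tuned-config file name the same way the updated get_config_file_name does, with made-up shapes and a made-up device name:

E, N = 256, 7168                       # illustrative expert count and intermediate size
dtype = "fp8_w8a8"
block_shape = [128, 128]
device_name = "NVIDIA_H100_80GB_HBM3"  # get_device_name() would return the real name
dtype_selector = "" if not dtype else f",dtype={dtype}"
block_shape_selector = (
    "" if not block_shape or not all(block_shape) else f",block_shape={block_shape}"
)
print(f"E={E},N={N},device_name={device_name}{dtype_selector}{block_shape_selector}.json")
# -> E=256,N=7168,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8,block_shape=[128, 128].json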
@@ -429,25 +453,37 @@ def get_default_config(
     topk: int,
     dtype: Optional[str],
     is_marlin: bool,
+    block_shape: Optional[List[int]] = None,
 ) -> Dict[str, int]:
     if dtype == "fp8_w8a8":
-        config = {
-            "BLOCK_SIZE_M": 128,
-            "BLOCK_SIZE_N": 256,
-            "BLOCK_SIZE_K": 128,
-            "GROUP_SIZE_M": 32,
-            "num_warps": 8,
-            "num_stages": 4,
-        }
-        if M <= E:
+        if block_shape is None:
             config = {
-                "BLOCK_SIZE_M": 64,
-                "BLOCK_SIZE_N": 128,
+                "BLOCK_SIZE_M": 128,
+                "BLOCK_SIZE_N": 256,
                 "BLOCK_SIZE_K": 128,
-                "GROUP_SIZE_M": 1,
-                "num_warps": 4,
+                "GROUP_SIZE_M": 32,
+                "num_warps": 8,
                 "num_stages": 4,
             }
+            if M <= E:
+                config = {
+                    "BLOCK_SIZE_M": 64,
+                    "BLOCK_SIZE_N": 128,
+                    "BLOCK_SIZE_K": 128,
+                    "GROUP_SIZE_M": 1,
+                    "num_warps": 4,
+                    "num_stages": 4,
+                }
+        else:
+            # Block-wise quant: BLOCK_SIZE_K must be divisable by block_shape[1]
+            config = {
+                "BLOCK_SIZE_M": 64,
+                "BLOCK_SIZE_N": block_shape[0],
+                "BLOCK_SIZE_K": block_shape[1],
+                "GROUP_SIZE_M": 32,
+                "num_warps": 4,
+                "num_stages": 3,
+            }
     else:
         config = {
             "BLOCK_SIZE_M": 64,
@@ -483,7 +519,9 @@ def try_get_optimal_moe_config(
     else:
         # First try to load optimal config from the file
        E, _, N = w2_shape
-        configs = get_moe_configs(E, N, dtype)
+        block_n = block_shape[0] if block_shape else 0
+        block_k = block_shape[1] if block_shape else 0
+        configs = get_moe_configs(E, N, dtype, block_n, block_k)
 
         if configs:
             # If an optimal configuration map has been found, look up the
@@ -491,14 +529,9 @@
             config = configs[min(configs.keys(), key=lambda x: abs(x - M))]
         else:
             # Else use the default config
-            config = get_default_config(M, E, N, w1_shape[2], top_k, dtype, is_marlin)
-            # TODO(HandH1998): Optimize the configs of block-wise quant.
-            # NOTE(HandH1998): For block-wise quant,
-            # BLOCK_K must be divisable by block_shape[1]
-            # BLOCK_N and BLOCK_M has no requirements
-            if block_shape is not None:
-                config["BLOCK_SIZE_N"] = block_shape[0]
-                config["BLOCK_SIZE_K"] = block_shape[1]
+            config = get_default_config(
+                M, E, N, w1_shape[2], top_k, dtype, is_marlin, block_shape
+            )
     return config
 
 
sglang/srt/layers/moe/topk.py

@@ -1,3 +1,17 @@
+# Copyright 2024 SGLang Team
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+# ==============================================================================
+
 from typing import Callable, Optional
 
 import torch
sglang/srt/layers/quantization/fp8.py

@@ -28,7 +28,6 @@ from vllm.model_executor.layers.quantization.utils.w8a8_utils import (
 from vllm.model_executor.parameter import ModelWeightParameter, PerTensorScaleParameter
 
 from sglang.srt.layers.linear import LinearMethodBase, UnquantizedLinearMethod
-from sglang.srt.layers.moe.fused_moe_triton.fused_moe import padding_size
 from sglang.srt.layers.quantization.base_config import (
     QuantizationConfig,
     QuantizeMethodBase,
@@ -273,6 +272,19 @@ class Fp8LinearMethod(LinearMethodBase):
     def process_weights_after_loading(self, layer: Module) -> None:
         # Block quant doesn't need to process weights after loading
         if self.block_quant:
+            # If ROCm, normalize the weights and scales to e4m3fnuz
+            if is_hip():
+                # activation_scheme: dynamic
+                weight, weight_scale, _ = normalize_e4m3fn_to_e4m3fnuz(
+                    weight=layer.weight,
+                    weight_scale=layer.weight_scale_inv,
+                    input_scale=None,
+                )
+                layer.weight = torch.nn.Parameter(weight, require_grad=False)
+                layer.weight_scale_inv = torch.nn.Parameter(
+                    weight_scale, require_grad=False
+                )
+                layer.input_scale = None
             return
         layer.weight = torch.nn.Parameter(layer.weight.data, requires_grad=False)
         # If checkpoint not serialized fp8, quantize the weights.
@@ -370,7 +382,7 @@
                 weight=layer.weight,
                 block_size=self.quant_config.weight_block_size,
                 weight_scale=layer.weight_scale_inv,
-                input_scale=layer.input_scale,
+                input_scale=None,
                 bias=bias,
             )
 
@@ -548,8 +560,36 @@ class Fp8MoEMethod:
         layer.w2_input_scale = None
 
     def process_weights_after_loading(self, layer: Module) -> None:
+        from sglang.srt.layers.moe.fused_moe_triton.fused_moe import (
+            padding_size,  # Avoid circular import
+        )
+
         # Block quant doesn't need to process weights after loading
         if self.block_quant:
+            # If ROCm, normalize the weights and scales to e4m3fnuz
+            if is_hip():
+                # activation_scheme: dynamic
+                w13_weight, w13_weight_scale, _ = normalize_e4m3fn_to_e4m3fnuz(
+                    weight=layer.w13_weight,
+                    weight_scale=layer.w13_weight_scale_inv,
+                    input_scale=None,
+                )
+                w2_weight, w2_weight_scale, _ = normalize_e4m3fn_to_e4m3fnuz(
+                    weight=layer.w2_weight,
+                    weight_scale=layer.w2_weight_scale_inv,
+                    input_scale=None,
+                )
+                # Reset the parameter
+                layer.w13_weight = torch.nn.Parameter(w13_weight, requires_grad=False)
+                layer.w13_weight_scale_inv = torch.nn.Parameter(
+                    w13_weight_scale, requires_grad=False
+                )
+                layer.w13_input_scale = None
+                layer.w2_weight = torch.nn.Parameter(w2_weight, requires_grad=False)
+                layer.w2_weight_scale_inv = torch.nn.Parameter(
+                    w2_weight_scale, requires_grad=False
+                )
+                layer.w2_input_scale = None
             return
         # If checkpoint is fp16 or bfloat16, quantize in place.
         if not self.quant_config.is_checkpoint_fp8_serialized:
sglang/srt/layers/quantization/fp8_kernel.py

@@ -1,9 +1,34 @@
-from typing import List, Tuple
+# Copyright 2024 SGLang Team
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+# ==============================================================================
+
+import functools
+import json
+import logging
+import os
+from typing import Any, Dict, List, Optional, Tuple
 
 import torch
 import triton
 import triton.language as tl
 
+from sglang.srt.utils import get_device_name, is_hip
+
+is_hip_ = is_hip()
+fp8_type_ = torch.float8_e4m3fnuz if is_hip_ else torch.float8_e4m3fn
+
+logger = logging.getLogger(__name__)
+
 
 @triton.jit
 def _per_token_group_quant_fp8(
@@ -51,7 +76,7 @@ def per_token_group_quant_fp8(
     x: torch.Tensor,
     group_size: int,
     eps: float = 1e-10,
-    dtype: torch.dtype = torch.float8_e4m3fn,
+    dtype: torch.dtype = fp8_type_,
 ) -> Tuple[torch.Tensor, torch.Tensor]:
     """Function to perform per-token-group quantization on an input tensor `x`.
 
@@ -73,9 +98,13 @@
     assert x.is_contiguous(), "`x` is not contiguous"
 
     finfo = torch.finfo(dtype)
-    fp8_min = finfo.min
     fp8_max = finfo.max
 
+    if is_hip_:
+        fp8_max = 224.0
+
+    fp8_min = -fp8_max
+
     x_q = torch.empty_like(x, device=x.device, dtype=dtype)
     M = x.numel() // group_size
     N = group_size
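The 224.0 cap reflects the smaller dynamic range of the e4m3fnuz format used on ROCm. A small sketch of the resulting clamping bounds, with torch's finfo value shown for comparison (the is_hip_ flag is hard-coded here for illustration):

import torch

finfo = torch.finfo(torch.float8_e4m3fn)
fp8_max = finfo.max        # 448.0 for e4m3fn on CUDA
is_hip_ = False            # pretend this is a CUDA build for the sketch
if is_hip_:
    fp8_max = 224.0        # reduced range used for e4m3fnuz on ROCm
fp8_min = -fp8_max
print(fp8_min, fp8_max)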
@@ -191,6 +220,48 @@ def _w8a8_block_fp8_matmul(
     tl.store(c_ptrs, c, mask=c_mask)
 
 
+@functools.lru_cache
+def get_w8a8_block_fp8_configs(
+    N: int, K: int, block_n: int, block_k: int
+) -> Optional[Dict[int, Any]]:
+    """
+    Return optimized configurations for the w8a8 block fp8 kernel.
+
+    The return value will be a dictionary that maps an irregular grid of
+    batch sizes to configurations of the w8a8 block fp8 kernel. To evaluate the
+    kernel on a given batch size bs, the closest batch size in the grid should
+    be picked and the associated configuration chosen to invoke the kernel.
+    """
+
+    # First look up if an optimized configuration is available in the configs
+    # directory
+    device_name = get_device_name().replace(" ", "_")
+    json_file_name = f"N={N},K={K},device_name={device_name},dtype=fp8_w8a8,block_shape=[{block_n}, {block_k}].json"
+
+    config_file_path = os.path.join(
+        os.path.dirname(os.path.realpath(__file__)), "configs", json_file_name
+    )
+    if os.path.exists(config_file_path):
+        with open(config_file_path) as f:
+            logger.info(
+                "Using configuration from %s for W8A8 Block FP8 kernel.",
+                config_file_path,
+            )
+            # If a configuration has been found, return it
+            return {int(key): val for key, val in json.load(f).items()}
+
+    # If no optimized configuration is available, we will use the default
+    # configuration
+    logger.warning(
+        (
+            "Using default W8A8 Block FP8 kernel config. Performance might be sub-optimal! "
+            "Config file not found at %s"
+        ),
+        config_file_path,
+    )
+    return None
+
+
 def w8a8_block_fp8_matmul(
     A: torch.Tensor,
     B: torch.Tensor,
@@ -231,17 +302,22 @@ def w8a8_block_fp8_matmul(
     C_shape = A.shape[:-1] + (N,)
     C = A.new_empty(C_shape, dtype=output_dtype)
 
-    # TODO(HandH1998):
-    # BLOCK_SIZE_M, BLOCK_SIZE_K, BLOCK_SIZE_N can be optimized.
-    # BLOCK_SIZE_K must be divisable by block_k
-    # BLOCK_SIZE_N and BLOCK_SIZE_M has no requirements
-    BLOCK_SIZE_M = 128
-    if M < BLOCK_SIZE_M:
-        BLOCK_SIZE_M = triton.next_power_of_2(M)
-        BLOCK_SIZE_M = max(BLOCK_SIZE_M, 16)
-    BLOCK_SIZE_K = block_k
-    assert block_k % BLOCK_SIZE_K == 0
-    BLOCK_SIZE_N = block_n
+    configs = get_w8a8_block_fp8_configs(N, K, block_size[0], block_size[1])
+    if configs:
+        # If an optimal configuration map has been found, look up the
+        # optimal config
+        config = configs[min(configs.keys(), key=lambda x: abs(x - M))]
+    else:
+        # Default config
+        # Block-wise quant: BLOCK_SIZE_K must be divisable by block_size[1]
+        config = {
+            "BLOCK_SIZE_M": 64,
+            "BLOCK_SIZE_N": block_size[0],
+            "BLOCK_SIZE_K": block_size[1],
+            "GROUP_SIZE_M": 32,
+            "num_warps": 4,
+            "num_stages": 3,
+        }
 
     def grid(META):
         return (
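The same nearest-batch-size lookup is used both here and in try_get_optimal_moe_config above: the loaded config map is keyed by tuned batch sizes and the entry closest to the current M wins. A toy illustration with made-up keys:

configs = {1: "cfg_a", 16: "cfg_b", 64: "cfg_c"}   # keys are the tuned batch sizes
M = 48
chosen = configs[min(configs.keys(), key=lambda x: abs(x - M))]
print(chosen)  # "cfg_c", since 64 is the tuned batch size closest to 48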
@@ -269,10 +345,7 @@
         As.stride(-1),
         Bs.stride(1),
         Bs.stride(0),
-        BLOCK_SIZE_M=BLOCK_SIZE_M,
-        BLOCK_SIZE_N=BLOCK_SIZE_N,
-        BLOCK_SIZE_K=BLOCK_SIZE_K,
-        GROUP_SIZE_M=8,
+        **config,
     )
 
     return C
sglang/srt/layers/quantization/fp8_utils.py

@@ -7,6 +7,9 @@ from sglang.srt.layers.quantization.fp8_kernel import (
     per_token_group_quant_fp8,
     w8a8_block_fp8_matmul,
 )
+from sglang.srt.utils import is_hip
+
+is_hip_ = is_hip()
 
 
 def normalize_e4m3fn_to_e4m3fnuz(
@@ -63,8 +66,11 @@ def input_to_float8(
     finfo = torch.finfo(dtype)
     min_val, max_val = x.aminmax()
     amax = torch.maximum(min_val.abs(), max_val.abs()).clamp(min=1e-12)
-    scale = finfo.max / amax
-    x_scl_sat = (x * scale).clamp(min=finfo.min, max=finfo.max)
+    fp8_max = finfo.max
+    if is_hip_:
+        fp8_max = 224.0
+    scale = fp8_max / amax
+    x_scl_sat = (x * scale).clamp(min=-fp8_max, max=fp8_max)
     return x_scl_sat.to(dtype).contiguous(), scale.float().reciprocal()
 
 
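A hypothetical round trip through the updated symmetric scaling, kept outside the library to show the effect of the clamp (all values illustrative; the 224.0 cap would apply on ROCm):

import torch

x = torch.randn(4, 8)
amax = x.abs().max().clamp(min=1e-12)
fp8_max = torch.finfo(torch.float8_e4m3fn).max   # 448.0 on CUDA; 224.0 used on ROCm
scale = fp8_max / amax
x_q = (x * scale).clamp(min=-fp8_max, max=fp8_max).to(torch.float8_e4m3fn)
x_deq = x_q.to(torch.float32) / scale            # approximate reconstruction of x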
sglang/srt/managers/io_struct.py

@@ -21,10 +21,20 @@ from dataclasses import dataclass
 from enum import Enum
 from typing import Dict, List, Optional, Tuple, Union
 
+import torch
+
 from sglang.srt.managers.schedule_batch import BaseFinishReason
 from sglang.srt.sampling.sampling_params import SamplingParams
 
 
+@dataclass
+class SessionParams:
+    id: Optional[str] = None
+    rid: Optional[str] = None
+    offset: Optional[int] = None
+    replace: Optional[bool] = None
+
+
 @dataclass
 class GenerateReqInput:
     # The input prompt. It can be a single prompt or a batch of prompts.
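With the old (session_id, session_rid) tuple replaced by a dict, a generate request that continues a session might look roughly like the sketch below. The session id and offset values are made up, and the exact server-side semantics of offset/replace are not documented in this diff; the keys simply mirror the SessionParams dataclass above:

payload = {
    "text": "Continue the story.",
    "sampling_params": {"max_new_tokens": 64},
    "session_params": {"id": "my-session", "rid": None, "offset": 0, "replace": False},
}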
@@ -56,10 +66,8 @@ class GenerateReqInput:
     # LoRA related
     lora_path: Optional[Union[List[Optional[str]], Optional[str]]] = None
 
-    # Session id info for continual prompting
-    session: Optional[
-        Union[List[Tuple[str, Optional[str]]], Tuple[str, Optional[str]]]
-    ] = None
+    # Session info for continual prompting
+    session_params: Optional[Union[List[Dict], Dict]] = None
 
     def normalize_batch_and_arguments(self):
         if (
@@ -221,9 +229,8 @@ class TokenizedGenerateReqInput:
     # The input embeds
     input_embeds: Optional[Union[List[List[List[float]]], List[List[float]]]] = None
 
-    # Session id info for continual prompting
-    session_id: Optional[str] = None
-    session_rid: Optional[str] = None
+    # Session info for continual prompting
+    session_params: Optional[SessionParams] = None
 
 
 @dataclass
@@ -407,6 +414,18 @@ class UpdateWeightsFromDistributedReqOutput:
     message: str
 
 
+@dataclass
+class UpdateWeightsFromTensorReqInput:
+    name: str
+    tensor: torch.Tensor
+
+
+@dataclass
+class UpdateWeightsFromTensorReqOutput:
+    success: bool
+    message: str
+
+
 @dataclass
 class InitWeightsUpdateGroupReqInput:
     # The master address
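The new structs carry a single named tensor. Constructing them directly looks like the sketch below; the parameter name and tensor contents are made up, and how the scheduler consumes these requests is outside this hunk:

import torch
from sglang.srt.managers.io_struct import (
    UpdateWeightsFromTensorReqInput,
    UpdateWeightsFromTensorReqOutput,
)

req = UpdateWeightsFromTensorReqInput(
    name="model.layers.0.mlp.gate_proj.weight",  # hypothetical parameter name
    tensor=torch.zeros(8, 16),
)
ack = UpdateWeightsFromTensorReqOutput(success=True, message="weights updated")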
@@ -454,6 +473,7 @@ class ProfileReq(Enum):
 @dataclass
 class OpenSessionReqInput:
     capacity_of_str_len: int
+    session_id: Optional[str] = None
 
 
 @dataclass
@@ -463,4 +483,5 @@ class CloseSessionReqInput:
 
 @dataclass
 class OpenSessionReqOutput:
-    session_id: str
+    session_id: Optional[str]
+    success: bool
sglang/srt/managers/schedule_batch.py

@@ -29,7 +29,7 @@ ScheduleBatch -> ModelWorkerBatch -> ForwardBatch
 
 import dataclasses
 import logging
-from typing import List, Optional, Tuple, Union
+from typing import List, Optional, Set, Tuple, Union
 
 import numpy as np
 import torch
@@ -209,6 +209,7 @@ class Req:
         lora_path: Optional[str] = None,
         input_embeds: Optional[List[List[float]]] = None,
         session_id: Optional[str] = None,
+        eos_token_ids: Optional[Set[int]] = None,
     ):
         # Input and output info
         self.rid = rid
@@ -236,6 +237,7 @@
         self.finished_reason = None
         self.to_abort = False
         self.stream = stream
+        self.eos_token_ids = eos_token_ids
 
         # For incremental decoding
         # ----- | --------- read_ids -------|
@@ -395,18 +397,23 @@
 
         last_token_id = self.output_ids[-1]
 
-        matched_eos = False
-
-        # Check stop token ids
-        if self.sampling_params.stop_token_ids:
-            matched_eos = last_token_id in self.sampling_params.stop_token_ids
-        if self.tokenizer is not None:
-            matched_eos |= last_token_id == self.tokenizer.eos_token_id
-            if self.tokenizer.additional_stop_token_ids:
-                matched_eos |= last_token_id in self.tokenizer.additional_stop_token_ids
-        if matched_eos and not self.sampling_params.ignore_eos:
-            self.finished_reason = FINISH_MATCHED_TOKEN(matched=last_token_id)
-            return
+        if not self.sampling_params.ignore_eos:
+            matched_eos = False
+
+            # Check stop token ids
+            if self.sampling_params.stop_token_ids:
+                matched_eos = last_token_id in self.sampling_params.stop_token_ids
+            if self.eos_token_ids:
+                matched_eos |= last_token_id in self.eos_token_ids
+            if self.tokenizer is not None:
+                matched_eos |= last_token_id == self.tokenizer.eos_token_id
+                if self.tokenizer.additional_stop_token_ids:
+                    matched_eos |= (
+                        last_token_id in self.tokenizer.additional_stop_token_ids
+                    )
+            if matched_eos:
+                self.finished_reason = FINISH_MATCHED_TOKEN(matched=last_token_id)
+                return
 
         # Check stop strings
         if len(self.sampling_params.stop_strs) > 0:
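Condensed, the new control flow skips all EOS matching when ignore_eos is set and folds the per-request eos_token_ids into the same check. A standalone sketch of that logic (not the actual Req method):

def eos_matched(last_token_id, stop_token_ids, eos_token_ids, tokenizer, ignore_eos):
    if ignore_eos:
        return False
    matched = bool(stop_token_ids) and last_token_id in stop_token_ids
    if eos_token_ids:
        matched |= last_token_id in eos_token_ids
    if tokenizer is not None:
        matched |= last_token_id == tokenizer.eos_token_id
        if getattr(tokenizer, "additional_stop_token_ids", None):
            matched |= last_token_id in tokenizer.additional_stop_token_ids
    return matched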
@@ -836,8 +843,8 @@ class ScheduleBatch:
         # TODO (lianmin): Revisit this. It should be seq_len - 1
         self.extend_logprob_start_lens.extend([0] * running_bs)
 
-    def check_decode_mem(self):
-        bs = len(self.reqs)
+    def check_decode_mem(self, buf_multiplier=1):
+        bs = len(self.reqs) * buf_multiplier
         if self.token_to_kv_pool.available_size() >= bs:
             return True
 
sglang/srt/managers/schedule_policy.py

@@ -248,7 +248,7 @@ class PrefillAdder:
         self.can_run_list.append(req)
 
         self._prefill_one_req(
-            len(req.prefix_indices),
+            0,
             req.extend_input_len,
             (
                 min(req.sampling_params.max_new_tokens, CLIP_MAX_NEW_TOKENS_ESTIMATION)