sglang 0.4.6.post1__py3-none-any.whl → 0.4.6.post3__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (119)
  1. sglang/bench_one_batch.py +3 -11
  2. sglang/bench_serving.py +149 -1
  3. sglang/check_env.py +3 -3
  4. sglang/lang/chat_template.py +44 -0
  5. sglang/srt/configs/__init__.py +4 -0
  6. sglang/srt/configs/deepseekvl2.py +3 -0
  7. sglang/srt/configs/device_config.py +1 -1
  8. sglang/srt/configs/internvl.py +696 -0
  9. sglang/srt/configs/janus_pro.py +3 -0
  10. sglang/srt/configs/kimi_vl.py +38 -0
  11. sglang/srt/configs/kimi_vl_moonvit.py +32 -0
  12. sglang/srt/configs/model_config.py +32 -0
  13. sglang/srt/constrained/xgrammar_backend.py +11 -19
  14. sglang/srt/conversation.py +151 -3
  15. sglang/srt/disaggregation/decode.py +4 -1
  16. sglang/srt/disaggregation/mini_lb.py +74 -23
  17. sglang/srt/disaggregation/mooncake/conn.py +9 -18
  18. sglang/srt/disaggregation/nixl/conn.py +241 -71
  19. sglang/srt/disaggregation/utils.py +44 -1
  20. sglang/srt/distributed/device_communicators/custom_all_reduce.py +1 -8
  21. sglang/srt/distributed/device_communicators/npu_communicator.py +39 -0
  22. sglang/srt/distributed/device_communicators/pynccl.py +2 -1
  23. sglang/srt/distributed/device_communicators/shm_broadcast.py +2 -1
  24. sglang/srt/distributed/parallel_state.py +22 -1
  25. sglang/srt/entrypoints/engine.py +58 -24
  26. sglang/srt/entrypoints/http_server.py +28 -1
  27. sglang/srt/entrypoints/verl_engine.py +3 -2
  28. sglang/srt/function_call_parser.py +97 -0
  29. sglang/srt/hf_transformers_utils.py +22 -1
  30. sglang/srt/layers/attention/cutlass_mla_backend.py +1 -1
  31. sglang/srt/layers/attention/flashattention_backend.py +146 -50
  32. sglang/srt/layers/attention/flashinfer_backend.py +129 -94
  33. sglang/srt/layers/attention/flashinfer_mla_backend.py +88 -30
  34. sglang/srt/layers/attention/flashmla_backend.py +3 -0
  35. sglang/srt/layers/attention/merge_state.py +46 -0
  36. sglang/srt/layers/attention/triton_ops/merge_state.py +96 -0
  37. sglang/srt/layers/attention/vision.py +290 -163
  38. sglang/srt/layers/dp_attention.py +5 -2
  39. sglang/srt/layers/moe/ep_moe/kernels.py +342 -7
  40. sglang/srt/layers/moe/ep_moe/layer.py +120 -1
  41. sglang/srt/layers/moe/ep_moe/token_dispatcher.py +98 -57
  42. sglang/srt/layers/moe/fused_moe_triton/configs/E=128,N=192,device_name=NVIDIA_A800-SXM4-80GB.json +146 -0
  43. sglang/srt/layers/moe/fused_moe_triton/configs/E=128,N=384,device_name=NVIDIA_H100_80GB_HBM3.json +146 -0
  44. sglang/srt/layers/moe/fused_moe_triton/configs/E=128,N=768,device_name=NVIDIA_A800-SXM4-80GB.json +146 -0
  45. sglang/srt/layers/moe/fused_moe_triton/configs/E=128,N=768,device_name=NVIDIA_H100_80GB_HBM3.json +146 -0
  46. sglang/srt/layers/moe/fused_moe_triton/configs/E=264,N=256,device_name=NVIDIA_B200,dtype=fp8_w8a8,block_shape=[128, 128].json +146 -0
  47. sglang/srt/layers/moe/fused_moe_triton/configs/E=272,N=128,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8,block_shape=[128, 128].json +146 -0
  48. sglang/srt/layers/moe/fused_moe_triton/fused_moe.py +10 -5
  49. sglang/srt/layers/quantization/__init__.py +2 -2
  50. sglang/srt/layers/quantization/compressed_tensors/compressed_tensors_moe.py +2 -4
  51. sglang/srt/layers/quantization/compressed_tensors/schemes/compressed_tensors_w8a8_fp8.py +2 -1
  52. sglang/srt/layers/quantization/deep_gemm.py +6 -1
  53. sglang/srt/layers/quantization/fp8.py +108 -95
  54. sglang/srt/layers/quantization/fp8_kernel.py +79 -60
  55. sglang/srt/layers/quantization/fp8_utils.py +71 -23
  56. sglang/srt/layers/quantization/kv_cache.py +3 -10
  57. sglang/srt/layers/quantization/utils.py +0 -5
  58. sglang/srt/layers/quantization/w8a8_fp8.py +8 -10
  59. sglang/srt/layers/utils.py +35 -0
  60. sglang/srt/lora/layers.py +35 -9
  61. sglang/srt/lora/lora_manager.py +81 -35
  62. sglang/srt/managers/cache_controller.py +115 -119
  63. sglang/srt/managers/data_parallel_controller.py +52 -34
  64. sglang/srt/managers/io_struct.py +10 -0
  65. sglang/srt/managers/multimodal_processors/base_processor.py +5 -0
  66. sglang/srt/managers/multimodal_processors/internvl.py +232 -0
  67. sglang/srt/managers/multimodal_processors/kimi_vl.py +73 -0
  68. sglang/srt/managers/schedule_batch.py +44 -16
  69. sglang/srt/managers/schedule_policy.py +11 -5
  70. sglang/srt/managers/scheduler.py +291 -72
  71. sglang/srt/managers/scheduler_output_processor_mixin.py +1 -1
  72. sglang/srt/managers/tokenizer_manager.py +24 -13
  73. sglang/srt/managers/tp_worker.py +60 -28
  74. sglang/srt/managers/tp_worker_overlap_thread.py +9 -3
  75. sglang/srt/mem_cache/chunk_cache.py +2 -0
  76. sglang/srt/mem_cache/memory_pool.py +70 -36
  77. sglang/srt/model_executor/cuda_graph_runner.py +82 -19
  78. sglang/srt/model_executor/forward_batch_info.py +31 -1
  79. sglang/srt/model_executor/model_runner.py +159 -90
  80. sglang/srt/model_loader/loader.py +18 -11
  81. sglang/srt/models/clip.py +4 -4
  82. sglang/srt/models/deepseek_janus_pro.py +1 -1
  83. sglang/srt/models/deepseek_nextn.py +2 -277
  84. sglang/srt/models/deepseek_v2.py +132 -37
  85. sglang/srt/models/gemma3_mm.py +1 -1
  86. sglang/srt/models/internlm2.py +3 -0
  87. sglang/srt/models/internvl.py +670 -0
  88. sglang/srt/models/kimi_vl.py +308 -0
  89. sglang/srt/models/kimi_vl_moonvit.py +639 -0
  90. sglang/srt/models/llama.py +93 -31
  91. sglang/srt/models/llama4.py +54 -7
  92. sglang/srt/models/llama_eagle.py +4 -1
  93. sglang/srt/models/llama_eagle3.py +4 -1
  94. sglang/srt/models/minicpmv.py +1 -1
  95. sglang/srt/models/mllama.py +1 -1
  96. sglang/srt/models/phi3_small.py +16 -2
  97. sglang/srt/models/qwen2_5_vl.py +8 -4
  98. sglang/srt/models/qwen2_moe.py +8 -3
  99. sglang/srt/models/qwen2_vl.py +4 -16
  100. sglang/srt/models/qwen3_moe.py +8 -3
  101. sglang/srt/models/xiaomi_mimo.py +171 -0
  102. sglang/srt/openai_api/adapter.py +58 -62
  103. sglang/srt/openai_api/protocol.py +38 -16
  104. sglang/srt/reasoning_parser.py +2 -2
  105. sglang/srt/sampling/sampling_batch_info.py +54 -2
  106. sglang/srt/sampling/sampling_params.py +2 -0
  107. sglang/srt/server_args.py +93 -24
  108. sglang/srt/speculative/eagle_worker.py +3 -2
  109. sglang/srt/utils.py +123 -10
  110. sglang/test/runners.py +4 -0
  111. sglang/test/test_block_fp8.py +2 -2
  112. sglang/test/test_deepep_utils.py +219 -0
  113. sglang/test/test_utils.py +32 -1
  114. sglang/version.py +1 -1
  115. {sglang-0.4.6.post1.dist-info → sglang-0.4.6.post3.dist-info}/METADATA +18 -9
  116. {sglang-0.4.6.post1.dist-info → sglang-0.4.6.post3.dist-info}/RECORD +119 -99
  117. {sglang-0.4.6.post1.dist-info → sglang-0.4.6.post3.dist-info}/WHEEL +1 -1
  118. {sglang-0.4.6.post1.dist-info → sglang-0.4.6.post3.dist-info}/licenses/LICENSE +0 -0
  119. {sglang-0.4.6.post1.dist-info → sglang-0.4.6.post3.dist-info}/top_level.txt +0 -0
sglang/srt/models/xiaomi_mimo.py (new file)
@@ -0,0 +1,171 @@
+ # Adapted from qwen2.py
+
+ from functools import partial
+ from typing import Any, Dict, Iterable, Optional, Tuple
+
+ import torch
+ from torch import nn
+
+ from sglang.srt.distributed import (
+     get_tensor_model_parallel_rank,
+     get_tensor_model_parallel_world_size,
+     split_tensor_along_last_dim,
+     tensor_model_parallel_all_gather,
+ )
+ from sglang.srt.layers.layernorm import RMSNorm
+ from sglang.srt.layers.linear import QKVParallelLinear, RowParallelLinear
+ from sglang.srt.layers.logits_processor import LogitsProcessor
+ from sglang.srt.layers.pooler import Pooler, PoolingType
+ from sglang.srt.layers.quantization.base_config import QuantizationConfig
+ from sglang.srt.layers.radix_attention import RadixAttention
+ from sglang.srt.layers.rotary_embedding import get_rope
+ from sglang.srt.layers.vocab_parallel_embedding import ParallelLMHead
+ from sglang.srt.model_executor.forward_batch_info import ForwardBatch
+ from sglang.srt.model_loader.weight_utils import default_weight_loader
+ from sglang.srt.models.qwen2 import Qwen2DecoderLayer, Qwen2MLP, Qwen2Model
+ from sglang.srt.utils import add_prefix
+
+ MiMoConfig = None
+
+
+ class MiMoModel(Qwen2Model):
+     def __init__(
+         self,
+         config: MiMoConfig,
+         quant_config: Optional[QuantizationConfig] = None,
+         prefix: str = "",
+     ) -> None:
+         super().__init__(
+             config=config,
+             quant_config=quant_config,
+             prefix=prefix,
+             decoder_layer_type=Qwen2DecoderLayer,
+         )
+
+
+ class MiMoForCausalLM(nn.Module):
+     # BitandBytes specific attributes
+     default_bitsandbytes_target_modules = [
+         ".gate_proj.",
+         ".down_proj.",
+         ".up_proj.",
+         ".q_proj.",
+         ".k_proj.",
+         ".v_proj.",
+         ".o_proj.",
+     ]
+     bitsandbytes_stacked_params_mapping = {
+         # shard_name, weight_name, index
+         "q_proj": ("qkv_proj", 0),
+         "k_proj": ("qkv_proj", 1),
+         "v_proj": ("qkv_proj", 2),
+         "gate_proj": ("gate_up_proj", 0),
+         "up_proj": ("gate_up_proj", 1),
+     }
+
+     def __init__(
+         self,
+         config: MiMoConfig,
+         quant_config: Optional[QuantizationConfig] = None,
+         prefix: str = "",
+     ) -> None:
+         super().__init__()
+         self.config = config
+         self.quant_config = quant_config
+         self.model = MiMoModel(
+             config, quant_config=quant_config, prefix=add_prefix("model", prefix)
+         )
+         if config.tie_word_embeddings:
+             self.lm_head = self.model.embed_tokens
+         else:
+             self.lm_head = ParallelLMHead(
+                 config.vocab_size,
+                 config.hidden_size,
+                 quant_config=quant_config,
+                 prefix=add_prefix("lm_head", prefix),
+             )
+         self.logits_processor = LogitsProcessor(config)
+         self.pooler = Pooler(pooling_type=PoolingType.LAST, normalize=True)
+
+     def get_input_embeddings(self, input_ids: torch.Tensor) -> torch.Tensor:
+         return self.model.get_input_embeddings(input_ids)
+
+     @torch.no_grad()
+     def forward(
+         self,
+         input_ids: torch.Tensor,
+         positions: torch.Tensor,
+         forward_batch: ForwardBatch,
+         input_embeds: torch.Tensor = None,
+         get_embedding: bool = False,
+     ) -> torch.Tensor:
+         hidden_states = self.model(input_ids, positions, forward_batch, input_embeds)
+         if not get_embedding:
+             return self.logits_processor(
+                 input_ids, hidden_states, self.lm_head, forward_batch
+             )
+         else:
+             return self.pooler(hidden_states, forward_batch)
+
+     def load_weights(self, weights: Iterable[Tuple[str, torch.Tensor]]):
+         stacked_params_mapping = [
+             # (param_name, shard_name, shard_id)
+             ("qkv_proj", "q_proj", "q"),
+             ("qkv_proj", "k_proj", "k"),
+             ("qkv_proj", "v_proj", "v"),
+             ("gate_up_proj", "gate_proj", 0),
+             ("gate_up_proj", "up_proj", 1),
+         ]
+
+         params_dict = dict(self.named_parameters())
+         for name, loaded_weight in weights:
+             if (
+                 "rotary_emb.inv_freq" in name
+                 or "projector" in name
+                 or "mtp_layers" in name
+             ):
+                 continue
+             if "rotary_emb.cos_cached" in name or "rotary_emb.sin_cached" in name:
+                 # Models trained using ColossalAI may include these tensors in
+                 # the checkpoint. Skip them.
+                 continue
+             if self.config.tie_word_embeddings and "lm_head.weight" in name:
+                 continue
+             if name.startswith("model.vision_tower") and name not in params_dict:
+                 continue
+
+             for param_name, weight_name, shard_id in stacked_params_mapping:
+                 if weight_name not in name:
+                     continue
+                 name = name.replace(weight_name, param_name)
+                 # Skip loading extra bias for GPTQ models.
+                 if name.endswith(".bias") and name not in params_dict:
+                     continue
+                 param = params_dict[name]
+                 weight_loader = param.weight_loader
+                 weight_loader(param, loaded_weight, shard_id)
+                 break
+             else:
+                 # Skip loading extra bias for GPTQ models.
+                 if name.endswith(".bias") and name not in params_dict:
+                     continue
+                 param = params_dict[name]
+                 weight_loader = getattr(param, "weight_loader", default_weight_loader)
+                 weight_loader(param, loaded_weight)
+
+     def get_embed_and_head(self):
+         return self.model.embed_tokens.weight, self.lm_head.weight
+
+     def set_embed_and_head(self, embed, head):
+         del self.model.embed_tokens.weight
+         del self.lm_head.weight
+         self.model.embed_tokens.weight = embed
+         self.lm_head.weight = head
+         torch.cuda.empty_cache()
+         torch.cuda.synchronize()
+
+     def load_kv_cache_scales(self, quantization_param_path: str) -> None:
+         self.model.load_kv_cache_scales(quantization_param_path)
+
+
+ EntryClass = MiMoForCausalLM
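
The new file registers MiMoForCausalLM as an sglang entry class on top of the Qwen2 decoder stack, so Xiaomi MiMo checkpoints load like any other dense model. A minimal serving sketch (the checkpoint name, port, and prompt below are illustrative and not part of this diff):

    # Shell: python -m sglang.launch_server --model-path XiaomiMiMo/MiMo-7B-RL --port 30000
    import openai

    client = openai.Client(base_url="http://127.0.0.1:30000/v1", api_key="EMPTY")
    resp = client.chat.completions.create(
        model="default",
        messages=[{"role": "user", "content": "Summarize multi-token prediction in one line."}],
        max_tokens=64,
    )
    print(resp.choices[0].message.content)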
sglang/srt/openai_api/adapter.py
@@ -14,6 +14,7 @@
  """Conversion between OpenAI APIs and native SRT APIs"""

  import asyncio
+ import base64
  import json
  import logging
  import os
@@ -36,6 +37,7 @@ from sglang.srt.conversation import (
      chat_template_exists,
      generate_chat_conv,
      generate_embedding_convs,
+     get_conv_template_by_model_path,
      register_conv_template,
  )
  from sglang.srt.function_call_parser import FunctionCallParser
@@ -163,10 +165,14 @@ def load_chat_template_for_openai_api(tokenizer_manager, chat_template_arg, mode
      else:
          chat_template_name = chat_template_arg

- # Check chat-template
- # TODO:
- # 1. Do not import any code from sglang.lang
- # 2. For VLM, when chat_template_arg is None, set it automatically by guessing from model_path.
+
+ def guess_chat_template_name_from_model_path(model_path):
+     global chat_template_name
+     chat_template_name = get_conv_template_by_model_path(model_path)
+     if chat_template_name is not None:
+         logger.info(
+             f"Infer the chat template name from the model path and obtain the result: {chat_template_name}."
+         )


  async def v1_files_create(
@@ -523,6 +529,7 @@
          "temperature": request.temperature,
          "max_new_tokens": request.max_tokens,
          "min_new_tokens": request.min_tokens,
+         "thinking_budget": request.thinking_budget,
          "stop": request.stop,
          "stop_token_ids": request.stop_token_ids,
          "top_p": request.top_p,
@@ -894,6 +901,24 @@ async def v1_completions(tokenizer_manager, raw_request: Request):
      return response


+ def _get_enable_thinking_from_request(request_obj):
+     """Extracts the 'enable_thinking' flag from request chat_template_kwargs.
+
+     Args:
+         request_obj: The request object (or an item from a list of requests).
+
+     Returns:
+         The boolean value of 'enable_thinking' if found and not True, otherwise True.
+     """
+     if (
+         hasattr(request_obj, "chat_template_kwargs")
+         and request_obj.chat_template_kwargs
+         and request_obj.chat_template_kwargs.get("enable_thinking") is not None
+     ):
+         return request_obj.chat_template_kwargs.get("enable_thinking")
+     return True
+
+
  def v1_chat_generate_request(
      all_requests: List[ChatCompletionRequest],
      tokenizer_manager,
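
This helper centralizes the enable_thinking lookup that v1_chat_generate_response previously duplicated for list and single requests. On the client side the flag still travels inside chat_template_kwargs; a hedged sketch of a raw request (server URL and model name are placeholders):

    import requests

    payload = {
        "model": "default",
        "messages": [{"role": "user", "content": "What is 2 + 2?"}],
        # Disable the model's thinking phase for this request.
        "chat_template_kwargs": {"enable_thinking": False},
    }
    resp = requests.post("http://127.0.0.1:30000/v1/chat/completions", json=payload)
    print(resp.json()["choices"][0]["message"]["content"])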
@@ -943,47 +968,23 @@

      if chat_template_name is None:
          openai_compatible_messages = []
-         if (
-             tools
-             and tokenizer_manager.server_args.tool_call_parser == "deepseekv3"
-         ):
-             # add function call prompt to deepseekv3
-             openai_compatible_messages.append(
-                 {
-                     "role": "system",
-                     "content": """You are a helpful Assistant.
- ## Tools
- ### Function
- You have the following functions available:
- """
-                     + "".join(
-                         [
-                             f"""
- - `{tool['name']}`:
- ```json
- {json.dumps(tool)}
- ```
- """
-                             for tool in tools
-                         ]
-                     ),
-                 }
-             )

          for message in request.messages:
              if message.content is None:
                  message.content = ""
-             if isinstance(message.content, str):
-                 openai_compatible_messages.append(
-                     {"role": message.role, "content": message.content}
-                 )
+             msg_dict = message.dict()
+             if isinstance(msg_dict.get("content"), list):
+                 for chunk in msg_dict["content"]:
+                     if isinstance(chunk, dict) and chunk.get("type") == "text":
+                         new_msg = msg_dict.copy()
+                         new_msg["content"] = chunk["text"]
+                         new_msg = {
+                             k: v for k, v in new_msg.items() if v is not None
+                         }
+                         openai_compatible_messages.append(new_msg)
              else:
-                 content_list = message.dict()["content"]
-                 for content in content_list:
-                     if content["type"] == "text":
-                         openai_compatible_messages.append(
-                             {"role": message.role, "content": content["text"]}
-                         )
+                 msg_dict = {k: v for k, v in msg_dict.items() if v is not None}
+                 openai_compatible_messages.append(msg_dict)
          if (
              openai_compatible_messages
              and openai_compatible_messages[-1]["role"] == "assistant"
@@ -1099,8 +1100,9 @@

          sampling_params = {
              "temperature": request.temperature,
-             "max_new_tokens": request.max_tokens,
+             "max_new_tokens": request.max_tokens or request.max_completion_tokens,
              "min_new_tokens": request.min_tokens,
+             "thinking_budget": request.thinking_budget,
              "stop": stop,
              "stop_token_ids": request.stop_token_ids,
              "top_p": request.top_p,
@@ -1258,31 +1260,16 @@
          tool_calls = None
          text = ret_item["text"]

-         enable_thinking = True
          if isinstance(request, list):
              tool_choice = request[idx].tool_choice
              tools = request[idx].tools
              separate_reasoning = request[idx].separate_reasoning
-
-             if (
-                 request[idx].chat_template_kwargs
-                 and request[idx].chat_template_kwargs.get("enable_thinking") is not None
-             ):
-                 enable_thinking = request[idx].chat_template_kwargs.get(
-                     "enable_thinking", True
-                 )
+             enable_thinking = _get_enable_thinking_from_request(request[idx])
          else:
              tool_choice = request.tool_choice
              tools = request.tools
              separate_reasoning = request.separate_reasoning
-
-             if (
-                 request.chat_template_kwargs
-                 and request.chat_template_kwargs.get("enable_thinking") is not None
-             ):
-                 enable_thinking = request.chat_template_kwargs.get(
-                     "enable_thinking", True
-                 )
+             enable_thinking = _get_enable_thinking_from_request(request)

          reasoning_text = None
          if reasoning_parser and separate_reasoning and enable_thinking:
@@ -1308,7 +1295,8 @@
              text, call_info_list = parser.parse_non_stream(text)
              tool_calls = [
                  ToolCall(
-                     id=str(call_info.tool_index),
+                     id=f"call_{base64.urlsafe_b64encode(uuid.uuid4().bytes).rstrip(b'=').decode()}",
+                     index=call_info.tool_index,
                      function=FunctionResponse(
                          name=call_info.name, arguments=call_info.parameters
                      ),
@@ -1424,6 +1412,7 @@ async def v1_chat_completions(
      reasoning_parser_dict = {}

      async def generate_stream_resp():
+         tool_call_first = True
          is_firsts = {}
          stream_buffers = {}
          n_prev_tokens = {}
@@ -1521,9 +1510,12 @@
                  delta = text[len(stream_buffer) :]
                  new_stream_buffer = stream_buffer + delta

+                 enable_thinking = _get_enable_thinking_from_request(request)
+
                  if (
                      tokenizer_manager.server_args.reasoning_parser
                      and request.separate_reasoning
+                     and enable_thinking
                  ):
                      if index not in reasoning_parser_dict:
                          reasoning_parser_dict[index] = ReasoningParser(
@@ -1587,7 +1579,6 @@
                  # 2) if we found calls, we output them as separate chunk(s)
                  for call_item in calls:
                      # transform call_item -> FunctionResponse + ToolCall
-
                      if finish_reason_type == "stop":
                          latest_delta_len = 0
                          if isinstance(call_item.parameters, str):
@@ -1610,14 +1601,19 @@
                          call_item.parameters = remaining_call

                      finish_reason_type = "tool_calls"
-
                      tool_call = ToolCall(
-                         id=str(call_item.tool_index),
+                         id=(
+                             f"call_{base64.urlsafe_b64encode(uuid.uuid4().bytes).rstrip(b'=').decode()}"
+                             if tool_call_first
+                             else None
+                         ),
+                         index=call_item.tool_index,
                          function=FunctionResponse(
                              name=call_item.name,
                              arguments=call_item.parameters,
                          ),
                      )
+                     tool_call_first = False
                      choice_data = ChatCompletionResponseStreamChoice(
                          index=index,
                          delta=DeltaMessage(tool_calls=[tool_call]),
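
Both the non-streaming and streaming paths now emit OpenAI-style tool call IDs instead of reusing the tool index (the index moves to the new index field). The ID expression can be reproduced standalone:

    import base64
    import uuid

    # "call_" + URL-safe base64 of a random UUID, with the trailing "=" padding
    # stripped, i.e. 22 characters after the prefix.
    call_id = f"call_{base64.urlsafe_b64encode(uuid.uuid4().bytes).rstrip(b'=').decode()}"
    print(call_id)  # e.g. call_q0mPz4bBQ-qkGJ0uqxGJkA

In streaming mode, only the first tool-call chunk in the stream carries a generated id (guarded by tool_call_first); later chunks send id=None and are correlated by index.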
sglang/srt/openai_api/protocol.py
@@ -172,6 +172,7 @@ class CompletionRequest(BaseModel):
      top_k: int = -1
      min_p: float = 0.0
      min_tokens: int = 0
+     thinking_budget: Optional[int] = None
      json_schema: Optional[str] = None
      regex: Optional[str] = None
      ebnf: Optional[str] = None
@@ -250,9 +251,29 @@ ChatCompletionMessageContentPart = Union[
  ]


+ class FunctionResponse(BaseModel):
+     """Function response."""
+
+     name: Optional[str] = None
+     arguments: Optional[str] = None
+
+
+ class ToolCall(BaseModel):
+     """Tool call response."""
+
+     id: Optional[str] = None
+     index: Optional[int] = None
+     type: Literal["function"] = "function"
+     function: FunctionResponse
+
+
  class ChatCompletionMessageGenericParam(BaseModel):
      role: Literal["system", "assistant", "tool"]
      content: Union[str, List[ChatCompletionMessageContentTextPart], None]
+     tool_call_id: Optional[str] = None
+     name: Optional[str] = None
+     reasoning_content: Optional[str] = None
+     tool_calls: Optional[List[ToolCall]] = Field(default=None, examples=[None])


  class ChatCompletionMessageUserParam(BaseModel):
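
With tool_call_id, name, reasoning_content, and tool_calls added to the generic message schema, a complete tool-call round trip can be sent back through /v1/chat/completions. An illustrative message list (all values made up):

    messages = [
        {"role": "user", "content": "What's the weather in Paris?"},
        {
            "role": "assistant",
            "content": "",
            "tool_calls": [
                {
                    "id": "call_abc123",
                    "type": "function",
                    "function": {"name": "get_weather", "arguments": '{"city": "Paris"}'},
                }
            ],
        },
        {"role": "tool", "tool_call_id": "call_abc123", "name": "get_weather", "content": "18°C and sunny"},
    ]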
@@ -320,7 +341,23 @@ class ChatCompletionRequest(BaseModel):
      logit_bias: Optional[Dict[str, float]] = None
      logprobs: bool = False
      top_logprobs: Optional[int] = None
-     max_tokens: Optional[int] = None
+     max_tokens: Optional[int] = Field(
+         default=None,
+         deprecated="max_tokens is deprecated in favor of the max_completion_tokens field",
+         description="The maximum number of tokens that can be generated in the chat completion. ",
+     )
+     max_completion_tokens: Optional[int] = Field(
+         default=None,
+         description="The maximum number of completion tokens for a chat completion request, "
+         "including visible output tokens and reasoning tokens. Input tokens are not included. ",
+     )
+     thinking_budget: Optional[int] = Field(
+         default=None,
+         description="The maximum number of reasoning tokens that can be generated for a request. "
+         "This setting of does not affect the thinking process of models. "
+         "If the number of tokens generated by the model's thinking process exceeds thinking_budget, "
+         "the reasoning content will be truncated and the final response content will be generated immediately.",
+     )
      n: int = 1
      presence_penalty: float = 0.0
      response_format: Optional[Union[ResponseFormat, StructuralTagResponseFormat]] = None
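
Combined with the adapter change that falls back to max_completion_tokens when max_tokens is unset, the new request fields can be exercised as follows (a sketch; assumes a recent openai client, and the URL and model name are placeholders):

    import openai

    client = openai.Client(base_url="http://127.0.0.1:30000/v1", api_key="EMPTY")
    resp = client.chat.completions.create(
        model="default",
        messages=[{"role": "user", "content": "Prove that sqrt(2) is irrational."}],
        max_completion_tokens=512,            # preferred over the deprecated max_tokens
        extra_body={"thinking_budget": 128},  # sglang-specific: cap reasoning tokens
    )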
@@ -369,21 +406,6 @@ class ChatCompletionRequest(BaseModel):
      bootstrap_room: Optional[int] = None


- class FunctionResponse(BaseModel):
-     """Function response."""
-
-     name: Optional[str] = None
-     arguments: Optional[str] = None
-
-
- class ToolCall(BaseModel):
-     """Tool call response."""
-
-     id: str
-     type: Literal["function"] = "function"
-     function: FunctionResponse
-
-
  class ChatMessage(BaseModel):
      role: Optional[str] = None
      content: Optional[str] = None
sglang/srt/reasoning_parser.py
@@ -32,7 +32,7 @@ class BaseReasoningFormatDetector:
          One-time parsing: Detects and parses reasoning sections in the provided text.
          Returns both reasoning content and normal text separately.
          """
-         text = text.replace(self.think_start_token, "").strip()
+         text = text.replace(self.think_start_token, "")
          if self.think_end_token not in text:
              # Assume reasoning was truncated before `</think>` token
              return StreamingParseResult(reasoning_text=text)
@@ -73,7 +73,7 @@
              normal_text = current_text[end_idx + len(self.think_end_token) :]

              return StreamingParseResult(
-                 normal_text=normal_text, reasoning_text=reasoning_text.rstrip()
+                 normal_text=normal_text, reasoning_text=reasoning_text
              )

          # Continue with reasoning content
sglang/srt/sampling/sampling_batch_info.py
@@ -30,8 +30,13 @@ class SamplingBatchInfo:
      # Whether any request needs min_p sampling
      need_min_p_sampling: bool

+     # Use thinking_budget to truncate thinking
+     num_thinking_tokens: Optional[torch.Tensor] = None
+     think_end_ids: Optional[torch.Tensor] = None
+     thinking_budgets: Optional[torch.Tensor] = None
+
      # Masking tensors for grammar-guided structured outputs
-     vocab_size: int
+     vocab_size: int = 0
      grammars: Optional[List] = None
      vocab_mask: Optional[torch.Tensor] = None
      apply_mask_func: Optional[Callable[[torch.Tensor, torch.Tensor], None]] = None
@@ -76,7 +81,22 @@
          min_ps = torch.tensor(
              [r.sampling_params.min_p for r in reqs], dtype=torch.float
          ).to(device, non_blocking=True)
-
+         if any(hasattr(r.tokenizer, "think_end_id") for r in reqs):
+             think_end_ids = torch.tensor(
+                 [getattr(r.tokenizer, "think_end_id", -1) for r in reqs],
+                 dtype=torch.int64,
+             ).to(device, non_blocking=True)
+             num_thinking_tokens = torch.tensor([0 for _ in reqs], dtype=torch.int64).to(
+                 device, non_blocking=True
+             )
+             thinking_budgets = torch.tensor(
+                 [r.sampling_params.thinking_budget or -1 for r in reqs],
+                 dtype=torch.int64,
+             ).to(device, non_blocking=True)
+         else:
+             think_end_ids = None
+             num_thinking_tokens = None
+             thinking_budgets = None
          # Check if any request has custom logit processor
          has_custom_logit_processor = (
              batch.enable_custom_logit_processor  # check the flag first.
@@ -132,6 +152,9 @@
              top_ps=top_ps,
              top_ks=top_ks,
              min_ps=min_ps,
+             think_end_ids=think_end_ids,
+             num_thinking_tokens=num_thinking_tokens,
+             thinking_budgets=thinking_budgets,
              is_all_greedy=all(r.sampling_params.top_k <= 1 for r in reqs),
              need_min_p_sampling=any(r.sampling_params.min_p > 0 for r in reqs),
              vocab_size=vocab_size,
@@ -146,6 +169,35 @@
      def __len__(self):
          return len(self.temperatures)

+     def apply_thinking_budgets(self, next_token_logits: torch.Tensor):
+         has_budget = self.thinking_budgets > 0
+         if not has_budget.any():
+             return
+         torch.where(
+             has_budget,
+             self.num_thinking_tokens + 1,
+             self.num_thinking_tokens,
+             out=self.num_thinking_tokens,
+         )
+         should_stop = has_budget & (
+             self.num_thinking_tokens - 1 > self.thinking_budgets
+         )
+         next_token_logits.masked_fill_(should_stop.unsqueeze(0), float("-inf"))
+         batch_indices = torch.nonzero(should_stop, as_tuple=True)[0]
+         if len(batch_indices) > 0:
+             end_token_indices = self.think_end_ids[batch_indices]
+             next_token_logits[batch_indices, end_token_indices] = 0.0
+
+     def update_thinking_budgets(self, next_token_ids: torch.Tensor):
+         if not torch.any(self.thinking_budgets > 0):
+             return
+         torch.where(
+             next_token_ids == self.think_end_ids,
+             torch.tensor(-1, device=self.thinking_budgets.device),
+             self.thinking_budgets,
+             out=self.thinking_budgets,
+         )
+
      def update_regex_vocab_mask(self):
          if not self.grammars:
              self.vocab_mask = None
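
Judging from the signatures, apply_thinking_budgets is meant to run on the logits before sampling and update_thinking_budgets after the sampled token ids are known. A standalone toy illustration of the masking step for a [batch, vocab] logits tensor (not the scheduler integration itself):

    import torch

    logits = torch.zeros(2, 8)                 # two requests, vocab size 8
    think_end_ids = torch.tensor([5, 5])       # id of the </think> token per request
    should_stop = torch.tensor([True, False])  # request 0 has exhausted its budget

    # Mask every token for the stopped request, then re-enable only </think>,
    # so the next sampled token is guaranteed to close the reasoning section.
    logits.masked_fill_(should_stop.unsqueeze(1), float("-inf"))
    rows = torch.nonzero(should_stop, as_tuple=True)[0]
    logits[rows, think_end_ids[rows]] = 0.0
    print(logits.argmax(dim=-1))  # tensor([5, 0]) -> request 0 is forced to emit </think>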
sglang/srt/sampling/sampling_params.py
@@ -30,6 +30,7 @@ class SamplingParams:
      def __init__(
          self,
          max_new_tokens: int = 128,
+         thinking_budget: Optional[int] = None,
          stop: Optional[Union[str, List[str]]] = None,
          stop_token_ids: Optional[List[int]] = None,
          temperature: float = 1.0,
@@ -57,6 +58,7 @@
              self.stop_token_ids = set(stop_token_ids)
          else:
              self.stop_token_ids = None
+         self.thinking_budget = thinking_budget
          self.temperature = temperature
          self.top_p = top_p
          self.top_k = top_k
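
At the engine level the budget is just another sampling parameter; a minimal construction sketch:

    from sglang.srt.sampling.sampling_params import SamplingParams

    # Allow up to 1024 new tokens overall, of which at most 256 may be reasoning tokens.
    params = SamplingParams(max_new_tokens=1024, thinking_budget=256)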