sglang 0.4.9.post2__py3-none-any.whl → 0.4.9.post4__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (200)
  1. sglang/bench_one_batch.py +2 -1
  2. sglang/eval/loogle_eval.py +7 -0
  3. sglang/srt/_custom_ops.py +29 -1
  4. sglang/srt/configs/deepseekvl2.py +11 -2
  5. sglang/srt/configs/internvl.py +3 -0
  6. sglang/srt/configs/janus_pro.py +3 -0
  7. sglang/srt/configs/model_config.py +10 -8
  8. sglang/srt/configs/update_config.py +3 -1
  9. sglang/srt/conversation.py +2 -1
  10. sglang/srt/custom_op.py +5 -2
  11. sglang/srt/disaggregation/common/conn.py +34 -6
  12. sglang/srt/disaggregation/decode.py +9 -1
  13. sglang/srt/disaggregation/mini_lb.py +3 -2
  14. sglang/srt/disaggregation/mooncake/conn.py +93 -76
  15. sglang/srt/disaggregation/mooncake/transfer_engine.py +4 -2
  16. sglang/srt/disaggregation/nixl/conn.py +17 -13
  17. sglang/srt/distributed/device_communicators/custom_all_reduce.py +3 -91
  18. sglang/srt/distributed/device_communicators/custom_all_reduce_utils.py +96 -1
  19. sglang/srt/distributed/device_communicators/quick_all_reduce.py +273 -0
  20. sglang/srt/distributed/device_communicators/shm_broadcast.py +12 -5
  21. sglang/srt/distributed/parallel_state.py +103 -15
  22. sglang/srt/entrypoints/engine.py +31 -33
  23. sglang/srt/entrypoints/http_server.py +20 -32
  24. sglang/srt/entrypoints/openai/protocol.py +3 -3
  25. sglang/srt/entrypoints/openai/serving_chat.py +48 -6
  26. sglang/srt/eplb/expert_location_dispatch.py +1 -1
  27. sglang/srt/function_call/base_format_detector.py +74 -12
  28. sglang/srt/function_call/deepseekv3_detector.py +26 -11
  29. sglang/srt/function_call/ebnf_composer.py +95 -63
  30. sglang/srt/function_call/function_call_parser.py +4 -2
  31. sglang/srt/function_call/kimik2_detector.py +41 -16
  32. sglang/srt/function_call/llama32_detector.py +6 -3
  33. sglang/srt/function_call/mistral_detector.py +11 -3
  34. sglang/srt/function_call/pythonic_detector.py +16 -14
  35. sglang/srt/function_call/qwen25_detector.py +12 -3
  36. sglang/srt/function_call/qwen3_coder_detector.py +151 -0
  37. sglang/srt/hf_transformers_utils.py +0 -1
  38. sglang/srt/layers/activation.py +24 -3
  39. sglang/srt/layers/attention/base_attn_backend.py +3 -1
  40. sglang/srt/layers/attention/flashattention_backend.py +3 -3
  41. sglang/srt/layers/attention/flashinfer_backend.py +40 -1
  42. sglang/srt/layers/communicator.py +12 -12
  43. sglang/srt/layers/dp_attention.py +72 -24
  44. sglang/srt/layers/linear.py +13 -102
  45. sglang/srt/layers/logits_processor.py +34 -24
  46. sglang/srt/layers/moe/ep_moe/kernels.py +4 -2
  47. sglang/srt/layers/moe/ep_moe/layer.py +23 -402
  48. sglang/srt/layers/moe/fused_moe_native.py +7 -47
  49. sglang/srt/layers/moe/fused_moe_triton/__init__.py +4 -4
  50. sglang/srt/layers/moe/fused_moe_triton/configs/triton_3_3_1/E=160,N=320,device_name=NVIDIA_H20-3e.json +146 -0
  51. sglang/srt/layers/moe/fused_moe_triton/configs/triton_3_3_1/E=384,N=128,device_name=NVIDIA_H20,dtype=fp8_w8a8,block_shape=[128, 128].json +146 -0
  52. sglang/srt/layers/moe/fused_moe_triton/configs/triton_3_3_1/E=384,N=128,device_name=NVIDIA_H200,dtype=fp8_w8a8,block_shape=[128, 128].json +146 -0
  53. sglang/srt/layers/moe/fused_moe_triton/configs/triton_3_3_1/E=384,N=256,device_name=NVIDIA_H20-3e,dtype=fp8_w8a8,block_shape=[128, 128].json +146 -0
  54. sglang/srt/layers/moe/fused_moe_triton/configs/triton_3_3_1/E=385,N=128,device_name=NVIDIA_B200,dtype=fp8_w8a8,block_shape=[128, 128].json +146 -0
  55. sglang/srt/layers/moe/fused_moe_triton/configs/triton_3_3_1/E=385,N=128,device_name=NVIDIA_H200,dtype=fp8_w8a8,block_shape=[128, 128].json +146 -0
  56. sglang/srt/layers/moe/fused_moe_triton/fused_moe.py +54 -263
  57. sglang/srt/layers/moe/fused_moe_triton/layer.py +14 -396
  58. sglang/srt/layers/moe/topk.py +190 -23
  59. sglang/srt/layers/quantization/__init__.py +20 -134
  60. sglang/srt/layers/quantization/awq.py +578 -11
  61. sglang/srt/layers/quantization/awq_triton.py +339 -0
  62. sglang/srt/layers/quantization/base_config.py +85 -10
  63. sglang/srt/layers/quantization/blockwise_int8.py +17 -55
  64. sglang/srt/layers/quantization/compressed_tensors/compressed_tensors.py +13 -11
  65. sglang/srt/layers/quantization/compressed_tensors/compressed_tensors_moe.py +23 -79
  66. sglang/srt/layers/quantization/fp8.py +273 -62
  67. sglang/srt/layers/quantization/fp8_kernel.py +210 -46
  68. sglang/srt/layers/quantization/fp8_utils.py +2 -2
  69. sglang/srt/layers/quantization/gptq.py +501 -143
  70. sglang/srt/layers/quantization/marlin_utils.py +790 -0
  71. sglang/srt/layers/quantization/modelopt_quant.py +34 -112
  72. sglang/srt/layers/quantization/moe_wna16.py +45 -49
  73. sglang/srt/layers/quantization/petit.py +252 -0
  74. sglang/srt/layers/quantization/petit_utils.py +104 -0
  75. sglang/srt/layers/quantization/qoq.py +7 -6
  76. sglang/srt/layers/quantization/scalar_type.py +352 -0
  77. sglang/srt/layers/quantization/unquant.py +422 -0
  78. sglang/srt/layers/quantization/utils.py +340 -9
  79. sglang/srt/layers/quantization/w4afp8.py +8 -4
  80. sglang/srt/layers/quantization/w8a8_fp8.py +17 -51
  81. sglang/srt/layers/quantization/w8a8_int8.py +51 -115
  82. sglang/srt/layers/radix_attention.py +5 -3
  83. sglang/srt/layers/vocab_parallel_embedding.py +1 -41
  84. sglang/srt/lora/lora.py +0 -4
  85. sglang/srt/lora/lora_manager.py +162 -164
  86. sglang/srt/lora/lora_registry.py +124 -0
  87. sglang/srt/lora/mem_pool.py +83 -35
  88. sglang/srt/lora/utils.py +12 -5
  89. sglang/srt/managers/cache_controller.py +288 -0
  90. sglang/srt/managers/io_struct.py +60 -30
  91. sglang/srt/managers/mm_utils.py +7 -8
  92. sglang/srt/managers/schedule_batch.py +163 -113
  93. sglang/srt/managers/schedule_policy.py +68 -27
  94. sglang/srt/managers/scheduler.py +256 -86
  95. sglang/srt/managers/scheduler_output_processor_mixin.py +22 -4
  96. sglang/srt/managers/tokenizer_manager.py +38 -27
  97. sglang/srt/managers/tp_worker.py +16 -4
  98. sglang/srt/managers/tp_worker_overlap_thread.py +11 -0
  99. sglang/srt/mem_cache/allocator.py +74 -23
  100. sglang/srt/mem_cache/base_prefix_cache.py +14 -2
  101. sglang/srt/mem_cache/chunk_cache.py +5 -2
  102. sglang/srt/mem_cache/hicache_storage.py +168 -0
  103. sglang/srt/mem_cache/hiradix_cache.py +194 -5
  104. sglang/srt/mem_cache/memory_pool.py +16 -1
  105. sglang/srt/mem_cache/memory_pool_host.py +44 -2
  106. sglang/srt/mem_cache/radix_cache.py +26 -0
  107. sglang/srt/mem_cache/swa_radix_cache.py +1025 -0
  108. sglang/srt/metrics/collector.py +9 -0
  109. sglang/srt/model_executor/cuda_graph_runner.py +66 -31
  110. sglang/srt/model_executor/forward_batch_info.py +210 -25
  111. sglang/srt/model_executor/model_runner.py +147 -42
  112. sglang/srt/model_loader/loader.py +7 -1
  113. sglang/srt/model_loader/utils.py +4 -4
  114. sglang/srt/models/clip.py +1 -1
  115. sglang/srt/models/deepseek.py +9 -6
  116. sglang/srt/models/deepseek_janus_pro.py +1 -1
  117. sglang/srt/models/deepseek_v2.py +192 -173
  118. sglang/srt/models/deepseek_vl2.py +5 -5
  119. sglang/srt/models/gemma.py +48 -0
  120. sglang/srt/models/gemma2.py +52 -0
  121. sglang/srt/models/gemma3_causal.py +63 -0
  122. sglang/srt/models/gemma3_mm.py +1 -1
  123. sglang/srt/models/gemma3n_mm.py +2 -4
  124. sglang/srt/models/granitemoe.py +385 -0
  125. sglang/srt/models/grok.py +9 -3
  126. sglang/srt/models/hunyuan.py +63 -16
  127. sglang/srt/models/internvl.py +1 -1
  128. sglang/srt/models/kimi_vl.py +1 -1
  129. sglang/srt/models/llama.py +41 -0
  130. sglang/srt/models/llama4.py +11 -11
  131. sglang/srt/models/llava.py +2 -2
  132. sglang/srt/models/llavavid.py +1 -1
  133. sglang/srt/models/minicpm.py +0 -2
  134. sglang/srt/models/minicpmo.py +3 -7
  135. sglang/srt/models/minicpmv.py +1 -1
  136. sglang/srt/models/mistral.py +1 -1
  137. sglang/srt/models/mixtral.py +9 -2
  138. sglang/srt/models/mllama.py +3 -5
  139. sglang/srt/models/mllama4.py +13 -6
  140. sglang/srt/models/olmoe.py +8 -5
  141. sglang/srt/models/persimmon.py +330 -0
  142. sglang/srt/models/phi.py +321 -0
  143. sglang/srt/models/phi4mm.py +44 -4
  144. sglang/srt/models/phi4mm_audio.py +1260 -0
  145. sglang/srt/models/phi4mm_utils.py +1917 -0
  146. sglang/srt/models/phimoe.py +9 -3
  147. sglang/srt/models/qwen.py +37 -0
  148. sglang/srt/models/qwen2.py +41 -0
  149. sglang/srt/models/qwen2_5_vl.py +4 -4
  150. sglang/srt/models/qwen2_audio.py +1 -1
  151. sglang/srt/models/qwen2_moe.py +53 -9
  152. sglang/srt/models/qwen2_vl.py +4 -4
  153. sglang/srt/models/qwen3.py +65 -1
  154. sglang/srt/models/qwen3_moe.py +57 -24
  155. sglang/srt/models/vila.py +1 -1
  156. sglang/srt/multimodal/processors/base_processor.py +91 -97
  157. sglang/srt/multimodal/processors/clip.py +21 -19
  158. sglang/srt/multimodal/processors/deepseek_vl_v2.py +8 -26
  159. sglang/srt/multimodal/processors/gemma3.py +13 -17
  160. sglang/srt/multimodal/processors/gemma3n.py +19 -23
  161. sglang/srt/multimodal/processors/internvl.py +9 -10
  162. sglang/srt/multimodal/processors/janus_pro.py +12 -27
  163. sglang/srt/multimodal/processors/kimi_vl.py +12 -14
  164. sglang/srt/multimodal/processors/llava.py +4 -2
  165. sglang/srt/multimodal/processors/minicpm.py +35 -44
  166. sglang/srt/multimodal/processors/mlama.py +21 -18
  167. sglang/srt/multimodal/processors/mllama4.py +4 -5
  168. sglang/srt/multimodal/processors/phi4mm.py +63 -39
  169. sglang/srt/multimodal/processors/pixtral.py +14 -35
  170. sglang/srt/multimodal/processors/qwen_audio.py +65 -0
  171. sglang/srt/multimodal/processors/qwen_vl.py +16 -21
  172. sglang/srt/multimodal/processors/vila.py +14 -14
  173. sglang/srt/reasoning_parser.py +46 -4
  174. sglang/srt/sampling/sampling_batch_info.py +6 -5
  175. sglang/srt/sampling/sampling_params.py +8 -1
  176. sglang/srt/server_args.py +454 -270
  177. sglang/srt/speculative/eagle_draft_cuda_graph_runner.py +33 -28
  178. sglang/srt/speculative/eagle_draft_extend_cuda_graph_runner.py +46 -37
  179. sglang/srt/speculative/eagle_utils.py +51 -23
  180. sglang/srt/speculative/eagle_worker.py +59 -44
  181. sglang/srt/two_batch_overlap.py +10 -5
  182. sglang/srt/utils.py +44 -69
  183. sglang/test/runners.py +14 -3
  184. sglang/test/test_activation.py +50 -1
  185. sglang/test/test_block_fp8.py +8 -3
  186. sglang/test/test_block_fp8_ep.py +1 -1
  187. sglang/test/test_custom_ops.py +12 -7
  188. sglang/test/test_cutlass_w4a8_moe.py +1 -3
  189. sglang/test/test_fp4_moe.py +1 -3
  190. sglang/test/test_marlin_moe.py +286 -0
  191. sglang/test/test_marlin_utils.py +171 -0
  192. sglang/test/test_utils.py +35 -0
  193. sglang/version.py +1 -1
  194. {sglang-0.4.9.post2.dist-info → sglang-0.4.9.post4.dist-info}/METADATA +10 -10
  195. {sglang-0.4.9.post2.dist-info → sglang-0.4.9.post4.dist-info}/RECORD +198 -175
  196. sglang/srt/layers/quantization/quant_utils.py +0 -166
  197. sglang/srt/managers/multimodal_processors/qwen_audio.py +0 -94
  198. {sglang-0.4.9.post2.dist-info → sglang-0.4.9.post4.dist-info}/WHEEL +0 -0
  199. {sglang-0.4.9.post2.dist-info → sglang-0.4.9.post4.dist-info}/licenses/LICENSE +0 -0
  200. {sglang-0.4.9.post2.dist-info → sglang-0.4.9.post4.dist-info}/top_level.txt +0 -0
sglang/srt/multimodal/processors/qwen_audio.py (new file)
@@ -0,0 +1,65 @@
+ import re
+
+ from sglang.srt.managers.schedule_batch import Modality, MultimodalDataItem
+ from sglang.srt.models.qwen2_audio import Qwen2AudioForConditionalGeneration
+ from sglang.srt.multimodal.processors.base_processor import (
+     BaseMultimodalProcessor,
+     MultimodalSpecialTokens,
+ )
+
+
+ class Qwen2AudioMultimodalProcessor(BaseMultimodalProcessor):
+     models = [Qwen2AudioForConditionalGeneration]
+
+     def __init__(self, hf_config, server_args, _processor):
+         super().__init__(hf_config, server_args, _processor)
+         self.AUDIO_TOKEN = "<|audio_bos|><|AUDIO|><|audio_eos|>"
+         self.AUDIO_TOKEN_REGEX = re.compile(
+             r"<\|audio_bos\|>(?:<\|AUDIO\|>)+<\|audio_eos\|>"
+         )
+         # Collect special token ids
+         tokenizer = self._processor.tokenizer
+         self.audio_start_id = tokenizer.convert_tokens_to_ids("<|audio_bos|>")
+         self.audio_token_id = tokenizer.convert_tokens_to_ids("<|AUDIO|>")
+         self.audio_end_id = tokenizer.convert_tokens_to_ids("<|audio_eos|>")
+
+         self.mm_tokens = MultimodalSpecialTokens(
+             audio_token=self.AUDIO_TOKEN,
+             audio_token_regex=self.AUDIO_TOKEN_REGEX,
+             audio_token_id=self.audio_token_id,
+         ).build(_processor)
+
+     async def process_mm_data_async(
+         self,
+         audio_data,
+         input_text,
+         **kwargs,
+     ):
+         base_output = self.load_mm_data(
+             prompt=input_text,
+             audio_data=audio_data,
+             multimodal_tokens=self.mm_tokens,
+         )
+         if base_output is None:
+             return None
+
+         mm_items, input_ids, ret = self.process_and_combine_mm_data(
+             base_output, self.mm_tokens
+         )
+
+         assert (
+             "feature_attention_mask" in ret
+         ), "feature_attention_mask not found in processor output"
+         input_lengths = ret["feature_attention_mask"].sum(dim=-1)
+         input_lengths = (input_lengths - 1) // 2 + 1
+         output_lengths = (input_lengths - 2) // 2 + 1
+
+         mm_items[0].model_specific_data["audio_feature_lens"] = output_lengths
+
+         return {
+             "mm_items": mm_items,
+             "input_ids": input_ids.tolist(),
+             "audio_start_id": self.audio_start_id,
+             "audio_token_id": self.audio_token_id,
+             "audio_end_id": self.audio_end_id,
+         }
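The two integer divisions above turn the number of valid mel frames per clip (from feature_attention_mask) into the number of audio embeddings the language model will see, presumably mirroring the audio encoder's stride-2 downsampling stages. A standalone sketch with an assumed 3000-frame clip (illustrative values, not sglang code):

import torch

# One clip with 3000 valid mel frames (assumed example input).
feature_attention_mask = torch.ones(1, 3000, dtype=torch.long)

input_lengths = feature_attention_mask.sum(dim=-1)  # tensor([3000])
input_lengths = (input_lengths - 1) // 2 + 1         # tensor([1500]) after the first stride-2 stage
output_lengths = (input_lengths - 2) // 2 + 1        # tensor([750]) audio features per clip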
sglang/srt/multimodal/processors/qwen_vl.py
@@ -203,16 +203,9 @@ class Qwen2_5VLImageProcessor(SGLangBaseProcessor):
 
      def __init__(self, hf_config, server_args, _processor):
          super().__init__(hf_config, server_args, _processor)
-         # The single, pre-expanded image token.
-         self.IMAGE_TOKEN = "<|vision_start|><|image_pad|><|vision_end|>"
          # The regex that matches expanded image tokens.
-         self.IMAGE_TOKEN_REGEX = re.compile(
-             r"<\|vision_start\|>(?:<\|image_pad\|>)+<\|vision_end\|>"
-         )
          self.IM_START_TOKEN_ID = hf_config.vision_start_token_id
          self.IM_END_TOKEN_ID = hf_config.vision_end_token_id
-         self.IM_TOKEN_ID = hf_config.image_token_id
-         self.VIDEO_TOKEN_ID = hf_config.video_token_id
          self.vision_start_token_id = hf_config.vision_start_token_id
          self.vision_end_token_id = hf_config.vision_end_token_id
          self.NUM_TOKEN_PER_FRAME = 770
@@ -220,19 +213,20 @@ class Qwen2_5VLImageProcessor(SGLangBaseProcessor):
          self.MIN_PIXELS = 4 * 28 * 28
          self.MAX_PIXELS = 16384 * 28 * 28
          self.MAX_RATIO = 200
-         # TODO(mick): move all MultimodalSpecialTokens initializations into processor init
-         self.mm_special_tokens = MultimodalSpecialTokens(
-             image_token=self.IMAGE_TOKEN,
-             image_token_regex=self.IMAGE_TOKEN_REGEX,
-             video_token=self.VIDEO_TOKEN_ID,
-         )
+         self.mm_tokens = MultimodalSpecialTokens(
+             image_token="<|vision_start|><|image_pad|><|vision_end|>",
+             image_token_id=hf_config.image_token_id,
+             image_token_regex=re.compile(
+                 r"<\|vision_start\|>(?:<\|image_pad\|>)+<\|vision_end\|>"
+             ),
+             video_token_id=hf_config.video_token_id,
+         ).build(_processor)
 
      async def process_mm_data_async(
          self,
          image_data: List[Union[str, bytes]],
          input_text,
          request_obj,
-         max_req_input_len,
          *args,
          **kwargs,
      ):
@@ -241,8 +235,7 @@ class Qwen2_5VLImageProcessor(SGLangBaseProcessor):
              prompt=input_text,
              image_data=image_data,
              video_data=request_obj.video_data,
-             multimodal_tokens=self.mm_special_tokens,
-             max_req_input_len=max_req_input_len,
+             multimodal_tokens=self.mm_tokens,
          )
 
          # Qwen-specific: resize images if they are raw Image objects
@@ -255,13 +248,15 @@ class Qwen2_5VLImageProcessor(SGLangBaseProcessor):
              await preprocess_video(video) for video in base_output.videos
          ]
 
-         mm_items, input_ids, ret = self.process_and_combine_mm_data(base_output)
+         mm_items, input_ids, ret = self.process_and_combine_mm_data(
+             base_output, self.mm_tokens
+         )
 
          input_ids = input_ids.flatten()
          mrope_positions, mrope_position_delta = MRotaryEmbedding.get_rope_index(
              spatial_merge_size=self.hf_config.vision_config.spatial_merge_size,
-             image_token_id=self.IM_TOKEN_ID,
-             video_token_id=self.VIDEO_TOKEN_ID,
+             image_token_id=self.mm_tokens.image_token_id,
+             video_token_id=self.mm_tokens.video_token_id,
              vision_start_token_id=self.vision_start_token_id,
              model_type=self.hf_config.model_type,
              tokens_per_second=getattr(
@@ -279,8 +274,8 @@ class Qwen2_5VLImageProcessor(SGLangBaseProcessor):
              "mm_items": mm_items,
              "im_start_id": self.IM_START_TOKEN_ID,
              "im_end_id": self.IM_END_TOKEN_ID,
-             "im_token_id": self.IM_TOKEN_ID,
-             "video_token_id": self.VIDEO_TOKEN_ID,
+             "im_token_id": self.mm_tokens.image_token_id,
+             "video_token_id": self.mm_tokens.video_token_id,
              "mrope_positions": mrope_positions,
              "mrope_position_delta": mrope_position_delta,
          }
sglang/srt/multimodal/processors/vila.py
@@ -1,4 +1,4 @@
- from typing import Any, Dict, List, Optional, Type, cast
+ from typing import Any, Dict, List, Optional, Type
 
  import torch.nn as nn
  from transformers.configuration_utils import PretrainedConfig
@@ -8,9 +8,8 @@ from transformers.tokenization_utils_base import PreTrainedTokenizerBase
  from sglang.srt.managers.io_struct import (
      EmbeddingReqInput,
      GenerateReqInput,
-     ImageDataItem,
+     ImageDataInputItem,
  )
- from sglang.srt.managers.schedule_batch import Modality, MultimodalDataItem
  from sglang.srt.models.vila import VILAForConditionalGeneration
  from sglang.srt.multimodal.processors.base_processor import (
      BaseMultimodalProcessor,
@@ -37,31 +36,32 @@ class VILAMultimodalProcessor(BaseMultimodalProcessor):
          _processor: VILAProcessor,
      ) -> None:
          super().__init__(hf_config, server_args, _processor)
-         self.IM_TOKEN_ID = hf_config.image_token_id
-         self.VIDEO_TOKEN_ID = hf_config.video_token_id
+         self.mm_tokens = MultimodalSpecialTokens(
+             image_token=self._processor.tokenizer.image_token,
+             image_token_id=hf_config.image_token_id,
+             video_token_id=hf_config.video_token_id,
+         ).build(_processor)
 
      async def process_mm_data_async(
          self,
-         image_data: Optional[ImageDataItem | List[ImageDataItem]],
+         image_data: Optional[ImageDataInputItem | List[ImageDataInputItem]],
          input_text: str | List[int],
          request_obj: GenerateReqInput | EmbeddingReqInput,
-         max_req_input_len: int,
          **kwargs,
      ) -> Optional[Dict[str, Any]]:
          base_output = self.load_mm_data(
              prompt=input_text,
-             multimodal_tokens=MultimodalSpecialTokens(
-                 image_token=self._processor.tokenizer.image_token
-             ),
-             max_req_input_len=max_req_input_len,
+             multimodal_tokens=self.mm_tokens,
              image_data=image_data,
          )
 
-         mm_items, input_ids, _ = self.process_and_combine_mm_data(base_output)
+         mm_items, input_ids, _ = self.process_and_combine_mm_data(
+             base_output, self.mm_tokens
+         )
 
          return {
              "input_ids": input_ids.tolist(),
              "mm_items": mm_items,
-             "im_token_id": self.IM_TOKEN_ID,
-             "video_token_id": self.VIDEO_TOKEN_ID,
+             "im_token_id": self.mm_tokens.image_token_id,
+             "video_token_id": self.mm_tokens.video_token_id,
          }
sglang/srt/reasoning_parser.py
@@ -118,6 +118,14 @@ class DeepSeekR1Detector(BaseReasoningFormatDetector):
      Returns all the text before the </think> tag as `reasoning_text`
      and the rest of the text as `normal_text`.
 
+     Supported models:
+     - DeepSeek-R1: Always generates thinking content without <think> start tag
+     - DeepSeek-R1-0528: Generates thinking content with <think> start tag
+
+     Format patterns:
+     - DeepSeek-R1: "I need to think about this...</think>The answer is 42."
+     - DeepSeek-R1-0528: "<think>I need to think about this...</think>The answer is 42."
+
      Args:
          stream_reasoning (bool): If False, accumulates reasoning content until the end tag.
              If True, streams reasoning content as it arrives.
@@ -136,11 +144,20 @@ class DeepSeekR1Detector(BaseReasoningFormatDetector):
 
  class Qwen3Detector(BaseReasoningFormatDetector):
      """
-     Detector for Qwen3 model.
+     Detector for standard Qwen3 models (e.g., Qwen/Qwen3-235B-A22B).
      Assumes reasoning format:
          (<think>)*(.*)</think>
-     Returns all the text before the </think> tag as `reasoning_text`
-     and the rest of the text as `normal_text`.
+
+     Qwen3 models released before 07/2025 supports switching between thinking mode and normal
+     mode using `enable_thinking` parameter in the request parameter.
+     - enable_thinking=True: "<think>reasoning content</think>The answer is 42."
+     - enable_thinking=False: "The answer is 42." (no thinking tokens)
+
+     This detector handles both cases.
+
+     NOTE: Do NOT use this detector for Qwen3-Thinking models (e.g., Qwen3-Thinking-2507).
+     Those models always generate thinking content without <think> start tags.
+     Use "qwen3-thinking" parser type for those models instead.
 
      Args:
          stream_reasoning (bool): If False, accumulates reasoning content until the end tag.
@@ -148,7 +165,6 @@ class Qwen3Detector(BaseReasoningFormatDetector):
      """
 
      def __init__(self, stream_reasoning: bool = True):
-         # Qwen3 won't be in reasoning mode when user passes `enable_thinking=False`
          super().__init__(
              "<think>",
              "</think>",
@@ -157,6 +173,31 @@ class Qwen3Detector(BaseReasoningFormatDetector):
          )
 
 
+ class Qwen3ThinkingDetector(BaseReasoningFormatDetector):
+     """
+     Detector for Qwen3-Thinking models (e.g., Qwen3-Thinking-2507).
+     Assumes reasoning format:
+         *(.*)</think>
+
+     These models always generate thinking content without <think> start tag.
+     They do not support the enable_thinking parameter and always think.
+
+     Format: "I need to think about this...</think>The answer is 42."
+
+     Args:
+         stream_reasoning (bool): If False, accumulates reasoning content until the end tag.
+             If True, streams reasoning content as it arrives.
+     """
+
+     def __init__(self, stream_reasoning: bool = True):
+         super().__init__(
+             "<think>",
+             "</think>",
+             force_reasoning=True,
+             stream_reasoning=stream_reasoning,
+         )
+
+
  class KimiDetector(BaseReasoningFormatDetector):
      """
      Detector for Kimi Thinking model.
@@ -189,6 +230,7 @@ class ReasoningParser:
      DetectorMap: Dict[str, Type[BaseReasoningFormatDetector]] = {
          "deepseek-r1": DeepSeekR1Detector,
          "qwen3": Qwen3Detector,
+         "qwen3-thinking": Qwen3ThinkingDetector,
          "kimi": KimiDetector,
      }
 
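The new "qwen3-thinking" entry exists because Qwen3-Thinking checkpoints emit reasoning immediately, with no leading <think> tag, so the detector must be forced into reasoning mode from the first token. A minimal sketch of resolving the detector through the map shown above (the DetectorMap lookup and the constructor signature come from this hunk; how the server wires its reasoning-parser option to this map is not shown here):

from sglang.srt.reasoning_parser import ReasoningParser

# Resolve the detector class registered under the new key and instantiate it.
# Qwen3ThinkingDetector passes force_reasoning=True, so text before the first
# </think> is treated as reasoning even though there is no <think> start tag.
detector_cls = ReasoningParser.DetectorMap["qwen3-thinking"]
detector = detector_cls(stream_reasoning=False)  # accumulate reasoning until </think>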
sglang/srt/sampling/sampling_batch_info.py
@@ -322,6 +322,12 @@ class SamplingBatchInfo:
          # Set the flag to True if any of the two has custom logit processor
          self.has_custom_logit_processor = True
 
+         # Merge logit bias - note this has to come before the temperatures tensor update! Otherwise will cause crashes.
+         # See note below on len(self) and len(other).
+         self.logit_bias = merge_bias_tensor(
+             self.logit_bias, other.logit_bias, len(self), len(other), self.device, 0.0
+         )
+
          # Note: because the __len()__ operator is defined on the temperatures tensor,
          # please make sure any merge operation with len(self) or len(other) is done before
          # the merge operation of the temperatures tensor below.
@@ -340,11 +346,6 @@ class SamplingBatchInfo:
          self.need_top_k_sampling |= other.need_top_k_sampling
          self.need_min_p_sampling |= other.need_min_p_sampling
 
-         # Merge logit bias
-         self.logit_bias = merge_bias_tensor(
-             self.logit_bias, other.logit_bias, len(self), len(other), self.device, 0.0
-         )
-
 
  def merge_bias_tensor(
      lhs: Optional[torch.Tensor],
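The reordering matters because, as the retained comment says, __len__ on SamplingBatchInfo is backed by the temperatures tensor, so any merge that needs the pre-merge batch sizes (like the logit_bias merge) must run before temperatures are concatenated. A minimal standalone illustration of the pitfall (toy class and assumed shapes, not sglang code):

import torch

class ToyBatch:
    def __init__(self, temperatures, logit_bias=None):
        self.temperatures = temperatures  # shape [batch_size, 1]
        self.logit_bias = logit_bias      # shape [batch_size, vocab_size] or None

    def __len__(self):
        # Mirrors SamplingBatchInfo: length is derived from the temperatures tensor.
        return self.temperatures.shape[0]

a = ToyBatch(torch.ones(2, 1))                                  # no logit_bias on this side
b = ToyBatch(torch.ones(3, 1), logit_bias=torch.zeros(3, 8))

# Correct order: pad a's missing bias using the pre-merge len(a)=2, then merge temperatures.
merged_bias = torch.cat([torch.zeros(len(a), 8), b.logit_bias], dim=0)  # shape [5, 8]
a.temperatures = torch.cat([a.temperatures, b.temperatures], dim=0)     # len(a) is now 5

# If temperatures were concatenated first, len(a) would already be 5, the padding would
# have 5 rows, and the merged bias would come out [8, 8] instead of [5, 8]: the size
# mismatch behind the crash the new comment warns about.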
sglang/srt/sampling/sampling_params.py
@@ -89,7 +89,7 @@ class SamplingParams:
          if self.top_k == -1:
              self.top_k = TOP_K_ALL # whole vocabulary
 
-     def verify(self):
+     def verify(self, vocab_size):
          if self.temperature < 0.0:
              raise ValueError(
                  f"temperature must be non-negative, got {self.temperature}."
@@ -131,6 +131,13 @@ class SamplingParams:
                  f"min_new_tokens must be in [0, max_new_tokens({self.max_new_tokens})], got "
                  f"{self.min_new_tokens}."
              )
+         if self.logit_bias is not None:
+             for token_id in self.logit_bias:
+                 if not 0 <= int(token_id) < vocab_size:
+                     raise ValueError(
+                         f"logit_bias must has keys in [0, {vocab_size - 1}], got "
+                         f"{token_id}."
+                     )
          grammars = [
              self.json_schema,
              self.regex,
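The new signature means callers of verify now supply the model's vocab size so out-of-range logit_bias keys are rejected up front. A standalone sketch of just that check, with assumed values (not sglang code; the valid range is 0 inclusive to vocab_size exclusive):

# Assumed example values; in a real request the logit_bias keys are token ids.
vocab_size = 32000
logit_bias = {"50": 2.0, "32000": 1.5}

for token_id in logit_bias:
    if not 0 <= int(token_id) < vocab_size:
        raise ValueError(
            f"logit_bias must have keys in [0, {vocab_size - 1}], got {token_id}."
        )
# The key "32000" is rejected here, since valid ids are 0..31999.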