sglang 0.4.9.post2__py3-none-any.whl → 0.4.9.post4__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- sglang/bench_one_batch.py +2 -1
- sglang/eval/loogle_eval.py +7 -0
- sglang/srt/_custom_ops.py +29 -1
- sglang/srt/configs/deepseekvl2.py +11 -2
- sglang/srt/configs/internvl.py +3 -0
- sglang/srt/configs/janus_pro.py +3 -0
- sglang/srt/configs/model_config.py +10 -8
- sglang/srt/configs/update_config.py +3 -1
- sglang/srt/conversation.py +2 -1
- sglang/srt/custom_op.py +5 -2
- sglang/srt/disaggregation/common/conn.py +34 -6
- sglang/srt/disaggregation/decode.py +9 -1
- sglang/srt/disaggregation/mini_lb.py +3 -2
- sglang/srt/disaggregation/mooncake/conn.py +93 -76
- sglang/srt/disaggregation/mooncake/transfer_engine.py +4 -2
- sglang/srt/disaggregation/nixl/conn.py +17 -13
- sglang/srt/distributed/device_communicators/custom_all_reduce.py +3 -91
- sglang/srt/distributed/device_communicators/custom_all_reduce_utils.py +96 -1
- sglang/srt/distributed/device_communicators/quick_all_reduce.py +273 -0
- sglang/srt/distributed/device_communicators/shm_broadcast.py +12 -5
- sglang/srt/distributed/parallel_state.py +103 -15
- sglang/srt/entrypoints/engine.py +31 -33
- sglang/srt/entrypoints/http_server.py +20 -32
- sglang/srt/entrypoints/openai/protocol.py +3 -3
- sglang/srt/entrypoints/openai/serving_chat.py +48 -6
- sglang/srt/eplb/expert_location_dispatch.py +1 -1
- sglang/srt/function_call/base_format_detector.py +74 -12
- sglang/srt/function_call/deepseekv3_detector.py +26 -11
- sglang/srt/function_call/ebnf_composer.py +95 -63
- sglang/srt/function_call/function_call_parser.py +4 -2
- sglang/srt/function_call/kimik2_detector.py +41 -16
- sglang/srt/function_call/llama32_detector.py +6 -3
- sglang/srt/function_call/mistral_detector.py +11 -3
- sglang/srt/function_call/pythonic_detector.py +16 -14
- sglang/srt/function_call/qwen25_detector.py +12 -3
- sglang/srt/function_call/qwen3_coder_detector.py +151 -0
- sglang/srt/hf_transformers_utils.py +0 -1
- sglang/srt/layers/activation.py +24 -3
- sglang/srt/layers/attention/base_attn_backend.py +3 -1
- sglang/srt/layers/attention/flashattention_backend.py +3 -3
- sglang/srt/layers/attention/flashinfer_backend.py +40 -1
- sglang/srt/layers/communicator.py +12 -12
- sglang/srt/layers/dp_attention.py +72 -24
- sglang/srt/layers/linear.py +13 -102
- sglang/srt/layers/logits_processor.py +34 -24
- sglang/srt/layers/moe/ep_moe/kernels.py +4 -2
- sglang/srt/layers/moe/ep_moe/layer.py +23 -402
- sglang/srt/layers/moe/fused_moe_native.py +7 -47
- sglang/srt/layers/moe/fused_moe_triton/__init__.py +4 -4
- sglang/srt/layers/moe/fused_moe_triton/configs/triton_3_3_1/E=160,N=320,device_name=NVIDIA_H20-3e.json +146 -0
- sglang/srt/layers/moe/fused_moe_triton/configs/triton_3_3_1/E=384,N=128,device_name=NVIDIA_H20,dtype=fp8_w8a8,block_shape=[128, 128].json +146 -0
- sglang/srt/layers/moe/fused_moe_triton/configs/triton_3_3_1/E=384,N=128,device_name=NVIDIA_H200,dtype=fp8_w8a8,block_shape=[128, 128].json +146 -0
- sglang/srt/layers/moe/fused_moe_triton/configs/triton_3_3_1/E=384,N=256,device_name=NVIDIA_H20-3e,dtype=fp8_w8a8,block_shape=[128, 128].json +146 -0
- sglang/srt/layers/moe/fused_moe_triton/configs/triton_3_3_1/E=385,N=128,device_name=NVIDIA_B200,dtype=fp8_w8a8,block_shape=[128, 128].json +146 -0
- sglang/srt/layers/moe/fused_moe_triton/configs/triton_3_3_1/E=385,N=128,device_name=NVIDIA_H200,dtype=fp8_w8a8,block_shape=[128, 128].json +146 -0
- sglang/srt/layers/moe/fused_moe_triton/fused_moe.py +54 -263
- sglang/srt/layers/moe/fused_moe_triton/layer.py +14 -396
- sglang/srt/layers/moe/topk.py +190 -23
- sglang/srt/layers/quantization/__init__.py +20 -134
- sglang/srt/layers/quantization/awq.py +578 -11
- sglang/srt/layers/quantization/awq_triton.py +339 -0
- sglang/srt/layers/quantization/base_config.py +85 -10
- sglang/srt/layers/quantization/blockwise_int8.py +17 -55
- sglang/srt/layers/quantization/compressed_tensors/compressed_tensors.py +13 -11
- sglang/srt/layers/quantization/compressed_tensors/compressed_tensors_moe.py +23 -79
- sglang/srt/layers/quantization/fp8.py +273 -62
- sglang/srt/layers/quantization/fp8_kernel.py +210 -46
- sglang/srt/layers/quantization/fp8_utils.py +2 -2
- sglang/srt/layers/quantization/gptq.py +501 -143
- sglang/srt/layers/quantization/marlin_utils.py +790 -0
- sglang/srt/layers/quantization/modelopt_quant.py +34 -112
- sglang/srt/layers/quantization/moe_wna16.py +45 -49
- sglang/srt/layers/quantization/petit.py +252 -0
- sglang/srt/layers/quantization/petit_utils.py +104 -0
- sglang/srt/layers/quantization/qoq.py +7 -6
- sglang/srt/layers/quantization/scalar_type.py +352 -0
- sglang/srt/layers/quantization/unquant.py +422 -0
- sglang/srt/layers/quantization/utils.py +340 -9
- sglang/srt/layers/quantization/w4afp8.py +8 -4
- sglang/srt/layers/quantization/w8a8_fp8.py +17 -51
- sglang/srt/layers/quantization/w8a8_int8.py +51 -115
- sglang/srt/layers/radix_attention.py +5 -3
- sglang/srt/layers/vocab_parallel_embedding.py +1 -41
- sglang/srt/lora/lora.py +0 -4
- sglang/srt/lora/lora_manager.py +162 -164
- sglang/srt/lora/lora_registry.py +124 -0
- sglang/srt/lora/mem_pool.py +83 -35
- sglang/srt/lora/utils.py +12 -5
- sglang/srt/managers/cache_controller.py +288 -0
- sglang/srt/managers/io_struct.py +60 -30
- sglang/srt/managers/mm_utils.py +7 -8
- sglang/srt/managers/schedule_batch.py +163 -113
- sglang/srt/managers/schedule_policy.py +68 -27
- sglang/srt/managers/scheduler.py +256 -86
- sglang/srt/managers/scheduler_output_processor_mixin.py +22 -4
- sglang/srt/managers/tokenizer_manager.py +38 -27
- sglang/srt/managers/tp_worker.py +16 -4
- sglang/srt/managers/tp_worker_overlap_thread.py +11 -0
- sglang/srt/mem_cache/allocator.py +74 -23
- sglang/srt/mem_cache/base_prefix_cache.py +14 -2
- sglang/srt/mem_cache/chunk_cache.py +5 -2
- sglang/srt/mem_cache/hicache_storage.py +168 -0
- sglang/srt/mem_cache/hiradix_cache.py +194 -5
- sglang/srt/mem_cache/memory_pool.py +16 -1
- sglang/srt/mem_cache/memory_pool_host.py +44 -2
- sglang/srt/mem_cache/radix_cache.py +26 -0
- sglang/srt/mem_cache/swa_radix_cache.py +1025 -0
- sglang/srt/metrics/collector.py +9 -0
- sglang/srt/model_executor/cuda_graph_runner.py +66 -31
- sglang/srt/model_executor/forward_batch_info.py +210 -25
- sglang/srt/model_executor/model_runner.py +147 -42
- sglang/srt/model_loader/loader.py +7 -1
- sglang/srt/model_loader/utils.py +4 -4
- sglang/srt/models/clip.py +1 -1
- sglang/srt/models/deepseek.py +9 -6
- sglang/srt/models/deepseek_janus_pro.py +1 -1
- sglang/srt/models/deepseek_v2.py +192 -173
- sglang/srt/models/deepseek_vl2.py +5 -5
- sglang/srt/models/gemma.py +48 -0
- sglang/srt/models/gemma2.py +52 -0
- sglang/srt/models/gemma3_causal.py +63 -0
- sglang/srt/models/gemma3_mm.py +1 -1
- sglang/srt/models/gemma3n_mm.py +2 -4
- sglang/srt/models/granitemoe.py +385 -0
- sglang/srt/models/grok.py +9 -3
- sglang/srt/models/hunyuan.py +63 -16
- sglang/srt/models/internvl.py +1 -1
- sglang/srt/models/kimi_vl.py +1 -1
- sglang/srt/models/llama.py +41 -0
- sglang/srt/models/llama4.py +11 -11
- sglang/srt/models/llava.py +2 -2
- sglang/srt/models/llavavid.py +1 -1
- sglang/srt/models/minicpm.py +0 -2
- sglang/srt/models/minicpmo.py +3 -7
- sglang/srt/models/minicpmv.py +1 -1
- sglang/srt/models/mistral.py +1 -1
- sglang/srt/models/mixtral.py +9 -2
- sglang/srt/models/mllama.py +3 -5
- sglang/srt/models/mllama4.py +13 -6
- sglang/srt/models/olmoe.py +8 -5
- sglang/srt/models/persimmon.py +330 -0
- sglang/srt/models/phi.py +321 -0
- sglang/srt/models/phi4mm.py +44 -4
- sglang/srt/models/phi4mm_audio.py +1260 -0
- sglang/srt/models/phi4mm_utils.py +1917 -0
- sglang/srt/models/phimoe.py +9 -3
- sglang/srt/models/qwen.py +37 -0
- sglang/srt/models/qwen2.py +41 -0
- sglang/srt/models/qwen2_5_vl.py +4 -4
- sglang/srt/models/qwen2_audio.py +1 -1
- sglang/srt/models/qwen2_moe.py +53 -9
- sglang/srt/models/qwen2_vl.py +4 -4
- sglang/srt/models/qwen3.py +65 -1
- sglang/srt/models/qwen3_moe.py +57 -24
- sglang/srt/models/vila.py +1 -1
- sglang/srt/multimodal/processors/base_processor.py +91 -97
- sglang/srt/multimodal/processors/clip.py +21 -19
- sglang/srt/multimodal/processors/deepseek_vl_v2.py +8 -26
- sglang/srt/multimodal/processors/gemma3.py +13 -17
- sglang/srt/multimodal/processors/gemma3n.py +19 -23
- sglang/srt/multimodal/processors/internvl.py +9 -10
- sglang/srt/multimodal/processors/janus_pro.py +12 -27
- sglang/srt/multimodal/processors/kimi_vl.py +12 -14
- sglang/srt/multimodal/processors/llava.py +4 -2
- sglang/srt/multimodal/processors/minicpm.py +35 -44
- sglang/srt/multimodal/processors/mlama.py +21 -18
- sglang/srt/multimodal/processors/mllama4.py +4 -5
- sglang/srt/multimodal/processors/phi4mm.py +63 -39
- sglang/srt/multimodal/processors/pixtral.py +14 -35
- sglang/srt/multimodal/processors/qwen_audio.py +65 -0
- sglang/srt/multimodal/processors/qwen_vl.py +16 -21
- sglang/srt/multimodal/processors/vila.py +14 -14
- sglang/srt/reasoning_parser.py +46 -4
- sglang/srt/sampling/sampling_batch_info.py +6 -5
- sglang/srt/sampling/sampling_params.py +8 -1
- sglang/srt/server_args.py +454 -270
- sglang/srt/speculative/eagle_draft_cuda_graph_runner.py +33 -28
- sglang/srt/speculative/eagle_draft_extend_cuda_graph_runner.py +46 -37
- sglang/srt/speculative/eagle_utils.py +51 -23
- sglang/srt/speculative/eagle_worker.py +59 -44
- sglang/srt/two_batch_overlap.py +10 -5
- sglang/srt/utils.py +44 -69
- sglang/test/runners.py +14 -3
- sglang/test/test_activation.py +50 -1
- sglang/test/test_block_fp8.py +8 -3
- sglang/test/test_block_fp8_ep.py +1 -1
- sglang/test/test_custom_ops.py +12 -7
- sglang/test/test_cutlass_w4a8_moe.py +1 -3
- sglang/test/test_fp4_moe.py +1 -3
- sglang/test/test_marlin_moe.py +286 -0
- sglang/test/test_marlin_utils.py +171 -0
- sglang/test/test_utils.py +35 -0
- sglang/version.py +1 -1
- {sglang-0.4.9.post2.dist-info → sglang-0.4.9.post4.dist-info}/METADATA +10 -10
- {sglang-0.4.9.post2.dist-info → sglang-0.4.9.post4.dist-info}/RECORD +198 -175
- sglang/srt/layers/quantization/quant_utils.py +0 -166
- sglang/srt/managers/multimodal_processors/qwen_audio.py +0 -94
- {sglang-0.4.9.post2.dist-info → sglang-0.4.9.post4.dist-info}/WHEEL +0 -0
- {sglang-0.4.9.post2.dist-info → sglang-0.4.9.post4.dist-info}/licenses/LICENSE +0 -0
- {sglang-0.4.9.post2.dist-info → sglang-0.4.9.post4.dist-info}/top_level.txt +0 -0
sglang/srt/multimodal/processors/qwen_audio.py
ADDED
@@ -0,0 +1,65 @@
+import re
+
+from sglang.srt.managers.schedule_batch import Modality, MultimodalDataItem
+from sglang.srt.models.qwen2_audio import Qwen2AudioForConditionalGeneration
+from sglang.srt.multimodal.processors.base_processor import (
+    BaseMultimodalProcessor,
+    MultimodalSpecialTokens,
+)
+
+
+class Qwen2AudioMultimodalProcessor(BaseMultimodalProcessor):
+    models = [Qwen2AudioForConditionalGeneration]
+
+    def __init__(self, hf_config, server_args, _processor):
+        super().__init__(hf_config, server_args, _processor)
+        self.AUDIO_TOKEN = "<|audio_bos|><|AUDIO|><|audio_eos|>"
+        self.AUDIO_TOKEN_REGEX = re.compile(
+            r"<\|audio_bos\|>(?:<\|AUDIO\|>)+<\|audio_eos\|>"
+        )
+        # Collect special token ids
+        tokenizer = self._processor.tokenizer
+        self.audio_start_id = tokenizer.convert_tokens_to_ids("<|audio_bos|>")
+        self.audio_token_id = tokenizer.convert_tokens_to_ids("<|AUDIO|>")
+        self.audio_end_id = tokenizer.convert_tokens_to_ids("<|audio_eos|>")
+
+        self.mm_tokens = MultimodalSpecialTokens(
+            audio_token=self.AUDIO_TOKEN,
+            audio_token_regex=self.AUDIO_TOKEN_REGEX,
+            audio_token_id=self.audio_token_id,
+        ).build(_processor)
+
+    async def process_mm_data_async(
+        self,
+        audio_data,
+        input_text,
+        **kwargs,
+    ):
+        base_output = self.load_mm_data(
+            prompt=input_text,
+            audio_data=audio_data,
+            multimodal_tokens=self.mm_tokens,
+        )
+        if base_output is None:
+            return None
+
+        mm_items, input_ids, ret = self.process_and_combine_mm_data(
+            base_output, self.mm_tokens
+        )
+
+        assert (
+            "feature_attention_mask" in ret
+        ), "feature_attention_mask not found in processor output"
+        input_lengths = ret["feature_attention_mask"].sum(dim=-1)
+        input_lengths = (input_lengths - 1) // 2 + 1
+        output_lengths = (input_lengths - 2) // 2 + 1
+
+        mm_items[0].model_specific_data["audio_feature_lens"] = output_lengths
+
+        return {
+            "mm_items": mm_items,
+            "input_ids": input_ids.tolist(),
+            "audio_start_id": self.audio_start_id,
+            "audio_token_id": self.audio_token_id,
+            "audio_end_id": self.audio_end_id,
+        }
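The two integer divisions above compute how many encoder positions survive Qwen2-Audio's two stride-2 downsampling stages (that reading of the formulas is an interpretation; the actual stages live in the HF feature extractor). A standalone sketch of just that arithmetic, with invented mask values:

import torch

# Illustrative feature_attention_mask for two clips padded to 3000 mel frames,
# with 3000 and 1500 valid frames respectively (values invented for the demo).
feature_attention_mask = torch.zeros(2, 3000, dtype=torch.long)
feature_attention_mask[0, :3000] = 1
feature_attention_mask[1, :1500] = 1

input_lengths = feature_attention_mask.sum(dim=-1)
input_lengths = (input_lengths - 1) // 2 + 1   # first stride-2 stage
output_lengths = (input_lengths - 2) // 2 + 1  # second stride-2 stage

print(input_lengths.tolist())   # [1500, 750]
print(output_lengths.tolist())  # [750, 375]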
sglang/srt/multimodal/processors/qwen_vl.py
CHANGED
@@ -203,16 +203,9 @@ class Qwen2_5VLImageProcessor(SGLangBaseProcessor):
 
     def __init__(self, hf_config, server_args, _processor):
         super().__init__(hf_config, server_args, _processor)
-        # The single, pre-expanded image token.
-        self.IMAGE_TOKEN = "<|vision_start|><|image_pad|><|vision_end|>"
         # The regex that matches expanded image tokens.
-        self.IMAGE_TOKEN_REGEX = re.compile(
-            r"<\|vision_start\|>(?:<\|image_pad\|>)+<\|vision_end\|>"
-        )
         self.IM_START_TOKEN_ID = hf_config.vision_start_token_id
         self.IM_END_TOKEN_ID = hf_config.vision_end_token_id
-        self.IM_TOKEN_ID = hf_config.image_token_id
-        self.VIDEO_TOKEN_ID = hf_config.video_token_id
         self.vision_start_token_id = hf_config.vision_start_token_id
         self.vision_end_token_id = hf_config.vision_end_token_id
         self.NUM_TOKEN_PER_FRAME = 770
@@ -220,19 +213,20 @@ class Qwen2_5VLImageProcessor(SGLangBaseProcessor):
         self.MIN_PIXELS = 4 * 28 * 28
         self.MAX_PIXELS = 16384 * 28 * 28
         self.MAX_RATIO = 200
-
-
-
-            image_token_regex=
-
-
+        self.mm_tokens = MultimodalSpecialTokens(
+            image_token="<|vision_start|><|image_pad|><|vision_end|>",
+            image_token_id=hf_config.image_token_id,
+            image_token_regex=re.compile(
+                r"<\|vision_start\|>(?:<\|image_pad\|>)+<\|vision_end\|>"
+            ),
+            video_token_id=hf_config.video_token_id,
+        ).build(_processor)
 
     async def process_mm_data_async(
         self,
         image_data: List[Union[str, bytes]],
         input_text,
         request_obj,
-        max_req_input_len,
         *args,
         **kwargs,
     ):
@@ -241,8 +235,7 @@ class Qwen2_5VLImageProcessor(SGLangBaseProcessor):
             prompt=input_text,
             image_data=image_data,
             video_data=request_obj.video_data,
-            multimodal_tokens=self.
-            max_req_input_len=max_req_input_len,
+            multimodal_tokens=self.mm_tokens,
         )
 
         # Qwen-specific: resize images if they are raw Image objects
@@ -255,13 +248,15 @@ class Qwen2_5VLImageProcessor(SGLangBaseProcessor):
             await preprocess_video(video) for video in base_output.videos
         ]
 
-        mm_items, input_ids, ret = self.process_and_combine_mm_data(
+        mm_items, input_ids, ret = self.process_and_combine_mm_data(
+            base_output, self.mm_tokens
+        )
 
         input_ids = input_ids.flatten()
         mrope_positions, mrope_position_delta = MRotaryEmbedding.get_rope_index(
             spatial_merge_size=self.hf_config.vision_config.spatial_merge_size,
-            image_token_id=self.IM_TOKEN_ID,
-            video_token_id=self.VIDEO_TOKEN_ID,
+            image_token_id=self.mm_tokens.image_token_id,
+            video_token_id=self.mm_tokens.video_token_id,
             vision_start_token_id=self.vision_start_token_id,
             model_type=self.hf_config.model_type,
             tokens_per_second=getattr(
@@ -279,8 +274,8 @@ class Qwen2_5VLImageProcessor(SGLangBaseProcessor):
             "mm_items": mm_items,
             "im_start_id": self.IM_START_TOKEN_ID,
             "im_end_id": self.IM_END_TOKEN_ID,
-            "im_token_id": self.IM_TOKEN_ID,
-            "video_token_id": self.VIDEO_TOKEN_ID,
+            "im_token_id": self.mm_tokens.image_token_id,
+            "video_token_id": self.mm_tokens.video_token_id,
             "mrope_positions": mrope_positions,
             "mrope_position_delta": mrope_position_delta,
         }
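The literal token string and the regex now travel together in `MultimodalSpecialTokens`, so prompts whose image slot was already expanded into repeated `<|image_pad|>` tokens are still matched. A quick standalone check of the regex shown in the hunk above (the prompt string is invented):

import re

IMAGE_TOKEN_REGEX = re.compile(
    r"<\|vision_start\|>(?:<\|image_pad\|>)+<\|vision_end\|>"
)

prompt = (
    "Describe <|vision_start|><|image_pad|><|image_pad|><|image_pad|>"
    "<|vision_end|> briefly."
)
match = IMAGE_TOKEN_REGEX.search(prompt)
print(match.group(0).count("<|image_pad|>"))  # 3 expanded image placeholders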
sglang/srt/multimodal/processors/vila.py
CHANGED
@@ -1,4 +1,4 @@
-from typing import Any, Dict, List, Optional, Type
+from typing import Any, Dict, List, Optional, Type
 
 import torch.nn as nn
 from transformers.configuration_utils import PretrainedConfig
@@ -8,9 +8,8 @@ from transformers.tokenization_utils_base import PreTrainedTokenizerBase
 from sglang.srt.managers.io_struct import (
     EmbeddingReqInput,
     GenerateReqInput,
-
+    ImageDataInputItem,
 )
-from sglang.srt.managers.schedule_batch import Modality, MultimodalDataItem
 from sglang.srt.models.vila import VILAForConditionalGeneration
 from sglang.srt.multimodal.processors.base_processor import (
     BaseMultimodalProcessor,
@@ -37,31 +36,32 @@ class VILAMultimodalProcessor(BaseMultimodalProcessor):
         _processor: VILAProcessor,
     ) -> None:
         super().__init__(hf_config, server_args, _processor)
-        self.
-
+        self.mm_tokens = MultimodalSpecialTokens(
+            image_token=self._processor.tokenizer.image_token,
+            image_token_id=hf_config.image_token_id,
+            video_token_id=hf_config.video_token_id,
+        ).build(_processor)
 
     async def process_mm_data_async(
         self,
-        image_data: Optional[
+        image_data: Optional[ImageDataInputItem | List[ImageDataInputItem]],
         input_text: str | List[int],
         request_obj: GenerateReqInput | EmbeddingReqInput,
-        max_req_input_len: int,
         **kwargs,
     ) -> Optional[Dict[str, Any]]:
         base_output = self.load_mm_data(
             prompt=input_text,
-            multimodal_tokens=MultimodalSpecialTokens(
-                image_token=self._processor.tokenizer.image_token
-            ),
-            max_req_input_len=max_req_input_len,
+            multimodal_tokens=self.mm_tokens,
             image_data=image_data,
         )
 
-        mm_items, input_ids, _ = self.process_and_combine_mm_data(
+        mm_items, input_ids, _ = self.process_and_combine_mm_data(
+            base_output, self.mm_tokens
+        )
 
         return {
             "input_ids": input_ids.tolist(),
             "mm_items": mm_items,
-            "im_token_id": self.
-            "video_token_id": self.
+            "im_token_id": self.mm_tokens.image_token_id,
+            "video_token_id": self.mm_tokens.video_token_id,
         }
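The vila change mirrors the qwen_audio and qwen_vl diffs above: the token spec is built once in `__init__` via `.build(_processor)` and reused per request, replacing inline construction and the now-removed `max_req_input_len` plumbing. A minimal sketch of the shared pattern (illustrative names, not sglang's real classes; the processor is assumed to expose a HF-style tokenizer):

class MMTokensSketch:
    """Stand-in for MultimodalSpecialTokens; only the shape matters here."""

    def __init__(self, image_token: str):
        self.image_token = image_token
        self.image_token_id = None

    def build(self, processor):
        # Resolve ids against the tokenizer once, at processor construction.
        self.image_token_id = processor.tokenizer.convert_tokens_to_ids(
            self.image_token
        )
        return self


class ProcessorSketch:
    def __init__(self, processor):
        # Built once here ...
        self.mm_tokens = MMTokensSketch("<image>").build(processor)

    async def process(self, prompt: str):
        # ... and reused on every request instead of being rebuilt inline.
        return {"prompt": prompt, "im_token_id": self.mm_tokens.image_token_id}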
sglang/srt/reasoning_parser.py
CHANGED
@@ -118,6 +118,14 @@ class DeepSeekR1Detector(BaseReasoningFormatDetector):
     Returns all the text before the </think> tag as `reasoning_text`
     and the rest of the text as `normal_text`.
 
+    Supported models:
+    - DeepSeek-R1: Always generates thinking content without a <think> start tag
+    - DeepSeek-R1-0528: Generates thinking content with a <think> start tag
+
+    Format patterns:
+    - DeepSeek-R1: "I need to think about this...</think>The answer is 42."
+    - DeepSeek-R1-0528: "<think>I need to think about this...</think>The answer is 42."
+
     Args:
         stream_reasoning (bool): If False, accumulates reasoning content until the end tag.
             If True, streams reasoning content as it arrives.
@@ -136,11 +144,20 @@ class DeepSeekR1Detector(BaseReasoningFormatDetector):
 
 class Qwen3Detector(BaseReasoningFormatDetector):
     """
-    Detector for Qwen3
+    Detector for standard Qwen3 models (e.g., Qwen/Qwen3-235B-A22B).
     Assumes reasoning format:
       (<think>)*(.*)</think>
-
-
+
+    Qwen3 models released before 07/2025 support switching between thinking mode and
+    normal mode via the `enable_thinking` request parameter.
+    - enable_thinking=True: "<think>reasoning content</think>The answer is 42."
+    - enable_thinking=False: "The answer is 42." (no thinking tokens)
+
+    This detector handles both cases.
+
+    NOTE: Do NOT use this detector for Qwen3-Thinking models (e.g., Qwen3-Thinking-2507).
+    Those models always generate thinking content without <think> start tags.
+    Use the "qwen3-thinking" parser type for those models instead.
 
     Args:
         stream_reasoning (bool): If False, accumulates reasoning content until the end tag.
@@ -148,7 +165,6 @@ class Qwen3Detector(BaseReasoningFormatDetector):
     """
 
     def __init__(self, stream_reasoning: bool = True):
-        # Qwen3 won't be in reasoning mode when user passes `enable_thinking=False`
         super().__init__(
             "<think>",
             "</think>",
@@ -157,6 +173,31 @@ class Qwen3Detector(BaseReasoningFormatDetector):
         )
 
 
+class Qwen3ThinkingDetector(BaseReasoningFormatDetector):
+    """
+    Detector for Qwen3-Thinking models (e.g., Qwen3-Thinking-2507).
+    Assumes reasoning format:
+      *(.*)</think>
+
+    These models always generate thinking content without a <think> start tag.
+    They do not support the enable_thinking parameter and always think.
+
+    Format: "I need to think about this...</think>The answer is 42."
+
+    Args:
+        stream_reasoning (bool): If False, accumulates reasoning content until the end tag.
+            If True, streams reasoning content as it arrives.
+    """
+
+    def __init__(self, stream_reasoning: bool = True):
+        super().__init__(
+            "<think>",
+            "</think>",
+            force_reasoning=True,
+            stream_reasoning=stream_reasoning,
+        )
+
+
 class KimiDetector(BaseReasoningFormatDetector):
     """
     Detector for Kimi Thinking model.
@@ -189,6 +230,7 @@ class ReasoningParser:
     DetectorMap: Dict[str, Type[BaseReasoningFormatDetector]] = {
         "deepseek-r1": DeepSeekR1Detector,
         "qwen3": Qwen3Detector,
+        "qwen3-thinking": Qwen3ThinkingDetector,
         "kimi": KimiDetector,
     }
 
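The practical difference between the "qwen3" and "qwen3-thinking" entries is the `force_reasoning=True` flag: a Qwen3-Thinking response carries no `<think>` start tag, so the detector must assume it is already inside reasoning. A self-contained sketch of that behavior (`split_reasoning` is an illustrative stand-in, not sglang's detector API):

def split_reasoning(text: str, force_reasoning: bool):
    think_start, think_end = "<think>", "</think>"
    in_reasoning = force_reasoning or text.startswith(think_start)
    if not in_reasoning:
        return "", text  # e.g. Qwen3 with enable_thinking=False
    body = text.removeprefix(think_start)
    reasoning, sep, normal = body.partition(think_end)
    return (reasoning, normal) if sep else (reasoning, "")

# Qwen3 (hybrid): start tag present, either flag value works.
print(split_reasoning("<think>step 1...</think>42", force_reasoning=False))
# Qwen3-Thinking: no start tag, so force_reasoning=True is required;
# with False, the reasoning would be misread as normal output.
print(split_reasoning("step 1...</think>42", force_reasoning=True))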
sglang/srt/sampling/sampling_batch_info.py
CHANGED
@@ -322,6 +322,12 @@ class SamplingBatchInfo:
         # Set the flag to True if any of the two has custom logit processor
         self.has_custom_logit_processor = True
 
+        # Merge logit bias. Note: this must run before the temperatures tensor update
+        # below, otherwise it will crash. See the note below on len(self) and len(other).
+        self.logit_bias = merge_bias_tensor(
+            self.logit_bias, other.logit_bias, len(self), len(other), self.device, 0.0
+        )
+
         # Note: because the __len()__ operator is defined on the temperatures tensor,
         # please make sure any merge operation with len(self) or len(other) is done before
         # the merge operation of the temperatures tensor below.
@@ -340,11 +346,6 @@
         self.need_top_k_sampling |= other.need_top_k_sampling
         self.need_min_p_sampling |= other.need_min_p_sampling
 
-        # Merge logit bias
-        self.logit_bias = merge_bias_tensor(
-            self.logit_bias, other.logit_bias, len(self), len(other), self.device, 0.0
-        )
-
 
 def merge_bias_tensor(
     lhs: Optional[torch.Tensor],
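The hoisted merge matters because `len(self)` on SamplingBatchInfo is backed by the temperatures tensor: `merge_bias_tensor` pads whichever side lacks a bias out to the pre-merge batch sizes, so it must run before the temperatures are concatenated. A toy illustration (illustrative class, not sglang's; the zero-fill stands in for merge_bias_tensor's `default` padding):

from typing import Optional

import torch


class BatchSketch:
    def __init__(self, temperatures: torch.Tensor, logit_bias: Optional[torch.Tensor]):
        self.temperatures = temperatures
        self.logit_bias = logit_bias

    def __len__(self):
        # Like SamplingBatchInfo, batch size is read off the temperatures tensor.
        return self.temperatures.shape[0]

    def merge(self, other: "BatchSketch", vocab: int = 8):
        bs1, bs2 = len(self), len(other)  # correct only *before* the concat below
        if self.logit_bias is not None or other.logit_bias is not None:
            lhs = self.logit_bias if self.logit_bias is not None else torch.zeros(bs1, vocab)
            rhs = other.logit_bias if other.logit_bias is not None else torch.zeros(bs2, vocab)
            self.logit_bias = torch.cat([lhs, rhs])
        # After this line, len(self) already reports bs1 + bs2.
        self.temperatures = torch.cat([self.temperatures, other.temperatures])


a = BatchSketch(torch.ones(2), torch.full((2, 8), 1.0))
b = BatchSketch(torch.ones(3), None)
a.merge(b)
print(len(a), a.logit_bias.shape)  # 5 torch.Size([5, 8])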
sglang/srt/sampling/sampling_params.py
CHANGED
@@ -89,7 +89,7 @@ class SamplingParams:
         if self.top_k == -1:
             self.top_k = TOP_K_ALL  # whole vocabulary
 
-    def verify(self):
+    def verify(self, vocab_size):
         if self.temperature < 0.0:
             raise ValueError(
                 f"temperature must be non-negative, got {self.temperature}."
@@ -131,6 +131,13 @@ class SamplingParams:
                 f"min_new_tokens must be in [0, max_new_tokens({self.max_new_tokens})], got "
                 f"{self.min_new_tokens}."
             )
+        if self.logit_bias is not None:
+            for token_id in self.logit_bias:
+                if not 0 <= int(token_id) < vocab_size:
+                    raise ValueError(
+                        f"logit_bias must have keys in [0, {vocab_size - 1}], got "
+                        f"{token_id}."
+                    )
         grammars = [
             self.json_schema,
             self.regex,
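`verify` now takes the model's vocabulary size so it can reject `logit_bias` keys that name nonexistent token ids before they reach the sampler. A standalone repro of just that check (the vocab_size and bias dicts are invented):

vocab_size = 32000  # illustrative; sglang passes this in from the model config


def check_logit_bias(logit_bias):
    # Mirrors the validation added above: every key must be a real token id.
    if logit_bias is not None:
        for token_id in logit_bias:
            if not 0 <= int(token_id) < vocab_size:
                raise ValueError(
                    f"logit_bias must have keys in [0, {vocab_size - 1}], got {token_id}."
                )


check_logit_bias({"10": 1.5, "31999": -2.0})  # passes
check_logit_bias({"32000": 1.0})              # raises ValueError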