sglang 0.4.9.post2__py3-none-any.whl → 0.4.9.post4__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (200)
  1. sglang/bench_one_batch.py +2 -1
  2. sglang/eval/loogle_eval.py +7 -0
  3. sglang/srt/_custom_ops.py +29 -1
  4. sglang/srt/configs/deepseekvl2.py +11 -2
  5. sglang/srt/configs/internvl.py +3 -0
  6. sglang/srt/configs/janus_pro.py +3 -0
  7. sglang/srt/configs/model_config.py +10 -8
  8. sglang/srt/configs/update_config.py +3 -1
  9. sglang/srt/conversation.py +2 -1
  10. sglang/srt/custom_op.py +5 -2
  11. sglang/srt/disaggregation/common/conn.py +34 -6
  12. sglang/srt/disaggregation/decode.py +9 -1
  13. sglang/srt/disaggregation/mini_lb.py +3 -2
  14. sglang/srt/disaggregation/mooncake/conn.py +93 -76
  15. sglang/srt/disaggregation/mooncake/transfer_engine.py +4 -2
  16. sglang/srt/disaggregation/nixl/conn.py +17 -13
  17. sglang/srt/distributed/device_communicators/custom_all_reduce.py +3 -91
  18. sglang/srt/distributed/device_communicators/custom_all_reduce_utils.py +96 -1
  19. sglang/srt/distributed/device_communicators/quick_all_reduce.py +273 -0
  20. sglang/srt/distributed/device_communicators/shm_broadcast.py +12 -5
  21. sglang/srt/distributed/parallel_state.py +103 -15
  22. sglang/srt/entrypoints/engine.py +31 -33
  23. sglang/srt/entrypoints/http_server.py +20 -32
  24. sglang/srt/entrypoints/openai/protocol.py +3 -3
  25. sglang/srt/entrypoints/openai/serving_chat.py +48 -6
  26. sglang/srt/eplb/expert_location_dispatch.py +1 -1
  27. sglang/srt/function_call/base_format_detector.py +74 -12
  28. sglang/srt/function_call/deepseekv3_detector.py +26 -11
  29. sglang/srt/function_call/ebnf_composer.py +95 -63
  30. sglang/srt/function_call/function_call_parser.py +4 -2
  31. sglang/srt/function_call/kimik2_detector.py +41 -16
  32. sglang/srt/function_call/llama32_detector.py +6 -3
  33. sglang/srt/function_call/mistral_detector.py +11 -3
  34. sglang/srt/function_call/pythonic_detector.py +16 -14
  35. sglang/srt/function_call/qwen25_detector.py +12 -3
  36. sglang/srt/function_call/qwen3_coder_detector.py +151 -0
  37. sglang/srt/hf_transformers_utils.py +0 -1
  38. sglang/srt/layers/activation.py +24 -3
  39. sglang/srt/layers/attention/base_attn_backend.py +3 -1
  40. sglang/srt/layers/attention/flashattention_backend.py +3 -3
  41. sglang/srt/layers/attention/flashinfer_backend.py +40 -1
  42. sglang/srt/layers/communicator.py +12 -12
  43. sglang/srt/layers/dp_attention.py +72 -24
  44. sglang/srt/layers/linear.py +13 -102
  45. sglang/srt/layers/logits_processor.py +34 -24
  46. sglang/srt/layers/moe/ep_moe/kernels.py +4 -2
  47. sglang/srt/layers/moe/ep_moe/layer.py +23 -402
  48. sglang/srt/layers/moe/fused_moe_native.py +7 -47
  49. sglang/srt/layers/moe/fused_moe_triton/__init__.py +4 -4
  50. sglang/srt/layers/moe/fused_moe_triton/configs/triton_3_3_1/E=160,N=320,device_name=NVIDIA_H20-3e.json +146 -0
  51. sglang/srt/layers/moe/fused_moe_triton/configs/triton_3_3_1/E=384,N=128,device_name=NVIDIA_H20,dtype=fp8_w8a8,block_shape=[128, 128].json +146 -0
  52. sglang/srt/layers/moe/fused_moe_triton/configs/triton_3_3_1/E=384,N=128,device_name=NVIDIA_H200,dtype=fp8_w8a8,block_shape=[128, 128].json +146 -0
  53. sglang/srt/layers/moe/fused_moe_triton/configs/triton_3_3_1/E=384,N=256,device_name=NVIDIA_H20-3e,dtype=fp8_w8a8,block_shape=[128, 128].json +146 -0
  54. sglang/srt/layers/moe/fused_moe_triton/configs/triton_3_3_1/E=385,N=128,device_name=NVIDIA_B200,dtype=fp8_w8a8,block_shape=[128, 128].json +146 -0
  55. sglang/srt/layers/moe/fused_moe_triton/configs/triton_3_3_1/E=385,N=128,device_name=NVIDIA_H200,dtype=fp8_w8a8,block_shape=[128, 128].json +146 -0
  56. sglang/srt/layers/moe/fused_moe_triton/fused_moe.py +54 -263
  57. sglang/srt/layers/moe/fused_moe_triton/layer.py +14 -396
  58. sglang/srt/layers/moe/topk.py +190 -23
  59. sglang/srt/layers/quantization/__init__.py +20 -134
  60. sglang/srt/layers/quantization/awq.py +578 -11
  61. sglang/srt/layers/quantization/awq_triton.py +339 -0
  62. sglang/srt/layers/quantization/base_config.py +85 -10
  63. sglang/srt/layers/quantization/blockwise_int8.py +17 -55
  64. sglang/srt/layers/quantization/compressed_tensors/compressed_tensors.py +13 -11
  65. sglang/srt/layers/quantization/compressed_tensors/compressed_tensors_moe.py +23 -79
  66. sglang/srt/layers/quantization/fp8.py +273 -62
  67. sglang/srt/layers/quantization/fp8_kernel.py +210 -46
  68. sglang/srt/layers/quantization/fp8_utils.py +2 -2
  69. sglang/srt/layers/quantization/gptq.py +501 -143
  70. sglang/srt/layers/quantization/marlin_utils.py +790 -0
  71. sglang/srt/layers/quantization/modelopt_quant.py +34 -112
  72. sglang/srt/layers/quantization/moe_wna16.py +45 -49
  73. sglang/srt/layers/quantization/petit.py +252 -0
  74. sglang/srt/layers/quantization/petit_utils.py +104 -0
  75. sglang/srt/layers/quantization/qoq.py +7 -6
  76. sglang/srt/layers/quantization/scalar_type.py +352 -0
  77. sglang/srt/layers/quantization/unquant.py +422 -0
  78. sglang/srt/layers/quantization/utils.py +340 -9
  79. sglang/srt/layers/quantization/w4afp8.py +8 -4
  80. sglang/srt/layers/quantization/w8a8_fp8.py +17 -51
  81. sglang/srt/layers/quantization/w8a8_int8.py +51 -115
  82. sglang/srt/layers/radix_attention.py +5 -3
  83. sglang/srt/layers/vocab_parallel_embedding.py +1 -41
  84. sglang/srt/lora/lora.py +0 -4
  85. sglang/srt/lora/lora_manager.py +162 -164
  86. sglang/srt/lora/lora_registry.py +124 -0
  87. sglang/srt/lora/mem_pool.py +83 -35
  88. sglang/srt/lora/utils.py +12 -5
  89. sglang/srt/managers/cache_controller.py +288 -0
  90. sglang/srt/managers/io_struct.py +60 -30
  91. sglang/srt/managers/mm_utils.py +7 -8
  92. sglang/srt/managers/schedule_batch.py +163 -113
  93. sglang/srt/managers/schedule_policy.py +68 -27
  94. sglang/srt/managers/scheduler.py +256 -86
  95. sglang/srt/managers/scheduler_output_processor_mixin.py +22 -4
  96. sglang/srt/managers/tokenizer_manager.py +38 -27
  97. sglang/srt/managers/tp_worker.py +16 -4
  98. sglang/srt/managers/tp_worker_overlap_thread.py +11 -0
  99. sglang/srt/mem_cache/allocator.py +74 -23
  100. sglang/srt/mem_cache/base_prefix_cache.py +14 -2
  101. sglang/srt/mem_cache/chunk_cache.py +5 -2
  102. sglang/srt/mem_cache/hicache_storage.py +168 -0
  103. sglang/srt/mem_cache/hiradix_cache.py +194 -5
  104. sglang/srt/mem_cache/memory_pool.py +16 -1
  105. sglang/srt/mem_cache/memory_pool_host.py +44 -2
  106. sglang/srt/mem_cache/radix_cache.py +26 -0
  107. sglang/srt/mem_cache/swa_radix_cache.py +1025 -0
  108. sglang/srt/metrics/collector.py +9 -0
  109. sglang/srt/model_executor/cuda_graph_runner.py +66 -31
  110. sglang/srt/model_executor/forward_batch_info.py +210 -25
  111. sglang/srt/model_executor/model_runner.py +147 -42
  112. sglang/srt/model_loader/loader.py +7 -1
  113. sglang/srt/model_loader/utils.py +4 -4
  114. sglang/srt/models/clip.py +1 -1
  115. sglang/srt/models/deepseek.py +9 -6
  116. sglang/srt/models/deepseek_janus_pro.py +1 -1
  117. sglang/srt/models/deepseek_v2.py +192 -173
  118. sglang/srt/models/deepseek_vl2.py +5 -5
  119. sglang/srt/models/gemma.py +48 -0
  120. sglang/srt/models/gemma2.py +52 -0
  121. sglang/srt/models/gemma3_causal.py +63 -0
  122. sglang/srt/models/gemma3_mm.py +1 -1
  123. sglang/srt/models/gemma3n_mm.py +2 -4
  124. sglang/srt/models/granitemoe.py +385 -0
  125. sglang/srt/models/grok.py +9 -3
  126. sglang/srt/models/hunyuan.py +63 -16
  127. sglang/srt/models/internvl.py +1 -1
  128. sglang/srt/models/kimi_vl.py +1 -1
  129. sglang/srt/models/llama.py +41 -0
  130. sglang/srt/models/llama4.py +11 -11
  131. sglang/srt/models/llava.py +2 -2
  132. sglang/srt/models/llavavid.py +1 -1
  133. sglang/srt/models/minicpm.py +0 -2
  134. sglang/srt/models/minicpmo.py +3 -7
  135. sglang/srt/models/minicpmv.py +1 -1
  136. sglang/srt/models/mistral.py +1 -1
  137. sglang/srt/models/mixtral.py +9 -2
  138. sglang/srt/models/mllama.py +3 -5
  139. sglang/srt/models/mllama4.py +13 -6
  140. sglang/srt/models/olmoe.py +8 -5
  141. sglang/srt/models/persimmon.py +330 -0
  142. sglang/srt/models/phi.py +321 -0
  143. sglang/srt/models/phi4mm.py +44 -4
  144. sglang/srt/models/phi4mm_audio.py +1260 -0
  145. sglang/srt/models/phi4mm_utils.py +1917 -0
  146. sglang/srt/models/phimoe.py +9 -3
  147. sglang/srt/models/qwen.py +37 -0
  148. sglang/srt/models/qwen2.py +41 -0
  149. sglang/srt/models/qwen2_5_vl.py +4 -4
  150. sglang/srt/models/qwen2_audio.py +1 -1
  151. sglang/srt/models/qwen2_moe.py +53 -9
  152. sglang/srt/models/qwen2_vl.py +4 -4
  153. sglang/srt/models/qwen3.py +65 -1
  154. sglang/srt/models/qwen3_moe.py +57 -24
  155. sglang/srt/models/vila.py +1 -1
  156. sglang/srt/multimodal/processors/base_processor.py +91 -97
  157. sglang/srt/multimodal/processors/clip.py +21 -19
  158. sglang/srt/multimodal/processors/deepseek_vl_v2.py +8 -26
  159. sglang/srt/multimodal/processors/gemma3.py +13 -17
  160. sglang/srt/multimodal/processors/gemma3n.py +19 -23
  161. sglang/srt/multimodal/processors/internvl.py +9 -10
  162. sglang/srt/multimodal/processors/janus_pro.py +12 -27
  163. sglang/srt/multimodal/processors/kimi_vl.py +12 -14
  164. sglang/srt/multimodal/processors/llava.py +4 -2
  165. sglang/srt/multimodal/processors/minicpm.py +35 -44
  166. sglang/srt/multimodal/processors/mlama.py +21 -18
  167. sglang/srt/multimodal/processors/mllama4.py +4 -5
  168. sglang/srt/multimodal/processors/phi4mm.py +63 -39
  169. sglang/srt/multimodal/processors/pixtral.py +14 -35
  170. sglang/srt/multimodal/processors/qwen_audio.py +65 -0
  171. sglang/srt/multimodal/processors/qwen_vl.py +16 -21
  172. sglang/srt/multimodal/processors/vila.py +14 -14
  173. sglang/srt/reasoning_parser.py +46 -4
  174. sglang/srt/sampling/sampling_batch_info.py +6 -5
  175. sglang/srt/sampling/sampling_params.py +8 -1
  176. sglang/srt/server_args.py +454 -270
  177. sglang/srt/speculative/eagle_draft_cuda_graph_runner.py +33 -28
  178. sglang/srt/speculative/eagle_draft_extend_cuda_graph_runner.py +46 -37
  179. sglang/srt/speculative/eagle_utils.py +51 -23
  180. sglang/srt/speculative/eagle_worker.py +59 -44
  181. sglang/srt/two_batch_overlap.py +10 -5
  182. sglang/srt/utils.py +44 -69
  183. sglang/test/runners.py +14 -3
  184. sglang/test/test_activation.py +50 -1
  185. sglang/test/test_block_fp8.py +8 -3
  186. sglang/test/test_block_fp8_ep.py +1 -1
  187. sglang/test/test_custom_ops.py +12 -7
  188. sglang/test/test_cutlass_w4a8_moe.py +1 -3
  189. sglang/test/test_fp4_moe.py +1 -3
  190. sglang/test/test_marlin_moe.py +286 -0
  191. sglang/test/test_marlin_utils.py +171 -0
  192. sglang/test/test_utils.py +35 -0
  193. sglang/version.py +1 -1
  194. {sglang-0.4.9.post2.dist-info → sglang-0.4.9.post4.dist-info}/METADATA +10 -10
  195. {sglang-0.4.9.post2.dist-info → sglang-0.4.9.post4.dist-info}/RECORD +198 -175
  196. sglang/srt/layers/quantization/quant_utils.py +0 -166
  197. sglang/srt/managers/multimodal_processors/qwen_audio.py +0 -94
  198. {sglang-0.4.9.post2.dist-info → sglang-0.4.9.post4.dist-info}/WHEEL +0 -0
  199. {sglang-0.4.9.post2.dist-info → sglang-0.4.9.post4.dist-info}/licenses/LICENSE +0 -0
  200. {sglang-0.4.9.post2.dist-info → sglang-0.4.9.post4.dist-info}/top_level.txt +0 -0
sglang/srt/multimodal/processors/qwen_audio.py (new file)
@@ -0,0 +1,65 @@
+ import re
+
+ from sglang.srt.managers.schedule_batch import Modality, MultimodalDataItem
+ from sglang.srt.models.qwen2_audio import Qwen2AudioForConditionalGeneration
+ from sglang.srt.multimodal.processors.base_processor import (
+     BaseMultimodalProcessor,
+     MultimodalSpecialTokens,
+ )
+
+
+ class Qwen2AudioMultimodalProcessor(BaseMultimodalProcessor):
+     models = [Qwen2AudioForConditionalGeneration]
+
+     def __init__(self, hf_config, server_args, _processor):
+         super().__init__(hf_config, server_args, _processor)
+         self.AUDIO_TOKEN = "<|audio_bos|><|AUDIO|><|audio_eos|>"
+         self.AUDIO_TOKEN_REGEX = re.compile(
+             r"<\|audio_bos\|>(?:<\|AUDIO\|>)+<\|audio_eos\|>"
+         )
+         # Collect special token ids
+         tokenizer = self._processor.tokenizer
+         self.audio_start_id = tokenizer.convert_tokens_to_ids("<|audio_bos|>")
+         self.audio_token_id = tokenizer.convert_tokens_to_ids("<|AUDIO|>")
+         self.audio_end_id = tokenizer.convert_tokens_to_ids("<|audio_eos|>")
+
+         self.mm_tokens = MultimodalSpecialTokens(
+             audio_token=self.AUDIO_TOKEN,
+             audio_token_regex=self.AUDIO_TOKEN_REGEX,
+             audio_token_id=self.audio_token_id,
+         ).build(_processor)
+
+     async def process_mm_data_async(
+         self,
+         audio_data,
+         input_text,
+         **kwargs,
+     ):
+         base_output = self.load_mm_data(
+             prompt=input_text,
+             audio_data=audio_data,
+             multimodal_tokens=self.mm_tokens,
+         )
+         if base_output is None:
+             return None
+
+         mm_items, input_ids, ret = self.process_and_combine_mm_data(
+             base_output, self.mm_tokens
+         )
+
+         assert (
+             "feature_attention_mask" in ret
+         ), "feature_attention_mask not found in processor output"
+         input_lengths = ret["feature_attention_mask"].sum(dim=-1)
+         input_lengths = (input_lengths - 1) // 2 + 1
+         output_lengths = (input_lengths - 2) // 2 + 1
+
+         mm_items[0].model_specific_data["audio_feature_lens"] = output_lengths
+
+         return {
+             "mm_items": mm_items,
+             "input_ids": input_ids.tolist(),
+             "audio_start_id": self.audio_start_id,
+             "audio_token_id": self.audio_token_id,
+             "audio_end_id": self.audio_end_id,
+         }
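The two integer divisions above turn the number of valid mel frames per clip (from feature_attention_mask) into the number of audio embeddings the language model will see, presumably mirroring the audio encoder's stride-2 downsampling stages. A standalone sketch with an assumed 3000-frame clip (illustrative values, not sglang code):

import torch

# One clip with 3000 valid mel frames (assumed example input).
feature_attention_mask = torch.ones(1, 3000, dtype=torch.long)

input_lengths = feature_attention_mask.sum(dim=-1)  # tensor([3000])
input_lengths = (input_lengths - 1) // 2 + 1         # tensor([1500]) after the first stride-2 stage
output_lengths = (input_lengths - 2) // 2 + 1        # tensor([750]) audio features per clip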
sglang/srt/multimodal/processors/qwen_vl.py
@@ -203,16 +203,9 @@ class Qwen2_5VLImageProcessor(SGLangBaseProcessor):
 
      def __init__(self, hf_config, server_args, _processor):
          super().__init__(hf_config, server_args, _processor)
-         # The single, pre-expanded image token.
-         self.IMAGE_TOKEN = "<|vision_start|><|image_pad|><|vision_end|>"
          # The regex that matches expanded image tokens.
-         self.IMAGE_TOKEN_REGEX = re.compile(
-             r"<\|vision_start\|>(?:<\|image_pad\|>)+<\|vision_end\|>"
-         )
          self.IM_START_TOKEN_ID = hf_config.vision_start_token_id
          self.IM_END_TOKEN_ID = hf_config.vision_end_token_id
-         self.IM_TOKEN_ID = hf_config.image_token_id
-         self.VIDEO_TOKEN_ID = hf_config.video_token_id
          self.vision_start_token_id = hf_config.vision_start_token_id
          self.vision_end_token_id = hf_config.vision_end_token_id
          self.NUM_TOKEN_PER_FRAME = 770
@@ -220,19 +213,20 @@ class Qwen2_5VLImageProcessor(SGLangBaseProcessor):
          self.MIN_PIXELS = 4 * 28 * 28
          self.MAX_PIXELS = 16384 * 28 * 28
          self.MAX_RATIO = 200
-         # TODO(mick): move all MultimodalSpecialTokens initializations into processor init
-         self.mm_special_tokens = MultimodalSpecialTokens(
-             image_token=self.IMAGE_TOKEN,
-             image_token_regex=self.IMAGE_TOKEN_REGEX,
-             video_token=self.VIDEO_TOKEN_ID,
-         )
+         self.mm_tokens = MultimodalSpecialTokens(
+             image_token="<|vision_start|><|image_pad|><|vision_end|>",
+             image_token_id=hf_config.image_token_id,
+             image_token_regex=re.compile(
+                 r"<\|vision_start\|>(?:<\|image_pad\|>)+<\|vision_end\|>"
+             ),
+             video_token_id=hf_config.video_token_id,
+         ).build(_processor)
 
      async def process_mm_data_async(
          self,
          image_data: List[Union[str, bytes]],
          input_text,
          request_obj,
-         max_req_input_len,
          *args,
          **kwargs,
      ):
@@ -241,8 +235,7 @@ class Qwen2_5VLImageProcessor(SGLangBaseProcessor):
              prompt=input_text,
              image_data=image_data,
              video_data=request_obj.video_data,
-             multimodal_tokens=self.mm_special_tokens,
-             max_req_input_len=max_req_input_len,
+             multimodal_tokens=self.mm_tokens,
          )
 
          # Qwen-specific: resize images if they are raw Image objects
@@ -255,13 +248,15 @@ class Qwen2_5VLImageProcessor(SGLangBaseProcessor):
              await preprocess_video(video) for video in base_output.videos
          ]
 
-         mm_items, input_ids, ret = self.process_and_combine_mm_data(base_output)
+         mm_items, input_ids, ret = self.process_and_combine_mm_data(
+             base_output, self.mm_tokens
+         )
 
          input_ids = input_ids.flatten()
          mrope_positions, mrope_position_delta = MRotaryEmbedding.get_rope_index(
              spatial_merge_size=self.hf_config.vision_config.spatial_merge_size,
-             image_token_id=self.IM_TOKEN_ID,
-             video_token_id=self.VIDEO_TOKEN_ID,
+             image_token_id=self.mm_tokens.image_token_id,
+             video_token_id=self.mm_tokens.video_token_id,
              vision_start_token_id=self.vision_start_token_id,
              model_type=self.hf_config.model_type,
              tokens_per_second=getattr(
@@ -279,8 +274,8 @@ class Qwen2_5VLImageProcessor(SGLangBaseProcessor):
              "mm_items": mm_items,
              "im_start_id": self.IM_START_TOKEN_ID,
              "im_end_id": self.IM_END_TOKEN_ID,
-             "im_token_id": self.IM_TOKEN_ID,
-             "video_token_id": self.VIDEO_TOKEN_ID,
+             "im_token_id": self.mm_tokens.image_token_id,
+             "video_token_id": self.mm_tokens.video_token_id,
              "mrope_positions": mrope_positions,
              "mrope_position_delta": mrope_position_delta,
          }
sglang/srt/multimodal/processors/vila.py
@@ -1,4 +1,4 @@
- from typing import Any, Dict, List, Optional, Type, cast
+ from typing import Any, Dict, List, Optional, Type
 
  import torch.nn as nn
  from transformers.configuration_utils import PretrainedConfig
@@ -8,9 +8,8 @@ from transformers.tokenization_utils_base import PreTrainedTokenizerBase
  from sglang.srt.managers.io_struct import (
      EmbeddingReqInput,
      GenerateReqInput,
-     ImageDataItem,
+     ImageDataInputItem,
  )
- from sglang.srt.managers.schedule_batch import Modality, MultimodalDataItem
  from sglang.srt.models.vila import VILAForConditionalGeneration
  from sglang.srt.multimodal.processors.base_processor import (
      BaseMultimodalProcessor,
@@ -37,31 +36,32 @@ class VILAMultimodalProcessor(BaseMultimodalProcessor):
          _processor: VILAProcessor,
      ) -> None:
          super().__init__(hf_config, server_args, _processor)
-         self.IM_TOKEN_ID = hf_config.image_token_id
-         self.VIDEO_TOKEN_ID = hf_config.video_token_id
+         self.mm_tokens = MultimodalSpecialTokens(
+             image_token=self._processor.tokenizer.image_token,
+             image_token_id=hf_config.image_token_id,
+             video_token_id=hf_config.video_token_id,
+         ).build(_processor)
 
      async def process_mm_data_async(
          self,
-         image_data: Optional[ImageDataItem | List[ImageDataItem]],
+         image_data: Optional[ImageDataInputItem | List[ImageDataInputItem]],
          input_text: str | List[int],
          request_obj: GenerateReqInput | EmbeddingReqInput,
-         max_req_input_len: int,
          **kwargs,
      ) -> Optional[Dict[str, Any]]:
          base_output = self.load_mm_data(
              prompt=input_text,
-             multimodal_tokens=MultimodalSpecialTokens(
-                 image_token=self._processor.tokenizer.image_token
-             ),
-             max_req_input_len=max_req_input_len,
+             multimodal_tokens=self.mm_tokens,
              image_data=image_data,
          )
 
-         mm_items, input_ids, _ = self.process_and_combine_mm_data(base_output)
+         mm_items, input_ids, _ = self.process_and_combine_mm_data(
+             base_output, self.mm_tokens
+         )
 
          return {
              "input_ids": input_ids.tolist(),
              "mm_items": mm_items,
-             "im_token_id": self.IM_TOKEN_ID,
-             "video_token_id": self.VIDEO_TOKEN_ID,
+             "im_token_id": self.mm_tokens.image_token_id,
+             "video_token_id": self.mm_tokens.video_token_id,
          }
sglang/srt/reasoning_parser.py
@@ -118,6 +118,14 @@ class DeepSeekR1Detector(BaseReasoningFormatDetector):
      Returns all the text before the </think> tag as `reasoning_text`
      and the rest of the text as `normal_text`.
 
+     Supported models:
+     - DeepSeek-R1: Always generates thinking content without <think> start tag
+     - DeepSeek-R1-0528: Generates thinking content with <think> start tag
+
+     Format patterns:
+     - DeepSeek-R1: "I need to think about this...</think>The answer is 42."
+     - DeepSeek-R1-0528: "<think>I need to think about this...</think>The answer is 42."
+
      Args:
          stream_reasoning (bool): If False, accumulates reasoning content until the end tag.
              If True, streams reasoning content as it arrives.
@@ -136,11 +144,20 @@ class DeepSeekR1Detector(BaseReasoningFormatDetector):
 
  class Qwen3Detector(BaseReasoningFormatDetector):
      """
-     Detector for Qwen3 model.
+     Detector for standard Qwen3 models (e.g., Qwen/Qwen3-235B-A22B).
      Assumes reasoning format:
          (<think>)*(.*)</think>
-     Returns all the text before the </think> tag as `reasoning_text`
-     and the rest of the text as `normal_text`.
+
+     Qwen3 models released before 07/2025 supports switching between thinking mode and normal
+     mode using `enable_thinking` parameter in the request parameter.
+     - enable_thinking=True: "<think>reasoning content</think>The answer is 42."
+     - enable_thinking=False: "The answer is 42." (no thinking tokens)
+
+     This detector handles both cases.
+
+     NOTE: Do NOT use this detector for Qwen3-Thinking models (e.g., Qwen3-Thinking-2507).
+     Those models always generate thinking content without <think> start tags.
+     Use "qwen3-thinking" parser type for those models instead.
 
      Args:
          stream_reasoning (bool): If False, accumulates reasoning content until the end tag.
@@ -148,7 +165,6 @@ class Qwen3Detector(BaseReasoningFormatDetector):
      """
 
      def __init__(self, stream_reasoning: bool = True):
-         # Qwen3 won't be in reasoning mode when user passes `enable_thinking=False`
          super().__init__(
              "<think>",
              "</think>",
@@ -157,6 +173,31 @@ class Qwen3Detector(BaseReasoningFormatDetector):
          )
 
 
+ class Qwen3ThinkingDetector(BaseReasoningFormatDetector):
+     """
+     Detector for Qwen3-Thinking models (e.g., Qwen3-Thinking-2507).
+     Assumes reasoning format:
+         *(.*)</think>
+
+     These models always generate thinking content without <think> start tag.
+     They do not support the enable_thinking parameter and always think.
+
+     Format: "I need to think about this...</think>The answer is 42."
+
+     Args:
+         stream_reasoning (bool): If False, accumulates reasoning content until the end tag.
+             If True, streams reasoning content as it arrives.
+     """
+
+     def __init__(self, stream_reasoning: bool = True):
+         super().__init__(
+             "<think>",
+             "</think>",
+             force_reasoning=True,
+             stream_reasoning=stream_reasoning,
+         )
+
+
  class KimiDetector(BaseReasoningFormatDetector):
      """
      Detector for Kimi Thinking model.
@@ -189,6 +230,7 @@ class ReasoningParser:
      DetectorMap: Dict[str, Type[BaseReasoningFormatDetector]] = {
          "deepseek-r1": DeepSeekR1Detector,
          "qwen3": Qwen3Detector,
+         "qwen3-thinking": Qwen3ThinkingDetector,
          "kimi": KimiDetector,
      }
 
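The new "qwen3-thinking" entry exists because Qwen3-Thinking checkpoints emit reasoning immediately, with no leading <think> tag, so the detector must be forced into reasoning mode from the first token. A minimal sketch of resolving the detector through the map shown above (the DetectorMap lookup and the constructor signature come from this hunk; how the server wires its reasoning-parser option to this map is not shown here):

from sglang.srt.reasoning_parser import ReasoningParser

# Resolve the detector class registered under the new key and instantiate it.
# Qwen3ThinkingDetector passes force_reasoning=True, so text before the first
# </think> is treated as reasoning even though there is no <think> start tag.
detector_cls = ReasoningParser.DetectorMap["qwen3-thinking"]
detector = detector_cls(stream_reasoning=False)  # accumulate reasoning until </think>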
sglang/srt/sampling/sampling_batch_info.py
@@ -322,6 +322,12 @@ class SamplingBatchInfo:
          # Set the flag to True if any of the two has custom logit processor
          self.has_custom_logit_processor = True
 
+         # Merge logit bias - note this has to come before the temperatures tensor update! Otherwise will cause crashes.
+         # See note below on len(self) and len(other).
+         self.logit_bias = merge_bias_tensor(
+             self.logit_bias, other.logit_bias, len(self), len(other), self.device, 0.0
+         )
+
          # Note: because the __len()__ operator is defined on the temperatures tensor,
          # please make sure any merge operation with len(self) or len(other) is done before
          # the merge operation of the temperatures tensor below.
@@ -340,11 +346,6 @@ class SamplingBatchInfo:
          self.need_top_k_sampling |= other.need_top_k_sampling
          self.need_min_p_sampling |= other.need_min_p_sampling
 
-         # Merge logit bias
-         self.logit_bias = merge_bias_tensor(
-             self.logit_bias, other.logit_bias, len(self), len(other), self.device, 0.0
-         )
-
 
  def merge_bias_tensor(
      lhs: Optional[torch.Tensor],
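The reordering matters because, as the retained comment says, __len__ on SamplingBatchInfo is backed by the temperatures tensor, so any merge that needs the pre-merge batch sizes (like the logit_bias merge) must run before temperatures are concatenated. A minimal standalone illustration of the pitfall (toy class and assumed shapes, not sglang code):

import torch

class ToyBatch:
    def __init__(self, temperatures, logit_bias=None):
        self.temperatures = temperatures  # shape [batch_size, 1]
        self.logit_bias = logit_bias      # shape [batch_size, vocab_size] or None

    def __len__(self):
        # Mirrors SamplingBatchInfo: length is derived from the temperatures tensor.
        return self.temperatures.shape[0]

a = ToyBatch(torch.ones(2, 1))                                  # no logit_bias on this side
b = ToyBatch(torch.ones(3, 1), logit_bias=torch.zeros(3, 8))

# Correct order: pad a's missing bias using the pre-merge len(a)=2, then merge temperatures.
merged_bias = torch.cat([torch.zeros(len(a), 8), b.logit_bias], dim=0)  # shape [5, 8]
a.temperatures = torch.cat([a.temperatures, b.temperatures], dim=0)     # len(a) is now 5

# If temperatures were concatenated first, len(a) would already be 5, the padding would
# have 5 rows, and the merged bias would come out [8, 8] instead of [5, 8]: the size
# mismatch behind the crash the new comment warns about.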
sglang/srt/sampling/sampling_params.py
@@ -89,7 +89,7 @@ class SamplingParams:
          if self.top_k == -1:
              self.top_k = TOP_K_ALL # whole vocabulary
 
-     def verify(self):
+     def verify(self, vocab_size):
          if self.temperature < 0.0:
              raise ValueError(
                  f"temperature must be non-negative, got {self.temperature}."
@@ -131,6 +131,13 @@ class SamplingParams:
                  f"min_new_tokens must be in [0, max_new_tokens({self.max_new_tokens})], got "
                  f"{self.min_new_tokens}."
              )
+         if self.logit_bias is not None:
+             for token_id in self.logit_bias:
+                 if not 0 <= int(token_id) < vocab_size:
+                     raise ValueError(
+                         f"logit_bias must has keys in [0, {vocab_size - 1}], got "
+                         f"{token_id}."
+                     )
          grammars = [
              self.json_schema,
              self.regex,
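The new signature means callers of verify now supply the model's vocab size so out-of-range logit_bias keys are rejected up front. A standalone sketch of just that check, with assumed values (not sglang code; the valid range is 0 inclusive to vocab_size exclusive):

# Assumed example values; in a real request the logit_bias keys are token ids.
vocab_size = 32000
logit_bias = {"50": 2.0, "32000": 1.5}

for token_id in logit_bias:
    if not 0 <= int(token_id) < vocab_size:
        raise ValueError(
            f"logit_bias must have keys in [0, {vocab_size - 1}], got {token_id}."
        )
# The key "32000" is rejected here, since valid ids are 0..31999.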