sglang 0.4.9.post3__py3-none-any.whl → 0.4.9.post5__py3-none-any.whl

This diff shows the content of publicly available package versions as released to a supported registry. It is provided for informational purposes only and reflects the changes between the two versions as they appear in the public registry.
Files changed (128)
  1. sglang/lang/chat_template.py +21 -0
  2. sglang/srt/_custom_ops.py +29 -1
  3. sglang/srt/configs/internvl.py +3 -0
  4. sglang/srt/configs/model_config.py +5 -1
  5. sglang/srt/constrained/base_grammar_backend.py +10 -2
  6. sglang/srt/constrained/xgrammar_backend.py +7 -5
  7. sglang/srt/conversation.py +17 -2
  8. sglang/srt/debug_utils/__init__.py +0 -0
  9. sglang/srt/debug_utils/dump_comparator.py +131 -0
  10. sglang/srt/debug_utils/dumper.py +108 -0
  11. sglang/srt/debug_utils/text_comparator.py +172 -0
  12. sglang/srt/disaggregation/common/conn.py +34 -6
  13. sglang/srt/disaggregation/decode_schedule_batch_mixin.py +13 -1
  14. sglang/srt/disaggregation/mini_lb.py +3 -2
  15. sglang/srt/disaggregation/mooncake/conn.py +65 -20
  16. sglang/srt/disaggregation/mooncake/transfer_engine.py +4 -2
  17. sglang/srt/disaggregation/nixl/conn.py +17 -13
  18. sglang/srt/disaggregation/prefill.py +13 -1
  19. sglang/srt/distributed/device_communicators/custom_all_reduce.py +3 -91
  20. sglang/srt/distributed/device_communicators/custom_all_reduce_utils.py +96 -1
  21. sglang/srt/distributed/device_communicators/quick_all_reduce.py +273 -0
  22. sglang/srt/distributed/device_communicators/shm_broadcast.py +12 -5
  23. sglang/srt/distributed/parallel_state.py +70 -15
  24. sglang/srt/entrypoints/engine.py +5 -9
  25. sglang/srt/entrypoints/http_server.py +20 -32
  26. sglang/srt/entrypoints/openai/protocol.py +3 -3
  27. sglang/srt/entrypoints/openai/serving_chat.py +148 -72
  28. sglang/srt/function_call/base_format_detector.py +74 -12
  29. sglang/srt/function_call/deepseekv3_detector.py +26 -11
  30. sglang/srt/function_call/ebnf_composer.py +105 -66
  31. sglang/srt/function_call/function_call_parser.py +6 -4
  32. sglang/srt/function_call/glm4_moe_detector.py +164 -0
  33. sglang/srt/function_call/kimik2_detector.py +41 -16
  34. sglang/srt/function_call/llama32_detector.py +6 -3
  35. sglang/srt/function_call/mistral_detector.py +11 -3
  36. sglang/srt/function_call/pythonic_detector.py +16 -14
  37. sglang/srt/function_call/qwen25_detector.py +12 -3
  38. sglang/srt/function_call/{qwen3_detector.py → qwen3_coder_detector.py} +11 -9
  39. sglang/srt/layers/activation.py +11 -3
  40. sglang/srt/layers/attention/base_attn_backend.py +3 -1
  41. sglang/srt/layers/attention/hybrid_attn_backend.py +100 -0
  42. sglang/srt/layers/attention/vision.py +56 -8
  43. sglang/srt/layers/communicator.py +12 -12
  44. sglang/srt/layers/dp_attention.py +72 -24
  45. sglang/srt/layers/layernorm.py +26 -1
  46. sglang/srt/layers/logits_processor.py +46 -25
  47. sglang/srt/layers/moe/ep_moe/layer.py +172 -206
  48. sglang/srt/layers/moe/fused_moe_triton/configs/triton_3_2_0/E=160,N=320,device_name=NVIDIA_A800-SXM4-80GB,dtype=int8_w8a8.json +146 -0
  49. sglang/srt/layers/moe/fused_moe_triton/configs/triton_3_3_1/E=160,N=320,device_name=NVIDIA_H20-3e.json +146 -0
  50. sglang/srt/layers/moe/fused_moe_triton/fused_moe.py +25 -224
  51. sglang/srt/layers/moe/fused_moe_triton/layer.py +38 -48
  52. sglang/srt/layers/moe/fused_moe_triton/triton_kernels_moe.py +11 -8
  53. sglang/srt/layers/moe/topk.py +88 -34
  54. sglang/srt/layers/multimodal.py +11 -8
  55. sglang/srt/layers/quantization/compressed_tensors/compressed_tensors_moe.py +2 -9
  56. sglang/srt/layers/quantization/fp8.py +25 -247
  57. sglang/srt/layers/quantization/fp8_kernel.py +78 -48
  58. sglang/srt/layers/quantization/modelopt_quant.py +33 -14
  59. sglang/srt/layers/quantization/unquant.py +24 -76
  60. sglang/srt/layers/quantization/utils.py +0 -9
  61. sglang/srt/layers/quantization/w4afp8.py +68 -17
  62. sglang/srt/layers/radix_attention.py +5 -3
  63. sglang/srt/lora/lora_manager.py +133 -169
  64. sglang/srt/lora/lora_registry.py +188 -0
  65. sglang/srt/lora/mem_pool.py +2 -2
  66. sglang/srt/managers/cache_controller.py +62 -13
  67. sglang/srt/managers/io_struct.py +19 -1
  68. sglang/srt/managers/mm_utils.py +154 -35
  69. sglang/srt/managers/multimodal_processor.py +3 -14
  70. sglang/srt/managers/schedule_batch.py +27 -11
  71. sglang/srt/managers/scheduler.py +48 -26
  72. sglang/srt/managers/tokenizer_manager.py +62 -28
  73. sglang/srt/managers/tp_worker.py +5 -4
  74. sglang/srt/mem_cache/allocator.py +67 -7
  75. sglang/srt/mem_cache/hicache_storage.py +17 -1
  76. sglang/srt/mem_cache/hiradix_cache.py +35 -18
  77. sglang/srt/mem_cache/memory_pool_host.py +3 -0
  78. sglang/srt/model_executor/cuda_graph_runner.py +61 -25
  79. sglang/srt/model_executor/forward_batch_info.py +201 -29
  80. sglang/srt/model_executor/model_runner.py +109 -37
  81. sglang/srt/models/deepseek_v2.py +63 -30
  82. sglang/srt/models/glm4_moe.py +1035 -0
  83. sglang/srt/models/glm4_moe_nextn.py +167 -0
  84. sglang/srt/models/interns1.py +328 -0
  85. sglang/srt/models/internvl.py +143 -47
  86. sglang/srt/models/llava.py +9 -5
  87. sglang/srt/models/minicpmo.py +4 -1
  88. sglang/srt/models/mllama4.py +10 -3
  89. sglang/srt/models/qwen2_moe.py +2 -6
  90. sglang/srt/models/qwen3_moe.py +6 -8
  91. sglang/srt/multimodal/processors/base_processor.py +20 -6
  92. sglang/srt/multimodal/processors/clip.py +2 -2
  93. sglang/srt/multimodal/processors/deepseek_vl_v2.py +2 -2
  94. sglang/srt/multimodal/processors/gemma3.py +2 -2
  95. sglang/srt/multimodal/processors/gemma3n.py +2 -2
  96. sglang/srt/multimodal/processors/internvl.py +21 -8
  97. sglang/srt/multimodal/processors/janus_pro.py +2 -2
  98. sglang/srt/multimodal/processors/kimi_vl.py +2 -2
  99. sglang/srt/multimodal/processors/llava.py +4 -4
  100. sglang/srt/multimodal/processors/minicpm.py +2 -3
  101. sglang/srt/multimodal/processors/mlama.py +2 -2
  102. sglang/srt/multimodal/processors/mllama4.py +18 -111
  103. sglang/srt/multimodal/processors/phi4mm.py +2 -2
  104. sglang/srt/multimodal/processors/pixtral.py +2 -2
  105. sglang/srt/multimodal/processors/qwen_audio.py +2 -2
  106. sglang/srt/multimodal/processors/qwen_vl.py +2 -2
  107. sglang/srt/multimodal/processors/vila.py +3 -1
  108. sglang/srt/reasoning_parser.py +48 -5
  109. sglang/srt/sampling/sampling_batch_info.py +6 -5
  110. sglang/srt/server_args.py +132 -60
  111. sglang/srt/speculative/eagle_draft_cuda_graph_runner.py +33 -28
  112. sglang/srt/speculative/eagle_draft_extend_cuda_graph_runner.py +37 -36
  113. sglang/srt/speculative/eagle_utils.py +51 -23
  114. sglang/srt/speculative/eagle_worker.py +59 -44
  115. sglang/srt/two_batch_overlap.py +9 -5
  116. sglang/srt/utils.py +113 -69
  117. sglang/srt/weight_sync/utils.py +119 -0
  118. sglang/test/runners.py +4 -0
  119. sglang/test/test_activation.py +50 -1
  120. sglang/test/test_utils.py +65 -5
  121. sglang/utils.py +19 -0
  122. sglang/version.py +1 -1
  123. {sglang-0.4.9.post3.dist-info → sglang-0.4.9.post5.dist-info}/METADATA +6 -6
  124. {sglang-0.4.9.post3.dist-info → sglang-0.4.9.post5.dist-info}/RECORD +127 -114
  125. sglang/srt/debug_utils.py +0 -74
  126. {sglang-0.4.9.post3.dist-info → sglang-0.4.9.post5.dist-info}/WHEEL +0 -0
  127. {sglang-0.4.9.post3.dist-info → sglang-0.4.9.post5.dist-info}/licenses/LICENSE +0 -0
  128. {sglang-0.4.9.post3.dist-info → sglang-0.4.9.post5.dist-info}/top_level.txt +0 -0
sglang/srt/multimodal/processors/minicpm.py
@@ -15,8 +15,8 @@ from sglang.srt.multimodal.processors.base_processor import (
 class MiniCPMMultimodalProcessor(BaseMultimodalProcessor):
     models = [MiniCPMV, MiniCPMO]
 
-    def __init__(self, hf_config, server_args, _processor):
-        super().__init__(hf_config, server_args, _processor)
+    def __init__(self, hf_config, server_args, _processor, *args, **kwargs):
+        super().__init__(hf_config, server_args, _processor, *args, **kwargs)
         # Collect special token ids
         tokenizer = self._processor.tokenizer
         self.slice_start_id = getattr(tokenizer, "slice_start_id", None)
@@ -26,7 +26,6 @@ class MiniCPMMultimodalProcessor(BaseMultimodalProcessor):
         self.im_start_id = getattr(tokenizer, "im_start_id", None)
         self.im_end_id = getattr(tokenizer, "im_end_id", None)
         self.im_token_id = getattr(tokenizer, "unk_id", None)
-
         self.mm_tokens = MultimodalSpecialTokens(
             image_token="(<image>./</image>)",
             audio_token="(<audio>./</audio>)",
sglang/srt/multimodal/processors/mlama.py
@@ -10,8 +10,8 @@ from sglang.srt.multimodal.processors.base_processor import (
 class MllamaImageProcessor(BaseMultimodalProcessor):
     models = [MllamaForConditionalGeneration]
 
-    def __init__(self, hf_config, server_args, _processor):
-        super().__init__(hf_config, server_args, _processor)
+    def __init__(self, hf_config, server_args, _processor, *args, **kwargs):
+        super().__init__(hf_config, server_args, _processor, *args, **kwargs)
         self.mm_tokens = MultimodalSpecialTokens(
             image_token=self._processor.image_token,
             image_token_id=self._processor.image_token_id,
sglang/srt/multimodal/processors/mllama4.py
@@ -18,16 +18,16 @@ from sglang.srt.multimodal.processors.base_processor import (
 class Mllama4ImageProcessor(BaseMultimodalProcessor):
     models = [Llama4ForConditionalGeneration]
 
-    def __init__(self, hf_config, server_args, _processor):
-        super().__init__(hf_config, server_args, _processor)
+    def __init__(self, hf_config, server_args, _processor, *args, **kwargs):
+        super().__init__(hf_config, server_args, _processor, *args, **kwargs)
         self.vision_config = hf_config.vision_config
         self.text_config = hf_config.text_config
-        self.boi_token_index = hf_config.boi_token_index
-        self.eoi_token_index = hf_config.eoi_token_index
-        self.image_token_index = hf_config.image_token_index
-        self.multimodal_tokens = MultimodalSpecialTokens(
+        self.IM_START_TOKEN_ID = hf_config.boi_token_index
+        self.IM_END_TOKEN_ID = hf_config.eoi_token_index
+        self.IM_TOKEN_ID = hf_config.image_token_index
+        self.mm_tokens = MultimodalSpecialTokens(
             image_token=_processor.image_token,
-            image_token_id=self.image_token_index,
+            image_token_id=self.IM_TOKEN_ID,
         ).build(_processor)
 
     async def process_mm_data_async(
@@ -37,114 +37,21 @@ class Mllama4ImageProcessor(BaseMultimodalProcessor):
         *args,
         **kwargs,
     ):
-        if isinstance(input_text, list):
-            assert len(input_text) and isinstance(input_text[0], int)
-            input_text = self._processor.tokenizer.decode(input_text)
-
-        # Process images and text using the base processor's load_mm_data method
-        processed_data = self.load_mm_data(
+        base_output = self.load_mm_data(
             prompt=input_text,
-            multimodal_tokens=self.multimodal_tokens,
             image_data=image_data,
-            return_text=True,
+            multimodal_tokens=self.mm_tokens,
         )
 
-        # Process the images using the processor
-        processor = self._processor
-
         # Process the prompt and images
-        processor_output = self.process_mm_data(
-            input_text=processed_data.input_text,
-            images=processed_data.images,
-        )
-
-        # Handle image resolutions and aspect ratios
-        if "pixel_values" not in processor_output:  # no image processed
-            return None
-
-        image_processor = processor.image_processor
-        tokenizer = self._processor.tokenizer
-
-        # Calculate tile size and find supported resolutions
-        tile_size = self.vision_config.image_size
-        max_num_tiles = getattr(self.vision_config, "max_patches", 1)
-
-        possible_resolutions = find_supported_resolutions(
-            max_num_chunks=max_num_tiles,
-            patch_size=SizeDict(height=tile_size, width=tile_size),
+        mm_items, input_ids, _ = self.process_and_combine_mm_data(
+            base_output, self.mm_tokens
         )
 
-        # Find best fit for each image
-        best_fit_sizes = [
-            get_best_fit(
-                (image.size[1], image.size[0]),  # (height, width)
-                torch.tensor(possible_resolutions),
-                resize_to_max_canvas=image_processor.resize_to_max_canvas,
-            )
-            for image in processed_data.images
-        ]
-
-        # Calculate aspect ratios and patches per image
-        aspect_ratios = [
-            (image_size[0] // tile_size, image_size[1] // tile_size)
-            for image_size in best_fit_sizes
-        ]
-
-        patches_per_image = [
-            1 if r_h * r_w == 1 else 1 + r_h * r_w for (r_h, r_w) in aspect_ratios
-        ]
-
-        # Add to image_inputs
-        processor_output["aspect_ratios"] = aspect_ratios
-        processor_output["patches_per_image"] = torch.tensor(patches_per_image)
-
-        # Process embed_is_patch
-        vocab = tokenizer.get_vocab()
-        patch_id = vocab.get(processor.img_patch_token, -1)
-        image_end_id = vocab.get(processor.end_of_img_token, -1)
-
-        if patch_id != -1 and image_end_id != -1:
-            input_ids = processor_output["input_ids"].view(-1)
-
-            # Remove BOS token if present
-            if input_ids.size(0) > 0 and input_ids[0] == tokenizer.bos_token_id:
-                input_ids = input_ids[1:]
-
-            # Find image end indices and split input_ids
-            image_end_indices = (input_ids == image_end_id).nonzero().view(-1)
-
-            if image_end_indices.size(0) > 0:
-                # Split at image boundaries
-                split_indices = (image_end_indices + 1)[:-1]
-                split_input_ids = torch.tensor_split(input_ids, split_indices)
-                split_input_ids = [x for x in split_input_ids if x.numel() > 0]
-
-                # Create embed_is_patch for each image
-                embed_is_patch = []
-                for per_image_input_ids in split_input_ids:
-                    embed_is_patch.append(per_image_input_ids == patch_id)
-
-                processor_output["embed_is_patch"] = embed_is_patch
-
-        # Convert to the format expected by SGLang
-        processor_output["input_ids"] = processor_output["input_ids"].tolist()[0]
-
-        processor_output["im_start_id"] = self.boi_token_index
-        processor_output["im_end_id"] = self.eoi_token_index
-        processor_output["im_token_id"] = self.image_token_index
-
-        image_offsets = self.get_mm_items_offset(
-            input_ids=torch.tensor(processor_output["input_ids"]),
-            mm_token_id=self.image_token_index,
-        )
-
-        # Add metadata for image processing
-        processor_output["mm_items"] = [
-            MultimodalDataItem(
-                feature=processor_output["pixel_values"],
-                modality=Modality.IMAGE,
-                offsets=image_offsets,
-            )
-        ]
-
-        return processor_output
+        return {
+            "input_ids": input_ids.tolist(),
+            "mm_items": mm_items,
+            "im_start_id": self.IM_START_TOKEN_ID,
+            "im_end_id": self.IM_END_TOKEN_ID,
+            "im_token_id": self.IM_TOKEN_ID,
+        }
sglang/srt/multimodal/processors/phi4mm.py
@@ -47,9 +47,9 @@ class Phi4MMProcessorAdapter(ProcessorMixin):
 class Phi4MMMultimodalProcessor(BaseMultimodalProcessor):
     models = [Phi4MMForCausalLM]
 
-    def __init__(self, hf_config, server_args, _processor):
+    def __init__(self, hf_config, server_args, _processor, *args, **kwargs):
         self.processor = Phi4MMProcessorAdapter(_processor)
-        super().__init__(hf_config, server_args, self.processor)
+        super().__init__(hf_config, server_args, self.processor, *args, **kwargs)
 
     # the following CONSTANTS come from hugging-face microsoft/Phi-4-multimodal-instruct's processing_phi4mm.py file
     # ref: https://huggingface.co/microsoft/Phi-4-multimodal-instruct/blob/main/processing_phi4mm.py
sglang/srt/multimodal/processors/pixtral.py
@@ -42,8 +42,8 @@ class PixtralProcessor(BaseMultimodalProcessor):
 
         return ncols, nrows
 
-    def __init__(self, hf_config, server_args, _processor):
-        super().__init__(hf_config, server_args, _processor)
+    def __init__(self, hf_config, server_args, _processor, *args, **kwargs):
+        super().__init__(hf_config, server_args, _processor, *args, **kwargs)
         self.IM_TOKEN_ID = getattr(
             hf_config, "image_token_index", PixtralVisionModel.DEFAULT_IMAGE_TOKEN_ID
         )
sglang/srt/multimodal/processors/qwen_audio.py
@@ -11,8 +11,8 @@ from sglang.srt.multimodal.processors.base_processor import (
 class Qwen2AudioMultimodalProcessor(BaseMultimodalProcessor):
     models = [Qwen2AudioForConditionalGeneration]
 
-    def __init__(self, hf_config, server_args, _processor):
-        super().__init__(hf_config, server_args, _processor)
+    def __init__(self, hf_config, server_args, _processor, *args, **kwargs):
+        super().__init__(hf_config, server_args, _processor, *args, **kwargs)
         self.AUDIO_TOKEN = "<|audio_bos|><|AUDIO|><|audio_eos|>"
         self.AUDIO_TOKEN_REGEX = re.compile(
             r"<\|audio_bos\|>(?:<\|AUDIO\|>)+<\|audio_eos\|>"
sglang/srt/multimodal/processors/qwen_vl.py
@@ -201,8 +201,8 @@ async def preprocess_video(
 class Qwen2_5VLImageProcessor(SGLangBaseProcessor):
     models = [Qwen2VLForConditionalGeneration, Qwen2_5_VLForConditionalGeneration]
 
-    def __init__(self, hf_config, server_args, _processor):
-        super().__init__(hf_config, server_args, _processor)
+    def __init__(self, hf_config, server_args, _processor, *args, **kwargs):
+        super().__init__(hf_config, server_args, _processor, *args, **kwargs)
         # The regex that matches expanded image tokens.
         self.IM_START_TOKEN_ID = hf_config.vision_start_token_id
         self.IM_END_TOKEN_ID = hf_config.vision_end_token_id
sglang/srt/multimodal/processors/vila.py
@@ -34,8 +34,10 @@ class VILAMultimodalProcessor(BaseMultimodalProcessor):
         hf_config: PretrainedConfig,
         server_args: ServerArgs,
         _processor: VILAProcessor,
+        *args,
+        **kwargs,
     ) -> None:
-        super().__init__(hf_config, server_args, _processor)
+        super().__init__(hf_config, server_args, _processor, *args, **kwargs)
         self.mm_tokens = MultimodalSpecialTokens(
             image_token=self._processor.tokenizer.image_token,
             image_token_id=hf_config.image_token_id,
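All of the processor hunks above make the same mechanical change: each multimodal processor's __init__ now accepts and forwards *args, **kwargs to BaseMultimodalProcessor.__init__, so new base-class parameters can be threaded through without editing every subclass. A minimal sketch of the pattern, using a stand-in base class rather than the real sglang imports (the transport_mode keyword is a hypothetical example of such a new parameter, not something taken from the diff):

# Stand-in for sglang's BaseMultimodalProcessor; only the forwarding pattern matters here.
class BaseProcessor:
    def __init__(self, hf_config, server_args, _processor, transport_mode=None):
        self.hf_config = hf_config
        self.server_args = server_args
        self._processor = _processor
        self.transport_mode = transport_mode  # hypothetical new base-class argument


class MyProcessor(BaseProcessor):
    # Before: def __init__(self, hf_config, server_args, _processor)
    # After: accept and forward everything else, so a new base-class argument
    # does not require changing every subclass signature.
    def __init__(self, hf_config, server_args, _processor, *args, **kwargs):
        super().__init__(hf_config, server_args, _processor, *args, **kwargs)


proc = MyProcessor({}, {}, object(), transport_mode="auto")
print(proc.transport_mode)  # "auto" reaches the base class through the subclass unchanged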
sglang/srt/reasoning_parser.py
@@ -32,7 +32,7 @@ class BaseReasoningFormatDetector:
         One-time parsing: Detects and parses reasoning sections in the provided text.
         Returns both reasoning content and normal text separately.
         """
-        in_reasoning = self._in_reasoning or text.startswith(self.think_start_token)
+        in_reasoning = self._in_reasoning or self.think_start_token in text
 
         if not in_reasoning:
            return StreamingParseResult(normal_text=text)
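The one-line change above relaxes detection from startswith to a substring check, so a reasoning block is still recognized when the model emits characters (for example a leading newline) before the <think> tag. A standalone sketch of the behavioral difference, independent of the sglang classes:

THINK_START = "<think>"

def in_reasoning_old(text, already_in=False):
    # 0.4.9.post3 behavior: only triggers when the tag opens the text
    return already_in or text.startswith(THINK_START)

def in_reasoning_new(text, already_in=False):
    # 0.4.9.post5 behavior: triggers when the tag appears anywhere in the text
    return already_in or THINK_START in text

text = "\n<think>Let me reason this out...</think>The answer is 42."
print(in_reasoning_old(text))  # False - the leading newline defeats startswith
print(in_reasoning_new(text))  # True  - the substring check still finds the tag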
@@ -118,6 +118,14 @@ class DeepSeekR1Detector(BaseReasoningFormatDetector):
     Returns all the text before the </think> tag as `reasoning_text`
     and the rest of the text as `normal_text`.
 
+    Supported models:
+    - DeepSeek-R1: Always generates thinking content without <think> start tag
+    - DeepSeek-R1-0528: Generates thinking content with <think> start tag
+
+    Format patterns:
+    - DeepSeek-R1: "I need to think about this...</think>The answer is 42."
+    - DeepSeek-R1-0528: "<think>I need to think about this...</think>The answer is 42."
+
     Args:
         stream_reasoning (bool): If False, accumulates reasoning content until the end tag.
             If True, streams reasoning content as it arrives.
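The expanded docstring pins down the two output shapes the detector has to accept: DeepSeek-R1 starts reasoning immediately with no opening tag, while DeepSeek-R1-0528 emits the <think> tag explicitly. A hedged usage sketch, assuming sglang is importable and that detect_and_parse returns a result carrying reasoning_text and normal_text as the surrounding docstrings describe:

from sglang.srt.reasoning_parser import DeepSeekR1Detector

detector = DeepSeekR1Detector(stream_reasoning=False)

# DeepSeek-R1 style: no <think> start tag, reasoning assumed from the first token.
r1 = detector.detect_and_parse("I need to think about this...</think>The answer is 42.")

# DeepSeek-R1-0528 style: explicit <think> start tag.
r1_0528 = detector.detect_and_parse("<think>I need to think about this...</think>The answer is 42.")

for result in (r1, r1_0528):
    # Expected in both cases: the text before </think> as reasoning, the rest as normal output.
    print(result.reasoning_text, "|", result.normal_text)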
@@ -136,11 +144,20 @@ class DeepSeekR1Detector(BaseReasoningFormatDetector):
 
 class Qwen3Detector(BaseReasoningFormatDetector):
     """
-    Detector for Qwen3 model.
+    Detector for standard Qwen3 models (e.g., Qwen/Qwen3-235B-A22B).
     Assumes reasoning format:
         (<think>)*(.*)</think>
-    Returns all the text before the </think> tag as `reasoning_text`
-    and the rest of the text as `normal_text`.
+
+    Qwen3 models released before 07/2025 supports switching between thinking mode and normal
+    mode using `enable_thinking` parameter in the request parameter.
+    - enable_thinking=True: "<think>reasoning content</think>The answer is 42."
+    - enable_thinking=False: "The answer is 42." (no thinking tokens)
+
+    This detector handles both cases.
+
+    NOTE: Do NOT use this detector for Qwen3-Thinking models (e.g., Qwen3-Thinking-2507).
+    Those models always generate thinking content without <think> start tags.
+    Use "qwen3-thinking" parser type for those models instead.
 
     Args:
         stream_reasoning (bool): If False, accumulates reasoning content until the end tag.
@@ -148,7 +165,6 @@ class Qwen3Detector(BaseReasoningFormatDetector):
     """
 
     def __init__(self, stream_reasoning: bool = True):
-        # Qwen3 won't be in reasoning mode when user passes `enable_thinking=False`
        super().__init__(
            "<think>",
            "</think>",
@@ -157,6 +173,31 @@ class Qwen3Detector(BaseReasoningFormatDetector):
         )
 
 
+class Qwen3ThinkingDetector(BaseReasoningFormatDetector):
+    """
+    Detector for Qwen3-Thinking models (e.g., Qwen3-Thinking-2507).
+    Assumes reasoning format:
+        *(.*)</think>
+
+    These models always generate thinking content without <think> start tag.
+    They do not support the enable_thinking parameter and always think.
+
+    Format: "I need to think about this...</think>The answer is 42."
+
+    Args:
+        stream_reasoning (bool): If False, accumulates reasoning content until the end tag.
+            If True, streams reasoning content as it arrives.
+    """
+
+    def __init__(self, stream_reasoning: bool = True):
+        super().__init__(
+            "<think>",
+            "</think>",
+            force_reasoning=True,
+            stream_reasoning=stream_reasoning,
+        )
+
+
 class KimiDetector(BaseReasoningFormatDetector):
     """
     Detector for Kimi Thinking model.
@@ -189,6 +230,8 @@ class ReasoningParser:
     DetectorMap: Dict[str, Type[BaseReasoningFormatDetector]] = {
         "deepseek-r1": DeepSeekR1Detector,
         "qwen3": Qwen3Detector,
+        "qwen3-thinking": Qwen3ThinkingDetector,
+        "glm45": Qwen3Detector,
         "kimi": KimiDetector,
     }
 
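The two new registry entries make the detectors selectable by name: "qwen3-thinking" maps to the new Qwen3ThinkingDetector (which forces reasoning mode because there is no opening tag to detect), and "glm45" reuses Qwen3Detector for GLM-4.5-style output. A hedged sketch of picking a parser through the map; the constructor arguments and the parse_non_stream return shape follow the class docstrings above and should be treated as assumptions rather than a verified API:

from sglang.srt.reasoning_parser import ReasoningParser

# Select the detector registered under "qwen3-thinking" in DetectorMap.
parser = ReasoningParser(model_type="qwen3-thinking", stream_reasoning=False)

# Qwen3-Thinking output has no <think> start tag, only the closing tag.
text = "The user asked for 6 * 7, so the result is 42.</think>The answer is 42."
reasoning_text, normal_text = parser.parse_non_stream(text)
print(reasoning_text)  # expected: everything before </think>
print(normal_text)     # expected: "The answer is 42."

At the server level the same choice is typically made with the --reasoning-parser launch flag (e.g., --reasoning-parser qwen3-thinking), matching the keys in this map.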
sglang/srt/sampling/sampling_batch_info.py
@@ -322,6 +322,12 @@ class SamplingBatchInfo:
         # Set the flag to True if any of the two has custom logit processor
         self.has_custom_logit_processor = True
 
+        # Merge logit bias - note this has to come before the temperatures tensor update! Otherwise will cause crashes.
+        # See note below on len(self) and len(other).
+        self.logit_bias = merge_bias_tensor(
+            self.logit_bias, other.logit_bias, len(self), len(other), self.device, 0.0
+        )
+
         # Note: because the __len()__ operator is defined on the temperatures tensor,
         # please make sure any merge operation with len(self) or len(other) is done before
         # the merge operation of the temperatures tensor below.
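The relocated merge depends on an ordering detail that the comments call out: len() of a SamplingBatchInfo is derived from its temperatures tensor, so any merge that needs the pre-merge batch sizes has to run before the temperatures are concatenated. A stripped-down illustration with toy classes (not the sglang types):

import torch

class ToyBatch:
    def __init__(self, temperatures, logit_bias=None):
        self.temperatures = temperatures  # shape: [batch_size, 1]
        self.logit_bias = logit_bias

    def __len__(self):
        # Like SamplingBatchInfo, the batch size comes from the temperatures tensor.
        return self.temperatures.shape[0]

def merge(a, b):
    lhs_size, rhs_size = len(a), len(b)  # must be read before the concatenation below
    # ... a.logit_bias would be merged here using lhs_size / rhs_size ...
    a.temperatures = torch.cat([a.temperatures, b.temperatures])
    # From this point on, len(a) already reflects the merged batch, so a
    # size-dependent bias merge placed here would pad with the wrong lengths.

a, b = ToyBatch(torch.ones(2, 1)), ToyBatch(torch.ones(3, 1))
merge(a, b)
print(len(a))  # 5 - any len(a) taken after the concat sees the merged size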
@@ -340,11 +346,6 @@ class SamplingBatchInfo:
         self.need_top_k_sampling |= other.need_top_k_sampling
         self.need_min_p_sampling |= other.need_min_p_sampling
 
-        # Merge logit bias
-        self.logit_bias = merge_bias_tensor(
-            self.logit_bias, other.logit_bias, len(self), len(other), self.device, 0.0
-        )
-
 
 def merge_bias_tensor(
     lhs: Optional[torch.Tensor],
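The hunk ends inside the signature of merge_bias_tensor, but the call site above shows its full shape: two optional bias tensors, the two pre-merge batch sizes, a device, and a padding default. A plausible reference sketch of what such a helper does; the body below is an assumption for illustration, and only the parameters visible at the call site come from the diff:

from typing import Optional
import torch

def merge_bias_tensor_sketch(
    lhs: Optional[torch.Tensor],
    rhs: Optional[torch.Tensor],
    bs1: int,
    bs2: int,
    device: str,
    default: float,
) -> Optional[torch.Tensor]:
    """Concatenate two optional per-request bias tensors, padding a missing side with `default`."""
    if lhs is None and rhs is None:
        return None
    if lhs is None:
        lhs = torch.full((bs1, rhs.shape[1]), default, device=device, dtype=rhs.dtype)
    elif rhs is None:
        rhs = torch.full((bs2, lhs.shape[1]), default, device=device, dtype=lhs.dtype)
    return torch.cat([lhs, rhs], dim=0)

# Example: only the right-hand batch carries a logit bias over a 4-token vocabulary.
merged = merge_bias_tensor_sketch(None, torch.zeros(3, 4), bs1=2, bs2=3, device="cpu", default=0.0)
print(merged.shape)  # torch.Size([5, 4])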