sglang 0.4.9.post3__py3-none-any.whl → 0.4.9.post5__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- sglang/lang/chat_template.py +21 -0
- sglang/srt/_custom_ops.py +29 -1
- sglang/srt/configs/internvl.py +3 -0
- sglang/srt/configs/model_config.py +5 -1
- sglang/srt/constrained/base_grammar_backend.py +10 -2
- sglang/srt/constrained/xgrammar_backend.py +7 -5
- sglang/srt/conversation.py +17 -2
- sglang/srt/debug_utils/__init__.py +0 -0
- sglang/srt/debug_utils/dump_comparator.py +131 -0
- sglang/srt/debug_utils/dumper.py +108 -0
- sglang/srt/debug_utils/text_comparator.py +172 -0
- sglang/srt/disaggregation/common/conn.py +34 -6
- sglang/srt/disaggregation/decode_schedule_batch_mixin.py +13 -1
- sglang/srt/disaggregation/mini_lb.py +3 -2
- sglang/srt/disaggregation/mooncake/conn.py +65 -20
- sglang/srt/disaggregation/mooncake/transfer_engine.py +4 -2
- sglang/srt/disaggregation/nixl/conn.py +17 -13
- sglang/srt/disaggregation/prefill.py +13 -1
- sglang/srt/distributed/device_communicators/custom_all_reduce.py +3 -91
- sglang/srt/distributed/device_communicators/custom_all_reduce_utils.py +96 -1
- sglang/srt/distributed/device_communicators/quick_all_reduce.py +273 -0
- sglang/srt/distributed/device_communicators/shm_broadcast.py +12 -5
- sglang/srt/distributed/parallel_state.py +70 -15
- sglang/srt/entrypoints/engine.py +5 -9
- sglang/srt/entrypoints/http_server.py +20 -32
- sglang/srt/entrypoints/openai/protocol.py +3 -3
- sglang/srt/entrypoints/openai/serving_chat.py +148 -72
- sglang/srt/function_call/base_format_detector.py +74 -12
- sglang/srt/function_call/deepseekv3_detector.py +26 -11
- sglang/srt/function_call/ebnf_composer.py +105 -66
- sglang/srt/function_call/function_call_parser.py +6 -4
- sglang/srt/function_call/glm4_moe_detector.py +164 -0
- sglang/srt/function_call/kimik2_detector.py +41 -16
- sglang/srt/function_call/llama32_detector.py +6 -3
- sglang/srt/function_call/mistral_detector.py +11 -3
- sglang/srt/function_call/pythonic_detector.py +16 -14
- sglang/srt/function_call/qwen25_detector.py +12 -3
- sglang/srt/function_call/{qwen3_detector.py → qwen3_coder_detector.py} +11 -9
- sglang/srt/layers/activation.py +11 -3
- sglang/srt/layers/attention/base_attn_backend.py +3 -1
- sglang/srt/layers/attention/hybrid_attn_backend.py +100 -0
- sglang/srt/layers/attention/vision.py +56 -8
- sglang/srt/layers/communicator.py +12 -12
- sglang/srt/layers/dp_attention.py +72 -24
- sglang/srt/layers/layernorm.py +26 -1
- sglang/srt/layers/logits_processor.py +46 -25
- sglang/srt/layers/moe/ep_moe/layer.py +172 -206
- sglang/srt/layers/moe/fused_moe_triton/configs/triton_3_2_0/E=160,N=320,device_name=NVIDIA_A800-SXM4-80GB,dtype=int8_w8a8.json +146 -0
- sglang/srt/layers/moe/fused_moe_triton/configs/triton_3_3_1/E=160,N=320,device_name=NVIDIA_H20-3e.json +146 -0
- sglang/srt/layers/moe/fused_moe_triton/fused_moe.py +25 -224
- sglang/srt/layers/moe/fused_moe_triton/layer.py +38 -48
- sglang/srt/layers/moe/fused_moe_triton/triton_kernels_moe.py +11 -8
- sglang/srt/layers/moe/topk.py +88 -34
- sglang/srt/layers/multimodal.py +11 -8
- sglang/srt/layers/quantization/compressed_tensors/compressed_tensors_moe.py +2 -9
- sglang/srt/layers/quantization/fp8.py +25 -247
- sglang/srt/layers/quantization/fp8_kernel.py +78 -48
- sglang/srt/layers/quantization/modelopt_quant.py +33 -14
- sglang/srt/layers/quantization/unquant.py +24 -76
- sglang/srt/layers/quantization/utils.py +0 -9
- sglang/srt/layers/quantization/w4afp8.py +68 -17
- sglang/srt/layers/radix_attention.py +5 -3
- sglang/srt/lora/lora_manager.py +133 -169
- sglang/srt/lora/lora_registry.py +188 -0
- sglang/srt/lora/mem_pool.py +2 -2
- sglang/srt/managers/cache_controller.py +62 -13
- sglang/srt/managers/io_struct.py +19 -1
- sglang/srt/managers/mm_utils.py +154 -35
- sglang/srt/managers/multimodal_processor.py +3 -14
- sglang/srt/managers/schedule_batch.py +27 -11
- sglang/srt/managers/scheduler.py +48 -26
- sglang/srt/managers/tokenizer_manager.py +62 -28
- sglang/srt/managers/tp_worker.py +5 -4
- sglang/srt/mem_cache/allocator.py +67 -7
- sglang/srt/mem_cache/hicache_storage.py +17 -1
- sglang/srt/mem_cache/hiradix_cache.py +35 -18
- sglang/srt/mem_cache/memory_pool_host.py +3 -0
- sglang/srt/model_executor/cuda_graph_runner.py +61 -25
- sglang/srt/model_executor/forward_batch_info.py +201 -29
- sglang/srt/model_executor/model_runner.py +109 -37
- sglang/srt/models/deepseek_v2.py +63 -30
- sglang/srt/models/glm4_moe.py +1035 -0
- sglang/srt/models/glm4_moe_nextn.py +167 -0
- sglang/srt/models/interns1.py +328 -0
- sglang/srt/models/internvl.py +143 -47
- sglang/srt/models/llava.py +9 -5
- sglang/srt/models/minicpmo.py +4 -1
- sglang/srt/models/mllama4.py +10 -3
- sglang/srt/models/qwen2_moe.py +2 -6
- sglang/srt/models/qwen3_moe.py +6 -8
- sglang/srt/multimodal/processors/base_processor.py +20 -6
- sglang/srt/multimodal/processors/clip.py +2 -2
- sglang/srt/multimodal/processors/deepseek_vl_v2.py +2 -2
- sglang/srt/multimodal/processors/gemma3.py +2 -2
- sglang/srt/multimodal/processors/gemma3n.py +2 -2
- sglang/srt/multimodal/processors/internvl.py +21 -8
- sglang/srt/multimodal/processors/janus_pro.py +2 -2
- sglang/srt/multimodal/processors/kimi_vl.py +2 -2
- sglang/srt/multimodal/processors/llava.py +4 -4
- sglang/srt/multimodal/processors/minicpm.py +2 -3
- sglang/srt/multimodal/processors/mlama.py +2 -2
- sglang/srt/multimodal/processors/mllama4.py +18 -111
- sglang/srt/multimodal/processors/phi4mm.py +2 -2
- sglang/srt/multimodal/processors/pixtral.py +2 -2
- sglang/srt/multimodal/processors/qwen_audio.py +2 -2
- sglang/srt/multimodal/processors/qwen_vl.py +2 -2
- sglang/srt/multimodal/processors/vila.py +3 -1
- sglang/srt/reasoning_parser.py +48 -5
- sglang/srt/sampling/sampling_batch_info.py +6 -5
- sglang/srt/server_args.py +132 -60
- sglang/srt/speculative/eagle_draft_cuda_graph_runner.py +33 -28
- sglang/srt/speculative/eagle_draft_extend_cuda_graph_runner.py +37 -36
- sglang/srt/speculative/eagle_utils.py +51 -23
- sglang/srt/speculative/eagle_worker.py +59 -44
- sglang/srt/two_batch_overlap.py +9 -5
- sglang/srt/utils.py +113 -69
- sglang/srt/weight_sync/utils.py +119 -0
- sglang/test/runners.py +4 -0
- sglang/test/test_activation.py +50 -1
- sglang/test/test_utils.py +65 -5
- sglang/utils.py +19 -0
- sglang/version.py +1 -1
- {sglang-0.4.9.post3.dist-info → sglang-0.4.9.post5.dist-info}/METADATA +6 -6
- {sglang-0.4.9.post3.dist-info → sglang-0.4.9.post5.dist-info}/RECORD +127 -114
- sglang/srt/debug_utils.py +0 -74
- {sglang-0.4.9.post3.dist-info → sglang-0.4.9.post5.dist-info}/WHEEL +0 -0
- {sglang-0.4.9.post3.dist-info → sglang-0.4.9.post5.dist-info}/licenses/LICENSE +0 -0
- {sglang-0.4.9.post3.dist-info → sglang-0.4.9.post5.dist-info}/top_level.txt +0 -0
@@ -15,8 +15,8 @@ from sglang.srt.multimodal.processors.base_processor import (
 class MiniCPMMultimodalProcessor(BaseMultimodalProcessor):
     models = [MiniCPMV, MiniCPMO]

-    def __init__(self, hf_config, server_args, _processor):
-        super().__init__(hf_config, server_args, _processor)
+    def __init__(self, hf_config, server_args, _processor, *args, **kwargs):
+        super().__init__(hf_config, server_args, _processor, *args, **kwargs)
         # Collect special token ids
         tokenizer = self._processor.tokenizer
         self.slice_start_id = getattr(tokenizer, "slice_start_id", None)
@@ -26,7 +26,6 @@ class MiniCPMMultimodalProcessor(BaseMultimodalProcessor):
         self.im_start_id = getattr(tokenizer, "im_start_id", None)
         self.im_end_id = getattr(tokenizer, "im_end_id", None)
         self.im_token_id = getattr(tokenizer, "unk_id", None)
-
         self.mm_tokens = MultimodalSpecialTokens(
             image_token="(<image>./</image>)",
             audio_token="(<audio>./</audio>)",
sglang/srt/multimodal/processors/mlama.py
CHANGED
@@ -10,8 +10,8 @@ from sglang.srt.multimodal.processors.base_processor import (
 class MllamaImageProcessor(BaseMultimodalProcessor):
     models = [MllamaForConditionalGeneration]

-    def __init__(self, hf_config, server_args, _processor):
-        super().__init__(hf_config, server_args, _processor)
+    def __init__(self, hf_config, server_args, _processor, *args, **kwargs):
+        super().__init__(hf_config, server_args, _processor, *args, **kwargs)
         self.mm_tokens = MultimodalSpecialTokens(
             image_token=self._processor.image_token,
             image_token_id=self._processor.image_token_id,
sglang/srt/multimodal/processors/mllama4.py
CHANGED
@@ -18,16 +18,16 @@ from sglang.srt.multimodal.processors.base_processor import (
 class Mllama4ImageProcessor(BaseMultimodalProcessor):
     models = [Llama4ForConditionalGeneration]

-    def __init__(self, hf_config, server_args, _processor):
-        super().__init__(hf_config, server_args, _processor)
+    def __init__(self, hf_config, server_args, _processor, *args, **kwargs):
+        super().__init__(hf_config, server_args, _processor, *args, **kwargs)
         self.vision_config = hf_config.vision_config
         self.text_config = hf_config.text_config
-        self.
-        self.
-        self.
-        self.
+        self.IM_START_TOKEN_ID = hf_config.boi_token_index
+        self.IM_END_TOKEN_ID = hf_config.eoi_token_index
+        self.IM_TOKEN_ID = hf_config.image_token_index
+        self.mm_tokens = MultimodalSpecialTokens(
             image_token=_processor.image_token,
-            image_token_id=self.
+            image_token_id=self.IM_TOKEN_ID,
         ).build(_processor)

     async def process_mm_data_async(
@@ -37,114 +37,21 @@ class Mllama4ImageProcessor(BaseMultimodalProcessor):
         *args,
         **kwargs,
     ):
-
-        assert len(input_text) and isinstance(input_text[0], int)
-        input_text = self._processor.tokenizer.decode(input_text)
-
-        # Process images and text using the base processor's load_mm_data method
-        processed_data = self.load_mm_data(
+        base_output = self.load_mm_data(
             prompt=input_text,
-            multimodal_tokens=self.multimodal_tokens,
             image_data=image_data,
-
+            multimodal_tokens=self.mm_tokens,
         )

-        # Process the images using the processor
-        processor = self._processor
-
         # Process the prompt and images
-
-
-            images=processed_data.images,
-        )
-
-        # Handle image resolutions and aspect ratios
-        if "pixel_values" not in processor_output:  # no image processed
-            return None
-
-        image_processor = processor.image_processor
-        tokenizer = self._processor.tokenizer
-
-        # Calculate tile size and find supported resolutions
-        tile_size = self.vision_config.image_size
-        max_num_tiles = getattr(self.vision_config, "max_patches", 1)
-
-        possible_resolutions = find_supported_resolutions(
-            max_num_chunks=max_num_tiles,
-            patch_size=SizeDict(height=tile_size, width=tile_size),
+        mm_items, input_ids, _ = self.process_and_combine_mm_data(
+            base_output, self.mm_tokens
         )

-
-
-
-
-
-
-
-            for image in processed_data.images
-        ]
-
-        # Calculate aspect ratios and patches per image
-        aspect_ratios = [
-            (image_size[0] // tile_size, image_size[1] // tile_size)
-            for image_size in best_fit_sizes
-        ]
-
-        patches_per_image = [
-            1 if r_h * r_w == 1 else 1 + r_h * r_w for (r_h, r_w) in aspect_ratios
-        ]
-
-        # Add to image_inputs
-        processor_output["aspect_ratios"] = aspect_ratios
-        processor_output["patches_per_image"] = torch.tensor(patches_per_image)
-
-        # Process embed_is_patch
-        vocab = tokenizer.get_vocab()
-        patch_id = vocab.get(processor.img_patch_token, -1)
-        image_end_id = vocab.get(processor.end_of_img_token, -1)
-
-        if patch_id != -1 and image_end_id != -1:
-            input_ids = processor_output["input_ids"].view(-1)
-
-            # Remove BOS token if present
-            if input_ids.size(0) > 0 and input_ids[0] == tokenizer.bos_token_id:
-                input_ids = input_ids[1:]
-
-            # Find image end indices and split input_ids
-            image_end_indices = (input_ids == image_end_id).nonzero().view(-1)
-
-            if image_end_indices.size(0) > 0:
-                # Split at image boundaries
-                split_indices = (image_end_indices + 1)[:-1]
-                split_input_ids = torch.tensor_split(input_ids, split_indices)
-                split_input_ids = [x for x in split_input_ids if x.numel() > 0]
-
-                # Create embed_is_patch for each image
-                embed_is_patch = []
-                for per_image_input_ids in split_input_ids:
-                    embed_is_patch.append(per_image_input_ids == patch_id)
-
-                processor_output["embed_is_patch"] = embed_is_patch
-
-        # Convert to the format expected by SGLang
-        processor_output["input_ids"] = processor_output["input_ids"].tolist()[0]
-
-        processor_output["im_start_id"] = self.boi_token_index
-        processor_output["im_end_id"] = self.eoi_token_index
-        processor_output["im_token_id"] = self.image_token_index
-
-        image_offsets = self.get_mm_items_offset(
-            input_ids=torch.tensor(processor_output["input_ids"]),
-            mm_token_id=self.image_token_index,
-        )
-
-        # Add metadata for image processing
-        processor_output["mm_items"] = [
-            MultimodalDataItem(
-                feature=processor_output["pixel_values"],
-                modality=Modality.IMAGE,
-                offsets=image_offsets,
-            )
-        ]
-
-        return processor_output
+        return {
+            "input_ids": input_ids.tolist(),
+            "mm_items": mm_items,
+            "im_start_id": self.IM_START_TOKEN_ID,
+            "im_end_id": self.IM_END_TOKEN_ID,
+            "im_token_id": self.IM_TOKEN_ID,
+        }
sglang/srt/multimodal/processors/phi4mm.py
CHANGED
@@ -47,9 +47,9 @@ class Phi4MMProcessorAdapter(ProcessorMixin):
 class Phi4MMMultimodalProcessor(BaseMultimodalProcessor):
     models = [Phi4MMForCausalLM]

-    def __init__(self, hf_config, server_args, _processor):
+    def __init__(self, hf_config, server_args, _processor, *args, **kwargs):
         self.processor = Phi4MMProcessorAdapter(_processor)
-        super().__init__(hf_config, server_args, self.processor)
+        super().__init__(hf_config, server_args, self.processor, *args, **kwargs)

         # the following CONSTANTS come from hugging-face microsoft/Phi-4-multimodal-instruct's processing_phi4mm.py file
         # ref: https://huggingface.co/microsoft/Phi-4-multimodal-instruct/blob/main/processing_phi4mm.py
sglang/srt/multimodal/processors/pixtral.py
CHANGED
@@ -42,8 +42,8 @@ class PixtralProcessor(BaseMultimodalProcessor):

         return ncols, nrows

-    def __init__(self, hf_config, server_args, _processor):
-        super().__init__(hf_config, server_args, _processor)
+    def __init__(self, hf_config, server_args, _processor, *args, **kwargs):
+        super().__init__(hf_config, server_args, _processor, *args, **kwargs)
         self.IM_TOKEN_ID = getattr(
             hf_config, "image_token_index", PixtralVisionModel.DEFAULT_IMAGE_TOKEN_ID
         )
sglang/srt/multimodal/processors/qwen_audio.py
CHANGED
@@ -11,8 +11,8 @@ from sglang.srt.multimodal.processors.base_processor import (
 class Qwen2AudioMultimodalProcessor(BaseMultimodalProcessor):
     models = [Qwen2AudioForConditionalGeneration]

-    def __init__(self, hf_config, server_args, _processor):
-        super().__init__(hf_config, server_args, _processor)
+    def __init__(self, hf_config, server_args, _processor, *args, **kwargs):
+        super().__init__(hf_config, server_args, _processor, *args, **kwargs)
         self.AUDIO_TOKEN = "<|audio_bos|><|AUDIO|><|audio_eos|>"
         self.AUDIO_TOKEN_REGEX = re.compile(
             r"<\|audio_bos\|>(?:<\|AUDIO\|>)+<\|audio_eos\|>"
sglang/srt/multimodal/processors/qwen_vl.py
CHANGED
@@ -201,8 +201,8 @@ async def preprocess_video(
 class Qwen2_5VLImageProcessor(SGLangBaseProcessor):
     models = [Qwen2VLForConditionalGeneration, Qwen2_5_VLForConditionalGeneration]

-    def __init__(self, hf_config, server_args, _processor):
-        super().__init__(hf_config, server_args, _processor)
+    def __init__(self, hf_config, server_args, _processor, *args, **kwargs):
+        super().__init__(hf_config, server_args, _processor, *args, **kwargs)
         # The regex that matches expanded image tokens.
         self.IM_START_TOKEN_ID = hf_config.vision_start_token_id
         self.IM_END_TOKEN_ID = hf_config.vision_end_token_id
sglang/srt/multimodal/processors/vila.py
CHANGED
@@ -34,8 +34,10 @@ class VILAMultimodalProcessor(BaseMultimodalProcessor):
         hf_config: PretrainedConfig,
         server_args: ServerArgs,
         _processor: VILAProcessor,
+        *args,
+        **kwargs,
     ) -> None:
-        super().__init__(hf_config, server_args, _processor)
+        super().__init__(hf_config, server_args, _processor, *args, **kwargs)
         self.mm_tokens = MultimodalSpecialTokens(
             image_token=self._processor.tokenizer.image_token,
             image_token_id=hf_config.image_token_id,
sglang/srt/reasoning_parser.py
CHANGED
@@ -32,7 +32,7 @@ class BaseReasoningFormatDetector:
         One-time parsing: Detects and parses reasoning sections in the provided text.
         Returns both reasoning content and normal text separately.
         """
-        in_reasoning = self._in_reasoning or
+        in_reasoning = self._in_reasoning or self.think_start_token in text

         if not in_reasoning:
             return StreamingParseResult(normal_text=text)
@@ -118,6 +118,14 @@ class DeepSeekR1Detector(BaseReasoningFormatDetector):
     Returns all the text before the </think> tag as `reasoning_text`
     and the rest of the text as `normal_text`.

+    Supported models:
+    - DeepSeek-R1: Always generates thinking content without <think> start tag
+    - DeepSeek-R1-0528: Generates thinking content with <think> start tag
+
+    Format patterns:
+    - DeepSeek-R1: "I need to think about this...</think>The answer is 42."
+    - DeepSeek-R1-0528: "<think>I need to think about this...</think>The answer is 42."
+
     Args:
         stream_reasoning (bool): If False, accumulates reasoning content until the end tag.
             If True, streams reasoning content as it arrives.
@@ -136,11 +144,20 @@ class DeepSeekR1Detector(BaseReasoningFormatDetector):

 class Qwen3Detector(BaseReasoningFormatDetector):
     """
-    Detector for Qwen3
+    Detector for standard Qwen3 models (e.g., Qwen/Qwen3-235B-A22B).
     Assumes reasoning format:
       (<think>)*(.*)</think>
-
-
+
+    Qwen3 models released before 07/2025 supports switching between thinking mode and normal
+    mode using `enable_thinking` parameter in the request parameter.
+    - enable_thinking=True: "<think>reasoning content</think>The answer is 42."
+    - enable_thinking=False: "The answer is 42." (no thinking tokens)
+
+    This detector handles both cases.
+
+    NOTE: Do NOT use this detector for Qwen3-Thinking models (e.g., Qwen3-Thinking-2507).
+    Those models always generate thinking content without <think> start tags.
+    Use "qwen3-thinking" parser type for those models instead.

     Args:
         stream_reasoning (bool): If False, accumulates reasoning content until the end tag.
@@ -148,7 +165,6 @@ class Qwen3Detector(BaseReasoningFormatDetector):
     """

     def __init__(self, stream_reasoning: bool = True):
-        # Qwen3 won't be in reasoning mode when user passes `enable_thinking=False`
         super().__init__(
             "<think>",
             "</think>",
@@ -157,6 +173,31 @@ class Qwen3Detector(BaseReasoningFormatDetector):
         )


+class Qwen3ThinkingDetector(BaseReasoningFormatDetector):
+    """
+    Detector for Qwen3-Thinking models (e.g., Qwen3-Thinking-2507).
+    Assumes reasoning format:
+      *(.*)</think>
+
+    These models always generate thinking content without <think> start tag.
+    They do not support the enable_thinking parameter and always think.
+
+    Format: "I need to think about this...</think>The answer is 42."
+
+    Args:
+        stream_reasoning (bool): If False, accumulates reasoning content until the end tag.
+            If True, streams reasoning content as it arrives.
+    """
+
+    def __init__(self, stream_reasoning: bool = True):
+        super().__init__(
+            "<think>",
+            "</think>",
+            force_reasoning=True,
+            stream_reasoning=stream_reasoning,
+        )
+
+
 class KimiDetector(BaseReasoningFormatDetector):
     """
     Detector for Kimi Thinking model.
@@ -189,6 +230,8 @@ class ReasoningParser:
     DetectorMap: Dict[str, Type[BaseReasoningFormatDetector]] = {
         "deepseek-r1": DeepSeekR1Detector,
         "qwen3": Qwen3Detector,
+        "qwen3-thinking": Qwen3ThinkingDetector,
+        "glm45": Qwen3Detector,
         "kimi": KimiDetector,
     }
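
The new "qwen3-thinking" and "glm45" parser types plug into the same DetectorMap; the key behavioral difference is that Qwen3-Thinking output has no <think> start tag, so its detector is constructed with force_reasoning=True and only the closing </think> separates reasoning from the final answer. A standalone sketch of that detection rule, mirroring the detect_and_parse logic quoted above (simplified illustration, not the package's implementation):

def split_reasoning(text: str, force_reasoning: bool = True):
    # force_reasoning=True models the Qwen3-Thinking case: reasoning starts
    # immediately, with no leading <think> tag.
    think_start, think_end = "<think>", "</think>"
    in_reasoning = force_reasoning or think_start in text
    if not in_reasoning:
        return "", text  # e.g. Qwen3 with enable_thinking=False
    text = text.replace(think_start, "").strip()
    if think_end not in text:
        return text, ""  # still thinking: everything so far is reasoning
    reasoning, normal = text.split(think_end, 1)
    return reasoning.strip(), normal.strip()

# "I need to think about this...</think>The answer is 42."
#   -> ("I need to think about this...", "The answer is 42.")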
sglang/srt/sampling/sampling_batch_info.py
CHANGED
@@ -322,6 +322,12 @@ class SamplingBatchInfo:
         # Set the flag to True if any of the two has custom logit processor
         self.has_custom_logit_processor = True

+        # Merge logit bias - note this has to come before the temperatures tensor update! Otherwise will cause crashes.
+        # See note below on len(self) and len(other).
+        self.logit_bias = merge_bias_tensor(
+            self.logit_bias, other.logit_bias, len(self), len(other), self.device, 0.0
+        )
+
         # Note: because the __len()__ operator is defined on the temperatures tensor,
         # please make sure any merge operation with len(self) or len(other) is done before
         # the merge operation of the temperatures tensor below.
@@ -340,11 +346,6 @@
         self.need_top_k_sampling |= other.need_top_k_sampling
         self.need_min_p_sampling |= other.need_min_p_sampling

-        # Merge logit bias
-        self.logit_bias = merge_bias_tensor(
-            self.logit_bias, other.logit_bias, len(self), len(other), self.device, 0.0
-        )
-


 def merge_bias_tensor(
     lhs: Optional[torch.Tensor],