sglang 0.4.10.post2__py3-none-any.whl → 0.5.0rc1__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- sglang/__init__.py +8 -3
- sglang/bench_one_batch.py +119 -17
- sglang/lang/chat_template.py +18 -0
- sglang/srt/bench_utils.py +137 -0
- sglang/srt/configs/model_config.py +42 -7
- sglang/srt/conversation.py +9 -5
- sglang/srt/disaggregation/base/conn.py +5 -2
- sglang/srt/disaggregation/decode.py +14 -4
- sglang/srt/disaggregation/decode_schedule_batch_mixin.py +3 -0
- sglang/srt/disaggregation/mooncake/conn.py +286 -160
- sglang/srt/disaggregation/mooncake/transfer_engine.py +29 -0
- sglang/srt/disaggregation/prefill.py +2 -0
- sglang/srt/distributed/parallel_state.py +15 -11
- sglang/srt/entrypoints/context.py +227 -0
- sglang/srt/entrypoints/engine.py +15 -9
- sglang/srt/entrypoints/harmony_utils.py +372 -0
- sglang/srt/entrypoints/http_server.py +74 -4
- sglang/srt/entrypoints/openai/protocol.py +218 -1
- sglang/srt/entrypoints/openai/serving_chat.py +41 -11
- sglang/srt/entrypoints/openai/serving_responses.py +1273 -0
- sglang/srt/entrypoints/openai/tool_server.py +175 -0
- sglang/srt/entrypoints/tool.py +87 -0
- sglang/srt/eplb/expert_location.py +5 -1
- sglang/srt/function_call/ebnf_composer.py +1 -0
- sglang/srt/function_call/function_call_parser.py +2 -0
- sglang/srt/function_call/glm4_moe_detector.py +1 -1
- sglang/srt/function_call/gpt_oss_detector.py +331 -0
- sglang/srt/function_call/kimik2_detector.py +3 -3
- sglang/srt/function_call/qwen3_coder_detector.py +219 -9
- sglang/srt/hf_transformers_utils.py +30 -3
- sglang/srt/jinja_template_utils.py +14 -1
- sglang/srt/layers/attention/aiter_backend.py +375 -115
- sglang/srt/layers/attention/ascend_backend.py +3 -0
- sglang/srt/layers/attention/dual_chunk_flashattention_backend.py +1700 -0
- sglang/srt/layers/attention/flashattention_backend.py +18 -0
- sglang/srt/layers/attention/flashinfer_backend.py +52 -13
- sglang/srt/layers/attention/hybrid_attn_backend.py +1 -1
- sglang/srt/layers/attention/triton_backend.py +85 -14
- sglang/srt/layers/attention/triton_ops/decode_attention.py +17 -0
- sglang/srt/layers/attention/triton_ops/extend_attention.py +143 -98
- sglang/srt/layers/attention/trtllm_mha_backend.py +332 -0
- sglang/srt/layers/attention/trtllm_mla_backend.py +119 -22
- sglang/srt/layers/attention/vision.py +22 -6
- sglang/srt/layers/attention/wave_backend.py +627 -0
- sglang/srt/layers/attention/wave_ops/decode_attention.py +186 -0
- sglang/srt/layers/attention/wave_ops/extend_attention.py +149 -0
- sglang/srt/layers/attention/wave_ops/prefill_attention.py +79 -0
- sglang/srt/layers/communicator.py +29 -14
- sglang/srt/layers/dp_attention.py +12 -0
- sglang/srt/layers/flashinfer_comm_fusion.py +4 -4
- sglang/srt/layers/linear.py +3 -7
- sglang/srt/layers/moe/cutlass_moe.py +12 -3
- sglang/srt/layers/moe/cutlass_w4a8_moe.py +4 -5
- sglang/srt/layers/moe/ep_moe/kernels.py +43 -0
- sglang/srt/layers/moe/ep_moe/layer.py +135 -73
- sglang/srt/layers/moe/fused_moe_triton/configs/triton_3_4_0/E=128,N=768,device_name=NVIDIA_B200,dtype=fp8_w8a8,block_shape=[128, 128].json +146 -0
- sglang/srt/layers/moe/fused_moe_triton/configs/triton_3_4_0/E=384,N=256,device_name=NVIDIA_B200,dtype=fp8_w8a8,block_shape=[128, 128].json +146 -0
- sglang/srt/layers/moe/fused_moe_triton/fused_moe.py +101 -12
- sglang/srt/layers/moe/fused_moe_triton/layer.py +412 -33
- sglang/srt/layers/moe/fused_moe_triton/triton_kernels_moe.py +188 -3
- sglang/srt/layers/moe/token_dispatcher/deepep.py +61 -24
- sglang/srt/layers/moe/topk.py +16 -4
- sglang/srt/layers/moe/utils.py +16 -0
- sglang/srt/layers/quantization/__init__.py +27 -3
- sglang/srt/layers/quantization/fp4.py +557 -0
- sglang/srt/layers/quantization/fp8.py +3 -6
- sglang/srt/layers/quantization/fp8_kernel.py +277 -0
- sglang/srt/layers/quantization/fp8_utils.py +51 -10
- sglang/srt/layers/quantization/modelopt_quant.py +258 -68
- sglang/srt/layers/quantization/mxfp4.py +654 -0
- sglang/srt/layers/quantization/mxfp4_tensor.py +133 -0
- sglang/srt/layers/quantization/quark/schemes/__init__.py +6 -0
- sglang/srt/layers/quantization/quark/schemes/quark_scheme.py +55 -0
- sglang/srt/layers/quantization/quark/schemes/quark_w4a4_mxfp4.py +118 -0
- sglang/srt/layers/quantization/quark/utils.py +107 -0
- sglang/srt/layers/quantization/unquant.py +60 -6
- sglang/srt/layers/quantization/w4afp8.py +21 -12
- sglang/srt/layers/quantization/w8a8_int8.py +48 -34
- sglang/srt/layers/rotary_embedding.py +506 -3
- sglang/srt/layers/utils.py +9 -0
- sglang/srt/layers/vocab_parallel_embedding.py +8 -3
- sglang/srt/lora/backend/base_backend.py +3 -23
- sglang/srt/lora/layers.py +60 -114
- sglang/srt/lora/lora.py +17 -62
- sglang/srt/lora/lora_manager.py +82 -62
- sglang/srt/lora/lora_registry.py +23 -11
- sglang/srt/lora/mem_pool.py +63 -68
- sglang/srt/lora/triton_ops/qkv_lora_b.py +1 -1
- sglang/srt/lora/utils.py +25 -58
- sglang/srt/managers/cache_controller.py +75 -58
- sglang/srt/managers/detokenizer_manager.py +1 -1
- sglang/srt/managers/io_struct.py +20 -8
- sglang/srt/managers/mm_utils.py +6 -13
- sglang/srt/managers/multimodal_processor.py +1 -1
- sglang/srt/managers/schedule_batch.py +61 -25
- sglang/srt/managers/schedule_policy.py +6 -6
- sglang/srt/managers/scheduler.py +41 -19
- sglang/srt/managers/scheduler_output_processor_mixin.py +1 -2
- sglang/srt/managers/scheduler_profiler_mixin.py +28 -8
- sglang/srt/managers/scheduler_recv_skipper.py +37 -0
- sglang/srt/managers/scheduler_update_weights_mixin.py +6 -0
- sglang/srt/managers/template_manager.py +35 -1
- sglang/srt/managers/tokenizer_manager.py +47 -30
- sglang/srt/managers/tp_worker.py +3 -0
- sglang/srt/managers/tp_worker_overlap_thread.py +3 -0
- sglang/srt/mem_cache/allocator.py +61 -87
- sglang/srt/mem_cache/hicache_storage.py +1 -1
- sglang/srt/mem_cache/hiradix_cache.py +80 -22
- sglang/srt/mem_cache/lora_radix_cache.py +421 -0
- sglang/srt/mem_cache/memory_pool_host.py +34 -36
- sglang/srt/mem_cache/multimodal_cache.py +33 -13
- sglang/srt/mem_cache/radix_cache.py +2 -5
- sglang/srt/mem_cache/storage/hf3fs/client_hf3fs.py +2 -2
- sglang/srt/mem_cache/storage/hf3fs/mini_3fs_metadata_server.py +443 -0
- sglang/srt/mem_cache/storage/hf3fs/storage_hf3fs.py +139 -67
- sglang/srt/mem_cache/storage/mooncake_store/mooncake_store.py +6 -9
- sglang/srt/model_executor/cuda_graph_runner.py +29 -9
- sglang/srt/model_executor/forward_batch_info.py +61 -19
- sglang/srt/model_executor/model_runner.py +148 -37
- sglang/srt/model_loader/loader.py +18 -6
- sglang/srt/model_loader/weight_utils.py +10 -0
- sglang/srt/models/bailing_moe.py +425 -0
- sglang/srt/models/deepseek_v2.py +137 -59
- sglang/srt/models/ernie4.py +426 -0
- sglang/srt/models/ernie4_eagle.py +203 -0
- sglang/srt/models/gemma2.py +0 -34
- sglang/srt/models/gemma3n_mm.py +38 -0
- sglang/srt/models/glm4.py +6 -0
- sglang/srt/models/glm4_moe.py +28 -16
- sglang/srt/models/glm4v.py +589 -0
- sglang/srt/models/glm4v_moe.py +400 -0
- sglang/srt/models/gpt_oss.py +1251 -0
- sglang/srt/models/granite.py +0 -25
- sglang/srt/models/llama.py +0 -25
- sglang/srt/models/llama4.py +1 -1
- sglang/srt/models/qwen2.py +6 -0
- sglang/srt/models/qwen2_5_vl.py +7 -3
- sglang/srt/models/qwen2_audio.py +10 -9
- sglang/srt/models/qwen2_moe.py +6 -0
- sglang/srt/models/qwen3.py +0 -24
- sglang/srt/models/qwen3_moe.py +32 -6
- sglang/srt/models/registry.py +1 -1
- sglang/srt/models/step3_vl.py +9 -0
- sglang/srt/models/torch_native_llama.py +0 -24
- sglang/srt/models/transformers.py +2 -5
- sglang/srt/multimodal/processors/base_processor.py +23 -13
- sglang/srt/multimodal/processors/glm4v.py +132 -0
- sglang/srt/multimodal/processors/qwen_audio.py +4 -2
- sglang/srt/multimodal/processors/step3_vl.py +3 -1
- sglang/srt/reasoning_parser.py +332 -37
- sglang/srt/server_args.py +186 -75
- sglang/srt/speculative/eagle_worker.py +16 -0
- sglang/srt/two_batch_overlap.py +169 -9
- sglang/srt/utils.py +41 -5
- sglang/srt/weight_sync/tensor_bucket.py +106 -0
- sglang/test/attention/test_trtllm_mla_backend.py +186 -36
- sglang/test/doc_patch.py +59 -0
- sglang/test/few_shot_gsm8k.py +1 -1
- sglang/test/few_shot_gsm8k_engine.py +1 -1
- sglang/test/run_eval.py +4 -1
- sglang/test/runners.py +2 -2
- sglang/test/simple_eval_common.py +6 -0
- sglang/test/simple_eval_gpqa.py +2 -0
- sglang/test/test_fp4_moe.py +118 -36
- sglang/test/test_utils.py +1 -1
- sglang/utils.py +1 -1
- sglang/version.py +1 -1
- {sglang-0.4.10.post2.dist-info → sglang-0.5.0rc1.dist-info}/METADATA +36 -38
- {sglang-0.4.10.post2.dist-info → sglang-0.5.0rc1.dist-info}/RECORD +174 -141
- sglang/srt/lora/backend/flashinfer_backend.py +0 -131
- /sglang/{api.py → lang/api.py} +0 -0
- /sglang/{lang/backend → srt/layers/quantization/quark}/__init__.py +0 -0
- {sglang-0.4.10.post2.dist-info → sglang-0.5.0rc1.dist-info}/WHEEL +0 -0
- {sglang-0.4.10.post2.dist-info → sglang-0.5.0rc1.dist-info}/licenses/LICENSE +0 -0
- {sglang-0.4.10.post2.dist-info → sglang-0.5.0rc1.dist-info}/top_level.txt +0 -0
sglang/srt/multimodal/processors/step3_vl.py
CHANGED
@@ -8,7 +8,7 @@ import torch
 from PIL import Image
 from torchvision import transforms
 from torchvision.transforms import InterpolationMode
-from transformers import BatchFeature, TensorType
+from transformers import BatchFeature, ProcessorMixin, TensorType

 from sglang.srt.models.step3_vl import Step3VLForConditionalGeneration
 from sglang.srt.multimodal.processors.base_processor import (
@@ -276,6 +276,8 @@ class Step3VLProcessor:
         super().__init__()

         self.config = config
+        if isinstance(tokenizer, ProcessorMixin):
+            tokenizer = tokenizer.tokenizer
         self.tokenizer = tokenizer

         self.image_size = 728
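Note: the two added lines above handle the case where the caller passes a full transformers processor rather than a bare tokenizer. A minimal sketch of the same unwrap logic outside the diff (the helper name is hypothetical, not part of sglang):

from transformers import ProcessorMixin

def unwrap_tokenizer(tokenizer_or_processor):
    # Mirrors the guard added to Step3VLProcessor.__init__: a composite
    # processor (a transformers ProcessorMixin) carries its tokenizer on the
    # .tokenizer attribute, so unwrap it; plain tokenizers pass through.
    if isinstance(tokenizer_or_processor, ProcessorMixin):
        return tokenizer_or_processor.tokenizer
    return tokenizer_or_processor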
sglang/srt/reasoning_parser.py
CHANGED
@@ -1,3 +1,4 @@
+import re
 from typing import Dict, Optional, Tuple, Type


@@ -131,7 +132,7 @@ class DeepSeekR1Detector(BaseReasoningFormatDetector):
             If True, streams reasoning content as it arrives.
     """

-    def __init__(self, stream_reasoning: bool = True):
+    def __init__(self, stream_reasoning: bool = True, force_reasoning: bool = True):
         # DeepSeek-R1 is assumed to be reasoning until `</think>` token
         super().__init__(
             "<think>",
@@ -144,7 +145,7 @@ class DeepSeekR1Detector(BaseReasoningFormatDetector):

 class Qwen3Detector(BaseReasoningFormatDetector):
     """
-    Detector for
+    Detector for Qwen3 models (e.g., Qwen/Qwen3-235B-A22B).
     Assumes reasoning format:
       (<think>)*(.*)</think>

@@ -153,68 +154,351 @@ class Qwen3Detector(BaseReasoningFormatDetector):
     - enable_thinking=True: "<think>reasoning content</think>The answer is 42."
     - enable_thinking=False: "The answer is 42." (no thinking tokens)

-    This detector handles both cases.
-
-    NOTE: Do NOT use this detector for Qwen3-Thinking models (e.g., Qwen3-Thinking-2507).
-    Those models always generate thinking content without <think> start tags.
-    Use "qwen3-thinking" parser type for those models instead.
-
     Args:
         stream_reasoning (bool): If False, accumulates reasoning content until the end tag.
             If True, streams reasoning content as it arrives.
     """

-    def __init__(self, stream_reasoning: bool = True):
+    def __init__(self, stream_reasoning: bool = True, force_reasoning: bool = False):
         super().__init__(
             "<think>",
             "</think>",
-            force_reasoning=
+            force_reasoning=force_reasoning,
             stream_reasoning=stream_reasoning,
         )


-class
+class KimiDetector(BaseReasoningFormatDetector):
     """
-    Detector for
+    Detector for Kimi Thinking model.
     Assumes reasoning format:
-
+      ◁think▷*(.*)◁/think▷
+    Returns all the text before the ◁/think▷ tag as `reasoning_text`
+    and the rest of the text as `normal_text`.
+    """
+
+    def __init__(self, stream_reasoning: bool = True, force_reasoning: bool = False):
+        super().__init__(
+            "◁think▷",
+            "◁/think▷",
+            force_reasoning=False,
+            stream_reasoning=stream_reasoning,
+        )

-    These models always generate thinking content without <think> start tag.
-    They do not support the enable_thinking parameter and always think.

-
+class GptOssDetector(BaseReasoningFormatDetector):
+    """
+    Detector for T4-style reasoning format.
+
+    Assumes reasoning format with two channels:
+    <|channel|>analysis<|message|>...reasoning content...<|end|>
+    <|start|>assistant<|channel|>final<|message|>...final answer...<|return|>
+
+    Returns content from 'analysis' channel as reasoning_text
+    and content from 'final' channel as normal_text.

     Args:
-        stream_reasoning (bool): If False, accumulates reasoning content until
+        stream_reasoning (bool): If False, accumulates reasoning content until complete.
             If True, streams reasoning content as it arrives.
     """

-    def __init__(self, stream_reasoning: bool = True):
+    def __init__(self, stream_reasoning: bool = True, force_reasoning: bool = True):
+        # TypeScript uses channel tokens instead of simple start/end tokens
         super().__init__(
-            "
-            "
+            "<|channel|>analysis<|message|>",
+            "<|end|>",
             force_reasoning=True,
             stream_reasoning=stream_reasoning,
         )
+        self.final_channel_start = "<|start|>assistant<|channel|>final<|message|>"
+        self.final_channel_end = "<|return|>"
+        self._in_final_channel = False
+        self._analysis_complete = False
+        self._in_reasoning = True

+    def detect_and_parse(self, text: str) -> StreamingParseResult:
+        """
+        One-time parsing: Detects and parses both analysis and final channels.
+        Tool call channels are preserved in normal_text for downstream processing.

-
-
-
-
-
-
-
-
+        HACK: Also handles simplified format where text starts with "analysis" and transitions
+        to "assistantfinal" without full channel markers.
+        """
+        # HACK: Handle simplified format (analysis...assistantfinal) without channel markers
+        if (
+            text.startswith("analysis")
+            and "assistantfinal" in text
+            and "<|channel|>" not in text
+        ):
+            # Split on "assistantfinal"
+            parts = text.split("assistantfinal", 1)
+            self._in_reasoning = False
+            if len(parts) == 2:
+                reasoning_text = parts[0][
+                    len("analysis") :
+                ].strip()  # Remove "analysis" prefix
+                normal_text = parts[1].strip()
+                return StreamingParseResult(
+                    normal_text=normal_text, reasoning_text=reasoning_text
+                )
+
+        reasoning_parts = []
+        normal_parts = []
+        current_pos = 0
+
+        # Process text sequentially to preserve tool calls between analysis sections
+        while current_pos < len(text):
+            # Look for next analysis channel
+            analysis_start_idx = text.find(self.think_start_token, current_pos)
+
+            if analysis_start_idx == -1:
+                # No more analysis channels, rest goes to remaining
+                break
+
+            # Preserve any content before this analysis channel (could include tool calls)
+            if analysis_start_idx > current_pos:
+                between_content = text[current_pos:analysis_start_idx]
+                # This content will be added to normal_parts later
+                normal_parts.append(between_content)
+
+            # Extract analysis content
+            analysis_content_start = analysis_start_idx + len(self.think_start_token)
+            analysis_end_idx = text.find(self.think_end_token, analysis_content_start)
+
+            if analysis_end_idx != -1:
+                reasoning_parts.append(
+                    text[analysis_content_start:analysis_end_idx].strip()
+                )
+                current_pos = analysis_end_idx + len(self.think_end_token)
+            else:
+                # Analysis not complete
+                reasoning_parts.append(text[analysis_content_start:].strip())
+                reasoning_text = "".join(reasoning_parts)
+                return StreamingParseResult(reasoning_text=reasoning_text)
+
+        # Add any remaining text after all analysis sections
+        if current_pos < len(text):
+            remaining = text[current_pos:]
+            normal_parts.append(remaining)
+
+        # Process non-analysis content for commentary sections
+        full_normal_text = "".join(normal_parts)
+
+        # Extract reasoning from non-tool-call commentary sections
+        # Tool calls have "to=" in their header, regular commentary does not
+        commentary_pattern = re.compile(
+            r"<\|start\|>assistant<\|channel\|>commentary<\|message\|>(.*?)(?:<\|end\|>|<\|call\|>)",
+            re.DOTALL,
+        )

-
-
-
-
-
-
+        cleaned_text = full_normal_text
+        for match in reversed(list(commentary_pattern.finditer(full_normal_text))):
+            # Check if this commentary is a tool call by looking at the text before <|message|>
+            match_start = match.start()
+            # Find where "<|channel|>commentary" starts within the matched pattern
+            # The pattern starts with "<|start|>assistant<|channel|>commentary"
+            # So we look for the text between "commentary" and "<|message|>" in the match
+            match_text = full_normal_text[match_start : match.end()]
+            commentary_idx = match_text.find("<|channel|>commentary")
+            if commentary_idx != -1:
+                message_idx = match_text.find("<|message|>", commentary_idx)
+                if message_idx != -1:
+                    between_text = match_text[commentary_idx:message_idx]
+                    # If no "to=" found, this is regular commentary (reasoning content)
+                    if " to=" not in between_text:
+                        content = match.group(1).strip()
+                        reasoning_parts.append(content)
+                        # Remove this commentary section from normal text
+                        cleaned_text = (
+                            cleaned_text[: match.start()] + cleaned_text[match.end() :]
+                        )
+
+        full_normal_text = cleaned_text
+
+        # Combine all reasoning parts
+        reasoning_text = "".join(reasoning_parts)
+
+        # Process full_normal_text for final output
+        normal_text = ""
+        if self.final_channel_start in full_normal_text:
+            final_start = full_normal_text.find(self.final_channel_start)
+            final_content_start = final_start + len(self.final_channel_start)
+            final_end = full_normal_text.find(
+                self.final_channel_end, final_content_start
+            )
+
+            if final_end != -1:
+                # Extract content before final channel (includes tool calls)
+                before_final = full_normal_text[:final_start].strip()
+                # Extract ONLY the final channel content (not the channel markers)
+                final_text = full_normal_text[final_content_start:final_end].strip()
+                # Extract content after final channel
+                after_final = full_normal_text[
+                    final_end + len(self.final_channel_end) :
+                ].strip()
+
+                # For tool calls + final answer: concatenate tool calls with final text
+                parts = []
+                if before_final:
+                    parts.append(before_final)
+                if final_text:
+                    parts.append(final_text)
+                if after_final:
+                    parts.append(after_final)
+                normal_text = " ".join(parts)
+            else:
+                # Final channel not complete - extract what we have
+                # Look for just <|channel|>final<|message|> without <|return|>
+                alt_final_start = full_normal_text.find("<|channel|>final<|message|>")
+                if alt_final_start != -1:
+                    before_alt_final = full_normal_text[:alt_final_start].strip()
+                    alt_final_content = full_normal_text[
+                        alt_final_start + len("<|channel|>final<|message|>") :
+                    ].strip()
+
+                    parts = []
+                    if before_alt_final:
+                        parts.append(before_alt_final)
+                    if alt_final_content:
+                        parts.append(alt_final_content)
+                    normal_text = " ".join(parts)
+                else:
+                    normal_text = full_normal_text.strip()
+        else:
+            # No final channel, treat all as normal text (includes tool calls)
+            normal_text = full_normal_text.strip()
+
+        return StreamingParseResult(
+            normal_text=normal_text, reasoning_text=reasoning_text
         )

+    def parse_streaming_increment(self, new_text: str) -> StreamingParseResult:
+        """
+        Streaming incremental parsing for GPT-OSS format.
+
+        This is a simplified streaming implementation that accumulates content
+        and delegates to the non-streaming parser for complex multi-channel parsing.
+        TODO: Implement proper incremental parsing for better streaming performance.
+        """
+        self._buffer += new_text
+
+        if not self._in_reasoning:
+            return StreamingParseResult(normal_text=new_text)
+
+        # Check if we have complete sections to process
+        # For GPT-OSS, we need to wait for complete channel sections
+        # HACK: For now, use simplified approach - wait for key markers before processing
+        key_markers = ["<|end|>", "<|call|>", "<|return|>", "assistantfinal"]
+        has_complete_section = any(marker in self._buffer for marker in key_markers)
+
+        if not has_complete_section:
+            # Still accumulating, don't process yet
+            return StreamingParseResult()
+
+        # Handle simplified format (analysis...assistantfinal) with true incremental streaming
+        if (
+            "<|channel|>" not in self._buffer
+        ):  # Simplified format without channel markers
+            if self._buffer.startswith("analysis"):
+                # Check if we have the transition to assistantfinal
+                if "assistantfinal" in self._buffer:
+                    self._in_reasoning = False
+                    # Complete reasoning section - extract and stream it
+                    parts = self._buffer.split("assistantfinal", 1)
+                    reasoning_text = parts[0][len("analysis") :].strip()
+                    final_content = parts[1].strip()
+
+                    # Clear buffer and return both reasoning and final content
+                    self._buffer = ""
+                    return StreamingParseResult(
+                        reasoning_text=reasoning_text if self.stream_reasoning else "",
+                        normal_text=final_content,
+                    )
+                elif self.stream_reasoning:
+                    # Stream reasoning content incrementally as it arrives
+                    current_reasoning = self._buffer[len("analysis") :].strip()
+                    self._buffer = ""
+                    return StreamingParseResult(reasoning_text=current_reasoning)
+                else:
+                    # Wait for assistantfinal
+                    return StreamingParseResult()
+            elif self._buffer.startswith("assistantfinal"):
+                # Direct final content without analysis
+                final_content = self._buffer[len("assistantfinal") :].strip()
+                self._buffer = ""
+                return StreamingParseResult(normal_text=final_content)
+
+        # For full channel format, process sections as they complete
+        result = StreamingParseResult()
+
+        # Process complete analysis sections
+        while (
+            self.think_start_token in self._buffer
+            and self.think_end_token in self._buffer
+        ):
+            start_idx = self._buffer.find(self.think_start_token)
+            start_pos = start_idx + len(self.think_start_token)
+            end_pos = self._buffer.find(self.think_end_token, start_pos)
+
+            if end_pos != -1:
+                reasoning_content = self._buffer[start_pos:end_pos].strip()
+                if self.stream_reasoning and reasoning_content:
+                    result.reasoning_text += reasoning_content
+
+                # Remove processed analysis section
+                self._buffer = (
+                    self._buffer[:start_idx]
+                    + self._buffer[end_pos + len(self.think_end_token) :]
+                )
+            else:
+                break
+
+        # Process complete commentary sections
+        commentary_pattern = re.compile(
+            r"<\|start\|>assistant<\|channel\|>commentary<\|message\|>(.*?)(?:<\|end\|>|<\|call\|>)",
+            re.DOTALL,
+        )
+
+        for match in reversed(list(commentary_pattern.finditer(self._buffer))):
+            # Check if this is a tool call
+            start_pos = match.start()
+            commentary_content = match.group(1).strip()
+            if self.stream_reasoning and commentary_content:
+                result.reasoning_text += commentary_content
+
+            # Remove this commentary section
+            self._buffer = self._buffer[: match.start()] + self._buffer[match.end() :]
+        # Clean up any standalone <|start|>assistant
+        self._buffer = re.sub(
+            r"<\|start\|>assistant(?=<\|start\|>assistant)", "", self._buffer
+        )
+
+        # Handle final channel completion
+        if self.final_channel_start in self._buffer:
+            final_start = self._buffer.find(self.final_channel_start)
+            final_content_start = final_start + len(self.final_channel_start)
+
+            # Check if final channel is complete
+            final_end = self._buffer.find(self.final_channel_end, final_content_start)
+            if final_end != -1:
+                # Complete final channel - process everything
+                final_result = self.detect_and_parse(self._buffer)
+                self._buffer = ""
+                return StreamingParseResult(
+                    normal_text=final_result.normal_text,
+                    reasoning_text=result.reasoning_text + final_result.reasoning_text,
+                )
+            else:
+                # Extract content before final channel (e.g. tool calls)
+                before_final = self._buffer[:final_start]
+                if before_final:
+                    # Output tool calls for processing
+                    result.normal_text += before_final
+                # Keep the final channel part in buffer
+                self._buffer = self._buffer[final_start:]
+
+        return result
+

 class ReasoningParser:
     """
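For orientation, a hedged usage sketch (not part of the diff) of how the new gpt-oss parser is expected to split a channel-formatted completion, using only names visible in this diff (the "gpt-oss" DetectorMap entry registered in the hunk below, detect_and_parse, and the reasoning_text/normal_text fields of StreamingParseResult); the sample text is made up:

from sglang.srt.reasoning_parser import ReasoningParser

text = (
    "<|channel|>analysis<|message|>Let me think step by step.<|end|>"
    "<|start|>assistant<|channel|>final<|message|>The answer is 42.<|return|>"
)

parser = ReasoningParser(model_type="gpt-oss", stream_reasoning=False)
result = parser.detector.detect_and_parse(text)
# Tracing the detect_and_parse logic above:
#   result.reasoning_text == "Let me think step by step."
#   result.normal_text    == "The answer is 42."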
@@ -230,13 +514,19 @@ class ReasoningParser:
     DetectorMap: Dict[str, Type[BaseReasoningFormatDetector]] = {
         "deepseek-r1": DeepSeekR1Detector,
         "qwen3": Qwen3Detector,
-        "qwen3-thinking":
+        "qwen3-thinking": Qwen3Detector,
         "glm45": Qwen3Detector,
         "kimi": KimiDetector,
         "step3": DeepSeekR1Detector,
+        "gpt-oss": GptOssDetector,
     }

-    def __init__(
+    def __init__(
+        self,
+        model_type: Optional[str] = None,
+        stream_reasoning: bool = True,
+        force_reasoning: bool = False,
+    ):
         if not model_type:
             raise ValueError("Model type must be specified")

@@ -244,7 +534,12 @@ class ReasoningParser:
         if not detector_class:
             raise ValueError(f"Unsupported model type: {model_type}")

-
+        if model_type.lower() == "qwen3-thinking":
+            force_reasoning = True
+
+        self.detector = detector_class(
+            stream_reasoning=stream_reasoning, force_reasoning=force_reasoning
+        )

     def parse_non_stream(self, full_text: str) -> Tuple[str, str]:
         """Non-streaming call: one-time parsing"""