sglang 0.4.10.post2__py3-none-any.whl → 0.5.0rc1__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (175)
  1. sglang/__init__.py +8 -3
  2. sglang/bench_one_batch.py +119 -17
  3. sglang/lang/chat_template.py +18 -0
  4. sglang/srt/bench_utils.py +137 -0
  5. sglang/srt/configs/model_config.py +42 -7
  6. sglang/srt/conversation.py +9 -5
  7. sglang/srt/disaggregation/base/conn.py +5 -2
  8. sglang/srt/disaggregation/decode.py +14 -4
  9. sglang/srt/disaggregation/decode_schedule_batch_mixin.py +3 -0
  10. sglang/srt/disaggregation/mooncake/conn.py +286 -160
  11. sglang/srt/disaggregation/mooncake/transfer_engine.py +29 -0
  12. sglang/srt/disaggregation/prefill.py +2 -0
  13. sglang/srt/distributed/parallel_state.py +15 -11
  14. sglang/srt/entrypoints/context.py +227 -0
  15. sglang/srt/entrypoints/engine.py +15 -9
  16. sglang/srt/entrypoints/harmony_utils.py +372 -0
  17. sglang/srt/entrypoints/http_server.py +74 -4
  18. sglang/srt/entrypoints/openai/protocol.py +218 -1
  19. sglang/srt/entrypoints/openai/serving_chat.py +41 -11
  20. sglang/srt/entrypoints/openai/serving_responses.py +1273 -0
  21. sglang/srt/entrypoints/openai/tool_server.py +175 -0
  22. sglang/srt/entrypoints/tool.py +87 -0
  23. sglang/srt/eplb/expert_location.py +5 -1
  24. sglang/srt/function_call/ebnf_composer.py +1 -0
  25. sglang/srt/function_call/function_call_parser.py +2 -0
  26. sglang/srt/function_call/glm4_moe_detector.py +1 -1
  27. sglang/srt/function_call/gpt_oss_detector.py +331 -0
  28. sglang/srt/function_call/kimik2_detector.py +3 -3
  29. sglang/srt/function_call/qwen3_coder_detector.py +219 -9
  30. sglang/srt/hf_transformers_utils.py +30 -3
  31. sglang/srt/jinja_template_utils.py +14 -1
  32. sglang/srt/layers/attention/aiter_backend.py +375 -115
  33. sglang/srt/layers/attention/ascend_backend.py +3 -0
  34. sglang/srt/layers/attention/dual_chunk_flashattention_backend.py +1700 -0
  35. sglang/srt/layers/attention/flashattention_backend.py +18 -0
  36. sglang/srt/layers/attention/flashinfer_backend.py +52 -13
  37. sglang/srt/layers/attention/hybrid_attn_backend.py +1 -1
  38. sglang/srt/layers/attention/triton_backend.py +85 -14
  39. sglang/srt/layers/attention/triton_ops/decode_attention.py +17 -0
  40. sglang/srt/layers/attention/triton_ops/extend_attention.py +143 -98
  41. sglang/srt/layers/attention/trtllm_mha_backend.py +332 -0
  42. sglang/srt/layers/attention/trtllm_mla_backend.py +119 -22
  43. sglang/srt/layers/attention/vision.py +22 -6
  44. sglang/srt/layers/attention/wave_backend.py +627 -0
  45. sglang/srt/layers/attention/wave_ops/decode_attention.py +186 -0
  46. sglang/srt/layers/attention/wave_ops/extend_attention.py +149 -0
  47. sglang/srt/layers/attention/wave_ops/prefill_attention.py +79 -0
  48. sglang/srt/layers/communicator.py +29 -14
  49. sglang/srt/layers/dp_attention.py +12 -0
  50. sglang/srt/layers/flashinfer_comm_fusion.py +4 -4
  51. sglang/srt/layers/linear.py +3 -7
  52. sglang/srt/layers/moe/cutlass_moe.py +12 -3
  53. sglang/srt/layers/moe/cutlass_w4a8_moe.py +4 -5
  54. sglang/srt/layers/moe/ep_moe/kernels.py +43 -0
  55. sglang/srt/layers/moe/ep_moe/layer.py +135 -73
  56. sglang/srt/layers/moe/fused_moe_triton/configs/triton_3_4_0/E=128,N=768,device_name=NVIDIA_B200,dtype=fp8_w8a8,block_shape=[128, 128].json +146 -0
  57. sglang/srt/layers/moe/fused_moe_triton/configs/triton_3_4_0/E=384,N=256,device_name=NVIDIA_B200,dtype=fp8_w8a8,block_shape=[128, 128].json +146 -0
  58. sglang/srt/layers/moe/fused_moe_triton/fused_moe.py +101 -12
  59. sglang/srt/layers/moe/fused_moe_triton/layer.py +412 -33
  60. sglang/srt/layers/moe/fused_moe_triton/triton_kernels_moe.py +188 -3
  61. sglang/srt/layers/moe/token_dispatcher/deepep.py +61 -24
  62. sglang/srt/layers/moe/topk.py +16 -4
  63. sglang/srt/layers/moe/utils.py +16 -0
  64. sglang/srt/layers/quantization/__init__.py +27 -3
  65. sglang/srt/layers/quantization/fp4.py +557 -0
  66. sglang/srt/layers/quantization/fp8.py +3 -6
  67. sglang/srt/layers/quantization/fp8_kernel.py +277 -0
  68. sglang/srt/layers/quantization/fp8_utils.py +51 -10
  69. sglang/srt/layers/quantization/modelopt_quant.py +258 -68
  70. sglang/srt/layers/quantization/mxfp4.py +654 -0
  71. sglang/srt/layers/quantization/mxfp4_tensor.py +133 -0
  72. sglang/srt/layers/quantization/quark/schemes/__init__.py +6 -0
  73. sglang/srt/layers/quantization/quark/schemes/quark_scheme.py +55 -0
  74. sglang/srt/layers/quantization/quark/schemes/quark_w4a4_mxfp4.py +118 -0
  75. sglang/srt/layers/quantization/quark/utils.py +107 -0
  76. sglang/srt/layers/quantization/unquant.py +60 -6
  77. sglang/srt/layers/quantization/w4afp8.py +21 -12
  78. sglang/srt/layers/quantization/w8a8_int8.py +48 -34
  79. sglang/srt/layers/rotary_embedding.py +506 -3
  80. sglang/srt/layers/utils.py +9 -0
  81. sglang/srt/layers/vocab_parallel_embedding.py +8 -3
  82. sglang/srt/lora/backend/base_backend.py +3 -23
  83. sglang/srt/lora/layers.py +60 -114
  84. sglang/srt/lora/lora.py +17 -62
  85. sglang/srt/lora/lora_manager.py +82 -62
  86. sglang/srt/lora/lora_registry.py +23 -11
  87. sglang/srt/lora/mem_pool.py +63 -68
  88. sglang/srt/lora/triton_ops/qkv_lora_b.py +1 -1
  89. sglang/srt/lora/utils.py +25 -58
  90. sglang/srt/managers/cache_controller.py +75 -58
  91. sglang/srt/managers/detokenizer_manager.py +1 -1
  92. sglang/srt/managers/io_struct.py +20 -8
  93. sglang/srt/managers/mm_utils.py +6 -13
  94. sglang/srt/managers/multimodal_processor.py +1 -1
  95. sglang/srt/managers/schedule_batch.py +61 -25
  96. sglang/srt/managers/schedule_policy.py +6 -6
  97. sglang/srt/managers/scheduler.py +41 -19
  98. sglang/srt/managers/scheduler_output_processor_mixin.py +1 -2
  99. sglang/srt/managers/scheduler_profiler_mixin.py +28 -8
  100. sglang/srt/managers/scheduler_recv_skipper.py +37 -0
  101. sglang/srt/managers/scheduler_update_weights_mixin.py +6 -0
  102. sglang/srt/managers/template_manager.py +35 -1
  103. sglang/srt/managers/tokenizer_manager.py +47 -30
  104. sglang/srt/managers/tp_worker.py +3 -0
  105. sglang/srt/managers/tp_worker_overlap_thread.py +3 -0
  106. sglang/srt/mem_cache/allocator.py +61 -87
  107. sglang/srt/mem_cache/hicache_storage.py +1 -1
  108. sglang/srt/mem_cache/hiradix_cache.py +80 -22
  109. sglang/srt/mem_cache/lora_radix_cache.py +421 -0
  110. sglang/srt/mem_cache/memory_pool_host.py +34 -36
  111. sglang/srt/mem_cache/multimodal_cache.py +33 -13
  112. sglang/srt/mem_cache/radix_cache.py +2 -5
  113. sglang/srt/mem_cache/storage/hf3fs/client_hf3fs.py +2 -2
  114. sglang/srt/mem_cache/storage/hf3fs/mini_3fs_metadata_server.py +443 -0
  115. sglang/srt/mem_cache/storage/hf3fs/storage_hf3fs.py +139 -67
  116. sglang/srt/mem_cache/storage/mooncake_store/mooncake_store.py +6 -9
  117. sglang/srt/model_executor/cuda_graph_runner.py +29 -9
  118. sglang/srt/model_executor/forward_batch_info.py +61 -19
  119. sglang/srt/model_executor/model_runner.py +148 -37
  120. sglang/srt/model_loader/loader.py +18 -6
  121. sglang/srt/model_loader/weight_utils.py +10 -0
  122. sglang/srt/models/bailing_moe.py +425 -0
  123. sglang/srt/models/deepseek_v2.py +137 -59
  124. sglang/srt/models/ernie4.py +426 -0
  125. sglang/srt/models/ernie4_eagle.py +203 -0
  126. sglang/srt/models/gemma2.py +0 -34
  127. sglang/srt/models/gemma3n_mm.py +38 -0
  128. sglang/srt/models/glm4.py +6 -0
  129. sglang/srt/models/glm4_moe.py +28 -16
  130. sglang/srt/models/glm4v.py +589 -0
  131. sglang/srt/models/glm4v_moe.py +400 -0
  132. sglang/srt/models/gpt_oss.py +1251 -0
  133. sglang/srt/models/granite.py +0 -25
  134. sglang/srt/models/llama.py +0 -25
  135. sglang/srt/models/llama4.py +1 -1
  136. sglang/srt/models/qwen2.py +6 -0
  137. sglang/srt/models/qwen2_5_vl.py +7 -3
  138. sglang/srt/models/qwen2_audio.py +10 -9
  139. sglang/srt/models/qwen2_moe.py +6 -0
  140. sglang/srt/models/qwen3.py +0 -24
  141. sglang/srt/models/qwen3_moe.py +32 -6
  142. sglang/srt/models/registry.py +1 -1
  143. sglang/srt/models/step3_vl.py +9 -0
  144. sglang/srt/models/torch_native_llama.py +0 -24
  145. sglang/srt/models/transformers.py +2 -5
  146. sglang/srt/multimodal/processors/base_processor.py +23 -13
  147. sglang/srt/multimodal/processors/glm4v.py +132 -0
  148. sglang/srt/multimodal/processors/qwen_audio.py +4 -2
  149. sglang/srt/multimodal/processors/step3_vl.py +3 -1
  150. sglang/srt/reasoning_parser.py +332 -37
  151. sglang/srt/server_args.py +186 -75
  152. sglang/srt/speculative/eagle_worker.py +16 -0
  153. sglang/srt/two_batch_overlap.py +169 -9
  154. sglang/srt/utils.py +41 -5
  155. sglang/srt/weight_sync/tensor_bucket.py +106 -0
  156. sglang/test/attention/test_trtllm_mla_backend.py +186 -36
  157. sglang/test/doc_patch.py +59 -0
  158. sglang/test/few_shot_gsm8k.py +1 -1
  159. sglang/test/few_shot_gsm8k_engine.py +1 -1
  160. sglang/test/run_eval.py +4 -1
  161. sglang/test/runners.py +2 -2
  162. sglang/test/simple_eval_common.py +6 -0
  163. sglang/test/simple_eval_gpqa.py +2 -0
  164. sglang/test/test_fp4_moe.py +118 -36
  165. sglang/test/test_utils.py +1 -1
  166. sglang/utils.py +1 -1
  167. sglang/version.py +1 -1
  168. {sglang-0.4.10.post2.dist-info → sglang-0.5.0rc1.dist-info}/METADATA +36 -38
  169. {sglang-0.4.10.post2.dist-info → sglang-0.5.0rc1.dist-info}/RECORD +174 -141
  170. sglang/srt/lora/backend/flashinfer_backend.py +0 -131
  171. /sglang/{api.py → lang/api.py} +0 -0
  172. /sglang/{lang/backend → srt/layers/quantization/quark}/__init__.py +0 -0
  173. {sglang-0.4.10.post2.dist-info → sglang-0.5.0rc1.dist-info}/WHEEL +0 -0
  174. {sglang-0.4.10.post2.dist-info → sglang-0.5.0rc1.dist-info}/licenses/LICENSE +0 -0
  175. {sglang-0.4.10.post2.dist-info → sglang-0.5.0rc1.dist-info}/top_level.txt +0 -0
sglang/srt/multimodal/processors/step3_vl.py

@@ -8,7 +8,7 @@ import torch
 from PIL import Image
 from torchvision import transforms
 from torchvision.transforms import InterpolationMode
-from transformers import BatchFeature, TensorType
+from transformers import BatchFeature, ProcessorMixin, TensorType
 
 from sglang.srt.models.step3_vl import Step3VLForConditionalGeneration
 from sglang.srt.multimodal.processors.base_processor import (
@@ -276,6 +276,8 @@ class Step3VLProcessor:
         super().__init__()
 
         self.config = config
+        if isinstance(tokenizer, ProcessorMixin):
+            tokenizer = tokenizer.tokenizer
         self.tokenizer = tokenizer
 
         self.image_size = 728
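
The constructor change above makes Step3VLProcessor tolerant of being handed a full HF processor instead of a bare tokenizer: a ProcessorMixin bundles a tokenizer with other modality processors, so the text side is unwrapped before being stored. A minimal sketch of the pattern (the helper name is ours, not from this diff):

    from transformers import ProcessorMixin

    def resolve_tokenizer(tokenizer_or_processor):
        # ProcessorMixin wraps a tokenizer alongside e.g. an image processor;
        # unwrap so downstream code always sees a plain tokenizer.
        if isinstance(tokenizer_or_processor, ProcessorMixin):
            return tokenizer_or_processor.tokenizer
        return tokenizer_or_processor
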
sglang/srt/reasoning_parser.py

@@ -1,3 +1,4 @@
+import re
 from typing import Dict, Optional, Tuple, Type
 
 
@@ -131,7 +132,7 @@ class DeepSeekR1Detector(BaseReasoningFormatDetector):
             If True, streams reasoning content as it arrives.
     """
 
-    def __init__(self, stream_reasoning: bool = True):
+    def __init__(self, stream_reasoning: bool = True, force_reasoning: bool = True):
         # DeepSeek-R1 is assumed to be reasoning until `</think>` token
         super().__init__(
             "<think>",
@@ -144,7 +145,7 @@
 
 class Qwen3Detector(BaseReasoningFormatDetector):
     """
-    Detector for standard Qwen3 models (e.g., Qwen/Qwen3-235B-A22B).
+    Detector for Qwen3 models (e.g., Qwen/Qwen3-235B-A22B).
     Assumes reasoning format:
         (<think>)*(.*)</think>
 
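
With force_reasoning now a constructor pass-through, a single Qwen3Detector covers both generation modes described in the docstring. A minimal sketch (inputs invented, relying on the base-class tag handling):

    detector = Qwen3Detector(stream_reasoning=False)
    # enable_thinking=True style output, tags present:
    r1 = detector.detect_and_parse("<think>check 6 * 7</think>The answer is 42.")
    # r1.reasoning_text -> "check 6 * 7", r1.normal_text -> "The answer is 42."
    # enable_thinking=False style output, no tags: everything is normal text.
    r2 = detector.detect_and_parse("The answer is 42.")
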
@@ -153,68 +154,351 @@
     - enable_thinking=True: "<think>reasoning content</think>The answer is 42."
     - enable_thinking=False: "The answer is 42." (no thinking tokens)
 
-    This detector handles both cases.
-
-    NOTE: Do NOT use this detector for Qwen3-Thinking models (e.g., Qwen3-Thinking-2507).
-    Those models always generate thinking content without <think> start tags.
-    Use "qwen3-thinking" parser type for those models instead.
-
     Args:
         stream_reasoning (bool): If False, accumulates reasoning content until the end tag.
             If True, streams reasoning content as it arrives.
     """
 
-    def __init__(self, stream_reasoning: bool = True):
+    def __init__(self, stream_reasoning: bool = True, force_reasoning: bool = False):
         super().__init__(
             "<think>",
             "</think>",
-            force_reasoning=False,
+            force_reasoning=force_reasoning,
             stream_reasoning=stream_reasoning,
         )
 
 
-class Qwen3ThinkingDetector(BaseReasoningFormatDetector):
+class KimiDetector(BaseReasoningFormatDetector):
     """
-    Detector for Qwen3-Thinking models (e.g., Qwen3-Thinking-2507).
+    Detector for Kimi Thinking model.
     Assumes reasoning format:
-        *(.*)</think>
+        ◁think▷*(.*)◁/think▷
+    Returns all the text before the ◁/think▷ tag as `reasoning_text`
+    and the rest of the text as `normal_text`.
+    """
+
+    def __init__(self, stream_reasoning: bool = True, force_reasoning: bool = False):
+        super().__init__(
+            "◁think▷",
+            "◁/think▷",
+            force_reasoning=False,
+            stream_reasoning=stream_reasoning,
+        )
 
-    These models always generate thinking content without <think> start tag.
-    They do not support the enable_thinking parameter and always think.
 
-    Format: "I need to think about this...</think>The answer is 42."
+class GptOssDetector(BaseReasoningFormatDetector):
+    """
+    Detector for T4-style reasoning format.
+
+    Assumes reasoning format with two channels:
+    <|channel|>analysis<|message|>...reasoning content...<|end|>
+    <|start|>assistant<|channel|>final<|message|>...final answer...<|return|>
+
+    Returns content from 'analysis' channel as reasoning_text
+    and content from 'final' channel as normal_text.
 
     Args:
-        stream_reasoning (bool): If False, accumulates reasoning content until the end tag.
+        stream_reasoning (bool): If False, accumulates reasoning content until complete.
             If True, streams reasoning content as it arrives.
     """
 
-    def __init__(self, stream_reasoning: bool = True):
+    def __init__(self, stream_reasoning: bool = True, force_reasoning: bool = True):
+        # TypeScript uses channel tokens instead of simple start/end tokens
         super().__init__(
-            "<think>",
-            "</think>",
+            "<|channel|>analysis<|message|>",
+            "<|end|>",
             force_reasoning=True,
             stream_reasoning=stream_reasoning,
         )
+        self.final_channel_start = "<|start|>assistant<|channel|>final<|message|>"
+        self.final_channel_end = "<|return|>"
+        self._in_final_channel = False
+        self._analysis_complete = False
+        self._in_reasoning = True
 
+    def detect_and_parse(self, text: str) -> StreamingParseResult:
+        """
+        One-time parsing: Detects and parses both analysis and final channels.
+        Tool call channels are preserved in normal_text for downstream processing.
 
-class KimiDetector(BaseReasoningFormatDetector):
-    """
-    Detector for Kimi Thinking model.
-    Assumes reasoning format:
-        ◁think▷*(.*)◁/think▷
-    Returns all the text before the ◁/think▷ tag as `reasoning_text`
-    and the rest of the text as `normal_text`.
-    """
+        HACK: Also handles simplified format where text starts with "analysis" and transitions
+        to "assistantfinal" without full channel markers.
+        """
+        # HACK: Handle simplified format (analysis...assistantfinal) without channel markers
+        if (
+            text.startswith("analysis")
+            and "assistantfinal" in text
+            and "<|channel|>" not in text
+        ):
+            # Split on "assistantfinal"
+            parts = text.split("assistantfinal", 1)
+            self._in_reasoning = False
+            if len(parts) == 2:
+                reasoning_text = parts[0][
+                    len("analysis") :
+                ].strip()  # Remove "analysis" prefix
+                normal_text = parts[1].strip()
+                return StreamingParseResult(
+                    normal_text=normal_text, reasoning_text=reasoning_text
+                )
+
+        reasoning_parts = []
+        normal_parts = []
+        current_pos = 0
+
+        # Process text sequentially to preserve tool calls between analysis sections
+        while current_pos < len(text):
+            # Look for next analysis channel
+            analysis_start_idx = text.find(self.think_start_token, current_pos)
+
+            if analysis_start_idx == -1:
+                # No more analysis channels, rest goes to remaining
+                break
+
+            # Preserve any content before this analysis channel (could include tool calls)
+            if analysis_start_idx > current_pos:
+                between_content = text[current_pos:analysis_start_idx]
+                # This content will be added to normal_parts later
+                normal_parts.append(between_content)
+
+            # Extract analysis content
+            analysis_content_start = analysis_start_idx + len(self.think_start_token)
+            analysis_end_idx = text.find(self.think_end_token, analysis_content_start)
+
+            if analysis_end_idx != -1:
+                reasoning_parts.append(
+                    text[analysis_content_start:analysis_end_idx].strip()
+                )
+                current_pos = analysis_end_idx + len(self.think_end_token)
+            else:
+                # Analysis not complete
+                reasoning_parts.append(text[analysis_content_start:].strip())
+                reasoning_text = "".join(reasoning_parts)
+                return StreamingParseResult(reasoning_text=reasoning_text)
+
+        # Add any remaining text after all analysis sections
+        if current_pos < len(text):
+            remaining = text[current_pos:]
+            normal_parts.append(remaining)
+
+        # Process non-analysis content for commentary sections
+        full_normal_text = "".join(normal_parts)
+
+        # Extract reasoning from non-tool-call commentary sections
+        # Tool calls have "to=" in their header, regular commentary does not
+        commentary_pattern = re.compile(
+            r"<\|start\|>assistant<\|channel\|>commentary<\|message\|>(.*?)(?:<\|end\|>|<\|call\|>)",
+            re.DOTALL,
+        )
 
-    def __init__(self, stream_reasoning: bool = True):
-        super().__init__(
-            "◁think▷",
-            "◁/think▷",
-            force_reasoning=False,
-            stream_reasoning=stream_reasoning,
+        cleaned_text = full_normal_text
+        for match in reversed(list(commentary_pattern.finditer(full_normal_text))):
+            # Check if this commentary is a tool call by looking at the text before <|message|>
+            match_start = match.start()
+            # Find where "<|channel|>commentary" starts within the matched pattern
+            # The pattern starts with "<|start|>assistant<|channel|>commentary"
+            # So we look for the text between "commentary" and "<|message|>" in the match
+            match_text = full_normal_text[match_start : match.end()]
+            commentary_idx = match_text.find("<|channel|>commentary")
+            if commentary_idx != -1:
+                message_idx = match_text.find("<|message|>", commentary_idx)
+                if message_idx != -1:
+                    between_text = match_text[commentary_idx:message_idx]
+                    # If no "to=" found, this is regular commentary (reasoning content)
+                    if " to=" not in between_text:
+                        content = match.group(1).strip()
+                        reasoning_parts.append(content)
+                        # Remove this commentary section from normal text
+                        cleaned_text = (
+                            cleaned_text[: match.start()] + cleaned_text[match.end() :]
+                        )
+
+        full_normal_text = cleaned_text
+
+        # Combine all reasoning parts
+        reasoning_text = "".join(reasoning_parts)
+
+        # Process full_normal_text for final output
+        normal_text = ""
+        if self.final_channel_start in full_normal_text:
+            final_start = full_normal_text.find(self.final_channel_start)
+            final_content_start = final_start + len(self.final_channel_start)
+            final_end = full_normal_text.find(
+                self.final_channel_end, final_content_start
+            )
+
+            if final_end != -1:
+                # Extract content before final channel (includes tool calls)
+                before_final = full_normal_text[:final_start].strip()
+                # Extract ONLY the final channel content (not the channel markers)
+                final_text = full_normal_text[final_content_start:final_end].strip()
+                # Extract content after final channel
+                after_final = full_normal_text[
+                    final_end + len(self.final_channel_end) :
+                ].strip()
+
+                # For tool calls + final answer: concatenate tool calls with final text
+                parts = []
+                if before_final:
+                    parts.append(before_final)
+                if final_text:
+                    parts.append(final_text)
+                if after_final:
+                    parts.append(after_final)
+                normal_text = " ".join(parts)
+            else:
+                # Final channel not complete - extract what we have
+                # Look for just <|channel|>final<|message|> without <|return|>
+                alt_final_start = full_normal_text.find("<|channel|>final<|message|>")
+                if alt_final_start != -1:
+                    before_alt_final = full_normal_text[:alt_final_start].strip()
+                    alt_final_content = full_normal_text[
+                        alt_final_start + len("<|channel|>final<|message|>") :
+                    ].strip()
+
+                    parts = []
+                    if before_alt_final:
+                        parts.append(before_alt_final)
+                    if alt_final_content:
+                        parts.append(alt_final_content)
+                    normal_text = " ".join(parts)
+                else:
+                    normal_text = full_normal_text.strip()
+        else:
+            # No final channel, treat all as normal text (includes tool calls)
+            normal_text = full_normal_text.strip()
+
+        return StreamingParseResult(
+            normal_text=normal_text, reasoning_text=reasoning_text
         )
 
+    def parse_streaming_increment(self, new_text: str) -> StreamingParseResult:
+        """
+        Streaming incremental parsing for GPT-OSS format.
+
+        This is a simplified streaming implementation that accumulates content
+        and delegates to the non-streaming parser for complex multi-channel parsing.
+        TODO: Implement proper incremental parsing for better streaming performance.
+        """
+        self._buffer += new_text
+
+        if not self._in_reasoning:
+            return StreamingParseResult(normal_text=new_text)
+
+        # Check if we have complete sections to process
+        # For GPT-OSS, we need to wait for complete channel sections
+        # HACK: For now, use simplified approach - wait for key markers before processing
+        key_markers = ["<|end|>", "<|call|>", "<|return|>", "assistantfinal"]
+        has_complete_section = any(marker in self._buffer for marker in key_markers)
+
+        if not has_complete_section:
+            # Still accumulating, don't process yet
+            return StreamingParseResult()
+
+        # Handle simplified format (analysis...assistantfinal) with true incremental streaming
+        if (
+            "<|channel|>" not in self._buffer
+        ):  # Simplified format without channel markers
+            if self._buffer.startswith("analysis"):
+                # Check if we have the transition to assistantfinal
+                if "assistantfinal" in self._buffer:
+                    self._in_reasoning = False
+                    # Complete reasoning section - extract and stream it
+                    parts = self._buffer.split("assistantfinal", 1)
+                    reasoning_text = parts[0][len("analysis") :].strip()
+                    final_content = parts[1].strip()
+
+                    # Clear buffer and return both reasoning and final content
+                    self._buffer = ""
+                    return StreamingParseResult(
+                        reasoning_text=reasoning_text if self.stream_reasoning else "",
+                        normal_text=final_content,
+                    )
+                elif self.stream_reasoning:
+                    # Stream reasoning content incrementally as it arrives
+                    current_reasoning = self._buffer[len("analysis") :].strip()
+                    self._buffer = ""
+                    return StreamingParseResult(reasoning_text=current_reasoning)
+                else:
+                    # Wait for assistantfinal
+                    return StreamingParseResult()
+            elif self._buffer.startswith("assistantfinal"):
+                # Direct final content without analysis
+                final_content = self._buffer[len("assistantfinal") :].strip()
+                self._buffer = ""
+                return StreamingParseResult(normal_text=final_content)
+
+        # For full channel format, process sections as they complete
+        result = StreamingParseResult()
+
+        # Process complete analysis sections
+        while (
+            self.think_start_token in self._buffer
+            and self.think_end_token in self._buffer
+        ):
+            start_idx = self._buffer.find(self.think_start_token)
+            start_pos = start_idx + len(self.think_start_token)
+            end_pos = self._buffer.find(self.think_end_token, start_pos)
+
+            if end_pos != -1:
+                reasoning_content = self._buffer[start_pos:end_pos].strip()
+                if self.stream_reasoning and reasoning_content:
+                    result.reasoning_text += reasoning_content
+
+                # Remove processed analysis section
+                self._buffer = (
+                    self._buffer[:start_idx]
+                    + self._buffer[end_pos + len(self.think_end_token) :]
+                )
+            else:
+                break
+
+        # Process complete commentary sections
+        commentary_pattern = re.compile(
+            r"<\|start\|>assistant<\|channel\|>commentary<\|message\|>(.*?)(?:<\|end\|>|<\|call\|>)",
+            re.DOTALL,
+        )
+
+        for match in reversed(list(commentary_pattern.finditer(self._buffer))):
+            # Check if this is a tool call
+            start_pos = match.start()
+            commentary_content = match.group(1).strip()
+            if self.stream_reasoning and commentary_content:
+                result.reasoning_text += commentary_content
+
+            # Remove this commentary section
+            self._buffer = self._buffer[: match.start()] + self._buffer[match.end() :]
+        # Clean up any standalone <|start|>assistant
+        self._buffer = re.sub(
+            r"<\|start\|>assistant(?=<\|start\|>assistant)", "", self._buffer
+        )
+
+        # Handle final channel completion
+        if self.final_channel_start in self._buffer:
+            final_start = self._buffer.find(self.final_channel_start)
+            final_content_start = final_start + len(self.final_channel_start)
+
+            # Check if final channel is complete
+            final_end = self._buffer.find(self.final_channel_end, final_content_start)
+            if final_end != -1:
+                # Complete final channel - process everything
+                final_result = self.detect_and_parse(self._buffer)
+                self._buffer = ""
+                return StreamingParseResult(
+                    normal_text=final_result.normal_text,
+                    reasoning_text=result.reasoning_text + final_result.reasoning_text,
+                )
+            else:
+                # Extract content before final channel (e.g. tool calls)
+                before_final = self._buffer[:final_start]
+                if before_final:
+                    # Output tool calls for processing
+                    result.normal_text += before_final
+                # Keep the final channel part in buffer
+                self._buffer = self._buffer[final_start:]
+
+        return result
+
 
 class ReasoningParser:
     """
@@ -230,13 +514,19 @@
     DetectorMap: Dict[str, Type[BaseReasoningFormatDetector]] = {
         "deepseek-r1": DeepSeekR1Detector,
         "qwen3": Qwen3Detector,
-        "qwen3-thinking": Qwen3ThinkingDetector,
+        "qwen3-thinking": Qwen3Detector,
         "glm45": Qwen3Detector,
         "kimi": KimiDetector,
         "step3": DeepSeekR1Detector,
+        "gpt-oss": GptOssDetector,
     }
 
-    def __init__(self, model_type: Optional[str] = None, stream_reasoning: bool = True):
+    def __init__(
+        self,
+        model_type: Optional[str] = None,
+        stream_reasoning: bool = True,
+        force_reasoning: bool = False,
+    ):
         if not model_type:
             raise ValueError("Model type must be specified")
 
@@ -244,7 +534,12 @@
         if not detector_class:
             raise ValueError(f"Unsupported model type: {model_type}")
 
-        self.detector = detector_class(stream_reasoning=stream_reasoning)
+        if model_type.lower() == "qwen3-thinking":
+            force_reasoning = True
+
+        self.detector = detector_class(
+            stream_reasoning=stream_reasoning, force_reasoning=force_reasoning
+        )
 
     def parse_non_stream(self, full_text: str) -> Tuple[str, str]:
         """Non-streaming call: one-time parsing"""