sglang 0.5.0rc0__py3-none-any.whl → 0.5.0rc2__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (170)
  1. sglang/__init__.py +8 -3
  2. sglang/bench_one_batch.py +6 -1
  3. sglang/lang/chat_template.py +18 -0
  4. sglang/srt/bench_utils.py +137 -0
  5. sglang/srt/configs/model_config.py +8 -7
  6. sglang/srt/disaggregation/decode.py +8 -4
  7. sglang/srt/disaggregation/mooncake/conn.py +43 -25
  8. sglang/srt/disaggregation/mooncake/transfer_engine.py +29 -0
  9. sglang/srt/distributed/parallel_state.py +4 -2
  10. sglang/srt/entrypoints/context.py +3 -20
  11. sglang/srt/entrypoints/engine.py +13 -8
  12. sglang/srt/entrypoints/harmony_utils.py +2 -0
  13. sglang/srt/entrypoints/http_server.py +68 -5
  14. sglang/srt/entrypoints/openai/protocol.py +2 -9
  15. sglang/srt/entrypoints/openai/serving_chat.py +60 -265
  16. sglang/srt/entrypoints/openai/serving_completions.py +1 -0
  17. sglang/srt/entrypoints/openai/tool_server.py +4 -3
  18. sglang/srt/function_call/ebnf_composer.py +1 -0
  19. sglang/srt/function_call/function_call_parser.py +2 -0
  20. sglang/srt/function_call/glm4_moe_detector.py +1 -1
  21. sglang/srt/function_call/gpt_oss_detector.py +331 -0
  22. sglang/srt/function_call/kimik2_detector.py +3 -3
  23. sglang/srt/function_call/qwen3_coder_detector.py +219 -9
  24. sglang/srt/jinja_template_utils.py +6 -0
  25. sglang/srt/layers/attention/aiter_backend.py +370 -107
  26. sglang/srt/layers/attention/ascend_backend.py +3 -0
  27. sglang/srt/layers/attention/dual_chunk_flashattention_backend.py +1 -1
  28. sglang/srt/layers/attention/flashattention_backend.py +18 -0
  29. sglang/srt/layers/attention/flashinfer_backend.py +55 -13
  30. sglang/srt/layers/attention/flashinfer_mla_backend.py +1 -0
  31. sglang/srt/layers/attention/hybrid_attn_backend.py +1 -1
  32. sglang/srt/layers/attention/triton_backend.py +24 -27
  33. sglang/srt/layers/attention/trtllm_mha_backend.py +8 -6
  34. sglang/srt/layers/attention/trtllm_mla_backend.py +129 -25
  35. sglang/srt/layers/attention/vision.py +9 -1
  36. sglang/srt/layers/attention/wave_backend.py +627 -0
  37. sglang/srt/layers/attention/wave_ops/decode_attention.py +186 -0
  38. sglang/srt/layers/attention/wave_ops/extend_attention.py +149 -0
  39. sglang/srt/layers/attention/wave_ops/prefill_attention.py +79 -0
  40. sglang/srt/layers/communicator.py +11 -13
  41. sglang/srt/layers/dp_attention.py +118 -27
  42. sglang/srt/layers/flashinfer_comm_fusion.py +4 -4
  43. sglang/srt/layers/linear.py +1 -0
  44. sglang/srt/layers/logits_processor.py +12 -18
  45. sglang/srt/layers/moe/cutlass_moe.py +11 -16
  46. sglang/srt/layers/moe/cutlass_w4a8_moe.py +4 -5
  47. sglang/srt/layers/moe/ep_moe/kernels.py +43 -0
  48. sglang/srt/layers/moe/ep_moe/layer.py +60 -2
  49. sglang/srt/layers/moe/fused_moe_triton/configs/triton_3_2_0/E=129,N=352,device_name=NVIDIA_A800-SXM4-80GB,dtype=int8_w8a8.json +146 -0
  50. sglang/srt/layers/moe/fused_moe_triton/configs/triton_3_2_0/E=161,N=192,device_name=NVIDIA_A800-SXM4-80GB,dtype=int8_w8a8.json +146 -0
  51. sglang/srt/layers/moe/fused_moe_triton/configs/triton_3_3_0/E=16,N=1024,device_name=NVIDIA_B200.json +146 -0
  52. sglang/srt/layers/moe/fused_moe_triton/configs/triton_3_3_1/E=128,N=384,device_name=NVIDIA_H20,dtype=fp8_w8a8,block_shape=[128, 128].json +146 -0
  53. sglang/srt/layers/moe/fused_moe_triton/configs/triton_3_3_1/E=128,N=768,device_name=NVIDIA_H20.json +146 -0
  54. sglang/srt/layers/moe/fused_moe_triton/configs/triton_3_3_1/E=160,N=640,device_name=NVIDIA_H200,dtype=fp8_w8a8,block_shape=[128, 128].json +146 -0
  55. sglang/srt/layers/moe/fused_moe_triton/configs/triton_3_3_1/E=257,N=128,device_name=NVIDIA_H20,dtype=fp8_w8a8,block_shape=[128, 128].json +146 -0
  56. sglang/srt/layers/moe/fused_moe_triton/configs/triton_3_3_1/E=257,N=128,device_name=NVIDIA_H200,dtype=fp8_w8a8,block_shape=[128, 128].json +146 -0
  57. sglang/srt/layers/moe/fused_moe_triton/configs/triton_3_3_1/E=257,N=256,device_name=NVIDIA_H20,dtype=fp8_w8a8,block_shape=[128, 128].json +146 -0
  58. sglang/srt/layers/moe/fused_moe_triton/configs/triton_3_3_1/E=257,N=256,device_name=NVIDIA_H20-3e,dtype=fp8_w8a8,block_shape=[128, 128].json +146 -0
  59. sglang/srt/layers/moe/fused_moe_triton/configs/triton_3_3_1/E=257,N=256,device_name=NVIDIA_H200,dtype=fp8_w8a8,block_shape=[128, 128].json +146 -0
  60. sglang/srt/layers/moe/fused_moe_triton/configs/triton_3_4_0/E=128,N=768,device_name=NVIDIA_B200,dtype=fp8_w8a8,block_shape=[128, 128].json +146 -0
  61. sglang/srt/layers/moe/fused_moe_triton/configs/triton_3_4_0/E=384,N=256,device_name=NVIDIA_B200,dtype=fp8_w8a8,block_shape=[128, 128].json +146 -0
  62. sglang/srt/layers/moe/fused_moe_triton/layer.py +7 -9
  63. sglang/srt/layers/moe/token_dispatcher/deepep.py +61 -24
  64. sglang/srt/layers/moe/topk.py +4 -1
  65. sglang/srt/layers/multimodal.py +156 -40
  66. sglang/srt/layers/quantization/__init__.py +10 -35
  67. sglang/srt/layers/quantization/awq.py +15 -16
  68. sglang/srt/layers/quantization/compressed_tensors/compressed_tensors_moe.py +0 -1
  69. sglang/srt/layers/quantization/fp8_kernel.py +277 -0
  70. sglang/srt/layers/quantization/fp8_utils.py +22 -10
  71. sglang/srt/layers/quantization/gptq.py +12 -17
  72. sglang/srt/layers/quantization/marlin_utils.py +15 -5
  73. sglang/srt/layers/quantization/modelopt_quant.py +58 -41
  74. sglang/srt/layers/quantization/mxfp4.py +20 -3
  75. sglang/srt/layers/quantization/utils.py +52 -2
  76. sglang/srt/layers/quantization/w4afp8.py +20 -11
  77. sglang/srt/layers/quantization/w8a8_int8.py +48 -34
  78. sglang/srt/layers/rotary_embedding.py +281 -2
  79. sglang/srt/layers/sampler.py +5 -2
  80. sglang/srt/lora/backend/base_backend.py +3 -23
  81. sglang/srt/lora/layers.py +66 -116
  82. sglang/srt/lora/lora.py +17 -62
  83. sglang/srt/lora/lora_manager.py +12 -48
  84. sglang/srt/lora/lora_registry.py +20 -9
  85. sglang/srt/lora/mem_pool.py +20 -63
  86. sglang/srt/lora/triton_ops/qkv_lora_b.py +1 -1
  87. sglang/srt/lora/utils.py +25 -58
  88. sglang/srt/managers/cache_controller.py +24 -29
  89. sglang/srt/managers/detokenizer_manager.py +1 -1
  90. sglang/srt/managers/io_struct.py +20 -6
  91. sglang/srt/managers/mm_utils.py +1 -2
  92. sglang/srt/managers/multimodal_processor.py +1 -1
  93. sglang/srt/managers/schedule_batch.py +43 -49
  94. sglang/srt/managers/schedule_policy.py +6 -6
  95. sglang/srt/managers/scheduler.py +18 -11
  96. sglang/srt/managers/scheduler_profiler_mixin.py +28 -8
  97. sglang/srt/managers/tokenizer_manager.py +53 -44
  98. sglang/srt/mem_cache/allocator.py +39 -214
  99. sglang/srt/mem_cache/allocator_ascend.py +158 -0
  100. sglang/srt/mem_cache/chunk_cache.py +1 -1
  101. sglang/srt/mem_cache/hicache_storage.py +1 -1
  102. sglang/srt/mem_cache/hiradix_cache.py +34 -24
  103. sglang/srt/mem_cache/lora_radix_cache.py +421 -0
  104. sglang/srt/mem_cache/memory_pool_host.py +33 -35
  105. sglang/srt/mem_cache/radix_cache.py +2 -5
  106. sglang/srt/mem_cache/storage/hf3fs/mini_3fs_metadata_server.py +443 -0
  107. sglang/srt/mem_cache/storage/hf3fs/storage_hf3fs.py +139 -67
  108. sglang/srt/mem_cache/storage/mooncake_store/mooncake_store.py +6 -9
  109. sglang/srt/model_executor/cuda_graph_runner.py +29 -23
  110. sglang/srt/model_executor/forward_batch_info.py +33 -14
  111. sglang/srt/model_executor/model_runner.py +179 -81
  112. sglang/srt/model_loader/loader.py +18 -6
  113. sglang/srt/models/deepseek_nextn.py +2 -1
  114. sglang/srt/models/deepseek_v2.py +79 -38
  115. sglang/srt/models/gemma2.py +0 -34
  116. sglang/srt/models/gemma3n_mm.py +8 -9
  117. sglang/srt/models/glm4.py +6 -0
  118. sglang/srt/models/glm4_moe.py +11 -11
  119. sglang/srt/models/glm4_moe_nextn.py +2 -1
  120. sglang/srt/models/glm4v.py +589 -0
  121. sglang/srt/models/glm4v_moe.py +400 -0
  122. sglang/srt/models/gpt_oss.py +142 -20
  123. sglang/srt/models/granite.py +0 -25
  124. sglang/srt/models/llama.py +10 -27
  125. sglang/srt/models/llama4.py +19 -6
  126. sglang/srt/models/qwen2.py +2 -2
  127. sglang/srt/models/qwen2_5_vl.py +7 -3
  128. sglang/srt/models/qwen2_audio.py +10 -9
  129. sglang/srt/models/qwen2_moe.py +20 -5
  130. sglang/srt/models/qwen3.py +0 -24
  131. sglang/srt/models/qwen3_classification.py +78 -0
  132. sglang/srt/models/qwen3_moe.py +18 -5
  133. sglang/srt/models/registry.py +1 -1
  134. sglang/srt/models/step3_vl.py +6 -2
  135. sglang/srt/models/torch_native_llama.py +0 -24
  136. sglang/srt/multimodal/processors/base_processor.py +23 -13
  137. sglang/srt/multimodal/processors/glm4v.py +132 -0
  138. sglang/srt/multimodal/processors/qwen_audio.py +4 -2
  139. sglang/srt/operations.py +17 -2
  140. sglang/srt/reasoning_parser.py +316 -0
  141. sglang/srt/sampling/sampling_batch_info.py +7 -4
  142. sglang/srt/server_args.py +142 -140
  143. sglang/srt/speculative/eagle_draft_cuda_graph_runner.py +7 -21
  144. sglang/srt/speculative/eagle_draft_extend_cuda_graph_runner.py +7 -21
  145. sglang/srt/speculative/eagle_worker.py +16 -0
  146. sglang/srt/two_batch_overlap.py +16 -12
  147. sglang/srt/utils.py +3 -3
  148. sglang/srt/weight_sync/tensor_bucket.py +106 -0
  149. sglang/test/attention/test_trtllm_mla_backend.py +186 -36
  150. sglang/test/doc_patch.py +59 -0
  151. sglang/test/few_shot_gsm8k.py +1 -1
  152. sglang/test/few_shot_gsm8k_engine.py +1 -1
  153. sglang/test/run_eval.py +4 -1
  154. sglang/test/simple_eval_common.py +6 -0
  155. sglang/test/simple_eval_gpqa.py +2 -0
  156. sglang/test/test_fp4_moe.py +118 -36
  157. sglang/test/test_marlin_moe.py +1 -1
  158. sglang/test/test_marlin_utils.py +1 -1
  159. sglang/utils.py +1 -1
  160. sglang/version.py +1 -1
  161. {sglang-0.5.0rc0.dist-info → sglang-0.5.0rc2.dist-info}/METADATA +27 -31
  162. {sglang-0.5.0rc0.dist-info → sglang-0.5.0rc2.dist-info}/RECORD +166 -142
  163. sglang/lang/backend/__init__.py +0 -0
  164. sglang/srt/function_call/harmony_tool_parser.py +0 -130
  165. sglang/srt/layers/quantization/scalar_type.py +0 -352
  166. sglang/srt/lora/backend/flashinfer_backend.py +0 -131
  167. /sglang/{api.py → lang/api.py} +0 -0
  168. {sglang-0.5.0rc0.dist-info → sglang-0.5.0rc2.dist-info}/WHEEL +0 -0
  169. {sglang-0.5.0rc0.dist-info → sglang-0.5.0rc2.dist-info}/licenses/LICENSE +0 -0
  170. {sglang-0.5.0rc0.dist-info → sglang-0.5.0rc2.dist-info}/top_level.txt +0 -0
sglang/srt/function_call/gpt_oss_detector.py (new file)
@@ -0,0 +1,331 @@
+import json
+import logging
+import re
+from typing import List
+
+from sglang.srt.entrypoints.openai.protocol import Tool
+from sglang.srt.function_call.base_format_detector import BaseFormatDetector
+from sglang.srt.function_call.core_types import (
+    StreamingParseResult,
+    ToolCallItem,
+    _GetInfoFunc,
+)
+
+logger = logging.getLogger(__name__)
+
+
+class GptOssDetector(BaseFormatDetector):
+    """
+    Detector for T4-style function calls with channel format.
+
+    Supports two formats:
+    1. Direct function call: <|channel|>commentary to={namespace.function}<|constrain|>json<|message|>{args}<|call|>
+    2. Commentary with action plan: <|channel|>commentary<|message|>{content}<|end|>
+
+    For parallel function calls, each call is self-contained and starts with its own channel:
+    <|channel|>commentary to=functions.get_weather<|constrain|>json<|message|>{"location":"SF"}<|call|>
+    <|channel|>commentary to=functions.search<|constrain|>json<|message|>{"query":"SF attractions"}<|call|>
+
+    Examples:
+    Single: <|channel|>commentary to=functions.get_weather<|constrain|>json<|message|>{"location":"San Francisco"}<|call|>commentary
+    Multiple: <|channel|>commentary to=functions.get_weather<|constrain|>json<|message|>{"location":"Paris"}<|call|>commentary<|channel|>commentary to=functions.search<|constrain|>json<|message|>{"query":"Paris tourism"}<|call|>
+    With Action Plan: <|channel|>commentary<|message|>**Action plan**: 1. Do X 2. Do Y<|end|><|start|>assistant<|channel|>commentary to=functions.x<|constrain|>json<|message|>{"template": "basic_html", "path": "index.html"}<|call|>
+    """
+
+    def __init__(self):
+        super().__init__()
+        self.bot_token = "<|start|>assistant<|channel|>commentary"
+        self.eot_token = "<|call|>"
+        # TODO: no clear indication how parallel tool call response format is
+        self.tool_call_separator = ""
+
+        # Pattern for complete function calls with to= parameter
+        # Handles both <|call|> and <|call|>commentary endings
+        # Also handles optional <|start|>assistant prefix and whitespace after function name
+        self.function_call_pattern = re.compile(
+            r"(?:<\|start\|>assistant)?<\|channel\|>commentary to=([a-zA-Z_][a-zA-Z0-9_]*(?:\.[a-zA-Z_][a-zA-Z0-9_]*)*)\s*"
+            r"<\|constrain\|>json<\|message\|>(.*?)<\|call\|>(?:commentary)?",
+            re.DOTALL,
+        )
+
+        # Pattern for streaming function calls (incomplete)
+        # Also handles optional whitespace after function name
+        self.streaming_pattern = re.compile(
+            r"(?:<\|start\|>assistant)?<\|channel\|>commentary to=([a-zA-Z_][a-zA-Z0-9_]*(?:\.[a-zA-Z_][a-zA-Z0-9_]*)*)\s*"
+            r"<\|constrain\|>json<\|message\|>(.*)",
+            re.DOTALL,
+        )
+
+        # Pattern for commentary with action plan (no to= parameter)
+        self.commentary_pattern = re.compile(
+            r"<\|channel\|>commentary<\|message\|>(.*?)<\|end\|>",
+            re.DOTALL,
+        )
+
+        self._last_arguments = ""
+
+    def has_tool_call(self, text: str) -> bool:
+        """Check if text contains TypeScript-style function call markers."""
+        return self.bot_token in text
+
+    def detect_and_parse(self, text: str, tools: List[Tool]) -> StreamingParseResult:
+        """Parse TypeScript-style function calls from complete text."""
+        if not self.has_tool_call(text):
+            return StreamingParseResult(normal_text=text, calls=[])
+
+        tool_indices = self._get_tool_indices(tools)
+
+        calls = []
+        tool_index = 0
+
+        # Process the entire text to handle mixed commentary and tool calls
+        normal_text_parts = []
+
+        # Find all commentary sections (both with and without to=)
+        all_commentary_pattern = re.compile(
+            r"<\|channel\|>commentary(?:\s+to=[^<]*)?<\|message\|>(.*?)(?:<\|end\|>|<\|call\|>)",
+            re.DOTALL,
+        )
+
+        # Track processed positions to avoid double-processing
+        processed_ranges = []
+
+        # First, extract all tool calls
+        for match in self.function_call_pattern.finditer(text):
+            full_function_name = match.group(1)
+            args_content = match.group(2)
+            processed_ranges.append((match.start(), match.end()))
+
+            function_name = (
+                full_function_name.split(".")[-1]
+                if "." in full_function_name
+                else full_function_name
+            )
+
+            try:
+                arguments = json.loads(args_content) if args_content.strip() else {}
+            except json.JSONDecodeError:
+                continue
+
+            if function_name in tool_indices:
+                calls.append(
+                    ToolCallItem(
+                        tool_index=tool_index,
+                        name=function_name,
+                        parameters=json.dumps(arguments, ensure_ascii=False),
+                    )
+                )
+                tool_index += 1
+
+        # Then, find non-tool-call commentary sections for normal text
+        for match in all_commentary_pattern.finditer(text):
+            # Check if this match overlaps with any processed tool call
+            match_start, match_end = match.start(), match.end()
+            is_tool_call = any(
+                start <= match_start < end or start < match_end <= end
+                for start, end in processed_ranges
+            )
+
+            # If this commentary is not part of a tool call, include it in normal text
+            if not is_tool_call:
+                content = match.group(1).strip()
+                if content:
+                    normal_text_parts.append(content)
+
+        # Handle remaining text after all matches
+        if processed_ranges:
+            last_match_end = max(end for _, end in processed_ranges)
+            if last_match_end < len(text):
+                remaining_text = text[last_match_end:]
+
+                # Clean up <|start|>assistant prefixes and extract final content
+                # Remove standalone <|start|>assistant prefixes
+                remaining_text = re.sub(r"<\|start\|>assistant(?!\w)", "", remaining_text)
+
+                # Extract content from final channel if present
+                final_pattern = re.compile(
+                    r"<\|channel\|>final<\|message\|>(.*?)(?:<\|return\|>|$)", re.DOTALL
+                )
+                final_match = final_pattern.search(remaining_text)
+
+                if final_match:
+                    # Get everything before final channel + final channel content
+                    before_final = remaining_text[: final_match.start()].strip()
+                    final_content = final_match.group(1).strip()
+
+                    parts = []
+                    if before_final:
+                        parts.append(before_final)
+                    if final_content:
+                        parts.append(final_content)
+                    remaining_text = " ".join(parts) if parts else ""
+
+                remaining_text = remaining_text.strip()
+
+                if remaining_text:
+                    normal_text_parts.append(remaining_text)
+
+        # Combine all normal text parts
+        final_normal_text = " ".join(part for part in normal_text_parts if part).strip()
+        return StreamingParseResult(normal_text=final_normal_text, calls=calls)
+
+    def parse_streaming_increment(
+        self, new_text: str, tools: List[Tool]
+    ) -> StreamingParseResult:
+        """Parse incremental streaming text for TypeScript-style function calls."""
+        self._buffer += new_text
+        current_text = self._buffer
+
+        # Check if we have a tool call
+        has_tool_call = "<|channel|>commentary to=" in current_text
+
+        if not has_tool_call and current_text:
+            # Check for commentary without function calls
+            commentary_match = self.commentary_pattern.search(current_text)
+            if commentary_match:
+                commentary_content = commentary_match.group(1)
+                self._buffer = current_text[commentary_match.end() :]
+                return StreamingParseResult(normal_text=commentary_content, calls=[])
+
+            # Check for final channel content
+            final_pattern = re.compile(
+                r"<\|channel\|>final<\|message\|>(.*?)(?:<\|return\|>|$)",
+                re.DOTALL,
+            )
+            final_match = final_pattern.search(current_text)
+            if final_match:
+                final_content = final_match.group(1).strip()
+                self._buffer = ""
+                return StreamingParseResult(normal_text=final_content, calls=[])
+
+            self._buffer = ""
+            return StreamingParseResult(normal_text=new_text, calls=[])
+
+        if not hasattr(self, "_tool_indices"):
+            self._tool_indices = self._get_tool_indices(tools)
+
+        calls = []
+        try:
+            # Check for streaming function call
+            match = self.streaming_pattern.search(current_text)
+            if match:
+                full_function_name = match.group(1)
+                args_content = match.group(2)
+
+                function_name = (
+                    full_function_name.split(".")[-1]
+                    if "." in full_function_name
+                    else full_function_name
+                )
+
+                # Initialize state if this is the first tool call
+                if self.current_tool_id == -1:
+                    self.current_tool_id = 0
+                    self.prev_tool_call_arr = []
+                    self.streamed_args_for_tool = [""]
+
+                # Ensure we have enough entries in tracking arrays
+                while len(self.prev_tool_call_arr) <= self.current_tool_id:
+                    self.prev_tool_call_arr.append({})
+                while len(self.streamed_args_for_tool) <= self.current_tool_id:
+                    self.streamed_args_for_tool.append("")
+
+                if not self.current_tool_name_sent:
+                    calls.append(
+                        ToolCallItem(
+                            tool_index=self.current_tool_id,
+                            name=function_name,
+                            parameters="",
+                        )
+                    )
+                    self.current_tool_name_sent = True
+                    # Store the tool call info
+                    self.prev_tool_call_arr[self.current_tool_id] = {
+                        "name": function_name,
+                        "arguments": {},
+                    }
+                    self.streamed_args_for_tool[self.current_tool_id] = ""
+
+                # Check if we have a complete function call
+                complete_match = self.function_call_pattern.search(current_text)
+                if complete_match:
+                    args_content = complete_match.group(2)
+
+                    try:
+                        parsed_args = json.loads(args_content)
+                        self.prev_tool_call_arr[self.current_tool_id][
+                            "arguments"
+                        ] = parsed_args
+
+                        # Send complete arguments if we haven't sent them yet
+                        if not self.streamed_args_for_tool[self.current_tool_id]:
+                            # Send the complete arguments as JSON string
+                            calls.append(
+                                ToolCallItem(
+                                    tool_index=self.current_tool_id,
+                                    name=None,
+                                    parameters=json.dumps(
+                                        parsed_args, ensure_ascii=False
+                                    ),
+                                )
+                            )
+                            self.streamed_args_for_tool[self.current_tool_id] = (
+                                json.dumps(parsed_args, ensure_ascii=False)
+                            )
+                    except json.JSONDecodeError:
+                        pass
+
+                    # Remove the completed function call from buffer
+                    remaining_after_call = current_text[complete_match.end() :]
+
+                    # Clean up <|start|>assistant prefixes and extract final content
+                    remaining_after_call = re.sub(
+                        r"<\|start\|>assistant(?!\w)", "", remaining_after_call
+                    )
+
+                    # Extract content from final channel if present
+                    final_pattern = re.compile(
+                        r"<\|channel\|>final<\|message\|>(.*?)(?:<\|return\|>|$)",
+                        re.DOTALL,
+                    )
+                    final_match = final_pattern.search(remaining_after_call)
+
+                    if final_match:
+                        before_final = remaining_after_call[
+                            : final_match.start()
+                        ].strip()
+                        final_content = final_match.group(1).strip()
+
+                        parts = []
+                        if before_final:
+                            parts.append(before_final)
+                        if final_content:
+                            parts.append(final_content)
+                        remaining_after_call = " ".join(parts) if parts else ""
+
+                    self._buffer = remaining_after_call.strip()
+
+                    # Reset state for next tool call
+                    self.current_tool_name_sent = False
+                    self.current_tool_id += 1
+
+                    # Return final content if available
+                    final_text = ""
+                    if final_match and final_content:
+                        final_text = final_content
+                    elif remaining_after_call:
+                        final_text = remaining_after_call
+
+                    return StreamingParseResult(normal_text=final_text, calls=calls)
+
+            return StreamingParseResult(normal_text="", calls=calls)
+
+        except Exception as e:
+            logger.error(f"Error in parse_streaming_increment: {e}")
+            return StreamingParseResult(normal_text=current_text, calls=[])
+
+    def structure_info(self) -> _GetInfoFunc:
+        raise NotImplementedError()
+
+    def build_ebnf(self, tools: List[Tool]) -> str:
+        raise NotImplementedError()
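
For orientation, here is a minimal usage sketch of the new detector (not part of the diff): the sample text and tool definition are invented, and the `Tool`/`Function` constructor fields are assumed from `sglang.srt.entrypoints.openai.protocol`.

```python
# Hedged sketch: exercise GptOssDetector.detect_and_parse on a single
# channel-format tool call. The Tool/Function field names are assumptions.
from sglang.srt.entrypoints.openai.protocol import Function, Tool
from sglang.srt.function_call.gpt_oss_detector import GptOssDetector

tools = [
    Tool(
        type="function",
        function=Function(
            name="get_weather",
            description="Look up the current weather",
            parameters={
                "type": "object",
                "properties": {"location": {"type": "string"}},
            },
        ),
    )
]

text = (
    "<|start|>assistant<|channel|>commentary to=functions.get_weather"
    '<|constrain|>json<|message|>{"location": "San Francisco"}<|call|>'
)

detector = GptOssDetector()
result = detector.detect_and_parse(text, tools)
# Expect one ToolCallItem named "get_weather" carrying the JSON arguments,
# and no leftover normal text for this particular input.
print([(c.name, c.parameters) for c in result.calls], repr(result.normal_text))
```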
sglang/srt/function_call/kimik2_detector.py
@@ -24,7 +24,7 @@ class KimiK2Detector(BaseFormatDetector):
     Format Structure:
     ```
     <|tool_calls_section_begin|>
-    <|tool_call_begin|>functions.{func_name}:{index} <|tool_call_argument_begin|>{json_args}<|tool_call_end|>
+    <|tool_call_begin|>functions.{func_name}:{index}<|tool_call_argument_begin|>{json_args}<|tool_call_end|>
     <|tool_calls_section_end|>
     ```
 
@@ -219,7 +219,7 @@ class KimiK2Detector(BaseFormatDetector):
 
         def get_info(name: str) -> StructureInfo:
             return StructureInfo(
-                begin=f"<|tool_calls_section_begin|><|tool_call_begin|>functions.{name}:0 <|tool_call_argument_begin|>",
+                begin=f"<|tool_calls_section_begin|><|tool_call_begin|>functions.{name}:0<|tool_call_argument_begin|>",
                 end="<|tool_call_end|><|tool_calls_section_end|>",
                 trigger="<|tool_calls_section_begin|>",
            )
@@ -240,6 +240,6 @@ class KimiK2Detector(BaseFormatDetector):
             sequence_start_token=self.bot_token,
             sequence_end_token=self.eot_token,
             tool_call_separator="",
-            call_rule_fmt='"<|tool_call_begin|>functions.{name}:" [0-9]+ " <|tool_call_argument_begin|>" {arguments_rule} "<|tool_call_end|>"',
+            call_rule_fmt='"<|tool_call_begin|>functions.{name}:"[0-9]+"<|tool_call_argument_begin|>"{arguments_rule}"<|tool_call_end|>"',
             function_format="json",
         )
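
The whitespace fixes above change the expected Kimi K2 wire format: there is no longer a space before `<|tool_call_argument_begin|>`. A small illustrative snippet, with made-up function name and arguments, of what a complete tool-call section now looks like:

```python
# Illustrative only: assemble the Kimi K2 tool-call section in the corrected
# format. func_name/index/args are invented example values.
import json

func_name, index = "get_weather", 0
args = {"location": "Paris"}

section = (
    "<|tool_calls_section_begin|>"
    f"<|tool_call_begin|>functions.{func_name}:{index}"
    f"<|tool_call_argument_begin|>{json.dumps(args)}"
    "<|tool_call_end|>"
    "<|tool_calls_section_end|>"
)
print(section)
```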
sglang/srt/function_call/qwen3_coder_detector.py
@@ -57,6 +57,15 @@ class Qwen3CoderDetector(BaseFormatDetector):
         )
         self._buf: str = ""
 
+        # Streaming state variables
+        self._current_function_name: str = ""
+        self._current_parameters: Dict[str, Any] = {}
+        self._streamed_parameters: Dict[str, str] = (
+            {}
+        )  # Track what parameter content we've streamed
+        self._in_tool_call: bool = False
+        self._function_name_sent: bool = False
+
     def has_tool_call(self, text: str) -> bool:
         return self.tool_call_start_token in text
 
@@ -70,23 +79,224 @@
         self._buf += new_text
         normal = ""
         calls: List[ToolCallItem] = []
+
+        # Build tool indices for validation
+        if not hasattr(self, "_tool_indices"):
+            self._tool_indices = self._get_tool_indices(tools)
+
         while True:
-            if self.tool_call_start_token not in self._buf:
+            # If we're not in a tool call and don't see a start token, return normal text
+            if not self._in_tool_call and self.tool_call_start_token not in self._buf:
                 normal += self._buf
                 self._buf = ""
                 break
-            s = self._buf.find(self.tool_call_start_token)
-            if s > 0:
+
+            # Look for tool call start
+            if not self._in_tool_call:
+                s = self._buf.find(self.tool_call_start_token)
+                if s == -1:
+                    normal += self._buf
+                    self._buf = ""
+                    break
+
                 normal += self._buf[:s]
                 self._buf = self._buf[s:]
-            e = self._buf.find(self.tool_call_end_token)
-            if e == -1:
-                break
-            block = self._buf[: e + len(self.tool_call_end_token)]
-            self._buf = self._buf[e + len(self.tool_call_end_token) :]
-            calls.extend(self._parse_block(block, tools))
+
+                self._in_tool_call = True
+                self._function_name_sent = False
+                self._current_function_name = ""
+                self._current_parameters = {}
+                self._streamed_parameters = {}
+
+                # Remove the start token
+                self._buf = self._buf[len(self.tool_call_start_token) :]
+                continue
+
+            # We're in a tool call, try to parse function name if not sent yet
+            if not self._function_name_sent:
+                # Look for function name pattern: <function=name>
+                function_match = re.search(r"<function=([^>]+)>", self._buf)
+                if function_match:
+                    function_name = function_match.group(1).strip()
+
+                    # Validate function name
+                    if function_name in self._tool_indices:
+                        self._current_function_name = function_name
+                        self._function_name_sent = True
+
+                        # Initialize tool call tracking
+                        if self.current_tool_id == -1:
+                            self.current_tool_id = 0
+
+                        # Ensure tracking arrays are large enough
+                        while len(self.prev_tool_call_arr) <= self.current_tool_id:
+                            self.prev_tool_call_arr.append({})
+                        while len(self.streamed_args_for_tool) <= self.current_tool_id:
+                            self.streamed_args_for_tool.append("")
+
+                        # Store tool call info
+                        self.prev_tool_call_arr[self.current_tool_id] = {
+                            "name": function_name,
+                            "arguments": {},
+                        }
+
+                        # Send tool name with empty parameters
+                        calls.append(
+                            ToolCallItem(
+                                tool_index=self.current_tool_id,
+                                name=function_name,
+                                parameters="",
+                            )
+                        )
+
+                        # Remove the processed function declaration
+                        self._buf = self._buf[function_match.end() :]
+                        continue
+                    else:
+                        # Invalid function name, reset state
+                        logger.warning(f"Invalid function name: {function_name}")
+                        self._reset_streaming_state()
+                        normal += self._buf
+                        self._buf = ""
+                        break
+                else:
+                    # Function name not complete yet, wait for more text
+                    break
+
+            # Parse parameters incrementally
+            if self._function_name_sent:
+                # Process parameters and get any calls to emit
+                parameter_calls = self._parse_and_stream_parameters(self._buf)
+                calls.extend(parameter_calls)
+
+                # Check if tool call is complete
+                if self.tool_call_end_token in self._buf:
+                    end_pos = self._buf.find(self.tool_call_end_token)
+
+                    # Add closing brace to complete the JSON object
+                    current_streamed = self.streamed_args_for_tool[self.current_tool_id]
+                    if current_streamed:
+                        # Count opening and closing braces to check if JSON is complete
+                        open_braces = current_streamed.count("{")
+                        close_braces = current_streamed.count("}")
+                        if open_braces > close_braces:
+                            calls.append(
+                                ToolCallItem(
+                                    tool_index=self.current_tool_id,
+                                    name=None,
+                                    parameters="}",
+                                )
+                            )
+                            self.streamed_args_for_tool[self.current_tool_id] = (
+                                current_streamed + "}"
+                            )
+
+                    # Complete the tool call
+                    self._buf = self._buf[end_pos + len(self.tool_call_end_token) :]
+                    self._reset_streaming_state()
+                    self.current_tool_id += 1
+                    continue
+                else:
+                    # Tool call not complete yet, wait for more text
+                    break
+
         return StreamingParseResult(normal_text=normal, calls=calls)
 
+    def _parse_and_stream_parameters(self, text_to_parse: str) -> List[ToolCallItem]:
+        """
+        Parse complete parameter blocks from text and return any tool call items to emit.
+
+        This method:
+        1. Finds all complete <parameter> blocks
+        2. Parses them into a dictionary
+        3. Compares with current parameters and generates diff if needed
+        4. Updates internal state
+
+        Args:
+            text_to_parse: The text to search for parameter blocks
+
+        Returns:
+            List of ToolCallItem objects to emit (may be empty)
+        """
+        calls: List[ToolCallItem] = []
+
+        # Find all complete parameter patterns
+        param_matches = list(
+            re.finditer(
+                r"<parameter=([^>]+)>(.*?)</parameter>", text_to_parse, re.DOTALL
+            )
+        )
+
+        # Build new parameters dictionary
+        new_params = {}
+        for match in param_matches:
+            param_name = match.group(1).strip()
+            param_value = match.group(2)
+            new_params[param_name] = _safe_val(param_value)
+
+        # Calculate parameter diff to stream with proper incremental JSON building
+        if new_params != self._current_parameters:
+            previous_args_json = self.streamed_args_for_tool[self.current_tool_id]
+
+            # Build incremental JSON properly
+            if not self._current_parameters:
+                # First parameter(s) - start JSON object but don't close it yet
+                items = []
+                for key, value in new_params.items():
+                    items.append(
+                        f"{json.dumps(key, ensure_ascii=False)}: {json.dumps(value, ensure_ascii=False)}"
+                    )
+                json_fragment = "{" + ", ".join(items)
+
+                calls.append(
+                    ToolCallItem(
+                        tool_index=self.current_tool_id,
+                        name=None,
+                        parameters=json_fragment,
+                    )
+                )
+                self.streamed_args_for_tool[self.current_tool_id] = json_fragment
+
+            else:
+                # Additional parameters - add them incrementally
+                new_keys = set(new_params.keys()) - set(self._current_parameters.keys())
+                if new_keys:
+                    # Build the continuation part (no closing brace yet)
+                    continuation_parts = []
+                    for key in new_keys:
+                        value = new_params[key]
+                        continuation_parts.append(
+                            f"{json.dumps(key, ensure_ascii=False)}: {json.dumps(value, ensure_ascii=False)}"
+                        )
+
+                    json_fragment = ", " + ", ".join(continuation_parts)
+
+                    calls.append(
+                        ToolCallItem(
+                            tool_index=self.current_tool_id,
+                            name=None,
+                            parameters=json_fragment,
+                        )
+                    )
+                    self.streamed_args_for_tool[self.current_tool_id] = (
+                        previous_args_json + json_fragment
+                    )
+
+            # Update current state
+            self._current_parameters = new_params
+            self.prev_tool_call_arr[self.current_tool_id]["arguments"] = new_params
+
+        return calls
+
+    def _reset_streaming_state(self):
+        """Reset streaming state for the next tool call"""
+        self._in_tool_call = False
+        self._function_name_sent = False
+        self._current_function_name = ""
+        self._current_parameters = {}
+        self._streamed_parameters = {}
+        self.current_tool_name_sent = False
+
     def _extract(self, text: str, tools: List[Tool]) -> Tuple[str, List[ToolCallItem]]:
         normal_parts: List[str] = []
         calls: List[ToolCallItem] = []
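
The streaming rewrite above emits tool-call arguments as JSON fragments: an opening fragment for the first parameter, `, "key": value` continuations for later parameters, and a closing `}` once the tool-call end token arrives. A hedged illustration (parameter names and values invented) of how those fragments are meant to concatenate into one valid JSON object:

```python
# Illustration only: the concatenation of streamed argument fragments should
# parse as a single JSON object once the closing brace has been emitted.
import json

fragments = [
    '{"path": "index.html"',       # first parameter -> opens the object
    ', "template": "basic_html"',  # later parameter -> appended with ", "
    "}",                           # closing brace sent at the tool-call end token
]

streamed = "".join(fragments)
assert json.loads(streamed) == {"path": "index.html", "template": "basic_html"}
print(streamed)
```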
sglang/srt/jinja_template_utils.py
@@ -102,6 +102,12 @@ def detect_jinja_template_content_format(chat_template: str) -> str:
             if _is_var_or_elems_access(loop_iter, "message", "content"):
                 return "openai"  # Found content iteration → openai format
 
+            # Also check for patterns like: {%- for item in msg.content -%} or {%- for item in m.content -%}
+            if _is_var_or_elems_access(
+                loop_iter, "msg", "content"
+            ) or _is_var_or_elems_access(loop_iter, "m", "content"):
+                return "openai"  # Found content iteration → openai format (glm4v)
+
         return "string"  # No content loops found → string format
     except Exception as e:
         logger.debug(f"Error when parsing AST of Jinja template: {e}")
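
A hedged example of the template shape the new branch is meant to catch: the GLM-4V-style template below is invented, and the expected return value assumes the inner loop over `msg.content` is detected as in the added check above.

```python
# Hypothetical check: a chat template that iterates msg.content should now be
# classified as the "openai" content format.
from sglang.srt.jinja_template_utils import detect_jinja_template_content_format

glm4v_like_template = """
{%- for msg in messages -%}
  {%- for item in msg.content -%}
    {{ item.text if item.type == 'text' else '<image>' }}
  {%- endfor -%}
{%- endfor -%}
"""

print(detect_jinja_template_content_format(glm4v_like_template))  # expected: "openai"
```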