sglang 0.5.0rc0__py3-none-any.whl → 0.5.0rc2__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- sglang/__init__.py +8 -3
- sglang/bench_one_batch.py +6 -1
- sglang/lang/chat_template.py +18 -0
- sglang/srt/bench_utils.py +137 -0
- sglang/srt/configs/model_config.py +8 -7
- sglang/srt/disaggregation/decode.py +8 -4
- sglang/srt/disaggregation/mooncake/conn.py +43 -25
- sglang/srt/disaggregation/mooncake/transfer_engine.py +29 -0
- sglang/srt/distributed/parallel_state.py +4 -2
- sglang/srt/entrypoints/context.py +3 -20
- sglang/srt/entrypoints/engine.py +13 -8
- sglang/srt/entrypoints/harmony_utils.py +2 -0
- sglang/srt/entrypoints/http_server.py +68 -5
- sglang/srt/entrypoints/openai/protocol.py +2 -9
- sglang/srt/entrypoints/openai/serving_chat.py +60 -265
- sglang/srt/entrypoints/openai/serving_completions.py +1 -0
- sglang/srt/entrypoints/openai/tool_server.py +4 -3
- sglang/srt/function_call/ebnf_composer.py +1 -0
- sglang/srt/function_call/function_call_parser.py +2 -0
- sglang/srt/function_call/glm4_moe_detector.py +1 -1
- sglang/srt/function_call/gpt_oss_detector.py +331 -0
- sglang/srt/function_call/kimik2_detector.py +3 -3
- sglang/srt/function_call/qwen3_coder_detector.py +219 -9
- sglang/srt/jinja_template_utils.py +6 -0
- sglang/srt/layers/attention/aiter_backend.py +370 -107
- sglang/srt/layers/attention/ascend_backend.py +3 -0
- sglang/srt/layers/attention/dual_chunk_flashattention_backend.py +1 -1
- sglang/srt/layers/attention/flashattention_backend.py +18 -0
- sglang/srt/layers/attention/flashinfer_backend.py +55 -13
- sglang/srt/layers/attention/flashinfer_mla_backend.py +1 -0
- sglang/srt/layers/attention/hybrid_attn_backend.py +1 -1
- sglang/srt/layers/attention/triton_backend.py +24 -27
- sglang/srt/layers/attention/trtllm_mha_backend.py +8 -6
- sglang/srt/layers/attention/trtllm_mla_backend.py +129 -25
- sglang/srt/layers/attention/vision.py +9 -1
- sglang/srt/layers/attention/wave_backend.py +627 -0
- sglang/srt/layers/attention/wave_ops/decode_attention.py +186 -0
- sglang/srt/layers/attention/wave_ops/extend_attention.py +149 -0
- sglang/srt/layers/attention/wave_ops/prefill_attention.py +79 -0
- sglang/srt/layers/communicator.py +11 -13
- sglang/srt/layers/dp_attention.py +118 -27
- sglang/srt/layers/flashinfer_comm_fusion.py +4 -4
- sglang/srt/layers/linear.py +1 -0
- sglang/srt/layers/logits_processor.py +12 -18
- sglang/srt/layers/moe/cutlass_moe.py +11 -16
- sglang/srt/layers/moe/cutlass_w4a8_moe.py +4 -5
- sglang/srt/layers/moe/ep_moe/kernels.py +43 -0
- sglang/srt/layers/moe/ep_moe/layer.py +60 -2
- sglang/srt/layers/moe/fused_moe_triton/configs/triton_3_2_0/E=129,N=352,device_name=NVIDIA_A800-SXM4-80GB,dtype=int8_w8a8.json +146 -0
- sglang/srt/layers/moe/fused_moe_triton/configs/triton_3_2_0/E=161,N=192,device_name=NVIDIA_A800-SXM4-80GB,dtype=int8_w8a8.json +146 -0
- sglang/srt/layers/moe/fused_moe_triton/configs/triton_3_3_0/E=16,N=1024,device_name=NVIDIA_B200.json +146 -0
- sglang/srt/layers/moe/fused_moe_triton/configs/triton_3_3_1/E=128,N=384,device_name=NVIDIA_H20,dtype=fp8_w8a8,block_shape=[128, 128].json +146 -0
- sglang/srt/layers/moe/fused_moe_triton/configs/triton_3_3_1/E=128,N=768,device_name=NVIDIA_H20.json +146 -0
- sglang/srt/layers/moe/fused_moe_triton/configs/triton_3_3_1/E=160,N=640,device_name=NVIDIA_H200,dtype=fp8_w8a8,block_shape=[128, 128].json +146 -0
- sglang/srt/layers/moe/fused_moe_triton/configs/triton_3_3_1/E=257,N=128,device_name=NVIDIA_H20,dtype=fp8_w8a8,block_shape=[128, 128].json +146 -0
- sglang/srt/layers/moe/fused_moe_triton/configs/triton_3_3_1/E=257,N=128,device_name=NVIDIA_H200,dtype=fp8_w8a8,block_shape=[128, 128].json +146 -0
- sglang/srt/layers/moe/fused_moe_triton/configs/triton_3_3_1/E=257,N=256,device_name=NVIDIA_H20,dtype=fp8_w8a8,block_shape=[128, 128].json +146 -0
- sglang/srt/layers/moe/fused_moe_triton/configs/triton_3_3_1/E=257,N=256,device_name=NVIDIA_H20-3e,dtype=fp8_w8a8,block_shape=[128, 128].json +146 -0
- sglang/srt/layers/moe/fused_moe_triton/configs/triton_3_3_1/E=257,N=256,device_name=NVIDIA_H200,dtype=fp8_w8a8,block_shape=[128, 128].json +146 -0
- sglang/srt/layers/moe/fused_moe_triton/configs/triton_3_4_0/E=128,N=768,device_name=NVIDIA_B200,dtype=fp8_w8a8,block_shape=[128, 128].json +146 -0
- sglang/srt/layers/moe/fused_moe_triton/configs/triton_3_4_0/E=384,N=256,device_name=NVIDIA_B200,dtype=fp8_w8a8,block_shape=[128, 128].json +146 -0
- sglang/srt/layers/moe/fused_moe_triton/layer.py +7 -9
- sglang/srt/layers/moe/token_dispatcher/deepep.py +61 -24
- sglang/srt/layers/moe/topk.py +4 -1
- sglang/srt/layers/multimodal.py +156 -40
- sglang/srt/layers/quantization/__init__.py +10 -35
- sglang/srt/layers/quantization/awq.py +15 -16
- sglang/srt/layers/quantization/compressed_tensors/compressed_tensors_moe.py +0 -1
- sglang/srt/layers/quantization/fp8_kernel.py +277 -0
- sglang/srt/layers/quantization/fp8_utils.py +22 -10
- sglang/srt/layers/quantization/gptq.py +12 -17
- sglang/srt/layers/quantization/marlin_utils.py +15 -5
- sglang/srt/layers/quantization/modelopt_quant.py +58 -41
- sglang/srt/layers/quantization/mxfp4.py +20 -3
- sglang/srt/layers/quantization/utils.py +52 -2
- sglang/srt/layers/quantization/w4afp8.py +20 -11
- sglang/srt/layers/quantization/w8a8_int8.py +48 -34
- sglang/srt/layers/rotary_embedding.py +281 -2
- sglang/srt/layers/sampler.py +5 -2
- sglang/srt/lora/backend/base_backend.py +3 -23
- sglang/srt/lora/layers.py +66 -116
- sglang/srt/lora/lora.py +17 -62
- sglang/srt/lora/lora_manager.py +12 -48
- sglang/srt/lora/lora_registry.py +20 -9
- sglang/srt/lora/mem_pool.py +20 -63
- sglang/srt/lora/triton_ops/qkv_lora_b.py +1 -1
- sglang/srt/lora/utils.py +25 -58
- sglang/srt/managers/cache_controller.py +24 -29
- sglang/srt/managers/detokenizer_manager.py +1 -1
- sglang/srt/managers/io_struct.py +20 -6
- sglang/srt/managers/mm_utils.py +1 -2
- sglang/srt/managers/multimodal_processor.py +1 -1
- sglang/srt/managers/schedule_batch.py +43 -49
- sglang/srt/managers/schedule_policy.py +6 -6
- sglang/srt/managers/scheduler.py +18 -11
- sglang/srt/managers/scheduler_profiler_mixin.py +28 -8
- sglang/srt/managers/tokenizer_manager.py +53 -44
- sglang/srt/mem_cache/allocator.py +39 -214
- sglang/srt/mem_cache/allocator_ascend.py +158 -0
- sglang/srt/mem_cache/chunk_cache.py +1 -1
- sglang/srt/mem_cache/hicache_storage.py +1 -1
- sglang/srt/mem_cache/hiradix_cache.py +34 -24
- sglang/srt/mem_cache/lora_radix_cache.py +421 -0
- sglang/srt/mem_cache/memory_pool_host.py +33 -35
- sglang/srt/mem_cache/radix_cache.py +2 -5
- sglang/srt/mem_cache/storage/hf3fs/mini_3fs_metadata_server.py +443 -0
- sglang/srt/mem_cache/storage/hf3fs/storage_hf3fs.py +139 -67
- sglang/srt/mem_cache/storage/mooncake_store/mooncake_store.py +6 -9
- sglang/srt/model_executor/cuda_graph_runner.py +29 -23
- sglang/srt/model_executor/forward_batch_info.py +33 -14
- sglang/srt/model_executor/model_runner.py +179 -81
- sglang/srt/model_loader/loader.py +18 -6
- sglang/srt/models/deepseek_nextn.py +2 -1
- sglang/srt/models/deepseek_v2.py +79 -38
- sglang/srt/models/gemma2.py +0 -34
- sglang/srt/models/gemma3n_mm.py +8 -9
- sglang/srt/models/glm4.py +6 -0
- sglang/srt/models/glm4_moe.py +11 -11
- sglang/srt/models/glm4_moe_nextn.py +2 -1
- sglang/srt/models/glm4v.py +589 -0
- sglang/srt/models/glm4v_moe.py +400 -0
- sglang/srt/models/gpt_oss.py +142 -20
- sglang/srt/models/granite.py +0 -25
- sglang/srt/models/llama.py +10 -27
- sglang/srt/models/llama4.py +19 -6
- sglang/srt/models/qwen2.py +2 -2
- sglang/srt/models/qwen2_5_vl.py +7 -3
- sglang/srt/models/qwen2_audio.py +10 -9
- sglang/srt/models/qwen2_moe.py +20 -5
- sglang/srt/models/qwen3.py +0 -24
- sglang/srt/models/qwen3_classification.py +78 -0
- sglang/srt/models/qwen3_moe.py +18 -5
- sglang/srt/models/registry.py +1 -1
- sglang/srt/models/step3_vl.py +6 -2
- sglang/srt/models/torch_native_llama.py +0 -24
- sglang/srt/multimodal/processors/base_processor.py +23 -13
- sglang/srt/multimodal/processors/glm4v.py +132 -0
- sglang/srt/multimodal/processors/qwen_audio.py +4 -2
- sglang/srt/operations.py +17 -2
- sglang/srt/reasoning_parser.py +316 -0
- sglang/srt/sampling/sampling_batch_info.py +7 -4
- sglang/srt/server_args.py +142 -140
- sglang/srt/speculative/eagle_draft_cuda_graph_runner.py +7 -21
- sglang/srt/speculative/eagle_draft_extend_cuda_graph_runner.py +7 -21
- sglang/srt/speculative/eagle_worker.py +16 -0
- sglang/srt/two_batch_overlap.py +16 -12
- sglang/srt/utils.py +3 -3
- sglang/srt/weight_sync/tensor_bucket.py +106 -0
- sglang/test/attention/test_trtllm_mla_backend.py +186 -36
- sglang/test/doc_patch.py +59 -0
- sglang/test/few_shot_gsm8k.py +1 -1
- sglang/test/few_shot_gsm8k_engine.py +1 -1
- sglang/test/run_eval.py +4 -1
- sglang/test/simple_eval_common.py +6 -0
- sglang/test/simple_eval_gpqa.py +2 -0
- sglang/test/test_fp4_moe.py +118 -36
- sglang/test/test_marlin_moe.py +1 -1
- sglang/test/test_marlin_utils.py +1 -1
- sglang/utils.py +1 -1
- sglang/version.py +1 -1
- {sglang-0.5.0rc0.dist-info → sglang-0.5.0rc2.dist-info}/METADATA +27 -31
- {sglang-0.5.0rc0.dist-info → sglang-0.5.0rc2.dist-info}/RECORD +166 -142
- sglang/lang/backend/__init__.py +0 -0
- sglang/srt/function_call/harmony_tool_parser.py +0 -130
- sglang/srt/layers/quantization/scalar_type.py +0 -352
- sglang/srt/lora/backend/flashinfer_backend.py +0 -131
- /sglang/{api.py → lang/api.py} +0 -0
- {sglang-0.5.0rc0.dist-info → sglang-0.5.0rc2.dist-info}/WHEEL +0 -0
- {sglang-0.5.0rc0.dist-info → sglang-0.5.0rc2.dist-info}/licenses/LICENSE +0 -0
- {sglang-0.5.0rc0.dist-info → sglang-0.5.0rc2.dist-info}/top_level.txt +0 -0
@@ -0,0 +1,331 @@
|
|
1
|
+
import json
|
2
|
+
import logging
|
3
|
+
import re
|
4
|
+
from typing import List
|
5
|
+
|
6
|
+
from sglang.srt.entrypoints.openai.protocol import Tool
|
7
|
+
from sglang.srt.function_call.base_format_detector import BaseFormatDetector
|
8
|
+
from sglang.srt.function_call.core_types import (
|
9
|
+
StreamingParseResult,
|
10
|
+
ToolCallItem,
|
11
|
+
_GetInfoFunc,
|
12
|
+
)
|
13
|
+
|
14
|
+
logger = logging.getLogger(__name__)
|
15
|
+
|
16
|
+
|
17
|
+
class GptOssDetector(BaseFormatDetector):
    """
    Detector for T4-style function calls with channel format.

    Supports two formats:
    1. Direct function call: <|channel|>commentary to={namespace.function}<|constrain|>json<|message|>{args}<|call|>
    2. Commentary with action plan: <|channel|>commentary<|message|>{content}<|end|>

    For parallel function calls, each call is self-contained and starts with its own channel:
    <|channel|>commentary to=functions.get_weather<|constrain|>json<|message|>{"location":"SF"}<|call|>
    <|channel|>commentary to=functions.search<|constrain|>json<|message|>{"query":"SF attractions"}<|call|>

    Examples:
    Single: <|channel|>commentary to=functions.get_weather<|constrain|>json<|message|>{"location":"San Francisco"}<|call|>commentary
    Multiple: <|channel|>commentary to=functions.get_weather<|constrain|>json<|message|>{"location":"Paris"}<|call|>commentary<|channel|>commentary to=functions.search<|constrain|>json<|message|>{"query":"Paris tourism"}<|call|>
    With Action Plan: <|channel|>commentary<|message|>**Action plan**: 1. Do X 2. Do Y<|end|><|start|>assistant<|channel|>commentary to=functions.x<|constrain|>json<|message|>{"template": "basic_html", "path": "index.html"}<|call|>
    """

    # Marker that introduces a commentary message addressed to a function.
    _TOOL_CALL_MARKER = "<|channel|>commentary to="

    # Any commentary section, with or without a ``to=`` recipient.
    _ALL_COMMENTARY_PATTERN = re.compile(
        r"<\|channel\|>commentary(?:\s+to=[^<]*)?<\|message\|>(.*?)(?:<\|end\|>|<\|call\|>)",
        re.DOTALL,
    )

    # Content of the final channel, terminated by <|return|> or end of text.
    _FINAL_PATTERN = re.compile(
        r"<\|channel\|>final<\|message\|>(.*?)(?:<\|return\|>|$)", re.DOTALL
    )

    # Standalone <|start|>assistant prefixes.
    _START_PREFIX_PATTERN = re.compile(r"<\|start\|>assistant(?!\w)")

    def __init__(self):
        """Set up the channel tokens and the regexes used by both parse paths."""
        super().__init__()
        self.bot_token = "<|start|>assistant<|channel|>commentary"
        self.eot_token = "<|call|>"
        # TODO: no clear indication how parallel tool call response format is
        self.tool_call_separator = ""

        # Pattern for complete function calls with to= parameter
        # Handles both <|call|> and <|call|>commentary endings
        # Also handles optional <|start|>assistant prefix and whitespace after function name
        self.function_call_pattern = re.compile(
            r"(?:<\|start\|>assistant)?<\|channel\|>commentary to=([a-zA-Z_][a-zA-Z0-9_]*(?:\.[a-zA-Z_][a-zA-Z0-9_]*)*)\s*"
            r"<\|constrain\|>json<\|message\|>(.*?)<\|call\|>(?:commentary)?",
            re.DOTALL,
        )

        # Pattern for streaming function calls (incomplete)
        # Also handles optional whitespace after function name
        self.streaming_pattern = re.compile(
            r"(?:<\|start\|>assistant)?<\|channel\|>commentary to=([a-zA-Z_][a-zA-Z0-9_]*(?:\.[a-zA-Z_][a-zA-Z0-9_]*)*)\s*"
            r"<\|constrain\|>json<\|message\|>(.*)",
            re.DOTALL,
        )

        # Pattern for commentary with action plan (no to= parameter)
        self.commentary_pattern = re.compile(
            r"<\|channel\|>commentary<\|message\|>(.*?)<\|end\|>",
            re.DOTALL,
        )

        self._last_arguments = ""

    @staticmethod
    def _short_function_name(full_function_name: str) -> str:
        """Return the last dotted component, e.g. ``functions.search`` -> ``search``."""
        return full_function_name.rpartition(".")[2]

    @classmethod
    def _clean_trailing_text(cls, text: str):
        """Clean the text that follows a tool call.

        Removes standalone ``<|start|>assistant`` prefixes and, when a final
        channel is present, replaces the text with "<text before the final
        channel> <final channel content>" (each part stripped, empty parts
        omitted).

        Returns:
            A ``(cleaned_text, final_content)`` pair. ``final_content`` is the
            stripped final-channel content, or ``""`` when there is no final
            channel. ``cleaned_text`` is not stripped when no final channel
            was found; callers strip it where they need to.
        """
        text = cls._START_PREFIX_PATTERN.sub("", text)
        final_content = ""
        final_match = cls._FINAL_PATTERN.search(text)
        if final_match:
            before_final = text[: final_match.start()].strip()
            final_content = final_match.group(1).strip()
            text = " ".join(part for part in (before_final, final_content) if part)
        return text, final_content

    def has_tool_call(self, text: str) -> bool:
        """Check if text contains the assistant commentary-channel start marker.

        NOTE(review): this requires the full ``<|start|>assistant`` prefix,
        while the parsing regexes treat that prefix as optional - confirm this
        stricter check is intended.
        """
        return self.bot_token in text

    def detect_and_parse(self, text: str, tools: List[Tool]) -> StreamingParseResult:
        """Parse channel-format function calls from complete text.

        Args:
            text: The full model output.
            tools: Tools the model may call; calls to other names are dropped.

        Returns:
            A result whose ``calls`` holds one item per well-formed call to a
            known tool (indexed in order of appearance) and whose
            ``normal_text`` joins the non-tool commentary sections and the
            cleaned text that follows the last tool call.
        """
        if not self.has_tool_call(text):
            return StreamingParseResult(normal_text=text, calls=[])

        tool_indices = self._get_tool_indices(tools)

        calls = []
        # Spans of every tool-call match, kept even for calls that are later
        # rejected, so their markup never ends up in the normal text.
        processed_ranges = []

        # First, extract all tool calls
        for match in self.function_call_pattern.finditer(text):
            processed_ranges.append(match.span())
            function_name = self._short_function_name(match.group(1))
            args_content = match.group(2)

            try:
                arguments = json.loads(args_content) if args_content.strip() else {}
            except json.JSONDecodeError:
                continue

            if function_name in tool_indices:
                calls.append(
                    ToolCallItem(
                        # Index counts accepted calls only.
                        tool_index=len(calls),
                        name=function_name,
                        parameters=json.dumps(arguments, ensure_ascii=False),
                    )
                )

        # Then, find non-tool-call commentary sections for normal text
        normal_text_parts = []
        for match in self._ALL_COMMENTARY_PATTERN.finditer(text):
            match_start, match_end = match.span()
            overlaps_tool_call = any(
                start <= match_start < end or start < match_end <= end
                for start, end in processed_ranges
            )
            if overlaps_tool_call:
                continue
            content = match.group(1).strip()
            if content:
                normal_text_parts.append(content)

        # Handle remaining text after all matches
        if processed_ranges:
            last_match_end = max(end for _, end in processed_ranges)
            if last_match_end < len(text):
                remaining_text, _ = self._clean_trailing_text(text[last_match_end:])
                remaining_text = remaining_text.strip()
                if remaining_text:
                    normal_text_parts.append(remaining_text)

        # Combine all normal text parts
        final_normal_text = " ".join(part for part in normal_text_parts if part).strip()
        return StreamingParseResult(normal_text=final_normal_text, calls=calls)

    def parse_streaming_increment(
        self, new_text: str, tools: List[Tool]
    ) -> StreamingParseResult:
        """Parse incremental streaming text for channel-format function calls.

        The increment is appended to the internal buffer. While the buffer
        holds no ``<|channel|>commentary to=`` marker, text is passed through
        as normal text (commentary and final-channel content are unwrapped
        when a whole section is present). Once a marker is present, the tool
        name is emitted as soon as the call header is seen and the arguments
        are emitted as a single JSON string when ``<|call|>`` arrives.

        Args:
            new_text: The newly generated text.
            tools: Tools the model may call.

        Returns:
            The normal text and tool call items produced by this increment.
        """
        self._buffer += new_text
        current_text = self._buffer

        if current_text and self._TOOL_CALL_MARKER not in current_text:
            # Check for commentary without function calls
            commentary_match = self.commentary_pattern.search(current_text)
            if commentary_match:
                self._buffer = current_text[commentary_match.end() :]
                return StreamingParseResult(
                    normal_text=commentary_match.group(1), calls=[]
                )

            # Check for final channel content
            final_match = self._FINAL_PATTERN.search(current_text)
            if final_match:
                self._buffer = ""
                return StreamingParseResult(
                    normal_text=final_match.group(1).strip(), calls=[]
                )

            # NOTE(review): the buffer is cleared here, so a tool-call marker
            # split across two increments is never detected - confirm that
            # callers always deliver the marker within one increment.
            self._buffer = ""
            return StreamingParseResult(normal_text=new_text, calls=[])

        if not hasattr(self, "_tool_indices"):
            self._tool_indices = self._get_tool_indices(tools)

        calls = []
        try:
            # Check for streaming function call
            match = self.streaming_pattern.search(current_text)
            if not match:
                return StreamingParseResult(normal_text="", calls=calls)

            function_name = self._short_function_name(match.group(1))

            # Initialize state if this is the first tool call
            if self.current_tool_id == -1:
                self.current_tool_id = 0
                self.prev_tool_call_arr = []
                self.streamed_args_for_tool = [""]

            # Ensure we have enough entries in tracking arrays
            while len(self.prev_tool_call_arr) <= self.current_tool_id:
                self.prev_tool_call_arr.append({})
            while len(self.streamed_args_for_tool) <= self.current_tool_id:
                self.streamed_args_for_tool.append("")

            if not self.current_tool_name_sent:
                # NOTE(review): unlike detect_and_parse, the name is not
                # checked against the known tools before it is streamed.
                calls.append(
                    ToolCallItem(
                        tool_index=self.current_tool_id,
                        name=function_name,
                        parameters="",
                    )
                )
                self.current_tool_name_sent = True
                # Store the tool call info
                self.prev_tool_call_arr[self.current_tool_id] = {
                    "name": function_name,
                    "arguments": {},
                }
                self.streamed_args_for_tool[self.current_tool_id] = ""

            # Check if we have a complete function call
            complete_match = self.function_call_pattern.search(current_text)
            if not complete_match:
                return StreamingParseResult(normal_text="", calls=calls)

            args_content = complete_match.group(2)
            try:
                # Empty arguments mean "no arguments", as in detect_and_parse.
                parsed_args = json.loads(args_content) if args_content.strip() else {}
            except json.JSONDecodeError:
                # Best effort: the arguments are dropped, but the state below
                # still advances so the next call can be parsed.
                logger.warning(
                    "Failed to decode tool call arguments for %s", function_name
                )
            else:
                self.prev_tool_call_arr[self.current_tool_id][
                    "arguments"
                ] = parsed_args

                # Send complete arguments if we haven't sent them yet
                if not self.streamed_args_for_tool[self.current_tool_id]:
                    args_json = json.dumps(parsed_args, ensure_ascii=False)
                    calls.append(
                        ToolCallItem(
                            tool_index=self.current_tool_id,
                            name=None,
                            parameters=args_json,
                        )
                    )
                    self.streamed_args_for_tool[self.current_tool_id] = args_json

            # Remove the completed function call from buffer
            remaining_after_call = current_text[complete_match.end() :]

            # If another tool call starts in the same increment, keep it
            # buffered for the next invocation instead of returning its raw
            # markup as normal text (it would otherwise be emitted to the
            # client and then parsed a second time from the buffer).
            pending_call = ""
            marker_pos = remaining_after_call.find(self._TOOL_CALL_MARKER)
            if marker_pos != -1:
                pending_call = remaining_after_call[marker_pos:]
                remaining_after_call = remaining_after_call[:marker_pos]

            # Clean up <|start|>assistant prefixes and extract final content
            remaining_after_call, final_content = self._clean_trailing_text(
                remaining_after_call
            )

            self._buffer = pending_call if pending_call else remaining_after_call.strip()

            # Reset state for next tool call
            self.current_tool_name_sent = False
            self.current_tool_id += 1

            # Return final content if available, else whatever followed the call
            return StreamingParseResult(
                normal_text=final_content or remaining_after_call, calls=calls
            )

        except Exception as e:
            # Streaming boundary: fall back to passing the buffered text
            # through as normal text rather than failing the stream.
            logger.error("Error in parse_streaming_increment: %s", e)
            return StreamingParseResult(normal_text=current_text, calls=[])

    def structure_info(self) -> _GetInfoFunc:
        """Not implemented for this format."""
        raise NotImplementedError()

    def build_ebnf(self, tools: List[Tool]) -> str:
        """Not implemented for this format."""
        raise NotImplementedError()
|
@@ -24,7 +24,7 @@ class KimiK2Detector(BaseFormatDetector):
|
|
24
24
|
Format Structure:
|
25
25
|
```
|
26
26
|
<|tool_calls_section_begin|>
|
27
|
-
<|tool_call_begin|>functions.{func_name}:{index}
|
27
|
+
<|tool_call_begin|>functions.{func_name}:{index}<|tool_call_argument_begin|>{json_args}<|tool_call_end|>
|
28
28
|
<|tool_calls_section_end|>
|
29
29
|
```
|
30
30
|
|
@@ -219,7 +219,7 @@ class KimiK2Detector(BaseFormatDetector):
|
|
219
219
|
|
220
220
|
def get_info(name: str) -> StructureInfo:
|
221
221
|
return StructureInfo(
|
222
|
-
begin=f"<|tool_calls_section_begin|><|tool_call_begin|>functions.{name}:0
|
222
|
+
begin=f"<|tool_calls_section_begin|><|tool_call_begin|>functions.{name}:0<|tool_call_argument_begin|>",
|
223
223
|
end="<|tool_call_end|><|tool_calls_section_end|>",
|
224
224
|
trigger="<|tool_calls_section_begin|>",
|
225
225
|
)
|
@@ -240,6 +240,6 @@ class KimiK2Detector(BaseFormatDetector):
|
|
240
240
|
sequence_start_token=self.bot_token,
|
241
241
|
sequence_end_token=self.eot_token,
|
242
242
|
tool_call_separator="",
|
243
|
-
call_rule_fmt='"<|tool_call_begin|>functions.{name}:"
|
243
|
+
call_rule_fmt='"<|tool_call_begin|>functions.{name}:"[0-9]+"<|tool_call_argument_begin|>"{arguments_rule}"<|tool_call_end|>"',
|
244
244
|
function_format="json",
|
245
245
|
)
|
@@ -57,6 +57,15 @@ class Qwen3CoderDetector(BaseFormatDetector):
|
|
57
57
|
)
|
58
58
|
self._buf: str = ""
|
59
59
|
|
60
|
+
# Streaming state variables
|
61
|
+
self._current_function_name: str = ""
|
62
|
+
self._current_parameters: Dict[str, Any] = {}
|
63
|
+
self._streamed_parameters: Dict[str, str] = (
|
64
|
+
{}
|
65
|
+
) # Track what parameter content we've streamed
|
66
|
+
self._in_tool_call: bool = False
|
67
|
+
self._function_name_sent: bool = False
|
68
|
+
|
60
69
|
def has_tool_call(self, text: str) -> bool:
|
61
70
|
return self.tool_call_start_token in text
|
62
71
|
|
@@ -70,23 +79,224 @@ class Qwen3CoderDetector(BaseFormatDetector):
|
|
70
79
|
self._buf += new_text
|
71
80
|
normal = ""
|
72
81
|
calls: List[ToolCallItem] = []
|
82
|
+
|
83
|
+
# Build tool indices for validation
|
84
|
+
if not hasattr(self, "_tool_indices"):
|
85
|
+
self._tool_indices = self._get_tool_indices(tools)
|
86
|
+
|
73
87
|
while True:
|
74
|
-
|
88
|
+
# If we're not in a tool call and don't see a start token, return normal text
|
89
|
+
if not self._in_tool_call and self.tool_call_start_token not in self._buf:
|
75
90
|
normal += self._buf
|
76
91
|
self._buf = ""
|
77
92
|
break
|
78
|
-
|
79
|
-
|
93
|
+
|
94
|
+
# Look for tool call start
|
95
|
+
if not self._in_tool_call:
|
96
|
+
s = self._buf.find(self.tool_call_start_token)
|
97
|
+
if s == -1:
|
98
|
+
normal += self._buf
|
99
|
+
self._buf = ""
|
100
|
+
break
|
101
|
+
|
80
102
|
normal += self._buf[:s]
|
81
103
|
self._buf = self._buf[s:]
|
82
|
-
|
83
|
-
|
84
|
-
|
85
|
-
|
86
|
-
|
87
|
-
|
104
|
+
|
105
|
+
self._in_tool_call = True
|
106
|
+
self._function_name_sent = False
|
107
|
+
self._current_function_name = ""
|
108
|
+
self._current_parameters = {}
|
109
|
+
self._streamed_parameters = {}
|
110
|
+
|
111
|
+
# Remove the start token
|
112
|
+
self._buf = self._buf[len(self.tool_call_start_token) :]
|
113
|
+
continue
|
114
|
+
|
115
|
+
# We're in a tool call, try to parse function name if not sent yet
|
116
|
+
if not self._function_name_sent:
|
117
|
+
# Look for function name pattern: <function=name>
|
118
|
+
function_match = re.search(r"<function=([^>]+)>", self._buf)
|
119
|
+
if function_match:
|
120
|
+
function_name = function_match.group(1).strip()
|
121
|
+
|
122
|
+
# Validate function name
|
123
|
+
if function_name in self._tool_indices:
|
124
|
+
self._current_function_name = function_name
|
125
|
+
self._function_name_sent = True
|
126
|
+
|
127
|
+
# Initialize tool call tracking
|
128
|
+
if self.current_tool_id == -1:
|
129
|
+
self.current_tool_id = 0
|
130
|
+
|
131
|
+
# Ensure tracking arrays are large enough
|
132
|
+
while len(self.prev_tool_call_arr) <= self.current_tool_id:
|
133
|
+
self.prev_tool_call_arr.append({})
|
134
|
+
while len(self.streamed_args_for_tool) <= self.current_tool_id:
|
135
|
+
self.streamed_args_for_tool.append("")
|
136
|
+
|
137
|
+
# Store tool call info
|
138
|
+
self.prev_tool_call_arr[self.current_tool_id] = {
|
139
|
+
"name": function_name,
|
140
|
+
"arguments": {},
|
141
|
+
}
|
142
|
+
|
143
|
+
# Send tool name with empty parameters
|
144
|
+
calls.append(
|
145
|
+
ToolCallItem(
|
146
|
+
tool_index=self.current_tool_id,
|
147
|
+
name=function_name,
|
148
|
+
parameters="",
|
149
|
+
)
|
150
|
+
)
|
151
|
+
|
152
|
+
# Remove the processed function declaration
|
153
|
+
self._buf = self._buf[function_match.end() :]
|
154
|
+
continue
|
155
|
+
else:
|
156
|
+
# Invalid function name, reset state
|
157
|
+
logger.warning(f"Invalid function name: {function_name}")
|
158
|
+
self._reset_streaming_state()
|
159
|
+
normal += self._buf
|
160
|
+
self._buf = ""
|
161
|
+
break
|
162
|
+
else:
|
163
|
+
# Function name not complete yet, wait for more text
|
164
|
+
break
|
165
|
+
|
166
|
+
# Parse parameters incrementally
|
167
|
+
if self._function_name_sent:
|
168
|
+
# Process parameters and get any calls to emit
|
169
|
+
parameter_calls = self._parse_and_stream_parameters(self._buf)
|
170
|
+
calls.extend(parameter_calls)
|
171
|
+
|
172
|
+
# Check if tool call is complete
|
173
|
+
if self.tool_call_end_token in self._buf:
|
174
|
+
end_pos = self._buf.find(self.tool_call_end_token)
|
175
|
+
|
176
|
+
# Add closing brace to complete the JSON object
|
177
|
+
current_streamed = self.streamed_args_for_tool[self.current_tool_id]
|
178
|
+
if current_streamed:
|
179
|
+
# Count opening and closing braces to check if JSON is complete
|
180
|
+
open_braces = current_streamed.count("{")
|
181
|
+
close_braces = current_streamed.count("}")
|
182
|
+
if open_braces > close_braces:
|
183
|
+
calls.append(
|
184
|
+
ToolCallItem(
|
185
|
+
tool_index=self.current_tool_id,
|
186
|
+
name=None,
|
187
|
+
parameters="}",
|
188
|
+
)
|
189
|
+
)
|
190
|
+
self.streamed_args_for_tool[self.current_tool_id] = (
|
191
|
+
current_streamed + "}"
|
192
|
+
)
|
193
|
+
|
194
|
+
# Complete the tool call
|
195
|
+
self._buf = self._buf[end_pos + len(self.tool_call_end_token) :]
|
196
|
+
self._reset_streaming_state()
|
197
|
+
self.current_tool_id += 1
|
198
|
+
continue
|
199
|
+
else:
|
200
|
+
# Tool call not complete yet, wait for more text
|
201
|
+
break
|
202
|
+
|
88
203
|
return StreamingParseResult(normal_text=normal, calls=calls)
|
89
204
|
|
205
|
+
def _parse_and_stream_parameters(self, text_to_parse: str) -> List[ToolCallItem]:
    """
    Parse complete parameter blocks from text and return any tool call items to emit.

    Every complete ``<parameter=name>value</parameter>`` block in
    ``text_to_parse`` is converted with ``_safe_val`` and collected into a
    dictionary. When that dictionary differs from the parameters seen so far,
    the not-yet-streamed keys are emitted as one JSON fragment: the first
    fragment opens the object with ``{`` and later fragments start with
    ``, ``. The closing brace is never emitted here. Keys are streamed in the
    order they appear in the text, so the output does not depend on set
    iteration order.

    Args:
        text_to_parse: The text to search for parameter blocks

    Returns:
        List of ToolCallItem objects to emit (may be empty)
    """
    calls: List[ToolCallItem] = []

    # Collect all complete parameter blocks, in order of appearance
    new_params = {}
    for match in re.finditer(
        r"<parameter=([^>]+)>(.*?)</parameter>", text_to_parse, re.DOTALL
    ):
        new_params[match.group(1).strip()] = _safe_val(match.group(2))

    if new_params == self._current_parameters:
        return calls

    is_first_fragment = not self._current_parameters
    # Dicts keep insertion order, so this preserves the order of the text.
    new_keys = [key for key in new_params if key not in self._current_parameters]

    if new_keys:
        pairs = ", ".join(
            f"{json.dumps(key, ensure_ascii=False)}: {json.dumps(new_params[key], ensure_ascii=False)}"
            for key in new_keys
        )
        if is_first_fragment:
            # First parameter(s) - start JSON object but don't close it yet
            json_fragment = "{" + pairs
            streamed_so_far = json_fragment
        else:
            # Additional parameters - continue the open object
            json_fragment = ", " + pairs
            streamed_so_far = (
                self.streamed_args_for_tool[self.current_tool_id] + json_fragment
            )

        calls.append(
            ToolCallItem(
                tool_index=self.current_tool_id,
                name=None,
                parameters=json_fragment,
            )
        )
        self.streamed_args_for_tool[self.current_tool_id] = streamed_so_far

    # Update current state
    self._current_parameters = new_params
    self.prev_tool_call_arr[self.current_tool_id]["arguments"] = new_params

    return calls
|
290
|
+
|
291
|
+
def _reset_streaming_state(self):
|
292
|
+
"""Reset streaming state for the next tool call"""
|
293
|
+
self._in_tool_call = False
|
294
|
+
self._function_name_sent = False
|
295
|
+
self._current_function_name = ""
|
296
|
+
self._current_parameters = {}
|
297
|
+
self._streamed_parameters = {}
|
298
|
+
self.current_tool_name_sent = False
|
299
|
+
|
90
300
|
def _extract(self, text: str, tools: List[Tool]) -> Tuple[str, List[ToolCallItem]]:
|
91
301
|
normal_parts: List[str] = []
|
92
302
|
calls: List[ToolCallItem] = []
|
@@ -102,6 +102,12 @@ def detect_jinja_template_content_format(chat_template: str) -> str:
|
|
102
102
|
if _is_var_or_elems_access(loop_iter, "message", "content"):
|
103
103
|
return "openai" # Found content iteration → openai format
|
104
104
|
|
105
|
+
# Also check for patterns like: {%- for item in msg.content -%} or {%- for item in m.content -%}
|
106
|
+
if _is_var_or_elems_access(
|
107
|
+
loop_iter, "msg", "content"
|
108
|
+
) or _is_var_or_elems_access(loop_iter, "m", "content"):
|
109
|
+
return "openai" # Found content iteration → openai format (glm4v)
|
110
|
+
|
105
111
|
return "string" # No content loops found → string format
|
106
112
|
except Exception as e:
|
107
113
|
logger.debug(f"Error when parsing AST of Jinja template: {e}")
|