sglang 0.4.10.post2__py3-none-any.whl → 0.5.0rc1__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- sglang/__init__.py +8 -3
- sglang/bench_one_batch.py +119 -17
- sglang/lang/chat_template.py +18 -0
- sglang/srt/bench_utils.py +137 -0
- sglang/srt/configs/model_config.py +42 -7
- sglang/srt/conversation.py +9 -5
- sglang/srt/disaggregation/base/conn.py +5 -2
- sglang/srt/disaggregation/decode.py +14 -4
- sglang/srt/disaggregation/decode_schedule_batch_mixin.py +3 -0
- sglang/srt/disaggregation/mooncake/conn.py +286 -160
- sglang/srt/disaggregation/mooncake/transfer_engine.py +29 -0
- sglang/srt/disaggregation/prefill.py +2 -0
- sglang/srt/distributed/parallel_state.py +15 -11
- sglang/srt/entrypoints/context.py +227 -0
- sglang/srt/entrypoints/engine.py +15 -9
- sglang/srt/entrypoints/harmony_utils.py +372 -0
- sglang/srt/entrypoints/http_server.py +74 -4
- sglang/srt/entrypoints/openai/protocol.py +218 -1
- sglang/srt/entrypoints/openai/serving_chat.py +41 -11
- sglang/srt/entrypoints/openai/serving_responses.py +1273 -0
- sglang/srt/entrypoints/openai/tool_server.py +175 -0
- sglang/srt/entrypoints/tool.py +87 -0
- sglang/srt/eplb/expert_location.py +5 -1
- sglang/srt/function_call/ebnf_composer.py +1 -0
- sglang/srt/function_call/function_call_parser.py +2 -0
- sglang/srt/function_call/glm4_moe_detector.py +1 -1
- sglang/srt/function_call/gpt_oss_detector.py +331 -0
- sglang/srt/function_call/kimik2_detector.py +3 -3
- sglang/srt/function_call/qwen3_coder_detector.py +219 -9
- sglang/srt/hf_transformers_utils.py +30 -3
- sglang/srt/jinja_template_utils.py +14 -1
- sglang/srt/layers/attention/aiter_backend.py +375 -115
- sglang/srt/layers/attention/ascend_backend.py +3 -0
- sglang/srt/layers/attention/dual_chunk_flashattention_backend.py +1700 -0
- sglang/srt/layers/attention/flashattention_backend.py +18 -0
- sglang/srt/layers/attention/flashinfer_backend.py +52 -13
- sglang/srt/layers/attention/hybrid_attn_backend.py +1 -1
- sglang/srt/layers/attention/triton_backend.py +85 -14
- sglang/srt/layers/attention/triton_ops/decode_attention.py +17 -0
- sglang/srt/layers/attention/triton_ops/extend_attention.py +143 -98
- sglang/srt/layers/attention/trtllm_mha_backend.py +332 -0
- sglang/srt/layers/attention/trtllm_mla_backend.py +119 -22
- sglang/srt/layers/attention/vision.py +22 -6
- sglang/srt/layers/attention/wave_backend.py +627 -0
- sglang/srt/layers/attention/wave_ops/decode_attention.py +186 -0
- sglang/srt/layers/attention/wave_ops/extend_attention.py +149 -0
- sglang/srt/layers/attention/wave_ops/prefill_attention.py +79 -0
- sglang/srt/layers/communicator.py +29 -14
- sglang/srt/layers/dp_attention.py +12 -0
- sglang/srt/layers/flashinfer_comm_fusion.py +4 -4
- sglang/srt/layers/linear.py +3 -7
- sglang/srt/layers/moe/cutlass_moe.py +12 -3
- sglang/srt/layers/moe/cutlass_w4a8_moe.py +4 -5
- sglang/srt/layers/moe/ep_moe/kernels.py +43 -0
- sglang/srt/layers/moe/ep_moe/layer.py +135 -73
- sglang/srt/layers/moe/fused_moe_triton/configs/triton_3_4_0/E=128,N=768,device_name=NVIDIA_B200,dtype=fp8_w8a8,block_shape=[128, 128].json +146 -0
- sglang/srt/layers/moe/fused_moe_triton/configs/triton_3_4_0/E=384,N=256,device_name=NVIDIA_B200,dtype=fp8_w8a8,block_shape=[128, 128].json +146 -0
- sglang/srt/layers/moe/fused_moe_triton/fused_moe.py +101 -12
- sglang/srt/layers/moe/fused_moe_triton/layer.py +412 -33
- sglang/srt/layers/moe/fused_moe_triton/triton_kernels_moe.py +188 -3
- sglang/srt/layers/moe/token_dispatcher/deepep.py +61 -24
- sglang/srt/layers/moe/topk.py +16 -4
- sglang/srt/layers/moe/utils.py +16 -0
- sglang/srt/layers/quantization/__init__.py +27 -3
- sglang/srt/layers/quantization/fp4.py +557 -0
- sglang/srt/layers/quantization/fp8.py +3 -6
- sglang/srt/layers/quantization/fp8_kernel.py +277 -0
- sglang/srt/layers/quantization/fp8_utils.py +51 -10
- sglang/srt/layers/quantization/modelopt_quant.py +258 -68
- sglang/srt/layers/quantization/mxfp4.py +654 -0
- sglang/srt/layers/quantization/mxfp4_tensor.py +133 -0
- sglang/srt/layers/quantization/quark/schemes/__init__.py +6 -0
- sglang/srt/layers/quantization/quark/schemes/quark_scheme.py +55 -0
- sglang/srt/layers/quantization/quark/schemes/quark_w4a4_mxfp4.py +118 -0
- sglang/srt/layers/quantization/quark/utils.py +107 -0
- sglang/srt/layers/quantization/unquant.py +60 -6
- sglang/srt/layers/quantization/w4afp8.py +21 -12
- sglang/srt/layers/quantization/w8a8_int8.py +48 -34
- sglang/srt/layers/rotary_embedding.py +506 -3
- sglang/srt/layers/utils.py +9 -0
- sglang/srt/layers/vocab_parallel_embedding.py +8 -3
- sglang/srt/lora/backend/base_backend.py +3 -23
- sglang/srt/lora/layers.py +60 -114
- sglang/srt/lora/lora.py +17 -62
- sglang/srt/lora/lora_manager.py +82 -62
- sglang/srt/lora/lora_registry.py +23 -11
- sglang/srt/lora/mem_pool.py +63 -68
- sglang/srt/lora/triton_ops/qkv_lora_b.py +1 -1
- sglang/srt/lora/utils.py +25 -58
- sglang/srt/managers/cache_controller.py +75 -58
- sglang/srt/managers/detokenizer_manager.py +1 -1
- sglang/srt/managers/io_struct.py +20 -8
- sglang/srt/managers/mm_utils.py +6 -13
- sglang/srt/managers/multimodal_processor.py +1 -1
- sglang/srt/managers/schedule_batch.py +61 -25
- sglang/srt/managers/schedule_policy.py +6 -6
- sglang/srt/managers/scheduler.py +41 -19
- sglang/srt/managers/scheduler_output_processor_mixin.py +1 -2
- sglang/srt/managers/scheduler_profiler_mixin.py +28 -8
- sglang/srt/managers/scheduler_recv_skipper.py +37 -0
- sglang/srt/managers/scheduler_update_weights_mixin.py +6 -0
- sglang/srt/managers/template_manager.py +35 -1
- sglang/srt/managers/tokenizer_manager.py +47 -30
- sglang/srt/managers/tp_worker.py +3 -0
- sglang/srt/managers/tp_worker_overlap_thread.py +3 -0
- sglang/srt/mem_cache/allocator.py +61 -87
- sglang/srt/mem_cache/hicache_storage.py +1 -1
- sglang/srt/mem_cache/hiradix_cache.py +80 -22
- sglang/srt/mem_cache/lora_radix_cache.py +421 -0
- sglang/srt/mem_cache/memory_pool_host.py +34 -36
- sglang/srt/mem_cache/multimodal_cache.py +33 -13
- sglang/srt/mem_cache/radix_cache.py +2 -5
- sglang/srt/mem_cache/storage/hf3fs/client_hf3fs.py +2 -2
- sglang/srt/mem_cache/storage/hf3fs/mini_3fs_metadata_server.py +443 -0
- sglang/srt/mem_cache/storage/hf3fs/storage_hf3fs.py +139 -67
- sglang/srt/mem_cache/storage/mooncake_store/mooncake_store.py +6 -9
- sglang/srt/model_executor/cuda_graph_runner.py +29 -9
- sglang/srt/model_executor/forward_batch_info.py +61 -19
- sglang/srt/model_executor/model_runner.py +148 -37
- sglang/srt/model_loader/loader.py +18 -6
- sglang/srt/model_loader/weight_utils.py +10 -0
- sglang/srt/models/bailing_moe.py +425 -0
- sglang/srt/models/deepseek_v2.py +137 -59
- sglang/srt/models/ernie4.py +426 -0
- sglang/srt/models/ernie4_eagle.py +203 -0
- sglang/srt/models/gemma2.py +0 -34
- sglang/srt/models/gemma3n_mm.py +38 -0
- sglang/srt/models/glm4.py +6 -0
- sglang/srt/models/glm4_moe.py +28 -16
- sglang/srt/models/glm4v.py +589 -0
- sglang/srt/models/glm4v_moe.py +400 -0
- sglang/srt/models/gpt_oss.py +1251 -0
- sglang/srt/models/granite.py +0 -25
- sglang/srt/models/llama.py +0 -25
- sglang/srt/models/llama4.py +1 -1
- sglang/srt/models/qwen2.py +6 -0
- sglang/srt/models/qwen2_5_vl.py +7 -3
- sglang/srt/models/qwen2_audio.py +10 -9
- sglang/srt/models/qwen2_moe.py +6 -0
- sglang/srt/models/qwen3.py +0 -24
- sglang/srt/models/qwen3_moe.py +32 -6
- sglang/srt/models/registry.py +1 -1
- sglang/srt/models/step3_vl.py +9 -0
- sglang/srt/models/torch_native_llama.py +0 -24
- sglang/srt/models/transformers.py +2 -5
- sglang/srt/multimodal/processors/base_processor.py +23 -13
- sglang/srt/multimodal/processors/glm4v.py +132 -0
- sglang/srt/multimodal/processors/qwen_audio.py +4 -2
- sglang/srt/multimodal/processors/step3_vl.py +3 -1
- sglang/srt/reasoning_parser.py +332 -37
- sglang/srt/server_args.py +186 -75
- sglang/srt/speculative/eagle_worker.py +16 -0
- sglang/srt/two_batch_overlap.py +169 -9
- sglang/srt/utils.py +41 -5
- sglang/srt/weight_sync/tensor_bucket.py +106 -0
- sglang/test/attention/test_trtllm_mla_backend.py +186 -36
- sglang/test/doc_patch.py +59 -0
- sglang/test/few_shot_gsm8k.py +1 -1
- sglang/test/few_shot_gsm8k_engine.py +1 -1
- sglang/test/run_eval.py +4 -1
- sglang/test/runners.py +2 -2
- sglang/test/simple_eval_common.py +6 -0
- sglang/test/simple_eval_gpqa.py +2 -0
- sglang/test/test_fp4_moe.py +118 -36
- sglang/test/test_utils.py +1 -1
- sglang/utils.py +1 -1
- sglang/version.py +1 -1
- {sglang-0.4.10.post2.dist-info → sglang-0.5.0rc1.dist-info}/METADATA +36 -38
- {sglang-0.4.10.post2.dist-info → sglang-0.5.0rc1.dist-info}/RECORD +174 -141
- sglang/srt/lora/backend/flashinfer_backend.py +0 -131
- /sglang/{api.py → lang/api.py} +0 -0
- /sglang/{lang/backend → srt/layers/quantization/quark}/__init__.py +0 -0
- {sglang-0.4.10.post2.dist-info → sglang-0.5.0rc1.dist-info}/WHEEL +0 -0
- {sglang-0.4.10.post2.dist-info → sglang-0.5.0rc1.dist-info}/licenses/LICENSE +0 -0
- {sglang-0.4.10.post2.dist-info → sglang-0.5.0rc1.dist-info}/top_level.txt +0 -0
@@ -24,7 +24,7 @@ class KimiK2Detector(BaseFormatDetector):
|
|
24
24
|
Format Structure:
|
25
25
|
```
|
26
26
|
<|tool_calls_section_begin|>
|
27
|
-
<|tool_call_begin|>functions.{func_name}:{index}
|
27
|
+
<|tool_call_begin|>functions.{func_name}:{index}<|tool_call_argument_begin|>{json_args}<|tool_call_end|>
|
28
28
|
<|tool_calls_section_end|>
|
29
29
|
```
|
30
30
|
|
@@ -219,7 +219,7 @@ class KimiK2Detector(BaseFormatDetector):
|
|
219
219
|
|
220
220
|
def get_info(name: str) -> StructureInfo:
|
221
221
|
return StructureInfo(
|
222
|
-
begin=f"<|tool_calls_section_begin|><|tool_call_begin|>functions.{name}:0
|
222
|
+
begin=f"<|tool_calls_section_begin|><|tool_call_begin|>functions.{name}:0<|tool_call_argument_begin|>",
|
223
223
|
end="<|tool_call_end|><|tool_calls_section_end|>",
|
224
224
|
trigger="<|tool_calls_section_begin|>",
|
225
225
|
)
|
@@ -240,6 +240,6 @@ class KimiK2Detector(BaseFormatDetector):
|
|
240
240
|
sequence_start_token=self.bot_token,
|
241
241
|
sequence_end_token=self.eot_token,
|
242
242
|
tool_call_separator="",
|
243
|
-
call_rule_fmt='"<|tool_call_begin|>functions.{name}:"
|
243
|
+
call_rule_fmt='"<|tool_call_begin|>functions.{name}:"[0-9]+"<|tool_call_argument_begin|>"{arguments_rule}"<|tool_call_end|>"',
|
244
244
|
function_format="json",
|
245
245
|
)
|
@@ -57,6 +57,15 @@ class Qwen3CoderDetector(BaseFormatDetector):
|
|
57
57
|
)
|
58
58
|
self._buf: str = ""
|
59
59
|
|
60
|
+
# Streaming state variables
|
61
|
+
self._current_function_name: str = ""
|
62
|
+
self._current_parameters: Dict[str, Any] = {}
|
63
|
+
self._streamed_parameters: Dict[str, str] = (
|
64
|
+
{}
|
65
|
+
) # Track what parameter content we've streamed
|
66
|
+
self._in_tool_call: bool = False
|
67
|
+
self._function_name_sent: bool = False
|
68
|
+
|
60
69
|
def has_tool_call(self, text: str) -> bool:
|
61
70
|
return self.tool_call_start_token in text
|
62
71
|
|
@@ -70,23 +79,224 @@ class Qwen3CoderDetector(BaseFormatDetector):
|
|
70
79
|
self._buf += new_text
|
71
80
|
normal = ""
|
72
81
|
calls: List[ToolCallItem] = []
|
82
|
+
|
83
|
+
# Build tool indices for validation
|
84
|
+
if not hasattr(self, "_tool_indices"):
|
85
|
+
self._tool_indices = self._get_tool_indices(tools)
|
86
|
+
|
73
87
|
while True:
|
74
|
-
|
88
|
+
# If we're not in a tool call and don't see a start token, return normal text
|
89
|
+
if not self._in_tool_call and self.tool_call_start_token not in self._buf:
|
75
90
|
normal += self._buf
|
76
91
|
self._buf = ""
|
77
92
|
break
|
78
|
-
|
79
|
-
|
93
|
+
|
94
|
+
# Look for tool call start
|
95
|
+
if not self._in_tool_call:
|
96
|
+
s = self._buf.find(self.tool_call_start_token)
|
97
|
+
if s == -1:
|
98
|
+
normal += self._buf
|
99
|
+
self._buf = ""
|
100
|
+
break
|
101
|
+
|
80
102
|
normal += self._buf[:s]
|
81
103
|
self._buf = self._buf[s:]
|
82
|
-
|
83
|
-
|
84
|
-
|
85
|
-
|
86
|
-
|
87
|
-
|
104
|
+
|
105
|
+
self._in_tool_call = True
|
106
|
+
self._function_name_sent = False
|
107
|
+
self._current_function_name = ""
|
108
|
+
self._current_parameters = {}
|
109
|
+
self._streamed_parameters = {}
|
110
|
+
|
111
|
+
# Remove the start token
|
112
|
+
self._buf = self._buf[len(self.tool_call_start_token) :]
|
113
|
+
continue
|
114
|
+
|
115
|
+
# We're in a tool call, try to parse function name if not sent yet
|
116
|
+
if not self._function_name_sent:
|
117
|
+
# Look for function name pattern: <function=name>
|
118
|
+
function_match = re.search(r"<function=([^>]+)>", self._buf)
|
119
|
+
if function_match:
|
120
|
+
function_name = function_match.group(1).strip()
|
121
|
+
|
122
|
+
# Validate function name
|
123
|
+
if function_name in self._tool_indices:
|
124
|
+
self._current_function_name = function_name
|
125
|
+
self._function_name_sent = True
|
126
|
+
|
127
|
+
# Initialize tool call tracking
|
128
|
+
if self.current_tool_id == -1:
|
129
|
+
self.current_tool_id = 0
|
130
|
+
|
131
|
+
# Ensure tracking arrays are large enough
|
132
|
+
while len(self.prev_tool_call_arr) <= self.current_tool_id:
|
133
|
+
self.prev_tool_call_arr.append({})
|
134
|
+
while len(self.streamed_args_for_tool) <= self.current_tool_id:
|
135
|
+
self.streamed_args_for_tool.append("")
|
136
|
+
|
137
|
+
# Store tool call info
|
138
|
+
self.prev_tool_call_arr[self.current_tool_id] = {
|
139
|
+
"name": function_name,
|
140
|
+
"arguments": {},
|
141
|
+
}
|
142
|
+
|
143
|
+
# Send tool name with empty parameters
|
144
|
+
calls.append(
|
145
|
+
ToolCallItem(
|
146
|
+
tool_index=self.current_tool_id,
|
147
|
+
name=function_name,
|
148
|
+
parameters="",
|
149
|
+
)
|
150
|
+
)
|
151
|
+
|
152
|
+
# Remove the processed function declaration
|
153
|
+
self._buf = self._buf[function_match.end() :]
|
154
|
+
continue
|
155
|
+
else:
|
156
|
+
# Invalid function name, reset state
|
157
|
+
logger.warning(f"Invalid function name: {function_name}")
|
158
|
+
self._reset_streaming_state()
|
159
|
+
normal += self._buf
|
160
|
+
self._buf = ""
|
161
|
+
break
|
162
|
+
else:
|
163
|
+
# Function name not complete yet, wait for more text
|
164
|
+
break
|
165
|
+
|
166
|
+
# Parse parameters incrementally
|
167
|
+
if self._function_name_sent:
|
168
|
+
# Process parameters and get any calls to emit
|
169
|
+
parameter_calls = self._parse_and_stream_parameters(self._buf)
|
170
|
+
calls.extend(parameter_calls)
|
171
|
+
|
172
|
+
# Check if tool call is complete
|
173
|
+
if self.tool_call_end_token in self._buf:
|
174
|
+
end_pos = self._buf.find(self.tool_call_end_token)
|
175
|
+
|
176
|
+
# Add closing brace to complete the JSON object
|
177
|
+
current_streamed = self.streamed_args_for_tool[self.current_tool_id]
|
178
|
+
if current_streamed:
|
179
|
+
# Count opening and closing braces to check if JSON is complete
|
180
|
+
open_braces = current_streamed.count("{")
|
181
|
+
close_braces = current_streamed.count("}")
|
182
|
+
if open_braces > close_braces:
|
183
|
+
calls.append(
|
184
|
+
ToolCallItem(
|
185
|
+
tool_index=self.current_tool_id,
|
186
|
+
name=None,
|
187
|
+
parameters="}",
|
188
|
+
)
|
189
|
+
)
|
190
|
+
self.streamed_args_for_tool[self.current_tool_id] = (
|
191
|
+
current_streamed + "}"
|
192
|
+
)
|
193
|
+
|
194
|
+
# Complete the tool call
|
195
|
+
self._buf = self._buf[end_pos + len(self.tool_call_end_token) :]
|
196
|
+
self._reset_streaming_state()
|
197
|
+
self.current_tool_id += 1
|
198
|
+
continue
|
199
|
+
else:
|
200
|
+
# Tool call not complete yet, wait for more text
|
201
|
+
break
|
202
|
+
|
88
203
|
return StreamingParseResult(normal_text=normal, calls=calls)
|
89
204
|
|
205
|
+
def _parse_and_stream_parameters(self, text_to_parse: str) -> List[ToolCallItem]:
|
206
|
+
"""
|
207
|
+
Parse complete parameter blocks from text and return any tool call items to emit.
|
208
|
+
|
209
|
+
This method:
|
210
|
+
1. Finds all complete <parameter> blocks
|
211
|
+
2. Parses them into a dictionary
|
212
|
+
3. Compares with current parameters and generates diff if needed
|
213
|
+
4. Updates internal state
|
214
|
+
|
215
|
+
Args:
|
216
|
+
text_to_parse: The text to search for parameter blocks
|
217
|
+
|
218
|
+
Returns:
|
219
|
+
List of ToolCallItem objects to emit (may be empty)
|
220
|
+
"""
|
221
|
+
calls: List[ToolCallItem] = []
|
222
|
+
|
223
|
+
# Find all complete parameter patterns
|
224
|
+
param_matches = list(
|
225
|
+
re.finditer(
|
226
|
+
r"<parameter=([^>]+)>(.*?)</parameter>", text_to_parse, re.DOTALL
|
227
|
+
)
|
228
|
+
)
|
229
|
+
|
230
|
+
# Build new parameters dictionary
|
231
|
+
new_params = {}
|
232
|
+
for match in param_matches:
|
233
|
+
param_name = match.group(1).strip()
|
234
|
+
param_value = match.group(2)
|
235
|
+
new_params[param_name] = _safe_val(param_value)
|
236
|
+
|
237
|
+
# Calculate parameter diff to stream with proper incremental JSON building
|
238
|
+
if new_params != self._current_parameters:
|
239
|
+
previous_args_json = self.streamed_args_for_tool[self.current_tool_id]
|
240
|
+
|
241
|
+
# Build incremental JSON properly
|
242
|
+
if not self._current_parameters:
|
243
|
+
# First parameter(s) - start JSON object but don't close it yet
|
244
|
+
items = []
|
245
|
+
for key, value in new_params.items():
|
246
|
+
items.append(
|
247
|
+
f"{json.dumps(key, ensure_ascii=False)}: {json.dumps(value, ensure_ascii=False)}"
|
248
|
+
)
|
249
|
+
json_fragment = "{" + ", ".join(items)
|
250
|
+
|
251
|
+
calls.append(
|
252
|
+
ToolCallItem(
|
253
|
+
tool_index=self.current_tool_id,
|
254
|
+
name=None,
|
255
|
+
parameters=json_fragment,
|
256
|
+
)
|
257
|
+
)
|
258
|
+
self.streamed_args_for_tool[self.current_tool_id] = json_fragment
|
259
|
+
|
260
|
+
else:
|
261
|
+
# Additional parameters - add them incrementally
|
262
|
+
new_keys = set(new_params.keys()) - set(self._current_parameters.keys())
|
263
|
+
if new_keys:
|
264
|
+
# Build the continuation part (no closing brace yet)
|
265
|
+
continuation_parts = []
|
266
|
+
for key in new_keys:
|
267
|
+
value = new_params[key]
|
268
|
+
continuation_parts.append(
|
269
|
+
f"{json.dumps(key, ensure_ascii=False)}: {json.dumps(value, ensure_ascii=False)}"
|
270
|
+
)
|
271
|
+
|
272
|
+
json_fragment = ", " + ", ".join(continuation_parts)
|
273
|
+
|
274
|
+
calls.append(
|
275
|
+
ToolCallItem(
|
276
|
+
tool_index=self.current_tool_id,
|
277
|
+
name=None,
|
278
|
+
parameters=json_fragment,
|
279
|
+
)
|
280
|
+
)
|
281
|
+
self.streamed_args_for_tool[self.current_tool_id] = (
|
282
|
+
previous_args_json + json_fragment
|
283
|
+
)
|
284
|
+
|
285
|
+
# Update current state
|
286
|
+
self._current_parameters = new_params
|
287
|
+
self.prev_tool_call_arr[self.current_tool_id]["arguments"] = new_params
|
288
|
+
|
289
|
+
return calls
|
290
|
+
|
291
|
+
def _reset_streaming_state(self):
|
292
|
+
"""Reset streaming state for the next tool call"""
|
293
|
+
self._in_tool_call = False
|
294
|
+
self._function_name_sent = False
|
295
|
+
self._current_function_name = ""
|
296
|
+
self._current_parameters = {}
|
297
|
+
self._streamed_parameters = {}
|
298
|
+
self.current_tool_name_sent = False
|
299
|
+
|
90
300
|
def _extract(self, text: str, tools: List[Tool]) -> Tuple[str, List[ToolCallItem]]:
|
91
301
|
normal_parts: List[str] = []
|
92
302
|
calls: List[ToolCallItem] = []
|
@@ -14,10 +14,11 @@
|
|
14
14
|
"""Utilities for Huggingface Transformers."""
|
15
15
|
|
16
16
|
import contextlib
|
17
|
+
import json
|
17
18
|
import os
|
18
19
|
import warnings
|
19
20
|
from pathlib import Path
|
20
|
-
from typing import Dict, Optional, Type, Union
|
21
|
+
from typing import Any, Dict, Optional, Type, Union
|
21
22
|
|
22
23
|
import torch
|
23
24
|
from huggingface_hub import snapshot_download
|
@@ -62,11 +63,17 @@ for name, cls in _CONFIG_REGISTRY.items():
|
|
62
63
|
AutoConfig.register(name, cls)
|
63
64
|
|
64
65
|
|
65
|
-
def download_from_hf(
|
66
|
+
def download_from_hf(
|
67
|
+
model_path: str,
|
68
|
+
allow_patterns: Optional[Union[str, list]] = None,
|
69
|
+
):
|
66
70
|
if os.path.exists(model_path):
|
67
71
|
return model_path
|
68
72
|
|
69
|
-
|
73
|
+
if not allow_patterns:
|
74
|
+
allow_patterns = ["*.json", "*.bin", "*.model"]
|
75
|
+
|
76
|
+
return snapshot_download(model_path, allow_patterns=allow_patterns)
|
70
77
|
|
71
78
|
|
72
79
|
def get_hf_text_config(config: PretrainedConfig):
|
@@ -171,6 +178,26 @@ def get_generation_config(
|
|
171
178
|
return None
|
172
179
|
|
173
180
|
|
181
|
+
# Qwen-1M related
|
182
|
+
def get_sparse_attention_config(
|
183
|
+
model: str,
|
184
|
+
sparse_attention_config_filename: str = "sparse_attention_config.json",
|
185
|
+
) -> Dict[str, Any]:
|
186
|
+
is_local = os.path.isdir(model)
|
187
|
+
if not is_local:
|
188
|
+
# Download the config files.
|
189
|
+
model = download_from_hf(model, allow_patterns=["*.json"])
|
190
|
+
|
191
|
+
config_file = os.path.join(model, sparse_attention_config_filename)
|
192
|
+
if not os.path.exists(config_file):
|
193
|
+
return {}
|
194
|
+
|
195
|
+
# Load the sparse attention config.
|
196
|
+
with open(config_file) as f:
|
197
|
+
config = json.load(f)
|
198
|
+
return config
|
199
|
+
|
200
|
+
|
174
201
|
# Models don't use the same configuration key for determining the maximum
|
175
202
|
# context length. Store them here so we can sanely check them.
|
176
203
|
# NOTE: The ordering here is important. Some models have two of these and we
|
@@ -9,6 +9,8 @@ import logging
|
|
9
9
|
import jinja2
|
10
10
|
import transformers.utils.chat_template_utils as hf_chat_utils
|
11
11
|
|
12
|
+
from sglang.srt.utils import ImageData
|
13
|
+
|
12
14
|
logger = logging.getLogger(__name__)
|
13
15
|
|
14
16
|
# ============================================================================
|
@@ -100,6 +102,12 @@ def detect_jinja_template_content_format(chat_template: str) -> str:
|
|
100
102
|
if _is_var_or_elems_access(loop_iter, "message", "content"):
|
101
103
|
return "openai" # Found content iteration → openai format
|
102
104
|
|
105
|
+
# Also check for patterns like: {%- for item in msg.content -%} or {%- for item in m.content -%}
|
106
|
+
if _is_var_or_elems_access(
|
107
|
+
loop_iter, "msg", "content"
|
108
|
+
) or _is_var_or_elems_access(loop_iter, "m", "content"):
|
109
|
+
return "openai" # Found content iteration → openai format (glm4v)
|
110
|
+
|
103
111
|
return "string" # No content loops found → string format
|
104
112
|
except Exception as e:
|
105
113
|
logger.debug(f"Error when parsing AST of Jinja template: {e}")
|
@@ -140,7 +148,12 @@ def process_content_for_template_format(
|
|
140
148
|
chunk_type = chunk.get("type")
|
141
149
|
|
142
150
|
if chunk_type == "image_url":
|
143
|
-
image_data.append(
|
151
|
+
image_data.append(
|
152
|
+
ImageData(
|
153
|
+
url=chunk["image_url"]["url"],
|
154
|
+
detail=chunk["image_url"].get("detail", "auto"),
|
155
|
+
)
|
156
|
+
)
|
144
157
|
if chunk.get("modalities"):
|
145
158
|
modalities.append(chunk.get("modalities"))
|
146
159
|
# Normalize to simple 'image' type for template compatibility
|