sglang 0.4.9.post4__py3-none-any.whl → 0.4.9.post5__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- sglang/lang/chat_template.py +21 -0
- sglang/srt/configs/internvl.py +3 -0
- sglang/srt/configs/model_config.py +4 -0
- sglang/srt/constrained/base_grammar_backend.py +10 -2
- sglang/srt/constrained/xgrammar_backend.py +7 -5
- sglang/srt/conversation.py +16 -1
- sglang/srt/debug_utils/__init__.py +0 -0
- sglang/srt/debug_utils/dump_comparator.py +131 -0
- sglang/srt/debug_utils/dumper.py +108 -0
- sglang/srt/debug_utils/text_comparator.py +172 -0
- sglang/srt/disaggregation/decode_schedule_batch_mixin.py +13 -1
- sglang/srt/disaggregation/mooncake/conn.py +16 -0
- sglang/srt/disaggregation/prefill.py +13 -1
- sglang/srt/entrypoints/engine.py +4 -2
- sglang/srt/entrypoints/openai/serving_chat.py +132 -79
- sglang/srt/function_call/ebnf_composer.py +10 -3
- sglang/srt/function_call/function_call_parser.py +2 -0
- sglang/srt/function_call/glm4_moe_detector.py +164 -0
- sglang/srt/function_call/qwen3_coder_detector.py +1 -0
- sglang/srt/layers/attention/hybrid_attn_backend.py +100 -0
- sglang/srt/layers/attention/vision.py +56 -8
- sglang/srt/layers/layernorm.py +26 -1
- sglang/srt/layers/logits_processor.py +14 -3
- sglang/srt/layers/moe/ep_moe/layer.py +172 -206
- sglang/srt/layers/moe/fused_moe_triton/configs/triton_3_2_0/E=160,N=320,device_name=NVIDIA_A800-SXM4-80GB,dtype=int8_w8a8.json +146 -0
- sglang/srt/layers/moe/fused_moe_triton/layer.py +38 -48
- sglang/srt/layers/moe/fused_moe_triton/triton_kernels_moe.py +11 -8
- sglang/srt/layers/moe/topk.py +84 -22
- sglang/srt/layers/multimodal.py +11 -8
- sglang/srt/layers/quantization/fp8.py +25 -247
- sglang/srt/layers/quantization/fp8_kernel.py +78 -48
- sglang/srt/layers/quantization/modelopt_quant.py +25 -10
- sglang/srt/layers/quantization/unquant.py +24 -76
- sglang/srt/layers/quantization/w4afp8.py +68 -17
- sglang/srt/lora/lora_registry.py +93 -29
- sglang/srt/managers/cache_controller.py +9 -7
- sglang/srt/managers/mm_utils.py +154 -35
- sglang/srt/managers/multimodal_processor.py +3 -14
- sglang/srt/managers/schedule_batch.py +14 -8
- sglang/srt/managers/scheduler.py +35 -1
- sglang/srt/managers/tokenizer_manager.py +37 -6
- sglang/srt/managers/tp_worker.py +3 -0
- sglang/srt/mem_cache/hiradix_cache.py +5 -2
- sglang/srt/model_executor/model_runner.py +68 -14
- sglang/srt/models/deepseek_v2.py +62 -28
- sglang/srt/models/glm4_moe.py +1035 -0
- sglang/srt/models/glm4_moe_nextn.py +167 -0
- sglang/srt/models/interns1.py +328 -0
- sglang/srt/models/internvl.py +143 -47
- sglang/srt/models/llava.py +9 -5
- sglang/srt/models/minicpmo.py +4 -1
- sglang/srt/models/qwen2_moe.py +2 -2
- sglang/srt/models/qwen3_moe.py +5 -2
- sglang/srt/multimodal/processors/base_processor.py +20 -6
- sglang/srt/multimodal/processors/clip.py +2 -2
- sglang/srt/multimodal/processors/deepseek_vl_v2.py +2 -2
- sglang/srt/multimodal/processors/gemma3.py +2 -2
- sglang/srt/multimodal/processors/gemma3n.py +2 -2
- sglang/srt/multimodal/processors/internvl.py +21 -8
- sglang/srt/multimodal/processors/janus_pro.py +2 -2
- sglang/srt/multimodal/processors/kimi_vl.py +2 -2
- sglang/srt/multimodal/processors/llava.py +4 -4
- sglang/srt/multimodal/processors/minicpm.py +2 -3
- sglang/srt/multimodal/processors/mlama.py +2 -2
- sglang/srt/multimodal/processors/mllama4.py +18 -111
- sglang/srt/multimodal/processors/phi4mm.py +2 -2
- sglang/srt/multimodal/processors/pixtral.py +2 -2
- sglang/srt/multimodal/processors/qwen_audio.py +2 -2
- sglang/srt/multimodal/processors/qwen_vl.py +2 -2
- sglang/srt/multimodal/processors/vila.py +3 -1
- sglang/srt/reasoning_parser.py +2 -1
- sglang/srt/server_args.py +57 -6
- sglang/srt/utils.py +96 -1
- sglang/srt/weight_sync/utils.py +119 -0
- sglang/test/runners.py +4 -0
- sglang/test/test_utils.py +65 -5
- sglang/utils.py +19 -0
- sglang/version.py +1 -1
- {sglang-0.4.9.post4.dist-info → sglang-0.4.9.post5.dist-info}/METADATA +4 -4
- {sglang-0.4.9.post4.dist-info → sglang-0.4.9.post5.dist-info}/RECORD +83 -73
- sglang/srt/debug_utils.py +0 -74
- {sglang-0.4.9.post4.dist-info → sglang-0.4.9.post5.dist-info}/WHEEL +0 -0
- {sglang-0.4.9.post4.dist-info → sglang-0.4.9.post5.dist-info}/licenses/LICENSE +0 -0
- {sglang-0.4.9.post4.dist-info → sglang-0.4.9.post5.dist-info}/top_level.txt +0 -0
sglang/srt/entrypoints/openai/serving_chat.py

@@ -412,6 +412,8 @@ class OpenAIServingChat(OpenAIServingBase):
         is_firsts = {}
         stream_buffers = {}
         n_prev_tokens = {}
+        has_tool_calls = {}
+        finish_reasons = {}
 
         # Usage tracking
         prompt_tokens = {}
@@ -443,6 +445,10 @@ class OpenAIServingChat(OpenAIServingBase):
                 finish_reason = content["meta_info"]["finish_reason"]
                 finish_reason_type = finish_reason["type"] if finish_reason else None
 
+                # Track finish_reason for each index
+                if finish_reason_type:
+                    finish_reasons[index] = finish_reason
+
                 # First chunk with role
                 if is_firsts.get(index, True):
                     is_firsts[index] = False
@@ -450,13 +456,8 @@ class OpenAIServingChat(OpenAIServingBase):
                     choice_data = ChatCompletionResponseStreamChoice(
                         index=index,
                         delta=delta,
-                        finish_reason=
-
-                            finish_reason["matched"]
-                            if finish_reason and "matched" in finish_reason
-                            else None
-                        ),
-                        logprobs=choice_logprobs,
+                        finish_reason=None,
+                        logprobs=None,
                     )
                     chunk = ChatCompletionStreamResponse(
                         id=content["meta_info"]["id"],
@@ -483,7 +484,7 @@ class OpenAIServingChat(OpenAIServingBase):
                     choice_data = ChatCompletionResponseStreamChoice(
                         index=index,
                         delta=DeltaMessage(reasoning_content=reasoning_text),
-                        finish_reason=
+                        finish_reason=None,
                     )
                     chunk = ChatCompletionStreamResponse(
                         id=content["meta_info"]["id"],
@@ -493,45 +494,36 @@ class OpenAIServingChat(OpenAIServingBase):
                     )
                     yield f"data: {chunk.model_dump_json()}\n\n"
 
-                if not delta:
-                    continue
-
                 # Handle tool calls
                 if request.tool_choice != "none" and request.tools:
-                    async for (
-                        chunk,
-                        tool_call_finish_reason_type,
-                    ) in self._process_tool_call_stream(
+                    async for chunk in self._process_tool_call_stream(
                         index,
                         delta,
                         parser_dict,
                         content,
                         request,
-
+                        has_tool_calls,
                     ):
                         if chunk:
                             yield chunk
-
+
+                    # Send any remaining tool call arguments when generation finishes
+                    if finish_reason_type is not None and index in parser_dict:
+                        parser = parser_dict[index]
+                        remaining_chunk = self._check_for_unstreamed_tool_args(
+                            parser, content, request, index
+                        )
+                        if remaining_chunk:
+                            yield remaining_chunk
 
                 else:
                     # Regular content
-                    if delta
-                        request.stream_options and request.stream_options.include_usage
-                    ):
+                    if delta:
                         choice_data = ChatCompletionResponseStreamChoice(
                             index=index,
                             delta=DeltaMessage(content=delta if delta else None),
-                            finish_reason=
-
-                                if request.stream_options
-                                and request.stream_options.include_usage
-                                else finish_reason_type
-                            ),
-                            matched_stop=(
-                                finish_reason["matched"]
-                                if finish_reason and "matched" in finish_reason
-                                else None
-                            ),
+                            finish_reason=None,
+                            matched_stop=None,
                             logprobs=choice_logprobs,
                         )
                         chunk = ChatCompletionStreamResponse(
@@ -542,26 +534,36 @@ class OpenAIServingChat(OpenAIServingBase):
                         )
                         yield f"data: {chunk.model_dump_json()}\n\n"
 
-            #
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
+            # Send finish_reason chunks for each index that completed
+            for idx, finish_reason_data in finish_reasons.items():
+                finish_reason_type = finish_reason_data["type"]
+
+                # Change finish_reason to "tool_calls" if we had tool calls and stopped naturally
+                final_finish_reason = finish_reason_type
+                if has_tool_calls.get(idx, False) and finish_reason_type == "stop":
+                    final_finish_reason = "tool_calls"
+
+                finish_reason_chunk = ChatCompletionStreamResponse(
+                    id=content["meta_info"][
+                        "id"
+                    ],  # NOTE: openai uses the same chatcmpl-id for all indices
+                    created=int(time.time()),
+                    choices=[
+                        ChatCompletionResponseStreamChoice(
+                            index=idx,
+                            delta=DeltaMessage(),
+                            finish_reason=final_finish_reason,
+                            matched_stop=(
+                                finish_reason_data["matched"]
+                                if "matched" in finish_reason_data
+                                else None
+                            ),
+                        )
+                    ],
+                    model=request.model,
+                    usage=None,
+                )
+                yield f"data: {finish_reason_chunk.model_dump_json()}\n\n"
 
             # Send hidden states if requested
             if request.return_hidden_states and hidden_states:
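Taken together, the hunks above change how the chat streaming path reports completion: every content, reasoning, and tool-call delta now goes out with finish_reason=None, while the actual finish reason for each choice index is collected in finish_reasons and emitted once, in a dedicated closing chunk whose reason is upgraded to "tool_calls" when that index streamed tool calls and otherwise stopped with "stop". A minimal standalone sketch of that bookkeeping follows; the helper names and chunk payloads here are illustrative, not part of the package:

# Illustrative sketch of the new finish_reason bookkeeping (hypothetical data).
finish_reasons = {}   # index -> finish_reason dict collected during streaming
has_tool_calls = {}   # index -> True once a tool-call delta was emitted

def on_delta(index, finish_reason, emitted_tool_call):
    # While streaming, deltas are sent with finish_reason=None; only record state.
    if finish_reason is not None:
        finish_reasons[index] = finish_reason
    if emitted_tool_call:
        has_tool_calls[index] = True

def final_chunks():
    # After the generation loop, one closing chunk per completed index.
    for idx, reason in finish_reasons.items():
        final = reason["type"]
        if has_tool_calls.get(idx, False) and final == "stop":
            final = "tool_calls"
        yield {
            "index": idx,
            "delta": {},
            "finish_reason": final,
            "matched_stop": reason.get("matched"),
        }

on_delta(0, None, emitted_tool_call=True)               # a tool-call delta, no finish yet
on_delta(0, {"type": "stop"}, emitted_tool_call=False)   # generation finished
print(list(final_chunks()))
# [{'index': 0, 'delta': {}, 'finish_reason': 'tool_calls', 'matched_stop': None}]

Deferring the finish reason this way mirrors the OpenAI streaming convention, where argument deltas and the terminal chunk carrying finish_reason arrive separately.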
@@ -581,7 +583,7 @@ class OpenAIServingChat(OpenAIServingBase):
                         delta=DeltaMessage(
                             hidden_states=last_token_hidden_states
                         ),
-                        finish_reason=
+                        finish_reason=None,  # Hidden states don't need finish_reason
                     )
                 ],
                 model=request.model,
@@ -860,7 +862,7 @@ class OpenAIServingChat(OpenAIServingBase):
         parser_dict: Dict[int, FunctionCallParser],
         content: Dict[str, Any],
         request: ChatCompletionRequest,
-
+        has_tool_calls: Dict[int, bool],
     ):
         """Process tool calls in streaming response"""
         if index not in parser_dict:
@@ -877,7 +879,7 @@ class OpenAIServingChat(OpenAIServingBase):
            choice_data = ChatCompletionResponseStreamChoice(
                index=index,
                delta=DeltaMessage(content=normal_text),
-                finish_reason=
+                finish_reason=None,
            )
            chunk = ChatCompletionStreamResponse(
                id=content["meta_info"]["id"],
@@ -885,10 +887,13 @@ class OpenAIServingChat(OpenAIServingBase):
                choices=[choice_data],
                model=request.model,
            )
-            yield f"data: {chunk.model_dump_json()}\n\n"
+            yield f"data: {chunk.model_dump_json()}\n\n"
 
        # Yield tool calls
        for call_item in calls:
+            # Mark that this choice has tool calls
+            has_tool_calls[index] = True
+
            # Tool call ID should be generated only once per tool call
            if call_item.name:
                # First chunk: include ID and function name
@@ -899,23 +904,6 @@ class OpenAIServingChat(OpenAIServingBase):
                tool_call_id = None
                function_name = None
 
-            if finish_reason_type == "stop":
-                # Handle remaining arguments
-                latest_delta_len = 0
-                if isinstance(call_item.parameters, str):
-                    latest_delta_len = len(call_item.parameters)
-
-                expected_call = json.dumps(
-                    parser.detector.prev_tool_call_arr[index].get("arguments", {}),
-                    ensure_ascii=False,
-                )
-                actual_call = parser.detector.streamed_args_for_tool[index]
-                if latest_delta_len > 0:
-                    actual_call = actual_call[:-latest_delta_len]
-                remaining_call = expected_call.replace(actual_call, "", 1)
-                call_item.parameters = remaining_call
-                finish_reason_type = "tool_calls"
-
            tool_call = ToolCall(
                id=tool_call_id,
                index=call_item.tool_index,
@@ -928,19 +916,84 @@ class OpenAIServingChat(OpenAIServingBase):
            choice_data = ChatCompletionResponseStreamChoice(
                index=index,
                delta=DeltaMessage(tool_calls=[tool_call]),
-                finish_reason=
-
-
-
+                finish_reason=None,
+            )
+            chunk = ChatCompletionStreamResponse(
+                id=content["meta_info"]["id"],
+                created=int(time.time()),
+                choices=[choice_data],
+                model=request.model,
+            )
+            yield f"data: {chunk.model_dump_json()}\n\n"
+
+    def _check_for_unstreamed_tool_args(
+        self,
+        parser: FunctionCallParser,
+        content: Dict[str, Any],
+        request: ChatCompletionRequest,
+        index: int,
+    ) -> Optional[str]:
+        """
+        Check for any remaining tool call arguments that need to be streamed
+        when generation finishes. This ensures tool calls are properly completed
+        even if the model generates the final arguments in the last chunk.
+        """
+        # Only check if we have tool calls and the parser has tracked data
+        if (
+            not hasattr(parser.detector, "prev_tool_call_arr")
+            or not parser.detector.prev_tool_call_arr
+        ):
+            return None
+
+        if (
+            not hasattr(parser.detector, "streamed_args_for_tool")
+            or not parser.detector.streamed_args_for_tool
+        ):
+            return None
+
+        # Get the last tool call that was being processed
+        tool_index = len(parser.detector.prev_tool_call_arr) - 1
+        if tool_index < 0 or tool_index >= len(parser.detector.streamed_args_for_tool):
+            return None
+
+        # Get expected vs actual arguments
+        expected_args = parser.detector.prev_tool_call_arr[tool_index].get(
+            "arguments", {}
+        )
+        expected_call = json.dumps(expected_args, ensure_ascii=False)
+        actual_call = parser.detector.streamed_args_for_tool[tool_index]
+
+        # Check if there are remaining arguments to send
+        remaining_call = (
+            expected_call.replace(actual_call, "", 1)
+            if actual_call in expected_call
+            else ""
+        )
+
+        if remaining_call:
+            # Create tool call chunk with remaining arguments
+            tool_call = ToolCall(
+                id=None,  # No ID for argument deltas
+                index=tool_index,
+                function=FunctionResponse(
+                    name=None,  # No name for argument deltas
+                    arguments=remaining_call,
                ),
            )
+
+            choice_data = ChatCompletionResponseStreamChoice(
+                index=index,
+                delta=DeltaMessage(tool_calls=[tool_call]),
+                finish_reason=None,  # Don't send finish_reason with this chunk
+            )
+
            chunk = ChatCompletionStreamResponse(
                id=content["meta_info"]["id"],
                created=int(time.time()),
                choices=[choice_data],
                model=request.model,
            )
-            yield f"data: {chunk.model_dump_json()}\n\n", finish_reason_type
 
-
-
+            return f"data: {chunk.model_dump_json()}\n\n"
+
+        return None
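The new _check_for_unstreamed_tool_args helper replaces the inline "remaining arguments" handling removed further up: it compares the complete argument JSON the detector has accumulated against what has already been streamed, and whatever is left over goes out as one final argument delta. A small self-contained illustration of that string comparison (the values are made up):

import json

# What the detector believes the complete arguments are ...
expected_args = {"city": "Paris", "unit": "celsius"}
expected_call = json.dumps(expected_args, ensure_ascii=False)

# ... versus what has already been streamed to the client.
actual_call = '{"city": "Paris", '

# Same logic as the helper: strip the streamed prefix, keep the remainder.
remaining_call = (
    expected_call.replace(actual_call, "", 1) if actual_call in expected_call else ""
)
print(remaining_call)  # '"unit": "celsius"}'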
sglang/srt/function_call/ebnf_composer.py

@@ -165,6 +165,7 @@ class EBNFComposer:
        tool_call_separator: Optional[str] = None,
        call_rule_fmt: Optional[str] = None,
        key_value_rule_fmt: Optional[str] = None,
+        key_value_separator: str = ",",
    ):
        """
        Generalized EBNF builder for all detectors.
@@ -279,7 +280,11 @@ class EBNFComposer:
 
        # Add required properties joined by commas
        if required:
-            rule_parts.append(
+            rule_parts.append(
+                f' "{key_value_separator}" '.join(
+                    prop_kv_pairs[k] for k in required
+                )
+            )
 
        # Add optional properties with flexible ordering
        if optional:
@@ -292,13 +297,15 @@ class EBNFComposer:
                    if j == i:
                        opt_parts.append(prop_kv_pairs[optional[j]])
                    else:
-                        opt_parts.append(
+                        opt_parts.append(
+                            f' ( "{key_value_separator}" {prop_kv_pairs[optional[j]]} )?'
+                        )
                opt_alternatives.append("".join(opt_parts))
 
            # Wrap with appropriate comma handling based on whether we have required properties
            if required:
                # Required properties exist, so optional group needs outer comma
-                rule_parts.append(' ( "
+                rule_parts.append(f' ( "{key_value_separator}" ( ')
                rule_parts.append(" | ".join(opt_alternatives))
                rule_parts.append(" ) )?")
            else:
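The effect of the new key_value_separator parameter is easiest to see in the join it feeds: JSON-style detectors keep the default ",", while XML-style detectors (the new GLM-4.5 detector and Qwen3-coder, below) pass "\n" so key/value rules are separated by a literal newline token in the grammar. A standalone sketch of the difference, with placeholder rule names:

# Hypothetical per-property rules; the composer joins them with the separator.
prop_kv_pairs = {"a": "a_rule", "b": "b_rule"}
required = ["a", "b"]

default_sep = ","   # JSON-style detectors (previous behavior)
glm_sep = "\\n"     # GLM-4.5 / Qwen3-coder XML-style detectors

print(f' "{default_sep}" '.join(prop_kv_pairs[k] for k in required))
# a_rule "," b_rule
print(f' "{glm_sep}" '.join(prop_kv_pairs[k] for k in required))
# a_rule "\n" b_rule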
sglang/srt/function_call/function_call_parser.py

@@ -10,6 +10,7 @@ from sglang.srt.entrypoints.openai.protocol import (
from sglang.srt.function_call.base_format_detector import BaseFormatDetector
from sglang.srt.function_call.core_types import ToolCallItem
from sglang.srt.function_call.deepseekv3_detector import DeepSeekV3Detector
+from sglang.srt.function_call.glm4_moe_detector import Glm4MoeDetector
from sglang.srt.function_call.kimik2_detector import KimiK2Detector
from sglang.srt.function_call.llama32_detector import Llama32Detector
from sglang.srt.function_call.mistral_detector import MistralDetector
@@ -37,6 +38,7 @@ class FunctionCallParser:
        "pythonic": PythonicDetector,
        "kimi_k2": KimiK2Detector,
        "qwen3_coder": Qwen3CoderDetector,
+        "glm45": Glm4MoeDetector,
    }
 
    def __init__(self, tools: List[Tool], tool_call_parser: str):
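With the import and registry entry above, "glm45" can be selected like any other parser name. A minimal construction sketch (an empty tool list just to show the call; real requests pass the request's Tool objects, and the same name is presumably what the existing --tool-call-parser server option accepts):

from sglang.srt.function_call.function_call_parser import FunctionCallParser

# "glm45" now resolves to Glm4MoeDetector via the registry shown above.
parser = FunctionCallParser(tools=[], tool_call_parser="glm45")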
sglang/srt/function_call/glm4_moe_detector.py (new file)

@@ -0,0 +1,164 @@
+import ast
+import json
+import logging
+import re
+from typing import List
+
+from sglang.srt.entrypoints.openai.protocol import Tool
+from sglang.srt.function_call.base_format_detector import BaseFormatDetector
+from sglang.srt.function_call.core_types import (
+    StreamingParseResult,
+    StructureInfo,
+    _GetInfoFunc,
+)
+from sglang.srt.function_call.ebnf_composer import EBNFComposer
+
+logger = logging.getLogger(__name__)
+
+
+def get_argument_type(func_name: str, arg_key: str, defined_tools: list):
+    name2tool = {tool.function.name: tool for tool in defined_tools}
+    if func_name not in name2tool:
+        return None
+    tool = name2tool[func_name]
+    if arg_key not in tool.function.parameters["properties"]:
+        return None
+    return tool.function.parameters["properties"][arg_key].get("type", None)
+
+
+def parse_arguments(json_value):
+    try:
+        try:
+            parsed_value = json.loads(json_value)
+        except:
+            parsed_value = ast.literal_eval(json_value)
+        return parsed_value, True
+    except:
+        return json_value, False
+
+
+class Glm4MoeDetector(BaseFormatDetector):
+    """
+    Detector for GLM-4.5 models.
+    Assumes function call format:
+    <tool_call>get_weather\n<arg_key>city</arg_key>\n<arg_value>北京</arg_value>\n<arg_key>date</arg_key>\n<arg_value>2024-06-27</arg_value>\n</tool_call>\n<tool_call>get_weather\n<arg_key>city</arg_key>\n<arg_value>上海</arg_value>\n<arg_key>date</arg_key>\n<arg_value>2024-06-27</arg_value>\n</tool_call>
+    """
+
+    def __init__(self):
+        super().__init__()
+        self.bot_token = "<tool_call>"
+        self.eot_token = "</tool_call>"
+        self.func_call_regex = r"<tool_call>.*?</tool_call>"
+        self.func_detail_regex = r"<tool_call>([^\n]*)\n(.*)</tool_call>"
+        self.func_arg_regex = r"<arg_key>(.*?)</arg_key>\s*<arg_value>(.*?)</arg_value>"
+
+    def has_tool_call(self, text: str) -> bool:
+        """Check if the text contains a glm-4.5 format tool call."""
+        return self.bot_token in text
+
+    def detect_and_parse(self, text: str, tools: List[Tool]) -> StreamingParseResult:
+        """
+        One-time parsing: Detects and parses tool calls in the provided text.
+
+        :param text: The complete text to parse.
+        :param tools: List of available tools.
+        :return: ParseResult indicating success or failure, consumed text, leftover text, and parsed calls.
+        """
+        idx = text.find(self.bot_token)
+        normal_text = text[:idx].strip() if idx != -1 else text
+        if self.bot_token not in text:
+            return StreamingParseResult(normal_text=normal_text, calls=[])
+        match_result_list = re.findall(self.func_call_regex, text, re.DOTALL)
+        calls = []
+        try:
+            for match_result in match_result_list:
+                # Get function name
+                func_detail = re.search(self.func_detail_regex, match_result, re.DOTALL)
+                func_name = func_detail.group(1)
+                func_args = func_detail.group(2)
+                pairs = re.findall(
+                    r"<arg_key>(.*?)</arg_key>\s*<arg_value>(.*?)</arg_value>",
+                    func_args,
+                    re.DOTALL,
+                )
+                arguments = {}
+                for arg_key, arg_value in pairs:
+                    arg_key = arg_key.strip()
+                    arg_value = arg_value.strip()
+                    arg_type = get_argument_type(func_name, arg_key, tools)
+                    if arg_type != "string":
+                        arg_value, is_good_json = parse_arguments(arg_value)
+                    arguments[arg_key] = arg_value
+                # construct match_result for parse_base_json
+                match_result = {"name": func_name, "parameters": arguments}
+                calls.extend(self.parse_base_json(match_result, tools))
+            return StreamingParseResult(normal_text=normal_text, calls=calls)
+        except Exception as e:
+            logger.error(f"Error in detect_and_parse: {e}")
+            # return the normal text if parsing fails
+            return StreamingParseResult(normal_text=text)
+
+    def parse_streaming_increment(
+        self, new_text: str, tools: List[Tool]
+    ) -> StreamingParseResult:
+        """
+        Streaming incremental parsing tool calls for GLM-4.5 format.
+        """
+        self._buffer += new_text
+        current_text = self._buffer
+
+        start = current_text.find(self.bot_token)
+        if start == -1:
+            self._buffer = ""
+            if self.current_tool_id > 0:
+                current_text = ""
+            return StreamingParseResult(normal_text=current_text)
+        # find ensures we find the first self.eot_token so there will be at most one tool_call in current_text[:end+len(self.eot_token)
+        end = current_text.find(self.eot_token)
+        if end != -1:
+            # Initialize state if this is the first tool call
+            if self.current_tool_id == -1:
+                self.current_tool_id = 0
+                self.prev_tool_call_arr = []
+                self.streamed_args_for_tool = [""]
+            # Ensure we have enough entries in our tracking arrays
+            while len(self.prev_tool_call_arr) <= self.current_tool_id:
+                self.prev_tool_call_arr.append({})
+            while len(self.streamed_args_for_tool) <= self.current_tool_id:
+                self.streamed_args_for_tool.append("")
+            result = self.detect_and_parse(
+                current_text[: end + len(self.eot_token)], tools=tools
+            )
+            if result.calls:
+                self.prev_tool_call_arr[self.current_tool_id] = {
+                    "name": result.calls[0].name,
+                    "arguments": json.loads(result.calls[0].parameters),
+                }
+                self.streamed_args_for_tool[self.current_tool_id] = result.calls[
+                    0
+                ].parameters
+                result.calls[0].tool_index = self.current_tool_id
+                self.current_tool_id += 1
+            self._buffer = current_text[end + len(self.eot_token) :]
+            return result
+        normal_text = current_text[:start]
+        self._buffer = current_text[start:]
+        return StreamingParseResult(normal_text=normal_text)
+
+    def supports_structural_tag(self) -> bool:
+        return False
+
+    def structure_info(self) -> _GetInfoFunc:
+        raise NotImplementedError()
+
+    def build_ebnf(self, tools: List[Tool]):
+        return EBNFComposer.build_ebnf(
+            tools,
+            individual_call_start_token=self.bot_token,
+            individual_call_end_token=self.eot_token,
+            tool_call_separator="\\n",
+            function_format="xml",
+            call_rule_fmt='"{name}" "\\n" {arguments_rule} "\\n"',
+            key_value_rule_fmt='"<arg_key>{key}</arg_key>" "\\n" "<arg_value>" {valrule} "</arg_value>"',
+            key_value_separator="\\n",
+        )
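For reference, the detector's one-shot parsing is regex-driven; the standalone snippet below (no sglang imports) applies the same two patterns to a sample GLM-4.5 tool call to show what ends up as the function name and the raw argument pairs:

import re

sample = (
    "<tool_call>get_weather\n"
    "<arg_key>city</arg_key>\n<arg_value>Beijing</arg_value>\n"
    "<arg_key>date</arg_key>\n<arg_value>2024-06-27</arg_value>\n"
    "</tool_call>"
)

# Same patterns as Glm4MoeDetector.func_detail_regex / func_arg_regex.
func_detail_regex = r"<tool_call>([^\n]*)\n(.*)</tool_call>"
func_arg_regex = r"<arg_key>(.*?)</arg_key>\s*<arg_value>(.*?)</arg_value>"

name, body = re.search(func_detail_regex, sample, re.DOTALL).groups()
args = {k.strip(): v.strip() for k, v in re.findall(func_arg_regex, body, re.DOTALL)}
print(name, args)  # get_weather {'city': 'Beijing', 'date': '2024-06-27'}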
sglang/srt/function_call/qwen3_coder_detector.py

@@ -148,4 +148,5 @@ class Qwen3CoderDetector(BaseFormatDetector):
            function_format="xml",
            call_rule_fmt='"<function={name}>\\n" {arguments_rule} "\\n</function>"',
            key_value_rule_fmt='"<parameter={key}>\\n" {valrule} "\\n</parameter>"',
+            key_value_separator="\\n",
        )
sglang/srt/layers/attention/hybrid_attn_backend.py (new file)

@@ -0,0 +1,100 @@
+from typing import TYPE_CHECKING, Optional, Union
+
+import torch
+
+from sglang.srt.layers.attention.base_attn_backend import AttentionBackend
+from sglang.srt.layers.radix_attention import RadixAttention
+from sglang.srt.model_executor.forward_batch_info import ForwardBatch, ForwardMode
+from sglang.srt.speculative.eagle_utils import EagleDraftInput, EagleVerifyInput
+
+
+class HybridAttnBackend(AttentionBackend):
+    """Support different backends for prefill and decode."""
+
+    def __init__(
+        self, prefill_backend: AttentionBackend, decode_backend: AttentionBackend
+    ):
+        self.prefill_backend = prefill_backend
+        self.decode_backend = decode_backend
+
+    def init_forward_metadata(self, forward_batch: ForwardBatch):
+        if forward_batch.forward_mode.is_decode():
+            self.decode_backend.init_forward_metadata(forward_batch)
+        else:
+            self.prefill_backend.init_forward_metadata(forward_batch)
+
+    def init_cuda_graph_state(self, max_bs: int, max_num_tokens: int):
+        self.decode_backend.init_cuda_graph_state(max_bs, max_num_tokens)
+
+    def init_forward_metadata_capture_cuda_graph(
+        self,
+        bs: int,
+        num_tokens: int,
+        req_pool_indices: torch.Tensor,
+        seq_lens: torch.Tensor,
+        encoder_lens: Optional[torch.Tensor],
+        forward_mode: ForwardMode,
+        spec_info: Optional[Union[EagleDraftInput, EagleVerifyInput]],
+    ):
+        self.decode_backend.init_forward_metadata_capture_cuda_graph(
+            bs,
+            num_tokens,
+            req_pool_indices,
+            seq_lens,
+            encoder_lens,
+            forward_mode,
+            spec_info,
+        )
+
+    def init_forward_metadata_replay_cuda_graph(
+        self,
+        bs: int,
+        req_pool_indices: torch.Tensor,
+        seq_lens: torch.Tensor,
+        seq_lens_sum: int,
+        encoder_lens: Optional[torch.Tensor],
+        forward_mode: ForwardMode,
+        spec_info: Optional[Union[EagleDraftInput, EagleVerifyInput]],
+        seq_lens_cpu: Optional[torch.Tensor],
+    ):
+        self.decode_backend.init_forward_metadata_replay_cuda_graph(
+            bs,
+            req_pool_indices,
+            seq_lens,
+            seq_lens_sum,
+            encoder_lens,
+            forward_mode,
+            spec_info,
+            seq_lens_cpu,
+        )
+
+    def get_cuda_graph_seq_len_fill_value(self):
+        return self.decode_backend.get_cuda_graph_seq_len_fill_value()
+
+    def forward_decode(
+        self,
+        q: torch.Tensor,
+        k: torch.Tensor,
+        v: torch.Tensor,
+        layer: RadixAttention,
+        forward_batch: ForwardBatch,
+        save_kv_cache: bool = True,
+        **kwargs,
+    ):
+        return self.decode_backend.forward_decode(
+            q, k, v, layer, forward_batch, save_kv_cache, **kwargs
+        )
+
+    def forward_extend(
+        self,
+        q: torch.Tensor,
+        k: torch.Tensor,
+        v: torch.Tensor,
+        layer: RadixAttention,
+        forward_batch: ForwardBatch,
+        save_kv_cache: bool = True,
+        **kwargs,
+    ):
+        return self.prefill_backend.forward_extend(
+            q, k, v, layer, forward_batch, save_kv_cache, **kwargs
+        )
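HybridAttnBackend is a pure dispatcher: decode-path and CUDA-graph metadata calls go to the decode backend, while extend (prefill) calls go to the prefill backend. A standalone sketch of that routing pattern with stub backends; the backend names are illustrative, and the real backends are constructed elsewhere by the model runner:

# Stand-ins for real attention backends; they just report which path ran.
class _StubBackend:
    def __init__(self, name):
        self.name = name

    def forward_decode(self, *args, **kwargs):
        return f"decode handled by {self.name}"

    def forward_extend(self, *args, **kwargs):
        return f"extend handled by {self.name}"


class HybridSketch:
    """Mirrors HybridAttnBackend's routing: decode -> decode_backend, extend -> prefill_backend."""

    def __init__(self, prefill_backend, decode_backend):
        self.prefill_backend = prefill_backend
        self.decode_backend = decode_backend

    def forward_decode(self, *args, **kwargs):
        return self.decode_backend.forward_decode(*args, **kwargs)

    def forward_extend(self, *args, **kwargs):
        return self.prefill_backend.forward_extend(*args, **kwargs)


hybrid = HybridSketch(
    prefill_backend=_StubBackend("prefill-backend"),
    decode_backend=_StubBackend("decode-backend"),
)
print(hybrid.forward_extend())  # extend handled by prefill-backend
print(hybrid.forward_decode())  # decode handled by decode-backend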