sglang 0.4.9.post2__py3-none-any.whl → 0.4.9.post4__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- sglang/bench_one_batch.py +2 -1
- sglang/eval/loogle_eval.py +7 -0
- sglang/srt/_custom_ops.py +29 -1
- sglang/srt/configs/deepseekvl2.py +11 -2
- sglang/srt/configs/internvl.py +3 -0
- sglang/srt/configs/janus_pro.py +3 -0
- sglang/srt/configs/model_config.py +10 -8
- sglang/srt/configs/update_config.py +3 -1
- sglang/srt/conversation.py +2 -1
- sglang/srt/custom_op.py +5 -2
- sglang/srt/disaggregation/common/conn.py +34 -6
- sglang/srt/disaggregation/decode.py +9 -1
- sglang/srt/disaggregation/mini_lb.py +3 -2
- sglang/srt/disaggregation/mooncake/conn.py +93 -76
- sglang/srt/disaggregation/mooncake/transfer_engine.py +4 -2
- sglang/srt/disaggregation/nixl/conn.py +17 -13
- sglang/srt/distributed/device_communicators/custom_all_reduce.py +3 -91
- sglang/srt/distributed/device_communicators/custom_all_reduce_utils.py +96 -1
- sglang/srt/distributed/device_communicators/quick_all_reduce.py +273 -0
- sglang/srt/distributed/device_communicators/shm_broadcast.py +12 -5
- sglang/srt/distributed/parallel_state.py +103 -15
- sglang/srt/entrypoints/engine.py +31 -33
- sglang/srt/entrypoints/http_server.py +20 -32
- sglang/srt/entrypoints/openai/protocol.py +3 -3
- sglang/srt/entrypoints/openai/serving_chat.py +48 -6
- sglang/srt/eplb/expert_location_dispatch.py +1 -1
- sglang/srt/function_call/base_format_detector.py +74 -12
- sglang/srt/function_call/deepseekv3_detector.py +26 -11
- sglang/srt/function_call/ebnf_composer.py +95 -63
- sglang/srt/function_call/function_call_parser.py +4 -2
- sglang/srt/function_call/kimik2_detector.py +41 -16
- sglang/srt/function_call/llama32_detector.py +6 -3
- sglang/srt/function_call/mistral_detector.py +11 -3
- sglang/srt/function_call/pythonic_detector.py +16 -14
- sglang/srt/function_call/qwen25_detector.py +12 -3
- sglang/srt/function_call/qwen3_coder_detector.py +151 -0
- sglang/srt/hf_transformers_utils.py +0 -1
- sglang/srt/layers/activation.py +24 -3
- sglang/srt/layers/attention/base_attn_backend.py +3 -1
- sglang/srt/layers/attention/flashattention_backend.py +3 -3
- sglang/srt/layers/attention/flashinfer_backend.py +40 -1
- sglang/srt/layers/communicator.py +12 -12
- sglang/srt/layers/dp_attention.py +72 -24
- sglang/srt/layers/linear.py +13 -102
- sglang/srt/layers/logits_processor.py +34 -24
- sglang/srt/layers/moe/ep_moe/kernels.py +4 -2
- sglang/srt/layers/moe/ep_moe/layer.py +23 -402
- sglang/srt/layers/moe/fused_moe_native.py +7 -47
- sglang/srt/layers/moe/fused_moe_triton/__init__.py +4 -4
- sglang/srt/layers/moe/fused_moe_triton/configs/triton_3_3_1/E=160,N=320,device_name=NVIDIA_H20-3e.json +146 -0
- sglang/srt/layers/moe/fused_moe_triton/configs/triton_3_3_1/E=384,N=128,device_name=NVIDIA_H20,dtype=fp8_w8a8,block_shape=[128, 128].json +146 -0
- sglang/srt/layers/moe/fused_moe_triton/configs/triton_3_3_1/E=384,N=128,device_name=NVIDIA_H200,dtype=fp8_w8a8,block_shape=[128, 128].json +146 -0
- sglang/srt/layers/moe/fused_moe_triton/configs/triton_3_3_1/E=384,N=256,device_name=NVIDIA_H20-3e,dtype=fp8_w8a8,block_shape=[128, 128].json +146 -0
- sglang/srt/layers/moe/fused_moe_triton/configs/triton_3_3_1/E=385,N=128,device_name=NVIDIA_B200,dtype=fp8_w8a8,block_shape=[128, 128].json +146 -0
- sglang/srt/layers/moe/fused_moe_triton/configs/triton_3_3_1/E=385,N=128,device_name=NVIDIA_H200,dtype=fp8_w8a8,block_shape=[128, 128].json +146 -0
- sglang/srt/layers/moe/fused_moe_triton/fused_moe.py +54 -263
- sglang/srt/layers/moe/fused_moe_triton/layer.py +14 -396
- sglang/srt/layers/moe/topk.py +190 -23
- sglang/srt/layers/quantization/__init__.py +20 -134
- sglang/srt/layers/quantization/awq.py +578 -11
- sglang/srt/layers/quantization/awq_triton.py +339 -0
- sglang/srt/layers/quantization/base_config.py +85 -10
- sglang/srt/layers/quantization/blockwise_int8.py +17 -55
- sglang/srt/layers/quantization/compressed_tensors/compressed_tensors.py +13 -11
- sglang/srt/layers/quantization/compressed_tensors/compressed_tensors_moe.py +23 -79
- sglang/srt/layers/quantization/fp8.py +273 -62
- sglang/srt/layers/quantization/fp8_kernel.py +210 -46
- sglang/srt/layers/quantization/fp8_utils.py +2 -2
- sglang/srt/layers/quantization/gptq.py +501 -143
- sglang/srt/layers/quantization/marlin_utils.py +790 -0
- sglang/srt/layers/quantization/modelopt_quant.py +34 -112
- sglang/srt/layers/quantization/moe_wna16.py +45 -49
- sglang/srt/layers/quantization/petit.py +252 -0
- sglang/srt/layers/quantization/petit_utils.py +104 -0
- sglang/srt/layers/quantization/qoq.py +7 -6
- sglang/srt/layers/quantization/scalar_type.py +352 -0
- sglang/srt/layers/quantization/unquant.py +422 -0
- sglang/srt/layers/quantization/utils.py +340 -9
- sglang/srt/layers/quantization/w4afp8.py +8 -4
- sglang/srt/layers/quantization/w8a8_fp8.py +17 -51
- sglang/srt/layers/quantization/w8a8_int8.py +51 -115
- sglang/srt/layers/radix_attention.py +5 -3
- sglang/srt/layers/vocab_parallel_embedding.py +1 -41
- sglang/srt/lora/lora.py +0 -4
- sglang/srt/lora/lora_manager.py +162 -164
- sglang/srt/lora/lora_registry.py +124 -0
- sglang/srt/lora/mem_pool.py +83 -35
- sglang/srt/lora/utils.py +12 -5
- sglang/srt/managers/cache_controller.py +288 -0
- sglang/srt/managers/io_struct.py +60 -30
- sglang/srt/managers/mm_utils.py +7 -8
- sglang/srt/managers/schedule_batch.py +163 -113
- sglang/srt/managers/schedule_policy.py +68 -27
- sglang/srt/managers/scheduler.py +256 -86
- sglang/srt/managers/scheduler_output_processor_mixin.py +22 -4
- sglang/srt/managers/tokenizer_manager.py +38 -27
- sglang/srt/managers/tp_worker.py +16 -4
- sglang/srt/managers/tp_worker_overlap_thread.py +11 -0
- sglang/srt/mem_cache/allocator.py +74 -23
- sglang/srt/mem_cache/base_prefix_cache.py +14 -2
- sglang/srt/mem_cache/chunk_cache.py +5 -2
- sglang/srt/mem_cache/hicache_storage.py +168 -0
- sglang/srt/mem_cache/hiradix_cache.py +194 -5
- sglang/srt/mem_cache/memory_pool.py +16 -1
- sglang/srt/mem_cache/memory_pool_host.py +44 -2
- sglang/srt/mem_cache/radix_cache.py +26 -0
- sglang/srt/mem_cache/swa_radix_cache.py +1025 -0
- sglang/srt/metrics/collector.py +9 -0
- sglang/srt/model_executor/cuda_graph_runner.py +66 -31
- sglang/srt/model_executor/forward_batch_info.py +210 -25
- sglang/srt/model_executor/model_runner.py +147 -42
- sglang/srt/model_loader/loader.py +7 -1
- sglang/srt/model_loader/utils.py +4 -4
- sglang/srt/models/clip.py +1 -1
- sglang/srt/models/deepseek.py +9 -6
- sglang/srt/models/deepseek_janus_pro.py +1 -1
- sglang/srt/models/deepseek_v2.py +192 -173
- sglang/srt/models/deepseek_vl2.py +5 -5
- sglang/srt/models/gemma.py +48 -0
- sglang/srt/models/gemma2.py +52 -0
- sglang/srt/models/gemma3_causal.py +63 -0
- sglang/srt/models/gemma3_mm.py +1 -1
- sglang/srt/models/gemma3n_mm.py +2 -4
- sglang/srt/models/granitemoe.py +385 -0
- sglang/srt/models/grok.py +9 -3
- sglang/srt/models/hunyuan.py +63 -16
- sglang/srt/models/internvl.py +1 -1
- sglang/srt/models/kimi_vl.py +1 -1
- sglang/srt/models/llama.py +41 -0
- sglang/srt/models/llama4.py +11 -11
- sglang/srt/models/llava.py +2 -2
- sglang/srt/models/llavavid.py +1 -1
- sglang/srt/models/minicpm.py +0 -2
- sglang/srt/models/minicpmo.py +3 -7
- sglang/srt/models/minicpmv.py +1 -1
- sglang/srt/models/mistral.py +1 -1
- sglang/srt/models/mixtral.py +9 -2
- sglang/srt/models/mllama.py +3 -5
- sglang/srt/models/mllama4.py +13 -6
- sglang/srt/models/olmoe.py +8 -5
- sglang/srt/models/persimmon.py +330 -0
- sglang/srt/models/phi.py +321 -0
- sglang/srt/models/phi4mm.py +44 -4
- sglang/srt/models/phi4mm_audio.py +1260 -0
- sglang/srt/models/phi4mm_utils.py +1917 -0
- sglang/srt/models/phimoe.py +9 -3
- sglang/srt/models/qwen.py +37 -0
- sglang/srt/models/qwen2.py +41 -0
- sglang/srt/models/qwen2_5_vl.py +4 -4
- sglang/srt/models/qwen2_audio.py +1 -1
- sglang/srt/models/qwen2_moe.py +53 -9
- sglang/srt/models/qwen2_vl.py +4 -4
- sglang/srt/models/qwen3.py +65 -1
- sglang/srt/models/qwen3_moe.py +57 -24
- sglang/srt/models/vila.py +1 -1
- sglang/srt/multimodal/processors/base_processor.py +91 -97
- sglang/srt/multimodal/processors/clip.py +21 -19
- sglang/srt/multimodal/processors/deepseek_vl_v2.py +8 -26
- sglang/srt/multimodal/processors/gemma3.py +13 -17
- sglang/srt/multimodal/processors/gemma3n.py +19 -23
- sglang/srt/multimodal/processors/internvl.py +9 -10
- sglang/srt/multimodal/processors/janus_pro.py +12 -27
- sglang/srt/multimodal/processors/kimi_vl.py +12 -14
- sglang/srt/multimodal/processors/llava.py +4 -2
- sglang/srt/multimodal/processors/minicpm.py +35 -44
- sglang/srt/multimodal/processors/mlama.py +21 -18
- sglang/srt/multimodal/processors/mllama4.py +4 -5
- sglang/srt/multimodal/processors/phi4mm.py +63 -39
- sglang/srt/multimodal/processors/pixtral.py +14 -35
- sglang/srt/multimodal/processors/qwen_audio.py +65 -0
- sglang/srt/multimodal/processors/qwen_vl.py +16 -21
- sglang/srt/multimodal/processors/vila.py +14 -14
- sglang/srt/reasoning_parser.py +46 -4
- sglang/srt/sampling/sampling_batch_info.py +6 -5
- sglang/srt/sampling/sampling_params.py +8 -1
- sglang/srt/server_args.py +454 -270
- sglang/srt/speculative/eagle_draft_cuda_graph_runner.py +33 -28
- sglang/srt/speculative/eagle_draft_extend_cuda_graph_runner.py +46 -37
- sglang/srt/speculative/eagle_utils.py +51 -23
- sglang/srt/speculative/eagle_worker.py +59 -44
- sglang/srt/two_batch_overlap.py +10 -5
- sglang/srt/utils.py +44 -69
- sglang/test/runners.py +14 -3
- sglang/test/test_activation.py +50 -1
- sglang/test/test_block_fp8.py +8 -3
- sglang/test/test_block_fp8_ep.py +1 -1
- sglang/test/test_custom_ops.py +12 -7
- sglang/test/test_cutlass_w4a8_moe.py +1 -3
- sglang/test/test_fp4_moe.py +1 -3
- sglang/test/test_marlin_moe.py +286 -0
- sglang/test/test_marlin_utils.py +171 -0
- sglang/test/test_utils.py +35 -0
- sglang/version.py +1 -1
- {sglang-0.4.9.post2.dist-info → sglang-0.4.9.post4.dist-info}/METADATA +10 -10
- {sglang-0.4.9.post2.dist-info → sglang-0.4.9.post4.dist-info}/RECORD +198 -175
- sglang/srt/layers/quantization/quant_utils.py +0 -166
- sglang/srt/managers/multimodal_processors/qwen_audio.py +0 -94
- {sglang-0.4.9.post2.dist-info → sglang-0.4.9.post4.dist-info}/WHEEL +0 -0
- {sglang-0.4.9.post2.dist-info → sglang-0.4.9.post4.dist-info}/licenses/LICENSE +0 -0
- {sglang-0.4.9.post2.dist-info → sglang-0.4.9.post4.dist-info}/top_level.txt +0 -0
@@ -1,51 +1,73 @@
|
|
1
|
-
from typing import Literal, Optional
|
1
|
+
from typing import Any, Dict, Literal, Optional
|
2
2
|
|
3
3
|
|
4
4
|
class EBNFComposer:
|
5
5
|
# Adapted from https://xgrammar.mlc.ai/docs/how_to/ebnf_guided_generation.html#try-out-via-hf-transformers
|
6
|
-
|
7
|
-
|
8
|
-
basic_any ::= basic_number | basic_string | basic_boolean | basic_null | basic_array | basic_object
|
9
|
-
basic_integer ::= ("0" | "-"? [1-9] [0-9]*) ".0"?
|
10
|
-
basic_number ::= ("0" | "-"? [1-9] [0-9]*) ("." [0-9]+)? ([eE] [+-]? [0-9]+)?
|
6
|
+
# Shared primitive grammar rules used across all formats
|
7
|
+
BASE_PRIMITIVE_GRAMMAR = r"""
|
11
8
|
basic_string ::= (([\"] basic_string_1 [\"]))
|
12
9
|
basic_string_1 ::= "" | [^"\\\x00-\x1F] basic_string_1 | "\\" escape basic_string_1
|
13
|
-
escape ::= ["\\/bfnrt] | "u" [A-Fa-f0-9]
|
14
|
-
|
15
|
-
|
10
|
+
escape ::= ["\\/bfnrt] | "u" [A-Fa-f0-9]{4}
|
11
|
+
basic_integer ::= ("0" | "-"? [1-9] [0-9]*) ".0"?
|
12
|
+
basic_number ::= ("0" | "-"? [1-9] [0-9]*) ("." [0-9]+)? ([eE] [+-]? [0-9]+)?
|
16
13
|
basic_array ::= "[" ("" | ws basic_any (ws "," ws basic_any)*) ws "]"
|
17
14
|
basic_object ::= "{" ("" | ws basic_string ws ":" ws basic_any ( ws "," ws basic_string ws ":" ws basic_any)*) ws "}"
|
18
15
|
ws ::= [ \n\t]*
|
19
|
-
|
16
|
+
"""
|
20
17
|
|
21
|
-
|
18
|
+
# Format-specific extensions
|
19
|
+
json_grammar_ebnf_str = (
|
20
|
+
r"""
|
21
|
+
json ::= basic_array | basic_object
|
22
|
+
basic_any ::= basic_number | basic_string | basic_boolean | basic_null | basic_array | basic_object
|
23
|
+
basic_boolean ::= "true" | "false"
|
24
|
+
basic_null ::= "null"
|
25
|
+
"""
|
26
|
+
+ BASE_PRIMITIVE_GRAMMAR
|
27
|
+
)
|
28
|
+
|
29
|
+
pythonic_grammar_ebnf_str = (
|
30
|
+
r"""
|
22
31
|
pythonic ::= basic_number | basic_string | basic_array | "True" | "False" | "None"
|
23
32
|
basic_any ::= basic_number | basic_string | basic_array | basic_object
|
24
|
-
|
25
|
-
|
26
|
-
|
27
|
-
|
28
|
-
|
29
|
-
|
30
|
-
|
33
|
+
basic_boolean ::= "True" | "False"
|
34
|
+
basic_null ::= "None"
|
35
|
+
"""
|
36
|
+
+ BASE_PRIMITIVE_GRAMMAR
|
37
|
+
)
|
38
|
+
|
39
|
+
xml_grammar_ebnf_str = (
|
40
|
+
r"""
|
41
|
+
xml ::= xml_element | xml_text
|
42
|
+
xml_element ::= basic_string | basic_number | basic_boolean | basic_null | basic_array | basic_object
|
43
|
+
xml_text ::= [^<>]*
|
44
|
+
basic_any ::= basic_number | basic_string | basic_boolean | basic_null | basic_array | basic_object
|
45
|
+
basic_boolean ::= "true" | "false"
|
46
|
+
basic_null ::= "null"
|
31
47
|
"""
|
48
|
+
+ BASE_PRIMITIVE_GRAMMAR
|
49
|
+
)
|
32
50
|
|
33
51
|
CALL_RULE_MAP = {
|
34
52
|
"pythonic": 'call_{name} ::= "{name}" "(" {arguments_rule} ")"',
|
35
53
|
"json": 'call_{name} ::= "{{" "\\"name\\"" ":" "\\"{name}\\"" ", " "\\"arguments\\"" ":" {arguments_rule} "}}"',
|
54
|
+
"xml": 'call_{name} ::= "<function={name}>\\n" {arguments_rule} "\\n</function>"',
|
36
55
|
}
|
37
56
|
|
38
57
|
ARGUMENTS_RULE_MAP = {
|
39
58
|
"pythonic": "{arg_rules}",
|
40
59
|
"json": '"{{" {arg_rules} "}}"',
|
60
|
+
"xml": "{arg_rules}",
|
41
61
|
}
|
42
62
|
|
43
63
|
KEY_VALUE_RULE_MAP = {
|
44
64
|
"pythonic": '"{key}" "=" {valrule}',
|
45
65
|
"json": '"\\"{key}\\"" ":" {valrule}',
|
66
|
+
"xml": '"<parameter={key}>\\n" {valrule} "\\n</parameter>"',
|
46
67
|
}
|
47
68
|
|
48
|
-
|
69
|
+
# Base type mapping - most types are the same across formats
|
70
|
+
BASE_TYPE_MAPPING = {
|
49
71
|
"string": "basic_string",
|
50
72
|
"number": "basic_number",
|
51
73
|
"integer": "basic_number",
|
@@ -55,19 +77,20 @@ class EBNFComposer:
|
|
55
77
|
"object": "basic_object",
|
56
78
|
}
|
57
79
|
|
58
|
-
|
59
|
-
|
60
|
-
"
|
61
|
-
|
62
|
-
|
63
|
-
|
64
|
-
"
|
65
|
-
|
80
|
+
# Format-specific overrides for types that differ
|
81
|
+
FORMAT_TYPE_OVERRIDES = {
|
82
|
+
"pythonic": {
|
83
|
+
"boolean": '"True" | "False"',
|
84
|
+
"null": '"None"',
|
85
|
+
},
|
86
|
+
"xml": {
|
87
|
+
"string": "xml_text",
|
88
|
+
},
|
66
89
|
}
|
67
90
|
|
68
91
|
@staticmethod
|
69
92
|
def get_value_rule(
|
70
|
-
prop: dict, function_format: Literal["pythonic", "json"] = "json"
|
93
|
+
prop: dict, function_format: Literal["pythonic", "json", "xml"] = "json"
|
71
94
|
) -> str:
|
72
95
|
if "enum" in prop:
|
73
96
|
return EBNFComposer._handle_enum(prop, function_format)
|
@@ -83,48 +106,46 @@ class EBNFComposer:
|
|
83
106
|
enum_values = prop["enum"]
|
84
107
|
prop_type = prop.get("type", "string")
|
85
108
|
|
86
|
-
|
87
|
-
|
88
|
-
|
89
|
-
|
90
|
-
|
91
|
-
|
92
|
-
|
93
|
-
|
94
|
-
("boolean", "json"): lambda v: "true" if v else "false",
|
95
|
-
("boolean", "pythonic"): lambda v: "True" if v else "False",
|
96
|
-
}
|
109
|
+
def format_enum_val(v: Any) -> str:
|
110
|
+
if prop_type == "boolean":
|
111
|
+
if function_format == "json" or function_format == "xml":
|
112
|
+
return "true" if v else "false"
|
113
|
+
elif function_format == "pythonic":
|
114
|
+
return "True" if v else "False"
|
115
|
+
else:
|
116
|
+
return str(v) # fallback
|
97
117
|
|
98
|
-
|
99
|
-
|
100
|
-
|
101
|
-
|
102
|
-
|
118
|
+
if prop_type == "string":
|
119
|
+
if function_format == "xml":
|
120
|
+
return f'"{v}"'
|
121
|
+
else: # json or pythonic
|
122
|
+
return f'"\\"{v}\\""' # escape quote-wrapped string
|
103
123
|
|
104
|
-
|
105
|
-
|
124
|
+
# All other types (number, integer, etc.)
|
125
|
+
return str(v)
|
106
126
|
|
107
|
-
|
108
|
-
|
109
|
-
|
127
|
+
formatted_values = [format_enum_val(v) for v in enum_values]
|
128
|
+
enum_rule = " | ".join(formatted_values)
|
129
|
+
return f"({enum_rule})" if len(formatted_values) > 1 else enum_rule
|
110
130
|
|
111
|
-
|
131
|
+
@staticmethod
def get_type_mapping(function_format: str) -> Dict[str, str]:
    """Build the type-to-rule mapping that applies to ``function_format``.

    Starts from a copy of ``BASE_TYPE_MAPPING`` and layers on the entries
    registered for the format in ``FORMAT_TYPE_OVERRIDES``. Override
    entries whose value is ``None`` are ignored, and a format without any
    registered overrides yields a plain copy of the base mapping.
    """
    merged = dict(EBNFComposer.BASE_TYPE_MAPPING)
    format_overrides = EBNFComposer.FORMAT_TYPE_OVERRIDES.get(function_format, {})
    for type_name, rule in format_overrides.items():
        # A None override means "keep whatever the base mapping says".
        if rule is not None:
            merged[type_name] = rule
    return merged
|
112
138
|
|
113
139
|
@staticmethod
|
114
140
|
def _handle_type(prop: dict, function_format: str) -> str:
|
115
141
|
"""Handle type properties using the appropriate type mapping."""
|
116
142
|
prop_type = prop["type"]
|
117
|
-
type_mapping = (
|
118
|
-
EBNFComposer.PYTHONIC_TYPE_MAPPING
|
119
|
-
if function_format == "pythonic"
|
120
|
-
else EBNFComposer.JSON_TYPE_MAPPING
|
121
|
-
)
|
143
|
+
type_mapping = EBNFComposer.get_type_mapping(function_format)
|
122
144
|
|
123
145
|
if isinstance(prop_type, list):
|
124
146
|
type_rules = [
|
125
|
-
type_mapping
|
147
|
+
type_mapping.get(single_type, function_format)
|
126
148
|
for single_type in prop_type
|
127
|
-
if single_type in type_mapping
|
128
149
|
]
|
129
150
|
return " | ".join(type_rules) if type_rules else function_format
|
130
151
|
|
@@ -133,7 +154,7 @@ class EBNFComposer:
|
|
133
154
|
@staticmethod
|
134
155
|
def build_ebnf(
|
135
156
|
tools,
|
136
|
-
function_format: Literal["pythonic", "json"] = "json",
|
157
|
+
function_format: Literal["pythonic", "json", "xml"] = "json",
|
137
158
|
# Parameters for wrapping the entire sequence of tool calls
|
138
159
|
sequence_start_token: Optional[str] = None,
|
139
160
|
sequence_end_token: Optional[str] = None,
|
@@ -143,6 +164,7 @@ class EBNFComposer:
|
|
143
164
|
# Parameter for separating multiple tool calls
|
144
165
|
tool_call_separator: Optional[str] = None,
|
145
166
|
call_rule_fmt: Optional[str] = None,
|
167
|
+
key_value_rule_fmt: Optional[str] = None,
|
146
168
|
):
|
147
169
|
"""
|
148
170
|
Generalized EBNF builder for all detectors.
|
@@ -157,6 +179,9 @@ class EBNFComposer:
|
|
157
179
|
call_rule_fmt: Optional custom format string for call_{name} rule. It should define each function call's format, with
|
158
180
|
the placeholders {name} for the function name and {arguments_rule} for the arguments rule. If None, a default
|
159
181
|
format based on function_format will be used.
|
182
|
+
key_value_rule_fmt: Optional custom format string for key-value pairs. It should define how each parameter is formatted,
|
183
|
+
with placeholders {key} for the parameter name and {valrule} for the value rule. If None, a default format
|
184
|
+
based on function_format will be used.
|
160
185
|
"""
|
161
186
|
# =================================================================
|
162
187
|
# Step 1: Determine the root tool calls rule
|
@@ -200,7 +225,11 @@ class EBNFComposer:
|
|
200
225
|
else EBNFComposer.CALL_RULE_MAP[function_format]
|
201
226
|
)
|
202
227
|
args_template = EBNFComposer.ARGUMENTS_RULE_MAP[function_format]
|
203
|
-
key_value_template =
|
228
|
+
key_value_template = (
|
229
|
+
key_value_rule_fmt
|
230
|
+
if key_value_rule_fmt
|
231
|
+
else EBNFComposer.KEY_VALUE_RULE_MAP[function_format]
|
232
|
+
)
|
204
233
|
|
205
234
|
# =================================================================
|
206
235
|
# Step 4: Build rules for each tool
|
@@ -292,10 +321,13 @@ class EBNFComposer:
|
|
292
321
|
# =================================================================
|
293
322
|
# Step 5: Add base grammar rules
|
294
323
|
# =================================================================
|
295
|
-
|
296
|
-
EBNFComposer.pythonic_grammar_ebnf_str
|
297
|
-
|
298
|
-
|
324
|
+
grammar_dict = {
|
325
|
+
"pythonic": EBNFComposer.pythonic_grammar_ebnf_str,
|
326
|
+
"json": EBNFComposer.json_grammar_ebnf_str,
|
327
|
+
"xml": EBNFComposer.xml_grammar_ebnf_str,
|
328
|
+
}
|
329
|
+
base_grammar = grammar_dict.get(
|
330
|
+
function_format, EBNFComposer.json_grammar_ebnf_str
|
299
331
|
)
|
300
332
|
ebnf_lines.append(base_grammar)
|
301
333
|
|
@@ -14,6 +14,7 @@ from sglang.srt.function_call.kimik2_detector import KimiK2Detector
|
|
14
14
|
from sglang.srt.function_call.llama32_detector import Llama32Detector
|
15
15
|
from sglang.srt.function_call.mistral_detector import MistralDetector
|
16
16
|
from sglang.srt.function_call.pythonic_detector import PythonicDetector
|
17
|
+
from sglang.srt.function_call.qwen3_coder_detector import Qwen3CoderDetector
|
17
18
|
from sglang.srt.function_call.qwen25_detector import Qwen25Detector
|
18
19
|
|
19
20
|
logger = logging.getLogger(__name__)
|
@@ -35,6 +36,7 @@ class FunctionCallParser:
|
|
35
36
|
"deepseekv3": DeepSeekV3Detector,
|
36
37
|
"pythonic": PythonicDetector,
|
37
38
|
"kimi_k2": KimiK2Detector,
|
39
|
+
"qwen3_coder": Qwen3CoderDetector,
|
38
40
|
}
|
39
41
|
|
40
42
|
def __init__(self, tools: List[Tool], tool_call_parser: str):
|
@@ -153,9 +155,9 @@ class FunctionCallParser:
|
|
153
155
|
or None if no constraint applies.
|
154
156
|
"""
|
155
157
|
# NOTE: structural_tag only supports JSON-compatible content between the begin and end.
|
156
|
-
# It cannot parse or validate
|
158
|
+
# It cannot parse or validate function call Pythonic or XML-ish syntax.
|
157
159
|
if (
|
158
|
-
|
160
|
+
self.detector.supports_structural_tag()
|
159
161
|
and tool_choice == "auto"
|
160
162
|
and any(tool.function.strict for tool in self.tools)
|
161
163
|
):
|
@@ -18,16 +18,21 @@ logger = logging.getLogger(__name__)
|
|
18
18
|
|
19
19
|
|
20
20
|
class KimiK2Detector(BaseFormatDetector):
|
21
|
+
"""
|
22
|
+
Detector for Kimi K2 model function call format.
|
23
|
+
|
24
|
+
Format Structure:
|
25
|
+
```
|
26
|
+
<|tool_calls_section_begin|>
|
27
|
+
<|tool_call_begin|>functions.{func_name}:{index} <|tool_call_argument_begin|>{json_args}<|tool_call_end|>
|
28
|
+
<|tool_calls_section_end|>
|
29
|
+
```
|
30
|
+
|
31
|
+
Reference: https://huggingface.co/moonshotai/Kimi-K2-Instruct/blob/main/docs/tool_call_guidance.md
|
32
|
+
"""
|
21
33
|
|
22
34
|
def __init__(self):
|
23
35
|
super().__init__()
|
24
|
-
self._buffer = ""
|
25
|
-
self.current_tool_name_sent: bool = False
|
26
|
-
self.prev_tool_call_arr: list[dict] = []
|
27
|
-
self.current_tool_id: int = -1
|
28
|
-
self.streamed_args_for_tool: list[str] = (
|
29
|
-
[]
|
30
|
-
) # map what has been streamed for each tool so far to a list
|
31
36
|
|
32
37
|
self.bot_token: str = "<|tool_calls_section_begin|>"
|
33
38
|
self.eot_token: str = "<|tool_calls_section_end|>"
|
@@ -114,11 +119,7 @@ class KimiK2Detector(BaseFormatDetector):
|
|
114
119
|
return StreamingParseResult(normal_text=new_text)
|
115
120
|
|
116
121
|
if not hasattr(self, "_tool_indices"):
|
117
|
-
self._tool_indices =
|
118
|
-
tool.function.name: i
|
119
|
-
for i, tool in enumerate(tools)
|
120
|
-
if tool.function and tool.function.name
|
121
|
-
}
|
122
|
+
self._tool_indices = self._get_tool_indices(tools)
|
122
123
|
|
123
124
|
calls: list[ToolCallItem] = []
|
124
125
|
try:
|
@@ -150,7 +151,7 @@ class KimiK2Detector(BaseFormatDetector):
|
|
150
151
|
)
|
151
152
|
)
|
152
153
|
self.current_tool_name_sent = True
|
153
|
-
# Store the tool call info for
|
154
|
+
# Store the tool call info for serving layer completions endpoint
|
154
155
|
self.prev_tool_call_arr[self.current_tool_id] = {
|
155
156
|
"name": function_name,
|
156
157
|
"arguments": {},
|
@@ -214,7 +215,31 @@ class KimiK2Detector(BaseFormatDetector):
|
|
214
215
|
return StreamingParseResult(normal_text=current_text)
|
215
216
|
|
216
217
|
def structure_info(self) -> _GetInfoFunc:
    """Return a factory that builds the StructureInfo for a function name.

    The produced StructureInfo always uses call index 0 in its ``begin``
    marker and is triggered by the tool-calls section opening token.
    """

    def get_info(name: str) -> StructureInfo:
        begin_marker = f"<|tool_calls_section_begin|><|tool_call_begin|>functions.{name}:0 <|tool_call_argument_begin|>"
        end_marker = "<|tool_call_end|><|tool_calls_section_end|>"
        return StructureInfo(
            begin=begin_marker,
            end=end_marker,
            trigger="<|tool_calls_section_begin|>",
        )

    return get_info
|
218
228
|
|
219
|
-
def build_ebnf(self, tools: List[Tool]):
|
220
|
-
|
229
|
+
def build_ebnf(self, tools: List[Tool]) -> str:
    """
    Compose the EBNF grammar describing KimiK2 tool calls for ``tools``.

    The whole call sequence is wrapped in ``self.bot_token`` /
    ``self.eot_token`` and consecutive calls are joined with an empty
    separator. Arguments use the JSON function format.

    NOTE: the per-call rule matches the function index with ``[0-9]+``
    rather than a fixed number, so the grammar accepts any numeric index
    (0, 1, 2, ...) when several calls are emitted in one section.
    """
    # Template for a single call; {name} and {arguments_rule} are the
    # placeholders filled in by EBNFComposer.
    per_call_rule = '"<|tool_call_begin|>functions.{name}:" [0-9]+ " <|tool_call_argument_begin|>" {arguments_rule} "<|tool_call_end|>"'
    return EBNFComposer.build_ebnf(
        tools,
        function_format="json",
        sequence_start_token=self.bot_token,
        sequence_end_token=self.eot_token,
        tool_call_separator="",
        call_rule_fmt=per_call_rule,
    )
|
@@ -16,9 +16,12 @@ logger = logging.getLogger(__name__)
|
|
16
16
|
|
17
17
|
class Llama32Detector(BaseFormatDetector):
|
18
18
|
"""
|
19
|
-
Detector for Llama 3.2 models.
|
20
|
-
|
21
|
-
|
19
|
+
Detector for Llama 3.2 models with json tool call format.
|
20
|
+
|
21
|
+
Format Structure:
|
22
|
+
```
|
23
|
+
<python_tag>{"name":"xxx", "arguments":{...}}
|
24
|
+
```
|
22
25
|
"""
|
23
26
|
|
24
27
|
def __init__(self):
|
@@ -17,9 +17,17 @@ logger = logging.getLogger(__name__)
|
|
17
17
|
|
18
18
|
class MistralDetector(BaseFormatDetector):
|
19
19
|
"""
|
20
|
-
Detector for Mistral
|
21
|
-
|
22
|
-
|
20
|
+
Detector for Mistral model function call format.
|
21
|
+
|
22
|
+
The Mistral format uses a simple bracket-delimited structure with JSON arrays
|
23
|
+
containing function call objects.
|
24
|
+
|
25
|
+
Format Structure:
|
26
|
+
```
|
27
|
+
[TOOL_CALLS] [{"name": "function_name", "arguments": {json_args}}, ...]
|
28
|
+
```
|
29
|
+
|
30
|
+
Reference: https://huggingface.co/mistralai/Mistral-7B-Instruct-v0.3?chat_template=default
|
23
31
|
"""
|
24
32
|
|
25
33
|
def __init__(self):
|
@@ -8,7 +8,6 @@ from sglang.srt.entrypoints.openai.protocol import Tool
|
|
8
8
|
from sglang.srt.function_call.base_format_detector import BaseFormatDetector
|
9
9
|
from sglang.srt.function_call.core_types import (
|
10
10
|
StreamingParseResult,
|
11
|
-
StructureInfo,
|
12
11
|
ToolCallItem,
|
13
12
|
_GetInfoFunc,
|
14
13
|
)
|
@@ -19,10 +18,17 @@ logger = logging.getLogger(__name__)
|
|
19
18
|
|
20
19
|
class PythonicDetector(BaseFormatDetector):
|
21
20
|
"""
|
22
|
-
Detector for Llama-
|
23
|
-
|
24
|
-
|
25
|
-
|
21
|
+
Detector for Llama-4 models with Pythonic tool call format.
|
22
|
+
|
23
|
+
The Pythonic format uses Python function call syntax within square brackets,
|
24
|
+
with arguments as Python literals rather than JSON.
|
25
|
+
|
26
|
+
Format Structure:
|
27
|
+
```
|
28
|
+
[tool1(arg1=val1, arg2=val2), tool2(arg1=val3)]
|
29
|
+
```
|
30
|
+
|
31
|
+
Reference: https://huggingface.co/meta-llama/Llama-4-Scout-17B-16E-Instruct?chat_template=default
|
26
32
|
"""
|
27
33
|
|
28
34
|
def __init__(self):
|
@@ -75,11 +81,7 @@ class PythonicDetector(BaseFormatDetector):
|
|
75
81
|
return StreamingParseResult(normal_text=normal_text, calls=[])
|
76
82
|
|
77
83
|
calls = []
|
78
|
-
tool_indices =
|
79
|
-
tool.function.name: i
|
80
|
-
for i, tool in enumerate(tools)
|
81
|
-
if tool.function.name
|
82
|
-
}
|
84
|
+
tool_indices = self._get_tool_indices(tools)
|
83
85
|
for call_index, call in enumerate(parsed.elts):
|
84
86
|
if not isinstance(call.func, ast.Name):
|
85
87
|
continue
|
@@ -213,11 +215,11 @@ class PythonicDetector(BaseFormatDetector):
|
|
213
215
|
else:
|
214
216
|
raise ValueError("Tool call arguments must be literals")
|
215
217
|
|
216
|
-
def
|
217
|
-
|
218
|
-
return StructureInfo(begin=f"[{name}(", end=")]", trigger=f"[{name}(")
|
218
|
+
def supports_structural_tag(self) -> bool:
|
219
|
+
return False
|
219
220
|
|
220
|
-
|
221
|
+
def structure_info(self) -> _GetInfoFunc:
|
222
|
+
raise NotImplementedError
|
221
223
|
|
222
224
|
def build_ebnf(self, tools: List[Tool]) -> Optional[str]:
|
223
225
|
return EBNFComposer.build_ebnf(
|
@@ -17,9 +17,18 @@ logger = logging.getLogger(__name__)
|
|
17
17
|
|
18
18
|
class Qwen25Detector(BaseFormatDetector):
|
19
19
|
"""
|
20
|
-
Detector for Qwen 2.5
|
21
|
-
|
22
|
-
|
20
|
+
Detector for Qwen 2.5 and Qwen 3 model function call format.
|
21
|
+
|
22
|
+
Format Structure:
|
23
|
+
```
|
24
|
+
<tool_call>\n{"name":"func1", "arguments":{...}}\n</tool_call>\n<tool_call>\n{"name":"func2", "arguments":{...}}\n</tool_call>
|
25
|
+
```
|
26
|
+
|
27
|
+
Key Components:
|
28
|
+
- Tool Call Tags: `<tool_call>` and `</tool_call>` wrap each individual call
|
29
|
+
- Function Call Object: JSON object with "name" and "arguments" fields
|
30
|
+
|
31
|
+
Reference: https://huggingface.co/Qwen/Qwen2.5-0.5B-Instruct?chat_template=default
|
23
32
|
"""
|
24
33
|
|
25
34
|
def __init__(self):
|
@@ -0,0 +1,151 @@
|
|
1
|
+
import ast
|
2
|
+
import html
|
3
|
+
import json
|
4
|
+
import logging
|
5
|
+
import re
|
6
|
+
from typing import Any, Dict, List, Tuple
|
7
|
+
|
8
|
+
from sglang.srt.entrypoints.openai.protocol import Tool
|
9
|
+
from sglang.srt.function_call.base_format_detector import BaseFormatDetector
|
10
|
+
from sglang.srt.function_call.core_types import (
|
11
|
+
StreamingParseResult,
|
12
|
+
ToolCallItem,
|
13
|
+
_GetInfoFunc,
|
14
|
+
)
|
15
|
+
from sglang.srt.function_call.ebnf_composer import EBNFComposer
|
16
|
+
|
17
|
+
logger = logging.getLogger(__name__)
|
18
|
+
|
19
|
+
|
20
|
+
def _safe_val(raw: str) -> Any:
|
21
|
+
raw = html.unescape(raw.strip())
|
22
|
+
try:
|
23
|
+
return json.loads(raw)
|
24
|
+
except Exception:
|
25
|
+
try:
|
26
|
+
return ast.literal_eval(raw)
|
27
|
+
except Exception:
|
28
|
+
return raw
|
29
|
+
|
30
|
+
|
31
|
+
class Qwen3CoderDetector(BaseFormatDetector):
    """
    Detector for the XML-style tool call format used by Qwen3-Coder.

    A single call is expected to look like:
        <tool_call>
        <function=execute_bash>
        <parameter=command>
        pwd && ls
        </parameter>
        </function>
        </tool_call>

    Every ``<function=NAME>`` section may contain any number of
    ``<parameter=KEY>`` sections; parameter values are decoded with
    ``_safe_val``.
    """

    def __init__(self):
        super().__init__()
        # Literal markers delimiting one tool call and one function section.
        self.tool_call_start_token: str = "<tool_call>"
        self.tool_call_end_token: str = "</tool_call>"
        self.tool_call_prefix: str = "<function="
        # Each pattern has a second alternative that matches an unterminated
        # section running to the end of the text.
        self.tool_call_regex = re.compile(
            r"<tool_call>(.*?)</tool_call>|<tool_call>(.*?)$", re.DOTALL
        )
        self.tool_call_function_regex = re.compile(
            r"<function=(.*?)</function>|<function=(.*)$", re.DOTALL
        )
        self.tool_call_parameter_regex = re.compile(
            r"<parameter=(.*?)</parameter>|<parameter=(.*?)$", re.DOTALL
        )
        # Text received through parse_streaming_increment but not yet emitted.
        self._buf: str = ""

    def has_tool_call(self, text: str) -> bool:
        """Return True when ``text`` contains the tool call start token."""
        return self.tool_call_start_token in text

    def detect_and_parse(self, text: str, tools: List[Tool]) -> StreamingParseResult:
        """Parse a complete response into plain text and tool calls."""
        plain_text, parsed_calls = self._extract(text, tools)
        return StreamingParseResult(normal_text=plain_text, calls=parsed_calls)

    def parse_streaming_increment(
        self, new_text: str, tools: List[Tool]
    ) -> StreamingParseResult:
        """Consume one streamed chunk and emit whatever is complete so far.

        Text outside tool calls is returned immediately. A tool call is only
        parsed once its end token has arrived; until then it stays buffered.

        NOTE(review): when the buffer holds no complete start token it is
        flushed entirely as normal text, so a start token that is split
        across two chunks is emitted as text instead of opening a call.
        """
        self._buf += new_text
        start_token = self.tool_call_start_token
        end_token = self.tool_call_end_token
        text_pieces: List[str] = []
        parsed_calls: List[ToolCallItem] = []

        while True:
            begin = self._buf.find(start_token)
            if begin == -1:
                # No call pending: everything buffered is plain text.
                text_pieces.append(self._buf)
                self._buf = ""
                break
            if begin > 0:
                # Emit the text preceding the call and realign the buffer.
                text_pieces.append(self._buf[:begin])
                self._buf = self._buf[begin:]
            close = self._buf.find(end_token)
            if close == -1:
                # Call still incomplete; wait for more input.
                break
            stop = close + len(end_token)
            block, self._buf = self._buf[:stop], self._buf[stop:]
            parsed_calls.extend(self._parse_block(block, tools))

        return StreamingParseResult(
            normal_text="".join(text_pieces), calls=parsed_calls
        )

    def _extract(self, text: str, tools: List[Tool]) -> Tuple[str, List[ToolCallItem]]:
        """Split ``text`` into the text outside tool calls and the parsed calls.

        An opening token without a matching end token is kept, together with
        everything after it, as normal text.
        """
        start_token = self.tool_call_start_token
        end_token = self.tool_call_end_token
        text_pieces: List[str] = []
        parsed_calls: List[ToolCallItem] = []
        position = 0

        while True:
            begin = text.find(start_token, position)
            if begin == -1:
                text_pieces.append(text[position:])
                break
            text_pieces.append(text[position:begin])
            close = text.find(end_token, begin)
            if close == -1:
                text_pieces.append(text[begin:])
                break
            position = close + len(end_token)
            parsed_calls.extend(self._parse_block(text[begin:position], tools))

        return "".join(text_pieces), parsed_calls

    def _parse_block(self, block: str, tools: List[Tool]) -> List[ToolCallItem]:
        """Turn one ``<tool_call>`` block into tool call items.

        Function or parameter sections lacking the closing ``>`` of their
        header are skipped. A call that ``parse_base_json`` rejects is logged
        and dropped rather than raised.
        """
        items: List[ToolCallItem] = []
        for func_match in self.tool_call_function_regex.findall(block):
            # findall yields one group per alternative; only one is non-empty.
            func_text = func_match[0] or func_match[1]
            func_header, func_sep, func_body = func_text.partition(">")
            if not func_sep:
                continue
            func_name = func_header.strip()

            arguments: Dict[str, Any] = {}
            for param_match in self.tool_call_parameter_regex.findall(func_body):
                param_text = param_match[0] or param_match[1]
                param_header, param_sep, param_body = param_text.partition(">")
                if not param_sep:
                    continue
                # Only the newlines framing the value are removed.
                arguments[param_header.strip()] = _safe_val(param_body.strip("\n"))

            payload = {"name": func_name, "arguments": arguments}
            try:
                # TODO: fix idx in function call, the index for a function
                # call will always be -1 in parse_base_json
                items.extend(self.parse_base_json(payload, tools))
            except Exception:
                logger.warning("invalid tool call for %s dropped", func_name)
        return items

    def supports_structural_tag(self) -> bool:
        """Report that structural-tag constraints are not supported."""
        return False

    def structure_info(self) -> _GetInfoFunc:
        """Not available for this format; always raises NotImplementedError."""
        raise NotImplementedError

    def build_ebnf(self, tools: List[Tool]):
        """Compose the EBNF grammar for ``tools`` using the XML function format.

        Each call is wrapped in the tool call start/end tokens and calls are
        separated by a newline escape.
        """
        # Newlines in the tokens are escaped for use inside grammar literals.
        call_open = self.tool_call_start_token.replace("\n", "\\n")
        call_close = self.tool_call_end_token.replace("\n", "\\n")
        function_rule = '"<function={name}>\\n" {arguments_rule} "\\n</function>"'
        parameter_rule = '"<parameter={key}>\\n" {valrule} "\\n</parameter>"'
        return EBNFComposer.build_ebnf(
            tools,
            function_format="xml",
            individual_call_start_token=call_open,
            individual_call_end_token=call_close,
            tool_call_separator="\\n",
            call_rule_fmt=function_rule,
            key_value_rule_fmt=parameter_rule,
        )
|