sglang 0.4.6.post5__py3-none-any.whl → 0.4.7__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- sglang/bench_offline_throughput.py +10 -4
- sglang/bench_one_batch_server.py +67 -11
- sglang/bench_serving.py +85 -74
- sglang/lang/backend/runtime_endpoint.py +24 -1
- sglang/profiler.py +167 -0
- sglang/srt/_custom_ops.py +34 -0
- sglang/srt/configs/internvl.py +8 -12
- sglang/srt/configs/model_config.py +27 -1
- sglang/srt/constrained/base_grammar_backend.py +5 -2
- sglang/srt/constrained/llguidance_backend.py +9 -8
- sglang/srt/constrained/outlines_backend.py +5 -4
- sglang/srt/constrained/xgrammar_backend.py +18 -18
- sglang/srt/conversation.py +46 -8
- sglang/srt/custom_op.py +38 -3
- sglang/srt/debug_utils.py +74 -0
- sglang/srt/disaggregation/common/__init__.py +1 -0
- sglang/srt/disaggregation/common/conn.py +407 -0
- sglang/srt/disaggregation/decode.py +67 -3
- sglang/srt/disaggregation/fake/conn.py +1 -0
- sglang/srt/disaggregation/kv_events.py +60 -5
- sglang/srt/disaggregation/launch_lb.py +140 -0
- sglang/srt/disaggregation/mini_lb.py +29 -48
- sglang/srt/disaggregation/mooncake/conn.py +432 -140
- sglang/srt/disaggregation/mooncake/transfer_engine.py +32 -16
- sglang/srt/disaggregation/nixl/conn.py +124 -432
- sglang/srt/disaggregation/prefill.py +2 -0
- sglang/srt/disaggregation/utils.py +38 -1
- sglang/srt/distributed/device_communicators/pymscclpp.py +315 -0
- sglang/srt/distributed/parallel_state.py +52 -5
- sglang/srt/entrypoints/EngineBase.py +6 -0
- sglang/srt/entrypoints/engine.py +102 -5
- sglang/srt/entrypoints/http_server.py +15 -2
- sglang/srt/function_call/base_format_detector.py +138 -86
- sglang/srt/function_call/deepseekv3_detector.py +54 -6
- sglang/srt/function_call/ebnf_composer.py +33 -19
- sglang/srt/function_call/function_call_parser.py +27 -0
- sglang/srt/function_call/llama32_detector.py +33 -14
- sglang/srt/function_call/mistral_detector.py +73 -26
- sglang/srt/function_call/pythonic_detector.py +86 -20
- sglang/srt/function_call/qwen25_detector.py +64 -10
- sglang/srt/function_call/utils.py +17 -0
- sglang/srt/hf_transformers_utils.py +4 -0
- sglang/srt/layers/attention/aiter_backend.py +488 -123
- sglang/srt/layers/attention/base_attn_backend.py +4 -0
- sglang/srt/layers/attention/cutlass_mla_backend.py +2 -19
- sglang/srt/layers/attention/flashattention_backend.py +103 -18
- sglang/srt/layers/attention/flashinfer_backend.py +45 -1
- sglang/srt/layers/attention/flashinfer_mla_backend.py +37 -1
- sglang/srt/layers/attention/intel_amx_backend.py +128 -0
- sglang/srt/layers/attention/tbo_backend.py +232 -0
- sglang/srt/layers/attention/torch_native_backend.py +3 -0
- sglang/srt/layers/attention/triton_backend.py +244 -5
- sglang/srt/layers/attention/triton_ops/extend_attention.py +12 -4
- sglang/srt/layers/communicator.py +260 -194
- sglang/srt/layers/dp_attention.py +6 -5
- sglang/srt/layers/layernorm.py +30 -19
- sglang/srt/layers/moe/cutlass_moe.py +170 -7
- sglang/srt/layers/moe/cutlass_moe_params.py +169 -0
- sglang/srt/layers/moe/ep_moe/kernels.py +27 -6
- sglang/srt/layers/moe/ep_moe/layer.py +94 -40
- sglang/srt/layers/moe/ep_moe/token_dispatcher.py +13 -8
- sglang/srt/layers/moe/fused_moe_native.py +4 -0
- sglang/srt/layers/moe/fused_moe_triton/configs/triton_3_2_0/E=257,N=128,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8,block_shape=[128, 128].json +146 -0
- sglang/srt/layers/moe/fused_moe_triton/configs/triton_3_2_0/E=257,N=128,device_name=NVIDIA_H20,dtype=fp8_w8a8,block_shape=[128, 128].json +146 -0
- sglang/srt/layers/moe/fused_moe_triton/configs/triton_3_2_0/E=257,N=256,device_name=NVIDIA_B200,dtype=fp8_w8a8,block_shape=[128, 128].json +146 -0
- sglang/srt/layers/moe/fused_moe_triton/configs/triton_3_2_0/E=257,N=256,device_name=NVIDIA_H20,dtype=fp8_w8a8,block_shape=[128, 128].json +146 -0
- sglang/srt/layers/moe/fused_moe_triton/configs/triton_3_2_0/E=257,N=256,device_name=NVIDIA_H200,dtype=fp8_w8a8,block_shape=[128, 128].json +146 -0
- sglang/srt/layers/moe/fused_moe_triton/configs/triton_3_2_0/E=8,N=7168,device_name=NVIDIA_H100_80GB_HBM3.json +146 -0
- sglang/srt/layers/moe/fused_moe_triton/configs/triton_3_3_1/E=257,N=256,device_name=NVIDIA_B200,dtype=fp8_w8a8,block_shape=[128, 128].json +146 -0
- sglang/srt/layers/moe/fused_moe_triton/configs/triton_3_3_1/E=8,N=7168,device_name=NVIDIA_H100_80GB_HBM3.json +146 -0
- sglang/srt/layers/moe/fused_moe_triton/fused_moe.py +220 -25
- sglang/srt/layers/moe/fused_moe_triton/layer.py +34 -4
- sglang/srt/layers/moe/topk.py +44 -18
- sglang/srt/layers/multimodal.py +3 -3
- sglang/srt/layers/quantization/__init__.py +3 -2
- sglang/srt/layers/quantization/blockwise_int8.py +3 -0
- sglang/srt/layers/quantization/compressed_tensors/compressed_tensors_moe.py +5 -0
- sglang/srt/layers/quantization/deep_gemm.py +55 -56
- sglang/srt/layers/quantization/fp8.py +28 -23
- sglang/srt/layers/quantization/fp8_kernel.py +118 -66
- sglang/srt/layers/quantization/fp8_utils.py +165 -49
- sglang/srt/layers/quantization/modelopt_quant.py +334 -7
- sglang/srt/layers/quantization/moe_wna16.py +3 -0
- sglang/srt/layers/quantization/w8a8_fp8.py +3 -0
- sglang/srt/layers/quantization/w8a8_int8.py +3 -0
- sglang/srt/layers/rotary_embedding.py +6 -12
- sglang/srt/layers/sampler.py +80 -79
- sglang/srt/layers/utils.py +6 -0
- sglang/srt/lora/layers.py +12 -15
- sglang/srt/lora/lora.py +49 -5
- sglang/srt/lora/lora_manager.py +19 -5
- sglang/srt/lora/mem_pool.py +24 -16
- sglang/srt/lora/utils.py +17 -13
- sglang/srt/managers/data_parallel_controller.py +13 -5
- sglang/srt/managers/eplb_algorithms/__init__.py +63 -0
- sglang/srt/managers/eplb_algorithms/deepseek.py +223 -0
- sglang/srt/managers/{deepseek_eplb.py → eplb_algorithms/deepseek_vec.py} +5 -7
- sglang/srt/managers/eplb_manager.py +55 -14
- sglang/srt/managers/expert_distribution.py +220 -46
- sglang/srt/managers/expert_location.py +110 -56
- sglang/srt/managers/expert_location_dispatch.py +23 -6
- sglang/srt/managers/io_struct.py +15 -4
- sglang/srt/managers/mm_utils.py +88 -38
- sglang/srt/managers/multimodal_processors/base_processor.py +188 -16
- sglang/srt/managers/multimodal_processors/gemma3.py +4 -31
- sglang/srt/managers/multimodal_processors/internvl.py +4 -0
- sglang/srt/managers/multimodal_processors/kimi_vl.py +15 -34
- sglang/srt/managers/multimodal_processors/minicpm.py +2 -1
- sglang/srt/managers/multimodal_processors/phi4mm.py +87 -0
- sglang/srt/managers/multimodal_processors/qwen_vl.py +22 -64
- sglang/srt/managers/schedule_batch.py +140 -38
- sglang/srt/managers/scheduler.py +305 -112
- sglang/srt/managers/tokenizer_manager.py +134 -17
- sglang/srt/managers/utils.py +0 -4
- sglang/srt/metrics/collector.py +9 -0
- sglang/srt/model_executor/cuda_graph_runner.py +72 -61
- sglang/srt/model_executor/expert_location_updater.py +157 -22
- sglang/srt/model_executor/forward_batch_info.py +38 -17
- sglang/srt/model_executor/model_runner.py +96 -56
- sglang/srt/model_loader/utils.py +67 -1
- sglang/srt/models/deepseek_nextn.py +1 -1
- sglang/srt/models/deepseek_v2.py +609 -234
- sglang/srt/models/gemma3_causal.py +7 -0
- sglang/srt/models/gemma3_mm.py +19 -14
- sglang/srt/models/idefics2.py +342 -0
- sglang/srt/models/kimi_vl.py +4 -4
- sglang/srt/models/llama.py +1 -1
- sglang/srt/models/minicpmo.py +2 -5
- sglang/srt/models/minicpmv.py +3 -295
- sglang/srt/models/phi4mm.py +512 -0
- sglang/srt/models/qwen2.py +38 -9
- sglang/srt/models/qwen2_5_vl.py +3 -9
- sglang/srt/models/qwen2_eagle.py +4 -1
- sglang/srt/models/qwen2_moe.py +58 -191
- sglang/srt/models/qwen2_vl.py +3 -9
- sglang/srt/models/qwen3.py +41 -10
- sglang/srt/models/qwen3_moe.py +230 -191
- sglang/srt/models/registry.py +9 -1
- sglang/srt/models/transformers.py +291 -0
- sglang/srt/openai_api/adapter.py +86 -24
- sglang/srt/openai_api/protocol.py +31 -2
- sglang/srt/openai_api/utils.py +172 -0
- sglang/srt/operations.py +37 -2
- sglang/srt/operations_strategy.py +200 -24
- sglang/srt/sampling/sampling_batch_info.py +13 -1
- sglang/srt/sampling/sampling_params.py +2 -1
- sglang/srt/server_args.py +114 -27
- sglang/srt/speculative/build_eagle_tree.py +8 -8
- sglang/srt/speculative/eagle_draft_cuda_graph_runner.py +8 -11
- sglang/srt/speculative/eagle_draft_extend_cuda_graph_runner.py +253 -0
- sglang/srt/speculative/eagle_utils.py +51 -91
- sglang/srt/speculative/eagle_worker.py +101 -21
- sglang/srt/two_batch_overlap.py +635 -0
- sglang/srt/utils.py +129 -7
- sglang/test/runners.py +16 -7
- sglang/test/send_one.py +4 -0
- sglang/test/test_cutlass_moe.py +3 -3
- sglang/test/test_fp4_moe.py +248 -0
- sglang/test/test_utils.py +79 -6
- sglang/version.py +1 -1
- {sglang-0.4.6.post5.dist-info → sglang-0.4.7.dist-info}/METADATA +14 -11
- {sglang-0.4.6.post5.dist-info → sglang-0.4.7.dist-info}/RECORD +318 -291
- {sglang-0.4.6.post5.dist-info → sglang-0.4.7.dist-info}/WHEEL +1 -1
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=1,N=14336,device_name=NVIDIA_A100-SXM4-80GB,dtype=int8_w8a16.json → triton_3_1_0/E=1,N=14336,device_name=NVIDIA_A100-SXM4-80GB,dtype=int8_w8a16.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=1,N=14336,device_name=NVIDIA_A100-SXM4-80GB.json → triton_3_1_0/E=1,N=14336,device_name=NVIDIA_A100-SXM4-80GB.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=1,N=1792,device_name=NVIDIA_A100-SXM4-80GB,dtype=int8_w8a16.json → triton_3_1_0/E=1,N=1792,device_name=NVIDIA_A100-SXM4-80GB,dtype=int8_w8a16.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=1,N=1792,device_name=NVIDIA_A100-SXM4-80GB.json → triton_3_1_0/E=1,N=1792,device_name=NVIDIA_A100-SXM4-80GB.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=1,N=3072,device_name=NVIDIA_A100-SXM4-80GB,dtype=int8_w8a16.json → triton_3_1_0/E=1,N=3072,device_name=NVIDIA_A100-SXM4-80GB,dtype=int8_w8a16.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=1,N=3072,device_name=NVIDIA_H100_80GB_HBM3,dtype=int8_w8a16.json → triton_3_1_0/E=1,N=3072,device_name=NVIDIA_H100_80GB_HBM3,dtype=int8_w8a16.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=1,N=3072,device_name=NVIDIA_H100_80GB_HBM3.json → triton_3_1_0/E=1,N=3072,device_name=NVIDIA_H100_80GB_HBM3.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=1,N=3584,device_name=NVIDIA_A100-SXM4-80GB,dtype=int8_w8a16.json → triton_3_1_0/E=1,N=3584,device_name=NVIDIA_A100-SXM4-80GB,dtype=int8_w8a16.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=1,N=3584,device_name=NVIDIA_A100-SXM4-80GB.json → triton_3_1_0/E=1,N=3584,device_name=NVIDIA_A100-SXM4-80GB.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=1,N=7168,device_name=NVIDIA_A100-SXM4-80GB,dtype=int8_w8a16.json → triton_3_1_0/E=1,N=7168,device_name=NVIDIA_A100-SXM4-80GB,dtype=int8_w8a16.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=1,N=7168,device_name=NVIDIA_A100-SXM4-80GB.json → triton_3_1_0/E=1,N=7168,device_name=NVIDIA_A100-SXM4-80GB.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=144,N=512,device_name=NVIDIA_H100_80GB_HBM3.json → triton_3_1_0/E=144,N=512,device_name=NVIDIA_H100_80GB_HBM3.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=16,N=1024,device_name=NVIDIA_H100_80GB_HBM3.json → triton_3_1_0/E=16,N=1024,device_name=NVIDIA_H100_80GB_HBM3.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=16,N=1024,device_name=NVIDIA_H200.json → triton_3_1_0/E=16,N=1024,device_name=NVIDIA_H200.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=16,N=1344,device_name=NVIDIA_A100-SXM4-40GB.json → triton_3_1_0/E=16,N=1344,device_name=NVIDIA_A100-SXM4-40GB.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=16,N=1344,device_name=NVIDIA_A100-SXM4-80GB.json → triton_3_1_0/E=16,N=1344,device_name=NVIDIA_A100-SXM4-80GB.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=16,N=1344,device_name=NVIDIA_H100_80GB_HBM3.json → triton_3_1_0/E=16,N=1344,device_name=NVIDIA_H100_80GB_HBM3.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=16,N=14336,device_name=NVIDIA_A100-SXM4-80GB,dtype=int8_w8a16.json → triton_3_1_0/E=16,N=14336,device_name=NVIDIA_A100-SXM4-80GB,dtype=int8_w8a16.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=16,N=14336,device_name=NVIDIA_A100-SXM4-80GB.json → triton_3_1_0/E=16,N=14336,device_name=NVIDIA_A100-SXM4-80GB.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=16,N=1792,device_name=NVIDIA_A100-SXM4-80GB,dtype=int8_w8a16.json → triton_3_1_0/E=16,N=1792,device_name=NVIDIA_A100-SXM4-80GB,dtype=int8_w8a16.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=16,N=1792,device_name=NVIDIA_A100-SXM4-80GB.json → triton_3_1_0/E=16,N=1792,device_name=NVIDIA_A100-SXM4-80GB.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=16,N=2048,device_name=NVIDIA_H100_80GB_HBM3.json → triton_3_1_0/E=16,N=2048,device_name=NVIDIA_H100_80GB_HBM3.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=16,N=2688,device_name=NVIDIA_A100-SXM4-80GB.json → triton_3_1_0/E=16,N=2688,device_name=NVIDIA_A100-SXM4-80GB.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=16,N=2688,device_name=NVIDIA_H100_80GB_HBM3.json → triton_3_1_0/E=16,N=2688,device_name=NVIDIA_H100_80GB_HBM3.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=16,N=3072,device_name=NVIDIA_A100-SXM4-80GB,dtype=int8_w8a16.json → triton_3_1_0/E=16,N=3072,device_name=NVIDIA_A100-SXM4-80GB,dtype=int8_w8a16.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=16,N=3072,device_name=NVIDIA_H100_80GB_HBM3,dtype=int8_w8a16.json → triton_3_1_0/E=16,N=3072,device_name=NVIDIA_H100_80GB_HBM3,dtype=int8_w8a16.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=16,N=3200,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8.json → triton_3_1_0/E=16,N=3200,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=16,N=3584,device_name=NVIDIA_A100-SXM4-80GB,dtype=int8_w8a16.json → triton_3_1_0/E=16,N=3584,device_name=NVIDIA_A100-SXM4-80GB,dtype=int8_w8a16.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=16,N=3584,device_name=NVIDIA_A100-SXM4-80GB.json → triton_3_1_0/E=16,N=3584,device_name=NVIDIA_A100-SXM4-80GB.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=16,N=6400,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8.json → triton_3_1_0/E=16,N=6400,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=16,N=7168,device_name=NVIDIA_A100-SXM4-80GB,dtype=int8_w8a16.json → triton_3_1_0/E=16,N=7168,device_name=NVIDIA_A100-SXM4-80GB,dtype=int8_w8a16.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=16,N=7168,device_name=NVIDIA_A100-SXM4-80GB.json → triton_3_1_0/E=16,N=7168,device_name=NVIDIA_A100-SXM4-80GB.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=16,N=7168,device_name=NVIDIA_H100_80GB_HBM3,dtype=int8_w8a16.json → triton_3_1_0/E=16,N=7168,device_name=NVIDIA_H100_80GB_HBM3,dtype=int8_w8a16.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=16,N=800,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8.json → triton_3_1_0/E=16,N=800,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=160,N=192,device_name=NVIDIA_A800-SXM4-80GB.json → triton_3_1_0/E=160,N=192,device_name=NVIDIA_A800-SXM4-80GB.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=20,N=2048,device_name=NVIDIA_H100_80GB_HBM3.json → triton_3_1_0/E=20,N=2048,device_name=NVIDIA_H100_80GB_HBM3.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=24,N=1024,device_name=NVIDIA_H100_80GB_HBM3.json → triton_3_1_0/E=24,N=1024,device_name=NVIDIA_H100_80GB_HBM3.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=256,N=128,device_name=NVIDIA_A100-SXM4-80GB,dtype=int8_w8a8,block_shape=[128, 128].json → triton_3_1_0/E=256,N=128,device_name=NVIDIA_A100-SXM4-80GB,dtype=int8_w8a8,block_shape=[128, 128].json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=256,N=128,device_name=NVIDIA_A100-SXM4-80GB,dtype=int8_w8a8.json → triton_3_1_0/E=256,N=128,device_name=NVIDIA_A100-SXM4-80GB,dtype=int8_w8a8.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=256,N=128,device_name=NVIDIA_A800-SXM4-80GB,dtype=int8_w8a8,block_shape=[128, 128].json → triton_3_1_0/E=256,N=128,device_name=NVIDIA_A800-SXM4-80GB,dtype=int8_w8a8,block_shape=[128, 128].json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=256,N=128,device_name=NVIDIA_A800-SXM4-80GB,dtype=int8_w8a8.json → triton_3_1_0/E=256,N=128,device_name=NVIDIA_A800-SXM4-80GB,dtype=int8_w8a8.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=256,N=128,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8,block_shape=[128, 128].json → triton_3_1_0/E=256,N=128,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8,block_shape=[128, 128].json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=256,N=128,device_name=NVIDIA_H20,block_shape=[128, 128].json → triton_3_1_0/E=256,N=128,device_name=NVIDIA_H20,block_shape=[128, 128].json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=256,N=128,device_name=NVIDIA_H20,dtype=fp8_w8a8,block_shape=[128, 128].json → triton_3_1_0/E=256,N=128,device_name=NVIDIA_H20,dtype=fp8_w8a8,block_shape=[128, 128].json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=256,N=128,device_name=NVIDIA_L20Y,dtype=fp8_w8a8,block_shape=[128, 128].json → triton_3_1_0/E=256,N=128,device_name=NVIDIA_L20Y,dtype=fp8_w8a8,block_shape=[128, 128].json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=256,N=256,device_name=AMD_Instinct_MI300X,dtype=fp8_w8a8,block_shape=[128, 128].json → triton_3_1_0/E=256,N=256,device_name=AMD_Instinct_MI300X,dtype=fp8_w8a8,block_shape=[128, 128].json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=256,N=256,device_name=AMD_Instinct_MI325X,dtype=fp8_w8a8,block_shape=[128, 128].json → triton_3_1_0/E=256,N=256,device_name=AMD_Instinct_MI325X,dtype=fp8_w8a8,block_shape=[128, 128].json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=256,N=256,device_name=AMD_Radeon_Graphics,dtype=fp8_w8a8,block_shape=[128, 128].json → triton_3_1_0/E=256,N=256,device_name=AMD_Radeon_Graphics,dtype=fp8_w8a8,block_shape=[128, 128].json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=256,N=256,device_name=NVIDIA_B200,dtype=fp8_w8a8,block_shape=[128, 128].json → triton_3_1_0/E=256,N=256,device_name=NVIDIA_B200,dtype=fp8_w8a8,block_shape=[128, 128].json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=256,N=256,device_name=NVIDIA_H20,dtype=int8_w8a8,block_shape=[128, 128].json → triton_3_1_0/E=256,N=256,device_name=NVIDIA_H20,dtype=int8_w8a8,block_shape=[128, 128].json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=256,N=256,device_name=NVIDIA_H200,dtype=fp8_w8a8,block_shape=[128, 128].json → triton_3_1_0/E=256,N=256,device_name=NVIDIA_H200,dtype=fp8_w8a8,block_shape=[128, 128].json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=256,N=64,device_name=NVIDIA_A800-SXM4-80GB.json → triton_3_1_0/E=256,N=64,device_name=NVIDIA_A800-SXM4-80GB.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=256,N=64,device_name=NVIDIA_L20,dtype=int8_w8a8.json → triton_3_1_0/E=256,N=64,device_name=NVIDIA_L20,dtype=int8_w8a8.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=256,N=64,device_name=NVIDIA_L40S,dtype=int8_w8a8.json → triton_3_1_0/E=256,N=64,device_name=NVIDIA_L40S,dtype=int8_w8a8.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=64,N=1024,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8,block_shape=[128, 128].json → triton_3_1_0/E=64,N=1024,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8,block_shape=[128, 128].json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=64,N=1280,device_name=NVIDIA_A100-SXM4-80GB.json → triton_3_1_0/E=64,N=1280,device_name=NVIDIA_A100-SXM4-80GB.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=64,N=1280,device_name=NVIDIA_A800-SXM4-80GB.json → triton_3_1_0/E=64,N=1280,device_name=NVIDIA_A800-SXM4-80GB.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=64,N=1280,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8.json → triton_3_1_0/E=64,N=1280,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=64,N=1280,device_name=NVIDIA_H100_80GB_HBM3.json → triton_3_1_0/E=64,N=1280,device_name=NVIDIA_H100_80GB_HBM3.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=64,N=1280,device_name=NVIDIA_H200,dtype=fp8_w8a8.json → triton_3_1_0/E=64,N=1280,device_name=NVIDIA_H200,dtype=fp8_w8a8.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=64,N=1280,device_name=NVIDIA_H200.json → triton_3_1_0/E=64,N=1280,device_name=NVIDIA_H200.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=64,N=2560,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8.json → triton_3_1_0/E=64,N=2560,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=64,N=2560,device_name=NVIDIA_H200,dtype=fp8_w8a8.json → triton_3_1_0/E=64,N=2560,device_name=NVIDIA_H200,dtype=fp8_w8a8.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=64,N=2560,device_name=NVIDIA_H200.json → triton_3_1_0/E=64,N=2560,device_name=NVIDIA_H200.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=64,N=320,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8.json → triton_3_1_0/E=64,N=320,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=64,N=320,device_name=NVIDIA_H100_80GB_HBM3.json → triton_3_1_0/E=64,N=320,device_name=NVIDIA_H100_80GB_HBM3.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=64,N=320,device_name=NVIDIA_H200,dtype=fp8_w8a8.json → triton_3_1_0/E=64,N=320,device_name=NVIDIA_H200,dtype=fp8_w8a8.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=64,N=320,device_name=NVIDIA_H200.json → triton_3_1_0/E=64,N=320,device_name=NVIDIA_H200.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=64,N=512,device_name=NVIDIA_H200,dtype=fp8_w8a8,block_shape=[128, 128].json → triton_3_1_0/E=64,N=512,device_name=NVIDIA_H200,dtype=fp8_w8a8,block_shape=[128, 128].json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=64,N=640,device_name=NVIDIA_A100-SXM4-80GB.json → triton_3_1_0/E=64,N=640,device_name=NVIDIA_A100-SXM4-80GB.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=64,N=640,device_name=NVIDIA_A800-SXM4-80GB.json → triton_3_1_0/E=64,N=640,device_name=NVIDIA_A800-SXM4-80GB.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=64,N=640,device_name=NVIDIA_GeForce_RTX_4090,dtype=fp8_w8a8.json → triton_3_1_0/E=64,N=640,device_name=NVIDIA_GeForce_RTX_4090,dtype=fp8_w8a8.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=64,N=640,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8.json → triton_3_1_0/E=64,N=640,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=64,N=640,device_name=NVIDIA_H100_80GB_HBM3.json → triton_3_1_0/E=64,N=640,device_name=NVIDIA_H100_80GB_HBM3.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=64,N=640,device_name=NVIDIA_H200,dtype=fp8_w8a8.json → triton_3_1_0/E=64,N=640,device_name=NVIDIA_H200,dtype=fp8_w8a8.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=64,N=640,device_name=NVIDIA_H200.json → triton_3_1_0/E=64,N=640,device_name=NVIDIA_H200.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=8,N=14336,device_name=AMD_Instinct_MI300X.json → triton_3_1_0/E=8,N=14336,device_name=AMD_Instinct_MI300X.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=8,N=14336,device_name=AMD_Instinct_MI325X.json → triton_3_1_0/E=8,N=14336,device_name=AMD_Instinct_MI325X.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=8,N=14336,device_name=AMD_Radeon_Graphics.json → triton_3_1_0/E=8,N=14336,device_name=AMD_Radeon_Graphics.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=8,N=14336,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8.json → triton_3_1_0/E=8,N=14336,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=8,N=14336,device_name=NVIDIA_H200,dtype=fp8_w8a8.json → triton_3_1_0/E=8,N=14336,device_name=NVIDIA_H200,dtype=fp8_w8a8.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=8,N=14336,device_name=NVIDIA_H200.json → triton_3_1_0/E=8,N=14336,device_name=NVIDIA_H200.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=8,N=1792,device_name=AMD_Instinct_MI300X.json → triton_3_1_0/E=8,N=1792,device_name=AMD_Instinct_MI300X.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=8,N=1792,device_name=AMD_Instinct_MI325X.json → triton_3_1_0/E=8,N=1792,device_name=AMD_Instinct_MI325X.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=8,N=1792,device_name=AMD_Radeon_Graphics.json → triton_3_1_0/E=8,N=1792,device_name=AMD_Radeon_Graphics.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=8,N=1792,device_name=NVIDIA_A100-SXM4-40GB.json → triton_3_1_0/E=8,N=1792,device_name=NVIDIA_A100-SXM4-40GB.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=8,N=1792,device_name=NVIDIA_A100-SXM4-80GB.json → triton_3_1_0/E=8,N=1792,device_name=NVIDIA_A100-SXM4-80GB.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=8,N=1792,device_name=NVIDIA_H100_80GB_HBM3.json → triton_3_1_0/E=8,N=1792,device_name=NVIDIA_H100_80GB_HBM3.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=8,N=1792,device_name=NVIDIA_H200,dtype=fp8_w8a8.json → triton_3_1_0/E=8,N=1792,device_name=NVIDIA_H200,dtype=fp8_w8a8.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=8,N=1792,device_name=NVIDIA_H200.json → triton_3_1_0/E=8,N=1792,device_name=NVIDIA_H200.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=8,N=2048,device_name=NVIDIA_A100-SXM4-80GB.json → triton_3_1_0/E=8,N=2048,device_name=NVIDIA_A100-SXM4-80GB.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=8,N=2048,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8.json → triton_3_1_0/E=8,N=2048,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=8,N=2048,device_name=NVIDIA_H100_80GB_HBM3.json → triton_3_1_0/E=8,N=2048,device_name=NVIDIA_H100_80GB_HBM3.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=8,N=2048,device_name=NVIDIA_H200,dtype=fp8_w8a8.json → triton_3_1_0/E=8,N=2048,device_name=NVIDIA_H200,dtype=fp8_w8a8.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=8,N=2048,device_name=NVIDIA_H200.json → triton_3_1_0/E=8,N=2048,device_name=NVIDIA_H200.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=8,N=3584,device_name=AMD_Instinct_MI300X.json → triton_3_1_0/E=8,N=3584,device_name=AMD_Instinct_MI300X.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=8,N=3584,device_name=AMD_Instinct_MI325X.json → triton_3_1_0/E=8,N=3584,device_name=AMD_Instinct_MI325X.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=8,N=3584,device_name=AMD_Radeon_Graphics.json → triton_3_1_0/E=8,N=3584,device_name=AMD_Radeon_Graphics.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=8,N=3584,device_name=NVIDIA_A100-SXM4-40GB.json → triton_3_1_0/E=8,N=3584,device_name=NVIDIA_A100-SXM4-40GB.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=8,N=3584,device_name=NVIDIA_A100-SXM4-80GB.json → triton_3_1_0/E=8,N=3584,device_name=NVIDIA_A100-SXM4-80GB.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=8,N=3584,device_name=NVIDIA_GeForce_RTX_4090,dtype=fp8_w8a8.json → triton_3_1_0/E=8,N=3584,device_name=NVIDIA_GeForce_RTX_4090,dtype=fp8_w8a8.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=8,N=3584,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8.json → triton_3_1_0/E=8,N=3584,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=8,N=3584,device_name=NVIDIA_H100_80GB_HBM3.json → triton_3_1_0/E=8,N=3584,device_name=NVIDIA_H100_80GB_HBM3.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=8,N=3584,device_name=NVIDIA_H200,dtype=fp8_w8a8.json → triton_3_1_0/E=8,N=3584,device_name=NVIDIA_H200,dtype=fp8_w8a8.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=8,N=3584,device_name=NVIDIA_H200.json → triton_3_1_0/E=8,N=3584,device_name=NVIDIA_H200.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=8,N=3584,device_name=NVIDIA_L40S.json → triton_3_1_0/E=8,N=3584,device_name=NVIDIA_L40S.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=8,N=4096,device_name=AMD_Instinct_MI300X,dtype=fp8_w8a8.json → triton_3_1_0/E=8,N=4096,device_name=AMD_Instinct_MI300X,dtype=fp8_w8a8.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=8,N=4096,device_name=AMD_Instinct_MI325X,dtype=fp8_w8a8.json → triton_3_1_0/E=8,N=4096,device_name=AMD_Instinct_MI325X,dtype=fp8_w8a8.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=8,N=4096,device_name=AMD_Radeon_Graphics,dtype=fp8_w8a8.json → triton_3_1_0/E=8,N=4096,device_name=AMD_Radeon_Graphics,dtype=fp8_w8a8.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=8,N=4096,device_name=NVIDIA_A100-SXM4-80GB.json → triton_3_1_0/E=8,N=4096,device_name=NVIDIA_A100-SXM4-80GB.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=8,N=4096,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8.json → triton_3_1_0/E=8,N=4096,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=8,N=4096,device_name=NVIDIA_H100_80GB_HBM3.json → triton_3_1_0/E=8,N=4096,device_name=NVIDIA_H100_80GB_HBM3.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=8,N=4096,device_name=NVIDIA_H200,dtype=fp8_w8a8.json → triton_3_1_0/E=8,N=4096,device_name=NVIDIA_H200,dtype=fp8_w8a8.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=8,N=4096,device_name=NVIDIA_H200.json → triton_3_1_0/E=8,N=4096,device_name=NVIDIA_H200.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=8,N=7168,device_name=AMD_Instinct_MI300X.json → triton_3_1_0/E=8,N=7168,device_name=AMD_Instinct_MI300X.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=8,N=7168,device_name=AMD_Instinct_MI325X.json → triton_3_1_0/E=8,N=7168,device_name=AMD_Instinct_MI325X.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=8,N=7168,device_name=AMD_Radeon_Graphics.json → triton_3_1_0/E=8,N=7168,device_name=AMD_Radeon_Graphics.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=8,N=7168,device_name=NVIDIA_A100-SXM4-80GB.json → triton_3_1_0/E=8,N=7168,device_name=NVIDIA_A100-SXM4-80GB.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=8,N=7168,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8.json → triton_3_1_0/E=8,N=7168,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=8,N=7168,device_name=NVIDIA_H100_80GB_HBM3.json → triton_3_1_0/E=8,N=7168,device_name=NVIDIA_H100_80GB_HBM3.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=8,N=7168,device_name=NVIDIA_H200,dtype=fp8_w8a8.json → triton_3_1_0/E=8,N=7168,device_name=NVIDIA_H200,dtype=fp8_w8a8.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=8,N=7168,device_name=NVIDIA_H200.json → triton_3_1_0/E=8,N=7168,device_name=NVIDIA_H200.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=8,N=8192,device_name=AMD_Instinct_MI300X,dtype=fp8_w8a8.json → triton_3_1_0/E=8,N=8192,device_name=AMD_Instinct_MI300X,dtype=fp8_w8a8.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=8,N=8192,device_name=AMD_Instinct_MI325X,dtype=fp8_w8a8.json → triton_3_1_0/E=8,N=8192,device_name=AMD_Instinct_MI325X,dtype=fp8_w8a8.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=8,N=8192,device_name=AMD_Radeon_Graphics,dtype=fp8_w8a8.json → triton_3_1_0/E=8,N=8192,device_name=AMD_Radeon_Graphics,dtype=fp8_w8a8.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=8,N=8192,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8.json → triton_3_1_0/E=8,N=8192,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=8,N=8192,device_name=NVIDIA_H200,dtype=fp8_w8a8.json → triton_3_1_0/E=8,N=8192,device_name=NVIDIA_H200,dtype=fp8_w8a8.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=128,N=192,device_name=NVIDIA_A800-SXM4-80GB.json → triton_3_2_0/E=128,N=192,device_name=NVIDIA_A800-SXM4-80GB.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=128,N=192,device_name=NVIDIA_H100_80GB_HBM3.json → triton_3_2_0/E=128,N=192,device_name=NVIDIA_H100_80GB_HBM3.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=128,N=192,device_name=NVIDIA_H20.json → triton_3_2_0/E=128,N=192,device_name=NVIDIA_H20.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=128,N=192,device_name=NVIDIA_H200.json → triton_3_2_0/E=128,N=192,device_name=NVIDIA_H200.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=128,N=384,device_name=NVIDIA_H100_80GB_HBM3.json → triton_3_2_0/E=128,N=384,device_name=NVIDIA_H100_80GB_HBM3.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=128,N=384,device_name=NVIDIA_H20,dtype=fp8_w8a8,block_shape=[128, 128].json → triton_3_2_0/E=128,N=384,device_name=NVIDIA_H20,dtype=fp8_w8a8,block_shape=[128, 128].json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=128,N=384,device_name=NVIDIA_H20.json → triton_3_2_0/E=128,N=384,device_name=NVIDIA_H20.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=128,N=384,device_name=NVIDIA_H200,dtype=fp8_w8a8,block_shape=[128, 128].json → triton_3_2_0/E=128,N=384,device_name=NVIDIA_H200,dtype=fp8_w8a8,block_shape=[128, 128].json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=128,N=384,device_name=NVIDIA_H200.json → triton_3_2_0/E=128,N=384,device_name=NVIDIA_H200.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=128,N=512,device_name=NVIDIA_H100_80GB_HBM3.json → triton_3_2_0/E=128,N=512,device_name=NVIDIA_H100_80GB_HBM3.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=128,N=768,device_name=NVIDIA_A800-SXM4-80GB.json → triton_3_2_0/E=128,N=768,device_name=NVIDIA_A800-SXM4-80GB.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=128,N=768,device_name=NVIDIA_H100_80GB_HBM3.json → triton_3_2_0/E=128,N=768,device_name=NVIDIA_H100_80GB_HBM3.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=128,N=768,device_name=NVIDIA_H20,dtype=fp8_w8a8,block_shape=[128, 128].json → triton_3_2_0/E=128,N=768,device_name=NVIDIA_H20,dtype=fp8_w8a8,block_shape=[128, 128].json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=128,N=768,device_name=NVIDIA_H20.json → triton_3_2_0/E=128,N=768,device_name=NVIDIA_H20.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=128,N=768,device_name=NVIDIA_H200,dtype=fp8_w8a8,block_shape=[128, 128].json → triton_3_2_0/E=128,N=768,device_name=NVIDIA_H200,dtype=fp8_w8a8,block_shape=[128, 128].json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=128,N=768,device_name=NVIDIA_H200.json → triton_3_2_0/E=128,N=768,device_name=NVIDIA_H200.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=128,N=96,device_name=NVIDIA_H20.json → triton_3_2_0/E=128,N=96,device_name=NVIDIA_H20.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=264,N=128,device_name=NVIDIA_A800-SXM4-80GB,dtype=int8_w8a8.json → triton_3_2_0/E=264,N=128,device_name=NVIDIA_A800-SXM4-80GB,dtype=int8_w8a8.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=264,N=256,device_name=NVIDIA_B200,dtype=fp8_w8a8,block_shape=[128, 128].json → triton_3_2_0/E=264,N=256,device_name=NVIDIA_B200,dtype=fp8_w8a8,block_shape=[128, 128].json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=264,N=256,device_name=NVIDIA_H20,dtype=fp8_w8a8,block_shape=[128, 128].json → triton_3_2_0/E=264,N=256,device_name=NVIDIA_H20,dtype=fp8_w8a8,block_shape=[128, 128].json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=264,N=256,device_name=NVIDIA_H200,dtype=fp8_w8a8,block_shape=[128, 128].json → triton_3_2_0/E=264,N=256,device_name=NVIDIA_H200,dtype=fp8_w8a8,block_shape=[128, 128].json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=272,N=128,device_name=NVIDIA_A800-SXM4-80GB,dtype=int8_w8a8.json → triton_3_2_0/E=272,N=128,device_name=NVIDIA_A800-SXM4-80GB,dtype=int8_w8a8.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=272,N=128,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8,block_shape=[128, 128].json → triton_3_2_0/E=272,N=128,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8,block_shape=[128, 128].json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=272,N=128,device_name=NVIDIA_H20,dtype=fp8_w8a8,block_shape=[128, 128].json → triton_3_2_0/E=272,N=128,device_name=NVIDIA_H20,dtype=fp8_w8a8,block_shape=[128, 128].json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=272,N=64,device_name=NVIDIA_A800-SXM4-80GB.json → triton_3_2_0/E=272,N=64,device_name=NVIDIA_A800-SXM4-80GB.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=288,N=64,device_name=NVIDIA_A800-SXM4-80GB.json → triton_3_2_0/E=288,N=64,device_name=NVIDIA_A800-SXM4-80GB.json} +0 -0
- {sglang-0.4.6.post5.dist-info → sglang-0.4.7.dist-info}/licenses/LICENSE +0 -0
- {sglang-0.4.6.post5.dist-info → sglang-0.4.7.dist-info}/top_level.txt +0 -0
sglang/srt/function_call/ebnf_composer.py

@@ -30,11 +30,6 @@ class EBNFComposer:
     ws ::= [ \n\t]*
     """

-    TOOL_CALLS_MAP = {
-        "pythonic": '"[" function_call ("," function_call)* "]"',
-        "json": "function_call",
-    }
-
     CALL_RULE_MAP = {
         "pythonic": 'call_{name} ::= "{name}" "(" {arguments_rule} ")"',
         "json": 'call_{name} ::= "{{" "\\"name\\"" ":" "\\"{name}\\"" ", " "\\"arguments\\"" ":" {arguments_rule} "}}"',
@@ -138,35 +133,54 @@
     @staticmethod
     def build_ebnf(
         tools,
-        *,
-        call_rule_fmt: Optional[str] = None,
         function_format: Literal["pythonic", "json"] = "json",
-
-
+        # Parameters for wrapping the entire sequence of tool calls
+        sequence_start_token: Optional[str] = None,
+        sequence_end_token: Optional[str] = None,
+        # Parameters for wrapping individual tool calls
+        individual_call_start_token: Optional[str] = None,
+        individual_call_end_token: Optional[str] = None,
+        # Parameter for separating multiple tool calls
         tool_call_separator: Optional[str] = None,
+        call_rule_fmt: Optional[str] = None,
     ):
         """
         Generalized EBNF builder for all detectors.
         Args:
             tools: List of Tool objects to generate EBNF grammar for
+            function_format: The format of function calls, either "pythonic" or "json"
+            sequence_start_token: Token that wraps the entire sequence of tool calls (start)
+            sequence_end_token: Token that wraps the entire sequence of tool calls (end)
+            individual_call_start_token: Token that wraps each individual tool call (start)
+            individual_call_end_token: Token that wraps each individual tool call (end)
+            tool_call_separator: The separator between multiple tool calls
             call_rule_fmt: Optional custom format string for call_{name} rule. It should define each function call's format, with
                 the placeholders {name} for the function name and {arguments_rule} for the arguments rule. If None, a default
                 format based on function_format will be used.
-            function_format: The format of function calls, either "pythonic" or "json"
-            bot_token: The token that indicates the start of a tool call section
-            eot_token: The token that indicates the end of a tool call section
-            tool_call_separator: The separator between multiple tool calls
         """
         # =================================================================
         # Step 1: Determine the root tool calls rule
         # =================================================================
-
-
-
-
-
+        # Handle a single function call
+        if individual_call_start_token and individual_call_end_token:
+            function_call_unit = f'"{individual_call_start_token}" function_call "{individual_call_end_token}"'
+        else:
+            function_call_unit = "function_call"
+
+        # Handle multiple function calls with separators
+        if tool_call_separator is not None:
+            base_pattern = f'{function_call_unit} ( "{tool_call_separator}" {function_call_unit} )*'
+        else:
+            # Assume only support single function call
+            base_pattern = function_call_unit
+
+        # Apply sequence-level wrapping if needed
+        if sequence_start_token and sequence_end_token:
+            root_rule = (
+                f'"{sequence_start_token}" {base_pattern} "{sequence_end_token}"'
+            )
         else:
-            root_rule =
+            root_rule = base_pattern

         # =================================================================
         # Step 2: Build the header rules
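The old `bot_token`/`eot_token` pair is replaced by explicit sequence-level and per-call wrapping tokens plus a separator. A minimal standalone sketch of the Step 1 composition above (the `compose_root_rule` name and the sample token values are illustrative, not part of sglang's API):

```python
from typing import Optional


def compose_root_rule(
    sequence_start_token: Optional[str] = None,
    sequence_end_token: Optional[str] = None,
    individual_call_start_token: Optional[str] = None,
    individual_call_end_token: Optional[str] = None,
    tool_call_separator: Optional[str] = None,
) -> str:
    """Mirror of Step 1 in EBNFComposer.build_ebnf: compose the root tool-calls rule."""
    # Wrap each individual call if per-call tokens are given
    if individual_call_start_token and individual_call_end_token:
        function_call_unit = (
            f'"{individual_call_start_token}" function_call "{individual_call_end_token}"'
        )
    else:
        function_call_unit = "function_call"

    # Allow repetition only when a separator is defined
    if tool_call_separator is not None:
        base_pattern = f'{function_call_unit} ( "{tool_call_separator}" {function_call_unit} )*'
    else:
        base_pattern = function_call_unit

    # Wrap the whole sequence if sequence-level tokens are given
    if sequence_start_token and sequence_end_token:
        return f'"{sequence_start_token}" {base_pattern} "{sequence_end_token}"'
    return base_pattern


if __name__ == "__main__":
    # Mistral-style wrapping, using the token values MistralDetector passes below
    print(compose_root_rule(
        sequence_start_token="[TOOL_CALLS] [",
        sequence_end_token="]",
        tool_call_separator=", ",
    ))
    # -> "[TOOL_CALLS] [" function_call ( ", " function_call )* "]"
```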
sglang/srt/function_call/function_call_parser.py

@@ -1,3 +1,4 @@
+import logging
 from typing import Any, Dict, List, Literal, Optional, Set, Tuple, Type, Union

 from sglang.srt.function_call.base_format_detector import BaseFormatDetector
@@ -14,6 +15,8 @@ from sglang.srt.openai_api.protocol import (
     ToolChoice,
 )

+logger = logging.getLogger(__name__)
+

 class FunctionCallParser:
     """
@@ -165,11 +168,35 @@ class FunctionCallParser:
     ) -> Optional[str]:
         """
         Get the EBNF grammar for the specified tool choice.
+
+        Args:
+            tool_choice: The tool choice specification
+
+        Returns:
+            EBNF grammar string, or None if no valid tools found
+
+        Note:
+            If a specific function is requested but not found in available tools,
+            logs a warning and falls back to using all available tools for backward compatibility.
         """
         filtered_tools = []
         if isinstance(tool_choice, ToolChoice):
             fn_name = tool_choice.function.name
             filtered_tools = [t for t in self.tools if t.function.name == fn_name]
+
+            # Check if the requested function exists in available tools
+            if not filtered_tools:
+                available_functions = [t.function.name for t in self.tools]
+                logger.warning(
+                    f"Function '{fn_name}' not found in available tools. "
+                    f"Available functions: {available_functions}. "
+                    f"Skipping tool choice."
+                )
+
+                # TODO: Return a 400 error instead of warning when adapter supports proper error handling
+                # For now, fall back to return None
+                return None
         else:
             filtered_tools = self.tools
+
         return self.detector.build_ebnf(filtered_tools)
sglang/srt/function_call/llama32_detector.py

@@ -24,6 +24,11 @@ class Llama32Detector(BaseFormatDetector):
     def __init__(self):
         super().__init__()
         self.bot_token = "<|python_tag|>"
+        # NOTE: technically Llama3.2 doesn't support well with parallel tool calls
+        # They need specific prompt engineering to support parallel tool calls
+        # Here we use ';' as the separator, which might have compatibility issues
+        # if users define to use a different separator in their prompt
+        self.tool_call_separator = ";"

     def has_tool_call(self, text: str) -> bool:
         """Check if the text contains a Llama 3.2 format tool call."""
@@ -37,27 +42,41 @@ class Llama32Detector(BaseFormatDetector):
             return StreamingParseResult(normal_text=text, calls=[])

         if "<|python_tag|>" in text:
-            normal_text, action_text = text.split("<|python_tag|>")
+            normal_text, action_text = text.split("<|python_tag|>", maxsplit=1)
         else:
             normal_text, action_text = "", text

-
-
+        decoder = json.JSONDecoder()
+        idx = 0
+        safe_idx = idx  # the index of the last valid JSON object
         all_actions = []
-
+        action_text_len = len(action_text)
+        while idx < action_text_len:
             try:
-
-
-
+                obj, end = decoder.raw_decode(action_text[idx:])
+                all_actions.append(obj)
+                idx += end + len(self.tool_call_separator)
+                safe_idx = idx
             except json.JSONDecodeError as e:
-
-                logger.warning(
+                # Find where next `{"name"` appears and try again
+                logger.warning(
+                    f"Failed to parse JSON part: {action_text[idx:]}, JSON parse error: {str(e)}"
+                )
+                next_obj_start = action_text.find('{"name":', idx + 1)
+                if next_obj_start == -1:
+                    break
+                idx = next_obj_start
                 continue
-
+
         # Only process if we found valid JSON objects
-        if all_actions
-
-
+        calls = self.parse_base_json(all_actions, tools) if all_actions else []
+        # Use safe_idx to avoid idx containing the last part of an invalid JSON object
+        trailing_text = (
+            action_text[safe_idx:].strip() if safe_idx < action_text_len else ""
+        )
+        return StreamingParseResult(
+            normal_text=normal_text + trailing_text, calls=calls
+        )

     def structure_info(self) -> _GetInfoFunc:
         return lambda name: StructureInfo(
@@ -70,5 +89,5 @@ class Llama32Detector(BaseFormatDetector):
         return EBNFComposer.build_ebnf(
             tools,
             function_format="json",
-            tool_call_separator=
+            tool_call_separator=self.tool_call_separator,
         )
sglang/srt/function_call/mistral_detector.py

@@ -1,4 +1,5 @@
 import json
+import logging
 import re
 from typing import List

@@ -11,12 +12,14 @@ from sglang.srt.function_call.core_types import (
 from sglang.srt.function_call.ebnf_composer import EBNFComposer
 from sglang.srt.openai_api.protocol import Tool

+logger = logging.getLogger(__name__)
+

 class MistralDetector(BaseFormatDetector):
     """
     Detector for Mistral models.
     Assumes function call format:
-        [TOOL_CALLS] [{"name":"
+        [TOOL_CALLS] [{"name":"func1", "arguments":{...}}, {"name":"func2", "arguments":{...}}]
     """

     def __init__(self):
@@ -27,26 +30,12 @@ class MistralDetector(BaseFormatDetector):
         self.bot_token = "[TOOL_CALLS] ["
         self.eot_token = "]"
         self.tool_call_regex = re.compile(r"\[{.*}\]", re.DOTALL)
+        self.tool_call_separator = ", "

     def has_tool_call(self, text: str) -> bool:
         """Check if the text contains a Mistral format tool call."""
         return self.bot_token in text

-    def _clean_text(self, text: str) -> str:
-        """
-        clean text to only leave ''[TOOL_CALLS] [{"name": xxx, "arguments": {xxx}}]'
-        for example,
-        text = '[TOOL_CALLS] [{"name": "get_current_weather", "arguments": {"location": "Boston, MA", "unit": "fahrenheit"}}]\n\nToday\'s weather in Boston is :{function call result} (in Fahrenheit)\n\nIf you prefer Celsius, please let me know.'
-        return '[TOOL_CALLS] [{"name": "get_current_weather", "arguments": {"location": "Boston, MA", "unit": "fahrenheit"}}]'
-        The key pattern is [TOOL_CALLS] [...]
-        """
-        # TODO: check if Mistral supports multiple tool calls, currently assume only support one tool call
-        find_results = re.findall(r"\[TOOL_CALLS\] \[.*?\]", text, re.DOTALL)
-        if len(find_results) > 0:
-            return find_results[0]
-        else:
-            return ""
-
     def detect_and_parse(self, text: str, tools: List[Tool]) -> StreamingParseResult:
         """
         One-time parsing: Detects and parses tool calls in the provided text.
@@ -57,17 +46,74 @@ class MistralDetector(BaseFormatDetector):
         """
         idx = text.find(self.bot_token)
         normal_text = text[:idx].strip() if idx != -1 else text
-
-
-
+
+        if self.bot_token not in text:
+            return StreamingParseResult(normal_text=normal_text, calls=[])
+
+        # Extract the JSON array part from [TOOL_CALLS] [...]
+        # Use bracket counting to properly handle nested brackets in JSON content
+        json_array_str = self._extract_json_array(text)
+        if not json_array_str:
+            return StreamingParseResult(normal_text=normal_text, calls=[])
+
         calls = []
-
-
-
-
-
+        try:
+            function_call_arr = json.loads(json_array_str)
+            # Handle both single object and array of objects
+            if not isinstance(function_call_arr, list):
+                function_call_arr = [function_call_arr]
+            calls = self.parse_base_json(function_call_arr, tools)
+        except json.JSONDecodeError as e:
+            logger.warning(
+                f"Failed to parse JSON part: {json_array_str}, JSON parse error: {str(e)}"
+            )
+
         return StreamingParseResult(normal_text=normal_text, calls=calls)

+    def _extract_json_array(self, text: str) -> str:
+        """
+        Extract the JSON array part using bracket counting to handle nested brackets.
+
+        :param text: The complete text containing [TOOL_CALLS] [...]
+        :return: The JSON array string or None if not found
+        """
+        start_idx = text.find(self.bot_token)
+        if start_idx == -1:
+            return None
+
+        # Start from the opening bracket after [TOOL_CALLS]
+        json_start = (
+            start_idx + len(self.bot_token) - 1
+        )  # -1 to include the opening bracket
+        bracket_count = 0
+        in_string = False
+        escape_next = False
+
+        for i in range(json_start, len(text)):
+            char = text[i]
+
+            if escape_next:
+                escape_next = False
+                continue
+
+            if char == "\\":
+                escape_next = True
+                continue
+
+            if char == '"' and not escape_next:
+                in_string = not in_string
+                continue
+
+            if not in_string:
+                if char == "[":
+                    bracket_count += 1
+                elif char == "]":
+                    bracket_count -= 1
+                    if bracket_count == 0:
+                        return text[json_start : i + 1]
+
+        return None
+
     def structure_info(self) -> _GetInfoFunc:
         return lambda name: StructureInfo(
             begin='[TOOL_CALLS] [{"name":"' + name + '", "arguments":',
@@ -78,7 +124,8 @@ class MistralDetector(BaseFormatDetector):
     def build_ebnf(self, tools: List[Tool]):
         return EBNFComposer.build_ebnf(
             tools,
-
-
+            sequence_start_token=self.bot_token,
+            sequence_end_token=self.eot_token,
             function_format="json",
+            tool_call_separator=self.tool_call_separator,
         )
@@ -32,47 +32,79 @@ class PythonicDetector(BaseFormatDetector):
|
|
32
32
|
re.DOTALL,
|
33
33
|
)
|
34
34
|
|
35
|
+
@staticmethod
|
36
|
+
def _text_strip(text: str) -> str:
|
37
|
+
# Llama 4 model sometime will output <|python_start|> and <|python_end|> tokens
|
38
|
+
# remove those tokens
|
39
|
+
text = text.replace("<|python_start|>", "")
|
40
|
+
text = text.replace("<|python_end|>", "")
|
41
|
+
return text
|
42
|
+
|
35
43
|
def has_tool_call(self, text: str) -> bool:
|
36
|
-
return bool(self.tool_call_regex.
|
44
|
+
return bool(self.tool_call_regex.search(self._text_strip(text.strip())))
|
37
45
|
|
38
46
|
def detect_and_parse(self, text: str, tools: List[Tool]) -> StreamingParseResult:
|
39
47
|
# Try parsing the text as a Python list of function calls
|
40
48
|
text = text.strip()
|
41
|
-
|
42
|
-
|
49
|
+
|
50
|
+
# Remove unexpected <|python_start|> and <|python_end|> for llama4
|
51
|
+
text = self._text_strip(text)
|
52
|
+
|
53
|
+
match = self.tool_call_regex.search(text)
|
54
|
+
if match is None:
|
43
55
|
return StreamingParseResult(normal_text=text, calls=[])
|
56
|
+
|
57
|
+            # Extract the tool call part and any text before/after it
+            tool_call_start = match.start()
+            tool_call_end = match.end()
+
+            normal_text_before = text[:tool_call_start] if tool_call_start > 0 else ""
+            tool_call_text = text[tool_call_start:tool_call_end]
+            normal_text_after = text[tool_call_end:] if tool_call_end < len(text) else ""
+
+            # Combine normal text
+            normal_text = normal_text_before + normal_text_after
+
         try:
-            module = ast.parse(
+            module = ast.parse(tool_call_text)
             parsed = getattr(module.body[0], "value", None)
             if not (
                 isinstance(parsed, ast.List)
                 and all(isinstance(e, ast.Call) for e in parsed.elts)
             ):
-                return StreamingParseResult(normal_text=
+                return StreamingParseResult(normal_text=normal_text, calls=[])
+
             calls = []
             tool_indices = {
                 tool.function.name: i
                 for i, tool in enumerate(tools)
                 if tool.function.name
             }
-            for call in parsed.elts:
+            for call_index, call in enumerate(parsed.elts):
                 if not isinstance(call.func, ast.Name):
                     continue
                 function_name = call.func.id
+                # Validate that the function exists in the tools
+                if function_name not in tool_indices:
+                    logger.warning(
+                        f"Model attempted to call undefined function: {function_name}"
+                    )
+                    continue
                 arguments = {}
                 for keyword in call.keywords:
                     arguments[keyword.arg] = self._get_parameter_value(keyword.value)
                 calls.append(
                     ToolCallItem(
-                        tool_index=
+                        tool_index=call_index,  # Use the call index in the response, not tool position
                         name=function_name,
                         parameters=json.dumps(arguments, ensure_ascii=False),
                     )
                 )
-
+
+            return StreamingParseResult(normal_text=normal_text, calls=calls)
         except Exception:
             logger.exception("Error in pythonic tool call parsing.")
-            return StreamingParseResult(normal_text=
+            return StreamingParseResult(normal_text=normal_text, calls=[])
 
     def _find_matching_bracket(self, buffer: str, start: int) -> int:
         """
@@ -96,6 +128,30 @@ class PythonicDetector(BaseFormatDetector):
                     return i
         return -1  # No matching bracket found
 
+    def _strip_and_split_buffer(self, buffer: str) -> tuple[str, str]:
+        """
+        Strip special tokens from buffer and split into safe_text and held_back_text.
+
+        Returns:
+            tuple of (safe_text_to_output, text_to_hold_in_buffer)
+        """
+        # Check if original buffer ends with a partial token at the end
+        special_tokens = ["<|python_start|>", "<|python_end|>"]
+
+        for token in special_tokens:
+            partial_length = self._ends_with_partial_token(buffer, token)
+            if partial_length > 0:
+                # Split buffer: safe part + held back partial token
+                safe_text = buffer[:-partial_length]
+                held_back = buffer[-partial_length:]
+                # Strip complete special tokens from safe part only
+                safe_text = self._text_strip(safe_text)
+                return safe_text, held_back
+
+        # No partial tokens found, strip complete tokens from entire buffer
+        safe_text = self._text_strip(buffer)
+        return safe_text, ""
+
     def parse_streaming_increment(
         self, new_text: str, tools: List[Tool]
     ) -> StreamingParseResult:
@@ -105,20 +161,28 @@ class PythonicDetector(BaseFormatDetector):
         then parses and emits any detected calls.
         """
         self._buffer += new_text
-
+
+        # Strip special tokens from entire buffer and handle partial tokens
+        stripped_buffer, held_back = self._strip_and_split_buffer(self._buffer)
+
+        start = stripped_buffer.find("[")
 
         if start == -1:
-
-            self._buffer =
-            return StreamingParseResult(normal_text=
+            # No tool call bracket found
+            self._buffer = held_back
+            return StreamingParseResult(normal_text=stripped_buffer)
 
-        normal_text =
+        normal_text = stripped_buffer[:start] if start > 0 else ""
 
-        end = self._find_matching_bracket(
+        end = self._find_matching_bracket(stripped_buffer, start)
         if end != -1:
-
+            # Found complete tool call
+            call_text = stripped_buffer[start : end + 1]
             result = self.detect_and_parse(call_text, tools)
-
+
+            # Update buffer with remaining text after tool call plus any held back text
+            remaining_text = stripped_buffer[end + 1 :] + held_back
+            self._buffer = remaining_text
 
             # If we had normal text before the tool call, add it to the result
             if normal_text:
@@ -127,8 +191,10 @@ class PythonicDetector(BaseFormatDetector):
             return result
 
         # We have an opening bracket but no closing bracket yet
+        # Put back everything from the bracket onwards plus held back text
+        self._buffer = stripped_buffer[start:] + held_back
+
         if normal_text:
-            self._buffer = self._buffer[start:]
             return StreamingParseResult(normal_text=normal_text)
 
         # Otherwise, we're still accumulating a potential tool call
@@ -156,8 +222,8 @@ class PythonicDetector(BaseFormatDetector):
     def build_ebnf(self, tools: List[Tool]) -> Optional[str]:
         return EBNFComposer.build_ebnf(
             tools,
-
-
+            sequence_start_token="[",
+            sequence_end_token="]",
             tool_call_separator=",",
             function_format="pythonic",
         )
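The streaming changes above revolve around one idea: any suffix of the buffer that could be the beginning of a special token such as `<|python_start|>` is held back rather than emitted as normal text, while complete special tokens are stripped. The sketch below re-implements that splitting logic as standalone functions; the helper names and the `__main__` demo are illustrative assumptions for this write-up, not sglang's actual API.

# Illustrative sketch of the "hold back a possible partial special token" idea
# behind PythonicDetector._strip_and_split_buffer. Names here are hypothetical.

SPECIAL_TOKENS = ["<|python_start|>", "<|python_end|>"]


def ends_with_partial_token(buffer: str, token: str) -> int:
    """Return the length of the longest suffix of `buffer` that is a proper
    prefix of `token`, or 0 if there is none."""
    for length in range(min(len(buffer), len(token) - 1), 0, -1):
        if buffer.endswith(token[:length]):
            return length
    return 0


def strip_and_split_buffer(buffer: str) -> tuple[str, str]:
    """Split `buffer` into (safe_text, held_back): complete special tokens are
    stripped from safe_text, and a trailing partial token is held back."""
    for token in SPECIAL_TOKENS:
        partial = ends_with_partial_token(buffer, token)
        if partial > 0:
            safe, held = buffer[:-partial], buffer[-partial:]
            break
    else:
        safe, held = buffer, ""
    for token in SPECIAL_TOKENS:
        safe = safe.replace(token, "")
    return safe, held


if __name__ == "__main__":
    # "<|py" could be the start of "<|python_start|>", so it is held back.
    print(strip_and_split_buffer("hello <|py"))             # ('hello ', '<|py')
    # A complete special token is stripped from the emitted text.
    print(strip_and_split_buffer("<|python_start|>[foo("))  # ('[foo(', '')

Holding back at most a token-length suffix keeps streaming latency low while guaranteeing that a special token split across two chunks is never leaked to the client.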
sglang/srt/function_call/qwen25_detector.py

@@ -1,4 +1,5 @@
 import json
+import logging
 import re
 from typing import List
 
@@ -11,12 +12,14 @@ from sglang.srt.function_call.core_types import (
 from sglang.srt.function_call.ebnf_composer import EBNFComposer
 from sglang.srt.openai_api.protocol import Tool
 
+logger = logging.getLogger(__name__)
+
 
 class Qwen25Detector(BaseFormatDetector):
     """
     Detector for Qwen 2.5 models.
     Assumes function call format:
-        <tool_call
+        <tool_call>\n{"name":"func1", "arguments":{...}}\n</tool_call>\n<tool_call>\n{"name":"func2", "arguments":{...}}\n</tool_call>
     """
 
     def __init__(self):
@@ -24,8 +27,10 @@ class Qwen25Detector(BaseFormatDetector):
         Initializes the detector with necessary state variables.
         """
         super().__init__()
-        self.bot_token = "<tool_call
-        self.eot_token = "</tool_call>"
+        self.bot_token = "<tool_call>\n"
+        self.eot_token = "\n</tool_call>"
+        self.tool_call_separator = "\n"
+        self._normal_text_buffer = ""  # Buffer for handling partial end tokens
 
     def has_tool_call(self, text: str) -> bool:
         """Check if the text contains a Qwen 2.5 format tool call."""
@@ -43,25 +48,74 @@ class Qwen25Detector(BaseFormatDetector):
         normal_text = text[:idx].strip() if idx != -1 else text
         if self.bot_token not in text:
             return StreamingParseResult(normal_text=normal_text, calls=[])
-
+
+        # Find all <tool_call>\n...\n</tool_call> blocks
+        pattern = rf"{re.escape(self.bot_token)}(.*?){re.escape(self.eot_token)}"
         match_result_list = re.findall(pattern, text, re.DOTALL)
         calls = []
         for match_result in match_result_list:
-
-
+            try:
+                parsed_call = json.loads(match_result.strip())
+                calls.extend(self.parse_base_json(parsed_call, tools))
+            except json.JSONDecodeError as e:
+                logger.warning(
+                    f"Failed to parse JSON part: {match_result}, JSON parse error: {str(e)}"
+                )
+                continue
         return StreamingParseResult(normal_text=normal_text, calls=calls)
 
+    def parse_streaming_increment(
+        self, new_text: str, tools: List[Tool]
+    ) -> StreamingParseResult:
+        """
+        Streaming incremental parsing for Qwen 2.5 tool calls.
+        Uses base class implementation with buffering to handle partial end tokens.
+        """
+        result = super().parse_streaming_increment(new_text, tools)
+
+        # Handle partial end tokens that are streamed character by character
+        if result.normal_text:
+            self._normal_text_buffer += result.normal_text
+
+            # Check if buffer contains complete end token (without leading newline)
+            end_token_without_newline = self.eot_token[1:]  # "</tool_call>"
+            if end_token_without_newline in self._normal_text_buffer:
+                cleaned_text = self._normal_text_buffer.replace(
+                    end_token_without_newline, ""
+                )
+                self._normal_text_buffer = ""
+                result.normal_text = cleaned_text
+            else:
+                # Check if buffer might contain partial end token at the end
+                partial_match_len = self._ends_with_partial_token(
+                    self._normal_text_buffer, end_token_without_newline
+                )
+
+                if partial_match_len:
+                    # Keep potential partial match in buffer, return the rest
+                    result.normal_text = self._normal_text_buffer[:-partial_match_len]
+                    self._normal_text_buffer = self._normal_text_buffer[
+                        -partial_match_len:
+                    ]
+                else:
+                    # No partial match, return all buffered text
+                    result.normal_text = self._normal_text_buffer
+                    self._normal_text_buffer = ""
+
+        return result
+
     def structure_info(self) -> _GetInfoFunc:
         return lambda name: StructureInfo(
-            begin='<tool_call
-            end="}</tool_call>",
+            begin='<tool_call>\n{"name":"' + name + '", "arguments":',
+            end="}\n</tool_call>",
             trigger="<tool_call>",
         )
 
     def build_ebnf(self, tools: List[Tool]):
         return EBNFComposer.build_ebnf(
             tools,
-
-
+            individual_call_start_token=self.bot_token.replace("\n", "\\n"),
+            individual_call_end_token=self.eot_token.replace("\n", "\\n"),
+            tool_call_separator="\\n",
             function_format="json",
         )
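The reworked `detect_and_parse` above reduces to a regex over `<tool_call>\n...\n</tool_call>` blocks followed by `json.loads` on each block. Below is a minimal standalone sketch of that flow, returning plain dicts instead of sglang's `StreamingParseResult`/`ToolCallItem` types (an illustrative assumption, not the detector's real return values).

import json
import re

# Same delimiters the Qwen25Detector configures in __init__ above.
BOT_TOKEN = "<tool_call>\n"
EOT_TOKEN = "\n</tool_call>"


def extract_qwen25_tool_calls(text: str) -> tuple[str, list[dict]]:
    """Return (normal_text, tool_calls) for Qwen 2.5 style model output."""
    idx = text.find(BOT_TOKEN)
    normal_text = text[:idx].strip() if idx != -1 else text
    calls = []
    pattern = rf"{re.escape(BOT_TOKEN)}(.*?){re.escape(EOT_TOKEN)}"
    for block in re.findall(pattern, text, re.DOTALL):
        try:
            calls.append(json.loads(block.strip()))
        except json.JSONDecodeError:
            # Malformed blocks are skipped, mirroring the warning-and-continue
            # behaviour in the detector above.
            continue
    return normal_text, calls


if __name__ == "__main__":
    sample = (
        "Sure, calling the tool now.\n"
        '<tool_call>\n{"name": "get_weather", "arguments": {"city": "Paris"}}\n</tool_call>'
    )
    print(extract_qwen25_tool_calls(sample))
    # ('Sure, calling the tool now.', [{'name': 'get_weather', 'arguments': {'city': 'Paris'}}])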
sglang/srt/function_call/utils.py

@@ -18,6 +18,23 @@ def _find_common_prefix(s1: str, s2: str) -> str:
 
 
 def _partial_json_loads(input_str: str, flags: Allow) -> Tuple[Any, int]:
+    """
+    Parse incomplete or partial JSON strings commonly encountered during streaming.
+
+    Args:
+        input_str (str): The potentially incomplete JSON string to parse.
+        flags (Allow): Bitwise flags controlling what types of partial data are allowed.
+            Common flags include:
+            - Allow.STR: Allow partial strings (e.g., '"hello wo' -> 'hello wo')
+            - Allow.OBJ: Allow partial objects (e.g., '{"key":' -> {'key': None})
+            - Allow.ARR: Allow partial arrays (e.g., '[1, 2,' -> [1, 2])
+            - Allow.ALL: Allow all types of partial data
+
+    Returns:
+        Tuple[Any, int]: A tuple containing:
+            - parsed_object: The Python object parsed from the JSON
+            - consumed_length: Number of characters consumed from input_str
+    """
     try:
         return (partial_json_parser.loads(input_str, flags), len(input_str))
     except JSONDecodeError as e: