sglang 0.4.6.post4__py3-none-any.whl → 0.4.7__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- sglang/bench_offline_throughput.py +16 -10
- sglang/bench_one_batch.py +5 -4
- sglang/bench_one_batch_server.py +86 -22
- sglang/bench_serving.py +197 -110
- sglang/compile_deep_gemm.py +4 -4
- sglang/lang/backend/runtime_endpoint.py +24 -1
- sglang/profiler.py +167 -0
- sglang/srt/_custom_ops.py +34 -0
- sglang/srt/configs/internvl.py +8 -12
- sglang/srt/configs/model_config.py +66 -29
- sglang/srt/constrained/base_grammar_backend.py +5 -2
- sglang/srt/constrained/llguidance_backend.py +9 -8
- sglang/srt/constrained/outlines_backend.py +5 -4
- sglang/srt/constrained/xgrammar_backend.py +18 -18
- sglang/srt/conversation.py +47 -9
- sglang/srt/custom_op.py +38 -3
- sglang/srt/debug_utils.py +74 -0
- sglang/srt/disaggregation/common/__init__.py +1 -0
- sglang/srt/disaggregation/common/conn.py +407 -0
- sglang/srt/disaggregation/decode.py +187 -134
- sglang/srt/disaggregation/decode_schedule_batch_mixin.py +142 -0
- sglang/srt/disaggregation/fake/conn.py +4 -13
- sglang/srt/disaggregation/kv_events.py +412 -0
- sglang/srt/disaggregation/launch_lb.py +140 -0
- sglang/srt/disaggregation/mini_lb.py +84 -70
- sglang/srt/disaggregation/mooncake/conn.py +441 -140
- sglang/srt/disaggregation/mooncake/transfer_engine.py +31 -14
- sglang/srt/disaggregation/nixl/conn.py +124 -442
- sglang/srt/disaggregation/prefill.py +128 -44
- sglang/srt/disaggregation/utils.py +154 -6
- sglang/srt/distributed/device_communicators/pymscclpp.py +315 -0
- sglang/srt/distributed/parallel_state.py +52 -5
- sglang/srt/distributed/utils.py +3 -3
- sglang/srt/entrypoints/EngineBase.py +11 -0
- sglang/srt/entrypoints/engine.py +129 -12
- sglang/srt/entrypoints/http_server.py +21 -6
- sglang/srt/entrypoints/http_server_engine.py +5 -2
- sglang/srt/function_call/base_format_detector.py +302 -0
- sglang/srt/function_call/core_types.py +34 -0
- sglang/srt/function_call/deepseekv3_detector.py +205 -0
- sglang/srt/function_call/ebnf_composer.py +248 -0
- sglang/srt/function_call/function_call_parser.py +202 -0
- sglang/srt/function_call/llama32_detector.py +93 -0
- sglang/srt/function_call/mistral_detector.py +131 -0
- sglang/srt/function_call/pythonic_detector.py +229 -0
- sglang/srt/function_call/qwen25_detector.py +121 -0
- sglang/srt/function_call/utils.py +52 -0
- sglang/srt/hf_transformers_utils.py +50 -7
- sglang/srt/layers/attention/aiter_backend.py +878 -0
- sglang/srt/layers/attention/base_attn_backend.py +4 -0
- sglang/srt/layers/attention/cutlass_mla_backend.py +2 -19
- sglang/srt/layers/attention/flashattention_backend.py +166 -35
- sglang/srt/layers/attention/flashinfer_backend.py +45 -1
- sglang/srt/layers/attention/flashinfer_mla_backend.py +45 -5
- sglang/srt/layers/attention/flashmla_backend.py +340 -78
- sglang/srt/layers/attention/intel_amx_backend.py +128 -0
- sglang/srt/layers/attention/tbo_backend.py +232 -0
- sglang/srt/layers/attention/torch_native_backend.py +3 -0
- sglang/srt/layers/attention/triton_backend.py +247 -5
- sglang/srt/layers/attention/triton_ops/extend_attention.py +12 -4
- sglang/srt/layers/attention/utils.py +2 -2
- sglang/srt/layers/attention/vision.py +1 -1
- sglang/srt/layers/communicator.py +517 -0
- sglang/srt/layers/dp_attention.py +6 -15
- sglang/srt/layers/layernorm.py +30 -19
- sglang/srt/layers/moe/cutlass_moe.py +370 -0
- sglang/srt/layers/moe/cutlass_moe_params.py +169 -0
- sglang/srt/layers/moe/ep_moe/kernels.py +60 -17
- sglang/srt/layers/moe/ep_moe/layer.py +195 -87
- sglang/srt/layers/moe/ep_moe/token_dispatcher.py +88 -8
- sglang/srt/layers/moe/fused_moe_native.py +4 -0
- sglang/srt/layers/moe/fused_moe_triton/configs/triton_3_2_0/E=257,N=128,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8,block_shape=[128, 128].json +146 -0
- sglang/srt/layers/moe/fused_moe_triton/configs/triton_3_2_0/E=257,N=128,device_name=NVIDIA_H20,dtype=fp8_w8a8,block_shape=[128, 128].json +146 -0
- sglang/srt/layers/moe/fused_moe_triton/configs/triton_3_2_0/E=257,N=256,device_name=NVIDIA_B200,dtype=fp8_w8a8,block_shape=[128, 128].json +146 -0
- sglang/srt/layers/moe/fused_moe_triton/configs/triton_3_2_0/E=257,N=256,device_name=NVIDIA_H20,dtype=fp8_w8a8,block_shape=[128, 128].json +146 -0
- sglang/srt/layers/moe/fused_moe_triton/configs/triton_3_2_0/E=257,N=256,device_name=NVIDIA_H200,dtype=fp8_w8a8,block_shape=[128, 128].json +146 -0
- sglang/srt/layers/moe/fused_moe_triton/configs/triton_3_2_0/E=8,N=7168,device_name=NVIDIA_H100_80GB_HBM3.json +146 -0
- sglang/srt/layers/moe/fused_moe_triton/configs/triton_3_3_1/E=257,N=256,device_name=NVIDIA_B200,dtype=fp8_w8a8,block_shape=[128, 128].json +146 -0
- sglang/srt/layers/moe/fused_moe_triton/configs/triton_3_3_1/E=8,N=7168,device_name=NVIDIA_H100_80GB_HBM3.json +146 -0
- sglang/srt/layers/moe/fused_moe_triton/fused_moe.py +220 -25
- sglang/srt/layers/moe/fused_moe_triton/layer.py +48 -4
- sglang/srt/layers/moe/topk.py +107 -24
- sglang/srt/layers/multimodal.py +70 -0
- sglang/srt/layers/quantization/__init__.py +10 -4
- sglang/srt/layers/quantization/blockwise_int8.py +3 -0
- sglang/srt/layers/quantization/compressed_tensors/compressed_tensors_moe.py +5 -0
- sglang/srt/layers/quantization/deep_gemm.py +60 -59
- sglang/srt/layers/quantization/fp8.py +113 -18
- sglang/srt/layers/quantization/fp8_kernel.py +118 -66
- sglang/srt/layers/quantization/fp8_utils.py +165 -43
- sglang/srt/layers/quantization/gptq.py +298 -6
- sglang/srt/layers/quantization/int8_kernel.py +18 -5
- sglang/srt/layers/quantization/modelopt_quant.py +334 -7
- sglang/srt/layers/quantization/moe_wna16.py +3 -0
- sglang/srt/layers/quantization/qoq.py +244 -0
- sglang/srt/layers/quantization/w8a8_fp8.py +3 -0
- sglang/srt/layers/quantization/w8a8_int8.py +3 -0
- sglang/srt/layers/rotary_embedding.py +6 -12
- sglang/srt/layers/sampler.py +80 -79
- sglang/srt/layers/utils.py +6 -0
- sglang/srt/lora/layers.py +12 -15
- sglang/srt/lora/lora.py +49 -5
- sglang/srt/lora/lora_manager.py +20 -8
- sglang/srt/lora/mem_pool.py +24 -16
- sglang/srt/lora/utils.py +17 -13
- sglang/srt/managers/data_parallel_controller.py +13 -5
- sglang/srt/managers/eplb_algorithms/__init__.py +63 -0
- sglang/srt/managers/eplb_algorithms/deepseek.py +223 -0
- sglang/srt/managers/eplb_algorithms/deepseek_vec.py +276 -0
- sglang/srt/managers/eplb_manager.py +96 -0
- sglang/srt/managers/expert_distribution.py +878 -56
- sglang/srt/managers/expert_location.py +448 -0
- sglang/srt/managers/expert_location_dispatch.py +108 -0
- sglang/srt/managers/io_struct.py +29 -5
- sglang/srt/managers/mm_utils.py +355 -151
- sglang/srt/managers/multimodal_processors/base_processor.py +299 -42
- sglang/srt/managers/multimodal_processors/deepseek_vl_v2.py +6 -1
- sglang/srt/managers/multimodal_processors/gemma3.py +15 -17
- sglang/srt/managers/multimodal_processors/internvl.py +18 -5
- sglang/srt/managers/multimodal_processors/janus_pro.py +7 -1
- sglang/srt/managers/multimodal_processors/kimi_vl.py +14 -32
- sglang/srt/managers/multimodal_processors/llava.py +3 -3
- sglang/srt/managers/multimodal_processors/minicpm.py +27 -32
- sglang/srt/managers/multimodal_processors/mllama4.py +6 -0
- sglang/srt/managers/multimodal_processors/phi4mm.py +87 -0
- sglang/srt/managers/multimodal_processors/pixtral.py +9 -9
- sglang/srt/managers/multimodal_processors/qwen_vl.py +35 -35
- sglang/srt/managers/schedule_batch.py +185 -55
- sglang/srt/managers/schedule_policy.py +4 -5
- sglang/srt/managers/scheduler.py +389 -154
- sglang/srt/managers/session_controller.py +1 -1
- sglang/srt/managers/tokenizer_manager.py +231 -39
- sglang/srt/managers/utils.py +0 -4
- sglang/srt/mem_cache/base_prefix_cache.py +3 -0
- sglang/srt/mem_cache/chunk_cache.py +3 -1
- sglang/srt/mem_cache/hiradix_cache.py +4 -4
- sglang/srt/mem_cache/memory_pool.py +74 -52
- sglang/srt/mem_cache/multimodal_cache.py +45 -0
- sglang/srt/mem_cache/radix_cache.py +58 -5
- sglang/srt/metrics/collector.py +11 -2
- sglang/srt/mm_utils.py +10 -0
- sglang/srt/model_executor/cuda_graph_runner.py +87 -65
- sglang/srt/model_executor/expert_location_updater.py +557 -0
- sglang/srt/model_executor/forward_batch_info.py +39 -14
- sglang/srt/model_executor/model_runner.py +231 -101
- sglang/srt/model_loader/loader.py +10 -6
- sglang/srt/model_loader/utils.py +67 -1
- sglang/srt/models/clip.py +5 -1
- sglang/srt/models/deepseek_nextn.py +1 -1
- sglang/srt/models/deepseek_v2.py +732 -403
- sglang/srt/models/exaone.py +8 -3
- sglang/srt/models/gemma3_causal.py +7 -0
- sglang/srt/models/gemma3_mm.py +75 -33
- sglang/srt/models/idefics2.py +342 -0
- sglang/srt/models/kimi_vl.py +4 -4
- sglang/srt/models/llama.py +1 -1
- sglang/srt/models/llama4.py +10 -2
- sglang/srt/models/llava.py +26 -18
- sglang/srt/models/mimo_mtp.py +220 -0
- sglang/srt/models/minicpmo.py +7 -17
- sglang/srt/models/minicpmv.py +3 -295
- sglang/srt/models/mistral.py +71 -1
- sglang/srt/models/mllama.py +3 -3
- sglang/srt/models/phi4mm.py +512 -0
- sglang/srt/models/qwen2.py +133 -35
- sglang/srt/models/qwen2_5_vl.py +5 -3
- sglang/srt/models/qwen2_eagle.py +4 -1
- sglang/srt/models/qwen2_moe.py +206 -69
- sglang/srt/models/qwen2_vl.py +3 -3
- sglang/srt/models/qwen3.py +92 -19
- sglang/srt/models/qwen3_moe.py +457 -55
- sglang/srt/models/registry.py +9 -1
- sglang/srt/models/siglip.py +294 -0
- sglang/srt/models/transformers.py +291 -0
- sglang/srt/openai_api/adapter.py +114 -40
- sglang/srt/openai_api/protocol.py +37 -2
- sglang/srt/openai_api/utils.py +172 -0
- sglang/srt/operations.py +189 -0
- sglang/srt/operations_strategy.py +207 -0
- sglang/srt/sampling/sampling_batch_info.py +13 -1
- sglang/srt/sampling/sampling_params.py +2 -1
- sglang/srt/server_args.py +235 -38
- sglang/srt/speculative/build_eagle_tree.py +8 -8
- sglang/srt/speculative/eagle_draft_cuda_graph_runner.py +8 -11
- sglang/srt/speculative/eagle_draft_extend_cuda_graph_runner.py +253 -0
- sglang/srt/speculative/eagle_utils.py +181 -90
- sglang/srt/speculative/eagle_worker.py +146 -21
- sglang/srt/two_batch_overlap.py +635 -0
- sglang/srt/utils.py +197 -19
- sglang/test/runners.py +16 -7
- sglang/test/send_one.py +4 -0
- sglang/test/test_cutlass_moe.py +278 -0
- sglang/test/test_fp4_moe.py +248 -0
- sglang/test/test_utils.py +81 -42
- sglang/utils.py +2 -2
- sglang/version.py +1 -1
- {sglang-0.4.6.post4.dist-info → sglang-0.4.7.dist-info}/METADATA +31 -19
- sglang-0.4.7.dist-info/RECORD +699 -0
- {sglang-0.4.6.post4.dist-info → sglang-0.4.7.dist-info}/WHEEL +1 -1
- sglang/srt/function_call_parser.py +0 -858
- sglang/srt/platforms/interface.py +0 -371
- sglang-0.4.6.post4.dist-info/RECORD +0 -646
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=1,N=14336,device_name=NVIDIA_A100-SXM4-80GB,dtype=int8_w8a16.json → triton_3_1_0/E=1,N=14336,device_name=NVIDIA_A100-SXM4-80GB,dtype=int8_w8a16.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=1,N=14336,device_name=NVIDIA_A100-SXM4-80GB.json → triton_3_1_0/E=1,N=14336,device_name=NVIDIA_A100-SXM4-80GB.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=1,N=1792,device_name=NVIDIA_A100-SXM4-80GB,dtype=int8_w8a16.json → triton_3_1_0/E=1,N=1792,device_name=NVIDIA_A100-SXM4-80GB,dtype=int8_w8a16.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=1,N=1792,device_name=NVIDIA_A100-SXM4-80GB.json → triton_3_1_0/E=1,N=1792,device_name=NVIDIA_A100-SXM4-80GB.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=1,N=3072,device_name=NVIDIA_A100-SXM4-80GB,dtype=int8_w8a16.json → triton_3_1_0/E=1,N=3072,device_name=NVIDIA_A100-SXM4-80GB,dtype=int8_w8a16.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=1,N=3072,device_name=NVIDIA_H100_80GB_HBM3,dtype=int8_w8a16.json → triton_3_1_0/E=1,N=3072,device_name=NVIDIA_H100_80GB_HBM3,dtype=int8_w8a16.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=1,N=3072,device_name=NVIDIA_H100_80GB_HBM3.json → triton_3_1_0/E=1,N=3072,device_name=NVIDIA_H100_80GB_HBM3.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=1,N=3584,device_name=NVIDIA_A100-SXM4-80GB,dtype=int8_w8a16.json → triton_3_1_0/E=1,N=3584,device_name=NVIDIA_A100-SXM4-80GB,dtype=int8_w8a16.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=1,N=3584,device_name=NVIDIA_A100-SXM4-80GB.json → triton_3_1_0/E=1,N=3584,device_name=NVIDIA_A100-SXM4-80GB.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=1,N=7168,device_name=NVIDIA_A100-SXM4-80GB,dtype=int8_w8a16.json → triton_3_1_0/E=1,N=7168,device_name=NVIDIA_A100-SXM4-80GB,dtype=int8_w8a16.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=1,N=7168,device_name=NVIDIA_A100-SXM4-80GB.json → triton_3_1_0/E=1,N=7168,device_name=NVIDIA_A100-SXM4-80GB.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=144,N=512,device_name=NVIDIA_H100_80GB_HBM3.json → triton_3_1_0/E=144,N=512,device_name=NVIDIA_H100_80GB_HBM3.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=16,N=1024,device_name=NVIDIA_H100_80GB_HBM3.json → triton_3_1_0/E=16,N=1024,device_name=NVIDIA_H100_80GB_HBM3.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=16,N=1024,device_name=NVIDIA_H200.json → triton_3_1_0/E=16,N=1024,device_name=NVIDIA_H200.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=16,N=1344,device_name=NVIDIA_A100-SXM4-40GB.json → triton_3_1_0/E=16,N=1344,device_name=NVIDIA_A100-SXM4-40GB.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=16,N=1344,device_name=NVIDIA_A100-SXM4-80GB.json → triton_3_1_0/E=16,N=1344,device_name=NVIDIA_A100-SXM4-80GB.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=16,N=1344,device_name=NVIDIA_H100_80GB_HBM3.json → triton_3_1_0/E=16,N=1344,device_name=NVIDIA_H100_80GB_HBM3.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=16,N=14336,device_name=NVIDIA_A100-SXM4-80GB,dtype=int8_w8a16.json → triton_3_1_0/E=16,N=14336,device_name=NVIDIA_A100-SXM4-80GB,dtype=int8_w8a16.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=16,N=14336,device_name=NVIDIA_A100-SXM4-80GB.json → triton_3_1_0/E=16,N=14336,device_name=NVIDIA_A100-SXM4-80GB.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=16,N=1792,device_name=NVIDIA_A100-SXM4-80GB,dtype=int8_w8a16.json → triton_3_1_0/E=16,N=1792,device_name=NVIDIA_A100-SXM4-80GB,dtype=int8_w8a16.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=16,N=1792,device_name=NVIDIA_A100-SXM4-80GB.json → triton_3_1_0/E=16,N=1792,device_name=NVIDIA_A100-SXM4-80GB.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=16,N=2048,device_name=NVIDIA_H100_80GB_HBM3.json → triton_3_1_0/E=16,N=2048,device_name=NVIDIA_H100_80GB_HBM3.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=16,N=2688,device_name=NVIDIA_A100-SXM4-80GB.json → triton_3_1_0/E=16,N=2688,device_name=NVIDIA_A100-SXM4-80GB.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=16,N=2688,device_name=NVIDIA_H100_80GB_HBM3.json → triton_3_1_0/E=16,N=2688,device_name=NVIDIA_H100_80GB_HBM3.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=16,N=3072,device_name=NVIDIA_A100-SXM4-80GB,dtype=int8_w8a16.json → triton_3_1_0/E=16,N=3072,device_name=NVIDIA_A100-SXM4-80GB,dtype=int8_w8a16.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=16,N=3072,device_name=NVIDIA_H100_80GB_HBM3,dtype=int8_w8a16.json → triton_3_1_0/E=16,N=3072,device_name=NVIDIA_H100_80GB_HBM3,dtype=int8_w8a16.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=16,N=3200,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8.json → triton_3_1_0/E=16,N=3200,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=16,N=3584,device_name=NVIDIA_A100-SXM4-80GB,dtype=int8_w8a16.json → triton_3_1_0/E=16,N=3584,device_name=NVIDIA_A100-SXM4-80GB,dtype=int8_w8a16.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=16,N=3584,device_name=NVIDIA_A100-SXM4-80GB.json → triton_3_1_0/E=16,N=3584,device_name=NVIDIA_A100-SXM4-80GB.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=16,N=6400,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8.json → triton_3_1_0/E=16,N=6400,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=16,N=7168,device_name=NVIDIA_A100-SXM4-80GB,dtype=int8_w8a16.json → triton_3_1_0/E=16,N=7168,device_name=NVIDIA_A100-SXM4-80GB,dtype=int8_w8a16.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=16,N=7168,device_name=NVIDIA_A100-SXM4-80GB.json → triton_3_1_0/E=16,N=7168,device_name=NVIDIA_A100-SXM4-80GB.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=16,N=7168,device_name=NVIDIA_H100_80GB_HBM3,dtype=int8_w8a16.json → triton_3_1_0/E=16,N=7168,device_name=NVIDIA_H100_80GB_HBM3,dtype=int8_w8a16.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=16,N=800,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8.json → triton_3_1_0/E=16,N=800,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=160,N=192,device_name=NVIDIA_A800-SXM4-80GB.json → triton_3_1_0/E=160,N=192,device_name=NVIDIA_A800-SXM4-80GB.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=20,N=2048,device_name=NVIDIA_H100_80GB_HBM3.json → triton_3_1_0/E=20,N=2048,device_name=NVIDIA_H100_80GB_HBM3.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=24,N=1024,device_name=NVIDIA_H100_80GB_HBM3.json → triton_3_1_0/E=24,N=1024,device_name=NVIDIA_H100_80GB_HBM3.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=256,N=128,device_name=NVIDIA_A100-SXM4-80GB,dtype=int8_w8a8,block_shape=[128, 128].json → triton_3_1_0/E=256,N=128,device_name=NVIDIA_A100-SXM4-80GB,dtype=int8_w8a8,block_shape=[128, 128].json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=256,N=128,device_name=NVIDIA_A100-SXM4-80GB,dtype=int8_w8a8.json → triton_3_1_0/E=256,N=128,device_name=NVIDIA_A100-SXM4-80GB,dtype=int8_w8a8.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=256,N=128,device_name=NVIDIA_A800-SXM4-80GB,dtype=int8_w8a8,block_shape=[128, 128].json → triton_3_1_0/E=256,N=128,device_name=NVIDIA_A800-SXM4-80GB,dtype=int8_w8a8,block_shape=[128, 128].json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=256,N=128,device_name=NVIDIA_A800-SXM4-80GB,dtype=int8_w8a8.json → triton_3_1_0/E=256,N=128,device_name=NVIDIA_A800-SXM4-80GB,dtype=int8_w8a8.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=256,N=128,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8,block_shape=[128, 128].json → triton_3_1_0/E=256,N=128,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8,block_shape=[128, 128].json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=256,N=128,device_name=NVIDIA_H20,block_shape=[128, 128].json → triton_3_1_0/E=256,N=128,device_name=NVIDIA_H20,block_shape=[128, 128].json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=256,N=128,device_name=NVIDIA_H20,dtype=fp8_w8a8,block_shape=[128, 128].json → triton_3_1_0/E=256,N=128,device_name=NVIDIA_H20,dtype=fp8_w8a8,block_shape=[128, 128].json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=256,N=128,device_name=NVIDIA_L20Y,dtype=fp8_w8a8,block_shape=[128, 128].json → triton_3_1_0/E=256,N=128,device_name=NVIDIA_L20Y,dtype=fp8_w8a8,block_shape=[128, 128].json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=256,N=256,device_name=AMD_Instinct_MI300X,dtype=fp8_w8a8,block_shape=[128, 128].json → triton_3_1_0/E=256,N=256,device_name=AMD_Instinct_MI300X,dtype=fp8_w8a8,block_shape=[128, 128].json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=256,N=256,device_name=AMD_Instinct_MI325X,dtype=fp8_w8a8,block_shape=[128, 128].json → triton_3_1_0/E=256,N=256,device_name=AMD_Instinct_MI325X,dtype=fp8_w8a8,block_shape=[128, 128].json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=256,N=256,device_name=AMD_Radeon_Graphics,dtype=fp8_w8a8,block_shape=[128, 128].json → triton_3_1_0/E=256,N=256,device_name=AMD_Radeon_Graphics,dtype=fp8_w8a8,block_shape=[128, 128].json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=256,N=256,device_name=NVIDIA_B200,dtype=fp8_w8a8,block_shape=[128, 128].json → triton_3_1_0/E=256,N=256,device_name=NVIDIA_B200,dtype=fp8_w8a8,block_shape=[128, 128].json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=256,N=256,device_name=NVIDIA_H20,dtype=int8_w8a8,block_shape=[128, 128].json → triton_3_1_0/E=256,N=256,device_name=NVIDIA_H20,dtype=int8_w8a8,block_shape=[128, 128].json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=256,N=256,device_name=NVIDIA_H200,dtype=fp8_w8a8,block_shape=[128, 128].json → triton_3_1_0/E=256,N=256,device_name=NVIDIA_H200,dtype=fp8_w8a8,block_shape=[128, 128].json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=256,N=64,device_name=NVIDIA_A800-SXM4-80GB.json → triton_3_1_0/E=256,N=64,device_name=NVIDIA_A800-SXM4-80GB.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=256,N=64,device_name=NVIDIA_L20,dtype=int8_w8a8.json → triton_3_1_0/E=256,N=64,device_name=NVIDIA_L20,dtype=int8_w8a8.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=256,N=64,device_name=NVIDIA_L40S,dtype=int8_w8a8.json → triton_3_1_0/E=256,N=64,device_name=NVIDIA_L40S,dtype=int8_w8a8.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=64,N=1024,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8,block_shape=[128, 128].json → triton_3_1_0/E=64,N=1024,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8,block_shape=[128, 128].json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=64,N=1280,device_name=NVIDIA_A100-SXM4-80GB.json → triton_3_1_0/E=64,N=1280,device_name=NVIDIA_A100-SXM4-80GB.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=64,N=1280,device_name=NVIDIA_A800-SXM4-80GB.json → triton_3_1_0/E=64,N=1280,device_name=NVIDIA_A800-SXM4-80GB.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=64,N=1280,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8.json → triton_3_1_0/E=64,N=1280,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=64,N=1280,device_name=NVIDIA_H100_80GB_HBM3.json → triton_3_1_0/E=64,N=1280,device_name=NVIDIA_H100_80GB_HBM3.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=64,N=1280,device_name=NVIDIA_H200,dtype=fp8_w8a8.json → triton_3_1_0/E=64,N=1280,device_name=NVIDIA_H200,dtype=fp8_w8a8.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=64,N=1280,device_name=NVIDIA_H200.json → triton_3_1_0/E=64,N=1280,device_name=NVIDIA_H200.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=64,N=2560,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8.json → triton_3_1_0/E=64,N=2560,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=64,N=2560,device_name=NVIDIA_H200,dtype=fp8_w8a8.json → triton_3_1_0/E=64,N=2560,device_name=NVIDIA_H200,dtype=fp8_w8a8.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=64,N=2560,device_name=NVIDIA_H200.json → triton_3_1_0/E=64,N=2560,device_name=NVIDIA_H200.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=64,N=320,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8.json → triton_3_1_0/E=64,N=320,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=64,N=320,device_name=NVIDIA_H100_80GB_HBM3.json → triton_3_1_0/E=64,N=320,device_name=NVIDIA_H100_80GB_HBM3.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=64,N=320,device_name=NVIDIA_H200,dtype=fp8_w8a8.json → triton_3_1_0/E=64,N=320,device_name=NVIDIA_H200,dtype=fp8_w8a8.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=64,N=320,device_name=NVIDIA_H200.json → triton_3_1_0/E=64,N=320,device_name=NVIDIA_H200.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=64,N=512,device_name=NVIDIA_H200,dtype=fp8_w8a8,block_shape=[128, 128].json → triton_3_1_0/E=64,N=512,device_name=NVIDIA_H200,dtype=fp8_w8a8,block_shape=[128, 128].json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=64,N=640,device_name=NVIDIA_A100-SXM4-80GB.json → triton_3_1_0/E=64,N=640,device_name=NVIDIA_A100-SXM4-80GB.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=64,N=640,device_name=NVIDIA_A800-SXM4-80GB.json → triton_3_1_0/E=64,N=640,device_name=NVIDIA_A800-SXM4-80GB.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=64,N=640,device_name=NVIDIA_GeForce_RTX_4090,dtype=fp8_w8a8.json → triton_3_1_0/E=64,N=640,device_name=NVIDIA_GeForce_RTX_4090,dtype=fp8_w8a8.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=64,N=640,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8.json → triton_3_1_0/E=64,N=640,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=64,N=640,device_name=NVIDIA_H100_80GB_HBM3.json → triton_3_1_0/E=64,N=640,device_name=NVIDIA_H100_80GB_HBM3.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=64,N=640,device_name=NVIDIA_H200,dtype=fp8_w8a8.json → triton_3_1_0/E=64,N=640,device_name=NVIDIA_H200,dtype=fp8_w8a8.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=64,N=640,device_name=NVIDIA_H200.json → triton_3_1_0/E=64,N=640,device_name=NVIDIA_H200.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=8,N=14336,device_name=AMD_Instinct_MI300X.json → triton_3_1_0/E=8,N=14336,device_name=AMD_Instinct_MI300X.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=8,N=14336,device_name=AMD_Instinct_MI325X.json → triton_3_1_0/E=8,N=14336,device_name=AMD_Instinct_MI325X.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=8,N=14336,device_name=AMD_Radeon_Graphics.json → triton_3_1_0/E=8,N=14336,device_name=AMD_Radeon_Graphics.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=8,N=14336,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8.json → triton_3_1_0/E=8,N=14336,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=8,N=14336,device_name=NVIDIA_H200,dtype=fp8_w8a8.json → triton_3_1_0/E=8,N=14336,device_name=NVIDIA_H200,dtype=fp8_w8a8.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=8,N=14336,device_name=NVIDIA_H200.json → triton_3_1_0/E=8,N=14336,device_name=NVIDIA_H200.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=8,N=1792,device_name=AMD_Instinct_MI300X.json → triton_3_1_0/E=8,N=1792,device_name=AMD_Instinct_MI300X.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=8,N=1792,device_name=AMD_Instinct_MI325X.json → triton_3_1_0/E=8,N=1792,device_name=AMD_Instinct_MI325X.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=8,N=1792,device_name=AMD_Radeon_Graphics.json → triton_3_1_0/E=8,N=1792,device_name=AMD_Radeon_Graphics.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=8,N=1792,device_name=NVIDIA_A100-SXM4-40GB.json → triton_3_1_0/E=8,N=1792,device_name=NVIDIA_A100-SXM4-40GB.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=8,N=1792,device_name=NVIDIA_A100-SXM4-80GB.json → triton_3_1_0/E=8,N=1792,device_name=NVIDIA_A100-SXM4-80GB.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=8,N=1792,device_name=NVIDIA_H100_80GB_HBM3.json → triton_3_1_0/E=8,N=1792,device_name=NVIDIA_H100_80GB_HBM3.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=8,N=1792,device_name=NVIDIA_H200,dtype=fp8_w8a8.json → triton_3_1_0/E=8,N=1792,device_name=NVIDIA_H200,dtype=fp8_w8a8.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=8,N=1792,device_name=NVIDIA_H200.json → triton_3_1_0/E=8,N=1792,device_name=NVIDIA_H200.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=8,N=2048,device_name=NVIDIA_A100-SXM4-80GB.json → triton_3_1_0/E=8,N=2048,device_name=NVIDIA_A100-SXM4-80GB.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=8,N=2048,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8.json → triton_3_1_0/E=8,N=2048,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=8,N=2048,device_name=NVIDIA_H100_80GB_HBM3.json → triton_3_1_0/E=8,N=2048,device_name=NVIDIA_H100_80GB_HBM3.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=8,N=2048,device_name=NVIDIA_H200,dtype=fp8_w8a8.json → triton_3_1_0/E=8,N=2048,device_name=NVIDIA_H200,dtype=fp8_w8a8.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=8,N=2048,device_name=NVIDIA_H200.json → triton_3_1_0/E=8,N=2048,device_name=NVIDIA_H200.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=8,N=3584,device_name=AMD_Instinct_MI300X.json → triton_3_1_0/E=8,N=3584,device_name=AMD_Instinct_MI300X.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=8,N=3584,device_name=AMD_Instinct_MI325X.json → triton_3_1_0/E=8,N=3584,device_name=AMD_Instinct_MI325X.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=8,N=3584,device_name=AMD_Radeon_Graphics.json → triton_3_1_0/E=8,N=3584,device_name=AMD_Radeon_Graphics.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=8,N=3584,device_name=NVIDIA_A100-SXM4-40GB.json → triton_3_1_0/E=8,N=3584,device_name=NVIDIA_A100-SXM4-40GB.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=8,N=3584,device_name=NVIDIA_A100-SXM4-80GB.json → triton_3_1_0/E=8,N=3584,device_name=NVIDIA_A100-SXM4-80GB.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=8,N=3584,device_name=NVIDIA_GeForce_RTX_4090,dtype=fp8_w8a8.json → triton_3_1_0/E=8,N=3584,device_name=NVIDIA_GeForce_RTX_4090,dtype=fp8_w8a8.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=8,N=3584,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8.json → triton_3_1_0/E=8,N=3584,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=8,N=3584,device_name=NVIDIA_H100_80GB_HBM3.json → triton_3_1_0/E=8,N=3584,device_name=NVIDIA_H100_80GB_HBM3.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=8,N=3584,device_name=NVIDIA_H200,dtype=fp8_w8a8.json → triton_3_1_0/E=8,N=3584,device_name=NVIDIA_H200,dtype=fp8_w8a8.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=8,N=3584,device_name=NVIDIA_H200.json → triton_3_1_0/E=8,N=3584,device_name=NVIDIA_H200.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=8,N=3584,device_name=NVIDIA_L40S.json → triton_3_1_0/E=8,N=3584,device_name=NVIDIA_L40S.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=8,N=4096,device_name=AMD_Instinct_MI300X,dtype=fp8_w8a8.json → triton_3_1_0/E=8,N=4096,device_name=AMD_Instinct_MI300X,dtype=fp8_w8a8.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=8,N=4096,device_name=AMD_Instinct_MI325X,dtype=fp8_w8a8.json → triton_3_1_0/E=8,N=4096,device_name=AMD_Instinct_MI325X,dtype=fp8_w8a8.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=8,N=4096,device_name=AMD_Radeon_Graphics,dtype=fp8_w8a8.json → triton_3_1_0/E=8,N=4096,device_name=AMD_Radeon_Graphics,dtype=fp8_w8a8.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=8,N=4096,device_name=NVIDIA_A100-SXM4-80GB.json → triton_3_1_0/E=8,N=4096,device_name=NVIDIA_A100-SXM4-80GB.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=8,N=4096,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8.json → triton_3_1_0/E=8,N=4096,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=8,N=4096,device_name=NVIDIA_H100_80GB_HBM3.json → triton_3_1_0/E=8,N=4096,device_name=NVIDIA_H100_80GB_HBM3.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=8,N=4096,device_name=NVIDIA_H200,dtype=fp8_w8a8.json → triton_3_1_0/E=8,N=4096,device_name=NVIDIA_H200,dtype=fp8_w8a8.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=8,N=4096,device_name=NVIDIA_H200.json → triton_3_1_0/E=8,N=4096,device_name=NVIDIA_H200.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=8,N=7168,device_name=AMD_Instinct_MI300X.json → triton_3_1_0/E=8,N=7168,device_name=AMD_Instinct_MI300X.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=8,N=7168,device_name=AMD_Instinct_MI325X.json → triton_3_1_0/E=8,N=7168,device_name=AMD_Instinct_MI325X.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=8,N=7168,device_name=AMD_Radeon_Graphics.json → triton_3_1_0/E=8,N=7168,device_name=AMD_Radeon_Graphics.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=8,N=7168,device_name=NVIDIA_A100-SXM4-80GB.json → triton_3_1_0/E=8,N=7168,device_name=NVIDIA_A100-SXM4-80GB.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=8,N=7168,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8.json → triton_3_1_0/E=8,N=7168,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=8,N=7168,device_name=NVIDIA_H100_80GB_HBM3.json → triton_3_1_0/E=8,N=7168,device_name=NVIDIA_H100_80GB_HBM3.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=8,N=7168,device_name=NVIDIA_H200,dtype=fp8_w8a8.json → triton_3_1_0/E=8,N=7168,device_name=NVIDIA_H200,dtype=fp8_w8a8.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=8,N=7168,device_name=NVIDIA_H200.json → triton_3_1_0/E=8,N=7168,device_name=NVIDIA_H200.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=8,N=8192,device_name=AMD_Instinct_MI300X,dtype=fp8_w8a8.json → triton_3_1_0/E=8,N=8192,device_name=AMD_Instinct_MI300X,dtype=fp8_w8a8.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=8,N=8192,device_name=AMD_Instinct_MI325X,dtype=fp8_w8a8.json → triton_3_1_0/E=8,N=8192,device_name=AMD_Instinct_MI325X,dtype=fp8_w8a8.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=8,N=8192,device_name=AMD_Radeon_Graphics,dtype=fp8_w8a8.json → triton_3_1_0/E=8,N=8192,device_name=AMD_Radeon_Graphics,dtype=fp8_w8a8.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=8,N=8192,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8.json → triton_3_1_0/E=8,N=8192,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=8,N=8192,device_name=NVIDIA_H200,dtype=fp8_w8a8.json → triton_3_1_0/E=8,N=8192,device_name=NVIDIA_H200,dtype=fp8_w8a8.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=128,N=192,device_name=NVIDIA_A800-SXM4-80GB.json → triton_3_2_0/E=128,N=192,device_name=NVIDIA_A800-SXM4-80GB.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=128,N=192,device_name=NVIDIA_H100_80GB_HBM3.json → triton_3_2_0/E=128,N=192,device_name=NVIDIA_H100_80GB_HBM3.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=128,N=192,device_name=NVIDIA_H20.json → triton_3_2_0/E=128,N=192,device_name=NVIDIA_H20.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=128,N=192,device_name=NVIDIA_H200.json → triton_3_2_0/E=128,N=192,device_name=NVIDIA_H200.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=128,N=384,device_name=NVIDIA_H100_80GB_HBM3.json → triton_3_2_0/E=128,N=384,device_name=NVIDIA_H100_80GB_HBM3.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=128,N=384,device_name=NVIDIA_H20,dtype=fp8_w8a8,block_shape=[128, 128].json → triton_3_2_0/E=128,N=384,device_name=NVIDIA_H20,dtype=fp8_w8a8,block_shape=[128, 128].json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=128,N=384,device_name=NVIDIA_H20.json → triton_3_2_0/E=128,N=384,device_name=NVIDIA_H20.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=128,N=384,device_name=NVIDIA_H200,dtype=fp8_w8a8,block_shape=[128, 128].json → triton_3_2_0/E=128,N=384,device_name=NVIDIA_H200,dtype=fp8_w8a8,block_shape=[128, 128].json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=128,N=384,device_name=NVIDIA_H200.json → triton_3_2_0/E=128,N=384,device_name=NVIDIA_H200.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=128,N=512,device_name=NVIDIA_H100_80GB_HBM3.json → triton_3_2_0/E=128,N=512,device_name=NVIDIA_H100_80GB_HBM3.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=128,N=768,device_name=NVIDIA_A800-SXM4-80GB.json → triton_3_2_0/E=128,N=768,device_name=NVIDIA_A800-SXM4-80GB.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=128,N=768,device_name=NVIDIA_H100_80GB_HBM3.json → triton_3_2_0/E=128,N=768,device_name=NVIDIA_H100_80GB_HBM3.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=128,N=768,device_name=NVIDIA_H20,dtype=fp8_w8a8,block_shape=[128, 128].json → triton_3_2_0/E=128,N=768,device_name=NVIDIA_H20,dtype=fp8_w8a8,block_shape=[128, 128].json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=128,N=768,device_name=NVIDIA_H20.json → triton_3_2_0/E=128,N=768,device_name=NVIDIA_H20.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=128,N=768,device_name=NVIDIA_H200,dtype=fp8_w8a8,block_shape=[128, 128].json → triton_3_2_0/E=128,N=768,device_name=NVIDIA_H200,dtype=fp8_w8a8,block_shape=[128, 128].json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=128,N=768,device_name=NVIDIA_H200.json → triton_3_2_0/E=128,N=768,device_name=NVIDIA_H200.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=128,N=96,device_name=NVIDIA_H20.json → triton_3_2_0/E=128,N=96,device_name=NVIDIA_H20.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=264,N=128,device_name=NVIDIA_A800-SXM4-80GB,dtype=int8_w8a8.json → triton_3_2_0/E=264,N=128,device_name=NVIDIA_A800-SXM4-80GB,dtype=int8_w8a8.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=264,N=256,device_name=NVIDIA_B200,dtype=fp8_w8a8,block_shape=[128, 128].json → triton_3_2_0/E=264,N=256,device_name=NVIDIA_B200,dtype=fp8_w8a8,block_shape=[128, 128].json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=264,N=256,device_name=NVIDIA_H20,dtype=fp8_w8a8,block_shape=[128, 128].json → triton_3_2_0/E=264,N=256,device_name=NVIDIA_H20,dtype=fp8_w8a8,block_shape=[128, 128].json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=264,N=256,device_name=NVIDIA_H200,dtype=fp8_w8a8,block_shape=[128, 128].json → triton_3_2_0/E=264,N=256,device_name=NVIDIA_H200,dtype=fp8_w8a8,block_shape=[128, 128].json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=272,N=128,device_name=NVIDIA_A800-SXM4-80GB,dtype=int8_w8a8.json → triton_3_2_0/E=272,N=128,device_name=NVIDIA_A800-SXM4-80GB,dtype=int8_w8a8.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=272,N=128,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8,block_shape=[128, 128].json → triton_3_2_0/E=272,N=128,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8,block_shape=[128, 128].json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=272,N=128,device_name=NVIDIA_H20,dtype=fp8_w8a8,block_shape=[128, 128].json → triton_3_2_0/E=272,N=128,device_name=NVIDIA_H20,dtype=fp8_w8a8,block_shape=[128, 128].json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=272,N=64,device_name=NVIDIA_A800-SXM4-80GB.json → triton_3_2_0/E=272,N=64,device_name=NVIDIA_A800-SXM4-80GB.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=288,N=64,device_name=NVIDIA_A800-SXM4-80GB.json → triton_3_2_0/E=288,N=64,device_name=NVIDIA_A800-SXM4-80GB.json} +0 -0
- /sglang/srt/models/{xiaomi_mimo.py → mimo.py} +0 -0
- {sglang-0.4.6.post4.dist-info → sglang-0.4.7.dist-info}/licenses/LICENSE +0 -0
- {sglang-0.4.6.post4.dist-info → sglang-0.4.7.dist-info}/top_level.txt +0 -0
@@ -0,0 +1,205 @@
|
|
1
|
+
import json
|
2
|
+
import logging
|
3
|
+
import re
|
4
|
+
from typing import List
|
5
|
+
|
6
|
+
from sglang.srt.function_call.base_format_detector import BaseFormatDetector
|
7
|
+
from sglang.srt.function_call.core_types import (
|
8
|
+
StreamingParseResult,
|
9
|
+
StructureInfo,
|
10
|
+
ToolCallItem,
|
11
|
+
_GetInfoFunc,
|
12
|
+
)
|
13
|
+
from sglang.srt.function_call.ebnf_composer import EBNFComposer
|
14
|
+
from sglang.srt.function_call.utils import _is_complete_json
|
15
|
+
from sglang.srt.openai_api.protocol import Tool
|
16
|
+
|
17
|
+
# Module-level logger, named after this module via __name__.
logger = logging.getLogger(__name__)
|
18
|
+
|
19
|
+
|
20
|
+
class DeepSeekV3Detector(BaseFormatDetector):
    """
    Detector for DeepSeek models.

    Assumes function call format:
      '<|tool▁calls▁begin|><|tool▁call▁begin|>function<|tool▁sep|>get_current_weather\n```json\n{"location": "Tokyo"}\n```<|tool▁call▁end|>\n<|tool▁call▁begin|>function<|tool▁sep|>get_current_weather\n```json\n{"location": "Paris"}\n```<|tool▁call▁end|><|tool▁calls▁end|><|end▁of▁sentence|>

    The special tokens contain characters that may be regex metacharacters
    (``|``), so every pattern built from them goes through ``re.escape``;
    otherwise the token text would be interpreted as an alternation.
    """

    # Delimiters of a single tool call inside the overall tool-calls section.
    _CALL_BEGIN = "<|tool▁call▁begin|>"
    _CALL_END = "<|tool▁call▁end|>"
    _TOOL_SEP = "<|tool▁sep|>"

    def __init__(self):
        super().__init__()
        self.bot_token = "<|tool▁calls▁begin|>"
        self.eot_token = "<|tool▁calls▁end|>"

        call_begin = re.escape(self._CALL_BEGIN)
        call_end = re.escape(self._CALL_END)
        tool_sep = re.escape(self._TOOL_SEP)
        # Matches one complete tool call block (non-greedy).
        self.func_call_regex = rf"{call_begin}.*?{call_end}"
        # Groups: (1) call type, (2) function name, (3) JSON arguments.
        self.func_detail_regex = (
            rf"{call_begin}(.*){tool_sep}(.*)\n```json\n(.*)\n```{call_end}"
        )
        # Arguments text already streamed for the tool call in progress.
        self._last_arguments = ""
        self.current_tool_id = -1

    def has_tool_call(self, text: str) -> bool:
        """Check if the text contains a deepseek format tool call."""
        return self.bot_token in text

    def detect_and_parse(self, text: str, tools: List[Tool]) -> StreamingParseResult:
        """
        One-time parsing: Detects and parses tool calls in the provided text.

        :param text: The complete text to parse.
        :param tools: List of available tools.
        :return: StreamingParseResult with the text preceding the tool-calls
            section as ``normal_text`` and the parsed calls. If no tool-calls
            section is present, or parsing fails, the whole text is returned
            as ``normal_text``.
        """
        idx = text.find(self.bot_token)
        if idx == -1:
            return StreamingParseResult(normal_text=text, calls=[])

        normal_text = text[:idx].strip()
        match_result_list = re.findall(self.func_call_regex, text, re.DOTALL)
        calls = []
        try:
            for match_result in match_result_list:
                func_detail = re.search(
                    self.func_detail_regex, match_result, re.DOTALL
                )
                if func_detail is None:
                    # Handled by the except below, like any other parse failure.
                    raise ValueError("malformed tool call block")
                func_name = func_detail.group(2)
                func_args = json.loads(func_detail.group(3))
                # construct the dict expected by parse_base_json
                call_dict = {"name": func_name, "parameters": func_args}
                calls.extend(self.parse_base_json(call_dict, tools))
            return StreamingParseResult(normal_text=normal_text, calls=calls)
        except Exception as e:
            logger.error(f"Error in detect_and_parse: {e}")
            # return the normal text if parsing fails
            return StreamingParseResult(normal_text=text)

    def parse_streaming_increment(
        self, new_text: str, tools: List[Tool]
    ) -> StreamingParseResult:
        """
        Streaming incremental parsing tool calls for DeepSeekV3 format.

        Appends ``new_text`` to the internal buffer and emits, in order: the
        tool name (once per call), then argument text deltas. When the
        accumulated arguments form complete JSON, the finished call is removed
        from the buffer and the state advances to the next tool call.

        NOTE(review): ``_buffer``, ``current_tool_name_sent``,
        ``prev_tool_call_arr`` and ``streamed_args_for_tool`` are presumably
        initialised by BaseFormatDetector -- confirm against the base class.
        """
        self._buffer += new_text
        current_text = self._buffer

        # Check if we have a tool call (either the start token or individual tool call)
        has_tool_call = (
            self.bot_token in current_text or self._CALL_BEGIN in current_text
        )

        if not has_tool_call:
            self._buffer = ""
            # Strip stray closing markers so they are not shown as normal text.
            for e_token in [self.eot_token, "```", self._CALL_END]:
                if e_token in new_text:
                    new_text = new_text.replace(e_token, "")
            return StreamingParseResult(normal_text=new_text)

        if not hasattr(self, "_tool_indices"):
            self._tool_indices = {
                tool.function.name: i
                for i, tool in enumerate(tools)
                if tool.function and tool.function.name
            }

        call_begin = re.escape(self._CALL_BEGIN)
        call_end = re.escape(self._CALL_END)
        tool_sep = re.escape(self._TOOL_SEP)

        calls: list[ToolCallItem] = []
        try:
            partial_match = re.search(
                pattern=rf"{call_begin}(.*){tool_sep}(.*)\n```json\n(.*)",
                string=current_text,
                flags=re.DOTALL,
            )
            if partial_match:
                func_name = partial_match.group(2).strip()
                func_args_raw = partial_match.group(3).strip()

                # Initialize state if this is the first tool call
                if self.current_tool_id == -1:
                    self.current_tool_id = 0
                    self.prev_tool_call_arr = []
                    self.streamed_args_for_tool = [""]

                # Ensure we have enough entries in our tracking arrays
                while len(self.prev_tool_call_arr) <= self.current_tool_id:
                    self.prev_tool_call_arr.append({})
                while len(self.streamed_args_for_tool) <= self.current_tool_id:
                    self.streamed_args_for_tool.append("")

                if not self.current_tool_name_sent:
                    calls.append(
                        ToolCallItem(
                            tool_index=self.current_tool_id,
                            name=func_name,
                            parameters="",
                        )
                    )
                    self.current_tool_name_sent = True
                    # Store the tool call info for adapter.py
                    self.prev_tool_call_arr[self.current_tool_id] = {
                        "name": func_name,
                        "arguments": {},
                    }
                else:
                    # Only emit the part of the arguments not sent before.
                    argument_diff = (
                        func_args_raw[len(self._last_arguments) :]
                        if func_args_raw.startswith(self._last_arguments)
                        else func_args_raw
                    )

                    if argument_diff:
                        calls.append(
                            ToolCallItem(
                                tool_index=self.current_tool_id,
                                name=None,
                                parameters=argument_diff,
                            )
                        )
                        self._last_arguments += argument_diff
                        self.streamed_args_for_tool[
                            self.current_tool_id
                        ] += argument_diff

                    if _is_complete_json(func_args_raw):
                        # Update the stored arguments for adapter.py
                        try:
                            parsed_args = json.loads(func_args_raw)
                            self.prev_tool_call_arr[self.current_tool_id][
                                "arguments"
                            ] = parsed_args
                        except json.JSONDecodeError:
                            pass

                        # Find the end of the current tool call and remove only that part from buffer
                        tool_call_end_pattern = rf"{call_begin}.*?{call_end}"
                        match = re.search(
                            tool_call_end_pattern, current_text, re.DOTALL
                        )
                        if match:
                            # Remove the completed tool call from buffer, keep any remaining content
                            self._buffer = current_text[match.end() :]
                        else:
                            self._buffer = ""

                        result = StreamingParseResult(normal_text="", calls=calls)
                        self.current_tool_id += 1
                        self._last_arguments = ""
                        self.current_tool_name_sent = False
                        return result

            return StreamingParseResult(normal_text="", calls=calls)

        except Exception as e:
            logger.error(f"Error in parse_streaming_increment: {e}")
            return StreamingParseResult(normal_text=current_text)

    def structure_info(self) -> _GetInfoFunc:
        """Return a factory mapping a tool name to its begin/end/trigger strings."""
        return lambda name: StructureInfo(
            begin=">" + name + "\n```json\n",
            end="\n```<",
            trigger=">" + name + "\n```json\n",
        )

    def build_ebnf(self, tools: List[Tool]):
        """Build the EBNF grammar constraining output to DeepSeekV3 tool calls."""
        return EBNFComposer.build_ebnf(
            tools,
            sequence_start_token=self.bot_token,
            sequence_end_token=self.eot_token,
            tool_call_separator="",
            call_rule_fmt='"<|tool▁call▁begin|>function<|tool▁sep|>{name}\\n```json\\n" {arguments_rule} "\\n```<|tool▁call▁end|>"',
            function_format="json",
        )
|
@@ -0,0 +1,248 @@
|
|
1
|
+
from typing import Literal, Optional
|
2
|
+
|
3
|
+
|
4
|
+
class EBNFComposer:
    """Compose EBNF grammars that constrain model output to valid tool calls.

    All members are class-level constants or static methods; the class is
    used as a namespace and is never instantiated by the code in this file.
    """

    # Adapted from https://xgrammar.mlc.ai/docs/how_to/ebnf_guided_generation.html#try-out-via-hf-transformers
    json_grammar_ebnf_str = r"""
        json ::= basic_array | basic_object
        basic_any ::= basic_number | basic_string | basic_boolean | basic_null | basic_array | basic_object
        basic_integer ::= ("0" | "-"? [1-9] [0-9]*) ".0"?
        basic_number ::= ("0" | "-"? [1-9] [0-9]*) ("." [0-9]+)? ([eE] [+-]? [0-9]+)?
        basic_string ::= (([\"] basic_string_1 [\"]))
        basic_string_1 ::= "" | [^"\\\x00-\x1F] basic_string_1 | "\\" escape basic_string_1
        escape ::= ["\\/bfnrt] | "u" [A-Fa-f0-9] [A-Fa-f0-9] [A-Fa-f0-9] [A-Fa-f0-9]
        basic_boolean ::= "true" | "false"
        basic_null ::= "null"
        basic_array ::= "[" ("" | ws basic_any (ws "," ws basic_any)*) ws "]"
        basic_object ::= "{" ("" | ws basic_string ws ":" ws basic_any ( ws "," ws basic_string ws ":" ws basic_any)*) ws "}"
        ws ::= [ \n\t]*
        """

    pythonic_grammar_ebnf_str = r"""
        pythonic ::= basic_number | basic_string | basic_array | "True" | "False" | "None"
        basic_any ::= basic_number | basic_string | basic_array | basic_object
        basic_number ::= ("0" | "-"? [1-9] [0-9]*) ("." [0-9]+)? ([eE] [+-]? [0-9]+)?
        basic_string ::= (([\"] basic_string_1 [\"]))
        basic_string_1 ::= "" | [^"\\\x00-\x1F] basic_string_1 | "\\" escape basic_string_1
        escape ::= ["\\/bfnrt] | "u" [A-Fa-f0-9] [A-Fa-f0-9] [A-Fa-f0-9] [A-Fa-f0-9]
        basic_array ::= "[" ("" | ws basic_any (ws "," ws basic_any)*) ws "]"
        basic_object ::= "{" ("" | ws basic_string ws ":" ws basic_any ( ws "," ws basic_string ws ":" ws basic_any)*) ws "}"
        ws ::= [ \n\t]*
        """

    # Default ``call_<name>`` rule per function format.
    CALL_RULE_MAP = {
        "pythonic": 'call_{name} ::= "{name}" "(" {arguments_rule} ")"',
        "json": 'call_{name} ::= "{{" "\\"name\\"" ":" "\\"{name}\\"" ", " "\\"arguments\\"" ":" {arguments_rule} "}}"',
    }

    # Wrapper placed around the joined argument rules.
    ARGUMENTS_RULE_MAP = {
        "pythonic": "{arg_rules}",
        "json": '"{{" {arg_rules} "}}"',
    }

    # Template for a single key/value pair.
    KEY_VALUE_RULE_MAP = {
        "pythonic": '"{key}" "=" {valrule}',
        "json": '"\\"{key}\\"" ":" {valrule}',
    }

    # JSON-schema type name -> grammar rule (JSON output).
    JSON_TYPE_MAPPING = {
        "string": "basic_string",
        "number": "basic_number",
        "integer": "basic_number",
        "boolean": "basic_boolean",
        "null": "basic_null",
        "array": "basic_array",
        "object": "basic_object",
    }

    # JSON-schema type name -> grammar rule (pythonic output).
    PYTHONIC_TYPE_MAPPING = {
        "string": "basic_string",
        "number": "basic_number",
        "integer": "basic_number",
        # Parenthesised: this alternation is embedded inside a key/value
        # sequence, and "|" binds looser than sequencing in EBNF.
        "boolean": '("True" | "False")',
        "null": '"None"',
        "array": "basic_array",
        "object": "basic_object",
    }

    @staticmethod
    def get_value_rule(
        prop: dict, function_format: Literal["pythonic", "json"] = "json"
    ) -> str:
        """Return the grammar rule for one property schema.

        ``enum`` takes precedence over ``type``. When neither key is present
        the format name itself is returned, which is the name of the generic
        root rule of the matching base grammar.
        """
        if "enum" in prop:
            return EBNFComposer._handle_enum(prop, function_format)

        if "type" in prop:
            return EBNFComposer._handle_type(prop, function_format)

        return function_format

    @staticmethod
    def _handle_enum(prop: dict, function_format: str) -> str:
        """Handle enum properties by formatting each value according to type and format."""
        enum_values = prop["enum"]
        prop_type = prop.get("type", "string")

        # Define formatters for different type/format combinations
        formatters = {
            ("string", "json"): lambda v: f'"\\"{v}\\""',
            ("string", "pythonic"): lambda v: f'"\\"{v}\\""',
            ("number", "json"): str,
            ("number", "pythonic"): str,
            ("integer", "json"): str,
            ("integer", "pythonic"): str,
            ("boolean", "json"): lambda v: "true" if v else "false",
            ("boolean", "pythonic"): lambda v: "True" if v else "False",
        }

        # Get the formatter or default to string handling
        formatter = formatters.get(
            (prop_type, function_format),
            formatters[("string", function_format)],  # Default to string handling
        )

        formatted_values = [formatter(value) for value in enum_values]
        enum_rule = " | ".join(formatted_values)

        # Wrap in parentheses if there are multiple values to ensure correct EBNF precedence
        if len(formatted_values) > 1:
            enum_rule = f"({enum_rule})"

        return enum_rule

    @staticmethod
    def _handle_type(prop: dict, function_format: str) -> str:
        """Handle type properties using the appropriate type mapping.

        ``prop["type"]`` may be a single type name or a list of names. Unknown
        names in a list are ignored; if nothing is recognised the format name
        is returned as the fallback rule. A union of several types is wrapped
        in parentheses so it stays a single unit when embedded in a larger
        sequence (same precedence concern as in ``_handle_enum``).
        """
        prop_type = prop["type"]
        type_mapping = (
            EBNFComposer.PYTHONIC_TYPE_MAPPING
            if function_format == "pythonic"
            else EBNFComposer.JSON_TYPE_MAPPING
        )

        if isinstance(prop_type, list):
            type_rules = [
                type_mapping[single_type]
                for single_type in prop_type
                if single_type in type_mapping
            ]
            if not type_rules:
                return function_format
            union_rule = " | ".join(type_rules)
            return f"({union_rule})" if len(type_rules) > 1 else union_rule

        return type_mapping.get(prop_type, function_format)

    @staticmethod
    def build_ebnf(
        tools,
        function_format: Literal["pythonic", "json"] = "json",
        # Parameters for wrapping the entire sequence of tool calls
        sequence_start_token: Optional[str] = None,
        sequence_end_token: Optional[str] = None,
        # Parameters for wrapping individual tool calls
        individual_call_start_token: Optional[str] = None,
        individual_call_end_token: Optional[str] = None,
        # Parameter for separating multiple tool calls
        tool_call_separator: Optional[str] = None,
        call_rule_fmt: Optional[str] = None,
    ):
        """
        Generalized EBNF builder for all detectors.
        Args:
            tools: List of Tool objects to generate EBNF grammar for
            function_format: The format of function calls, either "pythonic" or "json"
            sequence_start_token: Token that wraps the entire sequence of tool calls (start)
            sequence_end_token: Token that wraps the entire sequence of tool calls (end)
            individual_call_start_token: Token that wraps each individual tool call (start)
            individual_call_end_token: Token that wraps each individual tool call (end)
            tool_call_separator: The separator between multiple tool calls
            call_rule_fmt: Optional custom format string for call_{name} rule. It should define each function call's format, with
                the placeholders {name} for the function name and {arguments_rule} for the arguments rule. If None, a default
                format based on function_format will be used.
        Returns:
            The grammar as a single newline-joined string: the root rule, the
            ``function_call`` alternation, one ``call_<name>`` and one
            ``arguments_<name>`` rule per tool, then the base grammar.
        """
        # =================================================================
        # Step 1: Determine the root tool calls rule
        # =================================================================
        # Handle a single function call
        if individual_call_start_token and individual_call_end_token:
            function_call_unit = f'"{individual_call_start_token}" function_call "{individual_call_end_token}"'
        else:
            function_call_unit = "function_call"

        # Handle multiple function calls with separators
        if tool_call_separator is not None:
            base_pattern = f'{function_call_unit} ( "{tool_call_separator}" {function_call_unit} )*'
        else:
            # Assume only support single function call
            base_pattern = function_call_unit

        # Apply sequence-level wrapping if needed
        if sequence_start_token and sequence_end_token:
            root_rule = (
                f'"{sequence_start_token}" {base_pattern} "{sequence_end_token}"'
            )
        else:
            root_rule = base_pattern

        # =================================================================
        # Step 2: Build the header rules
        # =================================================================
        ebnf_lines = [
            f"root ::= {root_rule}",
            "function_call ::= "
            + " | ".join([f"call_{tool.function.name}" for tool in tools]),
        ]

        # =================================================================
        # Step 3: Set up formatting templates
        # =================================================================
        call_template = (
            f"call_{{name}} ::= {call_rule_fmt}"
            if call_rule_fmt
            else EBNFComposer.CALL_RULE_MAP[function_format]
        )
        args_template = EBNFComposer.ARGUMENTS_RULE_MAP[function_format]
        key_value_template = EBNFComposer.KEY_VALUE_RULE_MAP[function_format]

        # =================================================================
        # Step 4: Build rules for each tool
        # =================================================================
        for tool in tools:
            tool_name = tool.function.name
            params = tool.function.parameters or {}
            properties = params.get("properties", {})
            required_props = set(params.get("required", []))

            # Build argument rules for this tool
            arg_rules = []
            for prop_name, prop_schema in properties.items():
                value_rule = EBNFComposer.get_value_rule(prop_schema, function_format)
                # Create key=value pair
                pair = key_value_template.format(key=prop_name, valrule=value_rule)

                # Properties not listed as required are marked optional.
                if prop_name not in required_props:
                    pair = f"[ {pair} ]"

                arg_rules.append(pair)

            # Combine all argument rules
            combined_args = ' "," '.join(arg_rules) if arg_rules else ""
            arguments_rule = args_template.format(arg_rules=combined_args)

            # Add the function call rule and its arguments rule
            ebnf_lines.append(
                call_template.format(
                    name=tool_name, arguments_rule=f"arguments_{tool_name}"
                )
            )
            ebnf_lines.append(f"arguments_{tool_name} ::= {arguments_rule}")

        # =================================================================
        # Step 5: Add base grammar rules
        # =================================================================
        base_grammar = (
            EBNFComposer.pythonic_grammar_ebnf_str
            if function_format == "pythonic"
            else EBNFComposer.json_grammar_ebnf_str
        )
        ebnf_lines.append(base_grammar)

        return "\n".join(ebnf_lines)
|
@@ -0,0 +1,202 @@
|
|
1
|
+
import logging
|
2
|
+
from typing import Any, Dict, List, Literal, Optional, Set, Tuple, Type, Union
|
3
|
+
|
4
|
+
from sglang.srt.function_call.base_format_detector import BaseFormatDetector
|
5
|
+
from sglang.srt.function_call.core_types import ToolCallItem
|
6
|
+
from sglang.srt.function_call.deepseekv3_detector import DeepSeekV3Detector
|
7
|
+
from sglang.srt.function_call.llama32_detector import Llama32Detector
|
8
|
+
from sglang.srt.function_call.mistral_detector import MistralDetector
|
9
|
+
from sglang.srt.function_call.pythonic_detector import PythonicDetector
|
10
|
+
from sglang.srt.function_call.qwen25_detector import Qwen25Detector
|
11
|
+
from sglang.srt.openai_api.protocol import (
|
12
|
+
StructuralTagResponseFormat,
|
13
|
+
StructuresResponseFormat,
|
14
|
+
Tool,
|
15
|
+
ToolChoice,
|
16
|
+
)
|
17
|
+
|
18
|
+
# Module-level logger, named after this module via __name__.
logger = logging.getLogger(__name__)
|
19
|
+
|
20
|
+
|
21
|
+
class FunctionCallParser:
|
22
|
+
"""
|
23
|
+
Parser for function/tool calls in model outputs.
|
24
|
+
|
25
|
+
This class handles both streaming and non-streaming parsing of function calls using a detector.
|
26
|
+
In streaming scenarios, each time new_text is received, it calls detector.parse_streaming_increment
|
27
|
+
and returns the resulting normal_text and calls to the upper layer (or SSE).
|
28
|
+
"""
|
29
|
+
|
30
|
+
ToolCallParserEnum: Dict[str, Type[BaseFormatDetector]] = {
|
31
|
+
"llama3": Llama32Detector,
|
32
|
+
"qwen25": Qwen25Detector,
|
33
|
+
"mistral": MistralDetector,
|
34
|
+
"deepseekv3": DeepSeekV3Detector,
|
35
|
+
"pythonic": PythonicDetector,
|
36
|
+
}
|
37
|
+
|
38
|
+
def __init__(self, tools: List[Tool], tool_call_parser: str):
    """
    Create a parser bound to the detector registered under ``tool_call_parser``.

    Args:
        tools: The tools made available for this request
        tool_call_parser: Key into ``ToolCallParserEnum`` selecting the detector

    Raises:
        ValueError: If ``tool_call_parser`` has no registered detector
    """
    detector_cls = self.ToolCallParserEnum.get(tool_call_parser)
    if not detector_cls:
        raise ValueError(f"Unsupported tool_call_parser: {tool_call_parser}")

    self.detector = detector_cls()
    self.tools = tools
|
48
|
+
|
49
|
+
def has_tool_call(self, text: str) -> bool:
    """
    Report whether ``text`` contains a tool call in this parser's format.

    The decision is made entirely by the configured detector.

    Args:
        text: The text to check for tool calls

    Returns:
        True if the text contains a tool call, False otherwise
    """
    active_detector = self.detector
    return active_detector.has_tool_call(text)
|
61
|
+
|
62
|
+
def parse_non_stream(self, full_text: str) -> Tuple[str, list[ToolCallItem]]:
    """
    Parse a complete response in one pass and extract its tool calls.

    Args:
        full_text: The complete text to parse

    Returns:
        A tuple containing:
            - The text the detector did not consume (can be treated as normal text);
              the unmodified ``full_text`` when no tool call was found
            - A list of tool calls parsed from the text (empty when none was found)
    """
    outcome = self.detector.detect_and_parse(full_text, self.tools)
    extracted_calls = outcome.calls
    if not extracted_calls:
        # Nothing parsed: hand the caller back exactly what it passed in.
        return full_text, []
    return outcome.normal_text, extracted_calls
|
80
|
+
|
81
|
+
def parse_stream_chunk(self, chunk_text: str) -> Tuple[str, list[ToolCallItem]]:
    """
    Feed one streamed chunk to the detector and return what it produced.

    Args:
        chunk_text: The new chunk of text to parse

    Returns:
        A tuple containing:
            - The normal text that should be displayed to the user
            - A list of tool calls parsed from the chunk
    """
    increment = self.detector.parse_streaming_increment(chunk_text, self.tools)
    visible_text = increment.normal_text
    new_calls = increment.calls

    if new_calls:
        # With calls present the detector's normal_text is passed through as-is.
        return visible_text, list(new_calls)

    # No calls: an empty/None normal_text is reported as "".
    return (visible_text if visible_text else ""), []
|
104
|
+
|
105
|
+
def get_structure_tag(self) -> StructuralTagResponseFormat:
    """
    Build the structural-tag response format covering every available tool.

    Each tool contributes one structure (begin marker, schema, end marker)
    and one trigger string, both obtained from the detector's structure info.
    Non-strict tools get an empty schema, i.e. any content is accepted.
    """
    describe = self.detector.structure_info()
    structures: List[StructuresResponseFormat] = []
    triggers: Set[str] = set()

    for tool in self.tools:
        fn = tool.function
        fn_name = fn.name
        assert fn_name is not None
        markers = describe(fn_name)

        # accept all if not strict, otherwise only accept the schema
        if fn.strict:
            accepted_schema = fn.parameters
        else:
            accepted_schema = {}

        structure = StructuresResponseFormat(
            begin=markers.begin,
            schema=accepted_schema,  # type: ignore
            end=markers.end,
        )
        structures.append(structure)
        triggers.add(markers.trigger)

    return StructuralTagResponseFormat(
        type="structural_tag",
        structures=structures,
        triggers=list(triggers),
    )
|
138
|
+
|
139
|
+
def get_structure_constraint(
    self, tool_choice: Union[ToolChoice, Literal["auto", "required"]]
) -> Optional[Tuple[str, Any]]:
    """
    Pick the output-format constraint that matches the request's tool_choice.

    Args:
        tool_choice: The tool choice setting from the request

    Returns:
        - ("structural_tag", <structural tag>) when tool_choice is "auto", the
          detector is not a PythonicDetector and at least one tool is strict.
        - ("ebnf", <grammar>) when tool_choice is "required" or a ToolChoice
          and get_ebnf produced a grammar.
        - None in every other case.
    """
    # NOTE: structural_tag only supports JSON-compatible content between the begin and end.
    # It cannot parse or validate Python syntax like function calls.
    detector_is_pythonic = isinstance(self.detector, PythonicDetector)
    if (
        not detector_is_pythonic
        and tool_choice == "auto"
        and any(tool.function.strict for tool in self.tools)
    ):
        return ("structural_tag", self.get_structure_tag())

    if tool_choice == "required" or isinstance(tool_choice, ToolChoice):
        grammar = self.get_ebnf(tool_choice)
        if grammar is None:
            return None
        return ("ebnf", grammar)

    # Neither branch applies: no constraint is added.
    return None
|
165
|
+
|
166
|
+
def get_ebnf(
    self, tool_choice: Union[ToolChoice, Literal["required"]]
) -> Optional[str]:
    """
    Get the EBNF grammar for the specified tool choice.

    Args:
        tool_choice: The tool choice specification. A ToolChoice restricts the
            grammar to the tools whose function name equals
            ``tool_choice.function.name``; any other value (e.g. "required")
            uses all of ``self.tools``.

    Returns:
        The result of the detector's ``build_ebnf`` for the selected tools,
        or None if a specific function was requested but no available tool
        has that name.

    Note:
        If a specific function is requested but not found in available tools,
        a warning is logged and None is returned. There is no fallback to the
        full tool list.
    """
    if not isinstance(tool_choice, ToolChoice):
        return self.detector.build_ebnf(self.tools)

    fn_name = tool_choice.function.name
    filtered_tools = [t for t in self.tools if t.function.name == fn_name]
    if filtered_tools:
        return self.detector.build_ebnf(filtered_tools)

    # The requested function does not exist in the available tools.
    available_functions = [t.function.name for t in self.tools]
    logger.warning(
        f"Function '{fn_name}' not found in available tools. "
        f"Available functions: {available_functions}. "
        f"Skipping tool choice."
    )

    # TODO: Return a 400 error instead of warning when adapter supports proper error handling
    # For now, fall back to return None
    return None
|