sglang 0.4.6.post4__py3-none-any.whl → 0.4.7__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- sglang/bench_offline_throughput.py +16 -10
- sglang/bench_one_batch.py +5 -4
- sglang/bench_one_batch_server.py +86 -22
- sglang/bench_serving.py +197 -110
- sglang/compile_deep_gemm.py +4 -4
- sglang/lang/backend/runtime_endpoint.py +24 -1
- sglang/profiler.py +167 -0
- sglang/srt/_custom_ops.py +34 -0
- sglang/srt/configs/internvl.py +8 -12
- sglang/srt/configs/model_config.py +66 -29
- sglang/srt/constrained/base_grammar_backend.py +5 -2
- sglang/srt/constrained/llguidance_backend.py +9 -8
- sglang/srt/constrained/outlines_backend.py +5 -4
- sglang/srt/constrained/xgrammar_backend.py +18 -18
- sglang/srt/conversation.py +47 -9
- sglang/srt/custom_op.py +38 -3
- sglang/srt/debug_utils.py +74 -0
- sglang/srt/disaggregation/common/__init__.py +1 -0
- sglang/srt/disaggregation/common/conn.py +407 -0
- sglang/srt/disaggregation/decode.py +187 -134
- sglang/srt/disaggregation/decode_schedule_batch_mixin.py +142 -0
- sglang/srt/disaggregation/fake/conn.py +4 -13
- sglang/srt/disaggregation/kv_events.py +412 -0
- sglang/srt/disaggregation/launch_lb.py +140 -0
- sglang/srt/disaggregation/mini_lb.py +84 -70
- sglang/srt/disaggregation/mooncake/conn.py +441 -140
- sglang/srt/disaggregation/mooncake/transfer_engine.py +31 -14
- sglang/srt/disaggregation/nixl/conn.py +124 -442
- sglang/srt/disaggregation/prefill.py +128 -44
- sglang/srt/disaggregation/utils.py +154 -6
- sglang/srt/distributed/device_communicators/pymscclpp.py +315 -0
- sglang/srt/distributed/parallel_state.py +52 -5
- sglang/srt/distributed/utils.py +3 -3
- sglang/srt/entrypoints/EngineBase.py +11 -0
- sglang/srt/entrypoints/engine.py +129 -12
- sglang/srt/entrypoints/http_server.py +21 -6
- sglang/srt/entrypoints/http_server_engine.py +5 -2
- sglang/srt/function_call/base_format_detector.py +302 -0
- sglang/srt/function_call/core_types.py +34 -0
- sglang/srt/function_call/deepseekv3_detector.py +205 -0
- sglang/srt/function_call/ebnf_composer.py +248 -0
- sglang/srt/function_call/function_call_parser.py +202 -0
- sglang/srt/function_call/llama32_detector.py +93 -0
- sglang/srt/function_call/mistral_detector.py +131 -0
- sglang/srt/function_call/pythonic_detector.py +229 -0
- sglang/srt/function_call/qwen25_detector.py +121 -0
- sglang/srt/function_call/utils.py +52 -0
- sglang/srt/hf_transformers_utils.py +50 -7
- sglang/srt/layers/attention/aiter_backend.py +878 -0
- sglang/srt/layers/attention/base_attn_backend.py +4 -0
- sglang/srt/layers/attention/cutlass_mla_backend.py +2 -19
- sglang/srt/layers/attention/flashattention_backend.py +166 -35
- sglang/srt/layers/attention/flashinfer_backend.py +45 -1
- sglang/srt/layers/attention/flashinfer_mla_backend.py +45 -5
- sglang/srt/layers/attention/flashmla_backend.py +340 -78
- sglang/srt/layers/attention/intel_amx_backend.py +128 -0
- sglang/srt/layers/attention/tbo_backend.py +232 -0
- sglang/srt/layers/attention/torch_native_backend.py +3 -0
- sglang/srt/layers/attention/triton_backend.py +247 -5
- sglang/srt/layers/attention/triton_ops/extend_attention.py +12 -4
- sglang/srt/layers/attention/utils.py +2 -2
- sglang/srt/layers/attention/vision.py +1 -1
- sglang/srt/layers/communicator.py +517 -0
- sglang/srt/layers/dp_attention.py +6 -15
- sglang/srt/layers/layernorm.py +30 -19
- sglang/srt/layers/moe/cutlass_moe.py +370 -0
- sglang/srt/layers/moe/cutlass_moe_params.py +169 -0
- sglang/srt/layers/moe/ep_moe/kernels.py +60 -17
- sglang/srt/layers/moe/ep_moe/layer.py +195 -87
- sglang/srt/layers/moe/ep_moe/token_dispatcher.py +88 -8
- sglang/srt/layers/moe/fused_moe_native.py +4 -0
- sglang/srt/layers/moe/fused_moe_triton/configs/triton_3_2_0/E=257,N=128,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8,block_shape=[128, 128].json +146 -0
- sglang/srt/layers/moe/fused_moe_triton/configs/triton_3_2_0/E=257,N=128,device_name=NVIDIA_H20,dtype=fp8_w8a8,block_shape=[128, 128].json +146 -0
- sglang/srt/layers/moe/fused_moe_triton/configs/triton_3_2_0/E=257,N=256,device_name=NVIDIA_B200,dtype=fp8_w8a8,block_shape=[128, 128].json +146 -0
- sglang/srt/layers/moe/fused_moe_triton/configs/triton_3_2_0/E=257,N=256,device_name=NVIDIA_H20,dtype=fp8_w8a8,block_shape=[128, 128].json +146 -0
- sglang/srt/layers/moe/fused_moe_triton/configs/triton_3_2_0/E=257,N=256,device_name=NVIDIA_H200,dtype=fp8_w8a8,block_shape=[128, 128].json +146 -0
- sglang/srt/layers/moe/fused_moe_triton/configs/triton_3_2_0/E=8,N=7168,device_name=NVIDIA_H100_80GB_HBM3.json +146 -0
- sglang/srt/layers/moe/fused_moe_triton/configs/triton_3_3_1/E=257,N=256,device_name=NVIDIA_B200,dtype=fp8_w8a8,block_shape=[128, 128].json +146 -0
- sglang/srt/layers/moe/fused_moe_triton/configs/triton_3_3_1/E=8,N=7168,device_name=NVIDIA_H100_80GB_HBM3.json +146 -0
- sglang/srt/layers/moe/fused_moe_triton/fused_moe.py +220 -25
- sglang/srt/layers/moe/fused_moe_triton/layer.py +48 -4
- sglang/srt/layers/moe/topk.py +107 -24
- sglang/srt/layers/multimodal.py +70 -0
- sglang/srt/layers/quantization/__init__.py +10 -4
- sglang/srt/layers/quantization/blockwise_int8.py +3 -0
- sglang/srt/layers/quantization/compressed_tensors/compressed_tensors_moe.py +5 -0
- sglang/srt/layers/quantization/deep_gemm.py +60 -59
- sglang/srt/layers/quantization/fp8.py +113 -18
- sglang/srt/layers/quantization/fp8_kernel.py +118 -66
- sglang/srt/layers/quantization/fp8_utils.py +165 -43
- sglang/srt/layers/quantization/gptq.py +298 -6
- sglang/srt/layers/quantization/int8_kernel.py +18 -5
- sglang/srt/layers/quantization/modelopt_quant.py +334 -7
- sglang/srt/layers/quantization/moe_wna16.py +3 -0
- sglang/srt/layers/quantization/qoq.py +244 -0
- sglang/srt/layers/quantization/w8a8_fp8.py +3 -0
- sglang/srt/layers/quantization/w8a8_int8.py +3 -0
- sglang/srt/layers/rotary_embedding.py +6 -12
- sglang/srt/layers/sampler.py +80 -79
- sglang/srt/layers/utils.py +6 -0
- sglang/srt/lora/layers.py +12 -15
- sglang/srt/lora/lora.py +49 -5
- sglang/srt/lora/lora_manager.py +20 -8
- sglang/srt/lora/mem_pool.py +24 -16
- sglang/srt/lora/utils.py +17 -13
- sglang/srt/managers/data_parallel_controller.py +13 -5
- sglang/srt/managers/eplb_algorithms/__init__.py +63 -0
- sglang/srt/managers/eplb_algorithms/deepseek.py +223 -0
- sglang/srt/managers/eplb_algorithms/deepseek_vec.py +276 -0
- sglang/srt/managers/eplb_manager.py +96 -0
- sglang/srt/managers/expert_distribution.py +878 -56
- sglang/srt/managers/expert_location.py +448 -0
- sglang/srt/managers/expert_location_dispatch.py +108 -0
- sglang/srt/managers/io_struct.py +29 -5
- sglang/srt/managers/mm_utils.py +355 -151
- sglang/srt/managers/multimodal_processors/base_processor.py +299 -42
- sglang/srt/managers/multimodal_processors/deepseek_vl_v2.py +6 -1
- sglang/srt/managers/multimodal_processors/gemma3.py +15 -17
- sglang/srt/managers/multimodal_processors/internvl.py +18 -5
- sglang/srt/managers/multimodal_processors/janus_pro.py +7 -1
- sglang/srt/managers/multimodal_processors/kimi_vl.py +14 -32
- sglang/srt/managers/multimodal_processors/llava.py +3 -3
- sglang/srt/managers/multimodal_processors/minicpm.py +27 -32
- sglang/srt/managers/multimodal_processors/mllama4.py +6 -0
- sglang/srt/managers/multimodal_processors/phi4mm.py +87 -0
- sglang/srt/managers/multimodal_processors/pixtral.py +9 -9
- sglang/srt/managers/multimodal_processors/qwen_vl.py +35 -35
- sglang/srt/managers/schedule_batch.py +185 -55
- sglang/srt/managers/schedule_policy.py +4 -5
- sglang/srt/managers/scheduler.py +389 -154
- sglang/srt/managers/session_controller.py +1 -1
- sglang/srt/managers/tokenizer_manager.py +231 -39
- sglang/srt/managers/utils.py +0 -4
- sglang/srt/mem_cache/base_prefix_cache.py +3 -0
- sglang/srt/mem_cache/chunk_cache.py +3 -1
- sglang/srt/mem_cache/hiradix_cache.py +4 -4
- sglang/srt/mem_cache/memory_pool.py +74 -52
- sglang/srt/mem_cache/multimodal_cache.py +45 -0
- sglang/srt/mem_cache/radix_cache.py +58 -5
- sglang/srt/metrics/collector.py +11 -2
- sglang/srt/mm_utils.py +10 -0
- sglang/srt/model_executor/cuda_graph_runner.py +87 -65
- sglang/srt/model_executor/expert_location_updater.py +557 -0
- sglang/srt/model_executor/forward_batch_info.py +39 -14
- sglang/srt/model_executor/model_runner.py +231 -101
- sglang/srt/model_loader/loader.py +10 -6
- sglang/srt/model_loader/utils.py +67 -1
- sglang/srt/models/clip.py +5 -1
- sglang/srt/models/deepseek_nextn.py +1 -1
- sglang/srt/models/deepseek_v2.py +732 -403
- sglang/srt/models/exaone.py +8 -3
- sglang/srt/models/gemma3_causal.py +7 -0
- sglang/srt/models/gemma3_mm.py +75 -33
- sglang/srt/models/idefics2.py +342 -0
- sglang/srt/models/kimi_vl.py +4 -4
- sglang/srt/models/llama.py +1 -1
- sglang/srt/models/llama4.py +10 -2
- sglang/srt/models/llava.py +26 -18
- sglang/srt/models/mimo_mtp.py +220 -0
- sglang/srt/models/minicpmo.py +7 -17
- sglang/srt/models/minicpmv.py +3 -295
- sglang/srt/models/mistral.py +71 -1
- sglang/srt/models/mllama.py +3 -3
- sglang/srt/models/phi4mm.py +512 -0
- sglang/srt/models/qwen2.py +133 -35
- sglang/srt/models/qwen2_5_vl.py +5 -3
- sglang/srt/models/qwen2_eagle.py +4 -1
- sglang/srt/models/qwen2_moe.py +206 -69
- sglang/srt/models/qwen2_vl.py +3 -3
- sglang/srt/models/qwen3.py +92 -19
- sglang/srt/models/qwen3_moe.py +457 -55
- sglang/srt/models/registry.py +9 -1
- sglang/srt/models/siglip.py +294 -0
- sglang/srt/models/transformers.py +291 -0
- sglang/srt/openai_api/adapter.py +114 -40
- sglang/srt/openai_api/protocol.py +37 -2
- sglang/srt/openai_api/utils.py +172 -0
- sglang/srt/operations.py +189 -0
- sglang/srt/operations_strategy.py +207 -0
- sglang/srt/sampling/sampling_batch_info.py +13 -1
- sglang/srt/sampling/sampling_params.py +2 -1
- sglang/srt/server_args.py +235 -38
- sglang/srt/speculative/build_eagle_tree.py +8 -8
- sglang/srt/speculative/eagle_draft_cuda_graph_runner.py +8 -11
- sglang/srt/speculative/eagle_draft_extend_cuda_graph_runner.py +253 -0
- sglang/srt/speculative/eagle_utils.py +181 -90
- sglang/srt/speculative/eagle_worker.py +146 -21
- sglang/srt/two_batch_overlap.py +635 -0
- sglang/srt/utils.py +197 -19
- sglang/test/runners.py +16 -7
- sglang/test/send_one.py +4 -0
- sglang/test/test_cutlass_moe.py +278 -0
- sglang/test/test_fp4_moe.py +248 -0
- sglang/test/test_utils.py +81 -42
- sglang/utils.py +2 -2
- sglang/version.py +1 -1
- {sglang-0.4.6.post4.dist-info → sglang-0.4.7.dist-info}/METADATA +31 -19
- sglang-0.4.7.dist-info/RECORD +699 -0
- {sglang-0.4.6.post4.dist-info → sglang-0.4.7.dist-info}/WHEEL +1 -1
- sglang/srt/function_call_parser.py +0 -858
- sglang/srt/platforms/interface.py +0 -371
- sglang-0.4.6.post4.dist-info/RECORD +0 -646
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=1,N=14336,device_name=NVIDIA_A100-SXM4-80GB,dtype=int8_w8a16.json → triton_3_1_0/E=1,N=14336,device_name=NVIDIA_A100-SXM4-80GB,dtype=int8_w8a16.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=1,N=14336,device_name=NVIDIA_A100-SXM4-80GB.json → triton_3_1_0/E=1,N=14336,device_name=NVIDIA_A100-SXM4-80GB.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=1,N=1792,device_name=NVIDIA_A100-SXM4-80GB,dtype=int8_w8a16.json → triton_3_1_0/E=1,N=1792,device_name=NVIDIA_A100-SXM4-80GB,dtype=int8_w8a16.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=1,N=1792,device_name=NVIDIA_A100-SXM4-80GB.json → triton_3_1_0/E=1,N=1792,device_name=NVIDIA_A100-SXM4-80GB.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=1,N=3072,device_name=NVIDIA_A100-SXM4-80GB,dtype=int8_w8a16.json → triton_3_1_0/E=1,N=3072,device_name=NVIDIA_A100-SXM4-80GB,dtype=int8_w8a16.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=1,N=3072,device_name=NVIDIA_H100_80GB_HBM3,dtype=int8_w8a16.json → triton_3_1_0/E=1,N=3072,device_name=NVIDIA_H100_80GB_HBM3,dtype=int8_w8a16.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=1,N=3072,device_name=NVIDIA_H100_80GB_HBM3.json → triton_3_1_0/E=1,N=3072,device_name=NVIDIA_H100_80GB_HBM3.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=1,N=3584,device_name=NVIDIA_A100-SXM4-80GB,dtype=int8_w8a16.json → triton_3_1_0/E=1,N=3584,device_name=NVIDIA_A100-SXM4-80GB,dtype=int8_w8a16.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=1,N=3584,device_name=NVIDIA_A100-SXM4-80GB.json → triton_3_1_0/E=1,N=3584,device_name=NVIDIA_A100-SXM4-80GB.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=1,N=7168,device_name=NVIDIA_A100-SXM4-80GB,dtype=int8_w8a16.json → triton_3_1_0/E=1,N=7168,device_name=NVIDIA_A100-SXM4-80GB,dtype=int8_w8a16.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=1,N=7168,device_name=NVIDIA_A100-SXM4-80GB.json → triton_3_1_0/E=1,N=7168,device_name=NVIDIA_A100-SXM4-80GB.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=144,N=512,device_name=NVIDIA_H100_80GB_HBM3.json → triton_3_1_0/E=144,N=512,device_name=NVIDIA_H100_80GB_HBM3.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=16,N=1024,device_name=NVIDIA_H100_80GB_HBM3.json → triton_3_1_0/E=16,N=1024,device_name=NVIDIA_H100_80GB_HBM3.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=16,N=1024,device_name=NVIDIA_H200.json → triton_3_1_0/E=16,N=1024,device_name=NVIDIA_H200.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=16,N=1344,device_name=NVIDIA_A100-SXM4-40GB.json → triton_3_1_0/E=16,N=1344,device_name=NVIDIA_A100-SXM4-40GB.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=16,N=1344,device_name=NVIDIA_A100-SXM4-80GB.json → triton_3_1_0/E=16,N=1344,device_name=NVIDIA_A100-SXM4-80GB.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=16,N=1344,device_name=NVIDIA_H100_80GB_HBM3.json → triton_3_1_0/E=16,N=1344,device_name=NVIDIA_H100_80GB_HBM3.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=16,N=14336,device_name=NVIDIA_A100-SXM4-80GB,dtype=int8_w8a16.json → triton_3_1_0/E=16,N=14336,device_name=NVIDIA_A100-SXM4-80GB,dtype=int8_w8a16.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=16,N=14336,device_name=NVIDIA_A100-SXM4-80GB.json → triton_3_1_0/E=16,N=14336,device_name=NVIDIA_A100-SXM4-80GB.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=16,N=1792,device_name=NVIDIA_A100-SXM4-80GB,dtype=int8_w8a16.json → triton_3_1_0/E=16,N=1792,device_name=NVIDIA_A100-SXM4-80GB,dtype=int8_w8a16.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=16,N=1792,device_name=NVIDIA_A100-SXM4-80GB.json → triton_3_1_0/E=16,N=1792,device_name=NVIDIA_A100-SXM4-80GB.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=16,N=2048,device_name=NVIDIA_H100_80GB_HBM3.json → triton_3_1_0/E=16,N=2048,device_name=NVIDIA_H100_80GB_HBM3.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=16,N=2688,device_name=NVIDIA_A100-SXM4-80GB.json → triton_3_1_0/E=16,N=2688,device_name=NVIDIA_A100-SXM4-80GB.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=16,N=2688,device_name=NVIDIA_H100_80GB_HBM3.json → triton_3_1_0/E=16,N=2688,device_name=NVIDIA_H100_80GB_HBM3.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=16,N=3072,device_name=NVIDIA_A100-SXM4-80GB,dtype=int8_w8a16.json → triton_3_1_0/E=16,N=3072,device_name=NVIDIA_A100-SXM4-80GB,dtype=int8_w8a16.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=16,N=3072,device_name=NVIDIA_H100_80GB_HBM3,dtype=int8_w8a16.json → triton_3_1_0/E=16,N=3072,device_name=NVIDIA_H100_80GB_HBM3,dtype=int8_w8a16.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=16,N=3200,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8.json → triton_3_1_0/E=16,N=3200,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=16,N=3584,device_name=NVIDIA_A100-SXM4-80GB,dtype=int8_w8a16.json → triton_3_1_0/E=16,N=3584,device_name=NVIDIA_A100-SXM4-80GB,dtype=int8_w8a16.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=16,N=3584,device_name=NVIDIA_A100-SXM4-80GB.json → triton_3_1_0/E=16,N=3584,device_name=NVIDIA_A100-SXM4-80GB.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=16,N=6400,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8.json → triton_3_1_0/E=16,N=6400,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=16,N=7168,device_name=NVIDIA_A100-SXM4-80GB,dtype=int8_w8a16.json → triton_3_1_0/E=16,N=7168,device_name=NVIDIA_A100-SXM4-80GB,dtype=int8_w8a16.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=16,N=7168,device_name=NVIDIA_A100-SXM4-80GB.json → triton_3_1_0/E=16,N=7168,device_name=NVIDIA_A100-SXM4-80GB.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=16,N=7168,device_name=NVIDIA_H100_80GB_HBM3,dtype=int8_w8a16.json → triton_3_1_0/E=16,N=7168,device_name=NVIDIA_H100_80GB_HBM3,dtype=int8_w8a16.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=16,N=800,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8.json → triton_3_1_0/E=16,N=800,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=160,N=192,device_name=NVIDIA_A800-SXM4-80GB.json → triton_3_1_0/E=160,N=192,device_name=NVIDIA_A800-SXM4-80GB.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=20,N=2048,device_name=NVIDIA_H100_80GB_HBM3.json → triton_3_1_0/E=20,N=2048,device_name=NVIDIA_H100_80GB_HBM3.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=24,N=1024,device_name=NVIDIA_H100_80GB_HBM3.json → triton_3_1_0/E=24,N=1024,device_name=NVIDIA_H100_80GB_HBM3.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=256,N=128,device_name=NVIDIA_A100-SXM4-80GB,dtype=int8_w8a8,block_shape=[128, 128].json → triton_3_1_0/E=256,N=128,device_name=NVIDIA_A100-SXM4-80GB,dtype=int8_w8a8,block_shape=[128, 128].json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=256,N=128,device_name=NVIDIA_A100-SXM4-80GB,dtype=int8_w8a8.json → triton_3_1_0/E=256,N=128,device_name=NVIDIA_A100-SXM4-80GB,dtype=int8_w8a8.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=256,N=128,device_name=NVIDIA_A800-SXM4-80GB,dtype=int8_w8a8,block_shape=[128, 128].json → triton_3_1_0/E=256,N=128,device_name=NVIDIA_A800-SXM4-80GB,dtype=int8_w8a8,block_shape=[128, 128].json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=256,N=128,device_name=NVIDIA_A800-SXM4-80GB,dtype=int8_w8a8.json → triton_3_1_0/E=256,N=128,device_name=NVIDIA_A800-SXM4-80GB,dtype=int8_w8a8.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=256,N=128,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8,block_shape=[128, 128].json → triton_3_1_0/E=256,N=128,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8,block_shape=[128, 128].json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=256,N=128,device_name=NVIDIA_H20,block_shape=[128, 128].json → triton_3_1_0/E=256,N=128,device_name=NVIDIA_H20,block_shape=[128, 128].json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=256,N=128,device_name=NVIDIA_H20,dtype=fp8_w8a8,block_shape=[128, 128].json → triton_3_1_0/E=256,N=128,device_name=NVIDIA_H20,dtype=fp8_w8a8,block_shape=[128, 128].json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=256,N=128,device_name=NVIDIA_L20Y,dtype=fp8_w8a8,block_shape=[128, 128].json → triton_3_1_0/E=256,N=128,device_name=NVIDIA_L20Y,dtype=fp8_w8a8,block_shape=[128, 128].json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=256,N=256,device_name=AMD_Instinct_MI300X,dtype=fp8_w8a8,block_shape=[128, 128].json → triton_3_1_0/E=256,N=256,device_name=AMD_Instinct_MI300X,dtype=fp8_w8a8,block_shape=[128, 128].json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=256,N=256,device_name=AMD_Instinct_MI325X,dtype=fp8_w8a8,block_shape=[128, 128].json → triton_3_1_0/E=256,N=256,device_name=AMD_Instinct_MI325X,dtype=fp8_w8a8,block_shape=[128, 128].json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=256,N=256,device_name=AMD_Radeon_Graphics,dtype=fp8_w8a8,block_shape=[128, 128].json → triton_3_1_0/E=256,N=256,device_name=AMD_Radeon_Graphics,dtype=fp8_w8a8,block_shape=[128, 128].json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=256,N=256,device_name=NVIDIA_B200,dtype=fp8_w8a8,block_shape=[128, 128].json → triton_3_1_0/E=256,N=256,device_name=NVIDIA_B200,dtype=fp8_w8a8,block_shape=[128, 128].json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=256,N=256,device_name=NVIDIA_H20,dtype=int8_w8a8,block_shape=[128, 128].json → triton_3_1_0/E=256,N=256,device_name=NVIDIA_H20,dtype=int8_w8a8,block_shape=[128, 128].json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=256,N=256,device_name=NVIDIA_H200,dtype=fp8_w8a8,block_shape=[128, 128].json → triton_3_1_0/E=256,N=256,device_name=NVIDIA_H200,dtype=fp8_w8a8,block_shape=[128, 128].json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=256,N=64,device_name=NVIDIA_A800-SXM4-80GB.json → triton_3_1_0/E=256,N=64,device_name=NVIDIA_A800-SXM4-80GB.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=256,N=64,device_name=NVIDIA_L20,dtype=int8_w8a8.json → triton_3_1_0/E=256,N=64,device_name=NVIDIA_L20,dtype=int8_w8a8.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=256,N=64,device_name=NVIDIA_L40S,dtype=int8_w8a8.json → triton_3_1_0/E=256,N=64,device_name=NVIDIA_L40S,dtype=int8_w8a8.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=64,N=1024,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8,block_shape=[128, 128].json → triton_3_1_0/E=64,N=1024,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8,block_shape=[128, 128].json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=64,N=1280,device_name=NVIDIA_A100-SXM4-80GB.json → triton_3_1_0/E=64,N=1280,device_name=NVIDIA_A100-SXM4-80GB.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=64,N=1280,device_name=NVIDIA_A800-SXM4-80GB.json → triton_3_1_0/E=64,N=1280,device_name=NVIDIA_A800-SXM4-80GB.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=64,N=1280,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8.json → triton_3_1_0/E=64,N=1280,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=64,N=1280,device_name=NVIDIA_H100_80GB_HBM3.json → triton_3_1_0/E=64,N=1280,device_name=NVIDIA_H100_80GB_HBM3.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=64,N=1280,device_name=NVIDIA_H200,dtype=fp8_w8a8.json → triton_3_1_0/E=64,N=1280,device_name=NVIDIA_H200,dtype=fp8_w8a8.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=64,N=1280,device_name=NVIDIA_H200.json → triton_3_1_0/E=64,N=1280,device_name=NVIDIA_H200.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=64,N=2560,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8.json → triton_3_1_0/E=64,N=2560,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=64,N=2560,device_name=NVIDIA_H200,dtype=fp8_w8a8.json → triton_3_1_0/E=64,N=2560,device_name=NVIDIA_H200,dtype=fp8_w8a8.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=64,N=2560,device_name=NVIDIA_H200.json → triton_3_1_0/E=64,N=2560,device_name=NVIDIA_H200.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=64,N=320,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8.json → triton_3_1_0/E=64,N=320,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=64,N=320,device_name=NVIDIA_H100_80GB_HBM3.json → triton_3_1_0/E=64,N=320,device_name=NVIDIA_H100_80GB_HBM3.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=64,N=320,device_name=NVIDIA_H200,dtype=fp8_w8a8.json → triton_3_1_0/E=64,N=320,device_name=NVIDIA_H200,dtype=fp8_w8a8.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=64,N=320,device_name=NVIDIA_H200.json → triton_3_1_0/E=64,N=320,device_name=NVIDIA_H200.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=64,N=512,device_name=NVIDIA_H200,dtype=fp8_w8a8,block_shape=[128, 128].json → triton_3_1_0/E=64,N=512,device_name=NVIDIA_H200,dtype=fp8_w8a8,block_shape=[128, 128].json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=64,N=640,device_name=NVIDIA_A100-SXM4-80GB.json → triton_3_1_0/E=64,N=640,device_name=NVIDIA_A100-SXM4-80GB.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=64,N=640,device_name=NVIDIA_A800-SXM4-80GB.json → triton_3_1_0/E=64,N=640,device_name=NVIDIA_A800-SXM4-80GB.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=64,N=640,device_name=NVIDIA_GeForce_RTX_4090,dtype=fp8_w8a8.json → triton_3_1_0/E=64,N=640,device_name=NVIDIA_GeForce_RTX_4090,dtype=fp8_w8a8.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=64,N=640,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8.json → triton_3_1_0/E=64,N=640,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=64,N=640,device_name=NVIDIA_H100_80GB_HBM3.json → triton_3_1_0/E=64,N=640,device_name=NVIDIA_H100_80GB_HBM3.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=64,N=640,device_name=NVIDIA_H200,dtype=fp8_w8a8.json → triton_3_1_0/E=64,N=640,device_name=NVIDIA_H200,dtype=fp8_w8a8.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=64,N=640,device_name=NVIDIA_H200.json → triton_3_1_0/E=64,N=640,device_name=NVIDIA_H200.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=8,N=14336,device_name=AMD_Instinct_MI300X.json → triton_3_1_0/E=8,N=14336,device_name=AMD_Instinct_MI300X.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=8,N=14336,device_name=AMD_Instinct_MI325X.json → triton_3_1_0/E=8,N=14336,device_name=AMD_Instinct_MI325X.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=8,N=14336,device_name=AMD_Radeon_Graphics.json → triton_3_1_0/E=8,N=14336,device_name=AMD_Radeon_Graphics.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=8,N=14336,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8.json → triton_3_1_0/E=8,N=14336,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=8,N=14336,device_name=NVIDIA_H200,dtype=fp8_w8a8.json → triton_3_1_0/E=8,N=14336,device_name=NVIDIA_H200,dtype=fp8_w8a8.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=8,N=14336,device_name=NVIDIA_H200.json → triton_3_1_0/E=8,N=14336,device_name=NVIDIA_H200.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=8,N=1792,device_name=AMD_Instinct_MI300X.json → triton_3_1_0/E=8,N=1792,device_name=AMD_Instinct_MI300X.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=8,N=1792,device_name=AMD_Instinct_MI325X.json → triton_3_1_0/E=8,N=1792,device_name=AMD_Instinct_MI325X.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=8,N=1792,device_name=AMD_Radeon_Graphics.json → triton_3_1_0/E=8,N=1792,device_name=AMD_Radeon_Graphics.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=8,N=1792,device_name=NVIDIA_A100-SXM4-40GB.json → triton_3_1_0/E=8,N=1792,device_name=NVIDIA_A100-SXM4-40GB.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=8,N=1792,device_name=NVIDIA_A100-SXM4-80GB.json → triton_3_1_0/E=8,N=1792,device_name=NVIDIA_A100-SXM4-80GB.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=8,N=1792,device_name=NVIDIA_H100_80GB_HBM3.json → triton_3_1_0/E=8,N=1792,device_name=NVIDIA_H100_80GB_HBM3.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=8,N=1792,device_name=NVIDIA_H200,dtype=fp8_w8a8.json → triton_3_1_0/E=8,N=1792,device_name=NVIDIA_H200,dtype=fp8_w8a8.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=8,N=1792,device_name=NVIDIA_H200.json → triton_3_1_0/E=8,N=1792,device_name=NVIDIA_H200.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=8,N=2048,device_name=NVIDIA_A100-SXM4-80GB.json → triton_3_1_0/E=8,N=2048,device_name=NVIDIA_A100-SXM4-80GB.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=8,N=2048,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8.json → triton_3_1_0/E=8,N=2048,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=8,N=2048,device_name=NVIDIA_H100_80GB_HBM3.json → triton_3_1_0/E=8,N=2048,device_name=NVIDIA_H100_80GB_HBM3.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=8,N=2048,device_name=NVIDIA_H200,dtype=fp8_w8a8.json → triton_3_1_0/E=8,N=2048,device_name=NVIDIA_H200,dtype=fp8_w8a8.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=8,N=2048,device_name=NVIDIA_H200.json → triton_3_1_0/E=8,N=2048,device_name=NVIDIA_H200.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=8,N=3584,device_name=AMD_Instinct_MI300X.json → triton_3_1_0/E=8,N=3584,device_name=AMD_Instinct_MI300X.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=8,N=3584,device_name=AMD_Instinct_MI325X.json → triton_3_1_0/E=8,N=3584,device_name=AMD_Instinct_MI325X.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=8,N=3584,device_name=AMD_Radeon_Graphics.json → triton_3_1_0/E=8,N=3584,device_name=AMD_Radeon_Graphics.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=8,N=3584,device_name=NVIDIA_A100-SXM4-40GB.json → triton_3_1_0/E=8,N=3584,device_name=NVIDIA_A100-SXM4-40GB.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=8,N=3584,device_name=NVIDIA_A100-SXM4-80GB.json → triton_3_1_0/E=8,N=3584,device_name=NVIDIA_A100-SXM4-80GB.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=8,N=3584,device_name=NVIDIA_GeForce_RTX_4090,dtype=fp8_w8a8.json → triton_3_1_0/E=8,N=3584,device_name=NVIDIA_GeForce_RTX_4090,dtype=fp8_w8a8.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=8,N=3584,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8.json → triton_3_1_0/E=8,N=3584,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=8,N=3584,device_name=NVIDIA_H100_80GB_HBM3.json → triton_3_1_0/E=8,N=3584,device_name=NVIDIA_H100_80GB_HBM3.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=8,N=3584,device_name=NVIDIA_H200,dtype=fp8_w8a8.json → triton_3_1_0/E=8,N=3584,device_name=NVIDIA_H200,dtype=fp8_w8a8.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=8,N=3584,device_name=NVIDIA_H200.json → triton_3_1_0/E=8,N=3584,device_name=NVIDIA_H200.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=8,N=3584,device_name=NVIDIA_L40S.json → triton_3_1_0/E=8,N=3584,device_name=NVIDIA_L40S.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=8,N=4096,device_name=AMD_Instinct_MI300X,dtype=fp8_w8a8.json → triton_3_1_0/E=8,N=4096,device_name=AMD_Instinct_MI300X,dtype=fp8_w8a8.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=8,N=4096,device_name=AMD_Instinct_MI325X,dtype=fp8_w8a8.json → triton_3_1_0/E=8,N=4096,device_name=AMD_Instinct_MI325X,dtype=fp8_w8a8.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=8,N=4096,device_name=AMD_Radeon_Graphics,dtype=fp8_w8a8.json → triton_3_1_0/E=8,N=4096,device_name=AMD_Radeon_Graphics,dtype=fp8_w8a8.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=8,N=4096,device_name=NVIDIA_A100-SXM4-80GB.json → triton_3_1_0/E=8,N=4096,device_name=NVIDIA_A100-SXM4-80GB.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=8,N=4096,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8.json → triton_3_1_0/E=8,N=4096,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=8,N=4096,device_name=NVIDIA_H100_80GB_HBM3.json → triton_3_1_0/E=8,N=4096,device_name=NVIDIA_H100_80GB_HBM3.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=8,N=4096,device_name=NVIDIA_H200,dtype=fp8_w8a8.json → triton_3_1_0/E=8,N=4096,device_name=NVIDIA_H200,dtype=fp8_w8a8.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=8,N=4096,device_name=NVIDIA_H200.json → triton_3_1_0/E=8,N=4096,device_name=NVIDIA_H200.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=8,N=7168,device_name=AMD_Instinct_MI300X.json → triton_3_1_0/E=8,N=7168,device_name=AMD_Instinct_MI300X.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=8,N=7168,device_name=AMD_Instinct_MI325X.json → triton_3_1_0/E=8,N=7168,device_name=AMD_Instinct_MI325X.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=8,N=7168,device_name=AMD_Radeon_Graphics.json → triton_3_1_0/E=8,N=7168,device_name=AMD_Radeon_Graphics.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=8,N=7168,device_name=NVIDIA_A100-SXM4-80GB.json → triton_3_1_0/E=8,N=7168,device_name=NVIDIA_A100-SXM4-80GB.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=8,N=7168,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8.json → triton_3_1_0/E=8,N=7168,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=8,N=7168,device_name=NVIDIA_H100_80GB_HBM3.json → triton_3_1_0/E=8,N=7168,device_name=NVIDIA_H100_80GB_HBM3.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=8,N=7168,device_name=NVIDIA_H200,dtype=fp8_w8a8.json → triton_3_1_0/E=8,N=7168,device_name=NVIDIA_H200,dtype=fp8_w8a8.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=8,N=7168,device_name=NVIDIA_H200.json → triton_3_1_0/E=8,N=7168,device_name=NVIDIA_H200.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=8,N=8192,device_name=AMD_Instinct_MI300X,dtype=fp8_w8a8.json → triton_3_1_0/E=8,N=8192,device_name=AMD_Instinct_MI300X,dtype=fp8_w8a8.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=8,N=8192,device_name=AMD_Instinct_MI325X,dtype=fp8_w8a8.json → triton_3_1_0/E=8,N=8192,device_name=AMD_Instinct_MI325X,dtype=fp8_w8a8.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=8,N=8192,device_name=AMD_Radeon_Graphics,dtype=fp8_w8a8.json → triton_3_1_0/E=8,N=8192,device_name=AMD_Radeon_Graphics,dtype=fp8_w8a8.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=8,N=8192,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8.json → triton_3_1_0/E=8,N=8192,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=8,N=8192,device_name=NVIDIA_H200,dtype=fp8_w8a8.json → triton_3_1_0/E=8,N=8192,device_name=NVIDIA_H200,dtype=fp8_w8a8.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=128,N=192,device_name=NVIDIA_A800-SXM4-80GB.json → triton_3_2_0/E=128,N=192,device_name=NVIDIA_A800-SXM4-80GB.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=128,N=192,device_name=NVIDIA_H100_80GB_HBM3.json → triton_3_2_0/E=128,N=192,device_name=NVIDIA_H100_80GB_HBM3.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=128,N=192,device_name=NVIDIA_H20.json → triton_3_2_0/E=128,N=192,device_name=NVIDIA_H20.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=128,N=192,device_name=NVIDIA_H200.json → triton_3_2_0/E=128,N=192,device_name=NVIDIA_H200.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=128,N=384,device_name=NVIDIA_H100_80GB_HBM3.json → triton_3_2_0/E=128,N=384,device_name=NVIDIA_H100_80GB_HBM3.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=128,N=384,device_name=NVIDIA_H20,dtype=fp8_w8a8,block_shape=[128, 128].json → triton_3_2_0/E=128,N=384,device_name=NVIDIA_H20,dtype=fp8_w8a8,block_shape=[128, 128].json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=128,N=384,device_name=NVIDIA_H20.json → triton_3_2_0/E=128,N=384,device_name=NVIDIA_H20.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=128,N=384,device_name=NVIDIA_H200,dtype=fp8_w8a8,block_shape=[128, 128].json → triton_3_2_0/E=128,N=384,device_name=NVIDIA_H200,dtype=fp8_w8a8,block_shape=[128, 128].json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=128,N=384,device_name=NVIDIA_H200.json → triton_3_2_0/E=128,N=384,device_name=NVIDIA_H200.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=128,N=512,device_name=NVIDIA_H100_80GB_HBM3.json → triton_3_2_0/E=128,N=512,device_name=NVIDIA_H100_80GB_HBM3.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=128,N=768,device_name=NVIDIA_A800-SXM4-80GB.json → triton_3_2_0/E=128,N=768,device_name=NVIDIA_A800-SXM4-80GB.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=128,N=768,device_name=NVIDIA_H100_80GB_HBM3.json → triton_3_2_0/E=128,N=768,device_name=NVIDIA_H100_80GB_HBM3.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=128,N=768,device_name=NVIDIA_H20,dtype=fp8_w8a8,block_shape=[128, 128].json → triton_3_2_0/E=128,N=768,device_name=NVIDIA_H20,dtype=fp8_w8a8,block_shape=[128, 128].json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=128,N=768,device_name=NVIDIA_H20.json → triton_3_2_0/E=128,N=768,device_name=NVIDIA_H20.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=128,N=768,device_name=NVIDIA_H200,dtype=fp8_w8a8,block_shape=[128, 128].json → triton_3_2_0/E=128,N=768,device_name=NVIDIA_H200,dtype=fp8_w8a8,block_shape=[128, 128].json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=128,N=768,device_name=NVIDIA_H200.json → triton_3_2_0/E=128,N=768,device_name=NVIDIA_H200.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=128,N=96,device_name=NVIDIA_H20.json → triton_3_2_0/E=128,N=96,device_name=NVIDIA_H20.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=264,N=128,device_name=NVIDIA_A800-SXM4-80GB,dtype=int8_w8a8.json → triton_3_2_0/E=264,N=128,device_name=NVIDIA_A800-SXM4-80GB,dtype=int8_w8a8.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=264,N=256,device_name=NVIDIA_B200,dtype=fp8_w8a8,block_shape=[128, 128].json → triton_3_2_0/E=264,N=256,device_name=NVIDIA_B200,dtype=fp8_w8a8,block_shape=[128, 128].json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=264,N=256,device_name=NVIDIA_H20,dtype=fp8_w8a8,block_shape=[128, 128].json → triton_3_2_0/E=264,N=256,device_name=NVIDIA_H20,dtype=fp8_w8a8,block_shape=[128, 128].json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=264,N=256,device_name=NVIDIA_H200,dtype=fp8_w8a8,block_shape=[128, 128].json → triton_3_2_0/E=264,N=256,device_name=NVIDIA_H200,dtype=fp8_w8a8,block_shape=[128, 128].json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=272,N=128,device_name=NVIDIA_A800-SXM4-80GB,dtype=int8_w8a8.json → triton_3_2_0/E=272,N=128,device_name=NVIDIA_A800-SXM4-80GB,dtype=int8_w8a8.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=272,N=128,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8,block_shape=[128, 128].json → triton_3_2_0/E=272,N=128,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8,block_shape=[128, 128].json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=272,N=128,device_name=NVIDIA_H20,dtype=fp8_w8a8,block_shape=[128, 128].json → triton_3_2_0/E=272,N=128,device_name=NVIDIA_H20,dtype=fp8_w8a8,block_shape=[128, 128].json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=272,N=64,device_name=NVIDIA_A800-SXM4-80GB.json → triton_3_2_0/E=272,N=64,device_name=NVIDIA_A800-SXM4-80GB.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=288,N=64,device_name=NVIDIA_A800-SXM4-80GB.json → triton_3_2_0/E=288,N=64,device_name=NVIDIA_A800-SXM4-80GB.json} +0 -0
- /sglang/srt/models/{xiaomi_mimo.py → mimo.py} +0 -0
- {sglang-0.4.6.post4.dist-info → sglang-0.4.7.dist-info}/licenses/LICENSE +0 -0
- {sglang-0.4.6.post4.dist-info → sglang-0.4.7.dist-info}/top_level.txt +0 -0

sglang/srt/function_call/llama32_detector.py (new file, +93 lines):

```diff
@@ -0,0 +1,93 @@
+import json
+import logging
+from typing import List
+
+from sglang.srt.function_call.base_format_detector import BaseFormatDetector
+from sglang.srt.function_call.core_types import (
+    StreamingParseResult,
+    StructureInfo,
+    _GetInfoFunc,
+)
+from sglang.srt.function_call.ebnf_composer import EBNFComposer
+from sglang.srt.openai_api.protocol import Tool
+
+logger = logging.getLogger(__name__)
+
+
+class Llama32Detector(BaseFormatDetector):
+    """
+    Detector for Llama 3.2 models.
+    Assumes function call format:
+        <|python_tag|>{"name":"xxx", "arguments":{...}}
+    """
+
+    def __init__(self):
+        super().__init__()
+        self.bot_token = "<|python_tag|>"
+        # NOTE: technically Llama3.2 doesn't support well with parallel tool calls
+        # They need specific prompt engineering to support parallel tool calls
+        # Here we use ';' as the separator, which might have compatibility issues
+        # if users define to use a different separator in their prompt
+        self.tool_call_separator = ";"
+
+    def has_tool_call(self, text: str) -> bool:
+        """Check if the text contains a Llama 3.2 format tool call."""
+        # depending on the prompt format the Llama model may or may not
+        # prefix the output with the <|python_tag|> token
+        return "<|python_tag|>" in text or text.startswith("{")
+
+    def detect_and_parse(self, text: str, tools: List[Tool]) -> StreamingParseResult:
+        """Parse function calls from text, handling multiple JSON objects."""
+        if "<|python_tag|>" not in text and not text.startswith("{"):
+            return StreamingParseResult(normal_text=text, calls=[])
+
+        if "<|python_tag|>" in text:
+            normal_text, action_text = text.split("<|python_tag|>", maxsplit=1)
+        else:
+            normal_text, action_text = "", text
+
+        decoder = json.JSONDecoder()
+        idx = 0
+        safe_idx = idx  # the index of the last valid JSON object
+        all_actions = []
+        action_text_len = len(action_text)
+        while idx < action_text_len:
+            try:
+                obj, end = decoder.raw_decode(action_text[idx:])
+                all_actions.append(obj)
+                idx += end + len(self.tool_call_separator)
+                safe_idx = idx
+            except json.JSONDecodeError as e:
+                # Find where next `{"name"` appears and try again
+                logger.warning(
+                    f"Failed to parse JSON part: {action_text[idx:]}, JSON parse error: {str(e)}"
+                )
+                next_obj_start = action_text.find('{"name":', idx + 1)
+                if next_obj_start == -1:
+                    break
+                idx = next_obj_start
+                continue
+
+        # Only process if we found valid JSON objects
+        calls = self.parse_base_json(all_actions, tools) if all_actions else []
+        # Use safe_idx to avoid idx containing the last part of an invalid JSON object
+        trailing_text = (
+            action_text[safe_idx:].strip() if safe_idx < action_text_len else ""
+        )
+        return StreamingParseResult(
+            normal_text=normal_text + trailing_text, calls=calls
+        )
+
+    def structure_info(self) -> _GetInfoFunc:
+        return lambda name: StructureInfo(
+            begin='<|python_tag|>{"name":"' + name + '", "arguments":',
+            end="}",
+            trigger="<|python_tag|>",
+        )
+
+    def build_ebnf(self, tools: List[Tool]):
+        return EBNFComposer.build_ebnf(
+            tools,
+            function_format="json",
+            tool_call_separator=self.tool_call_separator,
+        )
```
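
The new detector modules share the same surface (`has_tool_call`, `detect_and_parse`, `parse_streaming_increment`, `build_ebnf`). As a rough illustration of how the Llama 3.2 detector could be driven, here is a minimal sketch; the `Function` model and its `parameters` field are assumed to exist alongside `Tool` in `sglang.srt.openai_api.protocol`, the `get_weather` tool is invented for the example, and the expected outputs are inferred from the code above rather than taken from the diff.

```python
# Hypothetical usage sketch; import paths mirror the new modules in this diff,
# but the Function model and the concrete tool definition are assumptions.
from sglang.srt.function_call.llama32_detector import Llama32Detector
from sglang.srt.openai_api.protocol import Function, Tool  # Function/parameters assumed

tools = [
    Tool(type="function", function=Function(name="get_weather", parameters={"type": "object"}))
]
detector = Llama32Detector()

# Llama 3.2 style output: optional <|python_tag|> prefix, one JSON object per call,
# joined by ';' (the separator this detector is configured with).
output = 'Sure.<|python_tag|>{"name": "get_weather", "arguments": {"city": "Paris"}}'
print(detector.has_tool_call(output))  # True
result = detector.detect_and_parse(output, tools)
print(result.normal_text)              # expected: "Sure."
print([(c.name, c.parameters) for c in result.calls])
# expected (per the parsing logic above): one call named "get_weather" with
# JSON-encoded arguments {"city": "Paris"}
```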

sglang/srt/function_call/mistral_detector.py (new file, +131 lines):

```diff
@@ -0,0 +1,131 @@
+import json
+import logging
+import re
+from typing import List
+
+from sglang.srt.function_call.base_format_detector import BaseFormatDetector
+from sglang.srt.function_call.core_types import (
+    StreamingParseResult,
+    StructureInfo,
+    _GetInfoFunc,
+)
+from sglang.srt.function_call.ebnf_composer import EBNFComposer
+from sglang.srt.openai_api.protocol import Tool
+
+logger = logging.getLogger(__name__)
+
+
+class MistralDetector(BaseFormatDetector):
+    """
+    Detector for Mistral models.
+    Assumes function call format:
+        [TOOL_CALLS] [{"name":"func1", "arguments":{...}}, {"name":"func2", "arguments":{...}}]
+    """
+
+    def __init__(self):
+        """
+        Initializes the detector with necessary state variables.
+        """
+        super().__init__()
+        self.bot_token = "[TOOL_CALLS] ["
+        self.eot_token = "]"
+        self.tool_call_regex = re.compile(r"\[{.*}\]", re.DOTALL)
+        self.tool_call_separator = ", "
+
+    def has_tool_call(self, text: str) -> bool:
+        """Check if the text contains a Mistral format tool call."""
+        return self.bot_token in text
+
+    def detect_and_parse(self, text: str, tools: List[Tool]) -> StreamingParseResult:
+        """
+        One-time parsing: Detects and parses tool calls in the provided text.
+
+        :param text: The complete text to parse.
+        :param tools: List of available tools.
+        :return: ParseResult indicating success or failure, consumed text, leftover text, and parsed calls.
+        """
+        idx = text.find(self.bot_token)
+        normal_text = text[:idx].strip() if idx != -1 else text
+
+        if self.bot_token not in text:
+            return StreamingParseResult(normal_text=normal_text, calls=[])
+
+        # Extract the JSON array part from [TOOL_CALLS] [...]
+        # Use bracket counting to properly handle nested brackets in JSON content
+        json_array_str = self._extract_json_array(text)
+        if not json_array_str:
+            return StreamingParseResult(normal_text=normal_text, calls=[])
+
+        calls = []
+        try:
+            function_call_arr = json.loads(json_array_str)
+            # Handle both single object and array of objects
+            if not isinstance(function_call_arr, list):
+                function_call_arr = [function_call_arr]
+            calls = self.parse_base_json(function_call_arr, tools)
+        except json.JSONDecodeError as e:
+            logger.warning(
+                f"Failed to parse JSON part: {json_array_str}, JSON parse error: {str(e)}"
+            )
+
+        return StreamingParseResult(normal_text=normal_text, calls=calls)
+
+    def _extract_json_array(self, text: str) -> str:
+        """
+        Extract the JSON array part using bracket counting to handle nested brackets.
+
+        :param text: The complete text containing [TOOL_CALLS] [...]
+        :return: The JSON array string or None if not found
+        """
+        start_idx = text.find(self.bot_token)
+        if start_idx == -1:
+            return None
+
+        # Start from the opening bracket after [TOOL_CALLS]
+        json_start = (
+            start_idx + len(self.bot_token) - 1
+        )  # -1 to include the opening bracket
+        bracket_count = 0
+        in_string = False
+        escape_next = False
+
+        for i in range(json_start, len(text)):
+            char = text[i]
+
+            if escape_next:
+                escape_next = False
+                continue
+
+            if char == "\\":
+                escape_next = True
+                continue
+
+            if char == '"' and not escape_next:
+                in_string = not in_string
+                continue
+
+            if not in_string:
+                if char == "[":
+                    bracket_count += 1
+                elif char == "]":
+                    bracket_count -= 1
+                    if bracket_count == 0:
+                        return text[json_start : i + 1]
+
+        return None
+
+    def structure_info(self) -> _GetInfoFunc:
+        return lambda name: StructureInfo(
+            begin='[TOOL_CALLS] [{"name":"' + name + '", "arguments":',
+            end="}]",
+            trigger="[TOOL_CALLS]",
+        )
+
+    def build_ebnf(self, tools: List[Tool]):
+        return EBNFComposer.build_ebnf(
+            tools,
+            sequence_start_token=self.bot_token,
+            sequence_end_token=self.eot_token,
+            function_format="json",
+            tool_call_separator=self.tool_call_separator,
+        )
```
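
A similar hedged sketch for the Mistral format, mainly to show why `_extract_json_array` counts brackets (with string awareness) instead of relying on the greedy `\[{.*}\]` regex: square brackets inside string arguments must not terminate the array. The `Tool`/`Function` construction and the sample output are assumptions for illustration.

```python
# Hypothetical usage sketch; only the MistralDetector API comes from this diff.
from sglang.srt.function_call.mistral_detector import MistralDetector
from sglang.srt.openai_api.protocol import Function, Tool  # Function/parameters assumed

tools = [Tool(type="function", function=Function(name="search", parameters={"type": "object"}))]
detector = MistralDetector()

# The query string contains brackets; bracket counting skips characters inside
# JSON strings, so the array is still cut at the correct closing ']'.
output = 'Looking that up. [TOOL_CALLS] [{"name": "search", "arguments": {"query": "a[0] vs a[1]"}}]'
assert detector.has_tool_call(output)
result = detector.detect_and_parse(output, tools)
print(result.normal_text)                # expected: "Looking that up."
print([c.name for c in result.calls])    # expected: ["search"]
```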
@@ -0,0 +1,229 @@
|
|
1
|
+
import ast
|
2
|
+
import json
|
3
|
+
import logging
|
4
|
+
import re
|
5
|
+
from typing import List, Optional
|
6
|
+
|
7
|
+
from sglang.srt.function_call.base_format_detector import BaseFormatDetector
|
8
|
+
from sglang.srt.function_call.core_types import (
|
9
|
+
StreamingParseResult,
|
10
|
+
StructureInfo,
|
11
|
+
ToolCallItem,
|
12
|
+
_GetInfoFunc,
|
13
|
+
)
|
14
|
+
from sglang.srt.function_call.ebnf_composer import EBNFComposer
|
15
|
+
from sglang.srt.openai_api.protocol import Tool
|
16
|
+
|
17
|
+
logger = logging.getLogger(__name__)
|
18
|
+
|
19
|
+
|
20
|
+
class PythonicDetector(BaseFormatDetector):
|
21
|
+
"""
|
22
|
+
Detector for Llama-3.2 and Llama-4 models with pythonic tool call format.
|
23
|
+
Assumes function call format:
|
24
|
+
[tool1(arg1=val1, arg2=val2), tool2(arg1=val3)]
|
25
|
+
Arguments are Python literals (not JSON).
|
26
|
+
"""
|
27
|
+
|
28
|
+
def __init__(self):
|
29
|
+
super().__init__()
|
30
|
+
self.tool_call_regex = re.compile(
|
31
|
+
r"\[([a-zA-Z]+\w*\(([a-zA-Z]+\w*=.*,\s*)*([a-zA-Z]+\w*=.*\s)?\),\s*)*([a-zA-Z]+\w*\(([a-zA-Z]+\w*=.*,\s*)*([a-zA-Z]+\w*=.*\s*)?\)\s*)+\]",
|
32
|
+
re.DOTALL,
|
33
|
+
)
|
34
|
+
|
35
|
+
@staticmethod
|
36
|
+
def _text_strip(text: str) -> str:
|
37
|
+
# Llama 4 model sometime will output <|python_start|> and <|python_end|> tokens
|
38
|
+
# remove those tokens
|
39
|
+
text = text.replace("<|python_start|>", "")
|
40
|
+
text = text.replace("<|python_end|>", "")
|
41
|
+
return text
|
42
|
+
|
43
|
+
def has_tool_call(self, text: str) -> bool:
|
44
|
+
return bool(self.tool_call_regex.search(self._text_strip(text.strip())))
|
45
|
+
|
46
|
+
def detect_and_parse(self, text: str, tools: List[Tool]) -> StreamingParseResult:
|
47
|
+
# Try parsing the text as a Python list of function calls
|
48
|
+
text = text.strip()
|
49
|
+
|
50
|
+
# Remove unexpected <|python_start|> and <|python_end|> for llama4
|
51
|
+
text = self._text_strip(text)
|
52
|
+
|
53
|
+
match = self.tool_call_regex.search(text)
|
54
|
+
if match is None:
|
55
|
+
return StreamingParseResult(normal_text=text, calls=[])
|
56
|
+
|
57
|
+
# Extract the tool call part and any text before/after it
|
58
|
+
tool_call_start = match.start()
|
59
|
+
tool_call_end = match.end()
|
60
|
+
|
61
|
+
normal_text_before = text[:tool_call_start] if tool_call_start > 0 else ""
|
62
|
+
tool_call_text = text[tool_call_start:tool_call_end]
|
63
|
+
normal_text_after = text[tool_call_end:] if tool_call_end < len(text) else ""
|
64
|
+
|
65
|
+
# Combine normal text
|
66
|
+
normal_text = normal_text_before + normal_text_after
|
67
|
+
|
68
|
+
try:
|
69
|
+
module = ast.parse(tool_call_text)
|
70
|
+
parsed = getattr(module.body[0], "value", None)
|
71
|
+
if not (
|
72
|
+
isinstance(parsed, ast.List)
|
73
|
+
and all(isinstance(e, ast.Call) for e in parsed.elts)
|
74
|
+
):
|
75
|
+
return StreamingParseResult(normal_text=normal_text, calls=[])
|
76
|
+
|
77
|
+
calls = []
|
78
|
+
tool_indices = {
|
79
|
+
tool.function.name: i
|
80
|
+
for i, tool in enumerate(tools)
|
81
|
+
if tool.function.name
|
82
|
+
}
|
83
|
+
for call_index, call in enumerate(parsed.elts):
|
84
|
+
if not isinstance(call.func, ast.Name):
|
85
|
+
continue
|
86
|
+
function_name = call.func.id
|
87
|
+
# Validate that the function exists in the tools
|
88
|
+
if function_name not in tool_indices:
|
89
|
+
logger.warning(
|
90
|
+
f"Model attempted to call undefined function: {function_name}"
|
91
|
+
)
|
92
|
+
continue
|
93
|
+
arguments = {}
|
94
|
+
for keyword in call.keywords:
|
95
|
+
arguments[keyword.arg] = self._get_parameter_value(keyword.value)
|
96
|
+
calls.append(
|
97
|
+
ToolCallItem(
|
98
|
+
tool_index=call_index, # Use the call index in the response, not tool position
|
99
|
+
name=function_name,
|
100
|
+
parameters=json.dumps(arguments, ensure_ascii=False),
|
101
|
+
)
|
102
|
+
)
|
103
|
+
|
104
|
+
return StreamingParseResult(normal_text=normal_text, calls=calls)
|
105
|
+
except Exception:
|
106
|
+
logger.exception("Error in pythonic tool call parsing.")
|
107
|
+
return StreamingParseResult(normal_text=normal_text, calls=[])
|
108
|
+
|
109
|
+
def _find_matching_bracket(self, buffer: str, start: int) -> int:
|
110
|
+
"""
|
111
|
+
Find the matching closing bracket for the opening bracket at start position.
|
112
|
+
Properly handles nested brackets.
|
113
|
+
|
114
|
+
Args:
|
115
|
+
buffer: The text buffer to search in
|
116
|
+
start: Position of the opening bracket '['
|
117
|
+
|
118
|
+
Returns:
|
119
|
+
Position of the matching closing bracket ']', or -1 if not found
|
120
|
+
"""
|
121
|
+
bracket_count = 0
|
122
|
+
for i in range(start, len(buffer)):
|
123
|
+
if buffer[i] == "[":
|
124
|
+
bracket_count += 1
|
125
|
+
elif buffer[i] == "]":
|
126
|
+
bracket_count -= 1
|
127
|
+
+                if bracket_count == 0:
+                    return i
+        return -1  # No matching bracket found
+
+    def _strip_and_split_buffer(self, buffer: str) -> tuple[str, str]:
+        """
+        Strip special tokens from buffer and split into safe_text and held_back_text.
+
+        Returns:
+            tuple of (safe_text_to_output, text_to_hold_in_buffer)
+        """
+        # Check if original buffer ends with a partial token at the end
+        special_tokens = ["<|python_start|>", "<|python_end|>"]
+
+        for token in special_tokens:
+            partial_length = self._ends_with_partial_token(buffer, token)
+            if partial_length > 0:
+                # Split buffer: safe part + held back partial token
+                safe_text = buffer[:-partial_length]
+                held_back = buffer[-partial_length:]
+                # Strip complete special tokens from safe part only
+                safe_text = self._text_strip(safe_text)
+                return safe_text, held_back
+
+        # No partial tokens found, strip complete tokens from entire buffer
+        safe_text = self._text_strip(buffer)
+        return safe_text, ""
+
+    def parse_streaming_increment(
+        self, new_text: str, tools: List[Tool]
+    ) -> StreamingParseResult:
+        """
+        Streaming incremental parsing for pythonic tool calls.
+        Buffers input until a complete pythonic tool call (from [ to ]) is found,
+        then parses and emits any detected calls.
+        """
+        self._buffer += new_text
+
+        # Strip special tokens from entire buffer and handle partial tokens
+        stripped_buffer, held_back = self._strip_and_split_buffer(self._buffer)
+
+        start = stripped_buffer.find("[")
+
+        if start == -1:
+            # No tool call bracket found
+            self._buffer = held_back
+            return StreamingParseResult(normal_text=stripped_buffer)
+
+        normal_text = stripped_buffer[:start] if start > 0 else ""
+
+        end = self._find_matching_bracket(stripped_buffer, start)
+        if end != -1:
+            # Found complete tool call
+            call_text = stripped_buffer[start : end + 1]
+            result = self.detect_and_parse(call_text, tools)
+
+            # Update buffer with remaining text after tool call plus any held back text
+            remaining_text = stripped_buffer[end + 1 :] + held_back
+            self._buffer = remaining_text
+
+            # If we had normal text before the tool call, add it to the result
+            if normal_text:
+                result.normal_text = normal_text + (result.normal_text or "")
+
+            return result
+
+        # We have an opening bracket but no closing bracket yet
+        # Put back everything from the bracket onwards plus held back text
+        self._buffer = stripped_buffer[start:] + held_back
+
+        if normal_text:
+            return StreamingParseResult(normal_text=normal_text)
+
+        # Otherwise, we're still accumulating a potential tool call
+        return StreamingParseResult(normal_text="")
+
+    def _get_parameter_value(self, val):
+        if isinstance(val, ast.Constant):
+            return val.value
+        elif isinstance(val, ast.Dict):
+            return {
+                k.value: self._get_parameter_value(v)
+                for k, v in zip(val.keys, val.values)
+            }
+        elif isinstance(val, ast.List):
+            return [self._get_parameter_value(v) for v in val.elts]
+        else:
+            raise ValueError("Tool call arguments must be literals")
+
+    def structure_info(self) -> _GetInfoFunc:
+        def info(name: str):
+            return StructureInfo(begin=f"[{name}(", end=")]", trigger=f"[{name}(")
+
+        return info
+
+    def build_ebnf(self, tools: List[Tool]) -> Optional[str]:
+        return EBNFComposer.build_ebnf(
+            tools,
+            sequence_start_token="[",
+            sequence_end_token="]",
+            tool_call_separator=",",
+            function_format="pythonic",
+        )
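Note on the streaming logic above: parse_streaming_increment only hands text to detect_and_parse once a balanced [...] span has arrived, and any buffer suffix that could be the start of a special token is held back. The snippet below is a minimal, self-contained sketch of that buffering idea; the find_matching_bracket helper and the chunk sequence are illustrative stand-ins, not code from this diff.

    # Minimal sketch of "buffer until a balanced [...] block arrives" (illustrative only).
    def find_matching_bracket(text: str, start: int) -> int:
        """Return the index of the ']' matching text[start] == '[', or -1."""
        depth = 0
        for i in range(start, len(text)):
            if text[i] == "[":
                depth += 1
            elif text[i] == "]":
                depth -= 1
                if depth == 0:
                    return i
        return -1

    buffer = ""
    for chunk in ["Sure, calling ", "[get_weather(city=", '"Paris")]', " done"]:
        buffer += chunk
        start = buffer.find("[")
        if start == -1:
            print("normal text:", repr(buffer))  # plain text can be emitted right away
            buffer = ""
            continue
        end = find_matching_bracket(buffer, start)
        if end == -1:
            continue  # opening bracket seen, still accumulating the call
        print("tool call text:", repr(buffer[start : end + 1]))
        buffer = buffer[end + 1 :]

The detector above additionally strips the <|python_start|>/<|python_end|> markers and holds back partial ones before doing this search, as _strip_and_split_buffer shows.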
@@ -0,0 +1,121 @@
+import json
+import logging
+import re
+from typing import List
+
+from sglang.srt.function_call.base_format_detector import BaseFormatDetector
+from sglang.srt.function_call.core_types import (
+    StreamingParseResult,
+    StructureInfo,
+    _GetInfoFunc,
+)
+from sglang.srt.function_call.ebnf_composer import EBNFComposer
+from sglang.srt.openai_api.protocol import Tool
+
+logger = logging.getLogger(__name__)
+
+
+class Qwen25Detector(BaseFormatDetector):
+    """
+    Detector for Qwen 2.5 models.
+    Assumes function call format:
+        <tool_call>\n{"name":"func1", "arguments":{...}}\n</tool_call>\n<tool_call>\n{"name":"func2", "arguments":{...}}\n</tool_call>
+    """
+
+    def __init__(self):
+        """
+        Initializes the detector with necessary state variables.
+        """
+        super().__init__()
+        self.bot_token = "<tool_call>\n"
+        self.eot_token = "\n</tool_call>"
+        self.tool_call_separator = "\n"
+        self._normal_text_buffer = ""  # Buffer for handling partial end tokens
+
+    def has_tool_call(self, text: str) -> bool:
+        """Check if the text contains a Qwen 2.5 format tool call."""
+        return self.bot_token in text
+
+    def detect_and_parse(self, text: str, tools: List[Tool]) -> StreamingParseResult:
+        """
+        One-time parsing: Detects and parses tool calls in the provided text.
+
+        :param text: The complete text to parse.
+        :param tools: List of available tools.
+        :return: ParseResult indicating success or failure, consumed text, leftover text, and parsed calls.
+        """
+        idx = text.find(self.bot_token)
+        normal_text = text[:idx].strip() if idx != -1 else text
+        if self.bot_token not in text:
+            return StreamingParseResult(normal_text=normal_text, calls=[])
+
+        # Find all <tool_call>\n...\n</tool_call> blocks
+        pattern = rf"{re.escape(self.bot_token)}(.*?){re.escape(self.eot_token)}"
+        match_result_list = re.findall(pattern, text, re.DOTALL)
+        calls = []
+        for match_result in match_result_list:
+            try:
+                parsed_call = json.loads(match_result.strip())
+                calls.extend(self.parse_base_json(parsed_call, tools))
+            except json.JSONDecodeError as e:
+                logger.warning(
+                    f"Failed to parse JSON part: {match_result}, JSON parse error: {str(e)}"
+                )
+                continue
+        return StreamingParseResult(normal_text=normal_text, calls=calls)
+
+    def parse_streaming_increment(
+        self, new_text: str, tools: List[Tool]
+    ) -> StreamingParseResult:
+        """
+        Streaming incremental parsing for Qwen 2.5 tool calls.
+        Uses base class implementation with buffering to handle partial end tokens.
+        """
+        result = super().parse_streaming_increment(new_text, tools)
+
+        # Handle partial end tokens that are streamed character by character
+        if result.normal_text:
+            self._normal_text_buffer += result.normal_text
+
+            # Check if buffer contains complete end token (without leading newline)
+            end_token_without_newline = self.eot_token[1:]  # "</tool_call>"
+            if end_token_without_newline in self._normal_text_buffer:
+                cleaned_text = self._normal_text_buffer.replace(
+                    end_token_without_newline, ""
+                )
+                self._normal_text_buffer = ""
+                result.normal_text = cleaned_text
+            else:
+                # Check if buffer might contain partial end token at the end
+                partial_match_len = self._ends_with_partial_token(
+                    self._normal_text_buffer, end_token_without_newline
+                )
+
+                if partial_match_len:
+                    # Keep potential partial match in buffer, return the rest
+                    result.normal_text = self._normal_text_buffer[:-partial_match_len]
+                    self._normal_text_buffer = self._normal_text_buffer[
+                        -partial_match_len:
+                    ]
+                else:
+                    # No partial match, return all buffered text
+                    result.normal_text = self._normal_text_buffer
+                    self._normal_text_buffer = ""
+
+        return result
+
+    def structure_info(self) -> _GetInfoFunc:
+        return lambda name: StructureInfo(
+            begin='<tool_call>\n{"name":"' + name + '", "arguments":',
+            end="}\n</tool_call>",
+            trigger="<tool_call>",
+        )
+
+    def build_ebnf(self, tools: List[Tool]):
+        return EBNFComposer.build_ebnf(
+            tools,
+            individual_call_start_token=self.bot_token.replace("\n", "\\n"),
+            individual_call_end_token=self.eot_token.replace("\n", "\\n"),
+            tool_call_separator="\\n",
+            function_format="json",
+        )
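For readers unfamiliar with the Qwen 2.5 wire format: each call is a JSON object wrapped in <tool_call>...</tool_call> markers. The standalone sketch below mirrors the extraction step performed by detect_and_parse; the sample text and tool names are invented for illustration.

    import json
    import re

    bot_token = "<tool_call>\n"
    eot_token = "\n</tool_call>"

    text = (
        "Let me check that for you.\n"
        '<tool_call>\n{"name": "get_weather", "arguments": {"city": "Paris"}}\n</tool_call>\n'
        '<tool_call>\n{"name": "get_time", "arguments": {"tz": "CET"}}\n</tool_call>'
    )

    # Same pattern shape as in detect_and_parse: escape the markers, capture each body.
    pattern = rf"{re.escape(bot_token)}(.*?){re.escape(eot_token)}"
    for block in re.findall(pattern, text, re.DOTALL):
        call = json.loads(block.strip())
        print(call["name"], call["arguments"])
    # -> get_weather {'city': 'Paris'}
    # -> get_time {'tz': 'CET'}

Non-greedy matching with re.DOTALL keeps each block separate even when several calls appear back to back, which is why the detector can simply json.loads every captured body.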
@@ -0,0 +1,52 @@
+import json
+from json import JSONDecodeError, JSONDecoder
+from typing import Any, Tuple
+
+import partial_json_parser
+from partial_json_parser.core.options import Allow
+
+
+def _find_common_prefix(s1: str, s2: str) -> str:
+    prefix = ""
+    min_length = min(len(s1), len(s2))
+    for i in range(0, min_length):
+        if s1[i] == s2[i]:
+            prefix += s1[i]
+        else:
+            break
+    return prefix
+
+
+def _partial_json_loads(input_str: str, flags: Allow) -> Tuple[Any, int]:
+    """
+    Parse incomplete or partial JSON strings commonly encountered during streaming.
+
+    Args:
+        input_str (str): The potentially incomplete JSON string to parse.
+        flags (Allow): Bitwise flags controlling what types of partial data are allowed.
+            Common flags include:
+            - Allow.STR: Allow partial strings (e.g., '"hello wo' -> 'hello wo')
+            - Allow.OBJ: Allow partial objects (e.g., '{"key":' -> {'key': None})
+            - Allow.ARR: Allow partial arrays (e.g., '[1, 2,' -> [1, 2])
+            - Allow.ALL: Allow all types of partial data
+
+    Returns:
+        Tuple[Any, int]: A tuple containing:
+            - parsed_object: The Python object parsed from the JSON
+            - consumed_length: Number of characters consumed from input_str
+    """
+    try:
+        return (partial_json_parser.loads(input_str, flags), len(input_str))
+    except JSONDecodeError as e:
+        if "Extra data" in e.msg:
+            dec = JSONDecoder()
+            return dec.raw_decode(input_str)
+        raise
+
+
+def _is_complete_json(input_str: str) -> bool:
+    try:
+        json.loads(input_str)
+        return True
+    except JSONDecodeError:
+        return False
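These helpers exist because tool-call arguments arrive as incomplete JSON while streaming. The sketch below shows roughly how partial_json_parser behaves on such a fragment and adds a completeness check analogous to _is_complete_json; the fragment is made up, and the exact results for the flag combinations may vary with the partial_json_parser version.

    import json
    from json import JSONDecodeError

    import partial_json_parser
    from partial_json_parser.core.options import Allow

    # A fragment of a tool-call argument object as it might arrive mid-stream.
    fragment = '{"city": "Par'

    # Allow.ALL lets both the enclosing object and the trailing string be partial.
    print(partial_json_parser.loads(fragment, Allow.ALL))  # e.g. {'city': 'Par'}

    # Masking out Allow.STR refuses partial strings, so the unfinished pair is dropped.
    print(partial_json_parser.loads(fragment, Allow.ALL & ~Allow.STR))  # e.g. {}

    def is_complete_json(s: str) -> bool:
        # Same idea as _is_complete_json above: complete iff json.loads succeeds.
        try:
            json.loads(s)
            return True
        except JSONDecodeError:
            return False

    print(is_complete_json(fragment))             # False
    print(is_complete_json('{"city": "Paris"}'))  # True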