sglang 0.4.6.post5__py3-none-any.whl → 0.4.7__py3-none-any.whl
This diff shows the contents of publicly available package versions that have been released to one of the supported registries. It is provided for informational purposes only and reflects the changes between package versions as they appear in their respective public registries.
- sglang/bench_offline_throughput.py +10 -4
- sglang/bench_one_batch_server.py +67 -11
- sglang/bench_serving.py +85 -74
- sglang/lang/backend/runtime_endpoint.py +24 -1
- sglang/profiler.py +167 -0
- sglang/srt/_custom_ops.py +34 -0
- sglang/srt/configs/internvl.py +8 -12
- sglang/srt/configs/model_config.py +27 -1
- sglang/srt/constrained/base_grammar_backend.py +5 -2
- sglang/srt/constrained/llguidance_backend.py +9 -8
- sglang/srt/constrained/outlines_backend.py +5 -4
- sglang/srt/constrained/xgrammar_backend.py +18 -18
- sglang/srt/conversation.py +46 -8
- sglang/srt/custom_op.py +38 -3
- sglang/srt/debug_utils.py +74 -0
- sglang/srt/disaggregation/common/__init__.py +1 -0
- sglang/srt/disaggregation/common/conn.py +407 -0
- sglang/srt/disaggregation/decode.py +67 -3
- sglang/srt/disaggregation/fake/conn.py +1 -0
- sglang/srt/disaggregation/kv_events.py +60 -5
- sglang/srt/disaggregation/launch_lb.py +140 -0
- sglang/srt/disaggregation/mini_lb.py +29 -48
- sglang/srt/disaggregation/mooncake/conn.py +432 -140
- sglang/srt/disaggregation/mooncake/transfer_engine.py +32 -16
- sglang/srt/disaggregation/nixl/conn.py +124 -432
- sglang/srt/disaggregation/prefill.py +2 -0
- sglang/srt/disaggregation/utils.py +38 -1
- sglang/srt/distributed/device_communicators/pymscclpp.py +315 -0
- sglang/srt/distributed/parallel_state.py +52 -5
- sglang/srt/entrypoints/EngineBase.py +6 -0
- sglang/srt/entrypoints/engine.py +102 -5
- sglang/srt/entrypoints/http_server.py +15 -2
- sglang/srt/function_call/base_format_detector.py +138 -86
- sglang/srt/function_call/deepseekv3_detector.py +54 -6
- sglang/srt/function_call/ebnf_composer.py +33 -19
- sglang/srt/function_call/function_call_parser.py +27 -0
- sglang/srt/function_call/llama32_detector.py +33 -14
- sglang/srt/function_call/mistral_detector.py +73 -26
- sglang/srt/function_call/pythonic_detector.py +86 -20
- sglang/srt/function_call/qwen25_detector.py +64 -10
- sglang/srt/function_call/utils.py +17 -0
- sglang/srt/hf_transformers_utils.py +4 -0
- sglang/srt/layers/attention/aiter_backend.py +488 -123
- sglang/srt/layers/attention/base_attn_backend.py +4 -0
- sglang/srt/layers/attention/cutlass_mla_backend.py +2 -19
- sglang/srt/layers/attention/flashattention_backend.py +103 -18
- sglang/srt/layers/attention/flashinfer_backend.py +45 -1
- sglang/srt/layers/attention/flashinfer_mla_backend.py +37 -1
- sglang/srt/layers/attention/intel_amx_backend.py +128 -0
- sglang/srt/layers/attention/tbo_backend.py +232 -0
- sglang/srt/layers/attention/torch_native_backend.py +3 -0
- sglang/srt/layers/attention/triton_backend.py +244 -5
- sglang/srt/layers/attention/triton_ops/extend_attention.py +12 -4
- sglang/srt/layers/communicator.py +260 -194
- sglang/srt/layers/dp_attention.py +6 -5
- sglang/srt/layers/layernorm.py +30 -19
- sglang/srt/layers/moe/cutlass_moe.py +170 -7
- sglang/srt/layers/moe/cutlass_moe_params.py +169 -0
- sglang/srt/layers/moe/ep_moe/kernels.py +27 -6
- sglang/srt/layers/moe/ep_moe/layer.py +94 -40
- sglang/srt/layers/moe/ep_moe/token_dispatcher.py +13 -8
- sglang/srt/layers/moe/fused_moe_native.py +4 -0
- sglang/srt/layers/moe/fused_moe_triton/configs/triton_3_2_0/E=257,N=128,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8,block_shape=[128, 128].json +146 -0
- sglang/srt/layers/moe/fused_moe_triton/configs/triton_3_2_0/E=257,N=128,device_name=NVIDIA_H20,dtype=fp8_w8a8,block_shape=[128, 128].json +146 -0
- sglang/srt/layers/moe/fused_moe_triton/configs/triton_3_2_0/E=257,N=256,device_name=NVIDIA_B200,dtype=fp8_w8a8,block_shape=[128, 128].json +146 -0
- sglang/srt/layers/moe/fused_moe_triton/configs/triton_3_2_0/E=257,N=256,device_name=NVIDIA_H20,dtype=fp8_w8a8,block_shape=[128, 128].json +146 -0
- sglang/srt/layers/moe/fused_moe_triton/configs/triton_3_2_0/E=257,N=256,device_name=NVIDIA_H200,dtype=fp8_w8a8,block_shape=[128, 128].json +146 -0
- sglang/srt/layers/moe/fused_moe_triton/configs/triton_3_2_0/E=8,N=7168,device_name=NVIDIA_H100_80GB_HBM3.json +146 -0
- sglang/srt/layers/moe/fused_moe_triton/configs/triton_3_3_1/E=257,N=256,device_name=NVIDIA_B200,dtype=fp8_w8a8,block_shape=[128, 128].json +146 -0
- sglang/srt/layers/moe/fused_moe_triton/configs/triton_3_3_1/E=8,N=7168,device_name=NVIDIA_H100_80GB_HBM3.json +146 -0
- sglang/srt/layers/moe/fused_moe_triton/fused_moe.py +220 -25
- sglang/srt/layers/moe/fused_moe_triton/layer.py +34 -4
- sglang/srt/layers/moe/topk.py +44 -18
- sglang/srt/layers/multimodal.py +3 -3
- sglang/srt/layers/quantization/__init__.py +3 -2
- sglang/srt/layers/quantization/blockwise_int8.py +3 -0
- sglang/srt/layers/quantization/compressed_tensors/compressed_tensors_moe.py +5 -0
- sglang/srt/layers/quantization/deep_gemm.py +55 -56
- sglang/srt/layers/quantization/fp8.py +28 -23
- sglang/srt/layers/quantization/fp8_kernel.py +118 -66
- sglang/srt/layers/quantization/fp8_utils.py +165 -49
- sglang/srt/layers/quantization/modelopt_quant.py +334 -7
- sglang/srt/layers/quantization/moe_wna16.py +3 -0
- sglang/srt/layers/quantization/w8a8_fp8.py +3 -0
- sglang/srt/layers/quantization/w8a8_int8.py +3 -0
- sglang/srt/layers/rotary_embedding.py +6 -12
- sglang/srt/layers/sampler.py +80 -79
- sglang/srt/layers/utils.py +6 -0
- sglang/srt/lora/layers.py +12 -15
- sglang/srt/lora/lora.py +49 -5
- sglang/srt/lora/lora_manager.py +19 -5
- sglang/srt/lora/mem_pool.py +24 -16
- sglang/srt/lora/utils.py +17 -13
- sglang/srt/managers/data_parallel_controller.py +13 -5
- sglang/srt/managers/eplb_algorithms/__init__.py +63 -0
- sglang/srt/managers/eplb_algorithms/deepseek.py +223 -0
- sglang/srt/managers/{deepseek_eplb.py → eplb_algorithms/deepseek_vec.py} +5 -7
- sglang/srt/managers/eplb_manager.py +55 -14
- sglang/srt/managers/expert_distribution.py +220 -46
- sglang/srt/managers/expert_location.py +110 -56
- sglang/srt/managers/expert_location_dispatch.py +23 -6
- sglang/srt/managers/io_struct.py +15 -4
- sglang/srt/managers/mm_utils.py +88 -38
- sglang/srt/managers/multimodal_processors/base_processor.py +188 -16
- sglang/srt/managers/multimodal_processors/gemma3.py +4 -31
- sglang/srt/managers/multimodal_processors/internvl.py +4 -0
- sglang/srt/managers/multimodal_processors/kimi_vl.py +15 -34
- sglang/srt/managers/multimodal_processors/minicpm.py +2 -1
- sglang/srt/managers/multimodal_processors/phi4mm.py +87 -0
- sglang/srt/managers/multimodal_processors/qwen_vl.py +22 -64
- sglang/srt/managers/schedule_batch.py +140 -38
- sglang/srt/managers/scheduler.py +305 -112
- sglang/srt/managers/tokenizer_manager.py +134 -17
- sglang/srt/managers/utils.py +0 -4
- sglang/srt/metrics/collector.py +9 -0
- sglang/srt/model_executor/cuda_graph_runner.py +72 -61
- sglang/srt/model_executor/expert_location_updater.py +157 -22
- sglang/srt/model_executor/forward_batch_info.py +38 -17
- sglang/srt/model_executor/model_runner.py +96 -56
- sglang/srt/model_loader/utils.py +67 -1
- sglang/srt/models/deepseek_nextn.py +1 -1
- sglang/srt/models/deepseek_v2.py +609 -234
- sglang/srt/models/gemma3_causal.py +7 -0
- sglang/srt/models/gemma3_mm.py +19 -14
- sglang/srt/models/idefics2.py +342 -0
- sglang/srt/models/kimi_vl.py +4 -4
- sglang/srt/models/llama.py +1 -1
- sglang/srt/models/minicpmo.py +2 -5
- sglang/srt/models/minicpmv.py +3 -295
- sglang/srt/models/phi4mm.py +512 -0
- sglang/srt/models/qwen2.py +38 -9
- sglang/srt/models/qwen2_5_vl.py +3 -9
- sglang/srt/models/qwen2_eagle.py +4 -1
- sglang/srt/models/qwen2_moe.py +58 -191
- sglang/srt/models/qwen2_vl.py +3 -9
- sglang/srt/models/qwen3.py +41 -10
- sglang/srt/models/qwen3_moe.py +230 -191
- sglang/srt/models/registry.py +9 -1
- sglang/srt/models/transformers.py +291 -0
- sglang/srt/openai_api/adapter.py +86 -24
- sglang/srt/openai_api/protocol.py +31 -2
- sglang/srt/openai_api/utils.py +172 -0
- sglang/srt/operations.py +37 -2
- sglang/srt/operations_strategy.py +200 -24
- sglang/srt/sampling/sampling_batch_info.py +13 -1
- sglang/srt/sampling/sampling_params.py +2 -1
- sglang/srt/server_args.py +114 -27
- sglang/srt/speculative/build_eagle_tree.py +8 -8
- sglang/srt/speculative/eagle_draft_cuda_graph_runner.py +8 -11
- sglang/srt/speculative/eagle_draft_extend_cuda_graph_runner.py +253 -0
- sglang/srt/speculative/eagle_utils.py +51 -91
- sglang/srt/speculative/eagle_worker.py +101 -21
- sglang/srt/two_batch_overlap.py +635 -0
- sglang/srt/utils.py +129 -7
- sglang/test/runners.py +16 -7
- sglang/test/send_one.py +4 -0
- sglang/test/test_cutlass_moe.py +3 -3
- sglang/test/test_fp4_moe.py +248 -0
- sglang/test/test_utils.py +79 -6
- sglang/version.py +1 -1
- {sglang-0.4.6.post5.dist-info → sglang-0.4.7.dist-info}/METADATA +14 -11
- {sglang-0.4.6.post5.dist-info → sglang-0.4.7.dist-info}/RECORD +318 -291
- {sglang-0.4.6.post5.dist-info → sglang-0.4.7.dist-info}/WHEEL +1 -1
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=1,N=14336,device_name=NVIDIA_A100-SXM4-80GB,dtype=int8_w8a16.json → triton_3_1_0/E=1,N=14336,device_name=NVIDIA_A100-SXM4-80GB,dtype=int8_w8a16.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=1,N=14336,device_name=NVIDIA_A100-SXM4-80GB.json → triton_3_1_0/E=1,N=14336,device_name=NVIDIA_A100-SXM4-80GB.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=1,N=1792,device_name=NVIDIA_A100-SXM4-80GB,dtype=int8_w8a16.json → triton_3_1_0/E=1,N=1792,device_name=NVIDIA_A100-SXM4-80GB,dtype=int8_w8a16.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=1,N=1792,device_name=NVIDIA_A100-SXM4-80GB.json → triton_3_1_0/E=1,N=1792,device_name=NVIDIA_A100-SXM4-80GB.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=1,N=3072,device_name=NVIDIA_A100-SXM4-80GB,dtype=int8_w8a16.json → triton_3_1_0/E=1,N=3072,device_name=NVIDIA_A100-SXM4-80GB,dtype=int8_w8a16.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=1,N=3072,device_name=NVIDIA_H100_80GB_HBM3,dtype=int8_w8a16.json → triton_3_1_0/E=1,N=3072,device_name=NVIDIA_H100_80GB_HBM3,dtype=int8_w8a16.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=1,N=3072,device_name=NVIDIA_H100_80GB_HBM3.json → triton_3_1_0/E=1,N=3072,device_name=NVIDIA_H100_80GB_HBM3.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=1,N=3584,device_name=NVIDIA_A100-SXM4-80GB,dtype=int8_w8a16.json → triton_3_1_0/E=1,N=3584,device_name=NVIDIA_A100-SXM4-80GB,dtype=int8_w8a16.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=1,N=3584,device_name=NVIDIA_A100-SXM4-80GB.json → triton_3_1_0/E=1,N=3584,device_name=NVIDIA_A100-SXM4-80GB.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=1,N=7168,device_name=NVIDIA_A100-SXM4-80GB,dtype=int8_w8a16.json → triton_3_1_0/E=1,N=7168,device_name=NVIDIA_A100-SXM4-80GB,dtype=int8_w8a16.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=1,N=7168,device_name=NVIDIA_A100-SXM4-80GB.json → triton_3_1_0/E=1,N=7168,device_name=NVIDIA_A100-SXM4-80GB.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=144,N=512,device_name=NVIDIA_H100_80GB_HBM3.json → triton_3_1_0/E=144,N=512,device_name=NVIDIA_H100_80GB_HBM3.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=16,N=1024,device_name=NVIDIA_H100_80GB_HBM3.json → triton_3_1_0/E=16,N=1024,device_name=NVIDIA_H100_80GB_HBM3.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=16,N=1024,device_name=NVIDIA_H200.json → triton_3_1_0/E=16,N=1024,device_name=NVIDIA_H200.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=16,N=1344,device_name=NVIDIA_A100-SXM4-40GB.json → triton_3_1_0/E=16,N=1344,device_name=NVIDIA_A100-SXM4-40GB.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=16,N=1344,device_name=NVIDIA_A100-SXM4-80GB.json → triton_3_1_0/E=16,N=1344,device_name=NVIDIA_A100-SXM4-80GB.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=16,N=1344,device_name=NVIDIA_H100_80GB_HBM3.json → triton_3_1_0/E=16,N=1344,device_name=NVIDIA_H100_80GB_HBM3.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=16,N=14336,device_name=NVIDIA_A100-SXM4-80GB,dtype=int8_w8a16.json → triton_3_1_0/E=16,N=14336,device_name=NVIDIA_A100-SXM4-80GB,dtype=int8_w8a16.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=16,N=14336,device_name=NVIDIA_A100-SXM4-80GB.json → triton_3_1_0/E=16,N=14336,device_name=NVIDIA_A100-SXM4-80GB.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=16,N=1792,device_name=NVIDIA_A100-SXM4-80GB,dtype=int8_w8a16.json → triton_3_1_0/E=16,N=1792,device_name=NVIDIA_A100-SXM4-80GB,dtype=int8_w8a16.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=16,N=1792,device_name=NVIDIA_A100-SXM4-80GB.json → triton_3_1_0/E=16,N=1792,device_name=NVIDIA_A100-SXM4-80GB.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=16,N=2048,device_name=NVIDIA_H100_80GB_HBM3.json → triton_3_1_0/E=16,N=2048,device_name=NVIDIA_H100_80GB_HBM3.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=16,N=2688,device_name=NVIDIA_A100-SXM4-80GB.json → triton_3_1_0/E=16,N=2688,device_name=NVIDIA_A100-SXM4-80GB.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=16,N=2688,device_name=NVIDIA_H100_80GB_HBM3.json → triton_3_1_0/E=16,N=2688,device_name=NVIDIA_H100_80GB_HBM3.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=16,N=3072,device_name=NVIDIA_A100-SXM4-80GB,dtype=int8_w8a16.json → triton_3_1_0/E=16,N=3072,device_name=NVIDIA_A100-SXM4-80GB,dtype=int8_w8a16.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=16,N=3072,device_name=NVIDIA_H100_80GB_HBM3,dtype=int8_w8a16.json → triton_3_1_0/E=16,N=3072,device_name=NVIDIA_H100_80GB_HBM3,dtype=int8_w8a16.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=16,N=3200,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8.json → triton_3_1_0/E=16,N=3200,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=16,N=3584,device_name=NVIDIA_A100-SXM4-80GB,dtype=int8_w8a16.json → triton_3_1_0/E=16,N=3584,device_name=NVIDIA_A100-SXM4-80GB,dtype=int8_w8a16.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=16,N=3584,device_name=NVIDIA_A100-SXM4-80GB.json → triton_3_1_0/E=16,N=3584,device_name=NVIDIA_A100-SXM4-80GB.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=16,N=6400,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8.json → triton_3_1_0/E=16,N=6400,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=16,N=7168,device_name=NVIDIA_A100-SXM4-80GB,dtype=int8_w8a16.json → triton_3_1_0/E=16,N=7168,device_name=NVIDIA_A100-SXM4-80GB,dtype=int8_w8a16.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=16,N=7168,device_name=NVIDIA_A100-SXM4-80GB.json → triton_3_1_0/E=16,N=7168,device_name=NVIDIA_A100-SXM4-80GB.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=16,N=7168,device_name=NVIDIA_H100_80GB_HBM3,dtype=int8_w8a16.json → triton_3_1_0/E=16,N=7168,device_name=NVIDIA_H100_80GB_HBM3,dtype=int8_w8a16.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=16,N=800,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8.json → triton_3_1_0/E=16,N=800,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=160,N=192,device_name=NVIDIA_A800-SXM4-80GB.json → triton_3_1_0/E=160,N=192,device_name=NVIDIA_A800-SXM4-80GB.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=20,N=2048,device_name=NVIDIA_H100_80GB_HBM3.json → triton_3_1_0/E=20,N=2048,device_name=NVIDIA_H100_80GB_HBM3.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=24,N=1024,device_name=NVIDIA_H100_80GB_HBM3.json → triton_3_1_0/E=24,N=1024,device_name=NVIDIA_H100_80GB_HBM3.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=256,N=128,device_name=NVIDIA_A100-SXM4-80GB,dtype=int8_w8a8,block_shape=[128, 128].json → triton_3_1_0/E=256,N=128,device_name=NVIDIA_A100-SXM4-80GB,dtype=int8_w8a8,block_shape=[128, 128].json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=256,N=128,device_name=NVIDIA_A100-SXM4-80GB,dtype=int8_w8a8.json → triton_3_1_0/E=256,N=128,device_name=NVIDIA_A100-SXM4-80GB,dtype=int8_w8a8.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=256,N=128,device_name=NVIDIA_A800-SXM4-80GB,dtype=int8_w8a8,block_shape=[128, 128].json → triton_3_1_0/E=256,N=128,device_name=NVIDIA_A800-SXM4-80GB,dtype=int8_w8a8,block_shape=[128, 128].json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=256,N=128,device_name=NVIDIA_A800-SXM4-80GB,dtype=int8_w8a8.json → triton_3_1_0/E=256,N=128,device_name=NVIDIA_A800-SXM4-80GB,dtype=int8_w8a8.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=256,N=128,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8,block_shape=[128, 128].json → triton_3_1_0/E=256,N=128,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8,block_shape=[128, 128].json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=256,N=128,device_name=NVIDIA_H20,block_shape=[128, 128].json → triton_3_1_0/E=256,N=128,device_name=NVIDIA_H20,block_shape=[128, 128].json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=256,N=128,device_name=NVIDIA_H20,dtype=fp8_w8a8,block_shape=[128, 128].json → triton_3_1_0/E=256,N=128,device_name=NVIDIA_H20,dtype=fp8_w8a8,block_shape=[128, 128].json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=256,N=128,device_name=NVIDIA_L20Y,dtype=fp8_w8a8,block_shape=[128, 128].json → triton_3_1_0/E=256,N=128,device_name=NVIDIA_L20Y,dtype=fp8_w8a8,block_shape=[128, 128].json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=256,N=256,device_name=AMD_Instinct_MI300X,dtype=fp8_w8a8,block_shape=[128, 128].json → triton_3_1_0/E=256,N=256,device_name=AMD_Instinct_MI300X,dtype=fp8_w8a8,block_shape=[128, 128].json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=256,N=256,device_name=AMD_Instinct_MI325X,dtype=fp8_w8a8,block_shape=[128, 128].json → triton_3_1_0/E=256,N=256,device_name=AMD_Instinct_MI325X,dtype=fp8_w8a8,block_shape=[128, 128].json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=256,N=256,device_name=AMD_Radeon_Graphics,dtype=fp8_w8a8,block_shape=[128, 128].json → triton_3_1_0/E=256,N=256,device_name=AMD_Radeon_Graphics,dtype=fp8_w8a8,block_shape=[128, 128].json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=256,N=256,device_name=NVIDIA_B200,dtype=fp8_w8a8,block_shape=[128, 128].json → triton_3_1_0/E=256,N=256,device_name=NVIDIA_B200,dtype=fp8_w8a8,block_shape=[128, 128].json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=256,N=256,device_name=NVIDIA_H20,dtype=int8_w8a8,block_shape=[128, 128].json → triton_3_1_0/E=256,N=256,device_name=NVIDIA_H20,dtype=int8_w8a8,block_shape=[128, 128].json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=256,N=256,device_name=NVIDIA_H200,dtype=fp8_w8a8,block_shape=[128, 128].json → triton_3_1_0/E=256,N=256,device_name=NVIDIA_H200,dtype=fp8_w8a8,block_shape=[128, 128].json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=256,N=64,device_name=NVIDIA_A800-SXM4-80GB.json → triton_3_1_0/E=256,N=64,device_name=NVIDIA_A800-SXM4-80GB.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=256,N=64,device_name=NVIDIA_L20,dtype=int8_w8a8.json → triton_3_1_0/E=256,N=64,device_name=NVIDIA_L20,dtype=int8_w8a8.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=256,N=64,device_name=NVIDIA_L40S,dtype=int8_w8a8.json → triton_3_1_0/E=256,N=64,device_name=NVIDIA_L40S,dtype=int8_w8a8.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=64,N=1024,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8,block_shape=[128, 128].json → triton_3_1_0/E=64,N=1024,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8,block_shape=[128, 128].json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=64,N=1280,device_name=NVIDIA_A100-SXM4-80GB.json → triton_3_1_0/E=64,N=1280,device_name=NVIDIA_A100-SXM4-80GB.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=64,N=1280,device_name=NVIDIA_A800-SXM4-80GB.json → triton_3_1_0/E=64,N=1280,device_name=NVIDIA_A800-SXM4-80GB.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=64,N=1280,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8.json → triton_3_1_0/E=64,N=1280,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=64,N=1280,device_name=NVIDIA_H100_80GB_HBM3.json → triton_3_1_0/E=64,N=1280,device_name=NVIDIA_H100_80GB_HBM3.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=64,N=1280,device_name=NVIDIA_H200,dtype=fp8_w8a8.json → triton_3_1_0/E=64,N=1280,device_name=NVIDIA_H200,dtype=fp8_w8a8.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=64,N=1280,device_name=NVIDIA_H200.json → triton_3_1_0/E=64,N=1280,device_name=NVIDIA_H200.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=64,N=2560,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8.json → triton_3_1_0/E=64,N=2560,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=64,N=2560,device_name=NVIDIA_H200,dtype=fp8_w8a8.json → triton_3_1_0/E=64,N=2560,device_name=NVIDIA_H200,dtype=fp8_w8a8.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=64,N=2560,device_name=NVIDIA_H200.json → triton_3_1_0/E=64,N=2560,device_name=NVIDIA_H200.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=64,N=320,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8.json → triton_3_1_0/E=64,N=320,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=64,N=320,device_name=NVIDIA_H100_80GB_HBM3.json → triton_3_1_0/E=64,N=320,device_name=NVIDIA_H100_80GB_HBM3.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=64,N=320,device_name=NVIDIA_H200,dtype=fp8_w8a8.json → triton_3_1_0/E=64,N=320,device_name=NVIDIA_H200,dtype=fp8_w8a8.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=64,N=320,device_name=NVIDIA_H200.json → triton_3_1_0/E=64,N=320,device_name=NVIDIA_H200.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=64,N=512,device_name=NVIDIA_H200,dtype=fp8_w8a8,block_shape=[128, 128].json → triton_3_1_0/E=64,N=512,device_name=NVIDIA_H200,dtype=fp8_w8a8,block_shape=[128, 128].json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=64,N=640,device_name=NVIDIA_A100-SXM4-80GB.json → triton_3_1_0/E=64,N=640,device_name=NVIDIA_A100-SXM4-80GB.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=64,N=640,device_name=NVIDIA_A800-SXM4-80GB.json → triton_3_1_0/E=64,N=640,device_name=NVIDIA_A800-SXM4-80GB.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=64,N=640,device_name=NVIDIA_GeForce_RTX_4090,dtype=fp8_w8a8.json → triton_3_1_0/E=64,N=640,device_name=NVIDIA_GeForce_RTX_4090,dtype=fp8_w8a8.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=64,N=640,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8.json → triton_3_1_0/E=64,N=640,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=64,N=640,device_name=NVIDIA_H100_80GB_HBM3.json → triton_3_1_0/E=64,N=640,device_name=NVIDIA_H100_80GB_HBM3.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=64,N=640,device_name=NVIDIA_H200,dtype=fp8_w8a8.json → triton_3_1_0/E=64,N=640,device_name=NVIDIA_H200,dtype=fp8_w8a8.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=64,N=640,device_name=NVIDIA_H200.json → triton_3_1_0/E=64,N=640,device_name=NVIDIA_H200.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=8,N=14336,device_name=AMD_Instinct_MI300X.json → triton_3_1_0/E=8,N=14336,device_name=AMD_Instinct_MI300X.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=8,N=14336,device_name=AMD_Instinct_MI325X.json → triton_3_1_0/E=8,N=14336,device_name=AMD_Instinct_MI325X.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=8,N=14336,device_name=AMD_Radeon_Graphics.json → triton_3_1_0/E=8,N=14336,device_name=AMD_Radeon_Graphics.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=8,N=14336,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8.json → triton_3_1_0/E=8,N=14336,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=8,N=14336,device_name=NVIDIA_H200,dtype=fp8_w8a8.json → triton_3_1_0/E=8,N=14336,device_name=NVIDIA_H200,dtype=fp8_w8a8.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=8,N=14336,device_name=NVIDIA_H200.json → triton_3_1_0/E=8,N=14336,device_name=NVIDIA_H200.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=8,N=1792,device_name=AMD_Instinct_MI300X.json → triton_3_1_0/E=8,N=1792,device_name=AMD_Instinct_MI300X.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=8,N=1792,device_name=AMD_Instinct_MI325X.json → triton_3_1_0/E=8,N=1792,device_name=AMD_Instinct_MI325X.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=8,N=1792,device_name=AMD_Radeon_Graphics.json → triton_3_1_0/E=8,N=1792,device_name=AMD_Radeon_Graphics.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=8,N=1792,device_name=NVIDIA_A100-SXM4-40GB.json → triton_3_1_0/E=8,N=1792,device_name=NVIDIA_A100-SXM4-40GB.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=8,N=1792,device_name=NVIDIA_A100-SXM4-80GB.json → triton_3_1_0/E=8,N=1792,device_name=NVIDIA_A100-SXM4-80GB.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=8,N=1792,device_name=NVIDIA_H100_80GB_HBM3.json → triton_3_1_0/E=8,N=1792,device_name=NVIDIA_H100_80GB_HBM3.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=8,N=1792,device_name=NVIDIA_H200,dtype=fp8_w8a8.json → triton_3_1_0/E=8,N=1792,device_name=NVIDIA_H200,dtype=fp8_w8a8.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=8,N=1792,device_name=NVIDIA_H200.json → triton_3_1_0/E=8,N=1792,device_name=NVIDIA_H200.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=8,N=2048,device_name=NVIDIA_A100-SXM4-80GB.json → triton_3_1_0/E=8,N=2048,device_name=NVIDIA_A100-SXM4-80GB.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=8,N=2048,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8.json → triton_3_1_0/E=8,N=2048,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=8,N=2048,device_name=NVIDIA_H100_80GB_HBM3.json → triton_3_1_0/E=8,N=2048,device_name=NVIDIA_H100_80GB_HBM3.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=8,N=2048,device_name=NVIDIA_H200,dtype=fp8_w8a8.json → triton_3_1_0/E=8,N=2048,device_name=NVIDIA_H200,dtype=fp8_w8a8.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=8,N=2048,device_name=NVIDIA_H200.json → triton_3_1_0/E=8,N=2048,device_name=NVIDIA_H200.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=8,N=3584,device_name=AMD_Instinct_MI300X.json → triton_3_1_0/E=8,N=3584,device_name=AMD_Instinct_MI300X.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=8,N=3584,device_name=AMD_Instinct_MI325X.json → triton_3_1_0/E=8,N=3584,device_name=AMD_Instinct_MI325X.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=8,N=3584,device_name=AMD_Radeon_Graphics.json → triton_3_1_0/E=8,N=3584,device_name=AMD_Radeon_Graphics.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=8,N=3584,device_name=NVIDIA_A100-SXM4-40GB.json → triton_3_1_0/E=8,N=3584,device_name=NVIDIA_A100-SXM4-40GB.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=8,N=3584,device_name=NVIDIA_A100-SXM4-80GB.json → triton_3_1_0/E=8,N=3584,device_name=NVIDIA_A100-SXM4-80GB.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=8,N=3584,device_name=NVIDIA_GeForce_RTX_4090,dtype=fp8_w8a8.json → triton_3_1_0/E=8,N=3584,device_name=NVIDIA_GeForce_RTX_4090,dtype=fp8_w8a8.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=8,N=3584,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8.json → triton_3_1_0/E=8,N=3584,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=8,N=3584,device_name=NVIDIA_H100_80GB_HBM3.json → triton_3_1_0/E=8,N=3584,device_name=NVIDIA_H100_80GB_HBM3.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=8,N=3584,device_name=NVIDIA_H200,dtype=fp8_w8a8.json → triton_3_1_0/E=8,N=3584,device_name=NVIDIA_H200,dtype=fp8_w8a8.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=8,N=3584,device_name=NVIDIA_H200.json → triton_3_1_0/E=8,N=3584,device_name=NVIDIA_H200.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=8,N=3584,device_name=NVIDIA_L40S.json → triton_3_1_0/E=8,N=3584,device_name=NVIDIA_L40S.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=8,N=4096,device_name=AMD_Instinct_MI300X,dtype=fp8_w8a8.json → triton_3_1_0/E=8,N=4096,device_name=AMD_Instinct_MI300X,dtype=fp8_w8a8.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=8,N=4096,device_name=AMD_Instinct_MI325X,dtype=fp8_w8a8.json → triton_3_1_0/E=8,N=4096,device_name=AMD_Instinct_MI325X,dtype=fp8_w8a8.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=8,N=4096,device_name=AMD_Radeon_Graphics,dtype=fp8_w8a8.json → triton_3_1_0/E=8,N=4096,device_name=AMD_Radeon_Graphics,dtype=fp8_w8a8.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=8,N=4096,device_name=NVIDIA_A100-SXM4-80GB.json → triton_3_1_0/E=8,N=4096,device_name=NVIDIA_A100-SXM4-80GB.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=8,N=4096,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8.json → triton_3_1_0/E=8,N=4096,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=8,N=4096,device_name=NVIDIA_H100_80GB_HBM3.json → triton_3_1_0/E=8,N=4096,device_name=NVIDIA_H100_80GB_HBM3.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=8,N=4096,device_name=NVIDIA_H200,dtype=fp8_w8a8.json → triton_3_1_0/E=8,N=4096,device_name=NVIDIA_H200,dtype=fp8_w8a8.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=8,N=4096,device_name=NVIDIA_H200.json → triton_3_1_0/E=8,N=4096,device_name=NVIDIA_H200.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=8,N=7168,device_name=AMD_Instinct_MI300X.json → triton_3_1_0/E=8,N=7168,device_name=AMD_Instinct_MI300X.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=8,N=7168,device_name=AMD_Instinct_MI325X.json → triton_3_1_0/E=8,N=7168,device_name=AMD_Instinct_MI325X.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=8,N=7168,device_name=AMD_Radeon_Graphics.json → triton_3_1_0/E=8,N=7168,device_name=AMD_Radeon_Graphics.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=8,N=7168,device_name=NVIDIA_A100-SXM4-80GB.json → triton_3_1_0/E=8,N=7168,device_name=NVIDIA_A100-SXM4-80GB.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=8,N=7168,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8.json → triton_3_1_0/E=8,N=7168,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=8,N=7168,device_name=NVIDIA_H100_80GB_HBM3.json → triton_3_1_0/E=8,N=7168,device_name=NVIDIA_H100_80GB_HBM3.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=8,N=7168,device_name=NVIDIA_H200,dtype=fp8_w8a8.json → triton_3_1_0/E=8,N=7168,device_name=NVIDIA_H200,dtype=fp8_w8a8.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=8,N=7168,device_name=NVIDIA_H200.json → triton_3_1_0/E=8,N=7168,device_name=NVIDIA_H200.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=8,N=8192,device_name=AMD_Instinct_MI300X,dtype=fp8_w8a8.json → triton_3_1_0/E=8,N=8192,device_name=AMD_Instinct_MI300X,dtype=fp8_w8a8.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=8,N=8192,device_name=AMD_Instinct_MI325X,dtype=fp8_w8a8.json → triton_3_1_0/E=8,N=8192,device_name=AMD_Instinct_MI325X,dtype=fp8_w8a8.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=8,N=8192,device_name=AMD_Radeon_Graphics,dtype=fp8_w8a8.json → triton_3_1_0/E=8,N=8192,device_name=AMD_Radeon_Graphics,dtype=fp8_w8a8.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=8,N=8192,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8.json → triton_3_1_0/E=8,N=8192,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=8,N=8192,device_name=NVIDIA_H200,dtype=fp8_w8a8.json → triton_3_1_0/E=8,N=8192,device_name=NVIDIA_H200,dtype=fp8_w8a8.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=128,N=192,device_name=NVIDIA_A800-SXM4-80GB.json → triton_3_2_0/E=128,N=192,device_name=NVIDIA_A800-SXM4-80GB.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=128,N=192,device_name=NVIDIA_H100_80GB_HBM3.json → triton_3_2_0/E=128,N=192,device_name=NVIDIA_H100_80GB_HBM3.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=128,N=192,device_name=NVIDIA_H20.json → triton_3_2_0/E=128,N=192,device_name=NVIDIA_H20.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=128,N=192,device_name=NVIDIA_H200.json → triton_3_2_0/E=128,N=192,device_name=NVIDIA_H200.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=128,N=384,device_name=NVIDIA_H100_80GB_HBM3.json → triton_3_2_0/E=128,N=384,device_name=NVIDIA_H100_80GB_HBM3.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=128,N=384,device_name=NVIDIA_H20,dtype=fp8_w8a8,block_shape=[128, 128].json → triton_3_2_0/E=128,N=384,device_name=NVIDIA_H20,dtype=fp8_w8a8,block_shape=[128, 128].json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=128,N=384,device_name=NVIDIA_H20.json → triton_3_2_0/E=128,N=384,device_name=NVIDIA_H20.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=128,N=384,device_name=NVIDIA_H200,dtype=fp8_w8a8,block_shape=[128, 128].json → triton_3_2_0/E=128,N=384,device_name=NVIDIA_H200,dtype=fp8_w8a8,block_shape=[128, 128].json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=128,N=384,device_name=NVIDIA_H200.json → triton_3_2_0/E=128,N=384,device_name=NVIDIA_H200.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=128,N=512,device_name=NVIDIA_H100_80GB_HBM3.json → triton_3_2_0/E=128,N=512,device_name=NVIDIA_H100_80GB_HBM3.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=128,N=768,device_name=NVIDIA_A800-SXM4-80GB.json → triton_3_2_0/E=128,N=768,device_name=NVIDIA_A800-SXM4-80GB.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=128,N=768,device_name=NVIDIA_H100_80GB_HBM3.json → triton_3_2_0/E=128,N=768,device_name=NVIDIA_H100_80GB_HBM3.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=128,N=768,device_name=NVIDIA_H20,dtype=fp8_w8a8,block_shape=[128, 128].json → triton_3_2_0/E=128,N=768,device_name=NVIDIA_H20,dtype=fp8_w8a8,block_shape=[128, 128].json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=128,N=768,device_name=NVIDIA_H20.json → triton_3_2_0/E=128,N=768,device_name=NVIDIA_H20.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=128,N=768,device_name=NVIDIA_H200,dtype=fp8_w8a8,block_shape=[128, 128].json → triton_3_2_0/E=128,N=768,device_name=NVIDIA_H200,dtype=fp8_w8a8,block_shape=[128, 128].json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=128,N=768,device_name=NVIDIA_H200.json → triton_3_2_0/E=128,N=768,device_name=NVIDIA_H200.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=128,N=96,device_name=NVIDIA_H20.json → triton_3_2_0/E=128,N=96,device_name=NVIDIA_H20.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=264,N=128,device_name=NVIDIA_A800-SXM4-80GB,dtype=int8_w8a8.json → triton_3_2_0/E=264,N=128,device_name=NVIDIA_A800-SXM4-80GB,dtype=int8_w8a8.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=264,N=256,device_name=NVIDIA_B200,dtype=fp8_w8a8,block_shape=[128, 128].json → triton_3_2_0/E=264,N=256,device_name=NVIDIA_B200,dtype=fp8_w8a8,block_shape=[128, 128].json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=264,N=256,device_name=NVIDIA_H20,dtype=fp8_w8a8,block_shape=[128, 128].json → triton_3_2_0/E=264,N=256,device_name=NVIDIA_H20,dtype=fp8_w8a8,block_shape=[128, 128].json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=264,N=256,device_name=NVIDIA_H200,dtype=fp8_w8a8,block_shape=[128, 128].json → triton_3_2_0/E=264,N=256,device_name=NVIDIA_H200,dtype=fp8_w8a8,block_shape=[128, 128].json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=272,N=128,device_name=NVIDIA_A800-SXM4-80GB,dtype=int8_w8a8.json → triton_3_2_0/E=272,N=128,device_name=NVIDIA_A800-SXM4-80GB,dtype=int8_w8a8.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=272,N=128,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8,block_shape=[128, 128].json → triton_3_2_0/E=272,N=128,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8,block_shape=[128, 128].json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=272,N=128,device_name=NVIDIA_H20,dtype=fp8_w8a8,block_shape=[128, 128].json → triton_3_2_0/E=272,N=128,device_name=NVIDIA_H20,dtype=fp8_w8a8,block_shape=[128, 128].json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=272,N=64,device_name=NVIDIA_A800-SXM4-80GB.json → triton_3_2_0/E=272,N=64,device_name=NVIDIA_A800-SXM4-80GB.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=288,N=64,device_name=NVIDIA_A800-SXM4-80GB.json → triton_3_2_0/E=288,N=64,device_name=NVIDIA_A800-SXM4-80GB.json} +0 -0
- {sglang-0.4.6.post5.dist-info → sglang-0.4.7.dist-info}/licenses/LICENSE +0 -0
- {sglang-0.4.6.post5.dist-info → sglang-0.4.7.dist-info}/top_level.txt +0 -0
sglang/srt/entrypoints/engine.py
CHANGED
@@ -167,11 +167,22 @@ class Engine(EngineBase):
|
|
167
167
|
bootstrap_host: Optional[Union[List[str], str]] = None,
|
168
168
|
bootstrap_port: Optional[Union[List[int], int]] = None,
|
169
169
|
bootstrap_room: Optional[Union[List[int], int]] = None,
|
170
|
+
data_parallel_rank: Optional[int] = None,
|
170
171
|
) -> Union[Dict, Iterator[Dict]]:
|
171
172
|
"""
|
172
173
|
The arguments of this function is the same as `sglang/srt/managers/io_struct.py::GenerateReqInput`.
|
173
174
|
Please refer to `GenerateReqInput` for the documentation.
|
174
175
|
"""
|
176
|
+
if self.server_args.enable_dp_attention:
|
177
|
+
if data_parallel_rank is None:
|
178
|
+
logger.info("data_parallel_rank not provided, using default dispatch")
|
179
|
+
elif data_parallel_rank < 0:
|
180
|
+
raise ValueError("data_parallel_rank must be non-negative")
|
181
|
+
elif data_parallel_rank >= self.server_args.dp_size:
|
182
|
+
raise ValueError(
|
183
|
+
f"data_parallel_rank must be less than dp_size: {self.server_args.dp_size}"
|
184
|
+
)
|
185
|
+
|
175
186
|
obj = GenerateReqInput(
|
176
187
|
text=prompt,
|
177
188
|
input_ids=input_ids,
|
@@ -188,6 +199,7 @@ class Engine(EngineBase):
|
|
188
199
|
bootstrap_host=bootstrap_host,
|
189
200
|
bootstrap_port=bootstrap_port,
|
190
201
|
bootstrap_room=bootstrap_room,
|
202
|
+
data_parallel_rank=data_parallel_rank,
|
191
203
|
)
|
192
204
|
loop = asyncio.get_event_loop()
|
193
205
|
generator = self.tokenizer_manager.generate_request(obj, None)
|
@@ -237,11 +249,24 @@ class Engine(EngineBase):
|
|
237
249
|
bootstrap_host: Optional[Union[List[str], str]] = None,
|
238
250
|
bootstrap_port: Optional[Union[List[int], int]] = None,
|
239
251
|
bootstrap_room: Optional[Union[List[int], int]] = None,
|
252
|
+
data_parallel_rank: Optional[int] = None,
|
240
253
|
) -> Union[Dict, AsyncIterator[Dict]]:
|
241
254
|
"""
|
242
255
|
The arguments of this function is the same as `sglang/srt/managers/io_struct.py::GenerateReqInput`.
|
243
256
|
Please refer to `GenerateReqInput` for the documentation.
|
244
257
|
"""
|
258
|
+
|
259
|
+
if self.server_args.enable_dp_attention:
|
260
|
+
if data_parallel_rank is None:
|
261
|
+
logger.info("data_parallel_rank not provided, using default dispatch")
|
262
|
+
elif data_parallel_rank < 0:
|
263
|
+
raise ValueError("data_parallel_rank must be non-negative")
|
264
|
+
elif data_parallel_rank >= self.server_args.dp_size:
|
265
|
+
raise ValueError(
|
266
|
+
f"data_parallel_rank must be in range [0, {self.server_args.dp_size-1}]"
|
267
|
+
)
|
268
|
+
|
269
|
+
logger.info(f"data_parallel_rank: {data_parallel_rank}")
|
245
270
|
obj = GenerateReqInput(
|
246
271
|
text=prompt,
|
247
272
|
input_ids=input_ids,
|
@@ -257,6 +282,7 @@ class Engine(EngineBase):
|
|
257
282
|
bootstrap_host=bootstrap_host,
|
258
283
|
bootstrap_port=bootstrap_port,
|
259
284
|
bootstrap_room=bootstrap_room,
|
285
|
+
data_parallel_rank=data_parallel_rank,
|
260
286
|
)
|
261
287
|
generator = self.tokenizer_manager.generate_request(obj, None)
|
262
288
|
|
@@ -472,6 +498,79 @@ class Engine(EngineBase):
|
|
472
498
|
def save_sharded_model(self, **kwargs):
|
473
499
|
self.collective_rpc("save_sharded_model", **kwargs)
|
474
500
|
|
501
|
+
def score(
    self,
    query: Optional[Union[str, List[int]]] = None,
    items: Optional[Union[str, List[str], List[List[int]]]] = None,
    label_token_ids: Optional[List[int]] = None,
    apply_softmax: bool = False,
    item_first: bool = False,
) -> List[List[float]]:
    """
    Score the probability of specified token IDs appearing after the given (query + item) pair. For example:
    query = "<|user|>Is the following city the capital of France? "
    items = ["Paris <|assistant|>", "London <|assistant|>", "Berlin <|assistant|>"]
    label_token_ids = [2332, 1223] # Token IDs for "Yes" and "No"
    item_first = False

    This would pass the following prompts to the model:
    "<|user|>Is the following city the capital of France? Paris <|assistant|>"
    "<|user|>Is the following city the capital of France? London <|assistant|>"
    "<|user|>Is the following city the capital of France? Berlin <|assistant|>"
    The api would then return the probabilities of the model producing "Yes" and "No" as the next token.
    The output would look like:
    [[0.9, 0.1], [0.2, 0.8], [0.1, 0.9]]

    This is the blocking variant: it drives the coroutine to completion on the
    current asyncio event loop, so it must not be called from code that is
    already running inside that loop. Use async_score() there instead.

    Args:
        query: The query text or pre-tokenized query token IDs. Must be provided.
        items: The item text(s) or pre-tokenized item token IDs. Must be provided.
        label_token_ids: List of token IDs to compute probabilities for. If None, no token probabilities will be computed.
        apply_softmax: Whether to normalize probabilities using softmax.
        item_first: If True, prepend items to query. Otherwise append items to query.

    Returns:
        A list with one inner list of floats per item input. Judging by the
        example above, each inner list holds one probability per entry of
        `label_token_ids`, in the same order.

    Raises:
        ValueError: If query is not provided, or if items is not provided,
            or if token IDs are out of vocabulary, or if logprobs are not available for the specified tokens.
    """
    # All validation and scoring is delegated to the tokenizer manager; this
    # method only bridges the synchronous caller to the async implementation.
    loop = asyncio.get_event_loop()
    return loop.run_until_complete(
        self.tokenizer_manager.score_request(
            query=query,
            items=items,
            label_token_ids=label_token_ids,
            apply_softmax=apply_softmax,
            item_first=item_first,
            request=None,
        )
    )
|
551
|
+
|
552
|
+
async def async_score(
    self,
    query: Optional[Union[str, List[int]]] = None,
    items: Optional[Union[str, List[str], List[List[int]]]] = None,
    label_token_ids: Optional[List[int]] = None,
    apply_softmax: bool = False,
    item_first: bool = False,
) -> List[List[float]]:
    """
    Awaitable counterpart of score().

    Forwards every argument unchanged to the tokenizer manager's
    `score_request` coroutine (with `request=None`) and returns its result.
    See score() for the meaning of the arguments and of the return value.
    """
    # Collect the forwarded arguments once so the delegation reads as a single call.
    forwarded = {
        "query": query,
        "items": items,
        "label_token_ids": label_token_ids,
        "apply_softmax": apply_softmax,
        "item_first": item_first,
        "request": None,
    }
    scores = await self.tokenizer_manager.score_request(**forwarded)
    return scores
|
573
|
+
|
475
574
|
|
476
575
|
def _set_envs_and_config(server_args: ServerArgs):
|
477
576
|
# Set global environments
|
@@ -498,7 +597,7 @@ def _set_envs_and_config(server_args: ServerArgs):
|
|
498
597
|
if server_args.attention_backend == "flashinfer":
|
499
598
|
assert_pkg_version(
|
500
599
|
"flashinfer_python",
|
501
|
-
"0.2.
|
600
|
+
"0.2.6.post1",
|
502
601
|
"Please uninstall the old version and "
|
503
602
|
"reinstall the latest version by following the instructions "
|
504
603
|
"at https://docs.flashinfer.ai/installation.html.",
|
@@ -506,7 +605,7 @@ def _set_envs_and_config(server_args: ServerArgs):
|
|
506
605
|
if _is_cuda:
|
507
606
|
assert_pkg_version(
|
508
607
|
"sgl-kernel",
|
509
|
-
"0.1.
|
608
|
+
"0.1.7",
|
510
609
|
"Please reinstall the latest version with `pip install sgl-kernel --force-reinstall`",
|
511
610
|
)
|
512
611
|
|
@@ -514,9 +613,7 @@ def _set_envs_and_config(server_args: ServerArgs):
|
|
514
613
|
pid, exitcode = os.waitpid(0, os.WNOHANG)
|
515
614
|
if exitcode != 0:
|
516
615
|
logger.warning(
|
517
|
-
"Child process unexpectedly failed with
|
518
|
-
exitcode,
|
519
|
-
pid,
|
616
|
+
f"Child process unexpectedly failed with {exitcode=}. {pid=}"
|
520
617
|
)
|
521
618
|
|
522
619
|
signal.signal(signal.SIGCHLD, sigchld_handler)
|
@@ -82,6 +82,7 @@ from sglang.srt.openai_api.adapter import (
|
|
82
82
|
v1_retrieve_batch,
|
83
83
|
v1_retrieve_file,
|
84
84
|
v1_retrieve_file_content,
|
85
|
+
v1_score,
|
85
86
|
)
|
86
87
|
from sglang.srt.openai_api.protocol import ModelCard, ModelList
|
87
88
|
from sglang.srt.reasoning_parser import ReasoningParser
|
@@ -229,6 +230,11 @@ async def get_server_info():
|
|
229
230
|
}
|
230
231
|
|
231
232
|
|
233
|
+
@app.get("/get_load")
async def get_load():
    # GET /get_load: hand back whatever the tokenizer manager's get_load()
    # coroutine produces, unmodified.
    # Documented with comments rather than a docstring so the route's
    # generated API description stays exactly as it was.
    tokenizer_manager = _global_state.tokenizer_manager
    load_info = await tokenizer_manager.get_load()
    return load_info
|
236
|
+
|
237
|
+
|
232
238
|
@app.api_route("/set_internal_state", methods=["POST", "PUT"])
|
233
239
|
async def set_internal_state(obj: SetInternalStateReq, request: Request):
|
234
240
|
res = await _global_state.tokenizer_manager.set_internal_state(obj)
|
@@ -251,7 +257,7 @@ async def generate_request(obj: GenerateReqInput, request: Request):
|
|
251
257
|
) + b"\n\n"
|
252
258
|
except ValueError as e:
|
253
259
|
out = {"error": {"message": str(e)}}
|
254
|
-
logger.error(f"Error: {e}")
|
260
|
+
logger.error(f"[http_server] Error: {e}")
|
255
261
|
yield b"data: " + orjson.dumps(
|
256
262
|
out, option=orjson.OPT_NON_STR_KEYS
|
257
263
|
) + b"\n\n"
|
@@ -269,7 +275,7 @@ async def generate_request(obj: GenerateReqInput, request: Request):
|
|
269
275
|
).__anext__()
|
270
276
|
return ret
|
271
277
|
except ValueError as e:
|
272
|
-
logger.error(f"Error: {e}")
|
278
|
+
logger.error(f"[http_server] Error: {e}")
|
273
279
|
return _create_error_response(e)
|
274
280
|
|
275
281
|
|
@@ -345,6 +351,7 @@ async def start_profile_async(obj: Optional[ProfileReqInput] = None):
|
|
345
351
|
activities=obj.activities,
|
346
352
|
with_stack=obj.with_stack,
|
347
353
|
record_shapes=obj.record_shapes,
|
354
|
+
profile_by_stage=obj.profile_by_stage,
|
348
355
|
)
|
349
356
|
return Response(
|
350
357
|
content="Start profiling.\n",
|
@@ -714,6 +721,12 @@ async def vertex_generate(vertex_req: VertexGenerateReqInput, raw_request: Reque
|
|
714
721
|
return ORJSONResponse({"predictions": ret})
|
715
722
|
|
716
723
|
|
724
|
+
@app.post("/v1/score")
async def v1_score_request(raw_request: Request):
    """Endpoint for the decoder-only scoring API. See Engine.score() for detailed documentation."""
    # Pure delegation: the adapter's v1_score receives the tokenizer manager
    # and the raw request, and its response is returned as-is.
    tokenizer_manager = _global_state.tokenizer_manager
    response = await v1_score(tokenizer_manager, raw_request)
    return response
|
728
|
+
|
729
|
+
|
717
730
|
def _create_error_response(e):
|
718
731
|
return ORJSONResponse(
|
719
732
|
{"error": {"message": str(e)}}, status_code=HTTPStatus.BAD_REQUEST
|
@@ -36,6 +36,7 @@ class BaseFormatDetector(ABC):
|
|
36
36
|
) # map what has been streamed for each tool so far to a list
|
37
37
|
self.bot_token = ""
|
38
38
|
self.eot_token = ""
|
39
|
+
self.tool_call_separator = ", "
|
39
40
|
|
40
41
|
def parse_base_json(self, action: Any, tools: List[Tool]) -> List[ToolCallItem]:
|
41
42
|
tool_indices = {
|
@@ -50,7 +51,7 @@ class BaseFormatDetector(ABC):
|
|
50
51
|
if name and name in tool_indices:
|
51
52
|
results.append(
|
52
53
|
ToolCallItem(
|
53
|
-
tool_index
|
54
|
+
tool_index=-1, # Caller should update this based on the actual tools array called
|
54
55
|
name=name,
|
55
56
|
parameters=json.dumps(
|
56
57
|
act.get("parameters") or act.get("arguments", {}),
|
@@ -72,20 +73,61 @@ class BaseFormatDetector(ABC):
|
|
72
73
|
action = json.loads(text)
|
73
74
|
return StreamingParseResult(calls=self.parse_base_json(action, tools))
|
74
75
|
|
76
|
+
def _ends_with_partial_token(self, buffer: str, bot_token: str) -> int:
|
77
|
+
"""
|
78
|
+
Check if buffer ends with a partial bot_token.
|
79
|
+
Return the length of the partial bot_token.
|
80
|
+
|
81
|
+
For some format, the bot_token is not a token in model's vocabulary, such as
|
82
|
+
`[TOOL_CALLS] [` in Mistral.
|
83
|
+
"""
|
84
|
+
for i in range(1, min(len(buffer) + 1, len(bot_token))):
|
85
|
+
if bot_token.startswith(buffer[-i:]):
|
86
|
+
return i
|
87
|
+
return 0
|
88
|
+
|
75
89
|
def parse_streaming_increment(
|
76
90
|
self, new_text: str, tools: List[Tool]
|
77
91
|
) -> StreamingParseResult:
|
78
92
|
"""
|
79
93
|
Streaming incremental parsing with tool validation.
|
94
|
+
|
95
|
+
This base implementation works best with formats where:
|
96
|
+
1. bot_token is followed immediately by JSON (e.g., bot_token + JSON_array)
|
97
|
+
2. JSON can be parsed incrementally using partial_json_loads
|
98
|
+
3. Multiple tool calls are separated by "; " or ", "
|
99
|
+
|
100
|
+
Examples of incompatible formats (need custom implementation, may reuse some logic from this class):
|
101
|
+
- Each tool call is wrapped in a separate block: See Qwen25Detector
|
102
|
+
- Multiple separate blocks: [TOOL_CALLS] [...] \n [TOOL_CALLS] [...]
|
103
|
+
- Tool call is Pythonic style
|
104
|
+
|
105
|
+
For incompatible formats, detectors should override this method with custom logic.
|
80
106
|
"""
|
81
107
|
# Append new text to buffer
|
82
108
|
self._buffer += new_text
|
83
109
|
current_text = self._buffer
|
84
|
-
|
85
|
-
|
86
|
-
|
87
|
-
|
88
|
-
|
110
|
+
|
111
|
+
# The current_text has tool_call if it is the start of a new tool call sequence
|
112
|
+
# or it is the start of a new tool call after a tool call separator, when there is a previous tool call
|
113
|
+
if not (
|
114
|
+
self.bot_token in current_text
|
115
|
+
or current_text.startswith("{")
|
116
|
+
or (
|
117
|
+
self.current_tool_id > 0
|
118
|
+
and current_text.startswith(self.tool_call_separator + "{")
|
119
|
+
)
|
120
|
+
):
|
121
|
+
# Only clear buffer if we're sure no tool call is starting
|
122
|
+
if not self._ends_with_partial_token(self._buffer, self.bot_token):
|
123
|
+
normal_text = self._buffer
|
124
|
+
self._buffer = ""
|
125
|
+
if self.eot_token in normal_text:
|
126
|
+
normal_text = normal_text.replace(self.eot_token, "")
|
127
|
+
return StreamingParseResult(normal_text=normal_text)
|
128
|
+
else:
|
129
|
+
# Might be partial bot_token, keep buffering
|
130
|
+
return StreamingParseResult()
|
89
131
|
|
90
132
|
# Build tool indices if not already built
|
91
133
|
if not hasattr(self, "_tool_indices"):
|
@@ -96,91 +138,73 @@ class BaseFormatDetector(ABC):
|
|
96
138
|
}
|
97
139
|
|
98
140
|
flags = Allow.ALL if self.current_tool_name_sent else Allow.ALL & ~Allow.STR
|
141
|
+
|
99
142
|
try:
|
100
|
-
tool_call_arr = []
|
101
|
-
is_complete = []
|
102
143
|
try:
|
103
|
-
|
104
|
-
len(self.bot_token)
|
105
|
-
|
106
|
-
|
144
|
+
if current_text.startswith(self.bot_token):
|
145
|
+
start_idx = len(self.bot_token)
|
146
|
+
elif self.current_tool_id > 0 and current_text.startswith(
|
147
|
+
self.tool_call_separator
|
148
|
+
):
|
149
|
+
start_idx = len(self.tool_call_separator)
|
150
|
+
else:
|
151
|
+
start_idx = 0
|
152
|
+
|
153
|
+
if start_idx >= len(current_text):
|
154
|
+
return StreamingParseResult()
|
155
|
+
|
156
|
+
(obj, end_idx) = _partial_json_loads(current_text[start_idx:], flags)
|
157
|
+
|
158
|
+
is_current_complete = _is_complete_json(
|
159
|
+
current_text[start_idx : start_idx + end_idx]
|
107
160
|
)
|
108
|
-
while start_idx < len(current_text):
|
109
|
-
(obj, end_idx) = _partial_json_loads(
|
110
|
-
current_text[start_idx:], flags
|
111
|
-
)
|
112
|
-
is_complete.append(
|
113
|
-
_is_complete_json(current_text[start_idx : start_idx + end_idx])
|
114
|
-
)
|
115
|
-
start_idx += end_idx + len("; ")
|
116
161
|
|
117
|
-
|
118
|
-
|
119
|
-
|
120
|
-
|
121
|
-
|
122
|
-
|
123
|
-
|
124
|
-
|
125
|
-
|
126
|
-
|
127
|
-
|
128
|
-
|
129
|
-
|
130
|
-
|
131
|
-
|
132
|
-
|
133
|
-
|
162
|
+
# Validate tool name if present
|
163
|
+
if "name" in obj and obj["name"] not in self._tool_indices:
|
164
|
+
# Invalid tool name - reset state
|
165
|
+
self._buffer = ""
|
166
|
+
self.current_tool_id = -1
|
167
|
+
self.current_tool_name_sent = False
|
168
|
+
if self.streamed_args_for_tool:
|
169
|
+
self.streamed_args_for_tool.pop()
|
170
|
+
return StreamingParseResult()
|
171
|
+
|
172
|
+
# Handle parameters/arguments consistency
|
173
|
+
# NOTE: we assume here that the obj is always partial of a single tool call
|
174
|
+
if "parameters" in obj:
|
175
|
+
assert (
|
176
|
+
"arguments" not in obj
|
177
|
+
), "model generated both parameters and arguments"
|
178
|
+
obj["arguments"] = obj["parameters"]
|
179
|
+
|
180
|
+
current_tool_call = obj
|
134
181
|
|
135
182
|
except MalformedJSON:
|
136
183
|
return StreamingParseResult()
|
137
184
|
|
138
|
-
if
|
185
|
+
if not current_tool_call:
|
139
186
|
return StreamingParseResult()
|
140
187
|
|
141
|
-
|
142
|
-
|
143
|
-
|
144
|
-
|
145
|
-
# Handle new tool in array
|
146
|
-
if len(tool_call_arr) > 0 and len(tool_call_arr) > self.current_tool_id + 1:
|
147
|
-
if self.current_tool_id >= 0:
|
148
|
-
cur_arguments = current_tool_call.get("arguments")
|
149
|
-
if cur_arguments:
|
150
|
-
cur_args_json = json.dumps(cur_arguments)
|
151
|
-
sent = len(self.streamed_args_for_tool[self.current_tool_id])
|
152
|
-
argument_diff = cur_args_json[sent:]
|
153
|
-
|
154
|
-
res = StreamingParseResult(
|
155
|
-
calls=[
|
156
|
-
ToolCallItem(
|
157
|
-
tool_index=self.current_tool_id,
|
158
|
-
name="",
|
159
|
-
parameters=argument_diff,
|
160
|
-
)
|
161
|
-
],
|
162
|
-
)
|
163
|
-
self.streamed_args_for_tool[
|
164
|
-
self.current_tool_id
|
165
|
-
] += argument_diff
|
166
|
-
else:
|
167
|
-
res = StreamingParseResult()
|
168
|
-
else:
|
169
|
-
res = StreamingParseResult()
|
170
|
-
|
171
|
-
self.current_tool_id = len(tool_call_arr) - 1
|
172
|
-
self.current_tool_name_sent = False
|
173
|
-
self.streamed_args_for_tool.append("")
|
174
|
-
return res
|
175
|
-
|
176
|
-
# Handle tool name
|
177
|
-
elif not self.current_tool_name_sent:
|
188
|
+
# Case 1: Handle tool name streaming
|
189
|
+
# This happens when we encounter a tool but haven't sent its name yet
|
190
|
+
if not self.current_tool_name_sent:
|
178
191
|
function_name = current_tool_call.get("name")
|
192
|
+
|
179
193
|
if function_name and function_name in self._tool_indices:
|
194
|
+
# If this is a new tool (current_tool_id was -1), initialize it
|
195
|
+
if self.current_tool_id == -1:
|
196
|
+
self.current_tool_id = 0
|
197
|
+
self.streamed_args_for_tool.append("")
|
198
|
+
# If this is a subsequent tool, ensure streamed_args_for_tool is large enough
|
199
|
+
elif self.current_tool_id >= len(self.streamed_args_for_tool):
|
200
|
+
while len(self.streamed_args_for_tool) <= self.current_tool_id:
|
201
|
+
self.streamed_args_for_tool.append("")
|
202
|
+
|
203
|
+
# Send the tool name with empty parameters
|
180
204
|
res = StreamingParseResult(
|
181
205
|
calls=[
|
182
206
|
ToolCallItem(
|
183
|
-
tool_index=self.
|
207
|
+
tool_index=self.current_tool_id,
|
184
208
|
name=function_name,
|
185
209
|
parameters="",
|
186
210
|
)
|
@@ -190,47 +214,75 @@ class BaseFormatDetector(ABC):
|
|
190
214
|
else:
|
191
215
|
res = StreamingParseResult()
|
192
216
|
|
193
|
-
# Handle streaming arguments
|
217
|
+
# Case 2: Handle streaming arguments
|
218
|
+
# This happens when we've already sent the tool name and now need to stream arguments incrementally
|
194
219
|
else:
|
195
220
|
cur_arguments = current_tool_call.get("arguments")
|
196
221
|
res = StreamingParseResult()
|
197
222
|
|
198
223
|
if cur_arguments:
|
224
|
+
# Calculate how much of the arguments we've already streamed
|
199
225
|
sent = len(self.streamed_args_for_tool[self.current_tool_id])
|
200
226
|
cur_args_json = json.dumps(cur_arguments)
|
201
|
-
prev_arguments =
|
202
|
-
|
203
|
-
|
227
|
+
prev_arguments = None
|
228
|
+
if self.current_tool_id < len(self.prev_tool_call_arr):
|
229
|
+
prev_arguments = self.prev_tool_call_arr[
|
230
|
+
self.current_tool_id
|
231
|
+
].get("arguments")
|
204
232
|
|
205
233
|
argument_diff = None
|
206
|
-
|
234
|
+
|
235
|
+
# If the current tool's JSON is complete, send all remaining arguments
|
236
|
+
if is_current_complete:
|
207
237
|
argument_diff = cur_args_json[sent:]
|
208
|
-
|
209
|
-
|
238
|
+
completing_tool_id = (
|
239
|
+
self.current_tool_id
|
240
|
+
) # Save the ID of the tool that's completing
|
241
|
+
|
242
|
+
# Only remove the processed portion, keep unprocessed content
|
243
|
+
self._buffer = current_text[start_idx + end_idx :]
|
244
|
+
|
245
|
+
if self.current_tool_id < len(self.prev_tool_call_arr):
|
246
|
+
self.prev_tool_call_arr[self.current_tool_id].clear()
|
210
247
|
self.current_tool_name_sent = False
|
211
248
|
self.streamed_args_for_tool[self.current_tool_id] = ""
|
249
|
+
self.current_tool_id += 1
|
212
250
|
|
251
|
+
# If the tool is still being parsed, send incremental changes
|
213
252
|
elif prev_arguments:
|
214
253
|
prev_args_json = json.dumps(prev_arguments)
|
215
254
|
if cur_args_json != prev_args_json:
|
216
255
|
prefix = _find_common_prefix(prev_args_json, cur_args_json)
|
217
256
|
argument_diff = prefix[sent:]
|
218
257
|
|
258
|
+
# Send the argument diff if there's something new
|
219
259
|
if argument_diff is not None:
|
260
|
+
# Use the correct tool_index: completing_tool_id for completed tools, current_tool_id for ongoing
|
261
|
+
tool_index_to_use = (
|
262
|
+
completing_tool_id
|
263
|
+
if is_current_complete
|
264
|
+
else self.current_tool_id
|
265
|
+
)
|
220
266
|
res = StreamingParseResult(
|
221
267
|
calls=[
|
222
268
|
ToolCallItem(
|
223
|
-
tool_index=
|
269
|
+
tool_index=tool_index_to_use,
|
224
270
|
parameters=argument_diff,
|
225
271
|
)
|
226
272
|
],
|
227
273
|
)
|
228
|
-
if not
|
274
|
+
if not is_current_complete:
|
229
275
|
self.streamed_args_for_tool[
|
230
276
|
self.current_tool_id
|
231
277
|
] += argument_diff
|
232
278
|
|
233
|
-
|
279
|
+
# Update prev_tool_call_arr with current state
|
280
|
+
if self.current_tool_id >= 0:
|
281
|
+
# Ensure prev_tool_call_arr is large enough
|
282
|
+
while len(self.prev_tool_call_arr) <= self.current_tool_id:
|
283
|
+
self.prev_tool_call_arr.append({})
|
284
|
+
self.prev_tool_call_arr[self.current_tool_id] = current_tool_call
|
285
|
+
|
234
286
|
return res
|
235
287
|
|
236
288
|
except Exception as e:
|
@@ -31,6 +31,7 @@ class DeepSeekV3Detector(BaseFormatDetector):
|
|
31
31
|
self.func_call_regex = r"<|tool▁call▁begin|>.*?<|tool▁call▁end|>"
|
32
32
|
self.func_detail_regex = r"<|tool▁call▁begin|>(.*)<|tool▁sep|>(.*)\n```json\n(.*)\n```<|tool▁call▁end|>"
|
33
33
|
self._last_arguments = ""
|
34
|
+
self.current_tool_id = -1
|
34
35
|
|
35
36
|
def has_tool_call(self, text: str) -> bool:
|
36
37
|
"""Check if the text contains a deepseek format tool call."""
|
@@ -75,7 +76,12 @@ class DeepSeekV3Detector(BaseFormatDetector):
|
|
75
76
|
self._buffer += new_text
|
76
77
|
current_text = self._buffer
|
77
78
|
|
78
|
-
if
|
79
|
+
# Check if we have a tool call (either the start token or individual tool call)
|
80
|
+
has_tool_call = (
|
81
|
+
self.bot_token in current_text or "<|tool▁call▁begin|>" in current_text
|
82
|
+
)
|
83
|
+
|
84
|
+
if not has_tool_call:
|
79
85
|
self._buffer = ""
|
80
86
|
for e_token in [self.eot_token, "```", "<|tool▁call▁end|>"]:
|
81
87
|
if e_token in new_text:
|
@@ -100,15 +106,32 @@ class DeepSeekV3Detector(BaseFormatDetector):
|
|
100
106
|
func_name = partial_match.group(2).strip()
|
101
107
|
func_args_raw = partial_match.group(3).strip()
|
102
108
|
|
109
|
+
# Initialize state if this is the first tool call
|
110
|
+
if self.current_tool_id == -1:
|
111
|
+
self.current_tool_id = 0
|
112
|
+
self.prev_tool_call_arr = []
|
113
|
+
self.streamed_args_for_tool = [""]
|
114
|
+
|
115
|
+
# Ensure we have enough entries in our tracking arrays
|
116
|
+
while len(self.prev_tool_call_arr) <= self.current_tool_id:
|
117
|
+
self.prev_tool_call_arr.append({})
|
118
|
+
while len(self.streamed_args_for_tool) <= self.current_tool_id:
|
119
|
+
self.streamed_args_for_tool.append("")
|
120
|
+
|
103
121
|
if not self.current_tool_name_sent:
|
104
122
|
calls.append(
|
105
123
|
ToolCallItem(
|
106
|
-
tool_index=self.
|
124
|
+
tool_index=self.current_tool_id,
|
107
125
|
name=func_name,
|
108
126
|
parameters="",
|
109
127
|
)
|
110
128
|
)
|
111
129
|
self.current_tool_name_sent = True
|
130
|
+
# Store the tool call info for adapter.py
|
131
|
+
self.prev_tool_call_arr[self.current_tool_id] = {
|
132
|
+
"name": func_name,
|
133
|
+
"arguments": {},
|
134
|
+
}
|
112
135
|
else:
|
113
136
|
argument_diff = (
|
114
137
|
func_args_raw[len(self._last_arguments) :]
|
@@ -119,16 +142,41 @@ class DeepSeekV3Detector(BaseFormatDetector):
|
|
119
142
|
if argument_diff:
|
120
143
|
calls.append(
|
121
144
|
ToolCallItem(
|
122
|
-
tool_index=self.
|
145
|
+
tool_index=self.current_tool_id,
|
123
146
|
name=None,
|
124
147
|
parameters=argument_diff,
|
125
148
|
)
|
126
149
|
)
|
127
150
|
self._last_arguments += argument_diff
|
151
|
+
self.streamed_args_for_tool[
|
152
|
+
self.current_tool_id
|
153
|
+
] += argument_diff
|
128
154
|
|
129
155
|
if _is_complete_json(func_args_raw):
|
156
|
+
# Update the stored arguments for adapter.py
|
157
|
+
try:
|
158
|
+
parsed_args = json.loads(func_args_raw)
|
159
|
+
self.prev_tool_call_arr[self.current_tool_id][
|
160
|
+
"arguments"
|
161
|
+
] = parsed_args
|
162
|
+
except json.JSONDecodeError:
|
163
|
+
pass
|
164
|
+
|
165
|
+
# Find the end of the current tool call and remove only that part from buffer
|
166
|
+
tool_call_end_pattern = (
|
167
|
+
r"<|tool▁call▁begin|>.*?<|tool▁call▁end|>"
|
168
|
+
)
|
169
|
+
match = re.search(
|
170
|
+
tool_call_end_pattern, current_text, re.DOTALL
|
171
|
+
)
|
172
|
+
if match:
|
173
|
+
# Remove the completed tool call from buffer, keep any remaining content
|
174
|
+
self._buffer = current_text[match.end() :]
|
175
|
+
else:
|
176
|
+
self._buffer = ""
|
177
|
+
|
130
178
|
result = StreamingParseResult(normal_text="", calls=calls)
|
131
|
-
self.
|
179
|
+
self.current_tool_id += 1
|
132
180
|
self._last_arguments = ""
|
133
181
|
self.current_tool_name_sent = False
|
134
182
|
return result
|
@@ -149,8 +197,8 @@ class DeepSeekV3Detector(BaseFormatDetector):
|
|
149
197
|
def build_ebnf(self, tools: List[Tool]):
|
150
198
|
return EBNFComposer.build_ebnf(
|
151
199
|
tools,
|
152
|
-
|
153
|
-
|
200
|
+
sequence_start_token=self.bot_token,
|
201
|
+
sequence_end_token=self.eot_token,
|
154
202
|
tool_call_separator="",
|
155
203
|
call_rule_fmt='"<|tool▁call▁begin|>function<|tool▁sep|>{name}\\n```json\\n" {arguments_rule} "\\n```<|tool▁call▁end|>"',
|
156
204
|
function_format="json",
|