sglang 0.4.6.post5__py3-none-any.whl → 0.4.7__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- sglang/bench_offline_throughput.py +10 -4
- sglang/bench_one_batch_server.py +67 -11
- sglang/bench_serving.py +85 -74
- sglang/lang/backend/runtime_endpoint.py +24 -1
- sglang/profiler.py +167 -0
- sglang/srt/_custom_ops.py +34 -0
- sglang/srt/configs/internvl.py +8 -12
- sglang/srt/configs/model_config.py +27 -1
- sglang/srt/constrained/base_grammar_backend.py +5 -2
- sglang/srt/constrained/llguidance_backend.py +9 -8
- sglang/srt/constrained/outlines_backend.py +5 -4
- sglang/srt/constrained/xgrammar_backend.py +18 -18
- sglang/srt/conversation.py +46 -8
- sglang/srt/custom_op.py +38 -3
- sglang/srt/debug_utils.py +74 -0
- sglang/srt/disaggregation/common/__init__.py +1 -0
- sglang/srt/disaggregation/common/conn.py +407 -0
- sglang/srt/disaggregation/decode.py +67 -3
- sglang/srt/disaggregation/fake/conn.py +1 -0
- sglang/srt/disaggregation/kv_events.py +60 -5
- sglang/srt/disaggregation/launch_lb.py +140 -0
- sglang/srt/disaggregation/mini_lb.py +29 -48
- sglang/srt/disaggregation/mooncake/conn.py +432 -140
- sglang/srt/disaggregation/mooncake/transfer_engine.py +32 -16
- sglang/srt/disaggregation/nixl/conn.py +124 -432
- sglang/srt/disaggregation/prefill.py +2 -0
- sglang/srt/disaggregation/utils.py +38 -1
- sglang/srt/distributed/device_communicators/pymscclpp.py +315 -0
- sglang/srt/distributed/parallel_state.py +52 -5
- sglang/srt/entrypoints/EngineBase.py +6 -0
- sglang/srt/entrypoints/engine.py +102 -5
- sglang/srt/entrypoints/http_server.py +15 -2
- sglang/srt/function_call/base_format_detector.py +138 -86
- sglang/srt/function_call/deepseekv3_detector.py +54 -6
- sglang/srt/function_call/ebnf_composer.py +33 -19
- sglang/srt/function_call/function_call_parser.py +27 -0
- sglang/srt/function_call/llama32_detector.py +33 -14
- sglang/srt/function_call/mistral_detector.py +73 -26
- sglang/srt/function_call/pythonic_detector.py +86 -20
- sglang/srt/function_call/qwen25_detector.py +64 -10
- sglang/srt/function_call/utils.py +17 -0
- sglang/srt/hf_transformers_utils.py +4 -0
- sglang/srt/layers/attention/aiter_backend.py +488 -123
- sglang/srt/layers/attention/base_attn_backend.py +4 -0
- sglang/srt/layers/attention/cutlass_mla_backend.py +2 -19
- sglang/srt/layers/attention/flashattention_backend.py +103 -18
- sglang/srt/layers/attention/flashinfer_backend.py +45 -1
- sglang/srt/layers/attention/flashinfer_mla_backend.py +37 -1
- sglang/srt/layers/attention/intel_amx_backend.py +128 -0
- sglang/srt/layers/attention/tbo_backend.py +232 -0
- sglang/srt/layers/attention/torch_native_backend.py +3 -0
- sglang/srt/layers/attention/triton_backend.py +244 -5
- sglang/srt/layers/attention/triton_ops/extend_attention.py +12 -4
- sglang/srt/layers/communicator.py +260 -194
- sglang/srt/layers/dp_attention.py +6 -5
- sglang/srt/layers/layernorm.py +30 -19
- sglang/srt/layers/moe/cutlass_moe.py +170 -7
- sglang/srt/layers/moe/cutlass_moe_params.py +169 -0
- sglang/srt/layers/moe/ep_moe/kernels.py +27 -6
- sglang/srt/layers/moe/ep_moe/layer.py +94 -40
- sglang/srt/layers/moe/ep_moe/token_dispatcher.py +13 -8
- sglang/srt/layers/moe/fused_moe_native.py +4 -0
- sglang/srt/layers/moe/fused_moe_triton/configs/triton_3_2_0/E=257,N=128,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8,block_shape=[128, 128].json +146 -0
- sglang/srt/layers/moe/fused_moe_triton/configs/triton_3_2_0/E=257,N=128,device_name=NVIDIA_H20,dtype=fp8_w8a8,block_shape=[128, 128].json +146 -0
- sglang/srt/layers/moe/fused_moe_triton/configs/triton_3_2_0/E=257,N=256,device_name=NVIDIA_B200,dtype=fp8_w8a8,block_shape=[128, 128].json +146 -0
- sglang/srt/layers/moe/fused_moe_triton/configs/triton_3_2_0/E=257,N=256,device_name=NVIDIA_H20,dtype=fp8_w8a8,block_shape=[128, 128].json +146 -0
- sglang/srt/layers/moe/fused_moe_triton/configs/triton_3_2_0/E=257,N=256,device_name=NVIDIA_H200,dtype=fp8_w8a8,block_shape=[128, 128].json +146 -0
- sglang/srt/layers/moe/fused_moe_triton/configs/triton_3_2_0/E=8,N=7168,device_name=NVIDIA_H100_80GB_HBM3.json +146 -0
- sglang/srt/layers/moe/fused_moe_triton/configs/triton_3_3_1/E=257,N=256,device_name=NVIDIA_B200,dtype=fp8_w8a8,block_shape=[128, 128].json +146 -0
- sglang/srt/layers/moe/fused_moe_triton/configs/triton_3_3_1/E=8,N=7168,device_name=NVIDIA_H100_80GB_HBM3.json +146 -0
- sglang/srt/layers/moe/fused_moe_triton/fused_moe.py +220 -25
- sglang/srt/layers/moe/fused_moe_triton/layer.py +34 -4
- sglang/srt/layers/moe/topk.py +44 -18
- sglang/srt/layers/multimodal.py +3 -3
- sglang/srt/layers/quantization/__init__.py +3 -2
- sglang/srt/layers/quantization/blockwise_int8.py +3 -0
- sglang/srt/layers/quantization/compressed_tensors/compressed_tensors_moe.py +5 -0
- sglang/srt/layers/quantization/deep_gemm.py +55 -56
- sglang/srt/layers/quantization/fp8.py +28 -23
- sglang/srt/layers/quantization/fp8_kernel.py +118 -66
- sglang/srt/layers/quantization/fp8_utils.py +165 -49
- sglang/srt/layers/quantization/modelopt_quant.py +334 -7
- sglang/srt/layers/quantization/moe_wna16.py +3 -0
- sglang/srt/layers/quantization/w8a8_fp8.py +3 -0
- sglang/srt/layers/quantization/w8a8_int8.py +3 -0
- sglang/srt/layers/rotary_embedding.py +6 -12
- sglang/srt/layers/sampler.py +80 -79
- sglang/srt/layers/utils.py +6 -0
- sglang/srt/lora/layers.py +12 -15
- sglang/srt/lora/lora.py +49 -5
- sglang/srt/lora/lora_manager.py +19 -5
- sglang/srt/lora/mem_pool.py +24 -16
- sglang/srt/lora/utils.py +17 -13
- sglang/srt/managers/data_parallel_controller.py +13 -5
- sglang/srt/managers/eplb_algorithms/__init__.py +63 -0
- sglang/srt/managers/eplb_algorithms/deepseek.py +223 -0
- sglang/srt/managers/{deepseek_eplb.py → eplb_algorithms/deepseek_vec.py} +5 -7
- sglang/srt/managers/eplb_manager.py +55 -14
- sglang/srt/managers/expert_distribution.py +220 -46
- sglang/srt/managers/expert_location.py +110 -56
- sglang/srt/managers/expert_location_dispatch.py +23 -6
- sglang/srt/managers/io_struct.py +15 -4
- sglang/srt/managers/mm_utils.py +88 -38
- sglang/srt/managers/multimodal_processors/base_processor.py +188 -16
- sglang/srt/managers/multimodal_processors/gemma3.py +4 -31
- sglang/srt/managers/multimodal_processors/internvl.py +4 -0
- sglang/srt/managers/multimodal_processors/kimi_vl.py +15 -34
- sglang/srt/managers/multimodal_processors/minicpm.py +2 -1
- sglang/srt/managers/multimodal_processors/phi4mm.py +87 -0
- sglang/srt/managers/multimodal_processors/qwen_vl.py +22 -64
- sglang/srt/managers/schedule_batch.py +140 -38
- sglang/srt/managers/scheduler.py +305 -112
- sglang/srt/managers/tokenizer_manager.py +134 -17
- sglang/srt/managers/utils.py +0 -4
- sglang/srt/metrics/collector.py +9 -0
- sglang/srt/model_executor/cuda_graph_runner.py +72 -61
- sglang/srt/model_executor/expert_location_updater.py +157 -22
- sglang/srt/model_executor/forward_batch_info.py +38 -17
- sglang/srt/model_executor/model_runner.py +96 -56
- sglang/srt/model_loader/utils.py +67 -1
- sglang/srt/models/deepseek_nextn.py +1 -1
- sglang/srt/models/deepseek_v2.py +609 -234
- sglang/srt/models/gemma3_causal.py +7 -0
- sglang/srt/models/gemma3_mm.py +19 -14
- sglang/srt/models/idefics2.py +342 -0
- sglang/srt/models/kimi_vl.py +4 -4
- sglang/srt/models/llama.py +1 -1
- sglang/srt/models/minicpmo.py +2 -5
- sglang/srt/models/minicpmv.py +3 -295
- sglang/srt/models/phi4mm.py +512 -0
- sglang/srt/models/qwen2.py +38 -9
- sglang/srt/models/qwen2_5_vl.py +3 -9
- sglang/srt/models/qwen2_eagle.py +4 -1
- sglang/srt/models/qwen2_moe.py +58 -191
- sglang/srt/models/qwen2_vl.py +3 -9
- sglang/srt/models/qwen3.py +41 -10
- sglang/srt/models/qwen3_moe.py +230 -191
- sglang/srt/models/registry.py +9 -1
- sglang/srt/models/transformers.py +291 -0
- sglang/srt/openai_api/adapter.py +86 -24
- sglang/srt/openai_api/protocol.py +31 -2
- sglang/srt/openai_api/utils.py +172 -0
- sglang/srt/operations.py +37 -2
- sglang/srt/operations_strategy.py +200 -24
- sglang/srt/sampling/sampling_batch_info.py +13 -1
- sglang/srt/sampling/sampling_params.py +2 -1
- sglang/srt/server_args.py +114 -27
- sglang/srt/speculative/build_eagle_tree.py +8 -8
- sglang/srt/speculative/eagle_draft_cuda_graph_runner.py +8 -11
- sglang/srt/speculative/eagle_draft_extend_cuda_graph_runner.py +253 -0
- sglang/srt/speculative/eagle_utils.py +51 -91
- sglang/srt/speculative/eagle_worker.py +101 -21
- sglang/srt/two_batch_overlap.py +635 -0
- sglang/srt/utils.py +129 -7
- sglang/test/runners.py +16 -7
- sglang/test/send_one.py +4 -0
- sglang/test/test_cutlass_moe.py +3 -3
- sglang/test/test_fp4_moe.py +248 -0
- sglang/test/test_utils.py +79 -6
- sglang/version.py +1 -1
- {sglang-0.4.6.post5.dist-info → sglang-0.4.7.dist-info}/METADATA +14 -11
- {sglang-0.4.6.post5.dist-info → sglang-0.4.7.dist-info}/RECORD +318 -291
- {sglang-0.4.6.post5.dist-info → sglang-0.4.7.dist-info}/WHEEL +1 -1
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=1,N=14336,device_name=NVIDIA_A100-SXM4-80GB,dtype=int8_w8a16.json → triton_3_1_0/E=1,N=14336,device_name=NVIDIA_A100-SXM4-80GB,dtype=int8_w8a16.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=1,N=14336,device_name=NVIDIA_A100-SXM4-80GB.json → triton_3_1_0/E=1,N=14336,device_name=NVIDIA_A100-SXM4-80GB.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=1,N=1792,device_name=NVIDIA_A100-SXM4-80GB,dtype=int8_w8a16.json → triton_3_1_0/E=1,N=1792,device_name=NVIDIA_A100-SXM4-80GB,dtype=int8_w8a16.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=1,N=1792,device_name=NVIDIA_A100-SXM4-80GB.json → triton_3_1_0/E=1,N=1792,device_name=NVIDIA_A100-SXM4-80GB.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=1,N=3072,device_name=NVIDIA_A100-SXM4-80GB,dtype=int8_w8a16.json → triton_3_1_0/E=1,N=3072,device_name=NVIDIA_A100-SXM4-80GB,dtype=int8_w8a16.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=1,N=3072,device_name=NVIDIA_H100_80GB_HBM3,dtype=int8_w8a16.json → triton_3_1_0/E=1,N=3072,device_name=NVIDIA_H100_80GB_HBM3,dtype=int8_w8a16.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=1,N=3072,device_name=NVIDIA_H100_80GB_HBM3.json → triton_3_1_0/E=1,N=3072,device_name=NVIDIA_H100_80GB_HBM3.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=1,N=3584,device_name=NVIDIA_A100-SXM4-80GB,dtype=int8_w8a16.json → triton_3_1_0/E=1,N=3584,device_name=NVIDIA_A100-SXM4-80GB,dtype=int8_w8a16.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=1,N=3584,device_name=NVIDIA_A100-SXM4-80GB.json → triton_3_1_0/E=1,N=3584,device_name=NVIDIA_A100-SXM4-80GB.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=1,N=7168,device_name=NVIDIA_A100-SXM4-80GB,dtype=int8_w8a16.json → triton_3_1_0/E=1,N=7168,device_name=NVIDIA_A100-SXM4-80GB,dtype=int8_w8a16.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=1,N=7168,device_name=NVIDIA_A100-SXM4-80GB.json → triton_3_1_0/E=1,N=7168,device_name=NVIDIA_A100-SXM4-80GB.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=144,N=512,device_name=NVIDIA_H100_80GB_HBM3.json → triton_3_1_0/E=144,N=512,device_name=NVIDIA_H100_80GB_HBM3.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=16,N=1024,device_name=NVIDIA_H100_80GB_HBM3.json → triton_3_1_0/E=16,N=1024,device_name=NVIDIA_H100_80GB_HBM3.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=16,N=1024,device_name=NVIDIA_H200.json → triton_3_1_0/E=16,N=1024,device_name=NVIDIA_H200.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=16,N=1344,device_name=NVIDIA_A100-SXM4-40GB.json → triton_3_1_0/E=16,N=1344,device_name=NVIDIA_A100-SXM4-40GB.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=16,N=1344,device_name=NVIDIA_A100-SXM4-80GB.json → triton_3_1_0/E=16,N=1344,device_name=NVIDIA_A100-SXM4-80GB.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=16,N=1344,device_name=NVIDIA_H100_80GB_HBM3.json → triton_3_1_0/E=16,N=1344,device_name=NVIDIA_H100_80GB_HBM3.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=16,N=14336,device_name=NVIDIA_A100-SXM4-80GB,dtype=int8_w8a16.json → triton_3_1_0/E=16,N=14336,device_name=NVIDIA_A100-SXM4-80GB,dtype=int8_w8a16.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=16,N=14336,device_name=NVIDIA_A100-SXM4-80GB.json → triton_3_1_0/E=16,N=14336,device_name=NVIDIA_A100-SXM4-80GB.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=16,N=1792,device_name=NVIDIA_A100-SXM4-80GB,dtype=int8_w8a16.json → triton_3_1_0/E=16,N=1792,device_name=NVIDIA_A100-SXM4-80GB,dtype=int8_w8a16.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=16,N=1792,device_name=NVIDIA_A100-SXM4-80GB.json → triton_3_1_0/E=16,N=1792,device_name=NVIDIA_A100-SXM4-80GB.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=16,N=2048,device_name=NVIDIA_H100_80GB_HBM3.json → triton_3_1_0/E=16,N=2048,device_name=NVIDIA_H100_80GB_HBM3.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=16,N=2688,device_name=NVIDIA_A100-SXM4-80GB.json → triton_3_1_0/E=16,N=2688,device_name=NVIDIA_A100-SXM4-80GB.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=16,N=2688,device_name=NVIDIA_H100_80GB_HBM3.json → triton_3_1_0/E=16,N=2688,device_name=NVIDIA_H100_80GB_HBM3.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=16,N=3072,device_name=NVIDIA_A100-SXM4-80GB,dtype=int8_w8a16.json → triton_3_1_0/E=16,N=3072,device_name=NVIDIA_A100-SXM4-80GB,dtype=int8_w8a16.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=16,N=3072,device_name=NVIDIA_H100_80GB_HBM3,dtype=int8_w8a16.json → triton_3_1_0/E=16,N=3072,device_name=NVIDIA_H100_80GB_HBM3,dtype=int8_w8a16.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=16,N=3200,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8.json → triton_3_1_0/E=16,N=3200,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=16,N=3584,device_name=NVIDIA_A100-SXM4-80GB,dtype=int8_w8a16.json → triton_3_1_0/E=16,N=3584,device_name=NVIDIA_A100-SXM4-80GB,dtype=int8_w8a16.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=16,N=3584,device_name=NVIDIA_A100-SXM4-80GB.json → triton_3_1_0/E=16,N=3584,device_name=NVIDIA_A100-SXM4-80GB.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=16,N=6400,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8.json → triton_3_1_0/E=16,N=6400,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=16,N=7168,device_name=NVIDIA_A100-SXM4-80GB,dtype=int8_w8a16.json → triton_3_1_0/E=16,N=7168,device_name=NVIDIA_A100-SXM4-80GB,dtype=int8_w8a16.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=16,N=7168,device_name=NVIDIA_A100-SXM4-80GB.json → triton_3_1_0/E=16,N=7168,device_name=NVIDIA_A100-SXM4-80GB.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=16,N=7168,device_name=NVIDIA_H100_80GB_HBM3,dtype=int8_w8a16.json → triton_3_1_0/E=16,N=7168,device_name=NVIDIA_H100_80GB_HBM3,dtype=int8_w8a16.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=16,N=800,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8.json → triton_3_1_0/E=16,N=800,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=160,N=192,device_name=NVIDIA_A800-SXM4-80GB.json → triton_3_1_0/E=160,N=192,device_name=NVIDIA_A800-SXM4-80GB.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=20,N=2048,device_name=NVIDIA_H100_80GB_HBM3.json → triton_3_1_0/E=20,N=2048,device_name=NVIDIA_H100_80GB_HBM3.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=24,N=1024,device_name=NVIDIA_H100_80GB_HBM3.json → triton_3_1_0/E=24,N=1024,device_name=NVIDIA_H100_80GB_HBM3.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=256,N=128,device_name=NVIDIA_A100-SXM4-80GB,dtype=int8_w8a8,block_shape=[128, 128].json → triton_3_1_0/E=256,N=128,device_name=NVIDIA_A100-SXM4-80GB,dtype=int8_w8a8,block_shape=[128, 128].json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=256,N=128,device_name=NVIDIA_A100-SXM4-80GB,dtype=int8_w8a8.json → triton_3_1_0/E=256,N=128,device_name=NVIDIA_A100-SXM4-80GB,dtype=int8_w8a8.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=256,N=128,device_name=NVIDIA_A800-SXM4-80GB,dtype=int8_w8a8,block_shape=[128, 128].json → triton_3_1_0/E=256,N=128,device_name=NVIDIA_A800-SXM4-80GB,dtype=int8_w8a8,block_shape=[128, 128].json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=256,N=128,device_name=NVIDIA_A800-SXM4-80GB,dtype=int8_w8a8.json → triton_3_1_0/E=256,N=128,device_name=NVIDIA_A800-SXM4-80GB,dtype=int8_w8a8.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=256,N=128,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8,block_shape=[128, 128].json → triton_3_1_0/E=256,N=128,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8,block_shape=[128, 128].json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=256,N=128,device_name=NVIDIA_H20,block_shape=[128, 128].json → triton_3_1_0/E=256,N=128,device_name=NVIDIA_H20,block_shape=[128, 128].json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=256,N=128,device_name=NVIDIA_H20,dtype=fp8_w8a8,block_shape=[128, 128].json → triton_3_1_0/E=256,N=128,device_name=NVIDIA_H20,dtype=fp8_w8a8,block_shape=[128, 128].json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=256,N=128,device_name=NVIDIA_L20Y,dtype=fp8_w8a8,block_shape=[128, 128].json → triton_3_1_0/E=256,N=128,device_name=NVIDIA_L20Y,dtype=fp8_w8a8,block_shape=[128, 128].json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=256,N=256,device_name=AMD_Instinct_MI300X,dtype=fp8_w8a8,block_shape=[128, 128].json → triton_3_1_0/E=256,N=256,device_name=AMD_Instinct_MI300X,dtype=fp8_w8a8,block_shape=[128, 128].json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=256,N=256,device_name=AMD_Instinct_MI325X,dtype=fp8_w8a8,block_shape=[128, 128].json → triton_3_1_0/E=256,N=256,device_name=AMD_Instinct_MI325X,dtype=fp8_w8a8,block_shape=[128, 128].json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=256,N=256,device_name=AMD_Radeon_Graphics,dtype=fp8_w8a8,block_shape=[128, 128].json → triton_3_1_0/E=256,N=256,device_name=AMD_Radeon_Graphics,dtype=fp8_w8a8,block_shape=[128, 128].json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=256,N=256,device_name=NVIDIA_B200,dtype=fp8_w8a8,block_shape=[128, 128].json → triton_3_1_0/E=256,N=256,device_name=NVIDIA_B200,dtype=fp8_w8a8,block_shape=[128, 128].json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=256,N=256,device_name=NVIDIA_H20,dtype=int8_w8a8,block_shape=[128, 128].json → triton_3_1_0/E=256,N=256,device_name=NVIDIA_H20,dtype=int8_w8a8,block_shape=[128, 128].json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=256,N=256,device_name=NVIDIA_H200,dtype=fp8_w8a8,block_shape=[128, 128].json → triton_3_1_0/E=256,N=256,device_name=NVIDIA_H200,dtype=fp8_w8a8,block_shape=[128, 128].json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=256,N=64,device_name=NVIDIA_A800-SXM4-80GB.json → triton_3_1_0/E=256,N=64,device_name=NVIDIA_A800-SXM4-80GB.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=256,N=64,device_name=NVIDIA_L20,dtype=int8_w8a8.json → triton_3_1_0/E=256,N=64,device_name=NVIDIA_L20,dtype=int8_w8a8.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=256,N=64,device_name=NVIDIA_L40S,dtype=int8_w8a8.json → triton_3_1_0/E=256,N=64,device_name=NVIDIA_L40S,dtype=int8_w8a8.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=64,N=1024,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8,block_shape=[128, 128].json → triton_3_1_0/E=64,N=1024,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8,block_shape=[128, 128].json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=64,N=1280,device_name=NVIDIA_A100-SXM4-80GB.json → triton_3_1_0/E=64,N=1280,device_name=NVIDIA_A100-SXM4-80GB.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=64,N=1280,device_name=NVIDIA_A800-SXM4-80GB.json → triton_3_1_0/E=64,N=1280,device_name=NVIDIA_A800-SXM4-80GB.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=64,N=1280,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8.json → triton_3_1_0/E=64,N=1280,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=64,N=1280,device_name=NVIDIA_H100_80GB_HBM3.json → triton_3_1_0/E=64,N=1280,device_name=NVIDIA_H100_80GB_HBM3.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=64,N=1280,device_name=NVIDIA_H200,dtype=fp8_w8a8.json → triton_3_1_0/E=64,N=1280,device_name=NVIDIA_H200,dtype=fp8_w8a8.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=64,N=1280,device_name=NVIDIA_H200.json → triton_3_1_0/E=64,N=1280,device_name=NVIDIA_H200.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=64,N=2560,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8.json → triton_3_1_0/E=64,N=2560,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=64,N=2560,device_name=NVIDIA_H200,dtype=fp8_w8a8.json → triton_3_1_0/E=64,N=2560,device_name=NVIDIA_H200,dtype=fp8_w8a8.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=64,N=2560,device_name=NVIDIA_H200.json → triton_3_1_0/E=64,N=2560,device_name=NVIDIA_H200.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=64,N=320,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8.json → triton_3_1_0/E=64,N=320,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=64,N=320,device_name=NVIDIA_H100_80GB_HBM3.json → triton_3_1_0/E=64,N=320,device_name=NVIDIA_H100_80GB_HBM3.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=64,N=320,device_name=NVIDIA_H200,dtype=fp8_w8a8.json → triton_3_1_0/E=64,N=320,device_name=NVIDIA_H200,dtype=fp8_w8a8.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=64,N=320,device_name=NVIDIA_H200.json → triton_3_1_0/E=64,N=320,device_name=NVIDIA_H200.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=64,N=512,device_name=NVIDIA_H200,dtype=fp8_w8a8,block_shape=[128, 128].json → triton_3_1_0/E=64,N=512,device_name=NVIDIA_H200,dtype=fp8_w8a8,block_shape=[128, 128].json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=64,N=640,device_name=NVIDIA_A100-SXM4-80GB.json → triton_3_1_0/E=64,N=640,device_name=NVIDIA_A100-SXM4-80GB.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=64,N=640,device_name=NVIDIA_A800-SXM4-80GB.json → triton_3_1_0/E=64,N=640,device_name=NVIDIA_A800-SXM4-80GB.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=64,N=640,device_name=NVIDIA_GeForce_RTX_4090,dtype=fp8_w8a8.json → triton_3_1_0/E=64,N=640,device_name=NVIDIA_GeForce_RTX_4090,dtype=fp8_w8a8.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=64,N=640,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8.json → triton_3_1_0/E=64,N=640,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=64,N=640,device_name=NVIDIA_H100_80GB_HBM3.json → triton_3_1_0/E=64,N=640,device_name=NVIDIA_H100_80GB_HBM3.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=64,N=640,device_name=NVIDIA_H200,dtype=fp8_w8a8.json → triton_3_1_0/E=64,N=640,device_name=NVIDIA_H200,dtype=fp8_w8a8.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=64,N=640,device_name=NVIDIA_H200.json → triton_3_1_0/E=64,N=640,device_name=NVIDIA_H200.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=8,N=14336,device_name=AMD_Instinct_MI300X.json → triton_3_1_0/E=8,N=14336,device_name=AMD_Instinct_MI300X.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=8,N=14336,device_name=AMD_Instinct_MI325X.json → triton_3_1_0/E=8,N=14336,device_name=AMD_Instinct_MI325X.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=8,N=14336,device_name=AMD_Radeon_Graphics.json → triton_3_1_0/E=8,N=14336,device_name=AMD_Radeon_Graphics.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=8,N=14336,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8.json → triton_3_1_0/E=8,N=14336,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=8,N=14336,device_name=NVIDIA_H200,dtype=fp8_w8a8.json → triton_3_1_0/E=8,N=14336,device_name=NVIDIA_H200,dtype=fp8_w8a8.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=8,N=14336,device_name=NVIDIA_H200.json → triton_3_1_0/E=8,N=14336,device_name=NVIDIA_H200.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=8,N=1792,device_name=AMD_Instinct_MI300X.json → triton_3_1_0/E=8,N=1792,device_name=AMD_Instinct_MI300X.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=8,N=1792,device_name=AMD_Instinct_MI325X.json → triton_3_1_0/E=8,N=1792,device_name=AMD_Instinct_MI325X.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=8,N=1792,device_name=AMD_Radeon_Graphics.json → triton_3_1_0/E=8,N=1792,device_name=AMD_Radeon_Graphics.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=8,N=1792,device_name=NVIDIA_A100-SXM4-40GB.json → triton_3_1_0/E=8,N=1792,device_name=NVIDIA_A100-SXM4-40GB.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=8,N=1792,device_name=NVIDIA_A100-SXM4-80GB.json → triton_3_1_0/E=8,N=1792,device_name=NVIDIA_A100-SXM4-80GB.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=8,N=1792,device_name=NVIDIA_H100_80GB_HBM3.json → triton_3_1_0/E=8,N=1792,device_name=NVIDIA_H100_80GB_HBM3.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=8,N=1792,device_name=NVIDIA_H200,dtype=fp8_w8a8.json → triton_3_1_0/E=8,N=1792,device_name=NVIDIA_H200,dtype=fp8_w8a8.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=8,N=1792,device_name=NVIDIA_H200.json → triton_3_1_0/E=8,N=1792,device_name=NVIDIA_H200.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=8,N=2048,device_name=NVIDIA_A100-SXM4-80GB.json → triton_3_1_0/E=8,N=2048,device_name=NVIDIA_A100-SXM4-80GB.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=8,N=2048,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8.json → triton_3_1_0/E=8,N=2048,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=8,N=2048,device_name=NVIDIA_H100_80GB_HBM3.json → triton_3_1_0/E=8,N=2048,device_name=NVIDIA_H100_80GB_HBM3.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=8,N=2048,device_name=NVIDIA_H200,dtype=fp8_w8a8.json → triton_3_1_0/E=8,N=2048,device_name=NVIDIA_H200,dtype=fp8_w8a8.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=8,N=2048,device_name=NVIDIA_H200.json → triton_3_1_0/E=8,N=2048,device_name=NVIDIA_H200.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=8,N=3584,device_name=AMD_Instinct_MI300X.json → triton_3_1_0/E=8,N=3584,device_name=AMD_Instinct_MI300X.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=8,N=3584,device_name=AMD_Instinct_MI325X.json → triton_3_1_0/E=8,N=3584,device_name=AMD_Instinct_MI325X.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=8,N=3584,device_name=AMD_Radeon_Graphics.json → triton_3_1_0/E=8,N=3584,device_name=AMD_Radeon_Graphics.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=8,N=3584,device_name=NVIDIA_A100-SXM4-40GB.json → triton_3_1_0/E=8,N=3584,device_name=NVIDIA_A100-SXM4-40GB.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=8,N=3584,device_name=NVIDIA_A100-SXM4-80GB.json → triton_3_1_0/E=8,N=3584,device_name=NVIDIA_A100-SXM4-80GB.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=8,N=3584,device_name=NVIDIA_GeForce_RTX_4090,dtype=fp8_w8a8.json → triton_3_1_0/E=8,N=3584,device_name=NVIDIA_GeForce_RTX_4090,dtype=fp8_w8a8.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=8,N=3584,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8.json → triton_3_1_0/E=8,N=3584,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=8,N=3584,device_name=NVIDIA_H100_80GB_HBM3.json → triton_3_1_0/E=8,N=3584,device_name=NVIDIA_H100_80GB_HBM3.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=8,N=3584,device_name=NVIDIA_H200,dtype=fp8_w8a8.json → triton_3_1_0/E=8,N=3584,device_name=NVIDIA_H200,dtype=fp8_w8a8.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=8,N=3584,device_name=NVIDIA_H200.json → triton_3_1_0/E=8,N=3584,device_name=NVIDIA_H200.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=8,N=3584,device_name=NVIDIA_L40S.json → triton_3_1_0/E=8,N=3584,device_name=NVIDIA_L40S.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=8,N=4096,device_name=AMD_Instinct_MI300X,dtype=fp8_w8a8.json → triton_3_1_0/E=8,N=4096,device_name=AMD_Instinct_MI300X,dtype=fp8_w8a8.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=8,N=4096,device_name=AMD_Instinct_MI325X,dtype=fp8_w8a8.json → triton_3_1_0/E=8,N=4096,device_name=AMD_Instinct_MI325X,dtype=fp8_w8a8.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=8,N=4096,device_name=AMD_Radeon_Graphics,dtype=fp8_w8a8.json → triton_3_1_0/E=8,N=4096,device_name=AMD_Radeon_Graphics,dtype=fp8_w8a8.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=8,N=4096,device_name=NVIDIA_A100-SXM4-80GB.json → triton_3_1_0/E=8,N=4096,device_name=NVIDIA_A100-SXM4-80GB.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=8,N=4096,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8.json → triton_3_1_0/E=8,N=4096,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=8,N=4096,device_name=NVIDIA_H100_80GB_HBM3.json → triton_3_1_0/E=8,N=4096,device_name=NVIDIA_H100_80GB_HBM3.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=8,N=4096,device_name=NVIDIA_H200,dtype=fp8_w8a8.json → triton_3_1_0/E=8,N=4096,device_name=NVIDIA_H200,dtype=fp8_w8a8.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=8,N=4096,device_name=NVIDIA_H200.json → triton_3_1_0/E=8,N=4096,device_name=NVIDIA_H200.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=8,N=7168,device_name=AMD_Instinct_MI300X.json → triton_3_1_0/E=8,N=7168,device_name=AMD_Instinct_MI300X.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=8,N=7168,device_name=AMD_Instinct_MI325X.json → triton_3_1_0/E=8,N=7168,device_name=AMD_Instinct_MI325X.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=8,N=7168,device_name=AMD_Radeon_Graphics.json → triton_3_1_0/E=8,N=7168,device_name=AMD_Radeon_Graphics.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=8,N=7168,device_name=NVIDIA_A100-SXM4-80GB.json → triton_3_1_0/E=8,N=7168,device_name=NVIDIA_A100-SXM4-80GB.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=8,N=7168,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8.json → triton_3_1_0/E=8,N=7168,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=8,N=7168,device_name=NVIDIA_H100_80GB_HBM3.json → triton_3_1_0/E=8,N=7168,device_name=NVIDIA_H100_80GB_HBM3.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=8,N=7168,device_name=NVIDIA_H200,dtype=fp8_w8a8.json → triton_3_1_0/E=8,N=7168,device_name=NVIDIA_H200,dtype=fp8_w8a8.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=8,N=7168,device_name=NVIDIA_H200.json → triton_3_1_0/E=8,N=7168,device_name=NVIDIA_H200.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=8,N=8192,device_name=AMD_Instinct_MI300X,dtype=fp8_w8a8.json → triton_3_1_0/E=8,N=8192,device_name=AMD_Instinct_MI300X,dtype=fp8_w8a8.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=8,N=8192,device_name=AMD_Instinct_MI325X,dtype=fp8_w8a8.json → triton_3_1_0/E=8,N=8192,device_name=AMD_Instinct_MI325X,dtype=fp8_w8a8.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=8,N=8192,device_name=AMD_Radeon_Graphics,dtype=fp8_w8a8.json → triton_3_1_0/E=8,N=8192,device_name=AMD_Radeon_Graphics,dtype=fp8_w8a8.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=8,N=8192,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8.json → triton_3_1_0/E=8,N=8192,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=8,N=8192,device_name=NVIDIA_H200,dtype=fp8_w8a8.json → triton_3_1_0/E=8,N=8192,device_name=NVIDIA_H200,dtype=fp8_w8a8.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=128,N=192,device_name=NVIDIA_A800-SXM4-80GB.json → triton_3_2_0/E=128,N=192,device_name=NVIDIA_A800-SXM4-80GB.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=128,N=192,device_name=NVIDIA_H100_80GB_HBM3.json → triton_3_2_0/E=128,N=192,device_name=NVIDIA_H100_80GB_HBM3.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=128,N=192,device_name=NVIDIA_H20.json → triton_3_2_0/E=128,N=192,device_name=NVIDIA_H20.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=128,N=192,device_name=NVIDIA_H200.json → triton_3_2_0/E=128,N=192,device_name=NVIDIA_H200.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=128,N=384,device_name=NVIDIA_H100_80GB_HBM3.json → triton_3_2_0/E=128,N=384,device_name=NVIDIA_H100_80GB_HBM3.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=128,N=384,device_name=NVIDIA_H20,dtype=fp8_w8a8,block_shape=[128, 128].json → triton_3_2_0/E=128,N=384,device_name=NVIDIA_H20,dtype=fp8_w8a8,block_shape=[128, 128].json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=128,N=384,device_name=NVIDIA_H20.json → triton_3_2_0/E=128,N=384,device_name=NVIDIA_H20.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=128,N=384,device_name=NVIDIA_H200,dtype=fp8_w8a8,block_shape=[128, 128].json → triton_3_2_0/E=128,N=384,device_name=NVIDIA_H200,dtype=fp8_w8a8,block_shape=[128, 128].json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=128,N=384,device_name=NVIDIA_H200.json → triton_3_2_0/E=128,N=384,device_name=NVIDIA_H200.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=128,N=512,device_name=NVIDIA_H100_80GB_HBM3.json → triton_3_2_0/E=128,N=512,device_name=NVIDIA_H100_80GB_HBM3.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=128,N=768,device_name=NVIDIA_A800-SXM4-80GB.json → triton_3_2_0/E=128,N=768,device_name=NVIDIA_A800-SXM4-80GB.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=128,N=768,device_name=NVIDIA_H100_80GB_HBM3.json → triton_3_2_0/E=128,N=768,device_name=NVIDIA_H100_80GB_HBM3.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=128,N=768,device_name=NVIDIA_H20,dtype=fp8_w8a8,block_shape=[128, 128].json → triton_3_2_0/E=128,N=768,device_name=NVIDIA_H20,dtype=fp8_w8a8,block_shape=[128, 128].json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=128,N=768,device_name=NVIDIA_H20.json → triton_3_2_0/E=128,N=768,device_name=NVIDIA_H20.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=128,N=768,device_name=NVIDIA_H200,dtype=fp8_w8a8,block_shape=[128, 128].json → triton_3_2_0/E=128,N=768,device_name=NVIDIA_H200,dtype=fp8_w8a8,block_shape=[128, 128].json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=128,N=768,device_name=NVIDIA_H200.json → triton_3_2_0/E=128,N=768,device_name=NVIDIA_H200.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=128,N=96,device_name=NVIDIA_H20.json → triton_3_2_0/E=128,N=96,device_name=NVIDIA_H20.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=264,N=128,device_name=NVIDIA_A800-SXM4-80GB,dtype=int8_w8a8.json → triton_3_2_0/E=264,N=128,device_name=NVIDIA_A800-SXM4-80GB,dtype=int8_w8a8.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=264,N=256,device_name=NVIDIA_B200,dtype=fp8_w8a8,block_shape=[128, 128].json → triton_3_2_0/E=264,N=256,device_name=NVIDIA_B200,dtype=fp8_w8a8,block_shape=[128, 128].json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=264,N=256,device_name=NVIDIA_H20,dtype=fp8_w8a8,block_shape=[128, 128].json → triton_3_2_0/E=264,N=256,device_name=NVIDIA_H20,dtype=fp8_w8a8,block_shape=[128, 128].json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=264,N=256,device_name=NVIDIA_H200,dtype=fp8_w8a8,block_shape=[128, 128].json → triton_3_2_0/E=264,N=256,device_name=NVIDIA_H200,dtype=fp8_w8a8,block_shape=[128, 128].json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=272,N=128,device_name=NVIDIA_A800-SXM4-80GB,dtype=int8_w8a8.json → triton_3_2_0/E=272,N=128,device_name=NVIDIA_A800-SXM4-80GB,dtype=int8_w8a8.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=272,N=128,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8,block_shape=[128, 128].json → triton_3_2_0/E=272,N=128,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8,block_shape=[128, 128].json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=272,N=128,device_name=NVIDIA_H20,dtype=fp8_w8a8,block_shape=[128, 128].json → triton_3_2_0/E=272,N=128,device_name=NVIDIA_H20,dtype=fp8_w8a8,block_shape=[128, 128].json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=272,N=64,device_name=NVIDIA_A800-SXM4-80GB.json → triton_3_2_0/E=272,N=64,device_name=NVIDIA_A800-SXM4-80GB.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=288,N=64,device_name=NVIDIA_A800-SXM4-80GB.json → triton_3_2_0/E=288,N=64,device_name=NVIDIA_A800-SXM4-80GB.json} +0 -0
- {sglang-0.4.6.post5.dist-info → sglang-0.4.7.dist-info}/licenses/LICENSE +0 -0
- {sglang-0.4.6.post5.dist-info → sglang-0.4.7.dist-info}/top_level.txt +0 -0
sglang/srt/managers/multimodal_processors/phi4mm.py

```diff
@@ -0,0 +1,87 @@
+import logging
+from typing import List, Union
+
+from sglang.srt.managers.multimodal_processors.base_processor import (
+    BaseMultimodalProcessor,
+    MultimodalSpecialTokens,
+)
+from sglang.srt.managers.schedule_batch import Modality, MultimodalDataItem
+from sglang.srt.models.phi4mm import Phi4MMForCausalLM
+
+logger = logging.getLogger(__name__)
+
+_IMAGE_SPECIAL_TOKEN = "<|endoftext10|>"
+_IMAGE_SPECIAL_TOKEN_ID = 200010
+
+
+class Phi4MMImageProcessor(BaseMultimodalProcessor):
+    models = [Phi4MMForCausalLM]
+
+    def __init__(self, hf_config, server_args, _processor):
+        super().__init__(hf_config, server_args, _processor)
+        self.multimodal_tokens = MultimodalSpecialTokens(
+            image_token=_IMAGE_SPECIAL_TOKEN,
+        )
+
+    async def process_mm_data_async(
+        self,
+        image_data: List[Union[str, bytes]],
+        input_text,
+        request_obj,
+        max_req_input_len,
+        **kwargs,
+    ):
+        audio_data = request_obj.audio_data
+
+        if not image_data and not audio_data:
+            return None
+
+        if not isinstance(image_data, list):
+            image_data = [image_data]
+
+        if not isinstance(audio_data, list):
+            audio_data = [audio_data]
+
+        if audio_data:
+            logger.warning(
+                "Currently SGLang does not support audio data for Phi4MM. We are working on it. You can file an issue to help us prioritize."
+            )
+            audio_data = []
+
+        base_output = self.load_mm_data(
+            prompt=input_text,
+            max_req_input_len=max_req_input_len,
+            audio_data=audio_data,
+            image_data=image_data,
+            multimodal_tokens=self.multimodal_tokens,
+        )
+        if base_output is None:
+            return None
+
+        res = self.process_mm_data(
+            input_text=base_output.input_text,
+            images=base_output.images,
+            audios=base_output.audios,
+        )
+
+        input_ids = res["input_ids"].flatten()
+        image_offsets = self.get_mm_items_offset(
+            input_ids=input_ids,
+            mm_token_id=_IMAGE_SPECIAL_TOKEN_ID,
+        )
+
+        items = [
+            MultimodalDataItem(
+                pixel_values=res["input_image_embeds"],
+                image_sizes=res["image_sizes"],
+                image_emb_mask=res["image_attention_mask"],
+                image_offsets=image_offsets,
+                modality=Modality.IMAGE,
+            )
+        ]
+
+        return {
+            "mm_items": items,
+            "input_ids": input_ids.tolist(),
+            "im_token_id": _IMAGE_SPECIAL_TOKEN_ID,
+        }
```
sglang/srt/managers/multimodal_processors/qwen_vl.py

```diff
@@ -32,8 +32,8 @@ class Qwen2_5VLImageProcessor(SGLangBaseProcessor):
         )
         self.IM_START_TOKEN_ID = hf_config.vision_start_token_id
         self.IM_END_TOKEN_ID = hf_config.vision_end_token_id
-        self.
-        self.
+        self.IM_TOKEN_ID = hf_config.image_token_id
+        self.VIDEO_TOKEN_ID = hf_config.video_token_id
         self.vision_start_token_id = hf_config.vision_start_token_id
         self.vision_end_token_id = hf_config.vision_end_token_id
         self.NUM_TOKEN_PER_FRAME = 770
@@ -125,87 +125,45 @@ class Qwen2_5VLImageProcessor(SGLangBaseProcessor):
         async def resize_image_async(image):
             return resize_image(image)
 
-
-        if base_output.images and
+        # Qwen-specific: resize images if they are raw Image objects
+        if base_output.images and isinstance(base_output.images[0], Image.Image):
             resize_tasks = [resize_image_async(image) for image in base_output.images]
             base_output.images = await asyncio.gather(*resize_tasks)
 
-        ret = self.process_mm_data(
-            input_text=base_output.input_text,
-            images=None if images_are_preprocessed else base_output.images,
-        )
-        input_ids = ret["input_ids"].flatten().tolist()
-        image_offsets = self.get_mm_items_offset(
-            input_ids=ret["input_ids"].flatten(), mm_token_id=self.image_token_id
-        )
-        image_grid_thw = None
         video_grid_thw = None  # TODO
-
-
-
-
-
-
-
-
-
-            )
-            all_pixel_values = [
-                item.pixel_values
-                for item in base_output.images
-                if item.pixel_values is not None
-            ]
-            all_precomputed_features = [
-                item.precomputed_features
-                for item in base_output.images
-                if item.precomputed_features is not None
-            ]
-            pixel_values = (
-                torch.concat(all_pixel_values) if all_pixel_values else None
-            )
-            precomputed_features = (
-                torch.concat(all_precomputed_features)
-                if all_precomputed_features
-                else None
-            )
-        else:
-            image_grid_thw = ret["image_grid_thw"]
-            pixel_values = ret["pixel_values"]
-            precomputed_features = None
-        items += [
-            MultimodalDataItem(
-                pixel_values=pixel_values,
-                image_grid_thws=image_grid_thw,
-                video_grid_thws=video_grid_thw,
-                precomputed_features=precomputed_features,
-                image_offsets=image_offsets,
-                modality=Modality.IMAGE,
-            )
-        ]
+
+        combined_mm_item, input_ids = self.process_and_combine_mm_data(base_output)
+
+        if combined_mm_item is None:
+            # Note(Xinyuan): This is the case where image loading fails.
+            return None
+
+        video_grid_thw = None  # TODO
+        second_per_grid_ts = getattr(combined_mm_item, "second_per_grid_ts", None)
 
         mrope_positions, mrope_position_delta = MRotaryEmbedding.get_rope_index(
             spatial_merge_size=self.hf_config.vision_config.spatial_merge_size,
-            image_token_id=self.
-            video_token_id=self.
+            image_token_id=self.IM_TOKEN_ID,
+            video_token_id=self.VIDEO_TOKEN_ID,
             vision_start_token_id=self.vision_start_token_id,
             model_type=self.hf_config.model_type,
             tokens_per_second=getattr(
                 self.hf_config.vision_config, "tokens_per_second", None
             ),
-            input_ids=
-            image_grid_thw=image_grid_thw,
+            input_ids=input_ids.unsqueeze(0),
+            image_grid_thw=combined_mm_item.image_grid_thw,
             video_grid_thw=video_grid_thw,
-            second_per_grid_ts=
+            second_per_grid_ts=second_per_grid_ts,
         )
         mrope_positions = mrope_positions.squeeze(1)
 
         return {
-            "input_ids": input_ids,
-            "mm_items":
+            "input_ids": input_ids.tolist(),
+            "mm_items": [combined_mm_item],
             "im_start_id": self.IM_START_TOKEN_ID,
             "im_end_id": self.IM_END_TOKEN_ID,
-            "im_token_id": self.
-            "video_token_id": self.
+            "im_token_id": self.IM_TOKEN_ID,
+            "video_token_id": self.VIDEO_TOKEN_ID,
             "mrope_positions": mrope_positions,
             "mrope_position_delta": mrope_position_delta,
         }
```
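
The old Qwen-specific code removed above did the combining by hand: it collected `pixel_values` and `precomputed_features` across the per-image entries, concatenated whichever were present, and wrapped the result in a single `MultimodalDataItem`. In 0.4.7 that work moves into the shared `process_and_combine_mm_data` helper added to `base_processor.py` (+188 lines, not shown in this extract). A hedged sketch of just the concatenation step, using stand-in types rather than sglang's classes:

```python
from dataclasses import dataclass
from typing import Optional

import torch


@dataclass
class _Item:
    # Stand-in for the per-image entries the removed code iterated over.
    pixel_values: Optional[torch.Tensor] = None
    precomputed_features: Optional[torch.Tensor] = None


def combine_items(items):
    """Illustration of the combining step the removed Qwen-specific code did:
    concatenate whatever per-image tensors are present into one batch each.
    The real process_and_combine_mm_data() in base_processor.py is not shown
    in this extract and is assumed to generalize this across models."""
    pixels = [i.pixel_values for i in items if i.pixel_values is not None]
    feats = [i.precomputed_features for i in items if i.precomputed_features is not None]
    return (
        torch.concat(pixels) if pixels else None,
        torch.concat(feats) if feats else None,
    )


# Two raw images plus one entry carrying precomputed features.
items = [
    _Item(pixel_values=torch.rand(4, 3)),
    _Item(pixel_values=torch.rand(2, 3)),
    _Item(precomputed_features=torch.rand(1, 8)),
]
pixel_values, precomputed_features = combine_items(items)
print(pixel_values.shape, precomputed_features.shape)  # torch.Size([6, 3]) torch.Size([1, 8])
```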
sglang/srt/managers/schedule_batch.py

```diff
@@ -37,6 +37,7 @@ import hashlib
 import logging
 import threading
 from enum import Enum, auto
+from http import HTTPStatus
 from typing import TYPE_CHECKING, List, Optional, Set, Tuple, Union
 
 import numpy as np
@@ -51,6 +52,7 @@ from sglang.srt.disaggregation.base import BaseKVSender
 from sglang.srt.disaggregation.decode_schedule_batch_mixin import (
     ScheduleBatchDisaggregationDecodeMixin,
 )
+from sglang.srt.distributed.parallel_state import get_tensor_model_parallel_rank
 from sglang.srt.layers.multimodal import gpu_tensor_hash
 from sglang.srt.mem_cache.base_prefix_cache import BasePrefixCache
 from sglang.srt.mem_cache.chunk_cache import ChunkCache
@@ -60,7 +62,7 @@ from sglang.srt.model_executor.forward_batch_info import CaptureHiddenMode, Forw
 from sglang.srt.sampling.sampling_batch_info import SamplingBatchInfo
 from sglang.srt.sampling.sampling_params import SamplingParams
 from sglang.srt.server_args import ServerArgs
-from sglang.srt.utils import flatten_nested_list,
+from sglang.srt.utils import flatten_nested_list, support_triton
 
 if TYPE_CHECKING:
     from sglang.srt.speculative.eagle_utils import EagleDraftInput, EagleVerifyInput
@@ -68,32 +70,38 @@ if TYPE_CHECKING:
 
 INIT_INCREMENTAL_DETOKENIZATION_OFFSET = 5
 
+GLOBAL_SERVER_ARGS_KEYS = [
+    "attention_backend",
+    "debug_tensor_dump_inject",
+    "debug_tensor_dump_output_folder",
+    "chunked_prefill_size",
+    "deepep_mode",
+    "device",
+    "disable_chunked_prefix_cache",
+    "disable_radix_cache",
+    "enable_deepep_moe",
+    "enable_dp_attention",
+    "enable_two_batch_overlap",
+    "enable_dp_lm_head",
+    "enable_ep_moe",
+    "deepep_config",
+    "enable_nan_detection",
+    "flashinfer_mla_disable_ragged",
+    "max_micro_batch_size",
+    "moe_dense_tp_size",
+    "ep_dispatch_algorithm",
+    "disable_shared_experts_fusion",
+    "sampling_backend",
+    "speculative_accept_threshold_acc",
+    "speculative_accept_threshold_single",
+    "torchao_config",
+    "triton_attention_reduce_in_fp32",
+    "ep_num_redundant_experts",
+    "mm_attention_backend",
+]
+
 # Put some global args for easy access
-global_server_args_dict = {
-    "attention_backend": ServerArgs.attention_backend,
-    "chunked_prefill_size": ServerArgs.chunked_prefill_size,
-    "deepep_mode": ServerArgs.deepep_mode,
-    "device": ServerArgs.device,
-    "disable_chunked_prefix_cache": ServerArgs.disable_chunked_prefix_cache,
-    "disable_radix_cache": ServerArgs.disable_radix_cache,
-    "enable_deepep_moe": ServerArgs.enable_deepep_moe,
-    "enable_dp_attention": ServerArgs.enable_dp_attention,
-    "enable_dp_lm_head": ServerArgs.enable_dp_lm_head,
-    "enable_ep_moe": ServerArgs.enable_ep_moe,
-    "deepep_config": ServerArgs.deepep_config,
-    "enable_nan_detection": ServerArgs.enable_nan_detection,
-    "flashinfer_mla_disable_ragged": ServerArgs.flashinfer_mla_disable_ragged,
-    "max_micro_batch_size": ServerArgs.max_micro_batch_size,
-    "moe_dense_tp_size": ServerArgs.moe_dense_tp_size,
-    "ep_dispatch_algorithm": ServerArgs.ep_dispatch_algorithm,
-    "n_share_experts_fusion": ServerArgs.n_share_experts_fusion,
-    "sampling_backend": ServerArgs.sampling_backend,
-    "speculative_accept_threshold_acc": ServerArgs.speculative_accept_threshold_acc,
-    "speculative_accept_threshold_single": ServerArgs.speculative_accept_threshold_single,
-    "torchao_config": ServerArgs.torchao_config,
-    "triton_attention_reduce_in_fp32": ServerArgs.triton_attention_reduce_in_fp32,
-    "ep_num_redundant_experts": ServerArgs.ep_num_redundant_experts,
-}
+global_server_args_dict = {k: getattr(ServerArgs, k) for k in GLOBAL_SERVER_ARGS_KEYS}
 
 logger = logging.getLogger(__name__)
 
```
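
The hand-maintained dict of server-arg defaults becomes a list of key names plus one dict comprehension that reads the class-level defaults off `ServerArgs` with `getattr`, so exposing another globally visible flag is a one-line change. The same pattern in isolation, with a hypothetical dataclass standing in for sglang's `ServerArgs`:

```python
from dataclasses import dataclass


@dataclass
class DemoServerArgs:
    # Hypothetical stand-in for sglang's ServerArgs; only the pattern matters.
    attention_backend: str = "flashinfer"
    chunked_prefill_size: int = 8192
    enable_two_batch_overlap: bool = False


DEMO_KEYS = ["attention_backend", "chunked_prefill_size", "enable_two_batch_overlap"]

# Same trick as the diff: pull class-level defaults by name instead of
# repeating "key: DemoServerArgs.key" for every entry.
demo_args_dict = {k: getattr(DemoServerArgs, k) for k in DEMO_KEYS}
print(demo_args_dict)
# {'attention_backend': 'flashinfer', 'chunked_prefill_size': 8192, 'enable_two_batch_overlap': False}
```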
```diff
@@ -185,7 +193,7 @@ class MultimodalDataItem:
     # the real data, pixel_values or audio_features
     # data: Union[List[torch.Tensor], List[np.ndarray]]
     pixel_values: Union[torch.Tensor, np.ndarray] = None
-
+    image_grid_thw: Union[torch.Tensor, np.ndarray] = None
     video_grid_thws: Union[torch.Tensor, np.ndarray] = None
 
     image_emb_mask: Optional[torch.Tensor] = None
@@ -195,6 +203,9 @@ class MultimodalDataItem:
     # [num_images, (n, w, h)]
     tgt_size: Tuple[int, int] = None
 
+    # kimi-vl related
+    image_grid_hws: Optional[List[torch.Tensor]] = None
+
     audio_features: Union[torch.Tensor, np.ndarray] = None
     audio_feature_lens: Optional[List[torch.Tensor]] = None
     audio_offsets: Optional[List[Tuple[int, int]]] = None
@@ -440,6 +451,7 @@ class Req:
         bootstrap_host: Optional[str] = None,
         bootstrap_port: Optional[int] = None,
         bootstrap_room: Optional[int] = None,
+        data_parallel_rank: Optional[int] = None,
     ):
         # Input and output info
         self.rid = rid
@@ -594,6 +606,9 @@ class Req:
         self.bootstrap_room: Optional[int] = bootstrap_room
         self.disagg_kv_sender: Optional[BaseKVSender] = None
 
+        # For data parallel rank routing
+        self.data_parallel_rank: Optional[int] = data_parallel_rank
+
         # the start index of the sent kv cache
         # We want to send it chunk by chunk for chunked prefill.
         # After every chunk forward, we do the following:
@@ -770,6 +785,16 @@ class Req:
         logger.info(f"{prefix}: {self.time_stats}")
         self.has_log_time_stats = True
 
+    def set_finish_with_abort(self, error_msg: str):
+        if get_tensor_model_parallel_rank() == 0:
+            logger.error(f"{error_msg}, {self.rid=}")
+        self.multimodal_inputs = None
+        self.grammar = None
+        self.origin_input_ids = [0]  # set it to one token to skip the long prefill
+        self.finished_reason = FINISH_ABORT(
+            error_msg, HTTPStatus.BAD_REQUEST, "BadRequestError"
+        )
+
     def __repr__(self):
         return (
             f"Req(rid={self.rid}, "
```
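
`set_finish_with_abort` gives `Req` a single place to fail a bad request: it logs once (only on tensor-parallel rank 0), drops heavyweight state (multimodal inputs, grammar), shrinks the prompt to one token so no long prefill is wasted on a request that is already doomed, and records a BAD_REQUEST finish reason. A toy illustration of the same idea with stand-in types (`FINISH_ABORT` and `Req` are sglang classes; nothing below is their real implementation):

```python
from dataclasses import dataclass
from http import HTTPStatus
from typing import List, Optional


@dataclass
class AbortReason:
    # Stand-in for sglang's FINISH_ABORT finish-reason object.
    message: str
    status_code: HTTPStatus
    err_type: str


@dataclass
class ToyReq:
    rid: str
    origin_input_ids: List[int]
    grammar: Optional[object] = None
    multimodal_inputs: Optional[object] = None
    finished_reason: Optional[AbortReason] = None

    def set_finish_with_abort(self, error_msg: str) -> None:
        # Free heavyweight state and keep one token so prefill is effectively a no-op.
        self.multimodal_inputs = None
        self.grammar = None
        self.origin_input_ids = [0]
        self.finished_reason = AbortReason(error_msg, HTTPStatus.BAD_REQUEST, "BadRequestError")


req = ToyReq(rid="r1", origin_input_ids=list(range(4096)))
req.set_finish_with_abort("image decoding failed")
print(len(req.origin_input_ids), req.finished_reason.status_code)  # 1 HTTPStatus.BAD_REQUEST
```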
sglang/srt/managers/schedule_batch.py (continued)

```diff
@@ -831,6 +856,8 @@ class ScheduleBatch(ScheduleBatchDisaggregationDecodeMixin):
     global_num_tokens: Optional[List[int]] = None
     global_num_tokens_for_logprob: Optional[List[int]] = None
     can_run_dp_cuda_graph: bool = False
+    tbo_split_seq_index: Optional[int] = None
+    global_forward_mode: Optional[ForwardMode] = None
 
     # For processing logprobs
     return_logprob: bool = False
@@ -1254,7 +1281,7 @@
         self.extend_input_logprob_token_ids = extend_input_logprob_token_ids
 
         # Write to req_to_token_pool
-        if global_server_args_dict
+        if support_triton(global_server_args_dict.get("attention_backend")):
             # TODO: some tensors can be reused for ForwardBatchInfo (e.g., extend_lens, cumsum_start)
 
             write_req_to_token_pool_triton[(bs,)](
@@ -1318,7 +1345,9 @@
         page_size = self.token_to_kv_pool_allocator.page_size
         if page_size == 1:
             return len(self.reqs)
-
+        # In the decoding phase, the length of a request's KV cache should be
+        # the total length of the request minus 1
+        return sum(1 for req in self.reqs if (req.seqlen - 1) % page_size == 0)
 
     def check_decode_mem(self, buf_multiplier=1):
         tokens_required = (
```
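
The new return statement counts the requests that will need a fresh KV-cache page on the next decode step: during decode the cache already holds `seqlen - 1` tokens, so a request has just filled its current page exactly when that count is a multiple of `page_size`. A quick worked check of the arithmetic (`page_size = 4` is only an example value, not an sglang default):

```python
# Hedged illustration of the page-count arithmetic above, not sglang code.
page_size = 4


def needs_new_page(seqlen: int) -> bool:
    # The KV cache holds seqlen - 1 tokens during decode; a new page is needed
    # exactly when that count is a multiple of the page size.
    return (seqlen - 1) % page_size == 0


for seqlen in range(1, 11):
    print(seqlen, needs_new_page(seqlen))
# seqlen=1 -> True (cache empty, first page), seqlen=5 -> True (4 tokens fill page 1),
# seqlen=9 -> True (8 tokens fill page 2); everything else -> False.
```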
sglang/srt/managers/schedule_batch.py (continued)

```diff
@@ -1607,7 +1636,9 @@
         if self.spec_info:
             self.spec_info.merge_batch(other.spec_info)
 
-    def get_model_worker_batch(
+    def get_model_worker_batch(
+        self, seq_lens_cpu_cache: Optional[torch.Tensor] = None
+    ) -> ModelWorkerBatch:
         if self.forward_mode.is_decode_or_idle():
             extend_seq_lens = extend_prefix_lens = extend_logprob_start_lens = None
         else:
@@ -1617,15 +1648,20 @@
 
         # Create seq_lens_cpu when needed
         if (
-
+            global_server_args_dict["attention_backend"] == "fa3"
+            or (
                 global_server_args_dict["use_mla_backend"]
                 and global_server_args_dict["attention_backend"] == "flashinfer"
             )
             or global_server_args_dict["attention_backend"] == "flashmla"
-            or global_server_args_dict["attention_backend"] == "fa3"
             or global_server_args_dict["attention_backend"] == "cutlass_mla"
+            or global_server_args_dict["enable_two_batch_overlap"]
         ):
-            seq_lens_cpu =
+            seq_lens_cpu = (
+                seq_lens_cpu_cache
+                if seq_lens_cpu_cache is not None
+                else self.seq_lens.cpu()
+            )
         else:
             seq_lens_cpu = None
 
@@ -1644,6 +1680,7 @@
             req_pool_indices=self.req_pool_indices,
             seq_lens=self.seq_lens,
             out_cache_loc=self.out_cache_loc,
+            seq_lens_cpu=seq_lens_cpu,
             seq_lens_sum=self.seq_lens_sum,
             return_logprob=self.return_logprob,
             top_logprobs_nums=self.top_logprobs_nums,
@@ -1651,7 +1688,8 @@
             global_num_tokens=self.global_num_tokens,
             global_num_tokens_for_logprob=self.global_num_tokens_for_logprob,
             can_run_dp_cuda_graph=self.can_run_dp_cuda_graph,
-
+            tbo_split_seq_index=self.tbo_split_seq_index,
+            global_forward_mode=self.global_forward_mode,
             extend_num_tokens=self.extend_num_tokens,
             extend_seq_lens=extend_seq_lens,
             extend_prefix_lens=extend_prefix_lens,
@@ -1713,11 +1751,11 @@ class ModelWorkerBatch:
     req_pool_indices: torch.Tensor
     # The sequence length
     seq_lens: torch.Tensor
-    seq_lens_cpu: Optional[torch.Tensor]
     # The indices of output tokens in the token_to_kv_pool_allocator
     out_cache_loc: torch.Tensor
 
-    # The
+    # The sequence length tensor on CPU
+    seq_lens_cpu: Optional[torch.Tensor]
     seq_lens_sum: int
 
     # For logprob
@@ -1729,6 +1767,8 @@ class ModelWorkerBatch:
     global_num_tokens: Optional[List[int]]
     global_num_tokens_for_logprob: Optional[List[int]]
     can_run_dp_cuda_graph: bool
+    tbo_split_seq_index: Optional[int]
+    global_forward_mode: Optional[ForwardMode]
 
     # For extend
     extend_num_tokens: Optional[int]
```
```diff
@@ -1802,10 +1842,72 @@ def write_req_to_token_pool_triton(
     )
 
 
-
-
+def get_last_loc(
+    req_to_token: torch.Tensor,
+    req_pool_indices_tensor: torch.Tensor,
+    prefix_lens_tensor: torch.Tensor,
+) -> torch.Tensor:
+    if global_server_args_dict["attention_backend"] != "torch_native":
+        impl = get_last_loc_triton
+    else:
+        impl = get_last_loc_torch
+
+    return impl(req_to_token, req_pool_indices_tensor, prefix_lens_tensor)
+
+
+def get_last_loc_torch(
+    req_to_token: torch.Tensor,
+    req_pool_indices_tensor: torch.Tensor,
+    prefix_lens_tensor: torch.Tensor,
+) -> torch.Tensor:
     return torch.where(
         prefix_lens_tensor > 0,
         req_to_token[req_pool_indices_tensor, prefix_lens_tensor - 1],
         torch.full_like(prefix_lens_tensor, -1),
     )
+
+
+@triton.jit
+def get_last_loc_kernel(
+    req_to_token,
+    req_pool_indices_tensor,
+    prefix_lens_tensor,
+    result,
+    num_tokens,
+    req_to_token_stride,
+    BLOCK_SIZE: tl.constexpr,
+):
+    pid = tl.program_id(0)
+    offset = tl.arange(0, BLOCK_SIZE) + pid * BLOCK_SIZE
+    mask = offset < num_tokens
+
+    prefix_lens = tl.load(prefix_lens_tensor + offset, mask=mask, other=0)
+    req_pool_indices = tl.load(req_pool_indices_tensor + offset, mask=mask, other=0)
+
+    token_mask = prefix_lens > 0
+    token_index = req_pool_indices * req_to_token_stride + (prefix_lens - 1)
+    tokens = tl.load(req_to_token + token_index, mask=token_mask, other=-1)
+
+    tl.store(result + offset, tokens, mask=mask)
+
+
+def get_last_loc_triton(
+    req_to_token: torch.Tensor,
+    req_pool_indices_tensor: torch.Tensor,
+    prefix_lens_tensor: torch.Tensor,
+) -> torch.Tensor:
+    BLOCK_SIZE = 256
+    num_tokens = prefix_lens_tensor.shape[0]
+    result = torch.empty_like(prefix_lens_tensor)
+    grid = (triton.cdiv(num_tokens, BLOCK_SIZE),)
+
+    get_last_loc_kernel[grid](
+        req_to_token,
+        req_pool_indices_tensor,
+        prefix_lens_tensor,
+        result,
+        num_tokens,
+        req_to_token.stride(0),
+        BLOCK_SIZE,
+    )
+    return result
```