sglang 0.4.6.post5__py3-none-any.whl → 0.4.7.post1__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- sglang/__init__.py +2 -0
- sglang/api.py +7 -0
- sglang/bench_offline_throughput.py +10 -4
- sglang/bench_one_batch_server.py +67 -11
- sglang/bench_serving.py +86 -75
- sglang/lang/backend/runtime_endpoint.py +24 -1
- sglang/lang/interpreter.py +40 -1
- sglang/lang/ir.py +27 -0
- sglang/math_utils.py +8 -0
- sglang/profiler.py +167 -0
- sglang/srt/_custom_ops.py +34 -0
- sglang/srt/configs/internvl.py +8 -12
- sglang/srt/configs/model_config.py +33 -1
- sglang/srt/constrained/base_grammar_backend.py +5 -2
- sglang/srt/constrained/llguidance_backend.py +9 -8
- sglang/srt/constrained/outlines_backend.py +5 -4
- sglang/srt/constrained/xgrammar_backend.py +18 -18
- sglang/srt/conversation.py +52 -8
- sglang/srt/custom_op.py +38 -3
- sglang/srt/debug_utils.py +74 -0
- sglang/srt/disaggregation/base/__init__.py +1 -1
- sglang/srt/disaggregation/base/conn.py +25 -11
- sglang/srt/disaggregation/common/__init__.py +5 -0
- sglang/srt/disaggregation/common/conn.py +407 -0
- sglang/srt/disaggregation/common/utils.py +42 -0
- sglang/srt/disaggregation/decode.py +261 -52
- sglang/srt/disaggregation/fake/__init__.py +1 -1
- sglang/srt/disaggregation/fake/conn.py +16 -9
- sglang/srt/disaggregation/kv_events.py +60 -5
- sglang/srt/disaggregation/launch_lb.py +140 -0
- sglang/srt/disaggregation/mini_lb.py +29 -48
- sglang/srt/disaggregation/mooncake/__init__.py +1 -1
- sglang/srt/disaggregation/mooncake/conn.py +446 -149
- sglang/srt/disaggregation/mooncake/transfer_engine.py +32 -16
- sglang/srt/disaggregation/nixl/__init__.py +6 -1
- sglang/srt/disaggregation/nixl/conn.py +134 -437
- sglang/srt/disaggregation/prefill.py +130 -43
- sglang/srt/disaggregation/utils.py +127 -86
- sglang/srt/distributed/device_communicators/pymscclpp.py +315 -0
- sglang/srt/distributed/parallel_state.py +52 -5
- sglang/srt/entrypoints/EngineBase.py +6 -0
- sglang/srt/entrypoints/engine.py +116 -5
- sglang/srt/entrypoints/http_server.py +28 -4
- sglang/srt/eplb_simulator/__init__.py +1 -0
- sglang/srt/eplb_simulator/reader.py +51 -0
- sglang/srt/function_call/base_format_detector.py +138 -86
- sglang/srt/function_call/deepseekv3_detector.py +54 -6
- sglang/srt/function_call/ebnf_composer.py +33 -19
- sglang/srt/function_call/function_call_parser.py +27 -0
- sglang/srt/function_call/llama32_detector.py +33 -14
- sglang/srt/function_call/mistral_detector.py +73 -26
- sglang/srt/function_call/pythonic_detector.py +86 -20
- sglang/srt/function_call/qwen25_detector.py +64 -10
- sglang/srt/function_call/utils.py +17 -0
- sglang/srt/hf_transformers_utils.py +4 -0
- sglang/srt/layers/activation.py +19 -0
- sglang/srt/layers/attention/aiter_backend.py +503 -125
- sglang/srt/layers/attention/base_attn_backend.py +4 -0
- sglang/srt/layers/attention/cutlass_mla_backend.py +40 -34
- sglang/srt/layers/attention/flashattention_backend.py +137 -63
- sglang/srt/layers/attention/flashinfer_backend.py +46 -3
- sglang/srt/layers/attention/flashinfer_mla_backend.py +59 -25
- sglang/srt/layers/attention/flashmla_backend.py +2 -10
- sglang/srt/layers/attention/intel_amx_backend.py +128 -0
- sglang/srt/layers/attention/tbo_backend.py +232 -0
- sglang/srt/layers/attention/torch_native_backend.py +3 -0
- sglang/srt/layers/attention/triton_backend.py +304 -65
- sglang/srt/layers/attention/triton_ops/decode_attention.py +2 -7
- sglang/srt/layers/attention/triton_ops/extend_attention.py +12 -4
- sglang/srt/layers/attention/vision.py +51 -24
- sglang/srt/layers/communicator.py +281 -197
- sglang/srt/layers/dp_attention.py +6 -5
- sglang/srt/layers/layernorm.py +30 -19
- sglang/srt/layers/linear.py +0 -4
- sglang/srt/layers/logits_processor.py +0 -12
- sglang/srt/layers/moe/cutlass_moe.py +170 -7
- sglang/srt/layers/moe/cutlass_moe_params.py +169 -0
- sglang/srt/layers/moe/ep_moe/kernels.py +33 -11
- sglang/srt/layers/moe/ep_moe/layer.py +136 -72
- sglang/srt/layers/moe/ep_moe/token_dispatcher.py +24 -45
- sglang/srt/layers/moe/fused_moe_native.py +4 -0
- sglang/srt/layers/moe/fused_moe_triton/configs/triton_3_2_0/E=257,N=128,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8,block_shape=[128, 128].json +146 -0
- sglang/srt/layers/moe/fused_moe_triton/configs/triton_3_2_0/E=257,N=128,device_name=NVIDIA_H20,dtype=fp8_w8a8,block_shape=[128, 128].json +146 -0
- sglang/srt/layers/moe/fused_moe_triton/configs/triton_3_2_0/E=257,N=256,device_name=NVIDIA_B200,dtype=fp8_w8a8,block_shape=[128, 128].json +146 -0
- sglang/srt/layers/moe/fused_moe_triton/configs/triton_3_2_0/E=257,N=256,device_name=NVIDIA_H20,dtype=fp8_w8a8,block_shape=[128, 128].json +146 -0
- sglang/srt/layers/moe/fused_moe_triton/configs/triton_3_2_0/E=257,N=256,device_name=NVIDIA_H200,dtype=fp8_w8a8,block_shape=[128, 128].json +146 -0
- sglang/srt/layers/moe/fused_moe_triton/configs/triton_3_2_0/E=8,N=7168,device_name=NVIDIA_H100_80GB_HBM3.json +146 -0
- sglang/srt/layers/moe/fused_moe_triton/configs/triton_3_3_1/E=257,N=256,device_name=NVIDIA_B200,dtype=fp8_w8a8,block_shape=[128, 128].json +146 -0
- sglang/srt/layers/moe/fused_moe_triton/configs/triton_3_3_1/E=8,N=7168,device_name=NVIDIA_H100_80GB_HBM3.json +146 -0
- sglang/srt/layers/moe/fused_moe_triton/fused_moe.py +221 -29
- sglang/srt/layers/moe/fused_moe_triton/layer.py +34 -4
- sglang/srt/layers/moe/topk.py +60 -26
- sglang/srt/layers/multimodal.py +3 -3
- sglang/srt/layers/pooler.py +56 -0
- sglang/srt/layers/quantization/__init__.py +3 -2
- sglang/srt/layers/quantization/blockwise_int8.py +3 -0
- sglang/srt/layers/quantization/compressed_tensors/compressed_tensors_moe.py +5 -0
- sglang/srt/layers/quantization/deep_gemm_wrapper/__init__.py +1 -0
- sglang/srt/layers/quantization/{deep_gemm.py → deep_gemm_wrapper/compile_utils.py} +69 -127
- sglang/srt/layers/quantization/deep_gemm_wrapper/configurer.py +32 -0
- sglang/srt/layers/quantization/deep_gemm_wrapper/entrypoint.py +110 -0
- sglang/srt/layers/quantization/fp8.py +28 -23
- sglang/srt/layers/quantization/fp8_kernel.py +156 -75
- sglang/srt/layers/quantization/fp8_utils.py +250 -69
- sglang/srt/layers/quantization/modelopt_quant.py +334 -7
- sglang/srt/layers/quantization/moe_wna16.py +3 -0
- sglang/srt/layers/quantization/w8a8_fp8.py +3 -0
- sglang/srt/layers/quantization/w8a8_int8.py +3 -0
- sglang/srt/layers/radix_attention.py +2 -3
- sglang/srt/layers/rotary_embedding.py +6 -12
- sglang/srt/layers/sampler.py +80 -79
- sglang/srt/layers/utils.py +6 -0
- sglang/srt/lora/layers.py +12 -15
- sglang/srt/lora/lora.py +49 -5
- sglang/srt/lora/lora_manager.py +98 -39
- sglang/srt/lora/mem_pool.py +28 -21
- sglang/srt/lora/utils.py +17 -13
- sglang/srt/managers/cache_controller.py +2 -1
- sglang/srt/managers/data_parallel_controller.py +13 -5
- sglang/srt/managers/eplb_algorithms/__init__.py +63 -0
- sglang/srt/managers/eplb_algorithms/deepseek.py +223 -0
- sglang/srt/managers/{deepseek_eplb.py → eplb_algorithms/deepseek_vec.py} +5 -7
- sglang/srt/managers/eplb_manager.py +55 -14
- sglang/srt/managers/expert_distribution.py +220 -46
- sglang/srt/managers/expert_location.py +110 -56
- sglang/srt/managers/expert_location_dispatch.py +23 -6
- sglang/srt/managers/io_struct.py +43 -8
- sglang/srt/managers/mm_utils.py +88 -38
- sglang/srt/managers/multimodal_processors/base_processor.py +190 -18
- sglang/srt/managers/multimodal_processors/gemma3.py +4 -31
- sglang/srt/managers/multimodal_processors/internvl.py +4 -0
- sglang/srt/managers/multimodal_processors/kimi_vl.py +15 -34
- sglang/srt/managers/multimodal_processors/minicpm.py +2 -1
- sglang/srt/managers/multimodal_processors/phi4mm.py +87 -0
- sglang/srt/managers/multimodal_processors/qwen_vl.py +22 -64
- sglang/srt/managers/multimodal_processors/vila.py +85 -0
- sglang/srt/managers/schedule_batch.py +173 -38
- sglang/srt/managers/scheduler.py +376 -127
- sglang/srt/managers/tokenizer_manager.py +163 -19
- sglang/srt/managers/utils.py +0 -4
- sglang/srt/mem_cache/chunk_cache.py +1 -0
- sglang/srt/mem_cache/hiradix_cache.py +4 -2
- sglang/srt/mem_cache/memory_pool.py +111 -407
- sglang/srt/mem_cache/memory_pool_host.py +380 -0
- sglang/srt/mem_cache/radix_cache.py +36 -12
- sglang/srt/metrics/collector.py +9 -0
- sglang/srt/model_executor/cuda_graph_runner.py +191 -113
- sglang/srt/model_executor/expert_location_updater.py +157 -22
- sglang/srt/model_executor/forward_batch_info.py +52 -22
- sglang/srt/model_executor/model_runner.py +102 -62
- sglang/srt/model_loader/loader.py +8 -1
- sglang/srt/model_loader/utils.py +67 -1
- sglang/srt/models/bert.py +113 -13
- sglang/srt/models/deepseek_nextn.py +1 -1
- sglang/srt/models/deepseek_v2.py +623 -290
- sglang/srt/models/gemma3_causal.py +7 -0
- sglang/srt/models/gemma3_mm.py +19 -14
- sglang/srt/models/idefics2.py +342 -0
- sglang/srt/models/internvl.py +46 -102
- sglang/srt/models/kimi_vl.py +4 -4
- sglang/srt/models/llama.py +1 -1
- sglang/srt/models/minicpmo.py +2 -5
- sglang/srt/models/minicpmv.py +3 -295
- sglang/srt/models/phi4mm.py +512 -0
- sglang/srt/models/qwen2.py +38 -9
- sglang/srt/models/qwen2_5_vl.py +3 -9
- sglang/srt/models/qwen2_eagle.py +4 -1
- sglang/srt/models/qwen2_moe.py +58 -191
- sglang/srt/models/qwen2_vl.py +3 -9
- sglang/srt/models/qwen3.py +41 -10
- sglang/srt/models/qwen3_moe.py +230 -191
- sglang/srt/models/registry.py +9 -1
- sglang/srt/models/roberta.py +117 -9
- sglang/srt/models/transformers.py +291 -0
- sglang/srt/models/vila.py +305 -0
- sglang/srt/openai_api/adapter.py +248 -28
- sglang/srt/openai_api/protocol.py +68 -3
- sglang/srt/openai_api/utils.py +172 -0
- sglang/srt/operations.py +37 -2
- sglang/srt/operations_strategy.py +200 -24
- sglang/srt/sampling/sampling_batch_info.py +37 -1
- sglang/srt/sampling/sampling_params.py +4 -1
- sglang/srt/server_args.py +381 -209
- sglang/srt/speculative/build_eagle_tree.py +9 -9
- sglang/srt/speculative/eagle_draft_cuda_graph_runner.py +12 -14
- sglang/srt/speculative/eagle_draft_extend_cuda_graph_runner.py +256 -0
- sglang/srt/speculative/eagle_utils.py +440 -200
- sglang/srt/speculative/eagle_worker.py +234 -63
- sglang/srt/two_batch_overlap.py +637 -0
- sglang/srt/utils.py +187 -7
- sglang/test/attention/test_prefix_chunk_info.py +2 -0
- sglang/test/runners.py +54 -10
- sglang/test/send_one.py +4 -0
- sglang/test/test_block_fp8.py +1 -0
- sglang/test/test_block_fp8_deep_gemm_blackwell.py +252 -0
- sglang/test/test_block_fp8_ep.py +1 -0
- sglang/test/test_cutlass_moe.py +3 -3
- sglang/test/test_fp4_moe.py +248 -0
- sglang/test/test_utils.py +82 -7
- sglang/utils.py +9 -0
- sglang/version.py +1 -1
- {sglang-0.4.6.post5.dist-info → sglang-0.4.7.post1.dist-info}/METADATA +17 -14
- {sglang-0.4.6.post5.dist-info → sglang-0.4.7.post1.dist-info}/RECORD +359 -321
- {sglang-0.4.6.post5.dist-info → sglang-0.4.7.post1.dist-info}/WHEEL +1 -1
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=1,N=14336,device_name=NVIDIA_A100-SXM4-80GB,dtype=int8_w8a16.json → triton_3_1_0/E=1,N=14336,device_name=NVIDIA_A100-SXM4-80GB,dtype=int8_w8a16.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=1,N=14336,device_name=NVIDIA_A100-SXM4-80GB.json → triton_3_1_0/E=1,N=14336,device_name=NVIDIA_A100-SXM4-80GB.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=1,N=1792,device_name=NVIDIA_A100-SXM4-80GB,dtype=int8_w8a16.json → triton_3_1_0/E=1,N=1792,device_name=NVIDIA_A100-SXM4-80GB,dtype=int8_w8a16.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=1,N=1792,device_name=NVIDIA_A100-SXM4-80GB.json → triton_3_1_0/E=1,N=1792,device_name=NVIDIA_A100-SXM4-80GB.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=1,N=3072,device_name=NVIDIA_A100-SXM4-80GB,dtype=int8_w8a16.json → triton_3_1_0/E=1,N=3072,device_name=NVIDIA_A100-SXM4-80GB,dtype=int8_w8a16.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=1,N=3072,device_name=NVIDIA_H100_80GB_HBM3,dtype=int8_w8a16.json → triton_3_1_0/E=1,N=3072,device_name=NVIDIA_H100_80GB_HBM3,dtype=int8_w8a16.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=1,N=3072,device_name=NVIDIA_H100_80GB_HBM3.json → triton_3_1_0/E=1,N=3072,device_name=NVIDIA_H100_80GB_HBM3.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=1,N=3584,device_name=NVIDIA_A100-SXM4-80GB,dtype=int8_w8a16.json → triton_3_1_0/E=1,N=3584,device_name=NVIDIA_A100-SXM4-80GB,dtype=int8_w8a16.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=1,N=3584,device_name=NVIDIA_A100-SXM4-80GB.json → triton_3_1_0/E=1,N=3584,device_name=NVIDIA_A100-SXM4-80GB.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=1,N=7168,device_name=NVIDIA_A100-SXM4-80GB,dtype=int8_w8a16.json → triton_3_1_0/E=1,N=7168,device_name=NVIDIA_A100-SXM4-80GB,dtype=int8_w8a16.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=1,N=7168,device_name=NVIDIA_A100-SXM4-80GB.json → triton_3_1_0/E=1,N=7168,device_name=NVIDIA_A100-SXM4-80GB.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=144,N=512,device_name=NVIDIA_H100_80GB_HBM3.json → triton_3_1_0/E=144,N=512,device_name=NVIDIA_H100_80GB_HBM3.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=16,N=1024,device_name=NVIDIA_H100_80GB_HBM3.json → triton_3_1_0/E=16,N=1024,device_name=NVIDIA_H100_80GB_HBM3.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=16,N=1024,device_name=NVIDIA_H200.json → triton_3_1_0/E=16,N=1024,device_name=NVIDIA_H200.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=16,N=1344,device_name=NVIDIA_A100-SXM4-40GB.json → triton_3_1_0/E=16,N=1344,device_name=NVIDIA_A100-SXM4-40GB.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=16,N=1344,device_name=NVIDIA_A100-SXM4-80GB.json → triton_3_1_0/E=16,N=1344,device_name=NVIDIA_A100-SXM4-80GB.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=16,N=1344,device_name=NVIDIA_H100_80GB_HBM3.json → triton_3_1_0/E=16,N=1344,device_name=NVIDIA_H100_80GB_HBM3.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=16,N=14336,device_name=NVIDIA_A100-SXM4-80GB,dtype=int8_w8a16.json → triton_3_1_0/E=16,N=14336,device_name=NVIDIA_A100-SXM4-80GB,dtype=int8_w8a16.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=16,N=14336,device_name=NVIDIA_A100-SXM4-80GB.json → triton_3_1_0/E=16,N=14336,device_name=NVIDIA_A100-SXM4-80GB.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=16,N=1792,device_name=NVIDIA_A100-SXM4-80GB,dtype=int8_w8a16.json → triton_3_1_0/E=16,N=1792,device_name=NVIDIA_A100-SXM4-80GB,dtype=int8_w8a16.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=16,N=1792,device_name=NVIDIA_A100-SXM4-80GB.json → triton_3_1_0/E=16,N=1792,device_name=NVIDIA_A100-SXM4-80GB.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=16,N=2048,device_name=NVIDIA_H100_80GB_HBM3.json → triton_3_1_0/E=16,N=2048,device_name=NVIDIA_H100_80GB_HBM3.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=16,N=2688,device_name=NVIDIA_A100-SXM4-80GB.json → triton_3_1_0/E=16,N=2688,device_name=NVIDIA_A100-SXM4-80GB.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=16,N=2688,device_name=NVIDIA_H100_80GB_HBM3.json → triton_3_1_0/E=16,N=2688,device_name=NVIDIA_H100_80GB_HBM3.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=16,N=3072,device_name=NVIDIA_A100-SXM4-80GB,dtype=int8_w8a16.json → triton_3_1_0/E=16,N=3072,device_name=NVIDIA_A100-SXM4-80GB,dtype=int8_w8a16.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=16,N=3072,device_name=NVIDIA_H100_80GB_HBM3,dtype=int8_w8a16.json → triton_3_1_0/E=16,N=3072,device_name=NVIDIA_H100_80GB_HBM3,dtype=int8_w8a16.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=16,N=3200,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8.json → triton_3_1_0/E=16,N=3200,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=16,N=3584,device_name=NVIDIA_A100-SXM4-80GB,dtype=int8_w8a16.json → triton_3_1_0/E=16,N=3584,device_name=NVIDIA_A100-SXM4-80GB,dtype=int8_w8a16.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=16,N=3584,device_name=NVIDIA_A100-SXM4-80GB.json → triton_3_1_0/E=16,N=3584,device_name=NVIDIA_A100-SXM4-80GB.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=16,N=6400,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8.json → triton_3_1_0/E=16,N=6400,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=16,N=7168,device_name=NVIDIA_A100-SXM4-80GB,dtype=int8_w8a16.json → triton_3_1_0/E=16,N=7168,device_name=NVIDIA_A100-SXM4-80GB,dtype=int8_w8a16.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=16,N=7168,device_name=NVIDIA_A100-SXM4-80GB.json → triton_3_1_0/E=16,N=7168,device_name=NVIDIA_A100-SXM4-80GB.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=16,N=7168,device_name=NVIDIA_H100_80GB_HBM3,dtype=int8_w8a16.json → triton_3_1_0/E=16,N=7168,device_name=NVIDIA_H100_80GB_HBM3,dtype=int8_w8a16.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=16,N=800,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8.json → triton_3_1_0/E=16,N=800,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=160,N=192,device_name=NVIDIA_A800-SXM4-80GB.json → triton_3_1_0/E=160,N=192,device_name=NVIDIA_A800-SXM4-80GB.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=20,N=2048,device_name=NVIDIA_H100_80GB_HBM3.json → triton_3_1_0/E=20,N=2048,device_name=NVIDIA_H100_80GB_HBM3.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=24,N=1024,device_name=NVIDIA_H100_80GB_HBM3.json → triton_3_1_0/E=24,N=1024,device_name=NVIDIA_H100_80GB_HBM3.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=256,N=128,device_name=NVIDIA_A100-SXM4-80GB,dtype=int8_w8a8,block_shape=[128, 128].json → triton_3_1_0/E=256,N=128,device_name=NVIDIA_A100-SXM4-80GB,dtype=int8_w8a8,block_shape=[128, 128].json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=256,N=128,device_name=NVIDIA_A100-SXM4-80GB,dtype=int8_w8a8.json → triton_3_1_0/E=256,N=128,device_name=NVIDIA_A100-SXM4-80GB,dtype=int8_w8a8.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=256,N=128,device_name=NVIDIA_A800-SXM4-80GB,dtype=int8_w8a8,block_shape=[128, 128].json → triton_3_1_0/E=256,N=128,device_name=NVIDIA_A800-SXM4-80GB,dtype=int8_w8a8,block_shape=[128, 128].json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=256,N=128,device_name=NVIDIA_A800-SXM4-80GB,dtype=int8_w8a8.json → triton_3_1_0/E=256,N=128,device_name=NVIDIA_A800-SXM4-80GB,dtype=int8_w8a8.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=256,N=128,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8,block_shape=[128, 128].json → triton_3_1_0/E=256,N=128,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8,block_shape=[128, 128].json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=256,N=128,device_name=NVIDIA_H20,block_shape=[128, 128].json → triton_3_1_0/E=256,N=128,device_name=NVIDIA_H20,block_shape=[128, 128].json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=256,N=128,device_name=NVIDIA_H20,dtype=fp8_w8a8,block_shape=[128, 128].json → triton_3_1_0/E=256,N=128,device_name=NVIDIA_H20,dtype=fp8_w8a8,block_shape=[128, 128].json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=256,N=128,device_name=NVIDIA_L20Y,dtype=fp8_w8a8,block_shape=[128, 128].json → triton_3_1_0/E=256,N=128,device_name=NVIDIA_L20Y,dtype=fp8_w8a8,block_shape=[128, 128].json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=256,N=256,device_name=AMD_Instinct_MI300X,dtype=fp8_w8a8,block_shape=[128, 128].json → triton_3_1_0/E=256,N=256,device_name=AMD_Instinct_MI300X,dtype=fp8_w8a8,block_shape=[128, 128].json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=256,N=256,device_name=AMD_Instinct_MI325X,dtype=fp8_w8a8,block_shape=[128, 128].json → triton_3_1_0/E=256,N=256,device_name=AMD_Instinct_MI325X,dtype=fp8_w8a8,block_shape=[128, 128].json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=256,N=256,device_name=AMD_Radeon_Graphics,dtype=fp8_w8a8,block_shape=[128, 128].json → triton_3_1_0/E=256,N=256,device_name=AMD_Radeon_Graphics,dtype=fp8_w8a8,block_shape=[128, 128].json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=256,N=256,device_name=NVIDIA_B200,dtype=fp8_w8a8,block_shape=[128, 128].json → triton_3_1_0/E=256,N=256,device_name=NVIDIA_B200,dtype=fp8_w8a8,block_shape=[128, 128].json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=256,N=256,device_name=NVIDIA_H20,dtype=int8_w8a8,block_shape=[128, 128].json → triton_3_1_0/E=256,N=256,device_name=NVIDIA_H20,dtype=int8_w8a8,block_shape=[128, 128].json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=256,N=256,device_name=NVIDIA_H200,dtype=fp8_w8a8,block_shape=[128, 128].json → triton_3_1_0/E=256,N=256,device_name=NVIDIA_H200,dtype=fp8_w8a8,block_shape=[128, 128].json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=256,N=64,device_name=NVIDIA_A800-SXM4-80GB.json → triton_3_1_0/E=256,N=64,device_name=NVIDIA_A800-SXM4-80GB.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=256,N=64,device_name=NVIDIA_L20,dtype=int8_w8a8.json → triton_3_1_0/E=256,N=64,device_name=NVIDIA_L20,dtype=int8_w8a8.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=256,N=64,device_name=NVIDIA_L40S,dtype=int8_w8a8.json → triton_3_1_0/E=256,N=64,device_name=NVIDIA_L40S,dtype=int8_w8a8.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=64,N=1024,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8,block_shape=[128, 128].json → triton_3_1_0/E=64,N=1024,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8,block_shape=[128, 128].json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=64,N=1280,device_name=NVIDIA_A100-SXM4-80GB.json → triton_3_1_0/E=64,N=1280,device_name=NVIDIA_A100-SXM4-80GB.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=64,N=1280,device_name=NVIDIA_A800-SXM4-80GB.json → triton_3_1_0/E=64,N=1280,device_name=NVIDIA_A800-SXM4-80GB.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=64,N=1280,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8.json → triton_3_1_0/E=64,N=1280,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=64,N=1280,device_name=NVIDIA_H100_80GB_HBM3.json → triton_3_1_0/E=64,N=1280,device_name=NVIDIA_H100_80GB_HBM3.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=64,N=1280,device_name=NVIDIA_H200,dtype=fp8_w8a8.json → triton_3_1_0/E=64,N=1280,device_name=NVIDIA_H200,dtype=fp8_w8a8.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=64,N=1280,device_name=NVIDIA_H200.json → triton_3_1_0/E=64,N=1280,device_name=NVIDIA_H200.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=64,N=2560,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8.json → triton_3_1_0/E=64,N=2560,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=64,N=2560,device_name=NVIDIA_H200,dtype=fp8_w8a8.json → triton_3_1_0/E=64,N=2560,device_name=NVIDIA_H200,dtype=fp8_w8a8.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=64,N=2560,device_name=NVIDIA_H200.json → triton_3_1_0/E=64,N=2560,device_name=NVIDIA_H200.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=64,N=320,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8.json → triton_3_1_0/E=64,N=320,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=64,N=320,device_name=NVIDIA_H100_80GB_HBM3.json → triton_3_1_0/E=64,N=320,device_name=NVIDIA_H100_80GB_HBM3.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=64,N=320,device_name=NVIDIA_H200,dtype=fp8_w8a8.json → triton_3_1_0/E=64,N=320,device_name=NVIDIA_H200,dtype=fp8_w8a8.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=64,N=320,device_name=NVIDIA_H200.json → triton_3_1_0/E=64,N=320,device_name=NVIDIA_H200.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=64,N=512,device_name=NVIDIA_H200,dtype=fp8_w8a8,block_shape=[128, 128].json → triton_3_1_0/E=64,N=512,device_name=NVIDIA_H200,dtype=fp8_w8a8,block_shape=[128, 128].json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=64,N=640,device_name=NVIDIA_A100-SXM4-80GB.json → triton_3_1_0/E=64,N=640,device_name=NVIDIA_A100-SXM4-80GB.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=64,N=640,device_name=NVIDIA_A800-SXM4-80GB.json → triton_3_1_0/E=64,N=640,device_name=NVIDIA_A800-SXM4-80GB.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=64,N=640,device_name=NVIDIA_GeForce_RTX_4090,dtype=fp8_w8a8.json → triton_3_1_0/E=64,N=640,device_name=NVIDIA_GeForce_RTX_4090,dtype=fp8_w8a8.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=64,N=640,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8.json → triton_3_1_0/E=64,N=640,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=64,N=640,device_name=NVIDIA_H100_80GB_HBM3.json → triton_3_1_0/E=64,N=640,device_name=NVIDIA_H100_80GB_HBM3.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=64,N=640,device_name=NVIDIA_H200,dtype=fp8_w8a8.json → triton_3_1_0/E=64,N=640,device_name=NVIDIA_H200,dtype=fp8_w8a8.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=64,N=640,device_name=NVIDIA_H200.json → triton_3_1_0/E=64,N=640,device_name=NVIDIA_H200.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=8,N=14336,device_name=AMD_Instinct_MI300X.json → triton_3_1_0/E=8,N=14336,device_name=AMD_Instinct_MI300X.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=8,N=14336,device_name=AMD_Instinct_MI325X.json → triton_3_1_0/E=8,N=14336,device_name=AMD_Instinct_MI325X.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=8,N=14336,device_name=AMD_Radeon_Graphics.json → triton_3_1_0/E=8,N=14336,device_name=AMD_Radeon_Graphics.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=8,N=14336,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8.json → triton_3_1_0/E=8,N=14336,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=8,N=14336,device_name=NVIDIA_H200,dtype=fp8_w8a8.json → triton_3_1_0/E=8,N=14336,device_name=NVIDIA_H200,dtype=fp8_w8a8.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=8,N=14336,device_name=NVIDIA_H200.json → triton_3_1_0/E=8,N=14336,device_name=NVIDIA_H200.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=8,N=1792,device_name=AMD_Instinct_MI300X.json → triton_3_1_0/E=8,N=1792,device_name=AMD_Instinct_MI300X.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=8,N=1792,device_name=AMD_Instinct_MI325X.json → triton_3_1_0/E=8,N=1792,device_name=AMD_Instinct_MI325X.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=8,N=1792,device_name=AMD_Radeon_Graphics.json → triton_3_1_0/E=8,N=1792,device_name=AMD_Radeon_Graphics.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=8,N=1792,device_name=NVIDIA_A100-SXM4-40GB.json → triton_3_1_0/E=8,N=1792,device_name=NVIDIA_A100-SXM4-40GB.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=8,N=1792,device_name=NVIDIA_A100-SXM4-80GB.json → triton_3_1_0/E=8,N=1792,device_name=NVIDIA_A100-SXM4-80GB.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=8,N=1792,device_name=NVIDIA_H100_80GB_HBM3.json → triton_3_1_0/E=8,N=1792,device_name=NVIDIA_H100_80GB_HBM3.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=8,N=1792,device_name=NVIDIA_H200,dtype=fp8_w8a8.json → triton_3_1_0/E=8,N=1792,device_name=NVIDIA_H200,dtype=fp8_w8a8.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=8,N=1792,device_name=NVIDIA_H200.json → triton_3_1_0/E=8,N=1792,device_name=NVIDIA_H200.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=8,N=2048,device_name=NVIDIA_A100-SXM4-80GB.json → triton_3_1_0/E=8,N=2048,device_name=NVIDIA_A100-SXM4-80GB.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=8,N=2048,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8.json → triton_3_1_0/E=8,N=2048,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=8,N=2048,device_name=NVIDIA_H100_80GB_HBM3.json → triton_3_1_0/E=8,N=2048,device_name=NVIDIA_H100_80GB_HBM3.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=8,N=2048,device_name=NVIDIA_H200,dtype=fp8_w8a8.json → triton_3_1_0/E=8,N=2048,device_name=NVIDIA_H200,dtype=fp8_w8a8.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=8,N=2048,device_name=NVIDIA_H200.json → triton_3_1_0/E=8,N=2048,device_name=NVIDIA_H200.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=8,N=3584,device_name=AMD_Instinct_MI300X.json → triton_3_1_0/E=8,N=3584,device_name=AMD_Instinct_MI300X.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=8,N=3584,device_name=AMD_Instinct_MI325X.json → triton_3_1_0/E=8,N=3584,device_name=AMD_Instinct_MI325X.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=8,N=3584,device_name=AMD_Radeon_Graphics.json → triton_3_1_0/E=8,N=3584,device_name=AMD_Radeon_Graphics.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=8,N=3584,device_name=NVIDIA_A100-SXM4-40GB.json → triton_3_1_0/E=8,N=3584,device_name=NVIDIA_A100-SXM4-40GB.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=8,N=3584,device_name=NVIDIA_A100-SXM4-80GB.json → triton_3_1_0/E=8,N=3584,device_name=NVIDIA_A100-SXM4-80GB.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=8,N=3584,device_name=NVIDIA_GeForce_RTX_4090,dtype=fp8_w8a8.json → triton_3_1_0/E=8,N=3584,device_name=NVIDIA_GeForce_RTX_4090,dtype=fp8_w8a8.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=8,N=3584,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8.json → triton_3_1_0/E=8,N=3584,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=8,N=3584,device_name=NVIDIA_H100_80GB_HBM3.json → triton_3_1_0/E=8,N=3584,device_name=NVIDIA_H100_80GB_HBM3.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=8,N=3584,device_name=NVIDIA_H200,dtype=fp8_w8a8.json → triton_3_1_0/E=8,N=3584,device_name=NVIDIA_H200,dtype=fp8_w8a8.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=8,N=3584,device_name=NVIDIA_H200.json → triton_3_1_0/E=8,N=3584,device_name=NVIDIA_H200.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=8,N=3584,device_name=NVIDIA_L40S.json → triton_3_1_0/E=8,N=3584,device_name=NVIDIA_L40S.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=8,N=4096,device_name=AMD_Instinct_MI300X,dtype=fp8_w8a8.json → triton_3_1_0/E=8,N=4096,device_name=AMD_Instinct_MI300X,dtype=fp8_w8a8.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=8,N=4096,device_name=AMD_Instinct_MI325X,dtype=fp8_w8a8.json → triton_3_1_0/E=8,N=4096,device_name=AMD_Instinct_MI325X,dtype=fp8_w8a8.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=8,N=4096,device_name=AMD_Radeon_Graphics,dtype=fp8_w8a8.json → triton_3_1_0/E=8,N=4096,device_name=AMD_Radeon_Graphics,dtype=fp8_w8a8.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=8,N=4096,device_name=NVIDIA_A100-SXM4-80GB.json → triton_3_1_0/E=8,N=4096,device_name=NVIDIA_A100-SXM4-80GB.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=8,N=4096,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8.json → triton_3_1_0/E=8,N=4096,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=8,N=4096,device_name=NVIDIA_H100_80GB_HBM3.json → triton_3_1_0/E=8,N=4096,device_name=NVIDIA_H100_80GB_HBM3.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=8,N=4096,device_name=NVIDIA_H200,dtype=fp8_w8a8.json → triton_3_1_0/E=8,N=4096,device_name=NVIDIA_H200,dtype=fp8_w8a8.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=8,N=4096,device_name=NVIDIA_H200.json → triton_3_1_0/E=8,N=4096,device_name=NVIDIA_H200.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=8,N=7168,device_name=AMD_Instinct_MI300X.json → triton_3_1_0/E=8,N=7168,device_name=AMD_Instinct_MI300X.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=8,N=7168,device_name=AMD_Instinct_MI325X.json → triton_3_1_0/E=8,N=7168,device_name=AMD_Instinct_MI325X.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=8,N=7168,device_name=AMD_Radeon_Graphics.json → triton_3_1_0/E=8,N=7168,device_name=AMD_Radeon_Graphics.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=8,N=7168,device_name=NVIDIA_A100-SXM4-80GB.json → triton_3_1_0/E=8,N=7168,device_name=NVIDIA_A100-SXM4-80GB.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=8,N=7168,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8.json → triton_3_1_0/E=8,N=7168,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=8,N=7168,device_name=NVIDIA_H100_80GB_HBM3.json → triton_3_1_0/E=8,N=7168,device_name=NVIDIA_H100_80GB_HBM3.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=8,N=7168,device_name=NVIDIA_H200,dtype=fp8_w8a8.json → triton_3_1_0/E=8,N=7168,device_name=NVIDIA_H200,dtype=fp8_w8a8.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=8,N=7168,device_name=NVIDIA_H200.json → triton_3_1_0/E=8,N=7168,device_name=NVIDIA_H200.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=8,N=8192,device_name=AMD_Instinct_MI300X,dtype=fp8_w8a8.json → triton_3_1_0/E=8,N=8192,device_name=AMD_Instinct_MI300X,dtype=fp8_w8a8.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=8,N=8192,device_name=AMD_Instinct_MI325X,dtype=fp8_w8a8.json → triton_3_1_0/E=8,N=8192,device_name=AMD_Instinct_MI325X,dtype=fp8_w8a8.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=8,N=8192,device_name=AMD_Radeon_Graphics,dtype=fp8_w8a8.json → triton_3_1_0/E=8,N=8192,device_name=AMD_Radeon_Graphics,dtype=fp8_w8a8.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=8,N=8192,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8.json → triton_3_1_0/E=8,N=8192,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=8,N=8192,device_name=NVIDIA_H200,dtype=fp8_w8a8.json → triton_3_1_0/E=8,N=8192,device_name=NVIDIA_H200,dtype=fp8_w8a8.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=128,N=192,device_name=NVIDIA_A800-SXM4-80GB.json → triton_3_2_0/E=128,N=192,device_name=NVIDIA_A800-SXM4-80GB.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=128,N=192,device_name=NVIDIA_H100_80GB_HBM3.json → triton_3_2_0/E=128,N=192,device_name=NVIDIA_H100_80GB_HBM3.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=128,N=192,device_name=NVIDIA_H20.json → triton_3_2_0/E=128,N=192,device_name=NVIDIA_H20.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=128,N=192,device_name=NVIDIA_H200.json → triton_3_2_0/E=128,N=192,device_name=NVIDIA_H200.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=128,N=384,device_name=NVIDIA_H100_80GB_HBM3.json → triton_3_2_0/E=128,N=384,device_name=NVIDIA_H100_80GB_HBM3.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=128,N=384,device_name=NVIDIA_H20,dtype=fp8_w8a8,block_shape=[128, 128].json → triton_3_2_0/E=128,N=384,device_name=NVIDIA_H20,dtype=fp8_w8a8,block_shape=[128, 128].json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=128,N=384,device_name=NVIDIA_H20.json → triton_3_2_0/E=128,N=384,device_name=NVIDIA_H20.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=128,N=384,device_name=NVIDIA_H200,dtype=fp8_w8a8,block_shape=[128, 128].json → triton_3_2_0/E=128,N=384,device_name=NVIDIA_H200,dtype=fp8_w8a8,block_shape=[128, 128].json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=128,N=384,device_name=NVIDIA_H200.json → triton_3_2_0/E=128,N=384,device_name=NVIDIA_H200.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=128,N=512,device_name=NVIDIA_H100_80GB_HBM3.json → triton_3_2_0/E=128,N=512,device_name=NVIDIA_H100_80GB_HBM3.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=128,N=768,device_name=NVIDIA_A800-SXM4-80GB.json → triton_3_2_0/E=128,N=768,device_name=NVIDIA_A800-SXM4-80GB.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=128,N=768,device_name=NVIDIA_H100_80GB_HBM3.json → triton_3_2_0/E=128,N=768,device_name=NVIDIA_H100_80GB_HBM3.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=128,N=768,device_name=NVIDIA_H20,dtype=fp8_w8a8,block_shape=[128, 128].json → triton_3_2_0/E=128,N=768,device_name=NVIDIA_H20,dtype=fp8_w8a8,block_shape=[128, 128].json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=128,N=768,device_name=NVIDIA_H20.json → triton_3_2_0/E=128,N=768,device_name=NVIDIA_H20.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=128,N=768,device_name=NVIDIA_H200,dtype=fp8_w8a8,block_shape=[128, 128].json → triton_3_2_0/E=128,N=768,device_name=NVIDIA_H200,dtype=fp8_w8a8,block_shape=[128, 128].json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=128,N=768,device_name=NVIDIA_H200.json → triton_3_2_0/E=128,N=768,device_name=NVIDIA_H200.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=128,N=96,device_name=NVIDIA_H20.json → triton_3_2_0/E=128,N=96,device_name=NVIDIA_H20.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=264,N=128,device_name=NVIDIA_A800-SXM4-80GB,dtype=int8_w8a8.json → triton_3_2_0/E=264,N=128,device_name=NVIDIA_A800-SXM4-80GB,dtype=int8_w8a8.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=264,N=256,device_name=NVIDIA_B200,dtype=fp8_w8a8,block_shape=[128, 128].json → triton_3_2_0/E=264,N=256,device_name=NVIDIA_B200,dtype=fp8_w8a8,block_shape=[128, 128].json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=264,N=256,device_name=NVIDIA_H20,dtype=fp8_w8a8,block_shape=[128, 128].json → triton_3_2_0/E=264,N=256,device_name=NVIDIA_H20,dtype=fp8_w8a8,block_shape=[128, 128].json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=264,N=256,device_name=NVIDIA_H200,dtype=fp8_w8a8,block_shape=[128, 128].json → triton_3_2_0/E=264,N=256,device_name=NVIDIA_H200,dtype=fp8_w8a8,block_shape=[128, 128].json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=272,N=128,device_name=NVIDIA_A800-SXM4-80GB,dtype=int8_w8a8.json → triton_3_2_0/E=272,N=128,device_name=NVIDIA_A800-SXM4-80GB,dtype=int8_w8a8.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=272,N=128,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8,block_shape=[128, 128].json → triton_3_2_0/E=272,N=128,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8,block_shape=[128, 128].json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=272,N=128,device_name=NVIDIA_H20,dtype=fp8_w8a8,block_shape=[128, 128].json → triton_3_2_0/E=272,N=128,device_name=NVIDIA_H20,dtype=fp8_w8a8,block_shape=[128, 128].json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=272,N=64,device_name=NVIDIA_A800-SXM4-80GB.json → triton_3_2_0/E=272,N=64,device_name=NVIDIA_A800-SXM4-80GB.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=288,N=64,device_name=NVIDIA_A800-SXM4-80GB.json → triton_3_2_0/E=288,N=64,device_name=NVIDIA_A800-SXM4-80GB.json} +0 -0
- {sglang-0.4.6.post5.dist-info → sglang-0.4.7.post1.dist-info}/licenses/LICENSE +0 -0
- {sglang-0.4.6.post5.dist-info → sglang-0.4.7.post1.dist-info}/top_level.txt +0 -0
sglang/srt/layers/attention/cutlass_mla_backend.py

@@ -11,8 +11,6 @@ from typing import TYPE_CHECKING, Optional, Union
 import torch
 import triton

-from sglang.global_config import global_config
-from sglang.srt.layers.attention.base_attn_backend import AttentionBackend
 from sglang.srt.layers.attention.flashinfer_mla_backend import FlashInferMLAAttnBackend
 from sglang.srt.layers.attention.utils import create_flashmla_kv_indices_triton
 from sglang.srt.layers.dp_attention import get_attention_tp_size
@@ -22,7 +20,6 @@ from sglang.srt.utils import is_cuda
 if TYPE_CHECKING:
     from sglang.srt.layers.radix_attention import RadixAttention
     from sglang.srt.model_executor.model_runner import ModelRunner
-    from sglang.srt.speculative.eagle_utils import EagleDraftInput, EagleVerifyInput
     from sglang.srt.speculative.spec_info import SpecInfo

 _is_cuda = is_cuda()
@@ -108,7 +105,7 @@ class CutlassMLABackend(FlashInferMLAAttnBackend):
                 PAGE_SIZE,
             )
             workspace_size = cutlass_mla_get_workspace_size(
-                max_seqlen_pad * PAGE_SIZE, bs
+                max_seqlen_pad * PAGE_SIZE, bs, num_kv_splits=1
             )
             workspace = torch.empty(
                 workspace_size, device="cuda", dtype=torch.uint8
@@ -138,7 +135,7 @@ class CutlassMLABackend(FlashInferMLAAttnBackend):
         cuda_graph_kv_indices = block_kv_indices

         workspace_size = cutlass_mla_get_workspace_size(
-            cuda_graph_kv_indices.shape[1] * PAGE_SIZE, max_bs
+            cuda_graph_kv_indices.shape[1] * PAGE_SIZE, max_bs, num_kv_splits=1
         )
         self.cuda_graph_mla_workspace = torch.empty(
             workspace_size, device="cuda", dtype=torch.uint8
@@ -157,7 +154,7 @@ class CutlassMLABackend(FlashInferMLAAttnBackend):
     ):
         if forward_mode.is_decode_or_idle():
             if spec_info is None:
-                max_seqlen_pad =
+                max_seqlen_pad = self.cuda_graph_kv_indices.shape[1]

                 create_flashmla_kv_indices_triton[(bs,)](
                     self.req_to_token,
@@ -169,12 +166,6 @@ class CutlassMLABackend(FlashInferMLAAttnBackend):
                     self.cuda_graph_kv_indices.stride(0),
                     PAGE_SIZE,
                 )
-                workspace_size = cutlass_mla_get_workspace_size(
-                    max_seqlen_pad * PAGE_SIZE, bs
-                )
-                self.cuda_graph_mla_workspace = torch.empty(
-                    workspace_size, device="cuda", dtype=torch.uint8
-                )
                 self.forward_metadata = CutlassMLADecodeMetadata(
                     self.cuda_graph_mla_workspace,
                     self.cuda_graph_kv_indices[:bs, :max_seqlen_pad],
@@ -205,8 +196,7 @@ class CutlassMLABackend(FlashInferMLAAttnBackend):
         if forward_mode.is_decode_or_idle():
             assert seq_lens_cpu is not None
             seq_lens = seq_lens[:bs]
-
-            max_seqlen_pad = triton.cdiv(seq_lens_cpu.max().item(), PAGE_SIZE)
+
             create_flashmla_kv_indices_triton[(bs,)](
                 self.req_to_token,
                 req_pool_indices[:bs],
@@ -217,16 +207,6 @@ class CutlassMLABackend(FlashInferMLAAttnBackend):
                 self.cuda_graph_kv_indices.stride(0),
                 PAGE_SIZE,
             )
-            workspace_size = cutlass_mla_get_workspace_size(
-                max_seqlen_pad * PAGE_SIZE, bs
-            )
-            self.cuda_graph_mla_workspace = torch.empty(
-                workspace_size, device="cuda", dtype=torch.uint8
-            )
-            self.forward_metadata.workspace = self.cuda_graph_mla_workspace
-            self.forward_metadata.block_kv_indices = self.cuda_graph_kv_indices[
-                :bs, :max_seqlen_pad
-            ]
         else:
             super().init_forward_metadata_replay_cuda_graph(
                 bs,
@@ -250,29 +230,55 @@ class CutlassMLABackend(FlashInferMLAAttnBackend):
         layer: RadixAttention,
         forward_batch: ForwardBatch,
         save_kv_cache: bool = True,
+        # For multi-head latent attention
+        q_rope: Optional[torch.Tensor] = None,
+        k_rope: Optional[torch.Tensor] = None,
     ):
         cache_loc = forward_batch.out_cache_loc

         if k is not None:
             assert v is not None
             if save_kv_cache:
-
-
-
-
-
-
-
-
+                if k_rope is not None:
+                    forward_batch.token_to_kv_pool.set_mla_kv_buffer(
+                        layer,
+                        cache_loc,
+                        k,
+                        k_rope,
+                    )
+                else:
+                    forward_batch.token_to_kv_pool.set_kv_buffer(
+                        layer,
+                        cache_loc,
+                        k,
+                        v,
+                    )

-
+        # Reshape inputs
+        if q_rope is not None:
+            q_nope = q.view(-1, layer.tp_q_head_num, layer.v_head_dim)
+            q_rope = q_rope.view(
+                -1, layer.tp_q_head_num, layer.head_dim - layer.v_head_dim
+            )
+        else:
+            reshaped_q = q.view(-1, layer.tp_q_head_num, layer.head_dim)
+            q_nope = reshaped_q[:, :, : layer.v_head_dim]
+            q_rope = reshaped_q[:, :, layer.v_head_dim :]
+
+        q_nope = q_nope.to(self.q_data_type)
+        q_rope = q_rope.to(self.q_data_type)
+
+        k_cache = forward_batch.token_to_kv_pool.get_key_buffer(layer.layer_id)

         o = cutlass_mla_decode(
-
+            q_nope=q_nope,
+            q_pe=q_rope,
             kv_c_and_k_pe_cache=k_cache.view(-1, PAGE_SIZE, self.kv_cache_dim),
             seq_lens=forward_batch.seq_lens.to(torch.int32),
             page_table=self.forward_metadata.block_kv_indices,
             workspace=self.forward_metadata.workspace,
+            sm_scale=layer.scaling,
+            num_kv_splits=1,
         )

         return o.view(-1, layer.tp_q_head_num * layer.v_head_dim)
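For readers tracking the MLA change above: when no separate `q_rope` is passed, the updated `forward` slices the rotary component out of the packed query before calling `cutlass_mla_decode`. Below is a minimal CPU-only sketch of that split, not part of the diff; the head sizes are hypothetical stand-ins for `layer.tp_q_head_num`, `layer.head_dim`, and `layer.v_head_dim`.

```python
import torch

# Hypothetical sizes; in the backend these come from the attention layer config.
num_tokens, tp_q_head_num = 8, 16
v_head_dim, rope_dim = 512, 64
head_dim = v_head_dim + rope_dim

q = torch.randn(num_tokens, tp_q_head_num * head_dim)

# Same slicing as the "q_rope is None" branch in the hunk above.
reshaped_q = q.view(-1, tp_q_head_num, head_dim)
q_nope = reshaped_q[:, :, :v_head_dim]   # latent ("no position") part
q_rope = reshaped_q[:, :, v_head_dim:]   # rotary part, passed as q_pe

assert q_nope.shape == (num_tokens, tp_q_head_num, v_head_dim)
assert q_rope.shape == (num_tokens, tp_q_head_num, rope_dim)
```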
sglang/srt/layers/attention/flashattention_backend.py

@@ -393,7 +393,6 @@ class FlashAttentionBackend(AttentionBackend):
                 dtype=torch.int32,
             )
             metadata_expand.max_seq_len_q = 1
-            metadata_expand.max_seq_len_k = self.speculative_step_id + 1
             metadata_expand.cu_seqlens_q = torch.arange(
                 0,
                 metadata_expand.cache_seqlens_int32.numel() + 1,
@@ -407,9 +406,10 @@ class FlashAttentionBackend(AttentionBackend):
                 dtype=torch.int32,
                 device=device,
             )
+            # shape: [bs, num_steps, topk] -> [bs x topk, num_steps]
             cache_loc = forward_batch.out_cache_loc.view(
-                self.speculative_num_steps
-            )
+                -1, self.speculative_num_steps
+            )
             metadata_expand.page_table = (
                 cache_loc[:, :decode_length].contiguous().to(torch.int32)
             )
@@ -549,9 +549,6 @@ class FlashAttentionBackend(AttentionBackend):
                     ),
                     (1, 0),
                 )
-                metadata_expand.max_seq_len_k = (
-                    metadata_expand.cache_seqlens_int32.max().item()
-                )
                 self.forward_metadata_spec_decode_expand = metadata_expand
         elif forward_batch.forward_mode.is_extend_or_draft_extend_or_mixed():
             metadata.cache_seqlens_int32 = seqlens_in_batch.to(torch.int32)
@@ -1268,6 +1265,29 @@ class FlashAttentionBackend(AttentionBackend):
             ),
         }

+        self.draft_extend_metadata = {
+            "cache_seqlens": torch.zeros(
+                max_bs, dtype=torch.int32, device=self.device
+            ),
+            "cu_seqlens_q": torch.zeros(
+                max_bs + 1,
+                dtype=torch.int32,
+                device=self.device,
+            ),
+            "cu_seqlens_k": torch.zeros(
+                max_bs + 1, dtype=torch.int32, device=self.device
+            ),
+            "page_table": torch.zeros(
+                max_bs,
+                (self.max_context_len + self.page_size - 1) // self.page_size,
+                dtype=torch.int32,
+                device=self.device,
+            ),
+            "strided_indices": torch.arange(
+                0, self.max_context_len, self.page_size, device=self.device
+            ),
+        }
+
         if self.topk > 1:
             self.target_verify_metadata_topk_normal = {
                 "cache_seqlens": torch.zeros(
@@ -1397,9 +1417,6 @@ class FlashAttentionBackend(AttentionBackend):
                 ]
             )
             metadata_expand.max_seq_len_q = 1
-            metadata_expand.max_seq_len_k = (
-                self.speculative_step_id + 1
-            )  # , do this in replay
             metadata_expand.cu_seqlens_q = (
                 self.draft_decode_metadata_topk_expand["cu_seqlens_q"][
                     : bs * self.topk + 1
@@ -1445,7 +1462,7 @@ class FlashAttentionBackend(AttentionBackend):
                     "cache_seqlens"
                 ][:bs]
                 metadata.cache_seqlens_int32.copy_(
-                    (seq_lens + self.speculative_num_draft_tokens)
+                    (seq_lens + self.speculative_num_draft_tokens)
                 )

                 metadata.max_seq_len_q = self.speculative_num_draft_tokens
@@ -1508,6 +1525,32 @@ class FlashAttentionBackend(AttentionBackend):

             self.target_verify_metadata_topk_normal[bs] = metadata
             self.target_verify_metadata_topk_expand[bs] = metadata_expand
+        elif forward_mode.is_draft_extend():
+            metadata.cache_seqlens_int32 = self.draft_extend_metadata["cache_seqlens"][
+                :bs
+            ]
+            metadata.cache_seqlens_int32.copy_(seq_lens)
+
+            num_tokens_per_bs = num_tokens // bs
+            metadata.max_seq_len_q = num_tokens_per_bs
+            metadata.max_seq_len_k = seq_lens.max().item()
+
+            metadata.cu_seqlens_q = torch.arange(
+                0,
+                bs * num_tokens_per_bs + 1,
+                num_tokens_per_bs,
+                dtype=torch.int32,
+                device=device,
+            )
+
+            metadata.cu_seqlens_k = self.draft_extend_metadata["cu_seqlens_k"][
+                : (bs + 1)
+            ]
+            metadata.page_table = self.draft_extend_metadata["page_table"][
+                req_pool_indices, :
+            ]
+
+            self.draft_extend_metadata[bs] = metadata

         if encoder_lens is not None:
             encoder_bs = encoder_lens.numel()
@@ -1550,38 +1593,32 @@ class FlashAttentionBackend(AttentionBackend):
         if spec_info is not None:
             # Draft Decode
             if self.topk <= 1:
-                metadata = self.decode_cuda_graph_metadata[bs]
                 # When topk = 1, we use the normal decode metadata
-                metadata.
-
-
-
-                metadata.max_seq_len_k = seq_lens_cpu.max().item() + (
-                    self.speculative_step_id + 1
-                )
-                metadata.cu_seqlens_k[1:].copy_(
-                    torch.cumsum(
-                        metadata.cache_seqlens_int32, dim=0, dtype=torch.int32
-                    )
-                )
-
+                metadata = self.decode_cuda_graph_metadata[bs]
+                max_len = seq_lens_cpu.max().item()
+                metadata.max_seq_len_k = max_len + self.speculative_step_id + 1
                 max_seq_pages = (
                     metadata.max_seq_len_k + self.page_size - 1
                 ) // self.page_size
-                page_indices = self.req_to_token[
-                    req_pool_indices[:, None],
-                    self.decode_cuda_graph_metadata["strided_indices"][
-                        :max_seq_pages
-                    ],
-                ]

-
-
+                normal_decode_set_medadata(
+                    metadata.cache_seqlens_int32,
+                    metadata.cu_seqlens_k,
+                    metadata.page_table,
+                    self.req_to_token,
+                    req_pool_indices,
+                    self.decode_cuda_graph_metadata["strided_indices"],
+                    max_seq_pages,
+                    seq_lens,
+                    self.speculative_step_id + 1,
+                    self.page_size,
+                )
+
             else:
                 # When top k > 1, we need two specific draft decode metadata, and then merge states
                 # 1. The first half of metadata for prefix tokens
                 metadata = self.draft_decode_metadata_topk_normal[bs]
-                metadata.cache_seqlens_int32.copy_(seq_lens
+                metadata.cache_seqlens_int32.copy_(seq_lens)
                 # metadata.max_seq_len_q = self.topk, already set in capture
                 metadata.max_seq_len_k = seq_lens_cpu.max().item()
                 # metadata.cu_seqlens_q already set in capture
@@ -1600,44 +1637,38 @@ class FlashAttentionBackend(AttentionBackend):
                 # 2. The second half of metadata for draft tokens (per_batch_num_tokens = topk)
                 metadata_expand = self.draft_decode_metadata_topk_expand[bs]
                 decode_length = self.speculative_step_id + 1
-
-
-                ).T.contiguous()
+                # shape: [bs, num_steps, topk] -> [bs x topk, num_steps]
+                cache_loc = out_cache_loc.view(-1, self.speculative_num_steps)
                 metadata_expand.page_table[: cache_loc.shape[0]].copy_(
-                    cache_loc[:, :decode_length]
+                    cache_loc[:, :decode_length]
                 )
                 # TODO: Handle local attention metadata for draft decode when llama4 eagle is supported
             else:
-                metadata = self.decode_cuda_graph_metadata[bs]
                 # Normal Decode
+                metadata = self.decode_cuda_graph_metadata[bs]
                 max_len = seq_lens_cpu.max().item()
+                max_seq_pages = (max_len + self.page_size - 1) // self.page_size
                 metadata.max_seq_len_k = max_len

-
-
-
-
+                normal_decode_set_medadata(
+                    metadata.cache_seqlens_int32,
+                    metadata.cu_seqlens_k,
+                    metadata.page_table,
+                    self.req_to_token,
+                    req_pool_indices,
+                    self.decode_cuda_graph_metadata["strided_indices"],
+                    max_seq_pages,
+                    seq_lens,
+                    0,
+                    self.page_size,
                 )

-                max_seq_pages = (
-                    metadata.max_seq_len_k + self.page_size - 1
-                ) // self.page_size
-                page_indices = self.req_to_token[
-                    req_pool_indices[:, None],
-                    self.decode_cuda_graph_metadata["strided_indices"][:max_seq_pages][
-                        None, :
-                    ],
-                ]
-                page_indices //= self.page_size
-                metadata.page_table[:, :max_seq_pages].copy_(page_indices)
-                metadata.page_table[:, max_seq_pages:].fill_(0)
-
                 self._update_local_attn_metadata_for_replay(metadata, bs)
         elif forward_mode.is_target_verify():
             if self.topk <= 1:
                 metadata = self.target_verify_metadata[bs]
                 metadata.cache_seqlens_int32.copy_(
-                    (seq_lens + self.speculative_num_draft_tokens)
+                    (seq_lens + self.speculative_num_draft_tokens)
                 )

                 metadata.max_seq_len_k = (
@@ -1659,7 +1690,7 @@ class FlashAttentionBackend(AttentionBackend):
                 # When topk > 1, we need two specific target verify metadata, and then merge states
                 # 1. The first half of metadata for prefix tokens
                 metadata = self.target_verify_metadata_topk_normal[bs]
-                metadata.cache_seqlens_int32.copy_(seq_lens
+                metadata.cache_seqlens_int32.copy_(seq_lens)
                 # metadata.max_seq_len_q = self.speculative_num_draft_tokens, already set in capture
                 metadata.max_seq_len_k = seq_lens_cpu.max().item()
                 # metadata.cu_seqlens_q already set in capture
@@ -1719,9 +1750,7 @@ class FlashAttentionBackend(AttentionBackend):
                 metadata_expand.page_table.copy_(
                     non_masked_page_table.gather(1, sort_order)
                 )
-                metadata_expand.cache_seqlens_int32.copy_(
-                    mask.sum(dim=1).to(torch.int32)
-                )
+                metadata_expand.cache_seqlens_int32.copy_(mask.sum(dim=1))
                 metadata_expand.cu_seqlens_k[1:].copy_(
                     torch.cumsum(
                         metadata_expand.cache_seqlens_int32,
@@ -1729,9 +1758,28 @@ class FlashAttentionBackend(AttentionBackend):
                         dtype=torch.int32,
                     )
                 )
-
-
-
+        elif forward_mode.is_draft_extend():
+            metadata = self.draft_extend_metadata[bs]
+            metadata.cache_seqlens_int32.copy_(seq_lens)
+
+            metadata.max_seq_len_k = seq_lens_cpu.max().item()
+            metadata.cu_seqlens_k[1:].copy_(
+                torch.cumsum(metadata.cache_seqlens_int32, dim=0, dtype=torch.int32)
+            )
+            accept_length = spec_info.accept_length[:bs]
+            metadata.max_seq_len_q = max(spec_info.accept_length_cpu) + 1
+            metadata.cu_seqlens_q[1:].copy_(
+                torch.cumsum(accept_length, dim=0, dtype=torch.int32)
+            )
+
+            max_seq_pages = (
+                metadata.max_seq_len_k + self.page_size - 1
+            ) // self.page_size
+            page_indices = self.req_to_token[
+                req_pool_indices[:, None],
+                self.draft_extend_metadata["strided_indices"][:max_seq_pages],
+            ]
+            metadata.page_table[:, :max_seq_pages].copy_(page_indices // self.page_size)

         if encoder_lens is not None:
             # Only support encoder size 1 for now
@@ -1980,6 +2028,8 @@ class FlashAttentionMultiStepBackend:
         assert isinstance(forward_batch.spec_info, EagleDraftInput)

         for i in range(self.speculative_num_steps - 1):
+            # TODO: incrementally update the metadata for the later steps,
+            # so that they do not need to recompute everything from scratch.
             self.attn_backends[i].init_forward_metadata_replay_cuda_graph(
                 bs,
                 forward_batch.req_pool_indices,
@@ -1991,3 +2041,27 @@ class FlashAttentionMultiStepBackend:
                 seq_lens_cpu=forward_batch.seq_lens_cpu,
                 out_cache_loc=forward_batch.out_cache_loc,
             )
+
+
+# @torch.compile(dynamic=True, backend=get_compiler_backend())
+# TODO: fuse these kernels
+# NOTE: torch.compile makes it slower in speculative decoding
+def normal_decode_set_medadata(
+    cache_seqlens_int32: torch.Tensor,
+    cu_seqlens_k: torch.Tensor,
+    page_table: torch.Tensor,
+    req_to_token: torch.Tensor,
+    req_pool_indices: torch.Tensor,
+    strided_indices: torch.Tensor,
+    max_seq_pages: torch.Tensor,
+    seq_lens: torch.Tensor,
+    seq_len_delta: int,
+    page_size: int,
+):
+    cache_seqlens_int32.copy_(seq_lens + seq_len_delta)
+    cu_seqlens_k[1:].copy_(torch.cumsum(cache_seqlens_int32, dim=0, dtype=torch.int32))
+    page_indices = req_to_token[
+        req_pool_indices[:, None],
+        strided_indices[:max_seq_pages][None, :],
+    ]
+    page_table[:, :max_seq_pages].copy_(page_indices // page_size)
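The replay paths above now funnel through a new module-level helper, spelled `normal_decode_set_medadata` in this release, which fills the CUDA-graph decode buffers in one place. Below is a minimal CPU-only usage sketch, not part of the diff; it assumes the helper is importable from `sglang.srt.layers.attention.flashattention_backend`, the file this diff adds it to, and the buffer sizes are toy values rather than real CUDA-graph allocations.

```python
import torch

from sglang.srt.layers.attention.flashattention_backend import (
    normal_decode_set_medadata,
)

# Toy sizes; real callers pass pre-allocated CUDA-graph buffers.
bs, page_size, max_seq_pages, max_ctx = 2, 16, 4, 64

cache_seqlens = torch.zeros(bs, dtype=torch.int32)
cu_seqlens_k = torch.zeros(bs + 1, dtype=torch.int32)
page_table = torch.zeros(bs, max_seq_pages, dtype=torch.int32)
req_to_token = torch.arange(bs * max_ctx, dtype=torch.int32).view(bs, max_ctx)
req_pool_indices = torch.arange(bs)
strided_indices = torch.arange(0, max_ctx, page_size)
seq_lens = torch.tensor([20, 33], dtype=torch.int32)

normal_decode_set_medadata(
    cache_seqlens, cu_seqlens_k, page_table,
    req_to_token, req_pool_indices, strided_indices,
    max_seq_pages, seq_lens,
    0,          # seq_len_delta: 0 for normal decode, step_id + 1 for draft decode
    page_size,
)
print(cache_seqlens.tolist())  # [20, 33]
print(cu_seqlens_k.tolist())   # [0, 20, 53]
print(page_table.tolist())     # per-request physical page ids
```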
sglang/srt/layers/attention/flashinfer_backend.py

@@ -25,6 +25,7 @@ from sglang.global_config import global_config
 from sglang.srt.layers.attention.base_attn_backend import AttentionBackend
 from sglang.srt.layers.attention.utils import create_flashinfer_kv_indices_triton
 from sglang.srt.layers.dp_attention import get_attention_tp_size
+from sglang.srt.layers.utils import is_sm100_supported
 from sglang.srt.model_executor.forward_batch_info import ForwardBatch, ForwardMode
 from sglang.srt.speculative.eagle_utils import EagleDraftInput, EagleVerifyInput
 from sglang.srt.utils import is_flashinfer_available, next_power_of_2
@@ -149,8 +150,11 @@ class FlashInferAttnBackend(AttentionBackend):
             for _ in range(self.num_wrappers)
         ]

+        fmha_backend = "auto"
+        if is_sm100_supported():
+            fmha_backend = "cutlass"
         self.prefill_wrapper_ragged = BatchPrefillWithRaggedKVCacheWrapper(
-            self.workspace_buffer, "NHD"
+            self.workspace_buffer, "NHD", backend=fmha_backend
         )

         # Two wrappers: one for sliding window attention and one for full attention.
@@ -358,6 +362,35 @@ class FlashInferAttnBackend(AttentionBackend):
             )
             self.prefill_cuda_graph_metadata[bs] = prefill_wrappers
             self.forward_metadata = PrefillMetadata(prefill_wrappers, False, False)
+        elif forward_mode.is_draft_extend():
+            prefill_wrappers = []
+            for i in range(self.num_wrappers):
+                prefill_wrappers.append(
+                    BatchPrefillWithPagedKVCacheWrapper(
+                        self.workspace_buffer,
+                        "NHD",
+                        backend="fa2",
+                        use_cuda_graph=True,
+                        qo_indptr_buf=self.cuda_graph_qo_indptr[i][: bs + 1],
+                        paged_kv_indptr_buf=self.kv_indptr[i][: bs + 1],
+                        paged_kv_indices_buf=self.cuda_graph_kv_indices[i],
+                        paged_kv_last_page_len_buf=self.kv_last_page_len[:bs],
+                    )
+                )
+
+            seq_lens_sum = seq_lens.sum().item()
+            self.indices_updater_prefill.update(
+                req_pool_indices,
+                seq_lens,
+                seq_lens_sum,
+                prefix_lens=None,
+                prefill_wrappers=prefill_wrappers,
+                use_ragged=False,
+                encoder_lens=encoder_lens,
+                spec_info=spec_info,
+            )
+            self.prefill_cuda_graph_metadata[bs] = prefill_wrappers
+            self.forward_metadata = PrefillMetadata(prefill_wrappers, False, False)
         else:
             raise ValueError(f"Invalid mode: {forward_mode=}")

@@ -392,6 +425,17 @@ class FlashInferAttnBackend(AttentionBackend):
                 encoder_lens=encoder_lens[:bs] if encoder_lens is not None else None,
                 spec_info=spec_info,
             )
+        elif forward_mode.is_draft_extend():
+            self.indices_updater_prefill.update(
+                req_pool_indices[:bs],
+                seq_lens[:bs],
+                seq_lens_sum,
+                prefix_lens=None,
+                prefill_wrappers=self.prefill_cuda_graph_metadata[bs],
+                use_ragged=False,
+                encoder_lens=encoder_lens[:bs] if encoder_lens is not None else None,
+                spec_info=spec_info,
+            )
         else:
             raise ValueError("Invalid forward mode")

@@ -1005,14 +1049,13 @@ class FlashInferMultiStepDraftBackend:
             kv_indices_buffer,
             self.kv_indptr,
             forward_batch.positions,
-            num_seqs,
-            self.topk,
             self.pool_len,
             kv_indices_buffer.shape[1],
             self.kv_indptr.shape[1],
             next_power_of_2(num_seqs),
             next_power_of_2(self.speculative_num_steps),
             next_power_of_2(bs),
+            self.page_size,
         )

         assert forward_batch.spec_info is not None