sglang 0.4.6.post5__py3-none-any.whl → 0.4.7.post1__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- sglang/__init__.py +2 -0
- sglang/api.py +7 -0
- sglang/bench_offline_throughput.py +10 -4
- sglang/bench_one_batch_server.py +67 -11
- sglang/bench_serving.py +86 -75
- sglang/lang/backend/runtime_endpoint.py +24 -1
- sglang/lang/interpreter.py +40 -1
- sglang/lang/ir.py +27 -0
- sglang/math_utils.py +8 -0
- sglang/profiler.py +167 -0
- sglang/srt/_custom_ops.py +34 -0
- sglang/srt/configs/internvl.py +8 -12
- sglang/srt/configs/model_config.py +33 -1
- sglang/srt/constrained/base_grammar_backend.py +5 -2
- sglang/srt/constrained/llguidance_backend.py +9 -8
- sglang/srt/constrained/outlines_backend.py +5 -4
- sglang/srt/constrained/xgrammar_backend.py +18 -18
- sglang/srt/conversation.py +52 -8
- sglang/srt/custom_op.py +38 -3
- sglang/srt/debug_utils.py +74 -0
- sglang/srt/disaggregation/base/__init__.py +1 -1
- sglang/srt/disaggregation/base/conn.py +25 -11
- sglang/srt/disaggregation/common/__init__.py +5 -0
- sglang/srt/disaggregation/common/conn.py +407 -0
- sglang/srt/disaggregation/common/utils.py +42 -0
- sglang/srt/disaggregation/decode.py +261 -52
- sglang/srt/disaggregation/fake/__init__.py +1 -1
- sglang/srt/disaggregation/fake/conn.py +16 -9
- sglang/srt/disaggregation/kv_events.py +60 -5
- sglang/srt/disaggregation/launch_lb.py +140 -0
- sglang/srt/disaggregation/mini_lb.py +29 -48
- sglang/srt/disaggregation/mooncake/__init__.py +1 -1
- sglang/srt/disaggregation/mooncake/conn.py +446 -149
- sglang/srt/disaggregation/mooncake/transfer_engine.py +32 -16
- sglang/srt/disaggregation/nixl/__init__.py +6 -1
- sglang/srt/disaggregation/nixl/conn.py +134 -437
- sglang/srt/disaggregation/prefill.py +130 -43
- sglang/srt/disaggregation/utils.py +127 -86
- sglang/srt/distributed/device_communicators/pymscclpp.py +315 -0
- sglang/srt/distributed/parallel_state.py +52 -5
- sglang/srt/entrypoints/EngineBase.py +6 -0
- sglang/srt/entrypoints/engine.py +116 -5
- sglang/srt/entrypoints/http_server.py +28 -4
- sglang/srt/eplb_simulator/__init__.py +1 -0
- sglang/srt/eplb_simulator/reader.py +51 -0
- sglang/srt/function_call/base_format_detector.py +138 -86
- sglang/srt/function_call/deepseekv3_detector.py +54 -6
- sglang/srt/function_call/ebnf_composer.py +33 -19
- sglang/srt/function_call/function_call_parser.py +27 -0
- sglang/srt/function_call/llama32_detector.py +33 -14
- sglang/srt/function_call/mistral_detector.py +73 -26
- sglang/srt/function_call/pythonic_detector.py +86 -20
- sglang/srt/function_call/qwen25_detector.py +64 -10
- sglang/srt/function_call/utils.py +17 -0
- sglang/srt/hf_transformers_utils.py +4 -0
- sglang/srt/layers/activation.py +19 -0
- sglang/srt/layers/attention/aiter_backend.py +503 -125
- sglang/srt/layers/attention/base_attn_backend.py +4 -0
- sglang/srt/layers/attention/cutlass_mla_backend.py +40 -34
- sglang/srt/layers/attention/flashattention_backend.py +137 -63
- sglang/srt/layers/attention/flashinfer_backend.py +46 -3
- sglang/srt/layers/attention/flashinfer_mla_backend.py +59 -25
- sglang/srt/layers/attention/flashmla_backend.py +2 -10
- sglang/srt/layers/attention/intel_amx_backend.py +128 -0
- sglang/srt/layers/attention/tbo_backend.py +232 -0
- sglang/srt/layers/attention/torch_native_backend.py +3 -0
- sglang/srt/layers/attention/triton_backend.py +304 -65
- sglang/srt/layers/attention/triton_ops/decode_attention.py +2 -7
- sglang/srt/layers/attention/triton_ops/extend_attention.py +12 -4
- sglang/srt/layers/attention/vision.py +51 -24
- sglang/srt/layers/communicator.py +281 -197
- sglang/srt/layers/dp_attention.py +6 -5
- sglang/srt/layers/layernorm.py +30 -19
- sglang/srt/layers/linear.py +0 -4
- sglang/srt/layers/logits_processor.py +0 -12
- sglang/srt/layers/moe/cutlass_moe.py +170 -7
- sglang/srt/layers/moe/cutlass_moe_params.py +169 -0
- sglang/srt/layers/moe/ep_moe/kernels.py +33 -11
- sglang/srt/layers/moe/ep_moe/layer.py +136 -72
- sglang/srt/layers/moe/ep_moe/token_dispatcher.py +24 -45
- sglang/srt/layers/moe/fused_moe_native.py +4 -0
- sglang/srt/layers/moe/fused_moe_triton/configs/triton_3_2_0/E=257,N=128,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8,block_shape=[128, 128].json +146 -0
- sglang/srt/layers/moe/fused_moe_triton/configs/triton_3_2_0/E=257,N=128,device_name=NVIDIA_H20,dtype=fp8_w8a8,block_shape=[128, 128].json +146 -0
- sglang/srt/layers/moe/fused_moe_triton/configs/triton_3_2_0/E=257,N=256,device_name=NVIDIA_B200,dtype=fp8_w8a8,block_shape=[128, 128].json +146 -0
- sglang/srt/layers/moe/fused_moe_triton/configs/triton_3_2_0/E=257,N=256,device_name=NVIDIA_H20,dtype=fp8_w8a8,block_shape=[128, 128].json +146 -0
- sglang/srt/layers/moe/fused_moe_triton/configs/triton_3_2_0/E=257,N=256,device_name=NVIDIA_H200,dtype=fp8_w8a8,block_shape=[128, 128].json +146 -0
- sglang/srt/layers/moe/fused_moe_triton/configs/triton_3_2_0/E=8,N=7168,device_name=NVIDIA_H100_80GB_HBM3.json +146 -0
- sglang/srt/layers/moe/fused_moe_triton/configs/triton_3_3_1/E=257,N=256,device_name=NVIDIA_B200,dtype=fp8_w8a8,block_shape=[128, 128].json +146 -0
- sglang/srt/layers/moe/fused_moe_triton/configs/triton_3_3_1/E=8,N=7168,device_name=NVIDIA_H100_80GB_HBM3.json +146 -0
- sglang/srt/layers/moe/fused_moe_triton/fused_moe.py +221 -29
- sglang/srt/layers/moe/fused_moe_triton/layer.py +34 -4
- sglang/srt/layers/moe/topk.py +60 -26
- sglang/srt/layers/multimodal.py +3 -3
- sglang/srt/layers/pooler.py +56 -0
- sglang/srt/layers/quantization/__init__.py +3 -2
- sglang/srt/layers/quantization/blockwise_int8.py +3 -0
- sglang/srt/layers/quantization/compressed_tensors/compressed_tensors_moe.py +5 -0
- sglang/srt/layers/quantization/deep_gemm_wrapper/__init__.py +1 -0
- sglang/srt/layers/quantization/{deep_gemm.py → deep_gemm_wrapper/compile_utils.py} +69 -127
- sglang/srt/layers/quantization/deep_gemm_wrapper/configurer.py +32 -0
- sglang/srt/layers/quantization/deep_gemm_wrapper/entrypoint.py +110 -0
- sglang/srt/layers/quantization/fp8.py +28 -23
- sglang/srt/layers/quantization/fp8_kernel.py +156 -75
- sglang/srt/layers/quantization/fp8_utils.py +250 -69
- sglang/srt/layers/quantization/modelopt_quant.py +334 -7
- sglang/srt/layers/quantization/moe_wna16.py +3 -0
- sglang/srt/layers/quantization/w8a8_fp8.py +3 -0
- sglang/srt/layers/quantization/w8a8_int8.py +3 -0
- sglang/srt/layers/radix_attention.py +2 -3
- sglang/srt/layers/rotary_embedding.py +6 -12
- sglang/srt/layers/sampler.py +80 -79
- sglang/srt/layers/utils.py +6 -0
- sglang/srt/lora/layers.py +12 -15
- sglang/srt/lora/lora.py +49 -5
- sglang/srt/lora/lora_manager.py +98 -39
- sglang/srt/lora/mem_pool.py +28 -21
- sglang/srt/lora/utils.py +17 -13
- sglang/srt/managers/cache_controller.py +2 -1
- sglang/srt/managers/data_parallel_controller.py +13 -5
- sglang/srt/managers/eplb_algorithms/__init__.py +63 -0
- sglang/srt/managers/eplb_algorithms/deepseek.py +223 -0
- sglang/srt/managers/{deepseek_eplb.py → eplb_algorithms/deepseek_vec.py} +5 -7
- sglang/srt/managers/eplb_manager.py +55 -14
- sglang/srt/managers/expert_distribution.py +220 -46
- sglang/srt/managers/expert_location.py +110 -56
- sglang/srt/managers/expert_location_dispatch.py +23 -6
- sglang/srt/managers/io_struct.py +43 -8
- sglang/srt/managers/mm_utils.py +88 -38
- sglang/srt/managers/multimodal_processors/base_processor.py +190 -18
- sglang/srt/managers/multimodal_processors/gemma3.py +4 -31
- sglang/srt/managers/multimodal_processors/internvl.py +4 -0
- sglang/srt/managers/multimodal_processors/kimi_vl.py +15 -34
- sglang/srt/managers/multimodal_processors/minicpm.py +2 -1
- sglang/srt/managers/multimodal_processors/phi4mm.py +87 -0
- sglang/srt/managers/multimodal_processors/qwen_vl.py +22 -64
- sglang/srt/managers/multimodal_processors/vila.py +85 -0
- sglang/srt/managers/schedule_batch.py +173 -38
- sglang/srt/managers/scheduler.py +376 -127
- sglang/srt/managers/tokenizer_manager.py +163 -19
- sglang/srt/managers/utils.py +0 -4
- sglang/srt/mem_cache/chunk_cache.py +1 -0
- sglang/srt/mem_cache/hiradix_cache.py +4 -2
- sglang/srt/mem_cache/memory_pool.py +111 -407
- sglang/srt/mem_cache/memory_pool_host.py +380 -0
- sglang/srt/mem_cache/radix_cache.py +36 -12
- sglang/srt/metrics/collector.py +9 -0
- sglang/srt/model_executor/cuda_graph_runner.py +191 -113
- sglang/srt/model_executor/expert_location_updater.py +157 -22
- sglang/srt/model_executor/forward_batch_info.py +52 -22
- sglang/srt/model_executor/model_runner.py +102 -62
- sglang/srt/model_loader/loader.py +8 -1
- sglang/srt/model_loader/utils.py +67 -1
- sglang/srt/models/bert.py +113 -13
- sglang/srt/models/deepseek_nextn.py +1 -1
- sglang/srt/models/deepseek_v2.py +623 -290
- sglang/srt/models/gemma3_causal.py +7 -0
- sglang/srt/models/gemma3_mm.py +19 -14
- sglang/srt/models/idefics2.py +342 -0
- sglang/srt/models/internvl.py +46 -102
- sglang/srt/models/kimi_vl.py +4 -4
- sglang/srt/models/llama.py +1 -1
- sglang/srt/models/minicpmo.py +2 -5
- sglang/srt/models/minicpmv.py +3 -295
- sglang/srt/models/phi4mm.py +512 -0
- sglang/srt/models/qwen2.py +38 -9
- sglang/srt/models/qwen2_5_vl.py +3 -9
- sglang/srt/models/qwen2_eagle.py +4 -1
- sglang/srt/models/qwen2_moe.py +58 -191
- sglang/srt/models/qwen2_vl.py +3 -9
- sglang/srt/models/qwen3.py +41 -10
- sglang/srt/models/qwen3_moe.py +230 -191
- sglang/srt/models/registry.py +9 -1
- sglang/srt/models/roberta.py +117 -9
- sglang/srt/models/transformers.py +291 -0
- sglang/srt/models/vila.py +305 -0
- sglang/srt/openai_api/adapter.py +248 -28
- sglang/srt/openai_api/protocol.py +68 -3
- sglang/srt/openai_api/utils.py +172 -0
- sglang/srt/operations.py +37 -2
- sglang/srt/operations_strategy.py +200 -24
- sglang/srt/sampling/sampling_batch_info.py +37 -1
- sglang/srt/sampling/sampling_params.py +4 -1
- sglang/srt/server_args.py +381 -209
- sglang/srt/speculative/build_eagle_tree.py +9 -9
- sglang/srt/speculative/eagle_draft_cuda_graph_runner.py +12 -14
- sglang/srt/speculative/eagle_draft_extend_cuda_graph_runner.py +256 -0
- sglang/srt/speculative/eagle_utils.py +440 -200
- sglang/srt/speculative/eagle_worker.py +234 -63
- sglang/srt/two_batch_overlap.py +637 -0
- sglang/srt/utils.py +187 -7
- sglang/test/attention/test_prefix_chunk_info.py +2 -0
- sglang/test/runners.py +54 -10
- sglang/test/send_one.py +4 -0
- sglang/test/test_block_fp8.py +1 -0
- sglang/test/test_block_fp8_deep_gemm_blackwell.py +252 -0
- sglang/test/test_block_fp8_ep.py +1 -0
- sglang/test/test_cutlass_moe.py +3 -3
- sglang/test/test_fp4_moe.py +248 -0
- sglang/test/test_utils.py +82 -7
- sglang/utils.py +9 -0
- sglang/version.py +1 -1
- {sglang-0.4.6.post5.dist-info → sglang-0.4.7.post1.dist-info}/METADATA +17 -14
- {sglang-0.4.6.post5.dist-info → sglang-0.4.7.post1.dist-info}/RECORD +359 -321
- {sglang-0.4.6.post5.dist-info → sglang-0.4.7.post1.dist-info}/WHEEL +1 -1
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=1,N=14336,device_name=NVIDIA_A100-SXM4-80GB,dtype=int8_w8a16.json → triton_3_1_0/E=1,N=14336,device_name=NVIDIA_A100-SXM4-80GB,dtype=int8_w8a16.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=1,N=14336,device_name=NVIDIA_A100-SXM4-80GB.json → triton_3_1_0/E=1,N=14336,device_name=NVIDIA_A100-SXM4-80GB.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=1,N=1792,device_name=NVIDIA_A100-SXM4-80GB,dtype=int8_w8a16.json → triton_3_1_0/E=1,N=1792,device_name=NVIDIA_A100-SXM4-80GB,dtype=int8_w8a16.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=1,N=1792,device_name=NVIDIA_A100-SXM4-80GB.json → triton_3_1_0/E=1,N=1792,device_name=NVIDIA_A100-SXM4-80GB.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=1,N=3072,device_name=NVIDIA_A100-SXM4-80GB,dtype=int8_w8a16.json → triton_3_1_0/E=1,N=3072,device_name=NVIDIA_A100-SXM4-80GB,dtype=int8_w8a16.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=1,N=3072,device_name=NVIDIA_H100_80GB_HBM3,dtype=int8_w8a16.json → triton_3_1_0/E=1,N=3072,device_name=NVIDIA_H100_80GB_HBM3,dtype=int8_w8a16.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=1,N=3072,device_name=NVIDIA_H100_80GB_HBM3.json → triton_3_1_0/E=1,N=3072,device_name=NVIDIA_H100_80GB_HBM3.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=1,N=3584,device_name=NVIDIA_A100-SXM4-80GB,dtype=int8_w8a16.json → triton_3_1_0/E=1,N=3584,device_name=NVIDIA_A100-SXM4-80GB,dtype=int8_w8a16.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=1,N=3584,device_name=NVIDIA_A100-SXM4-80GB.json → triton_3_1_0/E=1,N=3584,device_name=NVIDIA_A100-SXM4-80GB.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=1,N=7168,device_name=NVIDIA_A100-SXM4-80GB,dtype=int8_w8a16.json → triton_3_1_0/E=1,N=7168,device_name=NVIDIA_A100-SXM4-80GB,dtype=int8_w8a16.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=1,N=7168,device_name=NVIDIA_A100-SXM4-80GB.json → triton_3_1_0/E=1,N=7168,device_name=NVIDIA_A100-SXM4-80GB.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=144,N=512,device_name=NVIDIA_H100_80GB_HBM3.json → triton_3_1_0/E=144,N=512,device_name=NVIDIA_H100_80GB_HBM3.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=16,N=1024,device_name=NVIDIA_H100_80GB_HBM3.json → triton_3_1_0/E=16,N=1024,device_name=NVIDIA_H100_80GB_HBM3.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=16,N=1024,device_name=NVIDIA_H200.json → triton_3_1_0/E=16,N=1024,device_name=NVIDIA_H200.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=16,N=1344,device_name=NVIDIA_A100-SXM4-40GB.json → triton_3_1_0/E=16,N=1344,device_name=NVIDIA_A100-SXM4-40GB.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=16,N=1344,device_name=NVIDIA_A100-SXM4-80GB.json → triton_3_1_0/E=16,N=1344,device_name=NVIDIA_A100-SXM4-80GB.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=16,N=1344,device_name=NVIDIA_H100_80GB_HBM3.json → triton_3_1_0/E=16,N=1344,device_name=NVIDIA_H100_80GB_HBM3.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=16,N=14336,device_name=NVIDIA_A100-SXM4-80GB,dtype=int8_w8a16.json → triton_3_1_0/E=16,N=14336,device_name=NVIDIA_A100-SXM4-80GB,dtype=int8_w8a16.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=16,N=14336,device_name=NVIDIA_A100-SXM4-80GB.json → triton_3_1_0/E=16,N=14336,device_name=NVIDIA_A100-SXM4-80GB.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=16,N=1792,device_name=NVIDIA_A100-SXM4-80GB,dtype=int8_w8a16.json → triton_3_1_0/E=16,N=1792,device_name=NVIDIA_A100-SXM4-80GB,dtype=int8_w8a16.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=16,N=1792,device_name=NVIDIA_A100-SXM4-80GB.json → triton_3_1_0/E=16,N=1792,device_name=NVIDIA_A100-SXM4-80GB.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=16,N=2048,device_name=NVIDIA_H100_80GB_HBM3.json → triton_3_1_0/E=16,N=2048,device_name=NVIDIA_H100_80GB_HBM3.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=16,N=2688,device_name=NVIDIA_A100-SXM4-80GB.json → triton_3_1_0/E=16,N=2688,device_name=NVIDIA_A100-SXM4-80GB.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=16,N=2688,device_name=NVIDIA_H100_80GB_HBM3.json → triton_3_1_0/E=16,N=2688,device_name=NVIDIA_H100_80GB_HBM3.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=16,N=3072,device_name=NVIDIA_A100-SXM4-80GB,dtype=int8_w8a16.json → triton_3_1_0/E=16,N=3072,device_name=NVIDIA_A100-SXM4-80GB,dtype=int8_w8a16.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=16,N=3072,device_name=NVIDIA_H100_80GB_HBM3,dtype=int8_w8a16.json → triton_3_1_0/E=16,N=3072,device_name=NVIDIA_H100_80GB_HBM3,dtype=int8_w8a16.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=16,N=3200,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8.json → triton_3_1_0/E=16,N=3200,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=16,N=3584,device_name=NVIDIA_A100-SXM4-80GB,dtype=int8_w8a16.json → triton_3_1_0/E=16,N=3584,device_name=NVIDIA_A100-SXM4-80GB,dtype=int8_w8a16.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=16,N=3584,device_name=NVIDIA_A100-SXM4-80GB.json → triton_3_1_0/E=16,N=3584,device_name=NVIDIA_A100-SXM4-80GB.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=16,N=6400,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8.json → triton_3_1_0/E=16,N=6400,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=16,N=7168,device_name=NVIDIA_A100-SXM4-80GB,dtype=int8_w8a16.json → triton_3_1_0/E=16,N=7168,device_name=NVIDIA_A100-SXM4-80GB,dtype=int8_w8a16.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=16,N=7168,device_name=NVIDIA_A100-SXM4-80GB.json → triton_3_1_0/E=16,N=7168,device_name=NVIDIA_A100-SXM4-80GB.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=16,N=7168,device_name=NVIDIA_H100_80GB_HBM3,dtype=int8_w8a16.json → triton_3_1_0/E=16,N=7168,device_name=NVIDIA_H100_80GB_HBM3,dtype=int8_w8a16.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=16,N=800,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8.json → triton_3_1_0/E=16,N=800,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=160,N=192,device_name=NVIDIA_A800-SXM4-80GB.json → triton_3_1_0/E=160,N=192,device_name=NVIDIA_A800-SXM4-80GB.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=20,N=2048,device_name=NVIDIA_H100_80GB_HBM3.json → triton_3_1_0/E=20,N=2048,device_name=NVIDIA_H100_80GB_HBM3.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=24,N=1024,device_name=NVIDIA_H100_80GB_HBM3.json → triton_3_1_0/E=24,N=1024,device_name=NVIDIA_H100_80GB_HBM3.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=256,N=128,device_name=NVIDIA_A100-SXM4-80GB,dtype=int8_w8a8,block_shape=[128, 128].json → triton_3_1_0/E=256,N=128,device_name=NVIDIA_A100-SXM4-80GB,dtype=int8_w8a8,block_shape=[128, 128].json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=256,N=128,device_name=NVIDIA_A100-SXM4-80GB,dtype=int8_w8a8.json → triton_3_1_0/E=256,N=128,device_name=NVIDIA_A100-SXM4-80GB,dtype=int8_w8a8.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=256,N=128,device_name=NVIDIA_A800-SXM4-80GB,dtype=int8_w8a8,block_shape=[128, 128].json → triton_3_1_0/E=256,N=128,device_name=NVIDIA_A800-SXM4-80GB,dtype=int8_w8a8,block_shape=[128, 128].json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=256,N=128,device_name=NVIDIA_A800-SXM4-80GB,dtype=int8_w8a8.json → triton_3_1_0/E=256,N=128,device_name=NVIDIA_A800-SXM4-80GB,dtype=int8_w8a8.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=256,N=128,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8,block_shape=[128, 128].json → triton_3_1_0/E=256,N=128,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8,block_shape=[128, 128].json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=256,N=128,device_name=NVIDIA_H20,block_shape=[128, 128].json → triton_3_1_0/E=256,N=128,device_name=NVIDIA_H20,block_shape=[128, 128].json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=256,N=128,device_name=NVIDIA_H20,dtype=fp8_w8a8,block_shape=[128, 128].json → triton_3_1_0/E=256,N=128,device_name=NVIDIA_H20,dtype=fp8_w8a8,block_shape=[128, 128].json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=256,N=128,device_name=NVIDIA_L20Y,dtype=fp8_w8a8,block_shape=[128, 128].json → triton_3_1_0/E=256,N=128,device_name=NVIDIA_L20Y,dtype=fp8_w8a8,block_shape=[128, 128].json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=256,N=256,device_name=AMD_Instinct_MI300X,dtype=fp8_w8a8,block_shape=[128, 128].json → triton_3_1_0/E=256,N=256,device_name=AMD_Instinct_MI300X,dtype=fp8_w8a8,block_shape=[128, 128].json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=256,N=256,device_name=AMD_Instinct_MI325X,dtype=fp8_w8a8,block_shape=[128, 128].json → triton_3_1_0/E=256,N=256,device_name=AMD_Instinct_MI325X,dtype=fp8_w8a8,block_shape=[128, 128].json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=256,N=256,device_name=AMD_Radeon_Graphics,dtype=fp8_w8a8,block_shape=[128, 128].json → triton_3_1_0/E=256,N=256,device_name=AMD_Radeon_Graphics,dtype=fp8_w8a8,block_shape=[128, 128].json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=256,N=256,device_name=NVIDIA_B200,dtype=fp8_w8a8,block_shape=[128, 128].json → triton_3_1_0/E=256,N=256,device_name=NVIDIA_B200,dtype=fp8_w8a8,block_shape=[128, 128].json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=256,N=256,device_name=NVIDIA_H20,dtype=int8_w8a8,block_shape=[128, 128].json → triton_3_1_0/E=256,N=256,device_name=NVIDIA_H20,dtype=int8_w8a8,block_shape=[128, 128].json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=256,N=256,device_name=NVIDIA_H200,dtype=fp8_w8a8,block_shape=[128, 128].json → triton_3_1_0/E=256,N=256,device_name=NVIDIA_H200,dtype=fp8_w8a8,block_shape=[128, 128].json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=256,N=64,device_name=NVIDIA_A800-SXM4-80GB.json → triton_3_1_0/E=256,N=64,device_name=NVIDIA_A800-SXM4-80GB.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=256,N=64,device_name=NVIDIA_L20,dtype=int8_w8a8.json → triton_3_1_0/E=256,N=64,device_name=NVIDIA_L20,dtype=int8_w8a8.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=256,N=64,device_name=NVIDIA_L40S,dtype=int8_w8a8.json → triton_3_1_0/E=256,N=64,device_name=NVIDIA_L40S,dtype=int8_w8a8.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=64,N=1024,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8,block_shape=[128, 128].json → triton_3_1_0/E=64,N=1024,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8,block_shape=[128, 128].json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=64,N=1280,device_name=NVIDIA_A100-SXM4-80GB.json → triton_3_1_0/E=64,N=1280,device_name=NVIDIA_A100-SXM4-80GB.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=64,N=1280,device_name=NVIDIA_A800-SXM4-80GB.json → triton_3_1_0/E=64,N=1280,device_name=NVIDIA_A800-SXM4-80GB.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=64,N=1280,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8.json → triton_3_1_0/E=64,N=1280,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=64,N=1280,device_name=NVIDIA_H100_80GB_HBM3.json → triton_3_1_0/E=64,N=1280,device_name=NVIDIA_H100_80GB_HBM3.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=64,N=1280,device_name=NVIDIA_H200,dtype=fp8_w8a8.json → triton_3_1_0/E=64,N=1280,device_name=NVIDIA_H200,dtype=fp8_w8a8.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=64,N=1280,device_name=NVIDIA_H200.json → triton_3_1_0/E=64,N=1280,device_name=NVIDIA_H200.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=64,N=2560,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8.json → triton_3_1_0/E=64,N=2560,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=64,N=2560,device_name=NVIDIA_H200,dtype=fp8_w8a8.json → triton_3_1_0/E=64,N=2560,device_name=NVIDIA_H200,dtype=fp8_w8a8.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=64,N=2560,device_name=NVIDIA_H200.json → triton_3_1_0/E=64,N=2560,device_name=NVIDIA_H200.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=64,N=320,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8.json → triton_3_1_0/E=64,N=320,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=64,N=320,device_name=NVIDIA_H100_80GB_HBM3.json → triton_3_1_0/E=64,N=320,device_name=NVIDIA_H100_80GB_HBM3.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=64,N=320,device_name=NVIDIA_H200,dtype=fp8_w8a8.json → triton_3_1_0/E=64,N=320,device_name=NVIDIA_H200,dtype=fp8_w8a8.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=64,N=320,device_name=NVIDIA_H200.json → triton_3_1_0/E=64,N=320,device_name=NVIDIA_H200.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=64,N=512,device_name=NVIDIA_H200,dtype=fp8_w8a8,block_shape=[128, 128].json → triton_3_1_0/E=64,N=512,device_name=NVIDIA_H200,dtype=fp8_w8a8,block_shape=[128, 128].json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=64,N=640,device_name=NVIDIA_A100-SXM4-80GB.json → triton_3_1_0/E=64,N=640,device_name=NVIDIA_A100-SXM4-80GB.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=64,N=640,device_name=NVIDIA_A800-SXM4-80GB.json → triton_3_1_0/E=64,N=640,device_name=NVIDIA_A800-SXM4-80GB.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=64,N=640,device_name=NVIDIA_GeForce_RTX_4090,dtype=fp8_w8a8.json → triton_3_1_0/E=64,N=640,device_name=NVIDIA_GeForce_RTX_4090,dtype=fp8_w8a8.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=64,N=640,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8.json → triton_3_1_0/E=64,N=640,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=64,N=640,device_name=NVIDIA_H100_80GB_HBM3.json → triton_3_1_0/E=64,N=640,device_name=NVIDIA_H100_80GB_HBM3.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=64,N=640,device_name=NVIDIA_H200,dtype=fp8_w8a8.json → triton_3_1_0/E=64,N=640,device_name=NVIDIA_H200,dtype=fp8_w8a8.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=64,N=640,device_name=NVIDIA_H200.json → triton_3_1_0/E=64,N=640,device_name=NVIDIA_H200.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=8,N=14336,device_name=AMD_Instinct_MI300X.json → triton_3_1_0/E=8,N=14336,device_name=AMD_Instinct_MI300X.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=8,N=14336,device_name=AMD_Instinct_MI325X.json → triton_3_1_0/E=8,N=14336,device_name=AMD_Instinct_MI325X.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=8,N=14336,device_name=AMD_Radeon_Graphics.json → triton_3_1_0/E=8,N=14336,device_name=AMD_Radeon_Graphics.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=8,N=14336,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8.json → triton_3_1_0/E=8,N=14336,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=8,N=14336,device_name=NVIDIA_H200,dtype=fp8_w8a8.json → triton_3_1_0/E=8,N=14336,device_name=NVIDIA_H200,dtype=fp8_w8a8.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=8,N=14336,device_name=NVIDIA_H200.json → triton_3_1_0/E=8,N=14336,device_name=NVIDIA_H200.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=8,N=1792,device_name=AMD_Instinct_MI300X.json → triton_3_1_0/E=8,N=1792,device_name=AMD_Instinct_MI300X.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=8,N=1792,device_name=AMD_Instinct_MI325X.json → triton_3_1_0/E=8,N=1792,device_name=AMD_Instinct_MI325X.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=8,N=1792,device_name=AMD_Radeon_Graphics.json → triton_3_1_0/E=8,N=1792,device_name=AMD_Radeon_Graphics.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=8,N=1792,device_name=NVIDIA_A100-SXM4-40GB.json → triton_3_1_0/E=8,N=1792,device_name=NVIDIA_A100-SXM4-40GB.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=8,N=1792,device_name=NVIDIA_A100-SXM4-80GB.json → triton_3_1_0/E=8,N=1792,device_name=NVIDIA_A100-SXM4-80GB.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=8,N=1792,device_name=NVIDIA_H100_80GB_HBM3.json → triton_3_1_0/E=8,N=1792,device_name=NVIDIA_H100_80GB_HBM3.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=8,N=1792,device_name=NVIDIA_H200,dtype=fp8_w8a8.json → triton_3_1_0/E=8,N=1792,device_name=NVIDIA_H200,dtype=fp8_w8a8.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=8,N=1792,device_name=NVIDIA_H200.json → triton_3_1_0/E=8,N=1792,device_name=NVIDIA_H200.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=8,N=2048,device_name=NVIDIA_A100-SXM4-80GB.json → triton_3_1_0/E=8,N=2048,device_name=NVIDIA_A100-SXM4-80GB.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=8,N=2048,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8.json → triton_3_1_0/E=8,N=2048,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=8,N=2048,device_name=NVIDIA_H100_80GB_HBM3.json → triton_3_1_0/E=8,N=2048,device_name=NVIDIA_H100_80GB_HBM3.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=8,N=2048,device_name=NVIDIA_H200,dtype=fp8_w8a8.json → triton_3_1_0/E=8,N=2048,device_name=NVIDIA_H200,dtype=fp8_w8a8.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=8,N=2048,device_name=NVIDIA_H200.json → triton_3_1_0/E=8,N=2048,device_name=NVIDIA_H200.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=8,N=3584,device_name=AMD_Instinct_MI300X.json → triton_3_1_0/E=8,N=3584,device_name=AMD_Instinct_MI300X.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=8,N=3584,device_name=AMD_Instinct_MI325X.json → triton_3_1_0/E=8,N=3584,device_name=AMD_Instinct_MI325X.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=8,N=3584,device_name=AMD_Radeon_Graphics.json → triton_3_1_0/E=8,N=3584,device_name=AMD_Radeon_Graphics.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=8,N=3584,device_name=NVIDIA_A100-SXM4-40GB.json → triton_3_1_0/E=8,N=3584,device_name=NVIDIA_A100-SXM4-40GB.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=8,N=3584,device_name=NVIDIA_A100-SXM4-80GB.json → triton_3_1_0/E=8,N=3584,device_name=NVIDIA_A100-SXM4-80GB.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=8,N=3584,device_name=NVIDIA_GeForce_RTX_4090,dtype=fp8_w8a8.json → triton_3_1_0/E=8,N=3584,device_name=NVIDIA_GeForce_RTX_4090,dtype=fp8_w8a8.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=8,N=3584,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8.json → triton_3_1_0/E=8,N=3584,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=8,N=3584,device_name=NVIDIA_H100_80GB_HBM3.json → triton_3_1_0/E=8,N=3584,device_name=NVIDIA_H100_80GB_HBM3.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=8,N=3584,device_name=NVIDIA_H200,dtype=fp8_w8a8.json → triton_3_1_0/E=8,N=3584,device_name=NVIDIA_H200,dtype=fp8_w8a8.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=8,N=3584,device_name=NVIDIA_H200.json → triton_3_1_0/E=8,N=3584,device_name=NVIDIA_H200.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=8,N=3584,device_name=NVIDIA_L40S.json → triton_3_1_0/E=8,N=3584,device_name=NVIDIA_L40S.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=8,N=4096,device_name=AMD_Instinct_MI300X,dtype=fp8_w8a8.json → triton_3_1_0/E=8,N=4096,device_name=AMD_Instinct_MI300X,dtype=fp8_w8a8.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=8,N=4096,device_name=AMD_Instinct_MI325X,dtype=fp8_w8a8.json → triton_3_1_0/E=8,N=4096,device_name=AMD_Instinct_MI325X,dtype=fp8_w8a8.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=8,N=4096,device_name=AMD_Radeon_Graphics,dtype=fp8_w8a8.json → triton_3_1_0/E=8,N=4096,device_name=AMD_Radeon_Graphics,dtype=fp8_w8a8.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=8,N=4096,device_name=NVIDIA_A100-SXM4-80GB.json → triton_3_1_0/E=8,N=4096,device_name=NVIDIA_A100-SXM4-80GB.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=8,N=4096,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8.json → triton_3_1_0/E=8,N=4096,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=8,N=4096,device_name=NVIDIA_H100_80GB_HBM3.json → triton_3_1_0/E=8,N=4096,device_name=NVIDIA_H100_80GB_HBM3.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=8,N=4096,device_name=NVIDIA_H200,dtype=fp8_w8a8.json → triton_3_1_0/E=8,N=4096,device_name=NVIDIA_H200,dtype=fp8_w8a8.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=8,N=4096,device_name=NVIDIA_H200.json → triton_3_1_0/E=8,N=4096,device_name=NVIDIA_H200.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=8,N=7168,device_name=AMD_Instinct_MI300X.json → triton_3_1_0/E=8,N=7168,device_name=AMD_Instinct_MI300X.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=8,N=7168,device_name=AMD_Instinct_MI325X.json → triton_3_1_0/E=8,N=7168,device_name=AMD_Instinct_MI325X.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=8,N=7168,device_name=AMD_Radeon_Graphics.json → triton_3_1_0/E=8,N=7168,device_name=AMD_Radeon_Graphics.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=8,N=7168,device_name=NVIDIA_A100-SXM4-80GB.json → triton_3_1_0/E=8,N=7168,device_name=NVIDIA_A100-SXM4-80GB.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=8,N=7168,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8.json → triton_3_1_0/E=8,N=7168,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=8,N=7168,device_name=NVIDIA_H100_80GB_HBM3.json → triton_3_1_0/E=8,N=7168,device_name=NVIDIA_H100_80GB_HBM3.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=8,N=7168,device_name=NVIDIA_H200,dtype=fp8_w8a8.json → triton_3_1_0/E=8,N=7168,device_name=NVIDIA_H200,dtype=fp8_w8a8.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=8,N=7168,device_name=NVIDIA_H200.json → triton_3_1_0/E=8,N=7168,device_name=NVIDIA_H200.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=8,N=8192,device_name=AMD_Instinct_MI300X,dtype=fp8_w8a8.json → triton_3_1_0/E=8,N=8192,device_name=AMD_Instinct_MI300X,dtype=fp8_w8a8.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=8,N=8192,device_name=AMD_Instinct_MI325X,dtype=fp8_w8a8.json → triton_3_1_0/E=8,N=8192,device_name=AMD_Instinct_MI325X,dtype=fp8_w8a8.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=8,N=8192,device_name=AMD_Radeon_Graphics,dtype=fp8_w8a8.json → triton_3_1_0/E=8,N=8192,device_name=AMD_Radeon_Graphics,dtype=fp8_w8a8.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=8,N=8192,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8.json → triton_3_1_0/E=8,N=8192,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=8,N=8192,device_name=NVIDIA_H200,dtype=fp8_w8a8.json → triton_3_1_0/E=8,N=8192,device_name=NVIDIA_H200,dtype=fp8_w8a8.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=128,N=192,device_name=NVIDIA_A800-SXM4-80GB.json → triton_3_2_0/E=128,N=192,device_name=NVIDIA_A800-SXM4-80GB.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=128,N=192,device_name=NVIDIA_H100_80GB_HBM3.json → triton_3_2_0/E=128,N=192,device_name=NVIDIA_H100_80GB_HBM3.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=128,N=192,device_name=NVIDIA_H20.json → triton_3_2_0/E=128,N=192,device_name=NVIDIA_H20.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=128,N=192,device_name=NVIDIA_H200.json → triton_3_2_0/E=128,N=192,device_name=NVIDIA_H200.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=128,N=384,device_name=NVIDIA_H100_80GB_HBM3.json → triton_3_2_0/E=128,N=384,device_name=NVIDIA_H100_80GB_HBM3.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=128,N=384,device_name=NVIDIA_H20,dtype=fp8_w8a8,block_shape=[128, 128].json → triton_3_2_0/E=128,N=384,device_name=NVIDIA_H20,dtype=fp8_w8a8,block_shape=[128, 128].json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=128,N=384,device_name=NVIDIA_H20.json → triton_3_2_0/E=128,N=384,device_name=NVIDIA_H20.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=128,N=384,device_name=NVIDIA_H200,dtype=fp8_w8a8,block_shape=[128, 128].json → triton_3_2_0/E=128,N=384,device_name=NVIDIA_H200,dtype=fp8_w8a8,block_shape=[128, 128].json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=128,N=384,device_name=NVIDIA_H200.json → triton_3_2_0/E=128,N=384,device_name=NVIDIA_H200.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=128,N=512,device_name=NVIDIA_H100_80GB_HBM3.json → triton_3_2_0/E=128,N=512,device_name=NVIDIA_H100_80GB_HBM3.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=128,N=768,device_name=NVIDIA_A800-SXM4-80GB.json → triton_3_2_0/E=128,N=768,device_name=NVIDIA_A800-SXM4-80GB.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=128,N=768,device_name=NVIDIA_H100_80GB_HBM3.json → triton_3_2_0/E=128,N=768,device_name=NVIDIA_H100_80GB_HBM3.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=128,N=768,device_name=NVIDIA_H20,dtype=fp8_w8a8,block_shape=[128, 128].json → triton_3_2_0/E=128,N=768,device_name=NVIDIA_H20,dtype=fp8_w8a8,block_shape=[128, 128].json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=128,N=768,device_name=NVIDIA_H20.json → triton_3_2_0/E=128,N=768,device_name=NVIDIA_H20.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=128,N=768,device_name=NVIDIA_H200,dtype=fp8_w8a8,block_shape=[128, 128].json → triton_3_2_0/E=128,N=768,device_name=NVIDIA_H200,dtype=fp8_w8a8,block_shape=[128, 128].json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=128,N=768,device_name=NVIDIA_H200.json → triton_3_2_0/E=128,N=768,device_name=NVIDIA_H200.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=128,N=96,device_name=NVIDIA_H20.json → triton_3_2_0/E=128,N=96,device_name=NVIDIA_H20.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=264,N=128,device_name=NVIDIA_A800-SXM4-80GB,dtype=int8_w8a8.json → triton_3_2_0/E=264,N=128,device_name=NVIDIA_A800-SXM4-80GB,dtype=int8_w8a8.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=264,N=256,device_name=NVIDIA_B200,dtype=fp8_w8a8,block_shape=[128, 128].json → triton_3_2_0/E=264,N=256,device_name=NVIDIA_B200,dtype=fp8_w8a8,block_shape=[128, 128].json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=264,N=256,device_name=NVIDIA_H20,dtype=fp8_w8a8,block_shape=[128, 128].json → triton_3_2_0/E=264,N=256,device_name=NVIDIA_H20,dtype=fp8_w8a8,block_shape=[128, 128].json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=264,N=256,device_name=NVIDIA_H200,dtype=fp8_w8a8,block_shape=[128, 128].json → triton_3_2_0/E=264,N=256,device_name=NVIDIA_H200,dtype=fp8_w8a8,block_shape=[128, 128].json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=272,N=128,device_name=NVIDIA_A800-SXM4-80GB,dtype=int8_w8a8.json → triton_3_2_0/E=272,N=128,device_name=NVIDIA_A800-SXM4-80GB,dtype=int8_w8a8.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=272,N=128,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8,block_shape=[128, 128].json → triton_3_2_0/E=272,N=128,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8,block_shape=[128, 128].json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=272,N=128,device_name=NVIDIA_H20,dtype=fp8_w8a8,block_shape=[128, 128].json → triton_3_2_0/E=272,N=128,device_name=NVIDIA_H20,dtype=fp8_w8a8,block_shape=[128, 128].json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=272,N=64,device_name=NVIDIA_A800-SXM4-80GB.json → triton_3_2_0/E=272,N=64,device_name=NVIDIA_A800-SXM4-80GB.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=288,N=64,device_name=NVIDIA_A800-SXM4-80GB.json → triton_3_2_0/E=288,N=64,device_name=NVIDIA_A800-SXM4-80GB.json} +0 -0
- {sglang-0.4.6.post5.dist-info → sglang-0.4.7.post1.dist-info}/licenses/LICENSE +0 -0
- {sglang-0.4.6.post5.dist-info → sglang-0.4.7.post1.dist-info}/top_level.txt +0 -0
@@ -23,19 +23,29 @@ from sglang.srt.model_executor.forward_batch_info import (
     ForwardMode,
 )
 from sglang.srt.server_args import ServerArgs
+from sglang.srt.speculative.build_eagle_tree import build_tree_kernel_efficient
 from sglang.srt.speculative.eagle_draft_cuda_graph_runner import (
     EAGLEDraftCudaGraphRunner,
 )
+from sglang.srt.speculative.eagle_draft_extend_cuda_graph_runner import (
+    EAGLEDraftExtendCudaGraphRunner,
+)
 from sglang.srt.speculative.eagle_utils import (
     EagleDraftInput,
     EagleVerifyInput,
     EagleVerifyOutput,
     assign_draft_cache_locs,
+    fast_topk,
     generate_token_bitmask,
     select_top_k_tokens,
 )
 from sglang.srt.speculative.spec_info import SpeculativeAlgorithm
-from sglang.srt.utils import
+from sglang.srt.utils import (
+    empty_context,
+    get_available_gpu_memory,
+    is_cuda,
+    next_power_of_2,
+)

 if is_cuda():
     from sgl_kernel import segment_packbits
@@ -66,7 +76,6 @@ class EAGLEWorker(TpModelWorker):
         self.server_args = server_args
         self.topk = server_args.speculative_eagle_topk
         self.speculative_num_steps = server_args.speculative_num_steps
-        self.padded_static_len = self.speculative_num_steps + 1
         self.enable_nan_detection = server_args.enable_nan_detection
         self.gpu_id = gpu_id
         self.device = server_args.device
@@ -75,6 +84,7 @@ class EAGLEWorker(TpModelWorker):
         self.speculative_algorithm = SpeculativeAlgorithm.from_string(
             server_args.speculative_algorithm
         )
+        self.padded_static_len = -1

         # Override context length with target model's context length
         server_args.context_length = target_worker.model_runner.model_config.context_len
@@ -148,11 +158,18 @@ class EAGLEWorker(TpModelWorker):
         self.init_attention_backend()
         self.init_cuda_graphs()

+        # Some dummy tensors
+        self.num_new_pages_per_topk = torch.empty(
+            (), dtype=torch.int64, device=self.device
+        )
+        self.extend_lens = torch.empty((), dtype=torch.int64, device=self.device)
+
     def init_attention_backend(self):
         # Create multi-step attn backends and cuda graph runners
         if self.server_args.attention_backend == "flashinfer":
             if not global_server_args_dict["use_mla_backend"]:
                 from sglang.srt.layers.attention.flashinfer_backend import (
+                    FlashInferAttnBackend,
                     FlashInferMultiStepDraftBackend,
                 )

@@ -161,8 +178,13 @@ class EAGLEWorker(TpModelWorker):
                     self.topk,
                     self.speculative_num_steps,
                 )
+                self.draft_extend_attn_backend = FlashInferAttnBackend(
+                    self.draft_model_runner,
+                    skip_prefill=False,
+                )
             else:
                 from sglang.srt.layers.attention.flashinfer_mla_backend import (
+                    FlashInferMLAAttnBackend,
                     FlashInferMLAMultiStepDraftBackend,
                 )

@@ -171,11 +193,14 @@ class EAGLEWorker(TpModelWorker):
                     self.topk,
                     self.speculative_num_steps,
                 )
-
-
+                self.draft_extend_attn_backend = FlashInferMLAAttnBackend(
+                    self.draft_model_runner,
+                    skip_prefill=False,
+                )
             self.has_prefill_wrapper_verify = True
         elif self.server_args.attention_backend == "triton":
             from sglang.srt.layers.attention.triton_backend import (
+                TritonAttnBackend,
                 TritonMultiStepDraftBackend,
             )

@@ -184,11 +209,14 @@ class EAGLEWorker(TpModelWorker):
                 self.topk,
                 self.speculative_num_steps,
             )
-            self.draft_extend_attn_backend =
-
+            self.draft_extend_attn_backend = TritonAttnBackend(
+                self.draft_model_runner,
+                skip_prefill=False,
+            )
             self.has_prefill_wrapper_verify = False
         elif self.server_args.attention_backend == "fa3":
             from sglang.srt.layers.attention.flashattention_backend import (
+                FlashAttentionBackend,
                 FlashAttentionMultiStepBackend,
             )

@@ -197,8 +225,10 @@ class EAGLEWorker(TpModelWorker):
                 self.topk,
                 self.speculative_num_steps,
             )
-            self.draft_extend_attn_backend =
-
+            self.draft_extend_attn_backend = FlashAttentionBackend(
+                self.draft_model_runner,
+                skip_prefill=False,
+            )
             self.has_prefill_wrapper_verify = False
         elif self.server_args.attention_backend == "flashmla":
             from sglang.srt.layers.attention.flashmla_backend import (
@@ -211,7 +241,6 @@ class EAGLEWorker(TpModelWorker):
                 self.speculative_num_steps,
             )
             self.draft_extend_attn_backend = None
-            self.padded_static_len = self.speculative_num_steps + 1
             self.has_prefill_wrapper_verify = False
         else:
             raise ValueError(
@@ -237,12 +266,23 @@ class EAGLEWorker(TpModelWorker):
         self.cuda_graph_runner = EAGLEDraftCudaGraphRunner(self)
         after_mem = get_available_gpu_memory(self.device, self.gpu_id)
         logger.info(
-            f"Capture draft cuda graph end. Time elapsed: {time.perf_counter() - tic:.2f} s.
+            f"Capture draft cuda graph end. Time elapsed: {time.perf_counter() - tic:.2f} s. mem usage={(before_mem - after_mem):.2f} GB. avail mem={after_mem:.2f} GB."
         )

         # Capture extend
         if self.draft_extend_attn_backend:
-
+            tic = time.perf_counter()
+            before_mem = get_available_gpu_memory(self.device, self.gpu_id)
+            logger.info(
+                f"Capture draft extend cuda graph begin. This can take up to several minutes. avail mem={before_mem:.2f} GB"
+            )
+            self.cuda_graph_runner_for_draft_extend = EAGLEDraftExtendCudaGraphRunner(
+                self
+            )
+            after_mem = get_available_gpu_memory(self.device, self.gpu_id)
+            logger.info(
+                f"Capture draft extend cuda graph end. Time elapsed: {time.perf_counter() - tic:.2f} s. mem usage={(before_mem - after_mem):.2f} GB. avail mem={after_mem:.2f} GB."
+            )

     @property
     def draft_model_runner(self):
@@ -288,10 +328,12 @@ class EAGLEWorker(TpModelWorker):

             return logits_output, next_token_ids, model_worker_batch.bid, 0, False
         else:
-            logits_output, next_token_ids, bid =
+            logits_output, next_token_ids, bid, seq_lens_cpu = (
+                self.forward_target_extend(batch)
+            )
             with self.draft_tp_context(self.draft_model_runner.tp_group):
                 self.forward_draft_extend(
-                    batch, logits_output.hidden_states, next_token_ids
+                    batch, logits_output.hidden_states, next_token_ids, seq_lens_cpu
                 )
             return logits_output, next_token_ids, bid, 0, False

@@ -315,7 +357,12 @@ class EAGLEWorker(TpModelWorker):
         logits_output, next_token_ids, _ = self.target_worker.forward_batch_generation(
             model_worker_batch
         )
-        return
+        return (
+            logits_output,
+            next_token_ids,
+            model_worker_batch.bid,
+            model_worker_batch.seq_lens_cpu,
+        )

     def draft(self, batch: ScheduleBatch):
         # Parse args
@@ -330,14 +377,21 @@ class EAGLEWorker(TpModelWorker):
         )

         # Allocate cache locations
+        # Layout of the out_cache_loc
+        # [ topk 0 ] [ topk 1 ]
+        # [iter=0, iter=1, iter=2] [iter=0, iter=1, iter=2]
         if self.page_size == 1:
             out_cache_loc, token_to_kv_pool_state_backup = batch.alloc_token_slots(
-                num_seqs * self.
+                num_seqs * self.speculative_num_steps * self.topk, backup_state=True
             )
         else:
             if self.topk == 1:
-                prefix_lens =
-
+                prefix_lens, seq_lens, last_loc = get_last_loc_large_page_size_top_k_1(
+                    batch.req_to_token_pool.req_to_token,
+                    batch.req_pool_indices,
+                    batch.seq_lens,
+                    self.speculative_num_steps,
+                )
                 extend_num_tokens = num_seqs * self.speculative_num_steps
             else:
                 # In this case, the last partial page needs to be duplicated.
@@ -350,29 +404,33 @@ class EAGLEWorker(TpModelWorker):
                 # "x" means speculative draft tokens
                 # "." means padded tokens

-                # TODO:
-
-
-
-
-
-
-
-
-
-
-
+                # TODO(lmzheng): The current implementation is still a fake support
+                # for page size > 1. In the `assign_draft_cache_locs` below,
+                # we directly move the indices instead of the real kv cache.
+                # This only works when the kernel backend runs with page size = 1.
+                # If the kernel backend runs with page size > 1, we need to
+                # duplicate the real KV cache. The overhead of duplicating KV
+                # cache seems okay because the draft KV cache only has one layer.
+                # see a related copy operation in MHATokenToKVPool::move_kv_cache.
+
+                (
+                    prefix_lens,
+                    seq_lens,
+                    last_loc,
+                    self.num_new_pages_per_topk,
+                    self.extend_lens,
+                ) = get_last_loc_large_page_size_large_top_k(
+                    batch.req_to_token_pool.req_to_token,
+                    batch.req_pool_indices,
+                    batch.seq_lens,
+                    self.speculative_num_steps,
+                    self.topk,
+                    self.page_size,
                 )
-
-                #
-
-
-                last_loc = get_last_loc(
-                    batch.req_to_token_pool.req_to_token,
-                    batch.req_pool_indices,
-                    prefix_lens,
-                )
+
+                # TODO(lmzheng): remove this device sync
+                extend_num_tokens = torch.sum(self.extend_lens).item()
+
             out_cache_loc, token_to_kv_pool_state_backup = (
                 batch.alloc_paged_token_slots_extend(
                     prefix_lens,
@@ -387,18 +445,30 @@ class EAGLEWorker(TpModelWorker):
             batch.req_pool_indices,
             batch.req_to_token_pool.req_to_token,
             batch.seq_lens,
+            self.extend_lens,
+            self.num_new_pages_per_topk,
             out_cache_loc,
             batch.req_to_token_pool.req_to_token.shape[1],
             self.topk,
             self.speculative_num_steps,
             self.page_size,
+            next_power_of_2(num_seqs),
+            next_power_of_2(self.speculative_num_steps),
         )
+
+        if self.page_size > 1 and self.topk > 1:
+            # Remove padded slots
+            out_cache_loc = out_cache_loc[
+                : num_seqs * self.topk * self.speculative_num_steps
+            ]
+
         batch.out_cache_loc = out_cache_loc
         batch.seq_lens_sum = torch.sum(batch.seq_lens).item()
+        batch.return_hidden_states = False
         spec_info.positions = batch.seq_lens.repeat_interleave(self.topk, dim=0)
+        spec_info.capture_hidden_mode = CaptureHiddenMode.LAST

         # Get forward batch
-        spec_info.capture_hidden_mode = CaptureHiddenMode.LAST
         model_worker_batch = batch.get_model_worker_batch()
         forward_batch = ForwardBatch.init_new(
             model_worker_batch, self.draft_model_runner
@@ -413,15 +483,19 @@ class EAGLEWorker(TpModelWorker):
         else:
             # Initialize attention backend
             self.draft_attn_backend.init_forward_metadata(forward_batch)
-            forward_batch = ForwardBatch.init_new(
-                model_worker_batch, self.draft_model_runner
-            )
             # Run forward steps
             score_list, token_list, parents_list = self.draft_forward(forward_batch)

         self.token_to_kv_pool_allocator.restore_state(token_to_kv_pool_state_backup)

-
+        (
+            tree_mask,
+            position,
+            retrive_index,
+            retrive_next_token,
+            retrive_next_sibling,
+            draft_tokens,
+        ) = build_tree_kernel_efficient(
             spec_info.verified_id,
             score_list,
             token_list,
@@ -432,7 +506,22 @@ class EAGLEWorker(TpModelWorker):
             self.speculative_num_steps,
             self.server_args.speculative_num_draft_tokens,
         )
-
+
+        return EagleVerifyInput(
+            draft_token=draft_tokens,
+            custom_mask=tree_mask,
+            positions=position,
+            retrive_index=retrive_index,
+            retrive_next_token=retrive_next_token,
+            retrive_next_sibling=retrive_next_sibling,
+            retrive_cum_len=None,
+            spec_steps=self.speculative_num_steps,
+            topk=self.topk,
+            draft_token_num=self.server_args.speculative_num_draft_tokens,
+            capture_hidden_mode=CaptureHiddenMode.FULL,
+            seq_lens_sum=forward_batch.seq_lens_sum,
+            seq_lens_cpu=forward_batch.seq_lens_cpu,
+        )

     def draft_forward(self, forward_batch: ForwardBatch):
         # Parse args
@@ -446,6 +535,13 @@ class EAGLEWorker(TpModelWorker):
         if self.hot_token_id is not None:
             topk_index = self.hot_token_id[topk_index]

+        out_cache_loc = out_cache_loc.reshape(
+            forward_batch.batch_size, self.topk, self.speculative_num_steps
+        )
+        out_cache_loc = out_cache_loc.permute((2, 0, 1)).reshape(
+            self.speculative_num_steps, -1
+        )
+
         # Return values
         score_list: List[torch.Tensor] = []
         token_list: List[torch.Tensor] = []
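A small standalone sketch (not part of the diff) of what the reshape/permute added above does, assuming illustrative shapes: cache slots are allocated grouped by sequence and topk branch, and the permute regroups them so that row i holds the slots consumed at draft iteration i.

import torch

batch_size, topk, num_steps = 2, 2, 3
flat = torch.arange(batch_size * topk * num_steps)  # allocation order: [seq][topk][iter]
per_step = (
    flat.reshape(batch_size, topk, num_steps)  # (seq, topk, iter)
    .permute(2, 0, 1)                          # (iter, seq, topk)
    .reshape(num_steps, -1)                    # row i -> slots written at draft step i
)
print(per_step)
# tensor([[ 0,  3,  6,  9],
#         [ 1,  4,  7, 10],
#         [ 2,  5,  8, 11]])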
@@ -467,10 +563,7 @@ class EAGLEWorker(TpModelWorker):

             # Set inputs
             forward_batch.input_ids = input_ids
-            out_cache_loc = out_cache_loc
-            forward_batch.out_cache_loc = out_cache_loc[
-                :, self.topk * i : self.topk * (i + 1)
-            ].flatten()
+            forward_batch.out_cache_loc = out_cache_loc[i]
             forward_batch.positions.add_(1)
             forward_batch.attn_backend = self.draft_attn_backend.attn_backends[i]
             spec_info.hidden_states = hidden_states
@@ -490,9 +583,13 @@ class EAGLEWorker(TpModelWorker):

     def verify(self, batch: ScheduleBatch, spec_info: EagleVerifyInput):
         spec_info.prepare_for_verify(batch, self.page_size)
+        batch.return_hidden_states = False
         batch.forward_mode = ForwardMode.TARGET_VERIFY
         batch.spec_info = spec_info
-        model_worker_batch = batch.get_model_worker_batch(
+        model_worker_batch = batch.get_model_worker_batch(
+            seq_lens_cpu_cache=spec_info.seq_lens_cpu
+        )
+        assert model_worker_batch.capture_hidden_mode == spec_info.capture_hidden_mode

         if batch.has_grammar:
             retrieve_next_token_cpu = spec_info.retrive_next_token.cpu()
@@ -524,7 +621,7 @@ class EAGLEWorker(TpModelWorker):
         if vocab_mask is not None:
             assert spec_info.grammar is not None
             vocab_mask = vocab_mask.to(spec_info.retrive_next_token.device)
-            # otherwise, this vocab mask will be the one from the previous extend stage
+            # NOTE (sk): otherwise, this vocab mask will be the one from the previous extend stage
             # and will be applied to produce wrong results
             batch.sampling_info.vocab_mask = None

@@ -545,13 +642,13 @@ class EAGLEWorker(TpModelWorker):
         ]
         logits_output.hidden_states = logits_output.hidden_states[res.accepted_indices]

+        if batch.return_logprob:
+            self.add_logprob_values(batch, res, logits_output)
+
         # Prepare the batch for the next draft forwards.
         batch.forward_mode = ForwardMode.DECODE
         batch.spec_info = res.draft_input

-        if batch.return_logprob:
-            self.add_logprob_values(batch, res, logits_output)
-
         return logits_output, res, model_worker_batch, can_run_cuda_graph

     def add_logprob_values(
@@ -564,8 +661,16 @@ class EAGLEWorker(TpModelWorker):
         logits_output = res.logits_output
         top_logprobs_nums = batch.top_logprobs_nums
         token_ids_logprobs = batch.token_ids_logprobs
+        accepted_indices = res.accepted_indices
+        assert len(accepted_indices) == len(logits_output.next_token_logits)
+        temperatures = batch.sampling_info.temperatures
+        num_draft_tokens = batch.spec_info.draft_token_num
+        # acceptance indices are the indices in a "flattened" batch.
+        # dividing it to num_draft_tokens will yield the actual batch index.
+        temperatures = temperatures[accepted_indices // num_draft_tokens]
+
         logprobs = torch.nn.functional.log_softmax(
-            logits_output.next_token_logits, dim=-1
+            logits_output.next_token_logits / temperatures, dim=-1
         )
         batch_next_token_ids = res.verified_id
         num_tokens_per_req = [accept + 1 for accept in res.accept_length_per_req_cpu]
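For reference, a minimal sketch of the index arithmetic used above, with assumed shapes rather than SGLang internals: accepted-token positions live in a flattened batch of size batch_size * num_draft_tokens, so integer division by num_draft_tokens recovers the owning request, whose temperature then scales the logits before log_softmax.

import torch

num_draft_tokens = 4
temperatures = torch.tensor([[0.7], [1.0]])       # one temperature per request, shape (bs, 1)
accepted_indices = torch.tensor([0, 2, 5])        # positions in the flattened draft-token batch
req_index = accepted_indices // num_draft_tokens  # -> tensor([0, 0, 1])
per_token_temp = temperatures[req_index]          # -> [[0.7], [0.7], [1.0]]

logits = torch.randn(3, 32000)                    # logits of the accepted tokens
logprobs = torch.log_softmax(logits / per_token_temp, dim=-1)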
@@ -600,7 +705,7 @@ class EAGLEWorker(TpModelWorker):
         pt = 0
         next_token_logprobs = logits_output.next_token_logprobs.tolist()
         verified_ids = batch_next_token_ids.tolist()
-        for req, num_tokens in zip(batch.reqs, num_tokens_per_req):
+        for req, num_tokens in zip(batch.reqs, num_tokens_per_req, strict=True):
             for _ in range(num_tokens):
                 if req.return_logprob:
                     req.output_token_logprobs_val.append(next_token_logprobs[pt])
@@ -619,6 +724,7 @@ class EAGLEWorker(TpModelWorker):
         batch: ScheduleBatch,
         hidden_states: torch.Tensor,
         next_token_ids: List[int],
+        seq_lens_cpu: torch.Tensor,
     ):
         """Run draft model extend. This API modifies the states of the batch.

@@ -631,9 +737,12 @@ class EAGLEWorker(TpModelWorker):
             hidden_states=hidden_states,
             verified_id=next_token_ids,
         )
+        batch.return_hidden_states = False
         batch.spec_info.prepare_for_extend(batch)
         batch.spec_info.capture_hidden_mode = CaptureHiddenMode.LAST
-        model_worker_batch = batch.get_model_worker_batch(
+        model_worker_batch = batch.get_model_worker_batch(
+            seq_lens_cpu_cache=seq_lens_cpu
+        )
         forward_batch = ForwardBatch.init_new(
             model_worker_batch, self.draft_model_runner
         )
@@ -652,23 +761,41 @@ class EAGLEWorker(TpModelWorker):
         return_logprob_backup = batch.return_logprob

         # Prepare metadata
-        batch.forward_mode = ForwardMode.DRAFT_EXTEND
         batch.spec_info.prepare_extend_after_decode(
             batch,
             self.speculative_num_steps,
         )
-        batch.spec_info.capture_hidden_mode = CaptureHiddenMode.LAST
-        batch.return_logprob = False
         model_worker_batch = batch.get_model_worker_batch()
         forward_batch = ForwardBatch.init_new(
             model_worker_batch, self.draft_model_runner
         )
+        if forward_batch.seq_lens_cpu is not None:
+            forward_batch.seq_lens_sum = forward_batch.seq_lens_cpu.sum().item()
+        else:
+            forward_batch.seq_lens_sum = batch.seq_lens.sum().item()

         # Run
-
+        can_cuda_graph = (
+            self.cuda_graph_runner_for_draft_extend
+            and self.cuda_graph_runner_for_draft_extend.can_run(forward_batch)
+        )
+        if can_cuda_graph:
+            logits_output = self.cuda_graph_runner_for_draft_extend.replay(
+                forward_batch
+            )
+            forward_batch.spec_info.topk_p, forward_batch.spec_info.topk_index = (
+                logits_output.topk_p,
+                logits_output.topk_index,
+            )
+            forward_batch.spec_info.hidden_states = logits_output.hidden_states
+        else:
+            self.draft_model_runner.attn_backend.init_forward_metadata(forward_batch)
+            logits_output = self.draft_model_runner.model.forward(
+                forward_batch.input_ids, forward_batch.positions, forward_batch
+            )
+            self.capture_for_decode(logits_output, forward_batch.spec_info)

         self._detect_nan_if_needed(logits_output)
-        self.capture_for_decode(logits_output, forward_batch.spec_info)

         # Restore backup.
         # This is because `seq_lens` can be modified in `prepare_extend_after_decode`
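The control flow added above follows the usual capture-once, replay-if-possible pattern. A hedged sketch with illustrative names (not SGLang's API):

def run_draft_extend(graph_runner, forward_batch, eager_forward):
    # Replay a pre-captured CUDA graph only when a runner exists and the batch
    # matches a captured shape; otherwise fall back to eager execution, which
    # must (re)build attention metadata before the forward pass.
    if graph_runner is not None and graph_runner.can_run(forward_batch):
        return graph_runner.replay(forward_batch)
    return eager_forward(forward_batch)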
@@ -701,4 +828,48 @@ def load_token_map(token_map_path: str) -> List[int]:
     )
     token_map_path = os.path.join(cache_dir, os.path.basename(token_map_path))
     hot_token_id = torch.load(token_map_path, weights_only=True)
-    return torch.tensor(hot_token_id, dtype=torch.
+    return torch.tensor(hot_token_id, dtype=torch.int64)
+
+
+@torch.compile(dynamic=True)
+def get_last_loc_large_page_size_top_k_1(
+    req_to_token: torch.Tensor,
+    req_pool_indices: torch.Tensor,
+    seq_lens,
+    speculative_num_steps: int,
+):
+    prefix_lens = seq_lens
+    seq_lens = prefix_lens + speculative_num_steps
+    last_loc = get_last_loc(
+        req_to_token,
+        req_pool_indices,
+        prefix_lens,
+    )
+    return prefix_lens, seq_lens, last_loc
+
+
+@torch.compile(dynamic=True)
+def get_last_loc_large_page_size_large_top_k(
+    req_to_token: torch.Tensor,
+    req_pool_indices: torch.Tensor,
+    seq_lens: torch.Tensor,
+    speculative_num_steps: int,
+    topk: int,
+    page_size: int,
+):
+    prefix_lens = seq_lens
+    last_page_lens = prefix_lens % page_size
+    num_new_pages_per_topk = (
+        last_page_lens + speculative_num_steps + page_size - 1
+    ) // page_size
+    seq_lens = prefix_lens // page_size * page_size + num_new_pages_per_topk * (
+        page_size * topk
+    )
+    extend_lens = seq_lens - prefix_lens
+    last_loc = get_last_loc(
+        req_to_token,
+        req_pool_indices,
+        prefix_lens,
+    )
+
+    return prefix_lens, seq_lens, last_loc, num_new_pages_per_topk, extend_lens