sglang 0.4.6.post5__py3-none-any.whl → 0.4.7__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- sglang/bench_offline_throughput.py +10 -4
- sglang/bench_one_batch_server.py +67 -11
- sglang/bench_serving.py +85 -74
- sglang/lang/backend/runtime_endpoint.py +24 -1
- sglang/profiler.py +167 -0
- sglang/srt/_custom_ops.py +34 -0
- sglang/srt/configs/internvl.py +8 -12
- sglang/srt/configs/model_config.py +27 -1
- sglang/srt/constrained/base_grammar_backend.py +5 -2
- sglang/srt/constrained/llguidance_backend.py +9 -8
- sglang/srt/constrained/outlines_backend.py +5 -4
- sglang/srt/constrained/xgrammar_backend.py +18 -18
- sglang/srt/conversation.py +46 -8
- sglang/srt/custom_op.py +38 -3
- sglang/srt/debug_utils.py +74 -0
- sglang/srt/disaggregation/common/__init__.py +1 -0
- sglang/srt/disaggregation/common/conn.py +407 -0
- sglang/srt/disaggregation/decode.py +67 -3
- sglang/srt/disaggregation/fake/conn.py +1 -0
- sglang/srt/disaggregation/kv_events.py +60 -5
- sglang/srt/disaggregation/launch_lb.py +140 -0
- sglang/srt/disaggregation/mini_lb.py +29 -48
- sglang/srt/disaggregation/mooncake/conn.py +432 -140
- sglang/srt/disaggregation/mooncake/transfer_engine.py +32 -16
- sglang/srt/disaggregation/nixl/conn.py +124 -432
- sglang/srt/disaggregation/prefill.py +2 -0
- sglang/srt/disaggregation/utils.py +38 -1
- sglang/srt/distributed/device_communicators/pymscclpp.py +315 -0
- sglang/srt/distributed/parallel_state.py +52 -5
- sglang/srt/entrypoints/EngineBase.py +6 -0
- sglang/srt/entrypoints/engine.py +102 -5
- sglang/srt/entrypoints/http_server.py +15 -2
- sglang/srt/function_call/base_format_detector.py +138 -86
- sglang/srt/function_call/deepseekv3_detector.py +54 -6
- sglang/srt/function_call/ebnf_composer.py +33 -19
- sglang/srt/function_call/function_call_parser.py +27 -0
- sglang/srt/function_call/llama32_detector.py +33 -14
- sglang/srt/function_call/mistral_detector.py +73 -26
- sglang/srt/function_call/pythonic_detector.py +86 -20
- sglang/srt/function_call/qwen25_detector.py +64 -10
- sglang/srt/function_call/utils.py +17 -0
- sglang/srt/hf_transformers_utils.py +4 -0
- sglang/srt/layers/attention/aiter_backend.py +488 -123
- sglang/srt/layers/attention/base_attn_backend.py +4 -0
- sglang/srt/layers/attention/cutlass_mla_backend.py +2 -19
- sglang/srt/layers/attention/flashattention_backend.py +103 -18
- sglang/srt/layers/attention/flashinfer_backend.py +45 -1
- sglang/srt/layers/attention/flashinfer_mla_backend.py +37 -1
- sglang/srt/layers/attention/intel_amx_backend.py +128 -0
- sglang/srt/layers/attention/tbo_backend.py +232 -0
- sglang/srt/layers/attention/torch_native_backend.py +3 -0
- sglang/srt/layers/attention/triton_backend.py +244 -5
- sglang/srt/layers/attention/triton_ops/extend_attention.py +12 -4
- sglang/srt/layers/communicator.py +260 -194
- sglang/srt/layers/dp_attention.py +6 -5
- sglang/srt/layers/layernorm.py +30 -19
- sglang/srt/layers/moe/cutlass_moe.py +170 -7
- sglang/srt/layers/moe/cutlass_moe_params.py +169 -0
- sglang/srt/layers/moe/ep_moe/kernels.py +27 -6
- sglang/srt/layers/moe/ep_moe/layer.py +94 -40
- sglang/srt/layers/moe/ep_moe/token_dispatcher.py +13 -8
- sglang/srt/layers/moe/fused_moe_native.py +4 -0
- sglang/srt/layers/moe/fused_moe_triton/configs/triton_3_2_0/E=257,N=128,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8,block_shape=[128, 128].json +146 -0
- sglang/srt/layers/moe/fused_moe_triton/configs/triton_3_2_0/E=257,N=128,device_name=NVIDIA_H20,dtype=fp8_w8a8,block_shape=[128, 128].json +146 -0
- sglang/srt/layers/moe/fused_moe_triton/configs/triton_3_2_0/E=257,N=256,device_name=NVIDIA_B200,dtype=fp8_w8a8,block_shape=[128, 128].json +146 -0
- sglang/srt/layers/moe/fused_moe_triton/configs/triton_3_2_0/E=257,N=256,device_name=NVIDIA_H20,dtype=fp8_w8a8,block_shape=[128, 128].json +146 -0
- sglang/srt/layers/moe/fused_moe_triton/configs/triton_3_2_0/E=257,N=256,device_name=NVIDIA_H200,dtype=fp8_w8a8,block_shape=[128, 128].json +146 -0
- sglang/srt/layers/moe/fused_moe_triton/configs/triton_3_2_0/E=8,N=7168,device_name=NVIDIA_H100_80GB_HBM3.json +146 -0
- sglang/srt/layers/moe/fused_moe_triton/configs/triton_3_3_1/E=257,N=256,device_name=NVIDIA_B200,dtype=fp8_w8a8,block_shape=[128, 128].json +146 -0
- sglang/srt/layers/moe/fused_moe_triton/configs/triton_3_3_1/E=8,N=7168,device_name=NVIDIA_H100_80GB_HBM3.json +146 -0
- sglang/srt/layers/moe/fused_moe_triton/fused_moe.py +220 -25
- sglang/srt/layers/moe/fused_moe_triton/layer.py +34 -4
- sglang/srt/layers/moe/topk.py +44 -18
- sglang/srt/layers/multimodal.py +3 -3
- sglang/srt/layers/quantization/__init__.py +3 -2
- sglang/srt/layers/quantization/blockwise_int8.py +3 -0
- sglang/srt/layers/quantization/compressed_tensors/compressed_tensors_moe.py +5 -0
- sglang/srt/layers/quantization/deep_gemm.py +55 -56
- sglang/srt/layers/quantization/fp8.py +28 -23
- sglang/srt/layers/quantization/fp8_kernel.py +118 -66
- sglang/srt/layers/quantization/fp8_utils.py +165 -49
- sglang/srt/layers/quantization/modelopt_quant.py +334 -7
- sglang/srt/layers/quantization/moe_wna16.py +3 -0
- sglang/srt/layers/quantization/w8a8_fp8.py +3 -0
- sglang/srt/layers/quantization/w8a8_int8.py +3 -0
- sglang/srt/layers/rotary_embedding.py +6 -12
- sglang/srt/layers/sampler.py +80 -79
- sglang/srt/layers/utils.py +6 -0
- sglang/srt/lora/layers.py +12 -15
- sglang/srt/lora/lora.py +49 -5
- sglang/srt/lora/lora_manager.py +19 -5
- sglang/srt/lora/mem_pool.py +24 -16
- sglang/srt/lora/utils.py +17 -13
- sglang/srt/managers/data_parallel_controller.py +13 -5
- sglang/srt/managers/eplb_algorithms/__init__.py +63 -0
- sglang/srt/managers/eplb_algorithms/deepseek.py +223 -0
- sglang/srt/managers/{deepseek_eplb.py → eplb_algorithms/deepseek_vec.py} +5 -7
- sglang/srt/managers/eplb_manager.py +55 -14
- sglang/srt/managers/expert_distribution.py +220 -46
- sglang/srt/managers/expert_location.py +110 -56
- sglang/srt/managers/expert_location_dispatch.py +23 -6
- sglang/srt/managers/io_struct.py +15 -4
- sglang/srt/managers/mm_utils.py +88 -38
- sglang/srt/managers/multimodal_processors/base_processor.py +188 -16
- sglang/srt/managers/multimodal_processors/gemma3.py +4 -31
- sglang/srt/managers/multimodal_processors/internvl.py +4 -0
- sglang/srt/managers/multimodal_processors/kimi_vl.py +15 -34
- sglang/srt/managers/multimodal_processors/minicpm.py +2 -1
- sglang/srt/managers/multimodal_processors/phi4mm.py +87 -0
- sglang/srt/managers/multimodal_processors/qwen_vl.py +22 -64
- sglang/srt/managers/schedule_batch.py +140 -38
- sglang/srt/managers/scheduler.py +305 -112
- sglang/srt/managers/tokenizer_manager.py +134 -17
- sglang/srt/managers/utils.py +0 -4
- sglang/srt/metrics/collector.py +9 -0
- sglang/srt/model_executor/cuda_graph_runner.py +72 -61
- sglang/srt/model_executor/expert_location_updater.py +157 -22
- sglang/srt/model_executor/forward_batch_info.py +38 -17
- sglang/srt/model_executor/model_runner.py +96 -56
- sglang/srt/model_loader/utils.py +67 -1
- sglang/srt/models/deepseek_nextn.py +1 -1
- sglang/srt/models/deepseek_v2.py +609 -234
- sglang/srt/models/gemma3_causal.py +7 -0
- sglang/srt/models/gemma3_mm.py +19 -14
- sglang/srt/models/idefics2.py +342 -0
- sglang/srt/models/kimi_vl.py +4 -4
- sglang/srt/models/llama.py +1 -1
- sglang/srt/models/minicpmo.py +2 -5
- sglang/srt/models/minicpmv.py +3 -295
- sglang/srt/models/phi4mm.py +512 -0
- sglang/srt/models/qwen2.py +38 -9
- sglang/srt/models/qwen2_5_vl.py +3 -9
- sglang/srt/models/qwen2_eagle.py +4 -1
- sglang/srt/models/qwen2_moe.py +58 -191
- sglang/srt/models/qwen2_vl.py +3 -9
- sglang/srt/models/qwen3.py +41 -10
- sglang/srt/models/qwen3_moe.py +230 -191
- sglang/srt/models/registry.py +9 -1
- sglang/srt/models/transformers.py +291 -0
- sglang/srt/openai_api/adapter.py +86 -24
- sglang/srt/openai_api/protocol.py +31 -2
- sglang/srt/openai_api/utils.py +172 -0
- sglang/srt/operations.py +37 -2
- sglang/srt/operations_strategy.py +200 -24
- sglang/srt/sampling/sampling_batch_info.py +13 -1
- sglang/srt/sampling/sampling_params.py +2 -1
- sglang/srt/server_args.py +114 -27
- sglang/srt/speculative/build_eagle_tree.py +8 -8
- sglang/srt/speculative/eagle_draft_cuda_graph_runner.py +8 -11
- sglang/srt/speculative/eagle_draft_extend_cuda_graph_runner.py +253 -0
- sglang/srt/speculative/eagle_utils.py +51 -91
- sglang/srt/speculative/eagle_worker.py +101 -21
- sglang/srt/two_batch_overlap.py +635 -0
- sglang/srt/utils.py +129 -7
- sglang/test/runners.py +16 -7
- sglang/test/send_one.py +4 -0
- sglang/test/test_cutlass_moe.py +3 -3
- sglang/test/test_fp4_moe.py +248 -0
- sglang/test/test_utils.py +79 -6
- sglang/version.py +1 -1
- {sglang-0.4.6.post5.dist-info → sglang-0.4.7.dist-info}/METADATA +14 -11
- {sglang-0.4.6.post5.dist-info → sglang-0.4.7.dist-info}/RECORD +318 -291
- {sglang-0.4.6.post5.dist-info → sglang-0.4.7.dist-info}/WHEEL +1 -1
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=1,N=14336,device_name=NVIDIA_A100-SXM4-80GB,dtype=int8_w8a16.json → triton_3_1_0/E=1,N=14336,device_name=NVIDIA_A100-SXM4-80GB,dtype=int8_w8a16.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=1,N=14336,device_name=NVIDIA_A100-SXM4-80GB.json → triton_3_1_0/E=1,N=14336,device_name=NVIDIA_A100-SXM4-80GB.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=1,N=1792,device_name=NVIDIA_A100-SXM4-80GB,dtype=int8_w8a16.json → triton_3_1_0/E=1,N=1792,device_name=NVIDIA_A100-SXM4-80GB,dtype=int8_w8a16.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=1,N=1792,device_name=NVIDIA_A100-SXM4-80GB.json → triton_3_1_0/E=1,N=1792,device_name=NVIDIA_A100-SXM4-80GB.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=1,N=3072,device_name=NVIDIA_A100-SXM4-80GB,dtype=int8_w8a16.json → triton_3_1_0/E=1,N=3072,device_name=NVIDIA_A100-SXM4-80GB,dtype=int8_w8a16.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=1,N=3072,device_name=NVIDIA_H100_80GB_HBM3,dtype=int8_w8a16.json → triton_3_1_0/E=1,N=3072,device_name=NVIDIA_H100_80GB_HBM3,dtype=int8_w8a16.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=1,N=3072,device_name=NVIDIA_H100_80GB_HBM3.json → triton_3_1_0/E=1,N=3072,device_name=NVIDIA_H100_80GB_HBM3.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=1,N=3584,device_name=NVIDIA_A100-SXM4-80GB,dtype=int8_w8a16.json → triton_3_1_0/E=1,N=3584,device_name=NVIDIA_A100-SXM4-80GB,dtype=int8_w8a16.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=1,N=3584,device_name=NVIDIA_A100-SXM4-80GB.json → triton_3_1_0/E=1,N=3584,device_name=NVIDIA_A100-SXM4-80GB.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=1,N=7168,device_name=NVIDIA_A100-SXM4-80GB,dtype=int8_w8a16.json → triton_3_1_0/E=1,N=7168,device_name=NVIDIA_A100-SXM4-80GB,dtype=int8_w8a16.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=1,N=7168,device_name=NVIDIA_A100-SXM4-80GB.json → triton_3_1_0/E=1,N=7168,device_name=NVIDIA_A100-SXM4-80GB.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=144,N=512,device_name=NVIDIA_H100_80GB_HBM3.json → triton_3_1_0/E=144,N=512,device_name=NVIDIA_H100_80GB_HBM3.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=16,N=1024,device_name=NVIDIA_H100_80GB_HBM3.json → triton_3_1_0/E=16,N=1024,device_name=NVIDIA_H100_80GB_HBM3.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=16,N=1024,device_name=NVIDIA_H200.json → triton_3_1_0/E=16,N=1024,device_name=NVIDIA_H200.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=16,N=1344,device_name=NVIDIA_A100-SXM4-40GB.json → triton_3_1_0/E=16,N=1344,device_name=NVIDIA_A100-SXM4-40GB.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=16,N=1344,device_name=NVIDIA_A100-SXM4-80GB.json → triton_3_1_0/E=16,N=1344,device_name=NVIDIA_A100-SXM4-80GB.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=16,N=1344,device_name=NVIDIA_H100_80GB_HBM3.json → triton_3_1_0/E=16,N=1344,device_name=NVIDIA_H100_80GB_HBM3.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=16,N=14336,device_name=NVIDIA_A100-SXM4-80GB,dtype=int8_w8a16.json → triton_3_1_0/E=16,N=14336,device_name=NVIDIA_A100-SXM4-80GB,dtype=int8_w8a16.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=16,N=14336,device_name=NVIDIA_A100-SXM4-80GB.json → triton_3_1_0/E=16,N=14336,device_name=NVIDIA_A100-SXM4-80GB.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=16,N=1792,device_name=NVIDIA_A100-SXM4-80GB,dtype=int8_w8a16.json → triton_3_1_0/E=16,N=1792,device_name=NVIDIA_A100-SXM4-80GB,dtype=int8_w8a16.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=16,N=1792,device_name=NVIDIA_A100-SXM4-80GB.json → triton_3_1_0/E=16,N=1792,device_name=NVIDIA_A100-SXM4-80GB.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=16,N=2048,device_name=NVIDIA_H100_80GB_HBM3.json → triton_3_1_0/E=16,N=2048,device_name=NVIDIA_H100_80GB_HBM3.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=16,N=2688,device_name=NVIDIA_A100-SXM4-80GB.json → triton_3_1_0/E=16,N=2688,device_name=NVIDIA_A100-SXM4-80GB.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=16,N=2688,device_name=NVIDIA_H100_80GB_HBM3.json → triton_3_1_0/E=16,N=2688,device_name=NVIDIA_H100_80GB_HBM3.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=16,N=3072,device_name=NVIDIA_A100-SXM4-80GB,dtype=int8_w8a16.json → triton_3_1_0/E=16,N=3072,device_name=NVIDIA_A100-SXM4-80GB,dtype=int8_w8a16.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=16,N=3072,device_name=NVIDIA_H100_80GB_HBM3,dtype=int8_w8a16.json → triton_3_1_0/E=16,N=3072,device_name=NVIDIA_H100_80GB_HBM3,dtype=int8_w8a16.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=16,N=3200,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8.json → triton_3_1_0/E=16,N=3200,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=16,N=3584,device_name=NVIDIA_A100-SXM4-80GB,dtype=int8_w8a16.json → triton_3_1_0/E=16,N=3584,device_name=NVIDIA_A100-SXM4-80GB,dtype=int8_w8a16.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=16,N=3584,device_name=NVIDIA_A100-SXM4-80GB.json → triton_3_1_0/E=16,N=3584,device_name=NVIDIA_A100-SXM4-80GB.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=16,N=6400,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8.json → triton_3_1_0/E=16,N=6400,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=16,N=7168,device_name=NVIDIA_A100-SXM4-80GB,dtype=int8_w8a16.json → triton_3_1_0/E=16,N=7168,device_name=NVIDIA_A100-SXM4-80GB,dtype=int8_w8a16.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=16,N=7168,device_name=NVIDIA_A100-SXM4-80GB.json → triton_3_1_0/E=16,N=7168,device_name=NVIDIA_A100-SXM4-80GB.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=16,N=7168,device_name=NVIDIA_H100_80GB_HBM3,dtype=int8_w8a16.json → triton_3_1_0/E=16,N=7168,device_name=NVIDIA_H100_80GB_HBM3,dtype=int8_w8a16.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=16,N=800,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8.json → triton_3_1_0/E=16,N=800,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=160,N=192,device_name=NVIDIA_A800-SXM4-80GB.json → triton_3_1_0/E=160,N=192,device_name=NVIDIA_A800-SXM4-80GB.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=20,N=2048,device_name=NVIDIA_H100_80GB_HBM3.json → triton_3_1_0/E=20,N=2048,device_name=NVIDIA_H100_80GB_HBM3.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=24,N=1024,device_name=NVIDIA_H100_80GB_HBM3.json → triton_3_1_0/E=24,N=1024,device_name=NVIDIA_H100_80GB_HBM3.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=256,N=128,device_name=NVIDIA_A100-SXM4-80GB,dtype=int8_w8a8,block_shape=[128, 128].json → triton_3_1_0/E=256,N=128,device_name=NVIDIA_A100-SXM4-80GB,dtype=int8_w8a8,block_shape=[128, 128].json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=256,N=128,device_name=NVIDIA_A100-SXM4-80GB,dtype=int8_w8a8.json → triton_3_1_0/E=256,N=128,device_name=NVIDIA_A100-SXM4-80GB,dtype=int8_w8a8.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=256,N=128,device_name=NVIDIA_A800-SXM4-80GB,dtype=int8_w8a8,block_shape=[128, 128].json → triton_3_1_0/E=256,N=128,device_name=NVIDIA_A800-SXM4-80GB,dtype=int8_w8a8,block_shape=[128, 128].json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=256,N=128,device_name=NVIDIA_A800-SXM4-80GB,dtype=int8_w8a8.json → triton_3_1_0/E=256,N=128,device_name=NVIDIA_A800-SXM4-80GB,dtype=int8_w8a8.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=256,N=128,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8,block_shape=[128, 128].json → triton_3_1_0/E=256,N=128,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8,block_shape=[128, 128].json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=256,N=128,device_name=NVIDIA_H20,block_shape=[128, 128].json → triton_3_1_0/E=256,N=128,device_name=NVIDIA_H20,block_shape=[128, 128].json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=256,N=128,device_name=NVIDIA_H20,dtype=fp8_w8a8,block_shape=[128, 128].json → triton_3_1_0/E=256,N=128,device_name=NVIDIA_H20,dtype=fp8_w8a8,block_shape=[128, 128].json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=256,N=128,device_name=NVIDIA_L20Y,dtype=fp8_w8a8,block_shape=[128, 128].json → triton_3_1_0/E=256,N=128,device_name=NVIDIA_L20Y,dtype=fp8_w8a8,block_shape=[128, 128].json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=256,N=256,device_name=AMD_Instinct_MI300X,dtype=fp8_w8a8,block_shape=[128, 128].json → triton_3_1_0/E=256,N=256,device_name=AMD_Instinct_MI300X,dtype=fp8_w8a8,block_shape=[128, 128].json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=256,N=256,device_name=AMD_Instinct_MI325X,dtype=fp8_w8a8,block_shape=[128, 128].json → triton_3_1_0/E=256,N=256,device_name=AMD_Instinct_MI325X,dtype=fp8_w8a8,block_shape=[128, 128].json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=256,N=256,device_name=AMD_Radeon_Graphics,dtype=fp8_w8a8,block_shape=[128, 128].json → triton_3_1_0/E=256,N=256,device_name=AMD_Radeon_Graphics,dtype=fp8_w8a8,block_shape=[128, 128].json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=256,N=256,device_name=NVIDIA_B200,dtype=fp8_w8a8,block_shape=[128, 128].json → triton_3_1_0/E=256,N=256,device_name=NVIDIA_B200,dtype=fp8_w8a8,block_shape=[128, 128].json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=256,N=256,device_name=NVIDIA_H20,dtype=int8_w8a8,block_shape=[128, 128].json → triton_3_1_0/E=256,N=256,device_name=NVIDIA_H20,dtype=int8_w8a8,block_shape=[128, 128].json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=256,N=256,device_name=NVIDIA_H200,dtype=fp8_w8a8,block_shape=[128, 128].json → triton_3_1_0/E=256,N=256,device_name=NVIDIA_H200,dtype=fp8_w8a8,block_shape=[128, 128].json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=256,N=64,device_name=NVIDIA_A800-SXM4-80GB.json → triton_3_1_0/E=256,N=64,device_name=NVIDIA_A800-SXM4-80GB.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=256,N=64,device_name=NVIDIA_L20,dtype=int8_w8a8.json → triton_3_1_0/E=256,N=64,device_name=NVIDIA_L20,dtype=int8_w8a8.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=256,N=64,device_name=NVIDIA_L40S,dtype=int8_w8a8.json → triton_3_1_0/E=256,N=64,device_name=NVIDIA_L40S,dtype=int8_w8a8.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=64,N=1024,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8,block_shape=[128, 128].json → triton_3_1_0/E=64,N=1024,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8,block_shape=[128, 128].json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=64,N=1280,device_name=NVIDIA_A100-SXM4-80GB.json → triton_3_1_0/E=64,N=1280,device_name=NVIDIA_A100-SXM4-80GB.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=64,N=1280,device_name=NVIDIA_A800-SXM4-80GB.json → triton_3_1_0/E=64,N=1280,device_name=NVIDIA_A800-SXM4-80GB.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=64,N=1280,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8.json → triton_3_1_0/E=64,N=1280,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=64,N=1280,device_name=NVIDIA_H100_80GB_HBM3.json → triton_3_1_0/E=64,N=1280,device_name=NVIDIA_H100_80GB_HBM3.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=64,N=1280,device_name=NVIDIA_H200,dtype=fp8_w8a8.json → triton_3_1_0/E=64,N=1280,device_name=NVIDIA_H200,dtype=fp8_w8a8.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=64,N=1280,device_name=NVIDIA_H200.json → triton_3_1_0/E=64,N=1280,device_name=NVIDIA_H200.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=64,N=2560,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8.json → triton_3_1_0/E=64,N=2560,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=64,N=2560,device_name=NVIDIA_H200,dtype=fp8_w8a8.json → triton_3_1_0/E=64,N=2560,device_name=NVIDIA_H200,dtype=fp8_w8a8.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=64,N=2560,device_name=NVIDIA_H200.json → triton_3_1_0/E=64,N=2560,device_name=NVIDIA_H200.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=64,N=320,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8.json → triton_3_1_0/E=64,N=320,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=64,N=320,device_name=NVIDIA_H100_80GB_HBM3.json → triton_3_1_0/E=64,N=320,device_name=NVIDIA_H100_80GB_HBM3.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=64,N=320,device_name=NVIDIA_H200,dtype=fp8_w8a8.json → triton_3_1_0/E=64,N=320,device_name=NVIDIA_H200,dtype=fp8_w8a8.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=64,N=320,device_name=NVIDIA_H200.json → triton_3_1_0/E=64,N=320,device_name=NVIDIA_H200.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=64,N=512,device_name=NVIDIA_H200,dtype=fp8_w8a8,block_shape=[128, 128].json → triton_3_1_0/E=64,N=512,device_name=NVIDIA_H200,dtype=fp8_w8a8,block_shape=[128, 128].json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=64,N=640,device_name=NVIDIA_A100-SXM4-80GB.json → triton_3_1_0/E=64,N=640,device_name=NVIDIA_A100-SXM4-80GB.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=64,N=640,device_name=NVIDIA_A800-SXM4-80GB.json → triton_3_1_0/E=64,N=640,device_name=NVIDIA_A800-SXM4-80GB.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=64,N=640,device_name=NVIDIA_GeForce_RTX_4090,dtype=fp8_w8a8.json → triton_3_1_0/E=64,N=640,device_name=NVIDIA_GeForce_RTX_4090,dtype=fp8_w8a8.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=64,N=640,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8.json → triton_3_1_0/E=64,N=640,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=64,N=640,device_name=NVIDIA_H100_80GB_HBM3.json → triton_3_1_0/E=64,N=640,device_name=NVIDIA_H100_80GB_HBM3.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=64,N=640,device_name=NVIDIA_H200,dtype=fp8_w8a8.json → triton_3_1_0/E=64,N=640,device_name=NVIDIA_H200,dtype=fp8_w8a8.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=64,N=640,device_name=NVIDIA_H200.json → triton_3_1_0/E=64,N=640,device_name=NVIDIA_H200.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=8,N=14336,device_name=AMD_Instinct_MI300X.json → triton_3_1_0/E=8,N=14336,device_name=AMD_Instinct_MI300X.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=8,N=14336,device_name=AMD_Instinct_MI325X.json → triton_3_1_0/E=8,N=14336,device_name=AMD_Instinct_MI325X.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=8,N=14336,device_name=AMD_Radeon_Graphics.json → triton_3_1_0/E=8,N=14336,device_name=AMD_Radeon_Graphics.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=8,N=14336,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8.json → triton_3_1_0/E=8,N=14336,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=8,N=14336,device_name=NVIDIA_H200,dtype=fp8_w8a8.json → triton_3_1_0/E=8,N=14336,device_name=NVIDIA_H200,dtype=fp8_w8a8.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=8,N=14336,device_name=NVIDIA_H200.json → triton_3_1_0/E=8,N=14336,device_name=NVIDIA_H200.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=8,N=1792,device_name=AMD_Instinct_MI300X.json → triton_3_1_0/E=8,N=1792,device_name=AMD_Instinct_MI300X.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=8,N=1792,device_name=AMD_Instinct_MI325X.json → triton_3_1_0/E=8,N=1792,device_name=AMD_Instinct_MI325X.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=8,N=1792,device_name=AMD_Radeon_Graphics.json → triton_3_1_0/E=8,N=1792,device_name=AMD_Radeon_Graphics.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=8,N=1792,device_name=NVIDIA_A100-SXM4-40GB.json → triton_3_1_0/E=8,N=1792,device_name=NVIDIA_A100-SXM4-40GB.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=8,N=1792,device_name=NVIDIA_A100-SXM4-80GB.json → triton_3_1_0/E=8,N=1792,device_name=NVIDIA_A100-SXM4-80GB.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=8,N=1792,device_name=NVIDIA_H100_80GB_HBM3.json → triton_3_1_0/E=8,N=1792,device_name=NVIDIA_H100_80GB_HBM3.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=8,N=1792,device_name=NVIDIA_H200,dtype=fp8_w8a8.json → triton_3_1_0/E=8,N=1792,device_name=NVIDIA_H200,dtype=fp8_w8a8.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=8,N=1792,device_name=NVIDIA_H200.json → triton_3_1_0/E=8,N=1792,device_name=NVIDIA_H200.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=8,N=2048,device_name=NVIDIA_A100-SXM4-80GB.json → triton_3_1_0/E=8,N=2048,device_name=NVIDIA_A100-SXM4-80GB.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=8,N=2048,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8.json → triton_3_1_0/E=8,N=2048,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=8,N=2048,device_name=NVIDIA_H100_80GB_HBM3.json → triton_3_1_0/E=8,N=2048,device_name=NVIDIA_H100_80GB_HBM3.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=8,N=2048,device_name=NVIDIA_H200,dtype=fp8_w8a8.json → triton_3_1_0/E=8,N=2048,device_name=NVIDIA_H200,dtype=fp8_w8a8.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=8,N=2048,device_name=NVIDIA_H200.json → triton_3_1_0/E=8,N=2048,device_name=NVIDIA_H200.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=8,N=3584,device_name=AMD_Instinct_MI300X.json → triton_3_1_0/E=8,N=3584,device_name=AMD_Instinct_MI300X.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=8,N=3584,device_name=AMD_Instinct_MI325X.json → triton_3_1_0/E=8,N=3584,device_name=AMD_Instinct_MI325X.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=8,N=3584,device_name=AMD_Radeon_Graphics.json → triton_3_1_0/E=8,N=3584,device_name=AMD_Radeon_Graphics.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=8,N=3584,device_name=NVIDIA_A100-SXM4-40GB.json → triton_3_1_0/E=8,N=3584,device_name=NVIDIA_A100-SXM4-40GB.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=8,N=3584,device_name=NVIDIA_A100-SXM4-80GB.json → triton_3_1_0/E=8,N=3584,device_name=NVIDIA_A100-SXM4-80GB.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=8,N=3584,device_name=NVIDIA_GeForce_RTX_4090,dtype=fp8_w8a8.json → triton_3_1_0/E=8,N=3584,device_name=NVIDIA_GeForce_RTX_4090,dtype=fp8_w8a8.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=8,N=3584,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8.json → triton_3_1_0/E=8,N=3584,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=8,N=3584,device_name=NVIDIA_H100_80GB_HBM3.json → triton_3_1_0/E=8,N=3584,device_name=NVIDIA_H100_80GB_HBM3.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=8,N=3584,device_name=NVIDIA_H200,dtype=fp8_w8a8.json → triton_3_1_0/E=8,N=3584,device_name=NVIDIA_H200,dtype=fp8_w8a8.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=8,N=3584,device_name=NVIDIA_H200.json → triton_3_1_0/E=8,N=3584,device_name=NVIDIA_H200.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=8,N=3584,device_name=NVIDIA_L40S.json → triton_3_1_0/E=8,N=3584,device_name=NVIDIA_L40S.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=8,N=4096,device_name=AMD_Instinct_MI300X,dtype=fp8_w8a8.json → triton_3_1_0/E=8,N=4096,device_name=AMD_Instinct_MI300X,dtype=fp8_w8a8.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=8,N=4096,device_name=AMD_Instinct_MI325X,dtype=fp8_w8a8.json → triton_3_1_0/E=8,N=4096,device_name=AMD_Instinct_MI325X,dtype=fp8_w8a8.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=8,N=4096,device_name=AMD_Radeon_Graphics,dtype=fp8_w8a8.json → triton_3_1_0/E=8,N=4096,device_name=AMD_Radeon_Graphics,dtype=fp8_w8a8.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=8,N=4096,device_name=NVIDIA_A100-SXM4-80GB.json → triton_3_1_0/E=8,N=4096,device_name=NVIDIA_A100-SXM4-80GB.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=8,N=4096,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8.json → triton_3_1_0/E=8,N=4096,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=8,N=4096,device_name=NVIDIA_H100_80GB_HBM3.json → triton_3_1_0/E=8,N=4096,device_name=NVIDIA_H100_80GB_HBM3.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=8,N=4096,device_name=NVIDIA_H200,dtype=fp8_w8a8.json → triton_3_1_0/E=8,N=4096,device_name=NVIDIA_H200,dtype=fp8_w8a8.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=8,N=4096,device_name=NVIDIA_H200.json → triton_3_1_0/E=8,N=4096,device_name=NVIDIA_H200.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=8,N=7168,device_name=AMD_Instinct_MI300X.json → triton_3_1_0/E=8,N=7168,device_name=AMD_Instinct_MI300X.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=8,N=7168,device_name=AMD_Instinct_MI325X.json → triton_3_1_0/E=8,N=7168,device_name=AMD_Instinct_MI325X.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=8,N=7168,device_name=AMD_Radeon_Graphics.json → triton_3_1_0/E=8,N=7168,device_name=AMD_Radeon_Graphics.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=8,N=7168,device_name=NVIDIA_A100-SXM4-80GB.json → triton_3_1_0/E=8,N=7168,device_name=NVIDIA_A100-SXM4-80GB.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=8,N=7168,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8.json → triton_3_1_0/E=8,N=7168,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=8,N=7168,device_name=NVIDIA_H100_80GB_HBM3.json → triton_3_1_0/E=8,N=7168,device_name=NVIDIA_H100_80GB_HBM3.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=8,N=7168,device_name=NVIDIA_H200,dtype=fp8_w8a8.json → triton_3_1_0/E=8,N=7168,device_name=NVIDIA_H200,dtype=fp8_w8a8.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=8,N=7168,device_name=NVIDIA_H200.json → triton_3_1_0/E=8,N=7168,device_name=NVIDIA_H200.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=8,N=8192,device_name=AMD_Instinct_MI300X,dtype=fp8_w8a8.json → triton_3_1_0/E=8,N=8192,device_name=AMD_Instinct_MI300X,dtype=fp8_w8a8.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=8,N=8192,device_name=AMD_Instinct_MI325X,dtype=fp8_w8a8.json → triton_3_1_0/E=8,N=8192,device_name=AMD_Instinct_MI325X,dtype=fp8_w8a8.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=8,N=8192,device_name=AMD_Radeon_Graphics,dtype=fp8_w8a8.json → triton_3_1_0/E=8,N=8192,device_name=AMD_Radeon_Graphics,dtype=fp8_w8a8.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=8,N=8192,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8.json → triton_3_1_0/E=8,N=8192,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=8,N=8192,device_name=NVIDIA_H200,dtype=fp8_w8a8.json → triton_3_1_0/E=8,N=8192,device_name=NVIDIA_H200,dtype=fp8_w8a8.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=128,N=192,device_name=NVIDIA_A800-SXM4-80GB.json → triton_3_2_0/E=128,N=192,device_name=NVIDIA_A800-SXM4-80GB.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=128,N=192,device_name=NVIDIA_H100_80GB_HBM3.json → triton_3_2_0/E=128,N=192,device_name=NVIDIA_H100_80GB_HBM3.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=128,N=192,device_name=NVIDIA_H20.json → triton_3_2_0/E=128,N=192,device_name=NVIDIA_H20.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=128,N=192,device_name=NVIDIA_H200.json → triton_3_2_0/E=128,N=192,device_name=NVIDIA_H200.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=128,N=384,device_name=NVIDIA_H100_80GB_HBM3.json → triton_3_2_0/E=128,N=384,device_name=NVIDIA_H100_80GB_HBM3.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=128,N=384,device_name=NVIDIA_H20,dtype=fp8_w8a8,block_shape=[128, 128].json → triton_3_2_0/E=128,N=384,device_name=NVIDIA_H20,dtype=fp8_w8a8,block_shape=[128, 128].json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=128,N=384,device_name=NVIDIA_H20.json → triton_3_2_0/E=128,N=384,device_name=NVIDIA_H20.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=128,N=384,device_name=NVIDIA_H200,dtype=fp8_w8a8,block_shape=[128, 128].json → triton_3_2_0/E=128,N=384,device_name=NVIDIA_H200,dtype=fp8_w8a8,block_shape=[128, 128].json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=128,N=384,device_name=NVIDIA_H200.json → triton_3_2_0/E=128,N=384,device_name=NVIDIA_H200.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=128,N=512,device_name=NVIDIA_H100_80GB_HBM3.json → triton_3_2_0/E=128,N=512,device_name=NVIDIA_H100_80GB_HBM3.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=128,N=768,device_name=NVIDIA_A800-SXM4-80GB.json → triton_3_2_0/E=128,N=768,device_name=NVIDIA_A800-SXM4-80GB.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=128,N=768,device_name=NVIDIA_H100_80GB_HBM3.json → triton_3_2_0/E=128,N=768,device_name=NVIDIA_H100_80GB_HBM3.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=128,N=768,device_name=NVIDIA_H20,dtype=fp8_w8a8,block_shape=[128, 128].json → triton_3_2_0/E=128,N=768,device_name=NVIDIA_H20,dtype=fp8_w8a8,block_shape=[128, 128].json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=128,N=768,device_name=NVIDIA_H20.json → triton_3_2_0/E=128,N=768,device_name=NVIDIA_H20.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=128,N=768,device_name=NVIDIA_H200,dtype=fp8_w8a8,block_shape=[128, 128].json → triton_3_2_0/E=128,N=768,device_name=NVIDIA_H200,dtype=fp8_w8a8,block_shape=[128, 128].json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=128,N=768,device_name=NVIDIA_H200.json → triton_3_2_0/E=128,N=768,device_name=NVIDIA_H200.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=128,N=96,device_name=NVIDIA_H20.json → triton_3_2_0/E=128,N=96,device_name=NVIDIA_H20.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=264,N=128,device_name=NVIDIA_A800-SXM4-80GB,dtype=int8_w8a8.json → triton_3_2_0/E=264,N=128,device_name=NVIDIA_A800-SXM4-80GB,dtype=int8_w8a8.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=264,N=256,device_name=NVIDIA_B200,dtype=fp8_w8a8,block_shape=[128, 128].json → triton_3_2_0/E=264,N=256,device_name=NVIDIA_B200,dtype=fp8_w8a8,block_shape=[128, 128].json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=264,N=256,device_name=NVIDIA_H20,dtype=fp8_w8a8,block_shape=[128, 128].json → triton_3_2_0/E=264,N=256,device_name=NVIDIA_H20,dtype=fp8_w8a8,block_shape=[128, 128].json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=264,N=256,device_name=NVIDIA_H200,dtype=fp8_w8a8,block_shape=[128, 128].json → triton_3_2_0/E=264,N=256,device_name=NVIDIA_H200,dtype=fp8_w8a8,block_shape=[128, 128].json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=272,N=128,device_name=NVIDIA_A800-SXM4-80GB,dtype=int8_w8a8.json → triton_3_2_0/E=272,N=128,device_name=NVIDIA_A800-SXM4-80GB,dtype=int8_w8a8.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=272,N=128,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8,block_shape=[128, 128].json → triton_3_2_0/E=272,N=128,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8,block_shape=[128, 128].json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=272,N=128,device_name=NVIDIA_H20,dtype=fp8_w8a8,block_shape=[128, 128].json → triton_3_2_0/E=272,N=128,device_name=NVIDIA_H20,dtype=fp8_w8a8,block_shape=[128, 128].json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=272,N=64,device_name=NVIDIA_A800-SXM4-80GB.json → triton_3_2_0/E=272,N=64,device_name=NVIDIA_A800-SXM4-80GB.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=288,N=64,device_name=NVIDIA_A800-SXM4-80GB.json → triton_3_2_0/E=288,N=64,device_name=NVIDIA_A800-SXM4-80GB.json} +0 -0
- {sglang-0.4.6.post5.dist-info → sglang-0.4.7.dist-info}/licenses/LICENSE +0 -0
- {sglang-0.4.6.post5.dist-info → sglang-0.4.7.dist-info}/top_level.txt +0 -0
sglang/srt/operations.py
CHANGED
@@ -12,7 +12,7 @@ if _ENABLE_PROFILE:


def execute_operations(inputs, operations):
-    stages = _convert_operations_to_stages(
+    stages = _convert_operations_to_stages(operations)
    executor = _StageExecutor("primary", stages, inputs=inputs)
    for _ in range(executor.num_stages):
        executor.next()
@@ -20,6 +20,37 @@ def execute_operations(inputs, operations):
    return executor.output


+def execute_overlapped_operations(
+    inputs_arr: Sequence,
+    operations_arr: Sequence,
+    delta_stages: Sequence[int],
+) -> Sequence:
+    # Make it explicit for clarity; if we need multi-batch overlap, this can be generalized
+    inputs_a, inputs_b = inputs_arr
+    operations_a, operations_b = operations_arr
+    delta_stage_a, delta_stage_b = delta_stages
+    assert delta_stage_a == 0
+    delta_stage = delta_stage_b
+
+    stages_a = _convert_operations_to_stages(operations_a)
+    stages_b = _convert_operations_to_stages(operations_b)
+    executor_a = _StageExecutor("a", stages_a, inputs=inputs_a)
+    executor_b = _StageExecutor("b", stages_b, inputs=inputs_b)
+
+    for _ in range(delta_stage):
+        executor_a.next()
+
+    for _ in range(executor_a.num_stages - delta_stage):
+        executor_a.next()
+        executor_b.next()
+
+    for _ in range(delta_stage):
+        executor_b.next()
+
+    assert executor_a.done and executor_b.done
+    return [executor_a.output, executor_b.output]
+
+
class YieldOperation:
    pass

@@ -109,6 +140,9 @@ class _StateDict:
        for k, v in values.items():
            setattr(self, k, v)

+    def get(self, item):
+        return self._data.get(item)
+
    def clear(self, expect_keys: Sequence[str]):
        if set(self._data.keys()) != set(expect_keys):
            raise Exception(
@@ -119,6 +153,7 @@ class _StateDict:


def _convert_operations_to_stages(operations: List[Operation]) -> List[Stage]:
+    operations = _decorate_operations(operations)
    operation_chunks = list(
        _chunk_by_separator(operations, lambda op: isinstance(op, YieldOperation))
    )
@@ -140,7 +175,7 @@ def _chunk_by_separator(
    yield pending_items


-def
+def _decorate_operations(operations: List[Operation], debug_name_prefix: str = ""):
    return [_decorate_operation(op, debug_name_prefix) for op in operations]

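The new execute_operations fix and the new execute_overlapped_operations are the core of two-batch overlap: batch "a" is advanced alone for delta_stage stages, then both executors are stepped together, and finally batch "b" drains its remaining stages. The sketch below is a hedged illustration of only that staggering pattern; ToyExecutor, overlap, and the stage names are invented for the example and are not the real _StageExecutor.

class ToyExecutor:
    def __init__(self, name, stages):
        self.name = name
        self.stages = stages
        self.index = 0

    @property
    def num_stages(self):
        return len(self.stages)

    @property
    def done(self):
        return self.index == self.num_stages

    def next(self):
        # Run one stage and advance; the real executor runs a chunk of operations here.
        print(f"{self.name}: {self.stages[self.index]}")
        self.index += 1


def overlap(stages_a, stages_b, delta_stage):
    a = ToyExecutor("a", stages_a)
    b = ToyExecutor("b", stages_b)
    for _ in range(delta_stage):                 # warm up batch a alone
        a.next()
    for _ in range(a.num_stages - delta_stage):  # overlapped region
        a.next()
        b.next()
    for _ in range(delta_stage):                 # drain batch b alone
        b.next()
    assert a.done and b.done


overlap(["attn", "dispatch", "experts", "combine"],
        ["attn", "dispatch", "experts", "combine"],
        delta_stage=2)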
sglang/srt/operations_strategy.py
CHANGED
@@ -1,31 +1,207 @@
+from dataclasses import dataclass
+from typing import List, Optional
+
import torch

+from sglang.srt import operations
+from sglang.srt.layers.moe.ep_moe.token_dispatcher import DeepEPConfig
+from sglang.srt.model_executor.forward_batch_info import ForwardMode
+from sglang.srt.operations import Operation
+
+
+@dataclass
+class OperationsStrategy:
+    operations: List[Operation]
+    deep_gemm_num_sms: Optional[int] = None
+    tbo_delta_stages: Optional[int] = None
+
+    @classmethod
+    def concat(cls, items: List["OperationsStrategy"]) -> "OperationsStrategy":
+        return OperationsStrategy(
+            operations=[x for item in items for x in item.operations],
+            deep_gemm_num_sms=_assert_all_same(
+                [item.deep_gemm_num_sms for item in items]
+            ),
+            tbo_delta_stages=_assert_all_same(
+                [item.tbo_delta_stages for item in items]
+            ),
+        )
+
+    @staticmethod
+    def init_new_tbo(
+        layers: torch.nn.ModuleList,
+        forward_mode: ForwardMode,
+    ) -> "OperationsStrategy":
+        layer_name = layers[0].__class__.__name__
+        if layer_name == "DeepseekV2DecoderLayer":
+            return OperationsStrategy.concat(
+                [
+                    _compute_moe_deepseek_layer_operations_strategy_tbo(
+                        layer, forward_mode
+                    )
+                    for layer in layers
+                ]
+            )
+        elif layer_name == "Qwen3MoeDecoderLayer":
+            return OperationsStrategy.concat(
+                [
+                    _compute_moe_qwen3_layer_operations_strategy_tbo(
+                        layer, forward_mode
+                    )
+                    for layer in layers
+                ]
+            )
+        else:
+            raise NotImplementedError
+
+
+def _assert_all_same(items: List):
+    assert all(item == items[0] for item in items)
+    return items[0]
+
+
+# -------------------------------- Strategy for DeepSeek ---------------------------------------
+
+
+# TODO can refactor to make it more fancy if we have more complex strategies
+def _compute_moe_deepseek_layer_operations_strategy_tbo(
+    layer: torch.nn.Module,
+    forward_mode: ForwardMode,
+) -> OperationsStrategy:
+    assert layer.is_layer_sparse, "dense layer TBO not yet implemented"
+    if forward_mode == ForwardMode.EXTEND:
+        return _compute_moe_deepseek_blog_prefill(layer)
+    elif forward_mode == ForwardMode.DECODE:
+        return _compute_moe_deepseek_blog_decode(layer)
+    else:
+        raise NotImplementedError(f"Unsupported {forward_mode=}")
+
+
+def _compute_moe_deepseek_blog_prefill(layer):
+    device_properties = torch.cuda.get_device_properties(device="cuda")
+    total_num_sms = device_properties.multi_processor_count
+    deep_gemm_num_sms = total_num_sms - DeepEPConfig.get_instance().num_sms
+
+    return OperationsStrategy(
+        deep_gemm_num_sms=deep_gemm_num_sms,
+        tbo_delta_stages=0,
+        operations=[
+            layer.op_comm_prepare_attn,
+            layer.self_attn.op_prepare,
+            layer.self_attn.op_core,
+            layer.op_comm_prepare_mlp,
+            layer.mlp.op_gate,
+            layer.mlp.op_select_experts,
+            layer.mlp.op_dispatch_a,
+            operations.YieldOperation(),
+            layer.mlp.op_dispatch_b,
+            layer.mlp.op_experts,
+            layer.mlp.op_combine_a,
+            operations.YieldOperation(),
+            layer.mlp.op_shared_experts,
+            layer.mlp.op_combine_b,
+            layer.mlp.op_output,
+            layer.op_comm_postprocess_layer,
+        ],
+    )
+
+
+def _compute_moe_deepseek_blog_decode(layer):
+    return OperationsStrategy(
+        deep_gemm_num_sms=None,
+        tbo_delta_stages=2,
+        operations=[
+            layer.op_comm_prepare_attn,
+            layer.self_attn.op_prepare,
+            operations.YieldOperation(),
+            layer.self_attn.op_core,
+            layer.op_comm_prepare_mlp,
+            layer.mlp.op_gate,
+            layer.mlp.op_select_experts,
+            operations.YieldOperation(),
+            layer.mlp.op_dispatch_a,
+            layer.mlp.op_shared_experts,
+            operations.YieldOperation(),
+            layer.mlp.op_dispatch_b,
+            layer.mlp.op_experts,
+            layer.mlp.op_combine_a,
+            operations.YieldOperation(),
+            layer.mlp.op_combine_b,
+            operations.YieldOperation(),
+            layer.mlp.op_output,
+            layer.op_comm_postprocess_layer,
+        ],
+    )
+
+
+# -------------------------------- Strategy for Qwen3 ---------------------------------------
+

-
+# TODO: unstable, current strategy is almost the same as DeepSeek, keep redundant code here for
+# convenience to adjust strategy
+def _compute_moe_qwen3_layer_operations_strategy_tbo(
    layer: torch.nn.Module,
-
-
-
+    forward_mode: ForwardMode,
+) -> OperationsStrategy:
+    assert layer.is_layer_sparse, "qwen3 moe only support sparse layers"
+    if forward_mode == ForwardMode.EXTEND:
+        return _compute_moe_qwen3_prefill(layer)
+    elif forward_mode == ForwardMode.DECODE:
+        return _compute_moe_qwen3_decode(layer)
+    else:
+        raise NotImplementedError(f"Unsupported {forward_mode=}")
+
+
+def _compute_moe_qwen3_prefill(layer):
+    device_properties = torch.cuda.get_device_properties(device="cuda")
+    total_num_sms = device_properties.multi_processor_count
+    deep_gemm_num_sms = total_num_sms - DeepEPConfig.get_instance().num_sms
+
+    return OperationsStrategy(
+        deep_gemm_num_sms=deep_gemm_num_sms,
+        tbo_delta_stages=0,
+        operations=[
+            layer.op_comm_prepare_attn,
+            layer.self_attn.op_prepare,
+            layer.self_attn.op_core,
+            layer.op_comm_prepare_mlp,
+            layer.mlp.op_gate,
+            layer.mlp.op_select_experts,
+            layer.mlp.op_dispatch_a,
+            operations.YieldOperation(),
+            layer.mlp.op_dispatch_b,
+            layer.mlp.op_experts,
+            layer.mlp.op_combine_a,
+            operations.YieldOperation(),
+            layer.mlp.op_combine_b,
+            layer.mlp.op_output,
+            layer.op_comm_postprocess_layer,
+        ],
+    )
+
+
+def _compute_moe_qwen3_decode(layer):
+    return OperationsStrategy(
+        deep_gemm_num_sms=None,
+        tbo_delta_stages=2,
+        operations=[
            layer.op_comm_prepare_attn,
-            layer.
+            layer.self_attn.op_prepare,
+            operations.YieldOperation(),
+            layer.self_attn.op_core,
            layer.op_comm_prepare_mlp,
-            layer.
+            layer.mlp.op_gate,
+            layer.mlp.op_select_experts,
+            operations.YieldOperation(),
+            layer.mlp.op_dispatch_a,
+            operations.YieldOperation(),
+            layer.mlp.op_dispatch_b,
+            layer.mlp.op_experts,
+            layer.mlp.op_combine_a,
+            operations.YieldOperation(),
+            layer.mlp.op_combine_b,
+            layer.mlp.op_output,
            layer.op_comm_postprocess_layer,
-
-
-
-    return [
-        layer.op_comm_prepare_attn,
-        layer.op_attn,
-        layer.op_comm_prepare_mlp,
-        layer.mlp.op_gate,
-        layer.mlp.op_shared_experts,
-        layer.mlp.op_select_experts,
-        layer.mlp.op_dispatch_a,
-        layer.mlp.op_dispatch_b,
-        layer.mlp.op_experts,
-        layer.mlp.op_combine_a,
-        layer.mlp.op_combine_b,
-        layer.mlp.op_output,
-        layer.op_comm_postprocess_layer,
-    ]
+            operations.YieldOperation(),
+        ],
+    )
sglang/srt/sampling/sampling_batch_info.py
CHANGED
@@ -9,10 +9,12 @@ import torch

import sglang.srt.sampling.penaltylib as penaltylib
from sglang.srt.sampling.custom_logit_processor import CustomLogitProcessor
+from sglang.srt.sampling.sampling_params import TOP_K_ALL

if TYPE_CHECKING:
    from sglang.srt.managers.schedule_batch import ScheduleBatch

+
logger = logging.getLogger(__name__)


@@ -27,6 +29,12 @@ class SamplingBatchInfo:
    # Whether all requests use greedy sampling
    is_all_greedy: bool

+    # Whether any requests use top_p sampling
+    need_top_p_sampling: bool
+
+    # Whether any requests use top_k sampling
+    need_top_k_sampling: bool
+
    # Whether any request needs min_p sampling
    need_min_p_sampling: bool

@@ -133,6 +141,8 @@ class SamplingBatchInfo:
            top_ks=top_ks,
            min_ps=min_ps,
            is_all_greedy=all(r.sampling_params.top_k <= 1 for r in reqs),
+            need_top_p_sampling=any(r.sampling_params.top_p != 1.0 for r in reqs),
+            need_top_k_sampling=any(r.sampling_params.top_k != TOP_K_ALL for r in reqs),
            need_min_p_sampling=any(r.sampling_params.min_p > 0 for r in reqs),
            vocab_size=vocab_size,
            penalizer_orchestrator=penalizer_orchestrator,
@@ -167,7 +177,7 @@ class SamplingBatchInfo:

        # Apply the mask
        for i, grammar in enumerate(self.grammars):
-            if grammar and not grammar.finished:
+            if grammar and not grammar.finished and not grammar.is_terminated():
                grammar.fill_vocab_mask(self.vocab_mask, i)

        # Move the mask to the device if needed
@@ -308,4 +318,6 @@ class SamplingBatchInfo:
            setattr(self, item, torch.cat([self_val, other_val]))

        self.is_all_greedy &= other.is_all_greedy
+        self.need_top_p_sampling |= other.need_top_p_sampling
+        self.need_top_k_sampling |= other.need_top_k_sampling
        self.need_min_p_sampling |= other.need_min_p_sampling
sglang/srt/sampling/sampling_params.py
CHANGED
@@ -16,6 +16,7 @@
from typing import Any, Dict, List, Optional, Union

_SAMPLING_EPS = 1e-6
+TOP_K_ALL = 1 << 30


class SamplingParams:
@@ -84,7 +85,7 @@ class SamplingParams:
            self.temperature = 1.0
            self.top_k = 1
        if self.top_k == -1:
-            self.top_k =
+            self.top_k = TOP_K_ALL  # whole vocabulary

    def verify(self):
        if self.temperature < 0.0:
sglang/srt/server_args.py
CHANGED
@@ -28,6 +28,7 @@ from sglang.srt.utils import (
|
|
28
28
|
configure_ipv6,
|
29
29
|
get_device,
|
30
30
|
get_device_memory_capacity,
|
31
|
+
is_cuda,
|
31
32
|
is_flashinfer_available,
|
32
33
|
is_hip,
|
33
34
|
is_port_available,
|
@@ -60,6 +61,7 @@ class ServerArgs:
|
|
60
61
|
is_embedding: bool = False
|
61
62
|
enable_multimodal: Optional[bool] = None
|
62
63
|
revision: Optional[str] = None
|
64
|
+
impl: str = "auto"
|
63
65
|
|
64
66
|
# Port for the HTTP server
|
65
67
|
host: str = "127.0.0.1"
|
@@ -163,20 +165,24 @@ class ServerArgs:
|
|
163
165
|
enable_tokenizer_batch_encode: bool = False
|
164
166
|
disable_outlines_disk_cache: bool = False
|
165
167
|
disable_custom_all_reduce: bool = False
|
168
|
+
enable_mscclpp: bool = False
|
166
169
|
disable_overlap_schedule: bool = False
|
167
170
|
enable_mixed_chunk: bool = False
|
168
171
|
enable_dp_attention: bool = False
|
169
172
|
enable_dp_lm_head: bool = False
|
173
|
+
enable_two_batch_overlap: bool = False
|
170
174
|
enable_ep_moe: bool = False
|
171
175
|
enable_deepep_moe: bool = False
|
172
176
|
deepep_mode: Optional[Literal["auto", "normal", "low_latency"]] = "auto"
|
173
177
|
ep_num_redundant_experts: int = 0
|
174
|
-
ep_dispatch_algorithm: Optional[Literal["static", "dynamic"]] = None
|
178
|
+
ep_dispatch_algorithm: Optional[Literal["static", "dynamic", "fake"]] = None
|
175
179
|
init_expert_location: str = "trivial"
|
176
180
|
enable_eplb: bool = False
|
181
|
+
eplb_algorithm: str = "auto"
|
177
182
|
eplb_rebalance_num_iterations: int = 1000
|
183
|
+
eplb_rebalance_layers_per_chunk: Optional[int] = None
|
178
184
|
expert_distribution_recorder_mode: Optional[
|
179
|
-
Literal["stat", "per_pass", "per_token"]
|
185
|
+
Literal["stat", "stat_approx", "per_pass", "per_token"]
|
180
186
|
] = None
|
181
187
|
expert_distribution_recorder_buffer_size: Optional[int] = None
|
182
188
|
enable_expert_distribution_metrics: bool = False
|
@@ -203,7 +209,7 @@ class ServerArgs:
|
|
203
209
|
flashinfer_mla_disable_ragged: bool = False
|
204
210
|
warmups: Optional[str] = None
|
205
211
|
moe_dense_tp_size: Optional[int] = None
|
206
|
-
|
212
|
+
disable_shared_experts_fusion: bool = False
|
207
213
|
disable_chunked_prefix_cache: bool = False
|
208
214
|
disable_fast_image_processor: bool = False
|
209
215
|
mm_attention_backend: Optional[str] = None
|
@@ -259,17 +265,28 @@ class ServerArgs:
|
|
259
265
|
self.mem_fraction_static = 0.88
|
260
266
|
else:
|
261
267
|
self.mem_fraction_static = 0.88
|
262
|
-
if gpu_mem is not None and gpu_mem >
|
268
|
+
if gpu_mem is not None and gpu_mem > 180 * 1000 and is_cuda():
|
269
|
+
self.mem_fraction_static = 0.79
|
270
|
+
elif gpu_mem is not None and gpu_mem > 96 * 1024:
|
263
271
|
mem_fraction = self.mem_fraction_static
|
272
|
+
# 15 GB + additional 3GB for cuda graph
|
273
|
+
reserve_mem = 1024 * 18
|
274
|
+
# need reserve more memory for spec cuda graph
|
275
|
+
if self.speculative_algorithm is not None:
|
276
|
+
reserve_mem = 1024 * 20
|
264
277
|
self.mem_fraction_static = min(
|
265
278
|
mem_fraction + 48 * 1024 * (1 - mem_fraction) / gpu_mem,
|
266
|
-
(gpu_mem -
|
267
|
-
/ gpu_mem, # 15 GB + additional 3GB for cuda graph
|
279
|
+
(gpu_mem - reserve_mem) / gpu_mem,
|
268
280
|
)
|
281
|
+
else:
|
282
|
+
if self.speculative_algorithm is not None:
|
283
|
+
self.mem_fraction_static *= 0.95
|
269
284
|
|
270
285
|
# Set chunked prefill size, which depends on the gpu memory capacity
|
271
286
|
if self.chunked_prefill_size is None:
|
272
|
-
if gpu_mem is not None and gpu_mem
|
287
|
+
if gpu_mem is not None and gpu_mem > 180_000:
|
288
|
+
self.chunked_prefill_size = 16384
|
289
|
+
elif gpu_mem is not None and gpu_mem < 25_000:
|
273
290
|
self.chunked_prefill_size = 2048
|
274
291
|
elif self.disaggregation_mode != "null":
|
275
292
|
self.chunked_prefill_size = 16384
|
@@ -309,6 +326,11 @@ class ServerArgs:
|
|
309
326
|
self.sampling_backend = "pytorch"
|
310
327
|
|
311
328
|
# Set kernel backends
|
329
|
+
if self.device == "cpu":
|
330
|
+
if self.attention_backend is None:
|
331
|
+
self.attention_backend = "intel_amx"
|
332
|
+
self.sampling_backend = "pytorch"
|
333
|
+
|
312
334
|
if self.sampling_backend is None:
|
313
335
|
self.sampling_backend = (
|
314
336
|
"flashinfer" if is_flashinfer_available() else "pytorch"
|
@@ -365,12 +387,28 @@ class ServerArgs:
                 "Pipeline parallelism is incompatible with overlap schedule."
             )

+        if self.enable_eplb and (self.expert_distribution_recorder_mode is None):
+            self.expert_distribution_recorder_mode = "stat"
+            logger.info(
+                f"EPLB is enabled. The expert_distribution_recorder_mode is automatically set."
+            )
+
+        if (self.enable_eplb or (self.init_expert_location is not None)) and (
+            self.ep_dispatch_algorithm is None
+        ):
+            self.ep_dispatch_algorithm = "static"
+            logger.info(
+                f"EPLB is enabled or init_expert_location is provided. ep_dispatch_algorithm is configured."
+            )
+
+        if self.enable_expert_distribution_metrics and (
+            self.expert_distribution_recorder_mode is None
+        ):
+            self.expert_distribution_recorder_mode = "stat"
+
         if self.expert_distribution_recorder_buffer_size is None:
-
-
-            # self.expert_distribution_recorder_buffer_size = x
-            if False:
-                pass
+            if (x := self.eplb_rebalance_num_iterations) is not None:
+                self.expert_distribution_recorder_buffer_size = x
             elif self.expert_distribution_recorder_mode is not None:
                 self.expert_distribution_recorder_buffer_size = 1000

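Read together, the additions above mean: enabling EPLB implies "stat" expert-distribution recording and a "static" dispatch algorithm, and the recorder buffer is sized from eplb_rebalance_num_iterations, falling back to 1000 when only a recorder mode is set. A hedged, standalone sketch of that defaulting chain (illustrative, not the library code):

# Standalone sketch of the EPLB-related defaulting above (illustrative, not sglang code).
def default_eplb_settings(enable_eplb, init_expert_location=None,
                          recorder_mode=None, dispatch_algorithm=None,
                          rebalance_num_iterations=None, buffer_size=None,
                          metrics=False):
    if enable_eplb and recorder_mode is None:
        recorder_mode = "stat"
    if (enable_eplb or init_expert_location is not None) and dispatch_algorithm is None:
        dispatch_algorithm = "static"
    if metrics and recorder_mode is None:
        recorder_mode = "stat"
    if buffer_size is None:
        if (x := rebalance_num_iterations) is not None:   # same walrus pattern as the diff
            buffer_size = x
        elif recorder_mode is not None:
            buffer_size = 1000
    return recorder_mode, dispatch_algorithm, buffer_size

assert default_eplb_settings(True, rebalance_num_iterations=500) == ("stat", "static", 500)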
@@ -387,6 +425,12 @@ class ServerArgs:
                 "Overlap scheduler is disabled because of using "
                 "eagle speculative decoding."
             )
+            if self.enable_mixed_chunk:
+                self.enable_mixed_chunk = False
+                logger.warning(
+                    "Mixed chunked prefill is disabled because of using "
+                    "eagle speculative decoding."
+                )

         model_arch = get_model_arch(self)

@@ -409,7 +453,7 @@ class ServerArgs:
                self.speculative_num_steps,
                self.speculative_eagle_topk,
                self.speculative_num_draft_tokens,
-            ) = auto_choose_speculative_params(
+            ) = auto_choose_speculative_params(self)

            if self.page_size > 1 and self.speculative_eagle_topk > 1:
                self.speculative_eagle_topk = 1
@@ -691,6 +735,18 @@ class ServerArgs:
             default=ServerArgs.page_size,
             help="The number of tokens in a page.",
         )
+        parser.add_argument(
+            "--impl",
+            type=str,
+            default=ServerArgs.impl,
+            help="Which implementation of the model to use.\n\n"
+            '* "auto" will try to use the SGLang implementation if it exists '
+            "and fall back to the Transformers implementation if no SGLang "
+            "implementation is available.\n"
+            '* "sglang" will use the SGLang model implementation.\n'
+            '* "transformers" will use the Transformers model '
+            "implementation.\n",
+        )

         # Other runtime options
         parser.add_argument(
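Because --impl defaults to ServerArgs.impl, the same switch should be reachable wherever ServerArgs fields are accepted as keyword arguments. The sketch below assumes the offline sglang.Engine entrypoint forwards its kwargs into ServerArgs and uses a small public checkpoint purely as an example; neither is asserted by this diff.

# Hedged usage sketch (assumptions: Engine forwards **kwargs to ServerArgs and
# the model id below is reachable; both are assumptions, not facts from this diff).
import sglang as sgl

llm = sgl.Engine(
    model_path="Qwen/Qwen2.5-0.5B-Instruct",
    impl="transformers",   # force the Transformers fallback instead of "auto"
)
print(llm.generate("Hello, my name is", {"max_new_tokens": 8}))
llm.shutdown()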
@@ -957,12 +1013,13 @@ class ServerArgs:
             type=str,
             choices=[
                 "aiter",
-                "
-                "triton",
-                "torch_native",
+                "cutlass_mla",
                 "fa3",
+                "flashinfer",
                 "flashmla",
-                "
+                "intel_amx",
+                "torch_native",
+                "triton",
             ],
             default=ServerArgs.attention_backend,
             help="Choose the kernels for attention layers.",
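The backend list is now alphabetized and gains intel_amx; argparse's choices mechanism is what rejects anything else. A self-contained mirror of just that validation (plain argparse, not the sglang parser itself):

# Standalone mirror of the choices validation above (not the sglang parser).
import argparse

parser = argparse.ArgumentParser()
parser.add_argument(
    "--attention-backend",
    type=str,
    choices=["aiter", "cutlass_mla", "fa3", "flashinfer",
             "flashmla", "intel_amx", "torch_native", "triton"],
    default=None,
)
print(parser.parse_args(["--attention-backend", "intel_amx"]).attention_backend)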
@@ -1119,6 +1176,11 @@ class ServerArgs:
             action="store_true",
             help="Disable the custom all-reduce kernel and fall back to NCCL.",
         )
+        parser.add_argument(
+            "--enable-mscclpp",
+            action="store_true",
+            help="Enable using mscclpp for small messages for all-reduce kernel and fall back to NCCL.",
+        )
         parser.add_argument(
             "--disable-overlap-schedule",
             action="store_true",
@@ -1144,6 +1206,11 @@ class ServerArgs:
             action="store_true",
             help="Enabling expert parallelism for moe. The ep size is equal to the tp size.",
         )
+        parser.add_argument(
+            "--enable-two-batch-overlap",
+            action="store_true",
+            help="Enabling two micro batches to overlap.",
+        )
         parser.add_argument(
             "--enable-torch-compile",
             action="store_true",
@@ -1295,12 +1362,24 @@ class ServerArgs:
             action="store_true",
             help="Enable EPLB algorithm",
         )
+        parser.add_argument(
+            "--eplb-algorithm",
+            type=str,
+            default=ServerArgs.eplb_algorithm,
+            help="Chosen EPLB algorithm",
+        )
         parser.add_argument(
             "--eplb-rebalance-num-iterations",
             type=int,
             default=ServerArgs.eplb_rebalance_num_iterations,
             help="Number of iterations to automatically trigger a EPLB re-balance.",
         )
+        parser.add_argument(
+            "--eplb-rebalance-layers-per-chunk",
+            type=int,
+            default=ServerArgs.eplb_rebalance_layers_per_chunk,
+            help="Number of layers to rebalance per forward pass.",
+        )
         parser.add_argument(
             "--expert-distribution-recorder-mode",
             type=str,
@@ -1322,15 +1401,12 @@ class ServerArgs:
             "--deepep-config",
             type=str,
             default=ServerArgs.deepep_config,
-            help="Tuned DeepEP config suitable for your own cluster.",
+            help="Tuned DeepEP config suitable for your own cluster. It can be either a string with JSON content or a file path.",
         )
-
         parser.add_argument(
-            "--
-
-
-            help="The number of shared_experts need to be replicated to fuse with normal experts in deepseek v3/r1, "
-            "set it to tp_size can get best optimized performance. Note that for architectures with SM==90, we have enabled the shared experts fusion optimization by default for DeepSeek V3/R1, with n_share_experts_fusion automatically set to the TP size.",
+            "--disable-shared-experts-fusion",
+            action="store_true",
+            help="Disable shared experts fusion optimization for deepseek v3/r1.",
         )
         parser.add_argument(
             "--disable-chunked-prefix-cache",
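The --deepep-config help now states the value may be inline JSON or a file path. How such a value could be normalized is sketched below; the helper and the num_sms key are made up for illustration and are not sglang APIs.

# Illustrative helper (not sglang code): accept inline JSON or a path, per the new help text.
import json
import os

def load_json_or_path(value):
    if value is None:
        return None
    if os.path.isfile(value):
        with open(value) as f:
            return json.load(f)
    return json.loads(value)   # otherwise treat the string itself as JSON content

print(load_json_or_path('{"num_sms": 20}'))   # hypothetical key, for demonstration only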
@@ -1451,7 +1527,7 @@ class ServerArgs:
             self.max_loras_per_batch > 0
             # FIXME
             and (self.lora_paths is None or self.disable_radix_cache)
-        ), "compatibility of lora and
+        ), "compatibility of lora and radix attention is in progress"
         assert self.base_gpu_id >= 0, "base_gpu_id must be non-negative"
         assert self.gpu_id_step >= 1, "gpu_id_step must be positive"
@@ -1585,18 +1661,29 @@ def get_model_arch(args: ServerArgs):
     return hf_config.architectures[0]


-def auto_choose_speculative_params(
+def auto_choose_speculative_params(self: ServerArgs):
     """
     Automatically choose the parameters for speculative decoding.

     You can tune them on your own models and prompts with scripts/playground/bench_speculative.py
     """
+    kwargs = {}
+
+    hf_config = get_config(
+        self.model_path,
+        trust_remote_code=self.trust_remote_code,
+        revision=self.revision,
+        model_override_args=json.loads(self.json_model_override_args),
+        **kwargs,
+    )
+    arch = hf_config.architectures[0]
+
     if arch in ["LlamaForCausalLM"]:
         # The default value for llama
         return (5, 4, 8)
     elif arch in ["DeepseekV3ForCausalLM", "DeepseekV2ForCausalLM"]:
         # The default value for deepseek
-        return (
+        return (3, 1, 4)
     elif arch in ["Grok1ForCausalLM", "Grok1VForCausalLM"]:
         return (5, 4, 8)
     else:
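The rewritten helper takes the full ServerArgs, loads the HF config itself, and returns (speculative_num_steps, speculative_eagle_topk, speculative_num_draft_tokens) keyed on the model architecture. Below is a standalone sketch of just the mapping visible in this hunk; the final else branch is cut off here, so the fallback is left as a parameter rather than guessed.

# Standalone sketch of the per-architecture defaults visible above
# (speculative_num_steps, speculative_eagle_topk, speculative_num_draft_tokens).
SPEC_DEFAULTS = {
    "LlamaForCausalLM": (5, 4, 8),
    "DeepseekV3ForCausalLM": (3, 1, 4),
    "DeepseekV2ForCausalLM": (3, 1, 4),
    "Grok1ForCausalLM": (5, 4, 8),
    "Grok1VForCausalLM": (5, 4, 8),
}

def spec_defaults_for(arch, fallback=None):
    # The real helper reads the architecture from the HF config; here it is a parameter,
    # and `fallback` stands in for the function's else branch (not shown in this hunk).
    return SPEC_DEFAULTS.get(arch, fallback)

steps, topk, draft_tokens = spec_defaults_for("DeepseekV3ForCausalLM")
assert (steps, topk, draft_tokens) == (3, 1, 4)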