sglang 0.4.6.post4__py3-none-any.whl → 0.4.7__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- sglang/bench_offline_throughput.py +16 -10
- sglang/bench_one_batch.py +5 -4
- sglang/bench_one_batch_server.py +86 -22
- sglang/bench_serving.py +197 -110
- sglang/compile_deep_gemm.py +4 -4
- sglang/lang/backend/runtime_endpoint.py +24 -1
- sglang/profiler.py +167 -0
- sglang/srt/_custom_ops.py +34 -0
- sglang/srt/configs/internvl.py +8 -12
- sglang/srt/configs/model_config.py +66 -29
- sglang/srt/constrained/base_grammar_backend.py +5 -2
- sglang/srt/constrained/llguidance_backend.py +9 -8
- sglang/srt/constrained/outlines_backend.py +5 -4
- sglang/srt/constrained/xgrammar_backend.py +18 -18
- sglang/srt/conversation.py +47 -9
- sglang/srt/custom_op.py +38 -3
- sglang/srt/debug_utils.py +74 -0
- sglang/srt/disaggregation/common/__init__.py +1 -0
- sglang/srt/disaggregation/common/conn.py +407 -0
- sglang/srt/disaggregation/decode.py +187 -134
- sglang/srt/disaggregation/decode_schedule_batch_mixin.py +142 -0
- sglang/srt/disaggregation/fake/conn.py +4 -13
- sglang/srt/disaggregation/kv_events.py +412 -0
- sglang/srt/disaggregation/launch_lb.py +140 -0
- sglang/srt/disaggregation/mini_lb.py +84 -70
- sglang/srt/disaggregation/mooncake/conn.py +441 -140
- sglang/srt/disaggregation/mooncake/transfer_engine.py +31 -14
- sglang/srt/disaggregation/nixl/conn.py +124 -442
- sglang/srt/disaggregation/prefill.py +128 -44
- sglang/srt/disaggregation/utils.py +154 -6
- sglang/srt/distributed/device_communicators/pymscclpp.py +315 -0
- sglang/srt/distributed/parallel_state.py +52 -5
- sglang/srt/distributed/utils.py +3 -3
- sglang/srt/entrypoints/EngineBase.py +11 -0
- sglang/srt/entrypoints/engine.py +129 -12
- sglang/srt/entrypoints/http_server.py +21 -6
- sglang/srt/entrypoints/http_server_engine.py +5 -2
- sglang/srt/function_call/base_format_detector.py +302 -0
- sglang/srt/function_call/core_types.py +34 -0
- sglang/srt/function_call/deepseekv3_detector.py +205 -0
- sglang/srt/function_call/ebnf_composer.py +248 -0
- sglang/srt/function_call/function_call_parser.py +202 -0
- sglang/srt/function_call/llama32_detector.py +93 -0
- sglang/srt/function_call/mistral_detector.py +131 -0
- sglang/srt/function_call/pythonic_detector.py +229 -0
- sglang/srt/function_call/qwen25_detector.py +121 -0
- sglang/srt/function_call/utils.py +52 -0
- sglang/srt/hf_transformers_utils.py +50 -7
- sglang/srt/layers/attention/aiter_backend.py +878 -0
- sglang/srt/layers/attention/base_attn_backend.py +4 -0
- sglang/srt/layers/attention/cutlass_mla_backend.py +2 -19
- sglang/srt/layers/attention/flashattention_backend.py +166 -35
- sglang/srt/layers/attention/flashinfer_backend.py +45 -1
- sglang/srt/layers/attention/flashinfer_mla_backend.py +45 -5
- sglang/srt/layers/attention/flashmla_backend.py +340 -78
- sglang/srt/layers/attention/intel_amx_backend.py +128 -0
- sglang/srt/layers/attention/tbo_backend.py +232 -0
- sglang/srt/layers/attention/torch_native_backend.py +3 -0
- sglang/srt/layers/attention/triton_backend.py +247 -5
- sglang/srt/layers/attention/triton_ops/extend_attention.py +12 -4
- sglang/srt/layers/attention/utils.py +2 -2
- sglang/srt/layers/attention/vision.py +1 -1
- sglang/srt/layers/communicator.py +517 -0
- sglang/srt/layers/dp_attention.py +6 -15
- sglang/srt/layers/layernorm.py +30 -19
- sglang/srt/layers/moe/cutlass_moe.py +370 -0
- sglang/srt/layers/moe/cutlass_moe_params.py +169 -0
- sglang/srt/layers/moe/ep_moe/kernels.py +60 -17
- sglang/srt/layers/moe/ep_moe/layer.py +195 -87
- sglang/srt/layers/moe/ep_moe/token_dispatcher.py +88 -8
- sglang/srt/layers/moe/fused_moe_native.py +4 -0
- sglang/srt/layers/moe/fused_moe_triton/configs/triton_3_2_0/E=257,N=128,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8,block_shape=[128, 128].json +146 -0
- sglang/srt/layers/moe/fused_moe_triton/configs/triton_3_2_0/E=257,N=128,device_name=NVIDIA_H20,dtype=fp8_w8a8,block_shape=[128, 128].json +146 -0
- sglang/srt/layers/moe/fused_moe_triton/configs/triton_3_2_0/E=257,N=256,device_name=NVIDIA_B200,dtype=fp8_w8a8,block_shape=[128, 128].json +146 -0
- sglang/srt/layers/moe/fused_moe_triton/configs/triton_3_2_0/E=257,N=256,device_name=NVIDIA_H20,dtype=fp8_w8a8,block_shape=[128, 128].json +146 -0
- sglang/srt/layers/moe/fused_moe_triton/configs/triton_3_2_0/E=257,N=256,device_name=NVIDIA_H200,dtype=fp8_w8a8,block_shape=[128, 128].json +146 -0
- sglang/srt/layers/moe/fused_moe_triton/configs/triton_3_2_0/E=8,N=7168,device_name=NVIDIA_H100_80GB_HBM3.json +146 -0
- sglang/srt/layers/moe/fused_moe_triton/configs/triton_3_3_1/E=257,N=256,device_name=NVIDIA_B200,dtype=fp8_w8a8,block_shape=[128, 128].json +146 -0
- sglang/srt/layers/moe/fused_moe_triton/configs/triton_3_3_1/E=8,N=7168,device_name=NVIDIA_H100_80GB_HBM3.json +146 -0
- sglang/srt/layers/moe/fused_moe_triton/fused_moe.py +220 -25
- sglang/srt/layers/moe/fused_moe_triton/layer.py +48 -4
- sglang/srt/layers/moe/topk.py +107 -24
- sglang/srt/layers/multimodal.py +70 -0
- sglang/srt/layers/quantization/__init__.py +10 -4
- sglang/srt/layers/quantization/blockwise_int8.py +3 -0
- sglang/srt/layers/quantization/compressed_tensors/compressed_tensors_moe.py +5 -0
- sglang/srt/layers/quantization/deep_gemm.py +60 -59
- sglang/srt/layers/quantization/fp8.py +113 -18
- sglang/srt/layers/quantization/fp8_kernel.py +118 -66
- sglang/srt/layers/quantization/fp8_utils.py +165 -43
- sglang/srt/layers/quantization/gptq.py +298 -6
- sglang/srt/layers/quantization/int8_kernel.py +18 -5
- sglang/srt/layers/quantization/modelopt_quant.py +334 -7
- sglang/srt/layers/quantization/moe_wna16.py +3 -0
- sglang/srt/layers/quantization/qoq.py +244 -0
- sglang/srt/layers/quantization/w8a8_fp8.py +3 -0
- sglang/srt/layers/quantization/w8a8_int8.py +3 -0
- sglang/srt/layers/rotary_embedding.py +6 -12
- sglang/srt/layers/sampler.py +80 -79
- sglang/srt/layers/utils.py +6 -0
- sglang/srt/lora/layers.py +12 -15
- sglang/srt/lora/lora.py +49 -5
- sglang/srt/lora/lora_manager.py +20 -8
- sglang/srt/lora/mem_pool.py +24 -16
- sglang/srt/lora/utils.py +17 -13
- sglang/srt/managers/data_parallel_controller.py +13 -5
- sglang/srt/managers/eplb_algorithms/__init__.py +63 -0
- sglang/srt/managers/eplb_algorithms/deepseek.py +223 -0
- sglang/srt/managers/eplb_algorithms/deepseek_vec.py +276 -0
- sglang/srt/managers/eplb_manager.py +96 -0
- sglang/srt/managers/expert_distribution.py +878 -56
- sglang/srt/managers/expert_location.py +448 -0
- sglang/srt/managers/expert_location_dispatch.py +108 -0
- sglang/srt/managers/io_struct.py +29 -5
- sglang/srt/managers/mm_utils.py +355 -151
- sglang/srt/managers/multimodal_processors/base_processor.py +299 -42
- sglang/srt/managers/multimodal_processors/deepseek_vl_v2.py +6 -1
- sglang/srt/managers/multimodal_processors/gemma3.py +15 -17
- sglang/srt/managers/multimodal_processors/internvl.py +18 -5
- sglang/srt/managers/multimodal_processors/janus_pro.py +7 -1
- sglang/srt/managers/multimodal_processors/kimi_vl.py +14 -32
- sglang/srt/managers/multimodal_processors/llava.py +3 -3
- sglang/srt/managers/multimodal_processors/minicpm.py +27 -32
- sglang/srt/managers/multimodal_processors/mllama4.py +6 -0
- sglang/srt/managers/multimodal_processors/phi4mm.py +87 -0
- sglang/srt/managers/multimodal_processors/pixtral.py +9 -9
- sglang/srt/managers/multimodal_processors/qwen_vl.py +35 -35
- sglang/srt/managers/schedule_batch.py +185 -55
- sglang/srt/managers/schedule_policy.py +4 -5
- sglang/srt/managers/scheduler.py +389 -154
- sglang/srt/managers/session_controller.py +1 -1
- sglang/srt/managers/tokenizer_manager.py +231 -39
- sglang/srt/managers/utils.py +0 -4
- sglang/srt/mem_cache/base_prefix_cache.py +3 -0
- sglang/srt/mem_cache/chunk_cache.py +3 -1
- sglang/srt/mem_cache/hiradix_cache.py +4 -4
- sglang/srt/mem_cache/memory_pool.py +74 -52
- sglang/srt/mem_cache/multimodal_cache.py +45 -0
- sglang/srt/mem_cache/radix_cache.py +58 -5
- sglang/srt/metrics/collector.py +11 -2
- sglang/srt/mm_utils.py +10 -0
- sglang/srt/model_executor/cuda_graph_runner.py +87 -65
- sglang/srt/model_executor/expert_location_updater.py +557 -0
- sglang/srt/model_executor/forward_batch_info.py +39 -14
- sglang/srt/model_executor/model_runner.py +231 -101
- sglang/srt/model_loader/loader.py +10 -6
- sglang/srt/model_loader/utils.py +67 -1
- sglang/srt/models/clip.py +5 -1
- sglang/srt/models/deepseek_nextn.py +1 -1
- sglang/srt/models/deepseek_v2.py +732 -403
- sglang/srt/models/exaone.py +8 -3
- sglang/srt/models/gemma3_causal.py +7 -0
- sglang/srt/models/gemma3_mm.py +75 -33
- sglang/srt/models/idefics2.py +342 -0
- sglang/srt/models/kimi_vl.py +4 -4
- sglang/srt/models/llama.py +1 -1
- sglang/srt/models/llama4.py +10 -2
- sglang/srt/models/llava.py +26 -18
- sglang/srt/models/mimo_mtp.py +220 -0
- sglang/srt/models/minicpmo.py +7 -17
- sglang/srt/models/minicpmv.py +3 -295
- sglang/srt/models/mistral.py +71 -1
- sglang/srt/models/mllama.py +3 -3
- sglang/srt/models/phi4mm.py +512 -0
- sglang/srt/models/qwen2.py +133 -35
- sglang/srt/models/qwen2_5_vl.py +5 -3
- sglang/srt/models/qwen2_eagle.py +4 -1
- sglang/srt/models/qwen2_moe.py +206 -69
- sglang/srt/models/qwen2_vl.py +3 -3
- sglang/srt/models/qwen3.py +92 -19
- sglang/srt/models/qwen3_moe.py +457 -55
- sglang/srt/models/registry.py +9 -1
- sglang/srt/models/siglip.py +294 -0
- sglang/srt/models/transformers.py +291 -0
- sglang/srt/openai_api/adapter.py +114 -40
- sglang/srt/openai_api/protocol.py +37 -2
- sglang/srt/openai_api/utils.py +172 -0
- sglang/srt/operations.py +189 -0
- sglang/srt/operations_strategy.py +207 -0
- sglang/srt/sampling/sampling_batch_info.py +13 -1
- sglang/srt/sampling/sampling_params.py +2 -1
- sglang/srt/server_args.py +235 -38
- sglang/srt/speculative/build_eagle_tree.py +8 -8
- sglang/srt/speculative/eagle_draft_cuda_graph_runner.py +8 -11
- sglang/srt/speculative/eagle_draft_extend_cuda_graph_runner.py +253 -0
- sglang/srt/speculative/eagle_utils.py +181 -90
- sglang/srt/speculative/eagle_worker.py +146 -21
- sglang/srt/two_batch_overlap.py +635 -0
- sglang/srt/utils.py +197 -19
- sglang/test/runners.py +16 -7
- sglang/test/send_one.py +4 -0
- sglang/test/test_cutlass_moe.py +278 -0
- sglang/test/test_fp4_moe.py +248 -0
- sglang/test/test_utils.py +81 -42
- sglang/utils.py +2 -2
- sglang/version.py +1 -1
- {sglang-0.4.6.post4.dist-info → sglang-0.4.7.dist-info}/METADATA +31 -19
- sglang-0.4.7.dist-info/RECORD +699 -0
- {sglang-0.4.6.post4.dist-info → sglang-0.4.7.dist-info}/WHEEL +1 -1
- sglang/srt/function_call_parser.py +0 -858
- sglang/srt/platforms/interface.py +0 -371
- sglang-0.4.6.post4.dist-info/RECORD +0 -646
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=1,N=14336,device_name=NVIDIA_A100-SXM4-80GB,dtype=int8_w8a16.json → triton_3_1_0/E=1,N=14336,device_name=NVIDIA_A100-SXM4-80GB,dtype=int8_w8a16.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=1,N=14336,device_name=NVIDIA_A100-SXM4-80GB.json → triton_3_1_0/E=1,N=14336,device_name=NVIDIA_A100-SXM4-80GB.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=1,N=1792,device_name=NVIDIA_A100-SXM4-80GB,dtype=int8_w8a16.json → triton_3_1_0/E=1,N=1792,device_name=NVIDIA_A100-SXM4-80GB,dtype=int8_w8a16.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=1,N=1792,device_name=NVIDIA_A100-SXM4-80GB.json → triton_3_1_0/E=1,N=1792,device_name=NVIDIA_A100-SXM4-80GB.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=1,N=3072,device_name=NVIDIA_A100-SXM4-80GB,dtype=int8_w8a16.json → triton_3_1_0/E=1,N=3072,device_name=NVIDIA_A100-SXM4-80GB,dtype=int8_w8a16.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=1,N=3072,device_name=NVIDIA_H100_80GB_HBM3,dtype=int8_w8a16.json → triton_3_1_0/E=1,N=3072,device_name=NVIDIA_H100_80GB_HBM3,dtype=int8_w8a16.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=1,N=3072,device_name=NVIDIA_H100_80GB_HBM3.json → triton_3_1_0/E=1,N=3072,device_name=NVIDIA_H100_80GB_HBM3.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=1,N=3584,device_name=NVIDIA_A100-SXM4-80GB,dtype=int8_w8a16.json → triton_3_1_0/E=1,N=3584,device_name=NVIDIA_A100-SXM4-80GB,dtype=int8_w8a16.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=1,N=3584,device_name=NVIDIA_A100-SXM4-80GB.json → triton_3_1_0/E=1,N=3584,device_name=NVIDIA_A100-SXM4-80GB.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=1,N=7168,device_name=NVIDIA_A100-SXM4-80GB,dtype=int8_w8a16.json → triton_3_1_0/E=1,N=7168,device_name=NVIDIA_A100-SXM4-80GB,dtype=int8_w8a16.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=1,N=7168,device_name=NVIDIA_A100-SXM4-80GB.json → triton_3_1_0/E=1,N=7168,device_name=NVIDIA_A100-SXM4-80GB.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=144,N=512,device_name=NVIDIA_H100_80GB_HBM3.json → triton_3_1_0/E=144,N=512,device_name=NVIDIA_H100_80GB_HBM3.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=16,N=1024,device_name=NVIDIA_H100_80GB_HBM3.json → triton_3_1_0/E=16,N=1024,device_name=NVIDIA_H100_80GB_HBM3.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=16,N=1024,device_name=NVIDIA_H200.json → triton_3_1_0/E=16,N=1024,device_name=NVIDIA_H200.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=16,N=1344,device_name=NVIDIA_A100-SXM4-40GB.json → triton_3_1_0/E=16,N=1344,device_name=NVIDIA_A100-SXM4-40GB.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=16,N=1344,device_name=NVIDIA_A100-SXM4-80GB.json → triton_3_1_0/E=16,N=1344,device_name=NVIDIA_A100-SXM4-80GB.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=16,N=1344,device_name=NVIDIA_H100_80GB_HBM3.json → triton_3_1_0/E=16,N=1344,device_name=NVIDIA_H100_80GB_HBM3.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=16,N=14336,device_name=NVIDIA_A100-SXM4-80GB,dtype=int8_w8a16.json → triton_3_1_0/E=16,N=14336,device_name=NVIDIA_A100-SXM4-80GB,dtype=int8_w8a16.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=16,N=14336,device_name=NVIDIA_A100-SXM4-80GB.json → triton_3_1_0/E=16,N=14336,device_name=NVIDIA_A100-SXM4-80GB.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=16,N=1792,device_name=NVIDIA_A100-SXM4-80GB,dtype=int8_w8a16.json → triton_3_1_0/E=16,N=1792,device_name=NVIDIA_A100-SXM4-80GB,dtype=int8_w8a16.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=16,N=1792,device_name=NVIDIA_A100-SXM4-80GB.json → triton_3_1_0/E=16,N=1792,device_name=NVIDIA_A100-SXM4-80GB.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=16,N=2048,device_name=NVIDIA_H100_80GB_HBM3.json → triton_3_1_0/E=16,N=2048,device_name=NVIDIA_H100_80GB_HBM3.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=16,N=2688,device_name=NVIDIA_A100-SXM4-80GB.json → triton_3_1_0/E=16,N=2688,device_name=NVIDIA_A100-SXM4-80GB.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=16,N=2688,device_name=NVIDIA_H100_80GB_HBM3.json → triton_3_1_0/E=16,N=2688,device_name=NVIDIA_H100_80GB_HBM3.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=16,N=3072,device_name=NVIDIA_A100-SXM4-80GB,dtype=int8_w8a16.json → triton_3_1_0/E=16,N=3072,device_name=NVIDIA_A100-SXM4-80GB,dtype=int8_w8a16.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=16,N=3072,device_name=NVIDIA_H100_80GB_HBM3,dtype=int8_w8a16.json → triton_3_1_0/E=16,N=3072,device_name=NVIDIA_H100_80GB_HBM3,dtype=int8_w8a16.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=16,N=3200,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8.json → triton_3_1_0/E=16,N=3200,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=16,N=3584,device_name=NVIDIA_A100-SXM4-80GB,dtype=int8_w8a16.json → triton_3_1_0/E=16,N=3584,device_name=NVIDIA_A100-SXM4-80GB,dtype=int8_w8a16.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=16,N=3584,device_name=NVIDIA_A100-SXM4-80GB.json → triton_3_1_0/E=16,N=3584,device_name=NVIDIA_A100-SXM4-80GB.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=16,N=6400,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8.json → triton_3_1_0/E=16,N=6400,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=16,N=7168,device_name=NVIDIA_A100-SXM4-80GB,dtype=int8_w8a16.json → triton_3_1_0/E=16,N=7168,device_name=NVIDIA_A100-SXM4-80GB,dtype=int8_w8a16.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=16,N=7168,device_name=NVIDIA_A100-SXM4-80GB.json → triton_3_1_0/E=16,N=7168,device_name=NVIDIA_A100-SXM4-80GB.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=16,N=7168,device_name=NVIDIA_H100_80GB_HBM3,dtype=int8_w8a16.json → triton_3_1_0/E=16,N=7168,device_name=NVIDIA_H100_80GB_HBM3,dtype=int8_w8a16.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=16,N=800,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8.json → triton_3_1_0/E=16,N=800,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=160,N=192,device_name=NVIDIA_A800-SXM4-80GB.json → triton_3_1_0/E=160,N=192,device_name=NVIDIA_A800-SXM4-80GB.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=20,N=2048,device_name=NVIDIA_H100_80GB_HBM3.json → triton_3_1_0/E=20,N=2048,device_name=NVIDIA_H100_80GB_HBM3.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=24,N=1024,device_name=NVIDIA_H100_80GB_HBM3.json → triton_3_1_0/E=24,N=1024,device_name=NVIDIA_H100_80GB_HBM3.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=256,N=128,device_name=NVIDIA_A100-SXM4-80GB,dtype=int8_w8a8,block_shape=[128, 128].json → triton_3_1_0/E=256,N=128,device_name=NVIDIA_A100-SXM4-80GB,dtype=int8_w8a8,block_shape=[128, 128].json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=256,N=128,device_name=NVIDIA_A100-SXM4-80GB,dtype=int8_w8a8.json → triton_3_1_0/E=256,N=128,device_name=NVIDIA_A100-SXM4-80GB,dtype=int8_w8a8.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=256,N=128,device_name=NVIDIA_A800-SXM4-80GB,dtype=int8_w8a8,block_shape=[128, 128].json → triton_3_1_0/E=256,N=128,device_name=NVIDIA_A800-SXM4-80GB,dtype=int8_w8a8,block_shape=[128, 128].json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=256,N=128,device_name=NVIDIA_A800-SXM4-80GB,dtype=int8_w8a8.json → triton_3_1_0/E=256,N=128,device_name=NVIDIA_A800-SXM4-80GB,dtype=int8_w8a8.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=256,N=128,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8,block_shape=[128, 128].json → triton_3_1_0/E=256,N=128,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8,block_shape=[128, 128].json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=256,N=128,device_name=NVIDIA_H20,block_shape=[128, 128].json → triton_3_1_0/E=256,N=128,device_name=NVIDIA_H20,block_shape=[128, 128].json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=256,N=128,device_name=NVIDIA_H20,dtype=fp8_w8a8,block_shape=[128, 128].json → triton_3_1_0/E=256,N=128,device_name=NVIDIA_H20,dtype=fp8_w8a8,block_shape=[128, 128].json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=256,N=128,device_name=NVIDIA_L20Y,dtype=fp8_w8a8,block_shape=[128, 128].json → triton_3_1_0/E=256,N=128,device_name=NVIDIA_L20Y,dtype=fp8_w8a8,block_shape=[128, 128].json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=256,N=256,device_name=AMD_Instinct_MI300X,dtype=fp8_w8a8,block_shape=[128, 128].json → triton_3_1_0/E=256,N=256,device_name=AMD_Instinct_MI300X,dtype=fp8_w8a8,block_shape=[128, 128].json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=256,N=256,device_name=AMD_Instinct_MI325X,dtype=fp8_w8a8,block_shape=[128, 128].json → triton_3_1_0/E=256,N=256,device_name=AMD_Instinct_MI325X,dtype=fp8_w8a8,block_shape=[128, 128].json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=256,N=256,device_name=AMD_Radeon_Graphics,dtype=fp8_w8a8,block_shape=[128, 128].json → triton_3_1_0/E=256,N=256,device_name=AMD_Radeon_Graphics,dtype=fp8_w8a8,block_shape=[128, 128].json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=256,N=256,device_name=NVIDIA_B200,dtype=fp8_w8a8,block_shape=[128, 128].json → triton_3_1_0/E=256,N=256,device_name=NVIDIA_B200,dtype=fp8_w8a8,block_shape=[128, 128].json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=256,N=256,device_name=NVIDIA_H20,dtype=int8_w8a8,block_shape=[128, 128].json → triton_3_1_0/E=256,N=256,device_name=NVIDIA_H20,dtype=int8_w8a8,block_shape=[128, 128].json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=256,N=256,device_name=NVIDIA_H200,dtype=fp8_w8a8,block_shape=[128, 128].json → triton_3_1_0/E=256,N=256,device_name=NVIDIA_H200,dtype=fp8_w8a8,block_shape=[128, 128].json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=256,N=64,device_name=NVIDIA_A800-SXM4-80GB.json → triton_3_1_0/E=256,N=64,device_name=NVIDIA_A800-SXM4-80GB.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=256,N=64,device_name=NVIDIA_L20,dtype=int8_w8a8.json → triton_3_1_0/E=256,N=64,device_name=NVIDIA_L20,dtype=int8_w8a8.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=256,N=64,device_name=NVIDIA_L40S,dtype=int8_w8a8.json → triton_3_1_0/E=256,N=64,device_name=NVIDIA_L40S,dtype=int8_w8a8.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=64,N=1024,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8,block_shape=[128, 128].json → triton_3_1_0/E=64,N=1024,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8,block_shape=[128, 128].json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=64,N=1280,device_name=NVIDIA_A100-SXM4-80GB.json → triton_3_1_0/E=64,N=1280,device_name=NVIDIA_A100-SXM4-80GB.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=64,N=1280,device_name=NVIDIA_A800-SXM4-80GB.json → triton_3_1_0/E=64,N=1280,device_name=NVIDIA_A800-SXM4-80GB.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=64,N=1280,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8.json → triton_3_1_0/E=64,N=1280,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=64,N=1280,device_name=NVIDIA_H100_80GB_HBM3.json → triton_3_1_0/E=64,N=1280,device_name=NVIDIA_H100_80GB_HBM3.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=64,N=1280,device_name=NVIDIA_H200,dtype=fp8_w8a8.json → triton_3_1_0/E=64,N=1280,device_name=NVIDIA_H200,dtype=fp8_w8a8.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=64,N=1280,device_name=NVIDIA_H200.json → triton_3_1_0/E=64,N=1280,device_name=NVIDIA_H200.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=64,N=2560,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8.json → triton_3_1_0/E=64,N=2560,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=64,N=2560,device_name=NVIDIA_H200,dtype=fp8_w8a8.json → triton_3_1_0/E=64,N=2560,device_name=NVIDIA_H200,dtype=fp8_w8a8.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=64,N=2560,device_name=NVIDIA_H200.json → triton_3_1_0/E=64,N=2560,device_name=NVIDIA_H200.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=64,N=320,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8.json → triton_3_1_0/E=64,N=320,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=64,N=320,device_name=NVIDIA_H100_80GB_HBM3.json → triton_3_1_0/E=64,N=320,device_name=NVIDIA_H100_80GB_HBM3.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=64,N=320,device_name=NVIDIA_H200,dtype=fp8_w8a8.json → triton_3_1_0/E=64,N=320,device_name=NVIDIA_H200,dtype=fp8_w8a8.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=64,N=320,device_name=NVIDIA_H200.json → triton_3_1_0/E=64,N=320,device_name=NVIDIA_H200.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=64,N=512,device_name=NVIDIA_H200,dtype=fp8_w8a8,block_shape=[128, 128].json → triton_3_1_0/E=64,N=512,device_name=NVIDIA_H200,dtype=fp8_w8a8,block_shape=[128, 128].json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=64,N=640,device_name=NVIDIA_A100-SXM4-80GB.json → triton_3_1_0/E=64,N=640,device_name=NVIDIA_A100-SXM4-80GB.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=64,N=640,device_name=NVIDIA_A800-SXM4-80GB.json → triton_3_1_0/E=64,N=640,device_name=NVIDIA_A800-SXM4-80GB.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=64,N=640,device_name=NVIDIA_GeForce_RTX_4090,dtype=fp8_w8a8.json → triton_3_1_0/E=64,N=640,device_name=NVIDIA_GeForce_RTX_4090,dtype=fp8_w8a8.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=64,N=640,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8.json → triton_3_1_0/E=64,N=640,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=64,N=640,device_name=NVIDIA_H100_80GB_HBM3.json → triton_3_1_0/E=64,N=640,device_name=NVIDIA_H100_80GB_HBM3.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=64,N=640,device_name=NVIDIA_H200,dtype=fp8_w8a8.json → triton_3_1_0/E=64,N=640,device_name=NVIDIA_H200,dtype=fp8_w8a8.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=64,N=640,device_name=NVIDIA_H200.json → triton_3_1_0/E=64,N=640,device_name=NVIDIA_H200.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=8,N=14336,device_name=AMD_Instinct_MI300X.json → triton_3_1_0/E=8,N=14336,device_name=AMD_Instinct_MI300X.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=8,N=14336,device_name=AMD_Instinct_MI325X.json → triton_3_1_0/E=8,N=14336,device_name=AMD_Instinct_MI325X.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=8,N=14336,device_name=AMD_Radeon_Graphics.json → triton_3_1_0/E=8,N=14336,device_name=AMD_Radeon_Graphics.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=8,N=14336,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8.json → triton_3_1_0/E=8,N=14336,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=8,N=14336,device_name=NVIDIA_H200,dtype=fp8_w8a8.json → triton_3_1_0/E=8,N=14336,device_name=NVIDIA_H200,dtype=fp8_w8a8.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=8,N=14336,device_name=NVIDIA_H200.json → triton_3_1_0/E=8,N=14336,device_name=NVIDIA_H200.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=8,N=1792,device_name=AMD_Instinct_MI300X.json → triton_3_1_0/E=8,N=1792,device_name=AMD_Instinct_MI300X.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=8,N=1792,device_name=AMD_Instinct_MI325X.json → triton_3_1_0/E=8,N=1792,device_name=AMD_Instinct_MI325X.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=8,N=1792,device_name=AMD_Radeon_Graphics.json → triton_3_1_0/E=8,N=1792,device_name=AMD_Radeon_Graphics.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=8,N=1792,device_name=NVIDIA_A100-SXM4-40GB.json → triton_3_1_0/E=8,N=1792,device_name=NVIDIA_A100-SXM4-40GB.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=8,N=1792,device_name=NVIDIA_A100-SXM4-80GB.json → triton_3_1_0/E=8,N=1792,device_name=NVIDIA_A100-SXM4-80GB.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=8,N=1792,device_name=NVIDIA_H100_80GB_HBM3.json → triton_3_1_0/E=8,N=1792,device_name=NVIDIA_H100_80GB_HBM3.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=8,N=1792,device_name=NVIDIA_H200,dtype=fp8_w8a8.json → triton_3_1_0/E=8,N=1792,device_name=NVIDIA_H200,dtype=fp8_w8a8.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=8,N=1792,device_name=NVIDIA_H200.json → triton_3_1_0/E=8,N=1792,device_name=NVIDIA_H200.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=8,N=2048,device_name=NVIDIA_A100-SXM4-80GB.json → triton_3_1_0/E=8,N=2048,device_name=NVIDIA_A100-SXM4-80GB.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=8,N=2048,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8.json → triton_3_1_0/E=8,N=2048,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=8,N=2048,device_name=NVIDIA_H100_80GB_HBM3.json → triton_3_1_0/E=8,N=2048,device_name=NVIDIA_H100_80GB_HBM3.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=8,N=2048,device_name=NVIDIA_H200,dtype=fp8_w8a8.json → triton_3_1_0/E=8,N=2048,device_name=NVIDIA_H200,dtype=fp8_w8a8.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=8,N=2048,device_name=NVIDIA_H200.json → triton_3_1_0/E=8,N=2048,device_name=NVIDIA_H200.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=8,N=3584,device_name=AMD_Instinct_MI300X.json → triton_3_1_0/E=8,N=3584,device_name=AMD_Instinct_MI300X.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=8,N=3584,device_name=AMD_Instinct_MI325X.json → triton_3_1_0/E=8,N=3584,device_name=AMD_Instinct_MI325X.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=8,N=3584,device_name=AMD_Radeon_Graphics.json → triton_3_1_0/E=8,N=3584,device_name=AMD_Radeon_Graphics.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=8,N=3584,device_name=NVIDIA_A100-SXM4-40GB.json → triton_3_1_0/E=8,N=3584,device_name=NVIDIA_A100-SXM4-40GB.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=8,N=3584,device_name=NVIDIA_A100-SXM4-80GB.json → triton_3_1_0/E=8,N=3584,device_name=NVIDIA_A100-SXM4-80GB.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=8,N=3584,device_name=NVIDIA_GeForce_RTX_4090,dtype=fp8_w8a8.json → triton_3_1_0/E=8,N=3584,device_name=NVIDIA_GeForce_RTX_4090,dtype=fp8_w8a8.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=8,N=3584,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8.json → triton_3_1_0/E=8,N=3584,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=8,N=3584,device_name=NVIDIA_H100_80GB_HBM3.json → triton_3_1_0/E=8,N=3584,device_name=NVIDIA_H100_80GB_HBM3.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=8,N=3584,device_name=NVIDIA_H200,dtype=fp8_w8a8.json → triton_3_1_0/E=8,N=3584,device_name=NVIDIA_H200,dtype=fp8_w8a8.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=8,N=3584,device_name=NVIDIA_H200.json → triton_3_1_0/E=8,N=3584,device_name=NVIDIA_H200.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=8,N=3584,device_name=NVIDIA_L40S.json → triton_3_1_0/E=8,N=3584,device_name=NVIDIA_L40S.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=8,N=4096,device_name=AMD_Instinct_MI300X,dtype=fp8_w8a8.json → triton_3_1_0/E=8,N=4096,device_name=AMD_Instinct_MI300X,dtype=fp8_w8a8.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=8,N=4096,device_name=AMD_Instinct_MI325X,dtype=fp8_w8a8.json → triton_3_1_0/E=8,N=4096,device_name=AMD_Instinct_MI325X,dtype=fp8_w8a8.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=8,N=4096,device_name=AMD_Radeon_Graphics,dtype=fp8_w8a8.json → triton_3_1_0/E=8,N=4096,device_name=AMD_Radeon_Graphics,dtype=fp8_w8a8.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=8,N=4096,device_name=NVIDIA_A100-SXM4-80GB.json → triton_3_1_0/E=8,N=4096,device_name=NVIDIA_A100-SXM4-80GB.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=8,N=4096,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8.json → triton_3_1_0/E=8,N=4096,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=8,N=4096,device_name=NVIDIA_H100_80GB_HBM3.json → triton_3_1_0/E=8,N=4096,device_name=NVIDIA_H100_80GB_HBM3.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=8,N=4096,device_name=NVIDIA_H200,dtype=fp8_w8a8.json → triton_3_1_0/E=8,N=4096,device_name=NVIDIA_H200,dtype=fp8_w8a8.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=8,N=4096,device_name=NVIDIA_H200.json → triton_3_1_0/E=8,N=4096,device_name=NVIDIA_H200.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=8,N=7168,device_name=AMD_Instinct_MI300X.json → triton_3_1_0/E=8,N=7168,device_name=AMD_Instinct_MI300X.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=8,N=7168,device_name=AMD_Instinct_MI325X.json → triton_3_1_0/E=8,N=7168,device_name=AMD_Instinct_MI325X.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=8,N=7168,device_name=AMD_Radeon_Graphics.json → triton_3_1_0/E=8,N=7168,device_name=AMD_Radeon_Graphics.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=8,N=7168,device_name=NVIDIA_A100-SXM4-80GB.json → triton_3_1_0/E=8,N=7168,device_name=NVIDIA_A100-SXM4-80GB.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=8,N=7168,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8.json → triton_3_1_0/E=8,N=7168,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=8,N=7168,device_name=NVIDIA_H100_80GB_HBM3.json → triton_3_1_0/E=8,N=7168,device_name=NVIDIA_H100_80GB_HBM3.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=8,N=7168,device_name=NVIDIA_H200,dtype=fp8_w8a8.json → triton_3_1_0/E=8,N=7168,device_name=NVIDIA_H200,dtype=fp8_w8a8.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=8,N=7168,device_name=NVIDIA_H200.json → triton_3_1_0/E=8,N=7168,device_name=NVIDIA_H200.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=8,N=8192,device_name=AMD_Instinct_MI300X,dtype=fp8_w8a8.json → triton_3_1_0/E=8,N=8192,device_name=AMD_Instinct_MI300X,dtype=fp8_w8a8.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=8,N=8192,device_name=AMD_Instinct_MI325X,dtype=fp8_w8a8.json → triton_3_1_0/E=8,N=8192,device_name=AMD_Instinct_MI325X,dtype=fp8_w8a8.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=8,N=8192,device_name=AMD_Radeon_Graphics,dtype=fp8_w8a8.json → triton_3_1_0/E=8,N=8192,device_name=AMD_Radeon_Graphics,dtype=fp8_w8a8.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=8,N=8192,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8.json → triton_3_1_0/E=8,N=8192,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=8,N=8192,device_name=NVIDIA_H200,dtype=fp8_w8a8.json → triton_3_1_0/E=8,N=8192,device_name=NVIDIA_H200,dtype=fp8_w8a8.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=128,N=192,device_name=NVIDIA_A800-SXM4-80GB.json → triton_3_2_0/E=128,N=192,device_name=NVIDIA_A800-SXM4-80GB.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=128,N=192,device_name=NVIDIA_H100_80GB_HBM3.json → triton_3_2_0/E=128,N=192,device_name=NVIDIA_H100_80GB_HBM3.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=128,N=192,device_name=NVIDIA_H20.json → triton_3_2_0/E=128,N=192,device_name=NVIDIA_H20.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=128,N=192,device_name=NVIDIA_H200.json → triton_3_2_0/E=128,N=192,device_name=NVIDIA_H200.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=128,N=384,device_name=NVIDIA_H100_80GB_HBM3.json → triton_3_2_0/E=128,N=384,device_name=NVIDIA_H100_80GB_HBM3.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=128,N=384,device_name=NVIDIA_H20,dtype=fp8_w8a8,block_shape=[128, 128].json → triton_3_2_0/E=128,N=384,device_name=NVIDIA_H20,dtype=fp8_w8a8,block_shape=[128, 128].json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=128,N=384,device_name=NVIDIA_H20.json → triton_3_2_0/E=128,N=384,device_name=NVIDIA_H20.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=128,N=384,device_name=NVIDIA_H200,dtype=fp8_w8a8,block_shape=[128, 128].json → triton_3_2_0/E=128,N=384,device_name=NVIDIA_H200,dtype=fp8_w8a8,block_shape=[128, 128].json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=128,N=384,device_name=NVIDIA_H200.json → triton_3_2_0/E=128,N=384,device_name=NVIDIA_H200.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=128,N=512,device_name=NVIDIA_H100_80GB_HBM3.json → triton_3_2_0/E=128,N=512,device_name=NVIDIA_H100_80GB_HBM3.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=128,N=768,device_name=NVIDIA_A800-SXM4-80GB.json → triton_3_2_0/E=128,N=768,device_name=NVIDIA_A800-SXM4-80GB.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=128,N=768,device_name=NVIDIA_H100_80GB_HBM3.json → triton_3_2_0/E=128,N=768,device_name=NVIDIA_H100_80GB_HBM3.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=128,N=768,device_name=NVIDIA_H20,dtype=fp8_w8a8,block_shape=[128, 128].json → triton_3_2_0/E=128,N=768,device_name=NVIDIA_H20,dtype=fp8_w8a8,block_shape=[128, 128].json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=128,N=768,device_name=NVIDIA_H20.json → triton_3_2_0/E=128,N=768,device_name=NVIDIA_H20.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=128,N=768,device_name=NVIDIA_H200,dtype=fp8_w8a8,block_shape=[128, 128].json → triton_3_2_0/E=128,N=768,device_name=NVIDIA_H200,dtype=fp8_w8a8,block_shape=[128, 128].json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=128,N=768,device_name=NVIDIA_H200.json → triton_3_2_0/E=128,N=768,device_name=NVIDIA_H200.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=128,N=96,device_name=NVIDIA_H20.json → triton_3_2_0/E=128,N=96,device_name=NVIDIA_H20.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=264,N=128,device_name=NVIDIA_A800-SXM4-80GB,dtype=int8_w8a8.json → triton_3_2_0/E=264,N=128,device_name=NVIDIA_A800-SXM4-80GB,dtype=int8_w8a8.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=264,N=256,device_name=NVIDIA_B200,dtype=fp8_w8a8,block_shape=[128, 128].json → triton_3_2_0/E=264,N=256,device_name=NVIDIA_B200,dtype=fp8_w8a8,block_shape=[128, 128].json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=264,N=256,device_name=NVIDIA_H20,dtype=fp8_w8a8,block_shape=[128, 128].json → triton_3_2_0/E=264,N=256,device_name=NVIDIA_H20,dtype=fp8_w8a8,block_shape=[128, 128].json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=264,N=256,device_name=NVIDIA_H200,dtype=fp8_w8a8,block_shape=[128, 128].json → triton_3_2_0/E=264,N=256,device_name=NVIDIA_H200,dtype=fp8_w8a8,block_shape=[128, 128].json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=272,N=128,device_name=NVIDIA_A800-SXM4-80GB,dtype=int8_w8a8.json → triton_3_2_0/E=272,N=128,device_name=NVIDIA_A800-SXM4-80GB,dtype=int8_w8a8.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=272,N=128,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8,block_shape=[128, 128].json → triton_3_2_0/E=272,N=128,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8,block_shape=[128, 128].json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=272,N=128,device_name=NVIDIA_H20,dtype=fp8_w8a8,block_shape=[128, 128].json → triton_3_2_0/E=272,N=128,device_name=NVIDIA_H20,dtype=fp8_w8a8,block_shape=[128, 128].json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=272,N=64,device_name=NVIDIA_A800-SXM4-80GB.json → triton_3_2_0/E=272,N=64,device_name=NVIDIA_A800-SXM4-80GB.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=288,N=64,device_name=NVIDIA_A800-SXM4-80GB.json → triton_3_2_0/E=288,N=64,device_name=NVIDIA_A800-SXM4-80GB.json} +0 -0
- /sglang/srt/models/{xiaomi_mimo.py → mimo.py} +0 -0
- {sglang-0.4.6.post4.dist-info → sglang-0.4.7.dist-info}/licenses/LICENSE +0 -0
- {sglang-0.4.6.post4.dist-info → sglang-0.4.7.dist-info}/top_level.txt +0 -0
@@ -0,0 +1,63 @@
|
|
1
|
+
from enum import Enum, auto
|
2
|
+
from typing import Optional
|
3
|
+
|
4
|
+
import torch
|
5
|
+
|
6
|
+
from sglang.srt.managers.eplb_algorithms import deepseek, deepseek_vec
|
7
|
+
|
8
|
+
|
9
|
+
class EplbAlgorithm(Enum):
    """Expert-parallelism load-balancing (EPLB) algorithm selector.

    The `*_hierarchical` variants additionally respect expert-group / node
    boundaries when placing replicas; the `deepseek_vec` variants operate on
    per-step token counts instead of a single aggregated load vector.
    """

    deepseek = auto()
    deepseek_hierarchical = auto()
    deepseek_vec = auto()
    deepseek_vec_hierarchical = auto()
    # TODO may have more algorithm later
15
|
+
|
16
|
+
|
17
|
+
def rebalance_experts(
    tokens_per_expert: torch.Tensor,
    num_physical_experts: int,
    num_local_physical_experts: int,
    num_groups: Optional[int],
    num_nodes: int,
    algorithm: EplbAlgorithm,
):
    """Dispatch to the rebalancing backend selected by `algorithm`.

    Parameters:
        tokens_per_expert: per-step, per-layer token counts for each logical expert
        num_physical_experts: total physical experts after replication
        num_local_physical_experts: physical experts hosted on one GPU
        num_groups: number of expert groups (may be None for ungrouped models)
        num_nodes: number of server nodes
        algorithm: which EPLB algorithm to run

    Raises:
        NotImplementedError: if `algorithm` is not a known variant.
    """
    num_gpus = num_physical_experts // num_local_physical_experts

    if algorithm in (EplbAlgorithm.deepseek, EplbAlgorithm.deepseek_hierarchical):
        # The deepseek backend consumes the load summed over recording steps.
        return deepseek.rebalance_experts(
            weight=tokens_per_expert.sum(dim=0),
            num_replicas=num_physical_experts,
            num_groups=num_groups,
            num_nodes=num_nodes,
            num_gpus=num_gpus,
            enable_hierarchical=algorithm == EplbAlgorithm.deepseek_hierarchical,
        )

    if algorithm in (
        EplbAlgorithm.deepseek_vec,
        EplbAlgorithm.deepseek_vec_hierarchical,
    ):
        # The vec backend keeps the per-step dimension of the statistics.
        return deepseek_vec.rebalance_experts(
            tokens_per_expert=tokens_per_expert,
            num_physical_experts=num_physical_experts,
            num_local_physical_experts=num_local_physical_experts,
            num_groups=num_groups,
            num_nodes=num_nodes,
            enable_hierarchical=algorithm == EplbAlgorithm.deepseek_vec_hierarchical,
        )

    raise NotImplementedError
49
|
+
|
50
|
+
|
51
|
+
def compute_algorithm(
    raw_algorithm: str,
    num_groups: Optional[int],
    num_nodes: int,
) -> EplbAlgorithm:
    """Resolve a user-supplied algorithm name into a concrete EplbAlgorithm.

    "auto" picks between the deepseek variants based on whether the model's
    expert groups divide evenly across nodes; any other string is looked up
    as an enum member name (raising KeyError if unknown).
    """
    if raw_algorithm != "auto":
        # Explicit request: look up the enum member by name.
        return EplbAlgorithm[raw_algorithm]

    # TODO test on real scenarios and know which ones perform better
    can_use_hierarchical = num_groups is not None and num_groups % num_nodes == 0
    return (
        EplbAlgorithm.deepseek_hierarchical
        if can_use_hierarchical
        else EplbAlgorithm.deepseek
    )
|
@@ -0,0 +1,223 @@
|
|
1
|
+
# This file is copied from https://github.com/deepseek-ai/EPLB/blob/main/eplb.py since that one is not a pypi package
|
2
|
+
from typing import Tuple
|
3
|
+
|
4
|
+
import torch
|
5
|
+
|
6
|
+
from sglang.srt.utils import get_bool_env_var
|
7
|
+
|
8
|
+
|
9
|
+
def balanced_packing(
    weight: torch.Tensor, num_packs: int
) -> Tuple[torch.Tensor, torch.Tensor]:
    """
    Distribute n weighted items into m packs so that every pack holds exactly
    n/m items while the total weights of the packs stay as even as possible.

    Parameters:
        weight: [X, n], per-item weights
        num_packs: number of packs m (must divide n)

    Returns:
        pack_index: [X, n], destination pack of each item
        rank_in_pack: [X, n], position of each item inside its pack
    """
    num_layers, num_groups = weight.shape
    assert num_groups % num_packs == 0
    groups_per_pack = num_groups // num_packs

    if groups_per_pack == 1:
        # One item per pack: the assignment is simply the identity mapping.
        identity = torch.arange(
            weight.size(-1), dtype=torch.int64, device=weight.device
        )
        return identity.expand(weight.shape), torch.zeros_like(
            weight, dtype=torch.int64
        )

    # Greedy heuristic: visit items heaviest-first and drop each into the
    # lightest pack that still has room.
    order = weight.float().sort(-1, descending=True).indices.cpu()
    pack_index = torch.full_like(weight, fill_value=-1, dtype=torch.int64, device="cpu")
    rank_in_pack = torch.full_like(pack_index, fill_value=-1)
    for layer in range(num_layers):
        load = [0] * num_packs
        fill = [0] * num_packs
        for item in order[layer]:
            open_packs = (p for p in range(num_packs) if fill[p] < groups_per_pack)
            target = min(open_packs, key=load.__getitem__)
            assert fill[target] < groups_per_pack
            pack_index[layer, item] = target
            rank_in_pack[layer, item] = fill[target]
            load[target] += weight[layer, item]
            fill[target] += 1
    return pack_index, rank_in_pack
52
|
+
|
53
|
+
|
54
|
+
def replicate_experts(
    weight: torch.Tensor, num_phy: int
) -> Tuple[torch.Tensor, torch.Tensor, torch.Tensor]:
    """
    Grow `num_log` logical experts into `num_phy` physical replicas, greedily
    duplicating whichever expert currently carries the highest load per replica,
    so that the maximum replica load is minimized.

    Parameters:
        weight: [X, num_log], load of each logical expert
        num_phy: total number of physical replicas to produce (>= num_log)

    Returns:
        phy2log: [X, num_phy], logical expert backing each physical slot
        rank: [X, num_phy], replica rank of each physical slot
        logcnt: [X, num_log], replica count per logical expert
    """
    n, num_log = weight.shape
    assert num_phy >= num_log
    device = weight.device

    # Slots [0, num_log) start as the identity; slots [num_log, num_phy) are
    # overwritten below, one per iteration.
    phy2log = torch.arange(num_phy, dtype=torch.int64, device=device).repeat(n, 1)
    rank = torch.zeros(n, num_phy, dtype=torch.int64, device=device)
    logcnt = torch.ones(n, num_log, dtype=torch.int64, device=device)

    rows = torch.arange(n, dtype=torch.int64, device=device)
    for slot in range(num_log, num_phy):
        # Per-row argmax of load-per-replica decides who receives the next copy.
        chosen = (weight / logcnt).max(dim=-1).indices
        phy2log[:, slot] = chosen
        rank[:, slot] = logcnt[rows, chosen]
        logcnt[rows, chosen] += 1
    return phy2log, rank, logcnt
83
|
+
|
84
|
+
|
85
|
+
def rebalance_experts_hierarchical(
    weight: torch.Tensor,
    num_physical_experts: int,
    num_groups: int,
    num_nodes: int,
    num_gpus: int,
) -> Tuple[torch.Tensor, torch.Tensor, torch.Tensor]:
    """
    Parameters:
        weight: [num_moe_layers, num_logical_experts]
        num_physical_experts: number of physical experts after replication
        num_groups: number of expert groups
        num_nodes: number of server nodes, where the intra-node network (e.g, NVLink) is faster
        num_gpus: number of GPUs, must be a multiple of `num_nodes`

    Returns:
        physical_to_logical_map: [num_moe_layers, num_physical_experts]
        logical_to_physical_map: [num_moe_layers, num_logical_experts, X]
        logical_count: [num_moe_layers, num_logical_experts]
    """
    num_layers, num_logical_experts = weight.shape
    assert num_logical_experts % num_groups == 0
    group_size = num_logical_experts // num_groups
    assert num_groups % num_nodes == 0
    groups_per_node = num_groups // num_nodes
    assert num_gpus % num_nodes == 0
    assert num_physical_experts % num_gpus == 0
    phy_experts_per_gpu = num_physical_experts // num_gpus

    def inverse(perm: torch.Tensor) -> torch.Tensor:
        # Invert a batched permutation: inv[i, perm[i, j]] = j.
        inv = torch.empty_like(perm)
        inv.scatter_(
            1,
            perm,
            torch.arange(perm.size(1), dtype=torch.int64, device=perm.device).expand(
                perm.shape
            ),
        )
        return inv

    # Step 1: pack groups to nodes
    tokens_per_group = weight.unflatten(-1, (num_groups, group_size)).sum(-1)
    group_pack_index, group_rank_in_pack = balanced_packing(tokens_per_group, num_nodes)
    # log2mlog maps each logical expert id to its "meta-logical" id: the
    # position it occupies after the groups are reordered onto nodes.
    log2mlog = (
        (
            (group_pack_index * groups_per_node + group_rank_in_pack) * group_size
        ).unsqueeze(-1)
        + torch.arange(group_size, dtype=torch.int64, device=group_pack_index.device)
    ).flatten(-2)
    mlog2log = inverse(log2mlog)

    # Step 2: construct redundant experts within nodes
    # [num_layers * num_nodes, num_logical_experts // num_nodes]
    tokens_per_mlog = weight.gather(-1, mlog2log).view(
        -1, num_logical_experts // num_nodes
    )
    phy2mlog, phyrank, mlogcnt = replicate_experts(
        tokens_per_mlog, num_physical_experts // num_nodes
    )

    # Step 3: pack physical_experts to GPUs
    # [num_layers * num_nodes, num_physical_experts // num_nodes]
    # Each replica is charged its expert's load divided by the replica count.
    tokens_per_phy = (tokens_per_mlog / mlogcnt).gather(-1, phy2mlog)
    pack_index, rank_in_pack = balanced_packing(tokens_per_phy, num_gpus // num_nodes)
    phy2pphy = pack_index * phy_experts_per_gpu + rank_in_pack
    pphy2phy = inverse(phy2pphy)

    pphy2mlog = phy2mlog.gather(
        -1, pphy2phy
    )  # [num_layers * num_nodes, num_log_per_nodes]
    # Offset each node's local meta-logical ids back into the global id space.
    pphy2mlog = (
        pphy2mlog.view(num_layers, num_nodes, -1)
        + torch.arange(
            0,
            num_logical_experts,
            num_logical_experts // num_nodes,
            device=group_pack_index.device,
        ).view(1, -1, 1)
    ).flatten(-2)
    pphy2log = mlog2log.gather(-1, pphy2mlog)
    pphyrank = phyrank.gather(-1, pphy2phy).view(num_layers, -1)
    logcnt = mlogcnt.view(num_layers, -1).gather(-1, log2mlog)
    return pphy2log, pphyrank, logcnt
168
|
+
|
169
|
+
|
170
|
+
def rebalance_experts(
    weight: torch.Tensor,
    num_replicas: int,
    num_groups: int,
    num_nodes: int,
    num_gpus: int,
    enable_hierarchical: bool,
) -> Tuple[torch.Tensor, torch.Tensor, torch.Tensor]:
    """
    Entry point for expert-parallelism load balancer.

    Parameters:
        weight: [layers, num_logical_experts], accumulated load of every logical expert
        num_replicas: number of physical experts, must be a multiple of `num_gpus`
        num_groups: number of expert groups
        num_nodes: number of server nodes (intra-node links such as NVLink are faster)
        num_gpus: number of GPUs, must be a multiple of `num_nodes`
        enable_hierarchical: respect group/node boundaries when placing replicas

    Returns:
        physical_to_logical_map: [layers, num_replicas], expert index of each replica
        logical_to_physical_map: [layers, num_logical_experts, X], replica slots
            of each expert, padded with -1
        expert_count: [layers, num_logical_experts], replicas per logical expert
    """

    num_layers, num_logical_experts = weight.shape
    weight = weight.float().cpu()

    # The global policy is the hierarchical one collapsed onto a single
    # node with a single group, so both branches share one implementation.
    if enable_hierarchical:
        groups, nodes = num_groups, num_nodes
    else:
        groups, nodes = 1, 1
    phy2log, phyrank, logcnt = rebalance_experts_hierarchical(
        weight, num_replicas, groups, nodes, num_gpus
    )

    # Build the inverse map: for every (expert, replica-rank) pair, record the
    # physical slot that holds it; unused ranks stay -1.
    maxlogcnt = logcnt.max().item()
    log2phy: torch.Tensor = torch.full(
        (num_layers, num_logical_experts, maxlogcnt),
        -1,
        dtype=torch.int64,
        device=logcnt.device,
    )
    log2phy.view(num_layers, -1).scatter_(
        -1,
        phy2log * maxlogcnt + phyrank,
        torch.arange(num_replicas, dtype=torch.int64, device=log2phy.device).expand(
            num_layers, -1
        ),
    )
    return phy2log, log2phy, logcnt


__all__ = ["rebalance_experts"]
|
@@ -0,0 +1,276 @@
|
|
1
|
+
# This file is copied from https://github.com/deepseek-ai/EPLB/blob/main/eplb.py since that one is not a pypi package
|
2
|
+
from typing import Optional, Tuple
|
3
|
+
|
4
|
+
import torch
|
5
|
+
|
6
|
+
|
7
|
+
def pack_groups(tokens_per_group: torch.Tensor, num_nodes: int) -> torch.Tensor:
    """
    Greedily assign expert groups to nodes so that every node receives exactly
    num_groups / num_nodes groups and per-node token loads stay balanced.

    Groups are visited heaviest-first; each goes to the lightest-loaded node
    that still has room.

    Parameters:
        tokens_per_group: [num_layers, num_groups], token load of each group
        num_nodes: number of nodes

    Returns:
        [num_layers, num_groups] int64 tensor (on CPU) mapping each group to
        its packed slot: node_index * groups_per_rank + position_within_node.
    """
    num_layers, num_groups = tokens_per_group.shape
    assert num_groups % num_nodes == 0
    groups_per_rank = num_groups // num_nodes

    indices = tokens_per_group.float().sort(-1, descending=True).indices.cpu()
    ret = torch.full_like(
        tokens_per_group, fill_value=-1, dtype=torch.int64, device="cpu"
    )
    for layer in range(num_layers):
        node_tokens = [0] * num_nodes
        node_groups = [0] * num_nodes
        for group in indices[layer]:

            # Fix: the original annotated this as `-> int`, but it returns a
            # 2-tuple used for lexicographic comparison (full nodes sort last,
            # then lightest load wins).
            def key_func(rank: int) -> Tuple[int, int]:
                if node_groups[rank] >= groups_per_rank:
                    return 1, 0
                else:
                    return 0, node_tokens[rank]

            rank = min(range(num_nodes), key=key_func)
            assert node_groups[rank] < groups_per_rank
            ret[layer, group] = rank * groups_per_rank + node_groups[rank]
            node_tokens[rank] += tokens_per_group[layer, group]
            node_groups[rank] += 1
    return ret
33
|
+
|
34
|
+
|
35
|
+
def make_redundant_experts_chunkwise(
    tokens_per_expert: torch.Tensor,
    num_physical_experts: int,
    num_local_physical_experts: int,
    num_physical_experts_per_chunk: int,
) -> Tuple[torch.Tensor, torch.Tensor, torch.Tensor]:
    """
    Create redundant (replicated) experts independently inside each chunk of
    physical expert slots, then optionally re-shuffle slots to balance load
    across GPUs.

    Parameters:
        tokens_per_expert: [num_steps, num_moe_layers, num_logical_experts]
        num_physical_experts: total physical experts (logical + redundant)
        num_local_physical_experts: physical experts hosted on one GPU
        num_physical_experts_per_chunk: chunk size; must divide
            `num_physical_experts`, and the logical/redundant experts must
            split evenly across the resulting chunks

    Returns:
        physical_to_logical_map: [num_moe_layers, num_physical_experts]
        logical_to_physical_map:
            [num_moe_layers, num_logical_experts, num_redundancy_experts + 1],
            padded with -1
        logical_count: [num_moe_layers, num_logical_experts], replica counts
    """
    num_steps, num_moe_layers, num_logical_experts = tokens_per_expert.shape
    num_redundancy_experts = num_physical_experts - num_logical_experts

    physical_to_logical_map = torch.empty(
        num_moe_layers,
        num_physical_experts,
        dtype=torch.int,
        device=tokens_per_expert.device,
    )
    logical_to_physical_map = torch.full(
        (num_moe_layers, num_logical_experts, num_redundancy_experts + 1),
        -1,
        dtype=torch.int,
        device=tokens_per_expert.device,
    )
    logical_count = torch.ones(
        num_moe_layers,
        num_logical_experts,
        dtype=torch.int,
        device=tokens_per_expert.device,
    )

    assert num_physical_experts % num_physical_experts_per_chunk == 0
    num_chunks = num_physical_experts // num_physical_experts_per_chunk
    assert num_logical_experts % num_chunks == 0
    num_logical_experts_per_group = num_logical_experts // num_chunks
    assert num_redundancy_experts % num_chunks == 0
    num_redundancy_experts_per_group = num_redundancy_experts // num_chunks

    arange_num_moe_layers_num_groups = torch.arange(
        num_moe_layers * num_chunks, dtype=torch.int, device=tokens_per_expert.device
    )
    arange_num_logical_experts = torch.arange(
        num_logical_experts, dtype=torch.int, device=tokens_per_expert.device
    )
    arange_num_logical_experts_per_group = torch.arange(
        num_logical_experts_per_group, dtype=torch.int, device=tokens_per_expert.device
    )
    arange_num_groups = torch.arange(
        num_chunks, dtype=torch.int, device=tokens_per_expert.device
    )
    # Identity placement: the first num_logical_experts_per_group slots of each
    # chunk hold that chunk's logical experts in order.
    physical_to_logical_map.view(
        num_moe_layers, num_chunks, num_physical_experts_per_chunk
    )[:, :, :num_logical_experts_per_group] = arange_num_logical_experts.view(
        num_chunks, num_logical_experts_per_group
    )
    # Rank-0 replica of every expert is its identity slot computed above.
    logical_to_physical_map[:, :, 0] = (
        arange_num_logical_experts_per_group.expand(
            num_chunks, num_logical_experts_per_group
        )
        + arange_num_groups[:, None] * num_physical_experts_per_chunk
    ).view(num_logical_experts)

    # Tiny per-expert offsets make all scores pairwise distinct so argmax
    # ties break deterministically.
    tokens_per_expert_all_diff = tokens_per_expert + arange_num_logical_experts * 1e-4
    # Greedily fill each chunk's redundant slots: each round duplicates the
    # expert whose duplication most reduces the chunk's worst per-replica load.
    for i in range(num_redundancy_experts_per_group):
        score = (
            tokens_per_expert_all_diff / logical_count
        )  # NOTE: Values in score must be different from each other
        score1 = tokens_per_expert / (logical_count + 1)
        score = score.view(
            num_steps, num_moe_layers, num_chunks, num_logical_experts_per_group
        )
        score1 = score1.view_as(score)
        values, indices = score.max(-1, keepdim=True)
        values = values.expand_as(score).contiguous()
        # Hypothetically give the current max expert one more replica and
        # recompute the chunk's max load for each candidate.
        score.scatter_(-1, indices, score1.gather(-1, indices))
        values.scatter_(-1, indices, score.max(-1, keepdim=True).values)
        # Pick the candidate that minimizes the load summed over steps.
        redundancy_indices = values.sum(0).argmin(-1)
        physical_to_logical_map.view(
            num_moe_layers, num_chunks, num_physical_experts_per_chunk
        )[:, :, num_logical_experts_per_group + i] = (
            redundancy_indices + arange_num_groups * num_logical_experts_per_group
        )
        redundancy_count = (
            logical_count.view(
                num_moe_layers * num_chunks, num_logical_experts_per_group
            )
            .gather(-1, redundancy_indices.view(num_moe_layers * num_chunks, 1))
            .squeeze(1)
        )
        physical_redundancy_indices = (
            (
                arange_num_groups * num_physical_experts_per_chunk
                + num_logical_experts_per_group
                + i
            )
            .expand(num_moe_layers, num_chunks)
            .flatten()
        )
        # Record the new replica slot at the expert's next free rank.
        logical_to_physical_map.view(
            num_moe_layers * num_chunks,
            num_logical_experts_per_group,
            num_redundancy_experts + 1,
        )[
            arange_num_moe_layers_num_groups,
            redundancy_indices.view(num_moe_layers * num_chunks),
            redundancy_count,
        ] = physical_redundancy_indices
        logical_count.view(num_moe_layers * num_chunks, num_logical_experts_per_group)[
            arange_num_moe_layers_num_groups,
            redundancy_indices.view(num_moe_layers * num_chunks),
        ] += 1

    if num_local_physical_experts > 1:
        # Load-balancing between GPUs
        physical_to_logical_map_int64 = physical_to_logical_map.to(torch.int64)
        counts = logical_count.gather(-1, physical_to_logical_map_int64)
        # Per-slot load: expert load (summed over steps) split among replicas.
        score = tokens_per_expert.sum(0).gather(-1, physical_to_logical_map_int64)
        score = score / counts
        score = score.view(num_moe_layers, num_chunks, num_physical_experts_per_chunk)
        indices = score.argsort(-1, descending=True)
        indices += torch.arange(
            0,
            num_physical_experts,
            num_physical_experts_per_chunk,
            dtype=indices.dtype,
            device=indices.device,
        )[None, :, None]

        assert num_physical_experts_per_chunk % num_local_physical_experts == 0
        num_local_groups = num_physical_experts_per_chunk // num_local_physical_experts
        # Deal slots onto GPUs heaviest-first, reversing every other row
        # (snake order) so heavy and light slots interleave across GPUs.
        indices = indices.view(
            num_moe_layers, num_chunks, num_local_physical_experts, num_local_groups
        )
        indices[:, :, 1::2, :] = indices[:, :, 1::2, :].flip(-1)
        indices = indices.transpose(2, 3)
        indices = indices.reshape(num_moe_layers, num_physical_experts)
        physical_to_logical_map = physical_to_logical_map.gather(-1, indices)
        # Remap logical_to_physical_map through the permutation; -1 padding is
        # masked out first so it is not treated as a real slot index.
        mask = logical_to_physical_map == -1
        logical_to_physical_map[mask] = 0
        logical_to_physical_map = (
            indices.argsort(-1)
            .gather(
                -1, logical_to_physical_map.view(num_moe_layers, -1).to(torch.int64)
            )
            .view_as(logical_to_physical_map)
            .to(torch.int)
        )
        logical_to_physical_map[mask] = -1

    return physical_to_logical_map, logical_to_physical_map, logical_count
182
|
+
|
183
|
+
|
184
|
+
def decode_rebalance_experts(
    tokens_per_expert: torch.Tensor,
    num_physical_experts: int,
    num_local_physical_experts: int,
):
    """Rebalance for decode: treat all physical experts as one single chunk."""
    return make_redundant_experts_chunkwise(
        tokens_per_expert=tokens_per_expert,
        num_physical_experts=num_physical_experts,
        num_local_physical_experts=num_local_physical_experts,
        num_physical_experts_per_chunk=num_physical_experts,
    )
195
|
+
|
196
|
+
|
197
|
+
def prefill_rebalance_experts(
    tokens_per_expert: torch.Tensor,
    num_physical_experts: int,
    num_local_physical_experts: int,
    num_groups: int,
    num_nodes: int,
):
    """
    Hierarchical rebalance used for prefill: first pack expert groups onto
    nodes, then create redundant experts chunk-wise within each node.

    Parameters:
        tokens_per_expert: [num_steps, num_moe_layers, num_logical_experts]
        num_physical_experts: total physical experts after replication
        num_local_physical_experts: physical experts hosted on one GPU
        num_groups: number of expert groups
        num_nodes: number of nodes

    Returns:
        (phy2log, log2phy, log_count), all expressed in the original
        logical-expert id space.
    """
    tokens_per_expert = tokens_per_expert.float().cpu()

    num_steps, _, num_logical_experts = tokens_per_expert.shape
    assert num_logical_experts % num_groups == 0
    group_size = num_logical_experts // num_groups
    assert num_groups % num_nodes == 0, f"{num_groups=} {num_nodes=}"

    # Aggregate loads over steps, then per group, and pack the groups onto nodes.
    tokens_per_group = tokens_per_expert.sum(0).unflatten(-1, (num_groups, -1)).sum(-1)
    group_perm = pack_groups(
        tokens_per_group, num_nodes
    )  # [num_moe_layers, num_groups] -> packed slot id of each group

    # log2mlog [layers, #logexp] -> [layers, #logexp]: logical id to the
    # "meta-logical" id it occupies after groups are reordered onto nodes.
    log2mlog = (
        (group_perm * group_size).unsqueeze(-1)
        + torch.arange(group_size, dtype=torch.int64, device=group_perm.device)
    ).flatten(-2)

    # mlog2log [layers, #logexp] -> [layers, #logexp], inverse of log2mlog
    mlog2log = torch.empty_like(log2mlog)
    arange = torch.arange(
        num_logical_experts, dtype=torch.int64, device=mlog2log.device
    )
    mlog2log.scatter_(1, log2mlog, arange.expand(log2mlog.size(0), -1))

    # tokens_per_mlog[i][j][k] = tokens_per_expert[i][j][mlog2log[j][k]]
    tokens_per_mlog = tokens_per_expert.gather(
        2, mlog2log.unsqueeze(0).expand(num_steps, -1, -1)
    )

    # Replicate experts within node-sized chunks of physical slots.
    phy2mlog, mlog2phy, mlog_count = make_redundant_experts_chunkwise(
        tokens_per_mlog,
        num_physical_experts,
        num_local_physical_experts,
        num_physical_experts // num_nodes,
    )

    # phy2log[i][j] = mlog2log[i][phy2mlog[i][j]]
    phy2log = mlog2log.gather(1, phy2mlog.to(torch.int64))

    # mlog2phy: [num_moe_layers, num_logical_experts, ...]
    # log2phy[i][j][k] = mlog2phy[i][log2mlog[i][j]][k]
    log2phy = mlog2phy.gather(
        1, log2mlog.unsqueeze(-1).expand(-1, -1, mlog2phy.size(-1)).to(torch.int64)
    )

    # log_count[i][j] = mlog_count[i][log2mlog[i][j]]
    log_count = mlog_count.gather(1, log2mlog)
    return phy2log, log2phy, log_count
253
|
+
|
254
|
+
|
255
|
+
def rebalance_experts(
    tokens_per_expert: torch.Tensor,
    num_physical_experts: int,
    num_local_physical_experts: int,
    num_groups: Optional[int],
    num_nodes: int,
    enable_hierarchical: bool,
):
    """Dispatch between the prefill (hierarchical) and decode (flat) balancers."""
    common = dict(
        tokens_per_expert=tokens_per_expert,
        num_physical_experts=num_physical_experts,
        num_local_physical_experts=num_local_physical_experts,
    )
    if enable_hierarchical:
        return prefill_rebalance_experts(
            num_groups=num_groups, num_nodes=num_nodes, **common
        )
    return decode_rebalance_experts(**common)
|
@@ -0,0 +1,96 @@
|
|
1
|
+
import logging
|
2
|
+
import time
|
3
|
+
from typing import TYPE_CHECKING, List
|
4
|
+
|
5
|
+
import torch.cuda
|
6
|
+
|
7
|
+
from sglang.srt.managers.expert_distribution import (
|
8
|
+
get_global_expert_distribution_recorder,
|
9
|
+
)
|
10
|
+
from sglang.srt.managers.expert_location import ExpertLocationMetadata
|
11
|
+
|
12
|
+
if TYPE_CHECKING:
|
13
|
+
from sglang.srt.model_executor.model_runner import ModelRunner
|
14
|
+
|
15
|
+
logger = logging.getLogger(__name__)
|
16
|
+
|
17
|
+
|
18
|
+
class EPLBManager:
    """Periodically rebalances expert placement (EPLB) for a MoE model.

    The model runner calls :meth:`on_forward_pass_end` once per forward pass;
    this advances an internal generator so that a rebalance is triggered every
    ``eplb_rebalance_num_iterations`` passes. When
    ``eplb_rebalance_layers_per_chunk`` is set, the weight relocation is spread
    across multiple forward passes, one chunk of layers per pass.
    """

    def __init__(self, model_runner: "ModelRunner"):
        super().__init__()
        self._model_runner = model_runner
        self._server_args = model_runner.server_args
        # When None, all layers are updated in a single step (and timed).
        self._rebalance_layers_per_chunk = (
            self._server_args.eplb_rebalance_layers_per_chunk
        )
        self._rebalance_num_iterations = self._server_args.eplb_rebalance_num_iterations

        # Otherwise, the circular buffer will contain stale data. If the case is needed, it can be implemented.
        # NOTE: message matches the `>=` condition below (was previously
        # misleadingly worded as strictly "greater than").
        assert (
            self._server_args.eplb_rebalance_num_iterations
            >= self._server_args.expert_distribution_recorder_buffer_size
        ), "eplb_rebalance_num_iterations must be greater than or equal to expert_distribution_recorder_buffer_size"

        # Rebalancing needs fresh expert-distribution statistics.
        if not get_global_expert_distribution_recorder().recording:
            get_global_expert_distribution_recorder().start_record()

        logger.info(
            f"[EPLBManager] system started, will rebalance per {self._rebalance_num_iterations} iterations."
        )

        self._main_generator = self._entrypoint()

    def on_forward_pass_end(self):
        """Advance the driver generator by one step (called once per forward pass)."""
        next(self._main_generator)

    # can be more complex if needed
    def _entrypoint(self):
        """Driver generator: idle for N passes, then run (possibly chunked) rebalance."""
        while True:
            for _ in range(self._rebalance_num_iterations):
                yield

            yield from self.rebalance()

    def rebalance(self):
        """Recompute expert locations from recorded stats and apply them.

        This is a generator: when layer chunking is enabled it yields between
        chunks so that the update is interleaved with forward passes.
        """
        logger.info("[EPLBManager] rebalance start")

        # Wall-clock timing is only meaningful when the whole update happens
        # in one shot; with chunking the work spans multiple forward passes.
        enable_timing = self._rebalance_layers_per_chunk is None

        if enable_timing:
            torch.cuda.synchronize()
            time_start = time.time()

        logical_count = get_global_expert_distribution_recorder().dump_record(
            output_mode="object"
        )["logical_count"]
        expert_location_metadata = ExpertLocationMetadata.init_by_eplb(
            self._server_args, self._model_runner.model_config, logical_count
        )

        update_layer_ids_chunks = self._compute_update_layer_ids_chunks()
        for chunk_index, update_layer_ids in enumerate(update_layer_ids_chunks):
            # Yield before every chunk (when there is more than one) so each
            # chunk's weight movement happens on a separate forward pass.
            if len(update_layer_ids_chunks) > 1:
                yield
            self._model_runner.update_expert_location(
                expert_location_metadata,
                update_layer_ids=update_layer_ids,
            )

        # No placeholders here, so a plain string (was a needless f-string).
        msg = "[EPLBManager] rebalance end"
        if enable_timing:
            torch.cuda.synchronize()
            time_end = time.time()
            msg += f" time={time_end - time_start:.3f}s"
        logger.info(msg)

    def _compute_update_layer_ids_chunks(self) -> List[List[int]]:
        """Split the routed-expert layer ids into update chunks.

        Sorted so chunks update layers in a deterministic order; when chunking
        is disabled, a single huge chunk covers every layer.
        """
        all_layer_ids = sorted(
            list(self._model_runner.model.routed_experts_weights_of_layer.keys())
        )
        chunk_size = self._rebalance_layers_per_chunk or 1000000
        return list(_chunk_list(all_layer_ids, chunk_size=chunk_size))
|
92
|
+
|
93
|
+
|
94
|
+
def _chunk_list(items: List, chunk_size):
|
95
|
+
for start_index in range(0, len(items), chunk_size):
|
96
|
+
yield items[start_index : start_index + chunk_size]
|