sglang 0.4.6.post5__py3-none-any.whl → 0.4.7__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- sglang/bench_offline_throughput.py +10 -4
- sglang/bench_one_batch_server.py +67 -11
- sglang/bench_serving.py +85 -74
- sglang/lang/backend/runtime_endpoint.py +24 -1
- sglang/profiler.py +167 -0
- sglang/srt/_custom_ops.py +34 -0
- sglang/srt/configs/internvl.py +8 -12
- sglang/srt/configs/model_config.py +27 -1
- sglang/srt/constrained/base_grammar_backend.py +5 -2
- sglang/srt/constrained/llguidance_backend.py +9 -8
- sglang/srt/constrained/outlines_backend.py +5 -4
- sglang/srt/constrained/xgrammar_backend.py +18 -18
- sglang/srt/conversation.py +46 -8
- sglang/srt/custom_op.py +38 -3
- sglang/srt/debug_utils.py +74 -0
- sglang/srt/disaggregation/common/__init__.py +1 -0
- sglang/srt/disaggregation/common/conn.py +407 -0
- sglang/srt/disaggregation/decode.py +67 -3
- sglang/srt/disaggregation/fake/conn.py +1 -0
- sglang/srt/disaggregation/kv_events.py +60 -5
- sglang/srt/disaggregation/launch_lb.py +140 -0
- sglang/srt/disaggregation/mini_lb.py +29 -48
- sglang/srt/disaggregation/mooncake/conn.py +432 -140
- sglang/srt/disaggregation/mooncake/transfer_engine.py +32 -16
- sglang/srt/disaggregation/nixl/conn.py +124 -432
- sglang/srt/disaggregation/prefill.py +2 -0
- sglang/srt/disaggregation/utils.py +38 -1
- sglang/srt/distributed/device_communicators/pymscclpp.py +315 -0
- sglang/srt/distributed/parallel_state.py +52 -5
- sglang/srt/entrypoints/EngineBase.py +6 -0
- sglang/srt/entrypoints/engine.py +102 -5
- sglang/srt/entrypoints/http_server.py +15 -2
- sglang/srt/function_call/base_format_detector.py +138 -86
- sglang/srt/function_call/deepseekv3_detector.py +54 -6
- sglang/srt/function_call/ebnf_composer.py +33 -19
- sglang/srt/function_call/function_call_parser.py +27 -0
- sglang/srt/function_call/llama32_detector.py +33 -14
- sglang/srt/function_call/mistral_detector.py +73 -26
- sglang/srt/function_call/pythonic_detector.py +86 -20
- sglang/srt/function_call/qwen25_detector.py +64 -10
- sglang/srt/function_call/utils.py +17 -0
- sglang/srt/hf_transformers_utils.py +4 -0
- sglang/srt/layers/attention/aiter_backend.py +488 -123
- sglang/srt/layers/attention/base_attn_backend.py +4 -0
- sglang/srt/layers/attention/cutlass_mla_backend.py +2 -19
- sglang/srt/layers/attention/flashattention_backend.py +103 -18
- sglang/srt/layers/attention/flashinfer_backend.py +45 -1
- sglang/srt/layers/attention/flashinfer_mla_backend.py +37 -1
- sglang/srt/layers/attention/intel_amx_backend.py +128 -0
- sglang/srt/layers/attention/tbo_backend.py +232 -0
- sglang/srt/layers/attention/torch_native_backend.py +3 -0
- sglang/srt/layers/attention/triton_backend.py +244 -5
- sglang/srt/layers/attention/triton_ops/extend_attention.py +12 -4
- sglang/srt/layers/communicator.py +260 -194
- sglang/srt/layers/dp_attention.py +6 -5
- sglang/srt/layers/layernorm.py +30 -19
- sglang/srt/layers/moe/cutlass_moe.py +170 -7
- sglang/srt/layers/moe/cutlass_moe_params.py +169 -0
- sglang/srt/layers/moe/ep_moe/kernels.py +27 -6
- sglang/srt/layers/moe/ep_moe/layer.py +94 -40
- sglang/srt/layers/moe/ep_moe/token_dispatcher.py +13 -8
- sglang/srt/layers/moe/fused_moe_native.py +4 -0
- sglang/srt/layers/moe/fused_moe_triton/configs/triton_3_2_0/E=257,N=128,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8,block_shape=[128, 128].json +146 -0
- sglang/srt/layers/moe/fused_moe_triton/configs/triton_3_2_0/E=257,N=128,device_name=NVIDIA_H20,dtype=fp8_w8a8,block_shape=[128, 128].json +146 -0
- sglang/srt/layers/moe/fused_moe_triton/configs/triton_3_2_0/E=257,N=256,device_name=NVIDIA_B200,dtype=fp8_w8a8,block_shape=[128, 128].json +146 -0
- sglang/srt/layers/moe/fused_moe_triton/configs/triton_3_2_0/E=257,N=256,device_name=NVIDIA_H20,dtype=fp8_w8a8,block_shape=[128, 128].json +146 -0
- sglang/srt/layers/moe/fused_moe_triton/configs/triton_3_2_0/E=257,N=256,device_name=NVIDIA_H200,dtype=fp8_w8a8,block_shape=[128, 128].json +146 -0
- sglang/srt/layers/moe/fused_moe_triton/configs/triton_3_2_0/E=8,N=7168,device_name=NVIDIA_H100_80GB_HBM3.json +146 -0
- sglang/srt/layers/moe/fused_moe_triton/configs/triton_3_3_1/E=257,N=256,device_name=NVIDIA_B200,dtype=fp8_w8a8,block_shape=[128, 128].json +146 -0
- sglang/srt/layers/moe/fused_moe_triton/configs/triton_3_3_1/E=8,N=7168,device_name=NVIDIA_H100_80GB_HBM3.json +146 -0
- sglang/srt/layers/moe/fused_moe_triton/fused_moe.py +220 -25
- sglang/srt/layers/moe/fused_moe_triton/layer.py +34 -4
- sglang/srt/layers/moe/topk.py +44 -18
- sglang/srt/layers/multimodal.py +3 -3
- sglang/srt/layers/quantization/__init__.py +3 -2
- sglang/srt/layers/quantization/blockwise_int8.py +3 -0
- sglang/srt/layers/quantization/compressed_tensors/compressed_tensors_moe.py +5 -0
- sglang/srt/layers/quantization/deep_gemm.py +55 -56
- sglang/srt/layers/quantization/fp8.py +28 -23
- sglang/srt/layers/quantization/fp8_kernel.py +118 -66
- sglang/srt/layers/quantization/fp8_utils.py +165 -49
- sglang/srt/layers/quantization/modelopt_quant.py +334 -7
- sglang/srt/layers/quantization/moe_wna16.py +3 -0
- sglang/srt/layers/quantization/w8a8_fp8.py +3 -0
- sglang/srt/layers/quantization/w8a8_int8.py +3 -0
- sglang/srt/layers/rotary_embedding.py +6 -12
- sglang/srt/layers/sampler.py +80 -79
- sglang/srt/layers/utils.py +6 -0
- sglang/srt/lora/layers.py +12 -15
- sglang/srt/lora/lora.py +49 -5
- sglang/srt/lora/lora_manager.py +19 -5
- sglang/srt/lora/mem_pool.py +24 -16
- sglang/srt/lora/utils.py +17 -13
- sglang/srt/managers/data_parallel_controller.py +13 -5
- sglang/srt/managers/eplb_algorithms/__init__.py +63 -0
- sglang/srt/managers/eplb_algorithms/deepseek.py +223 -0
- sglang/srt/managers/{deepseek_eplb.py → eplb_algorithms/deepseek_vec.py} +5 -7
- sglang/srt/managers/eplb_manager.py +55 -14
- sglang/srt/managers/expert_distribution.py +220 -46
- sglang/srt/managers/expert_location.py +110 -56
- sglang/srt/managers/expert_location_dispatch.py +23 -6
- sglang/srt/managers/io_struct.py +15 -4
- sglang/srt/managers/mm_utils.py +88 -38
- sglang/srt/managers/multimodal_processors/base_processor.py +188 -16
- sglang/srt/managers/multimodal_processors/gemma3.py +4 -31
- sglang/srt/managers/multimodal_processors/internvl.py +4 -0
- sglang/srt/managers/multimodal_processors/kimi_vl.py +15 -34
- sglang/srt/managers/multimodal_processors/minicpm.py +2 -1
- sglang/srt/managers/multimodal_processors/phi4mm.py +87 -0
- sglang/srt/managers/multimodal_processors/qwen_vl.py +22 -64
- sglang/srt/managers/schedule_batch.py +140 -38
- sglang/srt/managers/scheduler.py +305 -112
- sglang/srt/managers/tokenizer_manager.py +134 -17
- sglang/srt/managers/utils.py +0 -4
- sglang/srt/metrics/collector.py +9 -0
- sglang/srt/model_executor/cuda_graph_runner.py +72 -61
- sglang/srt/model_executor/expert_location_updater.py +157 -22
- sglang/srt/model_executor/forward_batch_info.py +38 -17
- sglang/srt/model_executor/model_runner.py +96 -56
- sglang/srt/model_loader/utils.py +67 -1
- sglang/srt/models/deepseek_nextn.py +1 -1
- sglang/srt/models/deepseek_v2.py +609 -234
- sglang/srt/models/gemma3_causal.py +7 -0
- sglang/srt/models/gemma3_mm.py +19 -14
- sglang/srt/models/idefics2.py +342 -0
- sglang/srt/models/kimi_vl.py +4 -4
- sglang/srt/models/llama.py +1 -1
- sglang/srt/models/minicpmo.py +2 -5
- sglang/srt/models/minicpmv.py +3 -295
- sglang/srt/models/phi4mm.py +512 -0
- sglang/srt/models/qwen2.py +38 -9
- sglang/srt/models/qwen2_5_vl.py +3 -9
- sglang/srt/models/qwen2_eagle.py +4 -1
- sglang/srt/models/qwen2_moe.py +58 -191
- sglang/srt/models/qwen2_vl.py +3 -9
- sglang/srt/models/qwen3.py +41 -10
- sglang/srt/models/qwen3_moe.py +230 -191
- sglang/srt/models/registry.py +9 -1
- sglang/srt/models/transformers.py +291 -0
- sglang/srt/openai_api/adapter.py +86 -24
- sglang/srt/openai_api/protocol.py +31 -2
- sglang/srt/openai_api/utils.py +172 -0
- sglang/srt/operations.py +37 -2
- sglang/srt/operations_strategy.py +200 -24
- sglang/srt/sampling/sampling_batch_info.py +13 -1
- sglang/srt/sampling/sampling_params.py +2 -1
- sglang/srt/server_args.py +114 -27
- sglang/srt/speculative/build_eagle_tree.py +8 -8
- sglang/srt/speculative/eagle_draft_cuda_graph_runner.py +8 -11
- sglang/srt/speculative/eagle_draft_extend_cuda_graph_runner.py +253 -0
- sglang/srt/speculative/eagle_utils.py +51 -91
- sglang/srt/speculative/eagle_worker.py +101 -21
- sglang/srt/two_batch_overlap.py +635 -0
- sglang/srt/utils.py +129 -7
- sglang/test/runners.py +16 -7
- sglang/test/send_one.py +4 -0
- sglang/test/test_cutlass_moe.py +3 -3
- sglang/test/test_fp4_moe.py +248 -0
- sglang/test/test_utils.py +79 -6
- sglang/version.py +1 -1
- {sglang-0.4.6.post5.dist-info → sglang-0.4.7.dist-info}/METADATA +14 -11
- {sglang-0.4.6.post5.dist-info → sglang-0.4.7.dist-info}/RECORD +318 -291
- {sglang-0.4.6.post5.dist-info → sglang-0.4.7.dist-info}/WHEEL +1 -1
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=1,N=14336,device_name=NVIDIA_A100-SXM4-80GB,dtype=int8_w8a16.json → triton_3_1_0/E=1,N=14336,device_name=NVIDIA_A100-SXM4-80GB,dtype=int8_w8a16.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=1,N=14336,device_name=NVIDIA_A100-SXM4-80GB.json → triton_3_1_0/E=1,N=14336,device_name=NVIDIA_A100-SXM4-80GB.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=1,N=1792,device_name=NVIDIA_A100-SXM4-80GB,dtype=int8_w8a16.json → triton_3_1_0/E=1,N=1792,device_name=NVIDIA_A100-SXM4-80GB,dtype=int8_w8a16.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=1,N=1792,device_name=NVIDIA_A100-SXM4-80GB.json → triton_3_1_0/E=1,N=1792,device_name=NVIDIA_A100-SXM4-80GB.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=1,N=3072,device_name=NVIDIA_A100-SXM4-80GB,dtype=int8_w8a16.json → triton_3_1_0/E=1,N=3072,device_name=NVIDIA_A100-SXM4-80GB,dtype=int8_w8a16.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=1,N=3072,device_name=NVIDIA_H100_80GB_HBM3,dtype=int8_w8a16.json → triton_3_1_0/E=1,N=3072,device_name=NVIDIA_H100_80GB_HBM3,dtype=int8_w8a16.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=1,N=3072,device_name=NVIDIA_H100_80GB_HBM3.json → triton_3_1_0/E=1,N=3072,device_name=NVIDIA_H100_80GB_HBM3.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=1,N=3584,device_name=NVIDIA_A100-SXM4-80GB,dtype=int8_w8a16.json → triton_3_1_0/E=1,N=3584,device_name=NVIDIA_A100-SXM4-80GB,dtype=int8_w8a16.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=1,N=3584,device_name=NVIDIA_A100-SXM4-80GB.json → triton_3_1_0/E=1,N=3584,device_name=NVIDIA_A100-SXM4-80GB.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=1,N=7168,device_name=NVIDIA_A100-SXM4-80GB,dtype=int8_w8a16.json → triton_3_1_0/E=1,N=7168,device_name=NVIDIA_A100-SXM4-80GB,dtype=int8_w8a16.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=1,N=7168,device_name=NVIDIA_A100-SXM4-80GB.json → triton_3_1_0/E=1,N=7168,device_name=NVIDIA_A100-SXM4-80GB.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=144,N=512,device_name=NVIDIA_H100_80GB_HBM3.json → triton_3_1_0/E=144,N=512,device_name=NVIDIA_H100_80GB_HBM3.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=16,N=1024,device_name=NVIDIA_H100_80GB_HBM3.json → triton_3_1_0/E=16,N=1024,device_name=NVIDIA_H100_80GB_HBM3.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=16,N=1024,device_name=NVIDIA_H200.json → triton_3_1_0/E=16,N=1024,device_name=NVIDIA_H200.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=16,N=1344,device_name=NVIDIA_A100-SXM4-40GB.json → triton_3_1_0/E=16,N=1344,device_name=NVIDIA_A100-SXM4-40GB.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=16,N=1344,device_name=NVIDIA_A100-SXM4-80GB.json → triton_3_1_0/E=16,N=1344,device_name=NVIDIA_A100-SXM4-80GB.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=16,N=1344,device_name=NVIDIA_H100_80GB_HBM3.json → triton_3_1_0/E=16,N=1344,device_name=NVIDIA_H100_80GB_HBM3.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=16,N=14336,device_name=NVIDIA_A100-SXM4-80GB,dtype=int8_w8a16.json → triton_3_1_0/E=16,N=14336,device_name=NVIDIA_A100-SXM4-80GB,dtype=int8_w8a16.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=16,N=14336,device_name=NVIDIA_A100-SXM4-80GB.json → triton_3_1_0/E=16,N=14336,device_name=NVIDIA_A100-SXM4-80GB.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=16,N=1792,device_name=NVIDIA_A100-SXM4-80GB,dtype=int8_w8a16.json → triton_3_1_0/E=16,N=1792,device_name=NVIDIA_A100-SXM4-80GB,dtype=int8_w8a16.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=16,N=1792,device_name=NVIDIA_A100-SXM4-80GB.json → triton_3_1_0/E=16,N=1792,device_name=NVIDIA_A100-SXM4-80GB.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=16,N=2048,device_name=NVIDIA_H100_80GB_HBM3.json → triton_3_1_0/E=16,N=2048,device_name=NVIDIA_H100_80GB_HBM3.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=16,N=2688,device_name=NVIDIA_A100-SXM4-80GB.json → triton_3_1_0/E=16,N=2688,device_name=NVIDIA_A100-SXM4-80GB.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=16,N=2688,device_name=NVIDIA_H100_80GB_HBM3.json → triton_3_1_0/E=16,N=2688,device_name=NVIDIA_H100_80GB_HBM3.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=16,N=3072,device_name=NVIDIA_A100-SXM4-80GB,dtype=int8_w8a16.json → triton_3_1_0/E=16,N=3072,device_name=NVIDIA_A100-SXM4-80GB,dtype=int8_w8a16.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=16,N=3072,device_name=NVIDIA_H100_80GB_HBM3,dtype=int8_w8a16.json → triton_3_1_0/E=16,N=3072,device_name=NVIDIA_H100_80GB_HBM3,dtype=int8_w8a16.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=16,N=3200,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8.json → triton_3_1_0/E=16,N=3200,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=16,N=3584,device_name=NVIDIA_A100-SXM4-80GB,dtype=int8_w8a16.json → triton_3_1_0/E=16,N=3584,device_name=NVIDIA_A100-SXM4-80GB,dtype=int8_w8a16.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=16,N=3584,device_name=NVIDIA_A100-SXM4-80GB.json → triton_3_1_0/E=16,N=3584,device_name=NVIDIA_A100-SXM4-80GB.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=16,N=6400,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8.json → triton_3_1_0/E=16,N=6400,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=16,N=7168,device_name=NVIDIA_A100-SXM4-80GB,dtype=int8_w8a16.json → triton_3_1_0/E=16,N=7168,device_name=NVIDIA_A100-SXM4-80GB,dtype=int8_w8a16.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=16,N=7168,device_name=NVIDIA_A100-SXM4-80GB.json → triton_3_1_0/E=16,N=7168,device_name=NVIDIA_A100-SXM4-80GB.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=16,N=7168,device_name=NVIDIA_H100_80GB_HBM3,dtype=int8_w8a16.json → triton_3_1_0/E=16,N=7168,device_name=NVIDIA_H100_80GB_HBM3,dtype=int8_w8a16.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=16,N=800,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8.json → triton_3_1_0/E=16,N=800,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=160,N=192,device_name=NVIDIA_A800-SXM4-80GB.json → triton_3_1_0/E=160,N=192,device_name=NVIDIA_A800-SXM4-80GB.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=20,N=2048,device_name=NVIDIA_H100_80GB_HBM3.json → triton_3_1_0/E=20,N=2048,device_name=NVIDIA_H100_80GB_HBM3.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=24,N=1024,device_name=NVIDIA_H100_80GB_HBM3.json → triton_3_1_0/E=24,N=1024,device_name=NVIDIA_H100_80GB_HBM3.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=256,N=128,device_name=NVIDIA_A100-SXM4-80GB,dtype=int8_w8a8,block_shape=[128, 128].json → triton_3_1_0/E=256,N=128,device_name=NVIDIA_A100-SXM4-80GB,dtype=int8_w8a8,block_shape=[128, 128].json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=256,N=128,device_name=NVIDIA_A100-SXM4-80GB,dtype=int8_w8a8.json → triton_3_1_0/E=256,N=128,device_name=NVIDIA_A100-SXM4-80GB,dtype=int8_w8a8.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=256,N=128,device_name=NVIDIA_A800-SXM4-80GB,dtype=int8_w8a8,block_shape=[128, 128].json → triton_3_1_0/E=256,N=128,device_name=NVIDIA_A800-SXM4-80GB,dtype=int8_w8a8,block_shape=[128, 128].json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=256,N=128,device_name=NVIDIA_A800-SXM4-80GB,dtype=int8_w8a8.json → triton_3_1_0/E=256,N=128,device_name=NVIDIA_A800-SXM4-80GB,dtype=int8_w8a8.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=256,N=128,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8,block_shape=[128, 128].json → triton_3_1_0/E=256,N=128,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8,block_shape=[128, 128].json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=256,N=128,device_name=NVIDIA_H20,block_shape=[128, 128].json → triton_3_1_0/E=256,N=128,device_name=NVIDIA_H20,block_shape=[128, 128].json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=256,N=128,device_name=NVIDIA_H20,dtype=fp8_w8a8,block_shape=[128, 128].json → triton_3_1_0/E=256,N=128,device_name=NVIDIA_H20,dtype=fp8_w8a8,block_shape=[128, 128].json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=256,N=128,device_name=NVIDIA_L20Y,dtype=fp8_w8a8,block_shape=[128, 128].json → triton_3_1_0/E=256,N=128,device_name=NVIDIA_L20Y,dtype=fp8_w8a8,block_shape=[128, 128].json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=256,N=256,device_name=AMD_Instinct_MI300X,dtype=fp8_w8a8,block_shape=[128, 128].json → triton_3_1_0/E=256,N=256,device_name=AMD_Instinct_MI300X,dtype=fp8_w8a8,block_shape=[128, 128].json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=256,N=256,device_name=AMD_Instinct_MI325X,dtype=fp8_w8a8,block_shape=[128, 128].json → triton_3_1_0/E=256,N=256,device_name=AMD_Instinct_MI325X,dtype=fp8_w8a8,block_shape=[128, 128].json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=256,N=256,device_name=AMD_Radeon_Graphics,dtype=fp8_w8a8,block_shape=[128, 128].json → triton_3_1_0/E=256,N=256,device_name=AMD_Radeon_Graphics,dtype=fp8_w8a8,block_shape=[128, 128].json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=256,N=256,device_name=NVIDIA_B200,dtype=fp8_w8a8,block_shape=[128, 128].json → triton_3_1_0/E=256,N=256,device_name=NVIDIA_B200,dtype=fp8_w8a8,block_shape=[128, 128].json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=256,N=256,device_name=NVIDIA_H20,dtype=int8_w8a8,block_shape=[128, 128].json → triton_3_1_0/E=256,N=256,device_name=NVIDIA_H20,dtype=int8_w8a8,block_shape=[128, 128].json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=256,N=256,device_name=NVIDIA_H200,dtype=fp8_w8a8,block_shape=[128, 128].json → triton_3_1_0/E=256,N=256,device_name=NVIDIA_H200,dtype=fp8_w8a8,block_shape=[128, 128].json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=256,N=64,device_name=NVIDIA_A800-SXM4-80GB.json → triton_3_1_0/E=256,N=64,device_name=NVIDIA_A800-SXM4-80GB.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=256,N=64,device_name=NVIDIA_L20,dtype=int8_w8a8.json → triton_3_1_0/E=256,N=64,device_name=NVIDIA_L20,dtype=int8_w8a8.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=256,N=64,device_name=NVIDIA_L40S,dtype=int8_w8a8.json → triton_3_1_0/E=256,N=64,device_name=NVIDIA_L40S,dtype=int8_w8a8.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=64,N=1024,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8,block_shape=[128, 128].json → triton_3_1_0/E=64,N=1024,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8,block_shape=[128, 128].json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=64,N=1280,device_name=NVIDIA_A100-SXM4-80GB.json → triton_3_1_0/E=64,N=1280,device_name=NVIDIA_A100-SXM4-80GB.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=64,N=1280,device_name=NVIDIA_A800-SXM4-80GB.json → triton_3_1_0/E=64,N=1280,device_name=NVIDIA_A800-SXM4-80GB.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=64,N=1280,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8.json → triton_3_1_0/E=64,N=1280,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=64,N=1280,device_name=NVIDIA_H100_80GB_HBM3.json → triton_3_1_0/E=64,N=1280,device_name=NVIDIA_H100_80GB_HBM3.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=64,N=1280,device_name=NVIDIA_H200,dtype=fp8_w8a8.json → triton_3_1_0/E=64,N=1280,device_name=NVIDIA_H200,dtype=fp8_w8a8.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=64,N=1280,device_name=NVIDIA_H200.json → triton_3_1_0/E=64,N=1280,device_name=NVIDIA_H200.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=64,N=2560,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8.json → triton_3_1_0/E=64,N=2560,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=64,N=2560,device_name=NVIDIA_H200,dtype=fp8_w8a8.json → triton_3_1_0/E=64,N=2560,device_name=NVIDIA_H200,dtype=fp8_w8a8.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=64,N=2560,device_name=NVIDIA_H200.json → triton_3_1_0/E=64,N=2560,device_name=NVIDIA_H200.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=64,N=320,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8.json → triton_3_1_0/E=64,N=320,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=64,N=320,device_name=NVIDIA_H100_80GB_HBM3.json → triton_3_1_0/E=64,N=320,device_name=NVIDIA_H100_80GB_HBM3.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=64,N=320,device_name=NVIDIA_H200,dtype=fp8_w8a8.json → triton_3_1_0/E=64,N=320,device_name=NVIDIA_H200,dtype=fp8_w8a8.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=64,N=320,device_name=NVIDIA_H200.json → triton_3_1_0/E=64,N=320,device_name=NVIDIA_H200.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=64,N=512,device_name=NVIDIA_H200,dtype=fp8_w8a8,block_shape=[128, 128].json → triton_3_1_0/E=64,N=512,device_name=NVIDIA_H200,dtype=fp8_w8a8,block_shape=[128, 128].json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=64,N=640,device_name=NVIDIA_A100-SXM4-80GB.json → triton_3_1_0/E=64,N=640,device_name=NVIDIA_A100-SXM4-80GB.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=64,N=640,device_name=NVIDIA_A800-SXM4-80GB.json → triton_3_1_0/E=64,N=640,device_name=NVIDIA_A800-SXM4-80GB.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=64,N=640,device_name=NVIDIA_GeForce_RTX_4090,dtype=fp8_w8a8.json → triton_3_1_0/E=64,N=640,device_name=NVIDIA_GeForce_RTX_4090,dtype=fp8_w8a8.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=64,N=640,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8.json → triton_3_1_0/E=64,N=640,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=64,N=640,device_name=NVIDIA_H100_80GB_HBM3.json → triton_3_1_0/E=64,N=640,device_name=NVIDIA_H100_80GB_HBM3.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=64,N=640,device_name=NVIDIA_H200,dtype=fp8_w8a8.json → triton_3_1_0/E=64,N=640,device_name=NVIDIA_H200,dtype=fp8_w8a8.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=64,N=640,device_name=NVIDIA_H200.json → triton_3_1_0/E=64,N=640,device_name=NVIDIA_H200.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=8,N=14336,device_name=AMD_Instinct_MI300X.json → triton_3_1_0/E=8,N=14336,device_name=AMD_Instinct_MI300X.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=8,N=14336,device_name=AMD_Instinct_MI325X.json → triton_3_1_0/E=8,N=14336,device_name=AMD_Instinct_MI325X.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=8,N=14336,device_name=AMD_Radeon_Graphics.json → triton_3_1_0/E=8,N=14336,device_name=AMD_Radeon_Graphics.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=8,N=14336,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8.json → triton_3_1_0/E=8,N=14336,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=8,N=14336,device_name=NVIDIA_H200,dtype=fp8_w8a8.json → triton_3_1_0/E=8,N=14336,device_name=NVIDIA_H200,dtype=fp8_w8a8.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=8,N=14336,device_name=NVIDIA_H200.json → triton_3_1_0/E=8,N=14336,device_name=NVIDIA_H200.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=8,N=1792,device_name=AMD_Instinct_MI300X.json → triton_3_1_0/E=8,N=1792,device_name=AMD_Instinct_MI300X.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=8,N=1792,device_name=AMD_Instinct_MI325X.json → triton_3_1_0/E=8,N=1792,device_name=AMD_Instinct_MI325X.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=8,N=1792,device_name=AMD_Radeon_Graphics.json → triton_3_1_0/E=8,N=1792,device_name=AMD_Radeon_Graphics.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=8,N=1792,device_name=NVIDIA_A100-SXM4-40GB.json → triton_3_1_0/E=8,N=1792,device_name=NVIDIA_A100-SXM4-40GB.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=8,N=1792,device_name=NVIDIA_A100-SXM4-80GB.json → triton_3_1_0/E=8,N=1792,device_name=NVIDIA_A100-SXM4-80GB.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=8,N=1792,device_name=NVIDIA_H100_80GB_HBM3.json → triton_3_1_0/E=8,N=1792,device_name=NVIDIA_H100_80GB_HBM3.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=8,N=1792,device_name=NVIDIA_H200,dtype=fp8_w8a8.json → triton_3_1_0/E=8,N=1792,device_name=NVIDIA_H200,dtype=fp8_w8a8.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=8,N=1792,device_name=NVIDIA_H200.json → triton_3_1_0/E=8,N=1792,device_name=NVIDIA_H200.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=8,N=2048,device_name=NVIDIA_A100-SXM4-80GB.json → triton_3_1_0/E=8,N=2048,device_name=NVIDIA_A100-SXM4-80GB.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=8,N=2048,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8.json → triton_3_1_0/E=8,N=2048,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=8,N=2048,device_name=NVIDIA_H100_80GB_HBM3.json → triton_3_1_0/E=8,N=2048,device_name=NVIDIA_H100_80GB_HBM3.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=8,N=2048,device_name=NVIDIA_H200,dtype=fp8_w8a8.json → triton_3_1_0/E=8,N=2048,device_name=NVIDIA_H200,dtype=fp8_w8a8.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=8,N=2048,device_name=NVIDIA_H200.json → triton_3_1_0/E=8,N=2048,device_name=NVIDIA_H200.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=8,N=3584,device_name=AMD_Instinct_MI300X.json → triton_3_1_0/E=8,N=3584,device_name=AMD_Instinct_MI300X.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=8,N=3584,device_name=AMD_Instinct_MI325X.json → triton_3_1_0/E=8,N=3584,device_name=AMD_Instinct_MI325X.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=8,N=3584,device_name=AMD_Radeon_Graphics.json → triton_3_1_0/E=8,N=3584,device_name=AMD_Radeon_Graphics.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=8,N=3584,device_name=NVIDIA_A100-SXM4-40GB.json → triton_3_1_0/E=8,N=3584,device_name=NVIDIA_A100-SXM4-40GB.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=8,N=3584,device_name=NVIDIA_A100-SXM4-80GB.json → triton_3_1_0/E=8,N=3584,device_name=NVIDIA_A100-SXM4-80GB.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=8,N=3584,device_name=NVIDIA_GeForce_RTX_4090,dtype=fp8_w8a8.json → triton_3_1_0/E=8,N=3584,device_name=NVIDIA_GeForce_RTX_4090,dtype=fp8_w8a8.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=8,N=3584,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8.json → triton_3_1_0/E=8,N=3584,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=8,N=3584,device_name=NVIDIA_H100_80GB_HBM3.json → triton_3_1_0/E=8,N=3584,device_name=NVIDIA_H100_80GB_HBM3.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=8,N=3584,device_name=NVIDIA_H200,dtype=fp8_w8a8.json → triton_3_1_0/E=8,N=3584,device_name=NVIDIA_H200,dtype=fp8_w8a8.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=8,N=3584,device_name=NVIDIA_H200.json → triton_3_1_0/E=8,N=3584,device_name=NVIDIA_H200.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=8,N=3584,device_name=NVIDIA_L40S.json → triton_3_1_0/E=8,N=3584,device_name=NVIDIA_L40S.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=8,N=4096,device_name=AMD_Instinct_MI300X,dtype=fp8_w8a8.json → triton_3_1_0/E=8,N=4096,device_name=AMD_Instinct_MI300X,dtype=fp8_w8a8.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=8,N=4096,device_name=AMD_Instinct_MI325X,dtype=fp8_w8a8.json → triton_3_1_0/E=8,N=4096,device_name=AMD_Instinct_MI325X,dtype=fp8_w8a8.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=8,N=4096,device_name=AMD_Radeon_Graphics,dtype=fp8_w8a8.json → triton_3_1_0/E=8,N=4096,device_name=AMD_Radeon_Graphics,dtype=fp8_w8a8.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=8,N=4096,device_name=NVIDIA_A100-SXM4-80GB.json → triton_3_1_0/E=8,N=4096,device_name=NVIDIA_A100-SXM4-80GB.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=8,N=4096,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8.json → triton_3_1_0/E=8,N=4096,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=8,N=4096,device_name=NVIDIA_H100_80GB_HBM3.json → triton_3_1_0/E=8,N=4096,device_name=NVIDIA_H100_80GB_HBM3.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=8,N=4096,device_name=NVIDIA_H200,dtype=fp8_w8a8.json → triton_3_1_0/E=8,N=4096,device_name=NVIDIA_H200,dtype=fp8_w8a8.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=8,N=4096,device_name=NVIDIA_H200.json → triton_3_1_0/E=8,N=4096,device_name=NVIDIA_H200.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=8,N=7168,device_name=AMD_Instinct_MI300X.json → triton_3_1_0/E=8,N=7168,device_name=AMD_Instinct_MI300X.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=8,N=7168,device_name=AMD_Instinct_MI325X.json → triton_3_1_0/E=8,N=7168,device_name=AMD_Instinct_MI325X.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=8,N=7168,device_name=AMD_Radeon_Graphics.json → triton_3_1_0/E=8,N=7168,device_name=AMD_Radeon_Graphics.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=8,N=7168,device_name=NVIDIA_A100-SXM4-80GB.json → triton_3_1_0/E=8,N=7168,device_name=NVIDIA_A100-SXM4-80GB.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=8,N=7168,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8.json → triton_3_1_0/E=8,N=7168,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=8,N=7168,device_name=NVIDIA_H100_80GB_HBM3.json → triton_3_1_0/E=8,N=7168,device_name=NVIDIA_H100_80GB_HBM3.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=8,N=7168,device_name=NVIDIA_H200,dtype=fp8_w8a8.json → triton_3_1_0/E=8,N=7168,device_name=NVIDIA_H200,dtype=fp8_w8a8.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=8,N=7168,device_name=NVIDIA_H200.json → triton_3_1_0/E=8,N=7168,device_name=NVIDIA_H200.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=8,N=8192,device_name=AMD_Instinct_MI300X,dtype=fp8_w8a8.json → triton_3_1_0/E=8,N=8192,device_name=AMD_Instinct_MI300X,dtype=fp8_w8a8.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=8,N=8192,device_name=AMD_Instinct_MI325X,dtype=fp8_w8a8.json → triton_3_1_0/E=8,N=8192,device_name=AMD_Instinct_MI325X,dtype=fp8_w8a8.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=8,N=8192,device_name=AMD_Radeon_Graphics,dtype=fp8_w8a8.json → triton_3_1_0/E=8,N=8192,device_name=AMD_Radeon_Graphics,dtype=fp8_w8a8.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=8,N=8192,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8.json → triton_3_1_0/E=8,N=8192,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=8,N=8192,device_name=NVIDIA_H200,dtype=fp8_w8a8.json → triton_3_1_0/E=8,N=8192,device_name=NVIDIA_H200,dtype=fp8_w8a8.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=128,N=192,device_name=NVIDIA_A800-SXM4-80GB.json → triton_3_2_0/E=128,N=192,device_name=NVIDIA_A800-SXM4-80GB.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=128,N=192,device_name=NVIDIA_H100_80GB_HBM3.json → triton_3_2_0/E=128,N=192,device_name=NVIDIA_H100_80GB_HBM3.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=128,N=192,device_name=NVIDIA_H20.json → triton_3_2_0/E=128,N=192,device_name=NVIDIA_H20.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=128,N=192,device_name=NVIDIA_H200.json → triton_3_2_0/E=128,N=192,device_name=NVIDIA_H200.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=128,N=384,device_name=NVIDIA_H100_80GB_HBM3.json → triton_3_2_0/E=128,N=384,device_name=NVIDIA_H100_80GB_HBM3.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=128,N=384,device_name=NVIDIA_H20,dtype=fp8_w8a8,block_shape=[128, 128].json → triton_3_2_0/E=128,N=384,device_name=NVIDIA_H20,dtype=fp8_w8a8,block_shape=[128, 128].json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=128,N=384,device_name=NVIDIA_H20.json → triton_3_2_0/E=128,N=384,device_name=NVIDIA_H20.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=128,N=384,device_name=NVIDIA_H200,dtype=fp8_w8a8,block_shape=[128, 128].json → triton_3_2_0/E=128,N=384,device_name=NVIDIA_H200,dtype=fp8_w8a8,block_shape=[128, 128].json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=128,N=384,device_name=NVIDIA_H200.json → triton_3_2_0/E=128,N=384,device_name=NVIDIA_H200.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=128,N=512,device_name=NVIDIA_H100_80GB_HBM3.json → triton_3_2_0/E=128,N=512,device_name=NVIDIA_H100_80GB_HBM3.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=128,N=768,device_name=NVIDIA_A800-SXM4-80GB.json → triton_3_2_0/E=128,N=768,device_name=NVIDIA_A800-SXM4-80GB.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=128,N=768,device_name=NVIDIA_H100_80GB_HBM3.json → triton_3_2_0/E=128,N=768,device_name=NVIDIA_H100_80GB_HBM3.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=128,N=768,device_name=NVIDIA_H20,dtype=fp8_w8a8,block_shape=[128, 128].json → triton_3_2_0/E=128,N=768,device_name=NVIDIA_H20,dtype=fp8_w8a8,block_shape=[128, 128].json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=128,N=768,device_name=NVIDIA_H20.json → triton_3_2_0/E=128,N=768,device_name=NVIDIA_H20.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=128,N=768,device_name=NVIDIA_H200,dtype=fp8_w8a8,block_shape=[128, 128].json → triton_3_2_0/E=128,N=768,device_name=NVIDIA_H200,dtype=fp8_w8a8,block_shape=[128, 128].json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=128,N=768,device_name=NVIDIA_H200.json → triton_3_2_0/E=128,N=768,device_name=NVIDIA_H200.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=128,N=96,device_name=NVIDIA_H20.json → triton_3_2_0/E=128,N=96,device_name=NVIDIA_H20.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=264,N=128,device_name=NVIDIA_A800-SXM4-80GB,dtype=int8_w8a8.json → triton_3_2_0/E=264,N=128,device_name=NVIDIA_A800-SXM4-80GB,dtype=int8_w8a8.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=264,N=256,device_name=NVIDIA_B200,dtype=fp8_w8a8,block_shape=[128, 128].json → triton_3_2_0/E=264,N=256,device_name=NVIDIA_B200,dtype=fp8_w8a8,block_shape=[128, 128].json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=264,N=256,device_name=NVIDIA_H20,dtype=fp8_w8a8,block_shape=[128, 128].json → triton_3_2_0/E=264,N=256,device_name=NVIDIA_H20,dtype=fp8_w8a8,block_shape=[128, 128].json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=264,N=256,device_name=NVIDIA_H200,dtype=fp8_w8a8,block_shape=[128, 128].json → triton_3_2_0/E=264,N=256,device_name=NVIDIA_H200,dtype=fp8_w8a8,block_shape=[128, 128].json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=272,N=128,device_name=NVIDIA_A800-SXM4-80GB,dtype=int8_w8a8.json → triton_3_2_0/E=272,N=128,device_name=NVIDIA_A800-SXM4-80GB,dtype=int8_w8a8.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=272,N=128,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8,block_shape=[128, 128].json → triton_3_2_0/E=272,N=128,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8,block_shape=[128, 128].json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=272,N=128,device_name=NVIDIA_H20,dtype=fp8_w8a8,block_shape=[128, 128].json → triton_3_2_0/E=272,N=128,device_name=NVIDIA_H20,dtype=fp8_w8a8,block_shape=[128, 128].json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=272,N=64,device_name=NVIDIA_A800-SXM4-80GB.json → triton_3_2_0/E=272,N=64,device_name=NVIDIA_A800-SXM4-80GB.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=288,N=64,device_name=NVIDIA_A800-SXM4-80GB.json → triton_3_2_0/E=288,N=64,device_name=NVIDIA_A800-SXM4-80GB.json} +0 -0
- {sglang-0.4.6.post5.dist-info → sglang-0.4.7.dist-info}/licenses/LICENSE +0 -0
- {sglang-0.4.6.post5.dist-info → sglang-0.4.7.dist-info}/top_level.txt +0 -0
sglang/srt/managers/expert_distribution.py

@@ -18,7 +18,7 @@ from abc import ABC
 from collections import deque
 from contextlib import contextmanager
 from pathlib import Path
-from typing import Dict, List, Literal, Optional, Tuple, Type
+from typing import Any, Dict, List, Literal, Optional, Tuple, Type
 
 import einops
 import torch
@@ -91,6 +91,10 @@ class ExpertDistributionRecorder(ABC):
     def dump_record(self, output_mode: _OutputMode = "file"):
         self._on_not_implemented()
 
+    @property
+    def recording(self):
+        return False
+
     def _on_not_implemented(self):
         raise Exception(
             "Please set ServerArgs.expert_distribution_recorder_mode to use ExpertDistributionRecorder."
@@ -123,6 +127,12 @@ class _ExpertDistributionRecorderReal(ExpertDistributionRecorder):
             for k in self._accumulator.get_single_pass_gatherer_keys()
         }
 
+        if server_args.enable_expert_distribution_metrics:
+            logger.info(
+                "ExpertDistributionRecorder auto start record since enable_expert_distribution_metrics"
+            )
+            self.start_record()
+
     def with_current_layer(self, layer_idx):
         return self._current_layer_idx.with_value(layer_idx)
 
@@ -221,6 +231,10 @@ class _ExpertDistributionRecorderReal(ExpertDistributionRecorder):
         self._reset()
         return output
 
+    @property
+    def recording(self):
+        return self._recording
+
 
 _global_expert_distribution_recorder: Optional[ExpertDistributionRecorder] = (
     _ExpertDistributionRecorderNoop()
@@ -250,15 +264,23 @@ class _SinglePassGatherer(ABC):
             return _DetailSinglePassGatherer(
                 server_args, expert_location_metadata, rank
            )
+
+        if server_args.expert_distribution_recorder_mode == "stat_approx":
+            if server_args.enable_deepep_moe and (server_args.deepep_mode == "normal"):
+                return _DeepepNormalSinglePassGatherer(expert_location_metadata, rank)
+            else:
+                raise NotImplementedError
+
         if server_args.enable_deepep_moe:
             if server_args.deepep_mode == "normal":
-                return
+                return _SelectExpertsSinglePassGatherer(expert_location_metadata, rank)
             elif server_args.deepep_mode == "low_latency":
                 return _DeepepLowLatencySinglePassGatherer(
                     expert_location_metadata, rank
                 )
             else:
                 raise NotImplementedError
+
         return _SelectExpertsSinglePassGatherer(expert_location_metadata, rank)
 
     def __init__(self, expert_location_metadata: "ExpertLocationMetadata", rank: int):
@@ -293,7 +315,82 @@ class _SinglePassGatherer(ABC):
         raise NotImplementedError
 
 
-class
+class _DetailSinglePassGatherer(_SinglePassGatherer):
+    # DeepSeek V3 has this value; should generalize later
+    _TOP_K_NUM = 8
+
+    def __init__(
+        self,
+        server_args: ServerArgs,
+        expert_location_metadata: "ExpertLocationMetadata",
+        rank: int,
+    ):
+        super().__init__(expert_location_metadata, rank)
+        self._metadata: Optional[Dict[str, Any]] = None
+        self._topk_ids_of_layer = torch.zeros(
+            (
+                expert_location_metadata.num_layers,
+                # TODO determine the max number
+                server_args.chunked_prefill_size * 8,
+                self._TOP_K_NUM,
+            ),
+            dtype=torch.int32,
+            device=server_args.device,
+        )
+        self._misc_objects: List[Dict[str, Any]] = []
+        assert (
+            not server_args.enable_two_batch_overlap
+        ), "DetailSinglePassGatherer does not support TBO yet"
+        # TODO assert shared experts fusion is disabled, o/w data is wrong
+
+    def on_forward_pass_start(self, forward_batch: ForwardBatch):
+        assert self._metadata is None
+        self._metadata = dict(
+            # TODO pr-chain
+            # rids=forward_batch.rids,
+            input_ids=forward_batch.input_ids.cpu().tolist(),
+            positions=forward_batch.positions.cpu().tolist(),
+            extend_seq_lens=forward_batch.extend_seq_lens_cpu,
+            forward_mode=forward_batch.forward_mode.value,
+        )
+
+    def on_select_experts(self, layer_idx: int, topk_ids: torch.Tensor):
+        self._topk_ids_of_layer[layer_idx, : topk_ids.shape[0], : topk_ids.shape[1]] = (
+            topk_ids
+        )
+
+    def on_deepep_dispatch_normal(
+        self,
+        layer_idx: int,
+        local_physical_count_of_layer: List[int],
+        num_tokens_per_rank,
+        num_tokens_per_rdma_rank,
+        num_tokens_per_expert,
+    ):
+        self._misc_objects.append(
+            dict(
+                layer_id=layer_idx,
+                num_tokens_per_rank=num_tokens_per_rank.cpu().tolist(),
+                num_tokens_per_rdma_rank=num_tokens_per_rdma_rank.cpu().tolist(),
+                num_tokens_per_expert=num_tokens_per_expert.cpu().tolist(),
+            )
+        )
+
+    def reset(self):
+        self._topk_ids_of_layer[...] = -1
+        self._misc_objects.clear()
+        self._metadata = None
+
+    def collect(self) -> Dict:
+        num_tokens = len(self._metadata["input_ids"])
+        return dict(
+            **self._metadata,
+            topk_ids_of_layer=self._topk_ids_of_layer[:, :num_tokens, :].clone().cpu(),
+            misc_objects=self._misc_objects,
+        )
+
+
+class _LayerBasedCpuSinglePassGatherer(_SinglePassGatherer):
     def __init__(self, *args, **kwargs):
         super().__init__(*args, **kwargs)
         self._objects_of_layer = {}
@@ -322,29 +419,63 @@ def _list_sum(a: List, b: List) -> List:
     return [x + y for x, y in zip(a, b, strict=True)]
 
 
-class
-
-
-
-        torch.
-
-
-
-
-
-
-
+class _LayerBasedGpuSinglePassGatherer(_SinglePassGatherer):
+    def __init__(self, *args, enable_global_physical_experts: bool, **kwargs):
+        super().__init__(*args, **kwargs)
+        self._enable_global_physical_experts = enable_global_physical_experts
+        self._data = torch.zeros(
+            (
+                self._expert_location_metadata.num_layers,
+                (
+                    self._expert_location_metadata.num_physical_experts
+                    if enable_global_physical_experts
+                    else self._expert_location_metadata.num_local_physical_experts
+                ),
+            ),
+            dtype=torch.int,
+            device="cuda",
+        )
 
-
+    def reset(self):
+        self._data[...] = 0
 
     def collect(self) -> Dict:
-
-
-
+        if self._enable_global_physical_experts:
+            global_physical_count = self._data
+        else:
+            # Can optimize if bottleneck
+            global_physical_count = _convert_local_to_global_physical_count(
+                self._data,
+                rank=self._rank,
+                num_local_physical_experts=self._expert_location_metadata.num_local_physical_experts,
+                num_physical_experts=self._expert_location_metadata.num_physical_experts,
+            )
+
         return dict(global_physical_count=global_physical_count)
 
 
-class
+class _SelectExpertsSinglePassGatherer(_LayerBasedGpuSinglePassGatherer):
+    def __init__(self, *args, **kwargs):
+        super().__init__(*args, **kwargs, enable_global_physical_experts=True)
+
+    # can optimize (e.g. fuse / compile)
+    def on_select_experts(self, layer_idx: int, topk_ids: torch.Tensor):
+        topk_ids = topk_ids.flatten()
+        mask = topk_ids != -1
+        self._data[layer_idx, :].scatter_add_(
+            dim=0, index=topk_ids.masked_fill(~mask, 0).long(), src=mask.int()
+        )
+
+
+class _DeepepNormalSinglePassGatherer(_LayerBasedCpuSinglePassGatherer):
+    def __init__(self, *args, **kwargs):
+        super().__init__(*args, **kwargs)
+        if torch.distributed.get_rank() == 0:
+            logger.info(
+                "DeepepNormalSinglePassGatherer gathers approximate statistics. "
+                "If used with small batch size, consider using expert_distribution_recorder_mode=stat."
+            )
+
     def on_deepep_dispatch_normal(
         self,
         layer_idx: int,
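The `scatter_add_` call in `_SelectExpertsSinglePassGatherer.on_select_experts` turns a batch of top-k expert ids into per-expert token counts in a single kernel call. A minimal, self-contained sketch of the same pattern; the sizes and `topk_ids` values below are made up for illustration:

```python
import torch

num_experts = 8  # physical experts in one MoE layer (made-up size)
topk_ids = torch.tensor([[0, 3, 5], [3, 3, -1]])  # -1 marks a padded / invalid slot

counts = torch.zeros(num_experts, dtype=torch.int)

flat = topk_ids.flatten()
mask = flat != -1
# Invalid slots are redirected to index 0, but they add 0 because src is the mask itself.
counts.scatter_add_(dim=0, index=flat.masked_fill(~mask, 0).long(), src=mask.int())

print(counts)  # tensor([1, 0, 0, 3, 0, 1, 0, 0], dtype=torch.int32)
```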
@@ -369,17 +500,9 @@ class _DeepepNormalSinglePassGatherer(_LayerBasedSinglePassGatherer):
         return dict(global_physical_count=global_physical_count)
 
 
-class _DeepepLowLatencySinglePassGatherer(
+class _DeepepLowLatencySinglePassGatherer(_LayerBasedGpuSinglePassGatherer):
     def __init__(self, *args, **kwargs):
-        super().__init__(*args, **kwargs)
-        self._data = torch.zeros(
-            (
-                self._expert_location_metadata.num_layers,
-                self._expert_location_metadata.num_local_physical_experts,
-            ),
-            dtype=torch.int,
-            device="cuda",
-        )
+        super().__init__(*args, **kwargs, enable_global_physical_experts=False)
 
     def on_deepep_dispatch_low_latency(
         self, layer_idx: int, local_physical_count_of_layer: torch.Tensor
@@ -387,19 +510,6 @@ class _DeepepLowLatencySinglePassGatherer(_SinglePassGatherer):
         # Most naive implementation, can optimize later
         self._data[layer_idx, :] += local_physical_count_of_layer
 
-    def reset(self):
-        self._data[...] = 0
-
-    def collect(self) -> Dict:
-        # Can optimize if bottleneck
-        global_physical_count = _convert_local_to_global_physical_count(
-            self._data,
-            rank=self._rank,
-            num_local_physical_experts=self._expert_location_metadata.num_local_physical_experts,
-            num_physical_experts=self._expert_location_metadata.num_physical_experts,
-        )
-        return dict(global_physical_count=global_physical_count)
-
 
 def _convert_local_to_global_physical_count(
     local_physical_count: torch.Tensor,
@@ -438,9 +548,9 @@ class _Accumulator(ABC):
     def get_class(server_args: ServerArgs) -> Type["_Accumulator"]:
         return {
             "stat": _StatAccumulator,
-
-
-
+            "stat_approx": _StatAccumulator,
+            "per_pass": _DetailAccumulator,
+            "per_token": _DetailAccumulator,
         }[server_args.expert_distribution_recorder_mode]
 
     def __init__(
@@ -547,6 +657,63 @@ class _DequeCollection:
         return {d.maxlen: sum(d) / len(d) for d in self._dequeues}
 
 
+class _DetailAccumulator(_UtilizationRateAccumulatorMixin):
+    def __init__(self, *args, **kwargs):
+        super().__init__(*args, **kwargs)
+        self._records = []
+
+    def get_single_pass_gatherer_keys(self):
+        if False:  # TODO `server_args.enable_two_batch_overlap`
+            return [_SINGLE_PASS_GATHERER_KEY_PRIMARY, "child_a", "child_b"]
+        return super().get_single_pass_gatherer_keys()
+
+    def get_single_pass_gatherer_key(self, debug_name: Optional[str]):
+        if False:  # TODO `server_args.enable_two_batch_overlap`
+            return debug_name or _SINGLE_PASS_GATHERER_KEY_PRIMARY
+        return super().get_single_pass_gatherer_key(debug_name)
+
+    def append(
+        self,
+        forward_pass_id: int,
+        gatherer_key: str,
+        single_pass_data: Dict,
+    ):
+        super().append(forward_pass_id, gatherer_key, single_pass_data)
+
+        def _process_object(obj):
+            if isinstance(obj, torch.Tensor):
+                return obj.cpu().clone()
+            return obj
+
+        single_pass_data_processed = {
+            k: _process_object(v) for k, v in single_pass_data.items()
+        }
+
+        self._records.append(
+            dict(
+                forward_pass_id=forward_pass_id,
+                rank=self._rank,
+                gatherer_key=gatherer_key,
+                **single_pass_data_processed,
+            )
+        )
+
+    def reset(self):
+        super().reset()
+        self._records.clear()
+
+    def dump(self, output_mode: _OutputMode):
+        assert output_mode == "file"
+        output = dict(
+            records=self._records,
+            # NOTE: This may change during recording, so here we say it is the "last" one
+            last_physical_to_logical_map=self._expert_location_metadata.physical_to_logical_map,
+        )
+        _dump_to_file(
+            f"expert_distribution_recorder_{time.time()}_{self._rank}.pt", output
+        )
+
+
 class _StatAccumulator(_UtilizationRateAccumulatorMixin):
     def __init__(self, *args, **kwargs):
         super().__init__(*args, **kwargs)
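`_DetailAccumulator.append` copies every tensor in a single-pass record to CPU before buffering it, so the per-pass history does not hold on to GPU memory. A minimal sketch of that pattern; the record contents here are invented, and the note about `torch.load` assumes that `_dump_to_file` writes the `.pt` file with `torch.save`, which this diff does not show:

```python
import torch

def _process_object(obj):
    # Detach tensors to CPU so buffered per-pass records do not pin GPU memory.
    return obj.cpu().clone() if isinstance(obj, torch.Tensor) else obj

records = []
single_pass_data = {"global_physical_count": torch.arange(4)}  # made-up payload
records.append(
    dict(
        forward_pass_id=1,
        rank=0,
        gatherer_key="primary",
        **{k: _process_object(v) for k, v in single_pass_data.items()},
    )
)

# Assuming _dump_to_file wraps torch.save, a dump could later be inspected with:
#   data = torch.load("expert_distribution_recorder_<timestamp>_<rank>.pt")
#   data["records"][0]["global_physical_count"]
print(records[0]["forward_pass_id"], records[0]["global_physical_count"])
```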
@@ -560,6 +727,7 @@ class _StatAccumulator(_UtilizationRateAccumulatorMixin):
             dtype=torch.int32,
             device=self._server_args.device,
         )
+        self._first_dump = True
 
     def append(
         self,
@@ -584,9 +752,15 @@ class _StatAccumulator(_UtilizationRateAccumulatorMixin):
             num_logical_experts=self._expert_location_metadata.num_logical_experts,
             physical_to_logical_map=self._expert_location_metadata.physical_to_logical_map,
         )
+
+        if self._first_dump:
+            self._first_dump = False
+            torch.cuda.empty_cache()
+
         torch.distributed.all_reduce(
             logical_count_of_buffered_step, op=torch.distributed.ReduceOp.SUM
         )
+
         output = dict(
             rank=self._rank,
             logical_count=logical_count_of_buffered_step,
sglang/srt/managers/expert_location.py

@@ -13,6 +13,7 @@
 # ==============================================================================
 import json
 import logging
+import random
 from dataclasses import dataclass
 from pathlib import Path
 from typing import List, Optional
@@ -22,7 +23,7 @@ import torch.distributed
 import torch.nn.functional as F
 
 from sglang.srt.configs.model_config import ModelConfig
-from sglang.srt.managers import
+from sglang.srt.managers import eplb_algorithms
 from sglang.srt.model_loader import get_model_architecture
 from sglang.srt.server_args import ServerArgs
 
@@ -32,9 +33,11 @@ logger = logging.getLogger(__name__)
 @dataclass
 class ExpertLocationMetadata:
     physical_to_logical_map: torch.Tensor  # (layers, num_physical_experts)
+    physical_to_logical_map_cpu: torch.Tensor
     logical_to_all_physical_map: torch.Tensor  # (layers, num_logical_experts, X)
     logical_to_all_physical_map_num_valid: torch.Tensor  # (layers, num_logical_experts)
-
+    # (layers, num_logical_experts)
+    logical_to_rank_dispatch_physical_map: Optional[torch.Tensor]
 
     # -------------------------------- properties ------------------------------------
 
@@ -69,11 +72,8 @@ class ExpertLocationMetadata:
         num_layers_2, num_logical_experts_1 = (
             self.logical_to_all_physical_map_num_valid.shape
         )
-
-
-        )
-        assert num_layers_0 == num_layers_1 == num_layers_2 == num_layers_3
-        assert num_logical_experts_0 == num_logical_experts_1 == num_logical_experts_2
+        assert num_layers_0 == num_layers_1 == num_layers_2
+        assert num_logical_experts_0 == num_logical_experts_1
         assert num_physical_experts_0 == num_physical_experts_1
 
         # -------------------------------- construction ------------------------------------
@@ -116,6 +116,7 @@ class ExpertLocationMetadata:
         )
 
         return ExpertLocationMetadata._init_raw(
+            server_args=server_args,
             ep_size=common["ep_size"],
             physical_to_logical_map=physical_to_logical_map,
             logical_to_all_physical_map=logical_to_all_physical_map,
@@ -134,26 +135,31 @@ class ExpertLocationMetadata:
         common = ExpertLocationMetadata._init_common(server_args, model_config)
         model_config_for_expert_location = common["model_config_for_expert_location"]
         num_physical_experts = common["num_physical_experts"]
-
-
-        if phase == "null":
-            phase = "decode"
+        num_groups = model_config_for_expert_location.num_groups
+        num_nodes = server_args.nnodes
 
         physical_to_logical_map, logical_to_all_physical_map, expert_count = (
-
+            eplb_algorithms.rebalance_experts(
                 tokens_per_expert=logical_count,
                 num_physical_experts=num_physical_experts,
                 num_local_physical_experts=num_physical_experts // common["ep_size"],
-                num_groups=
-                num_nodes=
-
+                num_groups=num_groups,
+                num_nodes=num_nodes,
+                algorithm=eplb_algorithms.compute_algorithm(
+                    raw_algorithm=server_args.eplb_algorithm,
+                    num_groups=num_groups,
+                    num_nodes=num_nodes,
+                ),
            )
        )
 
        return ExpertLocationMetadata._init_raw(
+            server_args=server_args,
            ep_size=common["ep_size"],
-            physical_to_logical_map=physical_to_logical_map,
-            logical_to_all_physical_map=logical_to_all_physical_map
+            physical_to_logical_map=physical_to_logical_map.to(server_args.device),
+            logical_to_all_physical_map=logical_to_all_physical_map.to(
+                server_args.device
+            ),
        )
 
     @staticmethod
@@ -179,6 +185,7 @@ class ExpertLocationMetadata:
 
     @staticmethod
     def _init_raw(
+        server_args: ServerArgs,
         ep_size: int,
         physical_to_logical_map: torch.Tensor,
         logical_to_all_physical_map: torch.Tensor,
@@ -197,14 +204,19 @@ class ExpertLocationMetadata:
 
         return ExpertLocationMetadata(
             physical_to_logical_map=physical_to_logical_map,
+            physical_to_logical_map_cpu=physical_to_logical_map.cpu(),
             logical_to_all_physical_map=logical_to_all_physical_map_padded,
             logical_to_all_physical_map_num_valid=logical_to_all_physical_map_num_valid,
-            logical_to_rank_dispatch_physical_map=
-
-
-
-
-
+            logical_to_rank_dispatch_physical_map=(
+                compute_logical_to_rank_dispatch_physical_map(
+                    logical_to_all_physical_map=logical_to_all_physical_map,
+                    num_gpus=ep_size,
+                    num_physical_experts=num_physical_experts,
+                    # TODO improve when we have real EP rank
+                    ep_rank=torch.distributed.get_rank() % ep_size,
+                )
+                if server_args.ep_dispatch_algorithm == "static"
+                else None
             ),
         )
 
@@ -213,6 +225,7 @@ class ExpertLocationMetadata:
     def update(
         self,
         other: "ExpertLocationMetadata",
+        update_layer_ids: List[int],
     ):
         for field in [
             "ep_size",
@@ -221,12 +234,21 @@ class ExpertLocationMetadata:
 
         for field in [
             "physical_to_logical_map",
+            "physical_to_logical_map_cpu",
             "logical_to_all_physical_map",
             "logical_to_all_physical_map_num_valid",
             "logical_to_rank_dispatch_physical_map",
         ]:
-
-
+            other_field = getattr(other, field)
+            self_field = getattr(self, field)
+            assert (other_field is not None) == (self_field is not None)
+            if self_field is not None:
+                mask_update = torch.tensor(
+                    [i in update_layer_ids for i in range(self.num_layers)]
+                )
+                mask_update = mask_update.view(*([-1] + [1] * (self_field.dim() - 1)))
+                mask_update = mask_update.to(self_field.device, non_blocking=True)
+                self_field[...] = torch.where(mask_update, other_field, self_field)
 
     # -------------------------------- usage ------------------------------------
 
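`ExpertLocationMetadata.update` only overwrites the layers listed in `update_layer_ids` by broadcasting a per-layer boolean mask through `torch.where`. A small self-contained sketch of the same masking idea, with made-up shapes:

```python
import torch

num_layers, num_experts = 4, 6  # made-up sizes
self_field = torch.zeros(num_layers, num_experts, dtype=torch.int64)
other_field = torch.ones(num_layers, num_experts, dtype=torch.int64)
update_layer_ids = [1, 3]  # only these layers receive the new mapping

mask_update = torch.tensor([i in update_layer_ids for i in range(num_layers)])
# Reshape to (num_layers, 1) so the boolean mask broadcasts over the trailing dims.
mask_update = mask_update.view(*([-1] + [1] * (self_field.dim() - 1)))

self_field[...] = torch.where(mask_update, other_field, self_field)
print(self_field[:, 0])  # tensor([0, 1, 0, 1])
```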
@@ -292,49 +314,82 @@ def _pad_nested_array(arr, pad_value):
     return padded
 
 
-# TODO
+# TODO optimize performance (rewrite and/or run in separate process with overlap)
 def compute_logical_to_rank_dispatch_physical_map(
     logical_to_all_physical_map: torch.Tensor,
-    logical_to_all_physical_map_num_valid: torch.Tensor,
     num_gpus: int,
     num_physical_experts: int,
     ep_rank: int,
-
+    seed: int = 42,
 ):
-
+    r = random.Random(seed)
 
     num_local_physical_experts = num_physical_experts // num_gpus
     num_layers, num_logical_experts, _ = logical_to_all_physical_map.shape
+    dtype = logical_to_all_physical_map.dtype
 
-
-
-
-
-    chosen_index = (
-        torch.randint(
-            0, 65536, output_shape, dtype=torch.int32, device=device, generator=g
-        )
-        % logical_to_all_physical_map_num_valid
+    logical_to_rank_dispatch_physical_map = torch.full(
+        size=(num_gpus, num_layers, num_logical_experts),
+        fill_value=-1,
+        dtype=dtype,
     )
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
+
+    for layer_id in range(num_layers):
+        for logical_expert_id in range(num_logical_experts):
+            candidate_physical_expert_ids = _logical_to_all_physical_raw(
+                logical_to_all_physical_map, layer_id, logical_expert_id
+            )
+            output_partial = logical_to_rank_dispatch_physical_map[
+                :, layer_id, logical_expert_id
+            ]
+
+            for gpu_id in range(num_gpus):
+                same_gpu_physical_expert_ids = [
+                    physical_expert_id
+                    for physical_expert_id in candidate_physical_expert_ids
+                    if _compute_gpu_id_of_physical_expert(
+                        physical_expert_id, num_local_physical_experts
+                    )
+                    == gpu_id
+                ]
+                if len(same_gpu_physical_expert_ids) > 0:
+                    output_partial[gpu_id] = same_gpu_physical_expert_ids[0]
+
+            num_remain = torch.sum(output_partial == -1).item()
+            output_partial[output_partial == -1] = torch.tensor(
+                _fair_choices(candidate_physical_expert_ids, k=num_remain, r=r),
+                dtype=dtype,
+            )
 
     assert torch.all(logical_to_rank_dispatch_physical_map != -1)
-
+
+    device = logical_to_all_physical_map.device
+    return logical_to_rank_dispatch_physical_map[ep_rank, :, :].to(device)
+
+
+def _logical_to_all_physical_raw(
+    logical_to_all_physical_map, layer_id: int, logical_expert_id: int
+) -> List[int]:
+    return [
+        physical_expert_id
+        for physical_expert_id in logical_to_all_physical_map[
+            layer_id, logical_expert_id
+        ].tolist()
+        if physical_expert_id != -1
+    ]
+
+
+def _compute_gpu_id_of_physical_expert(
+    physical_expert_id: int, num_local_physical_experts: int
+) -> int:
+    return physical_expert_id // num_local_physical_experts
+
+
+def _fair_choices(arr: List, k: int, r: random.Random) -> List:
+    quotient, remainder = divmod(k, len(arr))
+    ans = arr * quotient + r.sample(arr, k=remainder)
+    r.shuffle(ans)
+    return ans
 
 
 @dataclass
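`_fair_choices` fills the dispatch slots that remain after same-GPU candidates are assigned: it spreads `k` picks as evenly as possible over the candidate physical experts, so replica usage stays balanced. A usage sketch of the helper as added in this hunk; the candidate ids below are arbitrary:

```python
import random
from typing import List

def _fair_choices(arr: List, k: int, r: random.Random) -> List:
    # Each candidate appears floor(k / len(arr)) times; the remainder is sampled
    # without replacement, then the whole assignment is shuffled.
    quotient, remainder = divmod(k, len(arr))
    ans = arr * quotient + r.sample(arr, k=remainder)
    r.shuffle(ans)
    return ans

r = random.Random(42)
# Five GPUs still need a replica of a logical expert that has two physical copies.
print(_fair_choices([10, 17], k=5, r=r))  # e.g. [10, 17, 10, 17, 10]; counts differ by at most 1
```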
@@ -363,7 +418,6 @@ def compute_initial_expert_location_metadata(
 ) -> ExpertLocationMetadata:
     data = server_args.init_expert_location
     if data == "trivial":
-        logger.info("init_expert_location from trivial")
         return ExpertLocationMetadata.init_trivial(server_args, model_config)
 
     # TODO unify with the utils function