sglang 0.4.6.post5__py3-none-any.whl → 0.4.7.post1__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- sglang/__init__.py +2 -0
- sglang/api.py +7 -0
- sglang/bench_offline_throughput.py +10 -4
- sglang/bench_one_batch_server.py +67 -11
- sglang/bench_serving.py +86 -75
- sglang/lang/backend/runtime_endpoint.py +24 -1
- sglang/lang/interpreter.py +40 -1
- sglang/lang/ir.py +27 -0
- sglang/math_utils.py +8 -0
- sglang/profiler.py +167 -0
- sglang/srt/_custom_ops.py +34 -0
- sglang/srt/configs/internvl.py +8 -12
- sglang/srt/configs/model_config.py +33 -1
- sglang/srt/constrained/base_grammar_backend.py +5 -2
- sglang/srt/constrained/llguidance_backend.py +9 -8
- sglang/srt/constrained/outlines_backend.py +5 -4
- sglang/srt/constrained/xgrammar_backend.py +18 -18
- sglang/srt/conversation.py +52 -8
- sglang/srt/custom_op.py +38 -3
- sglang/srt/debug_utils.py +74 -0
- sglang/srt/disaggregation/base/__init__.py +1 -1
- sglang/srt/disaggregation/base/conn.py +25 -11
- sglang/srt/disaggregation/common/__init__.py +5 -0
- sglang/srt/disaggregation/common/conn.py +407 -0
- sglang/srt/disaggregation/common/utils.py +42 -0
- sglang/srt/disaggregation/decode.py +261 -52
- sglang/srt/disaggregation/fake/__init__.py +1 -1
- sglang/srt/disaggregation/fake/conn.py +16 -9
- sglang/srt/disaggregation/kv_events.py +60 -5
- sglang/srt/disaggregation/launch_lb.py +140 -0
- sglang/srt/disaggregation/mini_lb.py +29 -48
- sglang/srt/disaggregation/mooncake/__init__.py +1 -1
- sglang/srt/disaggregation/mooncake/conn.py +446 -149
- sglang/srt/disaggregation/mooncake/transfer_engine.py +32 -16
- sglang/srt/disaggregation/nixl/__init__.py +6 -1
- sglang/srt/disaggregation/nixl/conn.py +134 -437
- sglang/srt/disaggregation/prefill.py +130 -43
- sglang/srt/disaggregation/utils.py +127 -86
- sglang/srt/distributed/device_communicators/pymscclpp.py +315 -0
- sglang/srt/distributed/parallel_state.py +52 -5
- sglang/srt/entrypoints/EngineBase.py +6 -0
- sglang/srt/entrypoints/engine.py +116 -5
- sglang/srt/entrypoints/http_server.py +28 -4
- sglang/srt/eplb_simulator/__init__.py +1 -0
- sglang/srt/eplb_simulator/reader.py +51 -0
- sglang/srt/function_call/base_format_detector.py +138 -86
- sglang/srt/function_call/deepseekv3_detector.py +54 -6
- sglang/srt/function_call/ebnf_composer.py +33 -19
- sglang/srt/function_call/function_call_parser.py +27 -0
- sglang/srt/function_call/llama32_detector.py +33 -14
- sglang/srt/function_call/mistral_detector.py +73 -26
- sglang/srt/function_call/pythonic_detector.py +86 -20
- sglang/srt/function_call/qwen25_detector.py +64 -10
- sglang/srt/function_call/utils.py +17 -0
- sglang/srt/hf_transformers_utils.py +4 -0
- sglang/srt/layers/activation.py +19 -0
- sglang/srt/layers/attention/aiter_backend.py +503 -125
- sglang/srt/layers/attention/base_attn_backend.py +4 -0
- sglang/srt/layers/attention/cutlass_mla_backend.py +40 -34
- sglang/srt/layers/attention/flashattention_backend.py +137 -63
- sglang/srt/layers/attention/flashinfer_backend.py +46 -3
- sglang/srt/layers/attention/flashinfer_mla_backend.py +59 -25
- sglang/srt/layers/attention/flashmla_backend.py +2 -10
- sglang/srt/layers/attention/intel_amx_backend.py +128 -0
- sglang/srt/layers/attention/tbo_backend.py +232 -0
- sglang/srt/layers/attention/torch_native_backend.py +3 -0
- sglang/srt/layers/attention/triton_backend.py +304 -65
- sglang/srt/layers/attention/triton_ops/decode_attention.py +2 -7
- sglang/srt/layers/attention/triton_ops/extend_attention.py +12 -4
- sglang/srt/layers/attention/vision.py +51 -24
- sglang/srt/layers/communicator.py +281 -197
- sglang/srt/layers/dp_attention.py +6 -5
- sglang/srt/layers/layernorm.py +30 -19
- sglang/srt/layers/linear.py +0 -4
- sglang/srt/layers/logits_processor.py +0 -12
- sglang/srt/layers/moe/cutlass_moe.py +170 -7
- sglang/srt/layers/moe/cutlass_moe_params.py +169 -0
- sglang/srt/layers/moe/ep_moe/kernels.py +33 -11
- sglang/srt/layers/moe/ep_moe/layer.py +136 -72
- sglang/srt/layers/moe/ep_moe/token_dispatcher.py +24 -45
- sglang/srt/layers/moe/fused_moe_native.py +4 -0
- sglang/srt/layers/moe/fused_moe_triton/configs/triton_3_2_0/E=257,N=128,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8,block_shape=[128, 128].json +146 -0
- sglang/srt/layers/moe/fused_moe_triton/configs/triton_3_2_0/E=257,N=128,device_name=NVIDIA_H20,dtype=fp8_w8a8,block_shape=[128, 128].json +146 -0
- sglang/srt/layers/moe/fused_moe_triton/configs/triton_3_2_0/E=257,N=256,device_name=NVIDIA_B200,dtype=fp8_w8a8,block_shape=[128, 128].json +146 -0
- sglang/srt/layers/moe/fused_moe_triton/configs/triton_3_2_0/E=257,N=256,device_name=NVIDIA_H20,dtype=fp8_w8a8,block_shape=[128, 128].json +146 -0
- sglang/srt/layers/moe/fused_moe_triton/configs/triton_3_2_0/E=257,N=256,device_name=NVIDIA_H200,dtype=fp8_w8a8,block_shape=[128, 128].json +146 -0
- sglang/srt/layers/moe/fused_moe_triton/configs/triton_3_2_0/E=8,N=7168,device_name=NVIDIA_H100_80GB_HBM3.json +146 -0
- sglang/srt/layers/moe/fused_moe_triton/configs/triton_3_3_1/E=257,N=256,device_name=NVIDIA_B200,dtype=fp8_w8a8,block_shape=[128, 128].json +146 -0
- sglang/srt/layers/moe/fused_moe_triton/configs/triton_3_3_1/E=8,N=7168,device_name=NVIDIA_H100_80GB_HBM3.json +146 -0
- sglang/srt/layers/moe/fused_moe_triton/fused_moe.py +221 -29
- sglang/srt/layers/moe/fused_moe_triton/layer.py +34 -4
- sglang/srt/layers/moe/topk.py +60 -26
- sglang/srt/layers/multimodal.py +3 -3
- sglang/srt/layers/pooler.py +56 -0
- sglang/srt/layers/quantization/__init__.py +3 -2
- sglang/srt/layers/quantization/blockwise_int8.py +3 -0
- sglang/srt/layers/quantization/compressed_tensors/compressed_tensors_moe.py +5 -0
- sglang/srt/layers/quantization/deep_gemm_wrapper/__init__.py +1 -0
- sglang/srt/layers/quantization/{deep_gemm.py → deep_gemm_wrapper/compile_utils.py} +69 -127
- sglang/srt/layers/quantization/deep_gemm_wrapper/configurer.py +32 -0
- sglang/srt/layers/quantization/deep_gemm_wrapper/entrypoint.py +110 -0
- sglang/srt/layers/quantization/fp8.py +28 -23
- sglang/srt/layers/quantization/fp8_kernel.py +156 -75
- sglang/srt/layers/quantization/fp8_utils.py +250 -69
- sglang/srt/layers/quantization/modelopt_quant.py +334 -7
- sglang/srt/layers/quantization/moe_wna16.py +3 -0
- sglang/srt/layers/quantization/w8a8_fp8.py +3 -0
- sglang/srt/layers/quantization/w8a8_int8.py +3 -0
- sglang/srt/layers/radix_attention.py +2 -3
- sglang/srt/layers/rotary_embedding.py +6 -12
- sglang/srt/layers/sampler.py +80 -79
- sglang/srt/layers/utils.py +6 -0
- sglang/srt/lora/layers.py +12 -15
- sglang/srt/lora/lora.py +49 -5
- sglang/srt/lora/lora_manager.py +98 -39
- sglang/srt/lora/mem_pool.py +28 -21
- sglang/srt/lora/utils.py +17 -13
- sglang/srt/managers/cache_controller.py +2 -1
- sglang/srt/managers/data_parallel_controller.py +13 -5
- sglang/srt/managers/eplb_algorithms/__init__.py +63 -0
- sglang/srt/managers/eplb_algorithms/deepseek.py +223 -0
- sglang/srt/managers/{deepseek_eplb.py → eplb_algorithms/deepseek_vec.py} +5 -7
- sglang/srt/managers/eplb_manager.py +55 -14
- sglang/srt/managers/expert_distribution.py +220 -46
- sglang/srt/managers/expert_location.py +110 -56
- sglang/srt/managers/expert_location_dispatch.py +23 -6
- sglang/srt/managers/io_struct.py +43 -8
- sglang/srt/managers/mm_utils.py +88 -38
- sglang/srt/managers/multimodal_processors/base_processor.py +190 -18
- sglang/srt/managers/multimodal_processors/gemma3.py +4 -31
- sglang/srt/managers/multimodal_processors/internvl.py +4 -0
- sglang/srt/managers/multimodal_processors/kimi_vl.py +15 -34
- sglang/srt/managers/multimodal_processors/minicpm.py +2 -1
- sglang/srt/managers/multimodal_processors/phi4mm.py +87 -0
- sglang/srt/managers/multimodal_processors/qwen_vl.py +22 -64
- sglang/srt/managers/multimodal_processors/vila.py +85 -0
- sglang/srt/managers/schedule_batch.py +173 -38
- sglang/srt/managers/scheduler.py +376 -127
- sglang/srt/managers/tokenizer_manager.py +163 -19
- sglang/srt/managers/utils.py +0 -4
- sglang/srt/mem_cache/chunk_cache.py +1 -0
- sglang/srt/mem_cache/hiradix_cache.py +4 -2
- sglang/srt/mem_cache/memory_pool.py +111 -407
- sglang/srt/mem_cache/memory_pool_host.py +380 -0
- sglang/srt/mem_cache/radix_cache.py +36 -12
- sglang/srt/metrics/collector.py +9 -0
- sglang/srt/model_executor/cuda_graph_runner.py +191 -113
- sglang/srt/model_executor/expert_location_updater.py +157 -22
- sglang/srt/model_executor/forward_batch_info.py +52 -22
- sglang/srt/model_executor/model_runner.py +102 -62
- sglang/srt/model_loader/loader.py +8 -1
- sglang/srt/model_loader/utils.py +67 -1
- sglang/srt/models/bert.py +113 -13
- sglang/srt/models/deepseek_nextn.py +1 -1
- sglang/srt/models/deepseek_v2.py +623 -290
- sglang/srt/models/gemma3_causal.py +7 -0
- sglang/srt/models/gemma3_mm.py +19 -14
- sglang/srt/models/idefics2.py +342 -0
- sglang/srt/models/internvl.py +46 -102
- sglang/srt/models/kimi_vl.py +4 -4
- sglang/srt/models/llama.py +1 -1
- sglang/srt/models/minicpmo.py +2 -5
- sglang/srt/models/minicpmv.py +3 -295
- sglang/srt/models/phi4mm.py +512 -0
- sglang/srt/models/qwen2.py +38 -9
- sglang/srt/models/qwen2_5_vl.py +3 -9
- sglang/srt/models/qwen2_eagle.py +4 -1
- sglang/srt/models/qwen2_moe.py +58 -191
- sglang/srt/models/qwen2_vl.py +3 -9
- sglang/srt/models/qwen3.py +41 -10
- sglang/srt/models/qwen3_moe.py +230 -191
- sglang/srt/models/registry.py +9 -1
- sglang/srt/models/roberta.py +117 -9
- sglang/srt/models/transformers.py +291 -0
- sglang/srt/models/vila.py +305 -0
- sglang/srt/openai_api/adapter.py +248 -28
- sglang/srt/openai_api/protocol.py +68 -3
- sglang/srt/openai_api/utils.py +172 -0
- sglang/srt/operations.py +37 -2
- sglang/srt/operations_strategy.py +200 -24
- sglang/srt/sampling/sampling_batch_info.py +37 -1
- sglang/srt/sampling/sampling_params.py +4 -1
- sglang/srt/server_args.py +381 -209
- sglang/srt/speculative/build_eagle_tree.py +9 -9
- sglang/srt/speculative/eagle_draft_cuda_graph_runner.py +12 -14
- sglang/srt/speculative/eagle_draft_extend_cuda_graph_runner.py +256 -0
- sglang/srt/speculative/eagle_utils.py +440 -200
- sglang/srt/speculative/eagle_worker.py +234 -63
- sglang/srt/two_batch_overlap.py +637 -0
- sglang/srt/utils.py +187 -7
- sglang/test/attention/test_prefix_chunk_info.py +2 -0
- sglang/test/runners.py +54 -10
- sglang/test/send_one.py +4 -0
- sglang/test/test_block_fp8.py +1 -0
- sglang/test/test_block_fp8_deep_gemm_blackwell.py +252 -0
- sglang/test/test_block_fp8_ep.py +1 -0
- sglang/test/test_cutlass_moe.py +3 -3
- sglang/test/test_fp4_moe.py +248 -0
- sglang/test/test_utils.py +82 -7
- sglang/utils.py +9 -0
- sglang/version.py +1 -1
- {sglang-0.4.6.post5.dist-info → sglang-0.4.7.post1.dist-info}/METADATA +17 -14
- {sglang-0.4.6.post5.dist-info → sglang-0.4.7.post1.dist-info}/RECORD +359 -321
- {sglang-0.4.6.post5.dist-info → sglang-0.4.7.post1.dist-info}/WHEEL +1 -1
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=1,N=14336,device_name=NVIDIA_A100-SXM4-80GB,dtype=int8_w8a16.json → triton_3_1_0/E=1,N=14336,device_name=NVIDIA_A100-SXM4-80GB,dtype=int8_w8a16.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=1,N=14336,device_name=NVIDIA_A100-SXM4-80GB.json → triton_3_1_0/E=1,N=14336,device_name=NVIDIA_A100-SXM4-80GB.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=1,N=1792,device_name=NVIDIA_A100-SXM4-80GB,dtype=int8_w8a16.json → triton_3_1_0/E=1,N=1792,device_name=NVIDIA_A100-SXM4-80GB,dtype=int8_w8a16.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=1,N=1792,device_name=NVIDIA_A100-SXM4-80GB.json → triton_3_1_0/E=1,N=1792,device_name=NVIDIA_A100-SXM4-80GB.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=1,N=3072,device_name=NVIDIA_A100-SXM4-80GB,dtype=int8_w8a16.json → triton_3_1_0/E=1,N=3072,device_name=NVIDIA_A100-SXM4-80GB,dtype=int8_w8a16.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=1,N=3072,device_name=NVIDIA_H100_80GB_HBM3,dtype=int8_w8a16.json → triton_3_1_0/E=1,N=3072,device_name=NVIDIA_H100_80GB_HBM3,dtype=int8_w8a16.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=1,N=3072,device_name=NVIDIA_H100_80GB_HBM3.json → triton_3_1_0/E=1,N=3072,device_name=NVIDIA_H100_80GB_HBM3.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=1,N=3584,device_name=NVIDIA_A100-SXM4-80GB,dtype=int8_w8a16.json → triton_3_1_0/E=1,N=3584,device_name=NVIDIA_A100-SXM4-80GB,dtype=int8_w8a16.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=1,N=3584,device_name=NVIDIA_A100-SXM4-80GB.json → triton_3_1_0/E=1,N=3584,device_name=NVIDIA_A100-SXM4-80GB.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=1,N=7168,device_name=NVIDIA_A100-SXM4-80GB,dtype=int8_w8a16.json → triton_3_1_0/E=1,N=7168,device_name=NVIDIA_A100-SXM4-80GB,dtype=int8_w8a16.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=1,N=7168,device_name=NVIDIA_A100-SXM4-80GB.json → triton_3_1_0/E=1,N=7168,device_name=NVIDIA_A100-SXM4-80GB.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=144,N=512,device_name=NVIDIA_H100_80GB_HBM3.json → triton_3_1_0/E=144,N=512,device_name=NVIDIA_H100_80GB_HBM3.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=16,N=1024,device_name=NVIDIA_H100_80GB_HBM3.json → triton_3_1_0/E=16,N=1024,device_name=NVIDIA_H100_80GB_HBM3.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=16,N=1024,device_name=NVIDIA_H200.json → triton_3_1_0/E=16,N=1024,device_name=NVIDIA_H200.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=16,N=1344,device_name=NVIDIA_A100-SXM4-40GB.json → triton_3_1_0/E=16,N=1344,device_name=NVIDIA_A100-SXM4-40GB.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=16,N=1344,device_name=NVIDIA_A100-SXM4-80GB.json → triton_3_1_0/E=16,N=1344,device_name=NVIDIA_A100-SXM4-80GB.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=16,N=1344,device_name=NVIDIA_H100_80GB_HBM3.json → triton_3_1_0/E=16,N=1344,device_name=NVIDIA_H100_80GB_HBM3.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=16,N=14336,device_name=NVIDIA_A100-SXM4-80GB,dtype=int8_w8a16.json → triton_3_1_0/E=16,N=14336,device_name=NVIDIA_A100-SXM4-80GB,dtype=int8_w8a16.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=16,N=14336,device_name=NVIDIA_A100-SXM4-80GB.json → triton_3_1_0/E=16,N=14336,device_name=NVIDIA_A100-SXM4-80GB.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=16,N=1792,device_name=NVIDIA_A100-SXM4-80GB,dtype=int8_w8a16.json → triton_3_1_0/E=16,N=1792,device_name=NVIDIA_A100-SXM4-80GB,dtype=int8_w8a16.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=16,N=1792,device_name=NVIDIA_A100-SXM4-80GB.json → triton_3_1_0/E=16,N=1792,device_name=NVIDIA_A100-SXM4-80GB.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=16,N=2048,device_name=NVIDIA_H100_80GB_HBM3.json → triton_3_1_0/E=16,N=2048,device_name=NVIDIA_H100_80GB_HBM3.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=16,N=2688,device_name=NVIDIA_A100-SXM4-80GB.json → triton_3_1_0/E=16,N=2688,device_name=NVIDIA_A100-SXM4-80GB.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=16,N=2688,device_name=NVIDIA_H100_80GB_HBM3.json → triton_3_1_0/E=16,N=2688,device_name=NVIDIA_H100_80GB_HBM3.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=16,N=3072,device_name=NVIDIA_A100-SXM4-80GB,dtype=int8_w8a16.json → triton_3_1_0/E=16,N=3072,device_name=NVIDIA_A100-SXM4-80GB,dtype=int8_w8a16.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=16,N=3072,device_name=NVIDIA_H100_80GB_HBM3,dtype=int8_w8a16.json → triton_3_1_0/E=16,N=3072,device_name=NVIDIA_H100_80GB_HBM3,dtype=int8_w8a16.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=16,N=3200,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8.json → triton_3_1_0/E=16,N=3200,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=16,N=3584,device_name=NVIDIA_A100-SXM4-80GB,dtype=int8_w8a16.json → triton_3_1_0/E=16,N=3584,device_name=NVIDIA_A100-SXM4-80GB,dtype=int8_w8a16.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=16,N=3584,device_name=NVIDIA_A100-SXM4-80GB.json → triton_3_1_0/E=16,N=3584,device_name=NVIDIA_A100-SXM4-80GB.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=16,N=6400,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8.json → triton_3_1_0/E=16,N=6400,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=16,N=7168,device_name=NVIDIA_A100-SXM4-80GB,dtype=int8_w8a16.json → triton_3_1_0/E=16,N=7168,device_name=NVIDIA_A100-SXM4-80GB,dtype=int8_w8a16.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=16,N=7168,device_name=NVIDIA_A100-SXM4-80GB.json → triton_3_1_0/E=16,N=7168,device_name=NVIDIA_A100-SXM4-80GB.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=16,N=7168,device_name=NVIDIA_H100_80GB_HBM3,dtype=int8_w8a16.json → triton_3_1_0/E=16,N=7168,device_name=NVIDIA_H100_80GB_HBM3,dtype=int8_w8a16.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=16,N=800,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8.json → triton_3_1_0/E=16,N=800,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=160,N=192,device_name=NVIDIA_A800-SXM4-80GB.json → triton_3_1_0/E=160,N=192,device_name=NVIDIA_A800-SXM4-80GB.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=20,N=2048,device_name=NVIDIA_H100_80GB_HBM3.json → triton_3_1_0/E=20,N=2048,device_name=NVIDIA_H100_80GB_HBM3.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=24,N=1024,device_name=NVIDIA_H100_80GB_HBM3.json → triton_3_1_0/E=24,N=1024,device_name=NVIDIA_H100_80GB_HBM3.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=256,N=128,device_name=NVIDIA_A100-SXM4-80GB,dtype=int8_w8a8,block_shape=[128, 128].json → triton_3_1_0/E=256,N=128,device_name=NVIDIA_A100-SXM4-80GB,dtype=int8_w8a8,block_shape=[128, 128].json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=256,N=128,device_name=NVIDIA_A100-SXM4-80GB,dtype=int8_w8a8.json → triton_3_1_0/E=256,N=128,device_name=NVIDIA_A100-SXM4-80GB,dtype=int8_w8a8.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=256,N=128,device_name=NVIDIA_A800-SXM4-80GB,dtype=int8_w8a8,block_shape=[128, 128].json → triton_3_1_0/E=256,N=128,device_name=NVIDIA_A800-SXM4-80GB,dtype=int8_w8a8,block_shape=[128, 128].json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=256,N=128,device_name=NVIDIA_A800-SXM4-80GB,dtype=int8_w8a8.json → triton_3_1_0/E=256,N=128,device_name=NVIDIA_A800-SXM4-80GB,dtype=int8_w8a8.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=256,N=128,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8,block_shape=[128, 128].json → triton_3_1_0/E=256,N=128,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8,block_shape=[128, 128].json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=256,N=128,device_name=NVIDIA_H20,block_shape=[128, 128].json → triton_3_1_0/E=256,N=128,device_name=NVIDIA_H20,block_shape=[128, 128].json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=256,N=128,device_name=NVIDIA_H20,dtype=fp8_w8a8,block_shape=[128, 128].json → triton_3_1_0/E=256,N=128,device_name=NVIDIA_H20,dtype=fp8_w8a8,block_shape=[128, 128].json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=256,N=128,device_name=NVIDIA_L20Y,dtype=fp8_w8a8,block_shape=[128, 128].json → triton_3_1_0/E=256,N=128,device_name=NVIDIA_L20Y,dtype=fp8_w8a8,block_shape=[128, 128].json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=256,N=256,device_name=AMD_Instinct_MI300X,dtype=fp8_w8a8,block_shape=[128, 128].json → triton_3_1_0/E=256,N=256,device_name=AMD_Instinct_MI300X,dtype=fp8_w8a8,block_shape=[128, 128].json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=256,N=256,device_name=AMD_Instinct_MI325X,dtype=fp8_w8a8,block_shape=[128, 128].json → triton_3_1_0/E=256,N=256,device_name=AMD_Instinct_MI325X,dtype=fp8_w8a8,block_shape=[128, 128].json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=256,N=256,device_name=AMD_Radeon_Graphics,dtype=fp8_w8a8,block_shape=[128, 128].json → triton_3_1_0/E=256,N=256,device_name=AMD_Radeon_Graphics,dtype=fp8_w8a8,block_shape=[128, 128].json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=256,N=256,device_name=NVIDIA_B200,dtype=fp8_w8a8,block_shape=[128, 128].json → triton_3_1_0/E=256,N=256,device_name=NVIDIA_B200,dtype=fp8_w8a8,block_shape=[128, 128].json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=256,N=256,device_name=NVIDIA_H20,dtype=int8_w8a8,block_shape=[128, 128].json → triton_3_1_0/E=256,N=256,device_name=NVIDIA_H20,dtype=int8_w8a8,block_shape=[128, 128].json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=256,N=256,device_name=NVIDIA_H200,dtype=fp8_w8a8,block_shape=[128, 128].json → triton_3_1_0/E=256,N=256,device_name=NVIDIA_H200,dtype=fp8_w8a8,block_shape=[128, 128].json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=256,N=64,device_name=NVIDIA_A800-SXM4-80GB.json → triton_3_1_0/E=256,N=64,device_name=NVIDIA_A800-SXM4-80GB.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=256,N=64,device_name=NVIDIA_L20,dtype=int8_w8a8.json → triton_3_1_0/E=256,N=64,device_name=NVIDIA_L20,dtype=int8_w8a8.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=256,N=64,device_name=NVIDIA_L40S,dtype=int8_w8a8.json → triton_3_1_0/E=256,N=64,device_name=NVIDIA_L40S,dtype=int8_w8a8.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=64,N=1024,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8,block_shape=[128, 128].json → triton_3_1_0/E=64,N=1024,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8,block_shape=[128, 128].json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=64,N=1280,device_name=NVIDIA_A100-SXM4-80GB.json → triton_3_1_0/E=64,N=1280,device_name=NVIDIA_A100-SXM4-80GB.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=64,N=1280,device_name=NVIDIA_A800-SXM4-80GB.json → triton_3_1_0/E=64,N=1280,device_name=NVIDIA_A800-SXM4-80GB.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=64,N=1280,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8.json → triton_3_1_0/E=64,N=1280,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=64,N=1280,device_name=NVIDIA_H100_80GB_HBM3.json → triton_3_1_0/E=64,N=1280,device_name=NVIDIA_H100_80GB_HBM3.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=64,N=1280,device_name=NVIDIA_H200,dtype=fp8_w8a8.json → triton_3_1_0/E=64,N=1280,device_name=NVIDIA_H200,dtype=fp8_w8a8.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=64,N=1280,device_name=NVIDIA_H200.json → triton_3_1_0/E=64,N=1280,device_name=NVIDIA_H200.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=64,N=2560,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8.json → triton_3_1_0/E=64,N=2560,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=64,N=2560,device_name=NVIDIA_H200,dtype=fp8_w8a8.json → triton_3_1_0/E=64,N=2560,device_name=NVIDIA_H200,dtype=fp8_w8a8.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=64,N=2560,device_name=NVIDIA_H200.json → triton_3_1_0/E=64,N=2560,device_name=NVIDIA_H200.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=64,N=320,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8.json → triton_3_1_0/E=64,N=320,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=64,N=320,device_name=NVIDIA_H100_80GB_HBM3.json → triton_3_1_0/E=64,N=320,device_name=NVIDIA_H100_80GB_HBM3.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=64,N=320,device_name=NVIDIA_H200,dtype=fp8_w8a8.json → triton_3_1_0/E=64,N=320,device_name=NVIDIA_H200,dtype=fp8_w8a8.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=64,N=320,device_name=NVIDIA_H200.json → triton_3_1_0/E=64,N=320,device_name=NVIDIA_H200.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=64,N=512,device_name=NVIDIA_H200,dtype=fp8_w8a8,block_shape=[128, 128].json → triton_3_1_0/E=64,N=512,device_name=NVIDIA_H200,dtype=fp8_w8a8,block_shape=[128, 128].json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=64,N=640,device_name=NVIDIA_A100-SXM4-80GB.json → triton_3_1_0/E=64,N=640,device_name=NVIDIA_A100-SXM4-80GB.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=64,N=640,device_name=NVIDIA_A800-SXM4-80GB.json → triton_3_1_0/E=64,N=640,device_name=NVIDIA_A800-SXM4-80GB.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=64,N=640,device_name=NVIDIA_GeForce_RTX_4090,dtype=fp8_w8a8.json → triton_3_1_0/E=64,N=640,device_name=NVIDIA_GeForce_RTX_4090,dtype=fp8_w8a8.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=64,N=640,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8.json → triton_3_1_0/E=64,N=640,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=64,N=640,device_name=NVIDIA_H100_80GB_HBM3.json → triton_3_1_0/E=64,N=640,device_name=NVIDIA_H100_80GB_HBM3.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=64,N=640,device_name=NVIDIA_H200,dtype=fp8_w8a8.json → triton_3_1_0/E=64,N=640,device_name=NVIDIA_H200,dtype=fp8_w8a8.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=64,N=640,device_name=NVIDIA_H200.json → triton_3_1_0/E=64,N=640,device_name=NVIDIA_H200.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=8,N=14336,device_name=AMD_Instinct_MI300X.json → triton_3_1_0/E=8,N=14336,device_name=AMD_Instinct_MI300X.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=8,N=14336,device_name=AMD_Instinct_MI325X.json → triton_3_1_0/E=8,N=14336,device_name=AMD_Instinct_MI325X.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=8,N=14336,device_name=AMD_Radeon_Graphics.json → triton_3_1_0/E=8,N=14336,device_name=AMD_Radeon_Graphics.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=8,N=14336,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8.json → triton_3_1_0/E=8,N=14336,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=8,N=14336,device_name=NVIDIA_H200,dtype=fp8_w8a8.json → triton_3_1_0/E=8,N=14336,device_name=NVIDIA_H200,dtype=fp8_w8a8.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=8,N=14336,device_name=NVIDIA_H200.json → triton_3_1_0/E=8,N=14336,device_name=NVIDIA_H200.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=8,N=1792,device_name=AMD_Instinct_MI300X.json → triton_3_1_0/E=8,N=1792,device_name=AMD_Instinct_MI300X.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=8,N=1792,device_name=AMD_Instinct_MI325X.json → triton_3_1_0/E=8,N=1792,device_name=AMD_Instinct_MI325X.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=8,N=1792,device_name=AMD_Radeon_Graphics.json → triton_3_1_0/E=8,N=1792,device_name=AMD_Radeon_Graphics.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=8,N=1792,device_name=NVIDIA_A100-SXM4-40GB.json → triton_3_1_0/E=8,N=1792,device_name=NVIDIA_A100-SXM4-40GB.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=8,N=1792,device_name=NVIDIA_A100-SXM4-80GB.json → triton_3_1_0/E=8,N=1792,device_name=NVIDIA_A100-SXM4-80GB.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=8,N=1792,device_name=NVIDIA_H100_80GB_HBM3.json → triton_3_1_0/E=8,N=1792,device_name=NVIDIA_H100_80GB_HBM3.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=8,N=1792,device_name=NVIDIA_H200,dtype=fp8_w8a8.json → triton_3_1_0/E=8,N=1792,device_name=NVIDIA_H200,dtype=fp8_w8a8.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=8,N=1792,device_name=NVIDIA_H200.json → triton_3_1_0/E=8,N=1792,device_name=NVIDIA_H200.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=8,N=2048,device_name=NVIDIA_A100-SXM4-80GB.json → triton_3_1_0/E=8,N=2048,device_name=NVIDIA_A100-SXM4-80GB.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=8,N=2048,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8.json → triton_3_1_0/E=8,N=2048,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=8,N=2048,device_name=NVIDIA_H100_80GB_HBM3.json → triton_3_1_0/E=8,N=2048,device_name=NVIDIA_H100_80GB_HBM3.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=8,N=2048,device_name=NVIDIA_H200,dtype=fp8_w8a8.json → triton_3_1_0/E=8,N=2048,device_name=NVIDIA_H200,dtype=fp8_w8a8.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=8,N=2048,device_name=NVIDIA_H200.json → triton_3_1_0/E=8,N=2048,device_name=NVIDIA_H200.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=8,N=3584,device_name=AMD_Instinct_MI300X.json → triton_3_1_0/E=8,N=3584,device_name=AMD_Instinct_MI300X.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=8,N=3584,device_name=AMD_Instinct_MI325X.json → triton_3_1_0/E=8,N=3584,device_name=AMD_Instinct_MI325X.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=8,N=3584,device_name=AMD_Radeon_Graphics.json → triton_3_1_0/E=8,N=3584,device_name=AMD_Radeon_Graphics.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=8,N=3584,device_name=NVIDIA_A100-SXM4-40GB.json → triton_3_1_0/E=8,N=3584,device_name=NVIDIA_A100-SXM4-40GB.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=8,N=3584,device_name=NVIDIA_A100-SXM4-80GB.json → triton_3_1_0/E=8,N=3584,device_name=NVIDIA_A100-SXM4-80GB.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=8,N=3584,device_name=NVIDIA_GeForce_RTX_4090,dtype=fp8_w8a8.json → triton_3_1_0/E=8,N=3584,device_name=NVIDIA_GeForce_RTX_4090,dtype=fp8_w8a8.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=8,N=3584,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8.json → triton_3_1_0/E=8,N=3584,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=8,N=3584,device_name=NVIDIA_H100_80GB_HBM3.json → triton_3_1_0/E=8,N=3584,device_name=NVIDIA_H100_80GB_HBM3.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=8,N=3584,device_name=NVIDIA_H200,dtype=fp8_w8a8.json → triton_3_1_0/E=8,N=3584,device_name=NVIDIA_H200,dtype=fp8_w8a8.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=8,N=3584,device_name=NVIDIA_H200.json → triton_3_1_0/E=8,N=3584,device_name=NVIDIA_H200.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=8,N=3584,device_name=NVIDIA_L40S.json → triton_3_1_0/E=8,N=3584,device_name=NVIDIA_L40S.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=8,N=4096,device_name=AMD_Instinct_MI300X,dtype=fp8_w8a8.json → triton_3_1_0/E=8,N=4096,device_name=AMD_Instinct_MI300X,dtype=fp8_w8a8.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=8,N=4096,device_name=AMD_Instinct_MI325X,dtype=fp8_w8a8.json → triton_3_1_0/E=8,N=4096,device_name=AMD_Instinct_MI325X,dtype=fp8_w8a8.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=8,N=4096,device_name=AMD_Radeon_Graphics,dtype=fp8_w8a8.json → triton_3_1_0/E=8,N=4096,device_name=AMD_Radeon_Graphics,dtype=fp8_w8a8.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=8,N=4096,device_name=NVIDIA_A100-SXM4-80GB.json → triton_3_1_0/E=8,N=4096,device_name=NVIDIA_A100-SXM4-80GB.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=8,N=4096,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8.json → triton_3_1_0/E=8,N=4096,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=8,N=4096,device_name=NVIDIA_H100_80GB_HBM3.json → triton_3_1_0/E=8,N=4096,device_name=NVIDIA_H100_80GB_HBM3.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=8,N=4096,device_name=NVIDIA_H200,dtype=fp8_w8a8.json → triton_3_1_0/E=8,N=4096,device_name=NVIDIA_H200,dtype=fp8_w8a8.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=8,N=4096,device_name=NVIDIA_H200.json → triton_3_1_0/E=8,N=4096,device_name=NVIDIA_H200.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=8,N=7168,device_name=AMD_Instinct_MI300X.json → triton_3_1_0/E=8,N=7168,device_name=AMD_Instinct_MI300X.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=8,N=7168,device_name=AMD_Instinct_MI325X.json → triton_3_1_0/E=8,N=7168,device_name=AMD_Instinct_MI325X.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=8,N=7168,device_name=AMD_Radeon_Graphics.json → triton_3_1_0/E=8,N=7168,device_name=AMD_Radeon_Graphics.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=8,N=7168,device_name=NVIDIA_A100-SXM4-80GB.json → triton_3_1_0/E=8,N=7168,device_name=NVIDIA_A100-SXM4-80GB.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=8,N=7168,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8.json → triton_3_1_0/E=8,N=7168,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=8,N=7168,device_name=NVIDIA_H100_80GB_HBM3.json → triton_3_1_0/E=8,N=7168,device_name=NVIDIA_H100_80GB_HBM3.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=8,N=7168,device_name=NVIDIA_H200,dtype=fp8_w8a8.json → triton_3_1_0/E=8,N=7168,device_name=NVIDIA_H200,dtype=fp8_w8a8.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=8,N=7168,device_name=NVIDIA_H200.json → triton_3_1_0/E=8,N=7168,device_name=NVIDIA_H200.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=8,N=8192,device_name=AMD_Instinct_MI300X,dtype=fp8_w8a8.json → triton_3_1_0/E=8,N=8192,device_name=AMD_Instinct_MI300X,dtype=fp8_w8a8.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=8,N=8192,device_name=AMD_Instinct_MI325X,dtype=fp8_w8a8.json → triton_3_1_0/E=8,N=8192,device_name=AMD_Instinct_MI325X,dtype=fp8_w8a8.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=8,N=8192,device_name=AMD_Radeon_Graphics,dtype=fp8_w8a8.json → triton_3_1_0/E=8,N=8192,device_name=AMD_Radeon_Graphics,dtype=fp8_w8a8.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=8,N=8192,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8.json → triton_3_1_0/E=8,N=8192,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=8,N=8192,device_name=NVIDIA_H200,dtype=fp8_w8a8.json → triton_3_1_0/E=8,N=8192,device_name=NVIDIA_H200,dtype=fp8_w8a8.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=128,N=192,device_name=NVIDIA_A800-SXM4-80GB.json → triton_3_2_0/E=128,N=192,device_name=NVIDIA_A800-SXM4-80GB.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=128,N=192,device_name=NVIDIA_H100_80GB_HBM3.json → triton_3_2_0/E=128,N=192,device_name=NVIDIA_H100_80GB_HBM3.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=128,N=192,device_name=NVIDIA_H20.json → triton_3_2_0/E=128,N=192,device_name=NVIDIA_H20.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=128,N=192,device_name=NVIDIA_H200.json → triton_3_2_0/E=128,N=192,device_name=NVIDIA_H200.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=128,N=384,device_name=NVIDIA_H100_80GB_HBM3.json → triton_3_2_0/E=128,N=384,device_name=NVIDIA_H100_80GB_HBM3.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=128,N=384,device_name=NVIDIA_H20,dtype=fp8_w8a8,block_shape=[128, 128].json → triton_3_2_0/E=128,N=384,device_name=NVIDIA_H20,dtype=fp8_w8a8,block_shape=[128, 128].json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=128,N=384,device_name=NVIDIA_H20.json → triton_3_2_0/E=128,N=384,device_name=NVIDIA_H20.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=128,N=384,device_name=NVIDIA_H200,dtype=fp8_w8a8,block_shape=[128, 128].json → triton_3_2_0/E=128,N=384,device_name=NVIDIA_H200,dtype=fp8_w8a8,block_shape=[128, 128].json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=128,N=384,device_name=NVIDIA_H200.json → triton_3_2_0/E=128,N=384,device_name=NVIDIA_H200.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=128,N=512,device_name=NVIDIA_H100_80GB_HBM3.json → triton_3_2_0/E=128,N=512,device_name=NVIDIA_H100_80GB_HBM3.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=128,N=768,device_name=NVIDIA_A800-SXM4-80GB.json → triton_3_2_0/E=128,N=768,device_name=NVIDIA_A800-SXM4-80GB.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=128,N=768,device_name=NVIDIA_H100_80GB_HBM3.json → triton_3_2_0/E=128,N=768,device_name=NVIDIA_H100_80GB_HBM3.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=128,N=768,device_name=NVIDIA_H20,dtype=fp8_w8a8,block_shape=[128, 128].json → triton_3_2_0/E=128,N=768,device_name=NVIDIA_H20,dtype=fp8_w8a8,block_shape=[128, 128].json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=128,N=768,device_name=NVIDIA_H20.json → triton_3_2_0/E=128,N=768,device_name=NVIDIA_H20.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=128,N=768,device_name=NVIDIA_H200,dtype=fp8_w8a8,block_shape=[128, 128].json → triton_3_2_0/E=128,N=768,device_name=NVIDIA_H200,dtype=fp8_w8a8,block_shape=[128, 128].json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=128,N=768,device_name=NVIDIA_H200.json → triton_3_2_0/E=128,N=768,device_name=NVIDIA_H200.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=128,N=96,device_name=NVIDIA_H20.json → triton_3_2_0/E=128,N=96,device_name=NVIDIA_H20.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=264,N=128,device_name=NVIDIA_A800-SXM4-80GB,dtype=int8_w8a8.json → triton_3_2_0/E=264,N=128,device_name=NVIDIA_A800-SXM4-80GB,dtype=int8_w8a8.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=264,N=256,device_name=NVIDIA_B200,dtype=fp8_w8a8,block_shape=[128, 128].json → triton_3_2_0/E=264,N=256,device_name=NVIDIA_B200,dtype=fp8_w8a8,block_shape=[128, 128].json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=264,N=256,device_name=NVIDIA_H20,dtype=fp8_w8a8,block_shape=[128, 128].json → triton_3_2_0/E=264,N=256,device_name=NVIDIA_H20,dtype=fp8_w8a8,block_shape=[128, 128].json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=264,N=256,device_name=NVIDIA_H200,dtype=fp8_w8a8,block_shape=[128, 128].json → triton_3_2_0/E=264,N=256,device_name=NVIDIA_H200,dtype=fp8_w8a8,block_shape=[128, 128].json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=272,N=128,device_name=NVIDIA_A800-SXM4-80GB,dtype=int8_w8a8.json → triton_3_2_0/E=272,N=128,device_name=NVIDIA_A800-SXM4-80GB,dtype=int8_w8a8.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=272,N=128,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8,block_shape=[128, 128].json → triton_3_2_0/E=272,N=128,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8,block_shape=[128, 128].json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=272,N=128,device_name=NVIDIA_H20,dtype=fp8_w8a8,block_shape=[128, 128].json → triton_3_2_0/E=272,N=128,device_name=NVIDIA_H20,dtype=fp8_w8a8,block_shape=[128, 128].json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=272,N=64,device_name=NVIDIA_A800-SXM4-80GB.json → triton_3_2_0/E=272,N=64,device_name=NVIDIA_A800-SXM4-80GB.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=288,N=64,device_name=NVIDIA_A800-SXM4-80GB.json → triton_3_2_0/E=288,N=64,device_name=NVIDIA_A800-SXM4-80GB.json} +0 -0
- {sglang-0.4.6.post5.dist-info → sglang-0.4.7.post1.dist-info}/licenses/LICENSE +0 -0
- {sglang-0.4.6.post5.dist-info → sglang-0.4.7.post1.dist-info}/top_level.txt +0 -0
sglang/srt/lora/utils.py
CHANGED
@@ -1,7 +1,7 @@
 import re
 from dataclasses import dataclass
 from enum import Enum
-from typing import Optional, Set, Tuple
+from typing import List, Optional, Set, Tuple

 import torch

@@ -106,18 +106,22 @@ def get_hidden_dim(
     raise NotImplementedError()


-def
+def get_normalized_lora_weight_names(name: str) -> Tuple[List[str], List[str]]:
     """
-    Mapping a target module name to
+    Mapping a target module name to names of the normized LoRA weights.
+    Returned tuple contains (name for Lora A, name for Lora B)
     """
     params_mapping = {
-        "q_proj": ("qkv_proj", "q_proj"),
-        "k_proj": ("qkv_proj", "kv_proj"),
-        "v_proj": ("qkv_proj", "kv_proj"),
-        "gate_proj": ("gate_up_proj", "gate_up_proj"),
-        "up_proj": ("gate_up_proj", "gate_up_proj"),
+        "q_proj": (["qkv_proj"], ["q_proj"]),
+        "k_proj": (["qkv_proj"], ["kv_proj"]),
+        "v_proj": (["qkv_proj"], ["kv_proj"]),
+        "gate_proj": (["gate_up_proj"], ["gate_up_proj"]),
+        "up_proj": (["gate_up_proj"], ["gate_up_proj"]),
+        "qkv_proj": (["qkv_proj"], ["q_proj", "kv_proj"]),
+        "gate_up_proj": (["gate_up_proj"], ["gate_up_proj"]),
     }
-
+    stacked = params_mapping.get(name, ([name], [name]))
+    return stacked


 def get_stacked_multiply(module_name: str) -> int:
@@ -133,7 +137,7 @@ def get_stacked_multiply(module_name: str) -> int:


 def get_weight_name(
-    target_name: str, lora_weight_names: Set[
+    target_name: str, lora_weight_names: Tuple[Set[str]], lora_type: LoRAType
 ) -> Optional[str]:
     """
     target_name is name of a given module,
@@ -142,9 +146,9 @@ def get_weight_name(
     Else raise ValueError.
     """
     idx = 0 if lora_type == LoRAType.LORA_A else 1
-    for
-        if
-            return
+    for weight_name in lora_weight_names[idx]:
+        if weight_name in target_name:
+            return weight_name
     raise ValueError(
         f"Cannot find weight name for {target_name} in {lora_weight_names}"
     )
sglang/srt/managers/cache_controller.py
CHANGED
@@ -22,7 +22,8 @@ from typing import List, Optional

 import torch

-from sglang.srt.mem_cache.memory_pool import
+from sglang.srt.mem_cache.memory_pool import TokenToKVPoolAllocator
+from sglang.srt.mem_cache.memory_pool_host import HostKVCache

 logger = logging.getLogger(__name__)

sglang/srt/managers/data_parallel_controller.py
CHANGED
@@ -248,12 +248,20 @@ class DataParallelController:

     def round_robin_scheduler(self, req: Req):
         if self.server_args.disaggregation_mode == "null":
-
-
-                self.workers
-
+            if req.data_parallel_rank is not None:
+                logger.debug(f"Direct routing to DP rank {req.data_parallel_rank}")
+                self.workers[req.data_parallel_rank].send_pyobj(req)
+            else:
+                self.workers[self.round_robin_counter].send_pyobj(req)
+                self.round_robin_counter = (self.round_robin_counter + 1) % len(
+                    self.workers
+                )
         else:
-
+            if req.data_parallel_rank is not None:
+                logger.debug(f"Direct routing to DP rank {req.data_parallel_rank}")
+                self.workers[req.data_parallel_rank].send_pyobj(req)
+            else:
+                self.workers[req.bootstrap_room % len(self.workers)].send_pyobj(req)

     def shortest_queue_scheduler(self, input_requests):
         raise NotImplementedError()
sglang/srt/managers/eplb_algorithms/__init__.py
ADDED
@@ -0,0 +1,63 @@
+from enum import Enum, auto
+from typing import Optional
+
+import torch
+
+from sglang.srt.managers.eplb_algorithms import deepseek, deepseek_vec
+
+
+class EplbAlgorithm(Enum):
+    deepseek = auto()
+    deepseek_hierarchical = auto()
+    deepseek_vec = auto()
+    deepseek_vec_hierarchical = auto()
+    # TODO may have more algorithm later
+
+
+def rebalance_experts(
+    tokens_per_expert: torch.Tensor,
+    num_physical_experts: int,
+    num_local_physical_experts: int,
+    num_groups: Optional[int],
+    num_nodes: int,
+    algorithm: EplbAlgorithm,
+):
+    if algorithm in [EplbAlgorithm.deepseek, EplbAlgorithm.deepseek_hierarchical]:
+        return deepseek.rebalance_experts(
+            weight=tokens_per_expert.sum(dim=0),
+            num_replicas=num_physical_experts,
+            num_groups=num_groups,
+            num_nodes=num_nodes,
+            num_gpus=num_physical_experts // num_local_physical_experts,
+            enable_hierarchical=algorithm == EplbAlgorithm.deepseek_hierarchical,
+        )
+
+    if algorithm in [
+        EplbAlgorithm.deepseek_vec,
+        EplbAlgorithm.deepseek_vec_hierarchical,
+    ]:
+        return deepseek_vec.rebalance_experts(
+            tokens_per_expert=tokens_per_expert,
+            num_physical_experts=num_physical_experts,
+            num_local_physical_experts=num_local_physical_experts,
+            num_groups=num_groups,
+            num_nodes=num_nodes,
+            enable_hierarchical=algorithm == EplbAlgorithm.deepseek_vec_hierarchical,
+        )
+
+    raise NotImplementedError
+
+
+def compute_algorithm(
+    raw_algorithm: str,
+    num_groups: Optional[int],
+    num_nodes: int,
+) -> EplbAlgorithm:
+    if raw_algorithm != "auto":
+        return EplbAlgorithm[raw_algorithm]
+
+    # TODO test on real scenarios and know which ones perform better
+    if (num_groups is not None) and (num_groups % num_nodes == 0):
+        return EplbAlgorithm.deepseek_hierarchical
+    else:
+        return EplbAlgorithm.deepseek
sglang/srt/managers/eplb_algorithms/deepseek.py
ADDED
@@ -0,0 +1,223 @@
+# This file is copied from https://github.com/deepseek-ai/EPLB/blob/main/eplb.py since that one is not a pypi package
+from typing import Tuple
+
+import torch
+
+from sglang.srt.utils import get_bool_env_var
+
+
+def balanced_packing(
+    weight: torch.Tensor, num_packs: int
+) -> Tuple[torch.Tensor, torch.Tensor]:
+    """
+    Pack n weighted objects to m packs, such that each bin contains exactly n/m objects and the weights of all packs
+    are as balanced as possible.
+
+    Parameters:
+        weight: [X, n], the weight of each item
+        num_packs: number of packs
+
+    Returns:
+        pack_index: [X, n], the pack index of each item
+        rank_in_pack: [X, n], the rank of the item in the pack
+    """
+    num_layers, num_groups = weight.shape
+    assert num_groups % num_packs == 0
+    groups_per_pack = num_groups // num_packs
+
+    if groups_per_pack == 1:
+        pack_index = torch.arange(
+            weight.size(-1), dtype=torch.int64, device=weight.device
+        ).expand(weight.shape)
+        rank_in_pack = torch.zeros_like(weight, dtype=torch.int64)
+        return pack_index, rank_in_pack
+
+    indices = weight.float().sort(-1, descending=True).indices.cpu()
+    pack_index = torch.full_like(weight, fill_value=-1, dtype=torch.int64, device="cpu")
+    rank_in_pack = torch.full_like(pack_index, fill_value=-1)
+    for i in range(num_layers):
+        pack_weights = [0] * num_packs
+        pack_items = [0] * num_packs
+        for group in indices[i]:
+            pack = min(
+                (i for i in range(num_packs) if pack_items[i] < groups_per_pack),
+                key=pack_weights.__getitem__,
+            )
+            assert pack_items[pack] < groups_per_pack
+            pack_index[i, group] = pack
+            rank_in_pack[i, group] = pack_items[pack]
+            pack_weights[pack] += weight[i, group]
+            pack_items[pack] += 1
+    return pack_index, rank_in_pack
+
+
+def replicate_experts(
+    weight: torch.Tensor, num_phy: int
+) -> Tuple[torch.Tensor, torch.Tensor, torch.Tensor]:
+    """
+    Replicate `num_log` experts to `num_phy` replicas, such that the maximum load of all replicas is minimized.
+
+    Parameters:
+        weight: [X, num_log]
+        num_phy: total number of experts after replication
+
+    Returns:
+        phy2log: [X, num_phy], logical expert id of each physical expert
+        rank: [X, num_phy], the replica rank
+        logcnt: [X, num_log], number of replicas for each logical expert
+    """
+    n, num_log = weight.shape
+    num_redundant = num_phy - num_log
+    assert num_redundant >= 0
+    device = weight.device
+    phy2log = torch.arange(num_phy, dtype=torch.int64, device=device).repeat(n, 1)
+    rank = torch.zeros(n, num_phy, dtype=torch.int64, device=device)
+    logcnt = torch.ones(n, num_log, dtype=torch.int64, device=device)
+    arangen = torch.arange(n, dtype=torch.int64, device=device)
+    for i in range(num_log, num_phy):
+        redundant_indices = (weight / logcnt).max(dim=-1).indices
+        phy2log[:, i] = redundant_indices
+        rank[:, i] = logcnt[arangen, redundant_indices]
+        logcnt[arangen, redundant_indices] += 1
+    return phy2log, rank, logcnt
+
+
+def rebalance_experts_hierarchical(
+    weight: torch.Tensor,
+    num_physical_experts: int,
+    num_groups: int,
+    num_nodes: int,
+    num_gpus: int,
+):
+    """
+    Parameters:
+        weight: [num_moe_layers, num_logical_experts]
+        num_physical_experts: number of physical experts after replication
+        num_groups: number of expert groups
+        num_nodes: number of server nodes, where the intra-node network (e.g, NVLink) is faster
+        num_gpus: number of GPUs, must be a multiple of `num_nodes`
+
+    Returns:
+        physical_to_logical_map: [num_moe_layers, num_physical_experts]
+        logical_to_physical_map: [num_moe_layers, num_logical_experts, X]
+        logical_count: [num_moe_layers, num_logical_experts]
+    """
+    num_layers, num_logical_experts = weight.shape
+    assert num_logical_experts % num_groups == 0
+    group_size = num_logical_experts // num_groups
+    assert num_groups % num_nodes == 0
+    groups_per_node = num_groups // num_nodes
+    assert num_gpus % num_nodes == 0
+    assert num_physical_experts % num_gpus == 0
+    phy_experts_per_gpu = num_physical_experts // num_gpus
+
+    def inverse(perm: torch.Tensor) -> torch.Tensor:
+        inv = torch.empty_like(perm)
+        inv.scatter_(
+            1,
+            perm,
+            torch.arange(perm.size(1), dtype=torch.int64, device=perm.device).expand(
+                perm.shape
+            ),
+        )
+        return inv
+
+    # Step 1: pack groups to nodes
+    tokens_per_group = weight.unflatten(-1, (num_groups, group_size)).sum(-1)
+    group_pack_index, group_rank_in_pack = balanced_packing(tokens_per_group, num_nodes)
+    log2mlog = (
+        (
+            (group_pack_index * groups_per_node + group_rank_in_pack) * group_size
+        ).unsqueeze(-1)
+        + torch.arange(group_size, dtype=torch.int64, device=group_pack_index.device)
+    ).flatten(-2)
+    mlog2log = inverse(log2mlog)
+
+    # Step 2: construct redundant experts within nodes
+    # [num_layers * num_nodes, num_logical_experts // num_nodes]
+    tokens_per_mlog = weight.gather(-1, mlog2log).view(
+        -1, num_logical_experts // num_nodes
+    )
+    phy2mlog, phyrank, mlogcnt = replicate_experts(
+        tokens_per_mlog, num_physical_experts // num_nodes
+    )
+
+    # Step 3: pack physical_experts to GPUs
+    # [num_layers * num_nodes, num_physical_experts // num_nodes]
+    tokens_per_phy = (tokens_per_mlog / mlogcnt).gather(-1, phy2mlog)
+    pack_index, rank_in_pack = balanced_packing(tokens_per_phy, num_gpus // num_nodes)
+    phy2pphy = pack_index * phy_experts_per_gpu + rank_in_pack
+    pphy2phy = inverse(phy2pphy)
+
+    pphy2mlog = phy2mlog.gather(
+        -1, pphy2phy
+    )  # [num_layers * num_nodes, num_log_per_nodes]
+    pphy2mlog = (
+        pphy2mlog.view(num_layers, num_nodes, -1)
+        + torch.arange(
+            0,
+            num_logical_experts,
+            num_logical_experts // num_nodes,
+            device=group_pack_index.device,
+        ).view(1, -1, 1)
+    ).flatten(-2)
+    pphy2log = mlog2log.gather(-1, pphy2mlog)
+    pphyrank = phyrank.gather(-1, pphy2phy).view(num_layers, -1)
+    logcnt = mlogcnt.view(num_layers, -1).gather(-1, log2mlog)
+    return pphy2log, pphyrank, logcnt
+
+
+def rebalance_experts(
+    weight: torch.Tensor,
+    num_replicas: int,
+    num_groups: int,
+    num_nodes: int,
+    num_gpus: int,
+    enable_hierarchical: bool,
+) -> Tuple[torch.Tensor, torch.Tensor, torch.Tensor]:
+    """
+    Entry point for expert-parallelism load balancer.
+
+    Parameters:
+        weight: [layers, num_logical_experts], the load statistics for all logical experts
+        num_replicas: number of physical experts, must be a multiple of `num_gpus`
+        num_groups: number of expert groups
+        num_nodes: number of server nodes, where the intra-node network (e.g, NVLink) is faster
+        num_gpus: number of GPUs, must be a multiple of `num_nodes`
+
+    Returns:
+        physical_to_logical_map: [layers, num_replicas], the expert index of each replica
+        logical_to_physical_map: [layers, num_logical_experts, X], the replica indices for each expert
+        expert_count: [layers, num_logical_experts], number of physical replicas for each logical expert
+    """
+
+    num_layers, num_logical_experts = weight.shape
+    weight = weight.float().cpu()
+    if enable_hierarchical:
+        # use hierarchical load-balance policy
+        phy2log, phyrank, logcnt = rebalance_experts_hierarchical(
+            weight, num_replicas, num_groups, num_nodes, num_gpus
+        )
+    else:
+        # use global load-balance policy
+        phy2log, phyrank, logcnt = rebalance_experts_hierarchical(
+            weight, num_replicas, 1, 1, num_gpus
+        )
+    maxlogcnt = logcnt.max().item()
+    log2phy: torch.Tensor = torch.full(
+        (num_layers, num_logical_experts, maxlogcnt),
+        -1,
+        dtype=torch.int64,
+        device=logcnt.device,
+    )
+    log2phy.view(num_layers, -1).scatter_(
+        -1,
+        phy2log * maxlogcnt + phyrank,
+        torch.arange(num_replicas, dtype=torch.int64, device=log2phy.device).expand(
+            num_layers, -1
+        ),
+    )
+    return phy2log, log2phy, logcnt
+
+
+__all__ = ["rebalance_experts"]
sglang/srt/managers/{deepseek_eplb.py → eplb_algorithms/deepseek_vec.py}
CHANGED
@@ -1,6 +1,5 @@
 # This file is copied from https://github.com/deepseek-ai/EPLB/blob/main/eplb.py since that one is not a pypi package
-
-from typing import Literal, Tuple
+from typing import Optional, Tuple

 import torch

@@ -257,11 +256,11 @@ def rebalance_experts(
     tokens_per_expert: torch.Tensor,
     num_physical_experts: int,
     num_local_physical_experts: int,
-    num_groups: int,
+    num_groups: Optional[int],
     num_nodes: int,
-
+    enable_hierarchical: bool,
 ):
-    if
+    if enable_hierarchical:
         return prefill_rebalance_experts(
             tokens_per_expert=tokens_per_expert,
             num_physical_experts=num_physical_experts,
@@ -269,10 +268,9 @@ def rebalance_experts(
             num_groups=num_groups,
             num_nodes=num_nodes,
         )
-
+    else:
         return decode_rebalance_experts(
             tokens_per_expert=tokens_per_expert,
             num_physical_experts=num_physical_experts,
             num_local_physical_experts=num_local_physical_experts,
         )
-    raise NotImplementedError
sglang/srt/managers/eplb_manager.py
CHANGED
@@ -1,6 +1,6 @@
 import logging
 import time
-from typing import TYPE_CHECKING
+from typing import TYPE_CHECKING, List

 import torch.cuda

@@ -20,27 +20,45 @@ class EPLBManager:
         super().__init__()
         self._model_runner = model_runner
         self._server_args = model_runner.server_args
+        self._rebalance_layers_per_chunk = (
+            self._server_args.eplb_rebalance_layers_per_chunk
+        )
+        self._rebalance_num_iterations = self._server_args.eplb_rebalance_num_iterations

         # Otherwise, the circular buffer will contain stale data. If the case is needed, it can be implemented.
         assert (
             self._server_args.eplb_rebalance_num_iterations
-
-        ), "eplb_rebalance_num_iterations must be
+            >= self._server_args.expert_distribution_recorder_buffer_size
+        ), "eplb_rebalance_num_iterations must be greater than expert_distribution_recorder_buffer_size"

-        get_global_expert_distribution_recorder().
+        if not get_global_expert_distribution_recorder().recording:
+            get_global_expert_distribution_recorder().start_record()

         logger.info(
-            f"[EPLBManager] system started, will rebalance per {self.
+            f"[EPLBManager] system started, will rebalance per {self._rebalance_num_iterations} iterations."
         )

-
-
-
+        self._main_generator = self._entrypoint()
+
+    def on_forward_pass_end(self):
+        next(self._main_generator)
+
+    # can be more complex if needed
+    def _entrypoint(self):
+        while True:
+            for _ in range(self._rebalance_num_iterations):
+                yield
+
+            yield from self.rebalance()

     def rebalance(self):
         logger.info("[EPLBManager] rebalance start")
-
-
+
+        enable_timing = self._rebalance_layers_per_chunk is None
+
+        if enable_timing:
+            torch.cuda.synchronize()
+            time_start = time.time()

         logical_count = get_global_expert_distribution_recorder().dump_record(
             output_mode="object"
@@ -48,8 +66,31 @@ class EPLBManager:
         expert_location_metadata = ExpertLocationMetadata.init_by_eplb(
             self._server_args, self._model_runner.model_config, logical_count
         )
-        self._model_runner.update_expert_location(expert_location_metadata)

-
-
-
+        update_layer_ids_chunks = self._compute_update_layer_ids_chunks()
+        for chunk_index, update_layer_ids in enumerate(update_layer_ids_chunks):
+            if len(update_layer_ids_chunks) > 1:
+                yield
+            self._model_runner.update_expert_location(
+                expert_location_metadata,
+                update_layer_ids=update_layer_ids,
+            )
+
+        msg = f"[EPLBManager] rebalance end"
+        if enable_timing:
+            torch.cuda.synchronize()
+            time_end = time.time()
+            msg += f" time={time_end - time_start:.3f}s"
+        logger.info(msg)
+
+    def _compute_update_layer_ids_chunks(self) -> List[List[int]]:
+        all_layer_ids = sorted(
+            list(self._model_runner.model.routed_experts_weights_of_layer.keys())
+        )
+        chunk_size = self._rebalance_layers_per_chunk or 1000000
+        return list(_chunk_list(all_layer_ids, chunk_size=chunk_size))
+
+
+def _chunk_list(items: List, chunk_size):
+    for start_index in range(0, len(items), chunk_size):
+        yield items[start_index : start_index + chunk_size]