sglang 0.4.6.post4__py3-none-any.whl → 0.4.7__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- sglang/bench_offline_throughput.py +16 -10
- sglang/bench_one_batch.py +5 -4
- sglang/bench_one_batch_server.py +86 -22
- sglang/bench_serving.py +197 -110
- sglang/compile_deep_gemm.py +4 -4
- sglang/lang/backend/runtime_endpoint.py +24 -1
- sglang/profiler.py +167 -0
- sglang/srt/_custom_ops.py +34 -0
- sglang/srt/configs/internvl.py +8 -12
- sglang/srt/configs/model_config.py +66 -29
- sglang/srt/constrained/base_grammar_backend.py +5 -2
- sglang/srt/constrained/llguidance_backend.py +9 -8
- sglang/srt/constrained/outlines_backend.py +5 -4
- sglang/srt/constrained/xgrammar_backend.py +18 -18
- sglang/srt/conversation.py +47 -9
- sglang/srt/custom_op.py +38 -3
- sglang/srt/debug_utils.py +74 -0
- sglang/srt/disaggregation/common/__init__.py +1 -0
- sglang/srt/disaggregation/common/conn.py +407 -0
- sglang/srt/disaggregation/decode.py +187 -134
- sglang/srt/disaggregation/decode_schedule_batch_mixin.py +142 -0
- sglang/srt/disaggregation/fake/conn.py +4 -13
- sglang/srt/disaggregation/kv_events.py +412 -0
- sglang/srt/disaggregation/launch_lb.py +140 -0
- sglang/srt/disaggregation/mini_lb.py +84 -70
- sglang/srt/disaggregation/mooncake/conn.py +441 -140
- sglang/srt/disaggregation/mooncake/transfer_engine.py +31 -14
- sglang/srt/disaggregation/nixl/conn.py +124 -442
- sglang/srt/disaggregation/prefill.py +128 -44
- sglang/srt/disaggregation/utils.py +154 -6
- sglang/srt/distributed/device_communicators/pymscclpp.py +315 -0
- sglang/srt/distributed/parallel_state.py +52 -5
- sglang/srt/distributed/utils.py +3 -3
- sglang/srt/entrypoints/EngineBase.py +11 -0
- sglang/srt/entrypoints/engine.py +129 -12
- sglang/srt/entrypoints/http_server.py +21 -6
- sglang/srt/entrypoints/http_server_engine.py +5 -2
- sglang/srt/function_call/base_format_detector.py +302 -0
- sglang/srt/function_call/core_types.py +34 -0
- sglang/srt/function_call/deepseekv3_detector.py +205 -0
- sglang/srt/function_call/ebnf_composer.py +248 -0
- sglang/srt/function_call/function_call_parser.py +202 -0
- sglang/srt/function_call/llama32_detector.py +93 -0
- sglang/srt/function_call/mistral_detector.py +131 -0
- sglang/srt/function_call/pythonic_detector.py +229 -0
- sglang/srt/function_call/qwen25_detector.py +121 -0
- sglang/srt/function_call/utils.py +52 -0
- sglang/srt/hf_transformers_utils.py +50 -7
- sglang/srt/layers/attention/aiter_backend.py +878 -0
- sglang/srt/layers/attention/base_attn_backend.py +4 -0
- sglang/srt/layers/attention/cutlass_mla_backend.py +2 -19
- sglang/srt/layers/attention/flashattention_backend.py +166 -35
- sglang/srt/layers/attention/flashinfer_backend.py +45 -1
- sglang/srt/layers/attention/flashinfer_mla_backend.py +45 -5
- sglang/srt/layers/attention/flashmla_backend.py +340 -78
- sglang/srt/layers/attention/intel_amx_backend.py +128 -0
- sglang/srt/layers/attention/tbo_backend.py +232 -0
- sglang/srt/layers/attention/torch_native_backend.py +3 -0
- sglang/srt/layers/attention/triton_backend.py +247 -5
- sglang/srt/layers/attention/triton_ops/extend_attention.py +12 -4
- sglang/srt/layers/attention/utils.py +2 -2
- sglang/srt/layers/attention/vision.py +1 -1
- sglang/srt/layers/communicator.py +517 -0
- sglang/srt/layers/dp_attention.py +6 -15
- sglang/srt/layers/layernorm.py +30 -19
- sglang/srt/layers/moe/cutlass_moe.py +370 -0
- sglang/srt/layers/moe/cutlass_moe_params.py +169 -0
- sglang/srt/layers/moe/ep_moe/kernels.py +60 -17
- sglang/srt/layers/moe/ep_moe/layer.py +195 -87
- sglang/srt/layers/moe/ep_moe/token_dispatcher.py +88 -8
- sglang/srt/layers/moe/fused_moe_native.py +4 -0
- sglang/srt/layers/moe/fused_moe_triton/configs/triton_3_2_0/E=257,N=128,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8,block_shape=[128, 128].json +146 -0
- sglang/srt/layers/moe/fused_moe_triton/configs/triton_3_2_0/E=257,N=128,device_name=NVIDIA_H20,dtype=fp8_w8a8,block_shape=[128, 128].json +146 -0
- sglang/srt/layers/moe/fused_moe_triton/configs/triton_3_2_0/E=257,N=256,device_name=NVIDIA_B200,dtype=fp8_w8a8,block_shape=[128, 128].json +146 -0
- sglang/srt/layers/moe/fused_moe_triton/configs/triton_3_2_0/E=257,N=256,device_name=NVIDIA_H20,dtype=fp8_w8a8,block_shape=[128, 128].json +146 -0
- sglang/srt/layers/moe/fused_moe_triton/configs/triton_3_2_0/E=257,N=256,device_name=NVIDIA_H200,dtype=fp8_w8a8,block_shape=[128, 128].json +146 -0
- sglang/srt/layers/moe/fused_moe_triton/configs/triton_3_2_0/E=8,N=7168,device_name=NVIDIA_H100_80GB_HBM3.json +146 -0
- sglang/srt/layers/moe/fused_moe_triton/configs/triton_3_3_1/E=257,N=256,device_name=NVIDIA_B200,dtype=fp8_w8a8,block_shape=[128, 128].json +146 -0
- sglang/srt/layers/moe/fused_moe_triton/configs/triton_3_3_1/E=8,N=7168,device_name=NVIDIA_H100_80GB_HBM3.json +146 -0
- sglang/srt/layers/moe/fused_moe_triton/fused_moe.py +220 -25
- sglang/srt/layers/moe/fused_moe_triton/layer.py +48 -4
- sglang/srt/layers/moe/topk.py +107 -24
- sglang/srt/layers/multimodal.py +70 -0
- sglang/srt/layers/quantization/__init__.py +10 -4
- sglang/srt/layers/quantization/blockwise_int8.py +3 -0
- sglang/srt/layers/quantization/compressed_tensors/compressed_tensors_moe.py +5 -0
- sglang/srt/layers/quantization/deep_gemm.py +60 -59
- sglang/srt/layers/quantization/fp8.py +113 -18
- sglang/srt/layers/quantization/fp8_kernel.py +118 -66
- sglang/srt/layers/quantization/fp8_utils.py +165 -43
- sglang/srt/layers/quantization/gptq.py +298 -6
- sglang/srt/layers/quantization/int8_kernel.py +18 -5
- sglang/srt/layers/quantization/modelopt_quant.py +334 -7
- sglang/srt/layers/quantization/moe_wna16.py +3 -0
- sglang/srt/layers/quantization/qoq.py +244 -0
- sglang/srt/layers/quantization/w8a8_fp8.py +3 -0
- sglang/srt/layers/quantization/w8a8_int8.py +3 -0
- sglang/srt/layers/rotary_embedding.py +6 -12
- sglang/srt/layers/sampler.py +80 -79
- sglang/srt/layers/utils.py +6 -0
- sglang/srt/lora/layers.py +12 -15
- sglang/srt/lora/lora.py +49 -5
- sglang/srt/lora/lora_manager.py +20 -8
- sglang/srt/lora/mem_pool.py +24 -16
- sglang/srt/lora/utils.py +17 -13
- sglang/srt/managers/data_parallel_controller.py +13 -5
- sglang/srt/managers/eplb_algorithms/__init__.py +63 -0
- sglang/srt/managers/eplb_algorithms/deepseek.py +223 -0
- sglang/srt/managers/eplb_algorithms/deepseek_vec.py +276 -0
- sglang/srt/managers/eplb_manager.py +96 -0
- sglang/srt/managers/expert_distribution.py +878 -56
- sglang/srt/managers/expert_location.py +448 -0
- sglang/srt/managers/expert_location_dispatch.py +108 -0
- sglang/srt/managers/io_struct.py +29 -5
- sglang/srt/managers/mm_utils.py +355 -151
- sglang/srt/managers/multimodal_processors/base_processor.py +299 -42
- sglang/srt/managers/multimodal_processors/deepseek_vl_v2.py +6 -1
- sglang/srt/managers/multimodal_processors/gemma3.py +15 -17
- sglang/srt/managers/multimodal_processors/internvl.py +18 -5
- sglang/srt/managers/multimodal_processors/janus_pro.py +7 -1
- sglang/srt/managers/multimodal_processors/kimi_vl.py +14 -32
- sglang/srt/managers/multimodal_processors/llava.py +3 -3
- sglang/srt/managers/multimodal_processors/minicpm.py +27 -32
- sglang/srt/managers/multimodal_processors/mllama4.py +6 -0
- sglang/srt/managers/multimodal_processors/phi4mm.py +87 -0
- sglang/srt/managers/multimodal_processors/pixtral.py +9 -9
- sglang/srt/managers/multimodal_processors/qwen_vl.py +35 -35
- sglang/srt/managers/schedule_batch.py +185 -55
- sglang/srt/managers/schedule_policy.py +4 -5
- sglang/srt/managers/scheduler.py +389 -154
- sglang/srt/managers/session_controller.py +1 -1
- sglang/srt/managers/tokenizer_manager.py +231 -39
- sglang/srt/managers/utils.py +0 -4
- sglang/srt/mem_cache/base_prefix_cache.py +3 -0
- sglang/srt/mem_cache/chunk_cache.py +3 -1
- sglang/srt/mem_cache/hiradix_cache.py +4 -4
- sglang/srt/mem_cache/memory_pool.py +74 -52
- sglang/srt/mem_cache/multimodal_cache.py +45 -0
- sglang/srt/mem_cache/radix_cache.py +58 -5
- sglang/srt/metrics/collector.py +11 -2
- sglang/srt/mm_utils.py +10 -0
- sglang/srt/model_executor/cuda_graph_runner.py +87 -65
- sglang/srt/model_executor/expert_location_updater.py +557 -0
- sglang/srt/model_executor/forward_batch_info.py +39 -14
- sglang/srt/model_executor/model_runner.py +231 -101
- sglang/srt/model_loader/loader.py +10 -6
- sglang/srt/model_loader/utils.py +67 -1
- sglang/srt/models/clip.py +5 -1
- sglang/srt/models/deepseek_nextn.py +1 -1
- sglang/srt/models/deepseek_v2.py +732 -403
- sglang/srt/models/exaone.py +8 -3
- sglang/srt/models/gemma3_causal.py +7 -0
- sglang/srt/models/gemma3_mm.py +75 -33
- sglang/srt/models/idefics2.py +342 -0
- sglang/srt/models/kimi_vl.py +4 -4
- sglang/srt/models/llama.py +1 -1
- sglang/srt/models/llama4.py +10 -2
- sglang/srt/models/llava.py +26 -18
- sglang/srt/models/mimo_mtp.py +220 -0
- sglang/srt/models/minicpmo.py +7 -17
- sglang/srt/models/minicpmv.py +3 -295
- sglang/srt/models/mistral.py +71 -1
- sglang/srt/models/mllama.py +3 -3
- sglang/srt/models/phi4mm.py +512 -0
- sglang/srt/models/qwen2.py +133 -35
- sglang/srt/models/qwen2_5_vl.py +5 -3
- sglang/srt/models/qwen2_eagle.py +4 -1
- sglang/srt/models/qwen2_moe.py +206 -69
- sglang/srt/models/qwen2_vl.py +3 -3
- sglang/srt/models/qwen3.py +92 -19
- sglang/srt/models/qwen3_moe.py +457 -55
- sglang/srt/models/registry.py +9 -1
- sglang/srt/models/siglip.py +294 -0
- sglang/srt/models/transformers.py +291 -0
- sglang/srt/openai_api/adapter.py +114 -40
- sglang/srt/openai_api/protocol.py +37 -2
- sglang/srt/openai_api/utils.py +172 -0
- sglang/srt/operations.py +189 -0
- sglang/srt/operations_strategy.py +207 -0
- sglang/srt/sampling/sampling_batch_info.py +13 -1
- sglang/srt/sampling/sampling_params.py +2 -1
- sglang/srt/server_args.py +235 -38
- sglang/srt/speculative/build_eagle_tree.py +8 -8
- sglang/srt/speculative/eagle_draft_cuda_graph_runner.py +8 -11
- sglang/srt/speculative/eagle_draft_extend_cuda_graph_runner.py +253 -0
- sglang/srt/speculative/eagle_utils.py +181 -90
- sglang/srt/speculative/eagle_worker.py +146 -21
- sglang/srt/two_batch_overlap.py +635 -0
- sglang/srt/utils.py +197 -19
- sglang/test/runners.py +16 -7
- sglang/test/send_one.py +4 -0
- sglang/test/test_cutlass_moe.py +278 -0
- sglang/test/test_fp4_moe.py +248 -0
- sglang/test/test_utils.py +81 -42
- sglang/utils.py +2 -2
- sglang/version.py +1 -1
- {sglang-0.4.6.post4.dist-info → sglang-0.4.7.dist-info}/METADATA +31 -19
- sglang-0.4.7.dist-info/RECORD +699 -0
- {sglang-0.4.6.post4.dist-info → sglang-0.4.7.dist-info}/WHEEL +1 -1
- sglang/srt/function_call_parser.py +0 -858
- sglang/srt/platforms/interface.py +0 -371
- sglang-0.4.6.post4.dist-info/RECORD +0 -646
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=1,N=14336,device_name=NVIDIA_A100-SXM4-80GB,dtype=int8_w8a16.json → triton_3_1_0/E=1,N=14336,device_name=NVIDIA_A100-SXM4-80GB,dtype=int8_w8a16.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=1,N=14336,device_name=NVIDIA_A100-SXM4-80GB.json → triton_3_1_0/E=1,N=14336,device_name=NVIDIA_A100-SXM4-80GB.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=1,N=1792,device_name=NVIDIA_A100-SXM4-80GB,dtype=int8_w8a16.json → triton_3_1_0/E=1,N=1792,device_name=NVIDIA_A100-SXM4-80GB,dtype=int8_w8a16.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=1,N=1792,device_name=NVIDIA_A100-SXM4-80GB.json → triton_3_1_0/E=1,N=1792,device_name=NVIDIA_A100-SXM4-80GB.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=1,N=3072,device_name=NVIDIA_A100-SXM4-80GB,dtype=int8_w8a16.json → triton_3_1_0/E=1,N=3072,device_name=NVIDIA_A100-SXM4-80GB,dtype=int8_w8a16.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=1,N=3072,device_name=NVIDIA_H100_80GB_HBM3,dtype=int8_w8a16.json → triton_3_1_0/E=1,N=3072,device_name=NVIDIA_H100_80GB_HBM3,dtype=int8_w8a16.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=1,N=3072,device_name=NVIDIA_H100_80GB_HBM3.json → triton_3_1_0/E=1,N=3072,device_name=NVIDIA_H100_80GB_HBM3.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=1,N=3584,device_name=NVIDIA_A100-SXM4-80GB,dtype=int8_w8a16.json → triton_3_1_0/E=1,N=3584,device_name=NVIDIA_A100-SXM4-80GB,dtype=int8_w8a16.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=1,N=3584,device_name=NVIDIA_A100-SXM4-80GB.json → triton_3_1_0/E=1,N=3584,device_name=NVIDIA_A100-SXM4-80GB.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=1,N=7168,device_name=NVIDIA_A100-SXM4-80GB,dtype=int8_w8a16.json → triton_3_1_0/E=1,N=7168,device_name=NVIDIA_A100-SXM4-80GB,dtype=int8_w8a16.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=1,N=7168,device_name=NVIDIA_A100-SXM4-80GB.json → triton_3_1_0/E=1,N=7168,device_name=NVIDIA_A100-SXM4-80GB.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=144,N=512,device_name=NVIDIA_H100_80GB_HBM3.json → triton_3_1_0/E=144,N=512,device_name=NVIDIA_H100_80GB_HBM3.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=16,N=1024,device_name=NVIDIA_H100_80GB_HBM3.json → triton_3_1_0/E=16,N=1024,device_name=NVIDIA_H100_80GB_HBM3.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=16,N=1024,device_name=NVIDIA_H200.json → triton_3_1_0/E=16,N=1024,device_name=NVIDIA_H200.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=16,N=1344,device_name=NVIDIA_A100-SXM4-40GB.json → triton_3_1_0/E=16,N=1344,device_name=NVIDIA_A100-SXM4-40GB.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=16,N=1344,device_name=NVIDIA_A100-SXM4-80GB.json → triton_3_1_0/E=16,N=1344,device_name=NVIDIA_A100-SXM4-80GB.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=16,N=1344,device_name=NVIDIA_H100_80GB_HBM3.json → triton_3_1_0/E=16,N=1344,device_name=NVIDIA_H100_80GB_HBM3.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=16,N=14336,device_name=NVIDIA_A100-SXM4-80GB,dtype=int8_w8a16.json → triton_3_1_0/E=16,N=14336,device_name=NVIDIA_A100-SXM4-80GB,dtype=int8_w8a16.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=16,N=14336,device_name=NVIDIA_A100-SXM4-80GB.json → triton_3_1_0/E=16,N=14336,device_name=NVIDIA_A100-SXM4-80GB.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=16,N=1792,device_name=NVIDIA_A100-SXM4-80GB,dtype=int8_w8a16.json → triton_3_1_0/E=16,N=1792,device_name=NVIDIA_A100-SXM4-80GB,dtype=int8_w8a16.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=16,N=1792,device_name=NVIDIA_A100-SXM4-80GB.json → triton_3_1_0/E=16,N=1792,device_name=NVIDIA_A100-SXM4-80GB.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=16,N=2048,device_name=NVIDIA_H100_80GB_HBM3.json → triton_3_1_0/E=16,N=2048,device_name=NVIDIA_H100_80GB_HBM3.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=16,N=2688,device_name=NVIDIA_A100-SXM4-80GB.json → triton_3_1_0/E=16,N=2688,device_name=NVIDIA_A100-SXM4-80GB.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=16,N=2688,device_name=NVIDIA_H100_80GB_HBM3.json → triton_3_1_0/E=16,N=2688,device_name=NVIDIA_H100_80GB_HBM3.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=16,N=3072,device_name=NVIDIA_A100-SXM4-80GB,dtype=int8_w8a16.json → triton_3_1_0/E=16,N=3072,device_name=NVIDIA_A100-SXM4-80GB,dtype=int8_w8a16.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=16,N=3072,device_name=NVIDIA_H100_80GB_HBM3,dtype=int8_w8a16.json → triton_3_1_0/E=16,N=3072,device_name=NVIDIA_H100_80GB_HBM3,dtype=int8_w8a16.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=16,N=3200,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8.json → triton_3_1_0/E=16,N=3200,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=16,N=3584,device_name=NVIDIA_A100-SXM4-80GB,dtype=int8_w8a16.json → triton_3_1_0/E=16,N=3584,device_name=NVIDIA_A100-SXM4-80GB,dtype=int8_w8a16.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=16,N=3584,device_name=NVIDIA_A100-SXM4-80GB.json → triton_3_1_0/E=16,N=3584,device_name=NVIDIA_A100-SXM4-80GB.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=16,N=6400,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8.json → triton_3_1_0/E=16,N=6400,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=16,N=7168,device_name=NVIDIA_A100-SXM4-80GB,dtype=int8_w8a16.json → triton_3_1_0/E=16,N=7168,device_name=NVIDIA_A100-SXM4-80GB,dtype=int8_w8a16.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=16,N=7168,device_name=NVIDIA_A100-SXM4-80GB.json → triton_3_1_0/E=16,N=7168,device_name=NVIDIA_A100-SXM4-80GB.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=16,N=7168,device_name=NVIDIA_H100_80GB_HBM3,dtype=int8_w8a16.json → triton_3_1_0/E=16,N=7168,device_name=NVIDIA_H100_80GB_HBM3,dtype=int8_w8a16.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=16,N=800,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8.json → triton_3_1_0/E=16,N=800,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=160,N=192,device_name=NVIDIA_A800-SXM4-80GB.json → triton_3_1_0/E=160,N=192,device_name=NVIDIA_A800-SXM4-80GB.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=20,N=2048,device_name=NVIDIA_H100_80GB_HBM3.json → triton_3_1_0/E=20,N=2048,device_name=NVIDIA_H100_80GB_HBM3.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=24,N=1024,device_name=NVIDIA_H100_80GB_HBM3.json → triton_3_1_0/E=24,N=1024,device_name=NVIDIA_H100_80GB_HBM3.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=256,N=128,device_name=NVIDIA_A100-SXM4-80GB,dtype=int8_w8a8,block_shape=[128, 128].json → triton_3_1_0/E=256,N=128,device_name=NVIDIA_A100-SXM4-80GB,dtype=int8_w8a8,block_shape=[128, 128].json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=256,N=128,device_name=NVIDIA_A100-SXM4-80GB,dtype=int8_w8a8.json → triton_3_1_0/E=256,N=128,device_name=NVIDIA_A100-SXM4-80GB,dtype=int8_w8a8.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=256,N=128,device_name=NVIDIA_A800-SXM4-80GB,dtype=int8_w8a8,block_shape=[128, 128].json → triton_3_1_0/E=256,N=128,device_name=NVIDIA_A800-SXM4-80GB,dtype=int8_w8a8,block_shape=[128, 128].json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=256,N=128,device_name=NVIDIA_A800-SXM4-80GB,dtype=int8_w8a8.json → triton_3_1_0/E=256,N=128,device_name=NVIDIA_A800-SXM4-80GB,dtype=int8_w8a8.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=256,N=128,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8,block_shape=[128, 128].json → triton_3_1_0/E=256,N=128,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8,block_shape=[128, 128].json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=256,N=128,device_name=NVIDIA_H20,block_shape=[128, 128].json → triton_3_1_0/E=256,N=128,device_name=NVIDIA_H20,block_shape=[128, 128].json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=256,N=128,device_name=NVIDIA_H20,dtype=fp8_w8a8,block_shape=[128, 128].json → triton_3_1_0/E=256,N=128,device_name=NVIDIA_H20,dtype=fp8_w8a8,block_shape=[128, 128].json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=256,N=128,device_name=NVIDIA_L20Y,dtype=fp8_w8a8,block_shape=[128, 128].json → triton_3_1_0/E=256,N=128,device_name=NVIDIA_L20Y,dtype=fp8_w8a8,block_shape=[128, 128].json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=256,N=256,device_name=AMD_Instinct_MI300X,dtype=fp8_w8a8,block_shape=[128, 128].json → triton_3_1_0/E=256,N=256,device_name=AMD_Instinct_MI300X,dtype=fp8_w8a8,block_shape=[128, 128].json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=256,N=256,device_name=AMD_Instinct_MI325X,dtype=fp8_w8a8,block_shape=[128, 128].json → triton_3_1_0/E=256,N=256,device_name=AMD_Instinct_MI325X,dtype=fp8_w8a8,block_shape=[128, 128].json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=256,N=256,device_name=AMD_Radeon_Graphics,dtype=fp8_w8a8,block_shape=[128, 128].json → triton_3_1_0/E=256,N=256,device_name=AMD_Radeon_Graphics,dtype=fp8_w8a8,block_shape=[128, 128].json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=256,N=256,device_name=NVIDIA_B200,dtype=fp8_w8a8,block_shape=[128, 128].json → triton_3_1_0/E=256,N=256,device_name=NVIDIA_B200,dtype=fp8_w8a8,block_shape=[128, 128].json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=256,N=256,device_name=NVIDIA_H20,dtype=int8_w8a8,block_shape=[128, 128].json → triton_3_1_0/E=256,N=256,device_name=NVIDIA_H20,dtype=int8_w8a8,block_shape=[128, 128].json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=256,N=256,device_name=NVIDIA_H200,dtype=fp8_w8a8,block_shape=[128, 128].json → triton_3_1_0/E=256,N=256,device_name=NVIDIA_H200,dtype=fp8_w8a8,block_shape=[128, 128].json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=256,N=64,device_name=NVIDIA_A800-SXM4-80GB.json → triton_3_1_0/E=256,N=64,device_name=NVIDIA_A800-SXM4-80GB.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=256,N=64,device_name=NVIDIA_L20,dtype=int8_w8a8.json → triton_3_1_0/E=256,N=64,device_name=NVIDIA_L20,dtype=int8_w8a8.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=256,N=64,device_name=NVIDIA_L40S,dtype=int8_w8a8.json → triton_3_1_0/E=256,N=64,device_name=NVIDIA_L40S,dtype=int8_w8a8.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=64,N=1024,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8,block_shape=[128, 128].json → triton_3_1_0/E=64,N=1024,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8,block_shape=[128, 128].json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=64,N=1280,device_name=NVIDIA_A100-SXM4-80GB.json → triton_3_1_0/E=64,N=1280,device_name=NVIDIA_A100-SXM4-80GB.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=64,N=1280,device_name=NVIDIA_A800-SXM4-80GB.json → triton_3_1_0/E=64,N=1280,device_name=NVIDIA_A800-SXM4-80GB.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=64,N=1280,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8.json → triton_3_1_0/E=64,N=1280,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=64,N=1280,device_name=NVIDIA_H100_80GB_HBM3.json → triton_3_1_0/E=64,N=1280,device_name=NVIDIA_H100_80GB_HBM3.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=64,N=1280,device_name=NVIDIA_H200,dtype=fp8_w8a8.json → triton_3_1_0/E=64,N=1280,device_name=NVIDIA_H200,dtype=fp8_w8a8.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=64,N=1280,device_name=NVIDIA_H200.json → triton_3_1_0/E=64,N=1280,device_name=NVIDIA_H200.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=64,N=2560,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8.json → triton_3_1_0/E=64,N=2560,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=64,N=2560,device_name=NVIDIA_H200,dtype=fp8_w8a8.json → triton_3_1_0/E=64,N=2560,device_name=NVIDIA_H200,dtype=fp8_w8a8.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=64,N=2560,device_name=NVIDIA_H200.json → triton_3_1_0/E=64,N=2560,device_name=NVIDIA_H200.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=64,N=320,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8.json → triton_3_1_0/E=64,N=320,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=64,N=320,device_name=NVIDIA_H100_80GB_HBM3.json → triton_3_1_0/E=64,N=320,device_name=NVIDIA_H100_80GB_HBM3.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=64,N=320,device_name=NVIDIA_H200,dtype=fp8_w8a8.json → triton_3_1_0/E=64,N=320,device_name=NVIDIA_H200,dtype=fp8_w8a8.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=64,N=320,device_name=NVIDIA_H200.json → triton_3_1_0/E=64,N=320,device_name=NVIDIA_H200.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=64,N=512,device_name=NVIDIA_H200,dtype=fp8_w8a8,block_shape=[128, 128].json → triton_3_1_0/E=64,N=512,device_name=NVIDIA_H200,dtype=fp8_w8a8,block_shape=[128, 128].json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=64,N=640,device_name=NVIDIA_A100-SXM4-80GB.json → triton_3_1_0/E=64,N=640,device_name=NVIDIA_A100-SXM4-80GB.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=64,N=640,device_name=NVIDIA_A800-SXM4-80GB.json → triton_3_1_0/E=64,N=640,device_name=NVIDIA_A800-SXM4-80GB.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=64,N=640,device_name=NVIDIA_GeForce_RTX_4090,dtype=fp8_w8a8.json → triton_3_1_0/E=64,N=640,device_name=NVIDIA_GeForce_RTX_4090,dtype=fp8_w8a8.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=64,N=640,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8.json → triton_3_1_0/E=64,N=640,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=64,N=640,device_name=NVIDIA_H100_80GB_HBM3.json → triton_3_1_0/E=64,N=640,device_name=NVIDIA_H100_80GB_HBM3.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=64,N=640,device_name=NVIDIA_H200,dtype=fp8_w8a8.json → triton_3_1_0/E=64,N=640,device_name=NVIDIA_H200,dtype=fp8_w8a8.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=64,N=640,device_name=NVIDIA_H200.json → triton_3_1_0/E=64,N=640,device_name=NVIDIA_H200.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=8,N=14336,device_name=AMD_Instinct_MI300X.json → triton_3_1_0/E=8,N=14336,device_name=AMD_Instinct_MI300X.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=8,N=14336,device_name=AMD_Instinct_MI325X.json → triton_3_1_0/E=8,N=14336,device_name=AMD_Instinct_MI325X.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=8,N=14336,device_name=AMD_Radeon_Graphics.json → triton_3_1_0/E=8,N=14336,device_name=AMD_Radeon_Graphics.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=8,N=14336,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8.json → triton_3_1_0/E=8,N=14336,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=8,N=14336,device_name=NVIDIA_H200,dtype=fp8_w8a8.json → triton_3_1_0/E=8,N=14336,device_name=NVIDIA_H200,dtype=fp8_w8a8.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=8,N=14336,device_name=NVIDIA_H200.json → triton_3_1_0/E=8,N=14336,device_name=NVIDIA_H200.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=8,N=1792,device_name=AMD_Instinct_MI300X.json → triton_3_1_0/E=8,N=1792,device_name=AMD_Instinct_MI300X.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=8,N=1792,device_name=AMD_Instinct_MI325X.json → triton_3_1_0/E=8,N=1792,device_name=AMD_Instinct_MI325X.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=8,N=1792,device_name=AMD_Radeon_Graphics.json → triton_3_1_0/E=8,N=1792,device_name=AMD_Radeon_Graphics.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=8,N=1792,device_name=NVIDIA_A100-SXM4-40GB.json → triton_3_1_0/E=8,N=1792,device_name=NVIDIA_A100-SXM4-40GB.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=8,N=1792,device_name=NVIDIA_A100-SXM4-80GB.json → triton_3_1_0/E=8,N=1792,device_name=NVIDIA_A100-SXM4-80GB.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=8,N=1792,device_name=NVIDIA_H100_80GB_HBM3.json → triton_3_1_0/E=8,N=1792,device_name=NVIDIA_H100_80GB_HBM3.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=8,N=1792,device_name=NVIDIA_H200,dtype=fp8_w8a8.json → triton_3_1_0/E=8,N=1792,device_name=NVIDIA_H200,dtype=fp8_w8a8.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=8,N=1792,device_name=NVIDIA_H200.json → triton_3_1_0/E=8,N=1792,device_name=NVIDIA_H200.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=8,N=2048,device_name=NVIDIA_A100-SXM4-80GB.json → triton_3_1_0/E=8,N=2048,device_name=NVIDIA_A100-SXM4-80GB.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=8,N=2048,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8.json → triton_3_1_0/E=8,N=2048,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=8,N=2048,device_name=NVIDIA_H100_80GB_HBM3.json → triton_3_1_0/E=8,N=2048,device_name=NVIDIA_H100_80GB_HBM3.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=8,N=2048,device_name=NVIDIA_H200,dtype=fp8_w8a8.json → triton_3_1_0/E=8,N=2048,device_name=NVIDIA_H200,dtype=fp8_w8a8.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=8,N=2048,device_name=NVIDIA_H200.json → triton_3_1_0/E=8,N=2048,device_name=NVIDIA_H200.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=8,N=3584,device_name=AMD_Instinct_MI300X.json → triton_3_1_0/E=8,N=3584,device_name=AMD_Instinct_MI300X.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=8,N=3584,device_name=AMD_Instinct_MI325X.json → triton_3_1_0/E=8,N=3584,device_name=AMD_Instinct_MI325X.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=8,N=3584,device_name=AMD_Radeon_Graphics.json → triton_3_1_0/E=8,N=3584,device_name=AMD_Radeon_Graphics.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=8,N=3584,device_name=NVIDIA_A100-SXM4-40GB.json → triton_3_1_0/E=8,N=3584,device_name=NVIDIA_A100-SXM4-40GB.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=8,N=3584,device_name=NVIDIA_A100-SXM4-80GB.json → triton_3_1_0/E=8,N=3584,device_name=NVIDIA_A100-SXM4-80GB.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=8,N=3584,device_name=NVIDIA_GeForce_RTX_4090,dtype=fp8_w8a8.json → triton_3_1_0/E=8,N=3584,device_name=NVIDIA_GeForce_RTX_4090,dtype=fp8_w8a8.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=8,N=3584,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8.json → triton_3_1_0/E=8,N=3584,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=8,N=3584,device_name=NVIDIA_H100_80GB_HBM3.json → triton_3_1_0/E=8,N=3584,device_name=NVIDIA_H100_80GB_HBM3.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=8,N=3584,device_name=NVIDIA_H200,dtype=fp8_w8a8.json → triton_3_1_0/E=8,N=3584,device_name=NVIDIA_H200,dtype=fp8_w8a8.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=8,N=3584,device_name=NVIDIA_H200.json → triton_3_1_0/E=8,N=3584,device_name=NVIDIA_H200.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=8,N=3584,device_name=NVIDIA_L40S.json → triton_3_1_0/E=8,N=3584,device_name=NVIDIA_L40S.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=8,N=4096,device_name=AMD_Instinct_MI300X,dtype=fp8_w8a8.json → triton_3_1_0/E=8,N=4096,device_name=AMD_Instinct_MI300X,dtype=fp8_w8a8.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=8,N=4096,device_name=AMD_Instinct_MI325X,dtype=fp8_w8a8.json → triton_3_1_0/E=8,N=4096,device_name=AMD_Instinct_MI325X,dtype=fp8_w8a8.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=8,N=4096,device_name=AMD_Radeon_Graphics,dtype=fp8_w8a8.json → triton_3_1_0/E=8,N=4096,device_name=AMD_Radeon_Graphics,dtype=fp8_w8a8.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=8,N=4096,device_name=NVIDIA_A100-SXM4-80GB.json → triton_3_1_0/E=8,N=4096,device_name=NVIDIA_A100-SXM4-80GB.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=8,N=4096,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8.json → triton_3_1_0/E=8,N=4096,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=8,N=4096,device_name=NVIDIA_H100_80GB_HBM3.json → triton_3_1_0/E=8,N=4096,device_name=NVIDIA_H100_80GB_HBM3.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=8,N=4096,device_name=NVIDIA_H200,dtype=fp8_w8a8.json → triton_3_1_0/E=8,N=4096,device_name=NVIDIA_H200,dtype=fp8_w8a8.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=8,N=4096,device_name=NVIDIA_H200.json → triton_3_1_0/E=8,N=4096,device_name=NVIDIA_H200.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=8,N=7168,device_name=AMD_Instinct_MI300X.json → triton_3_1_0/E=8,N=7168,device_name=AMD_Instinct_MI300X.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=8,N=7168,device_name=AMD_Instinct_MI325X.json → triton_3_1_0/E=8,N=7168,device_name=AMD_Instinct_MI325X.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=8,N=7168,device_name=AMD_Radeon_Graphics.json → triton_3_1_0/E=8,N=7168,device_name=AMD_Radeon_Graphics.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=8,N=7168,device_name=NVIDIA_A100-SXM4-80GB.json → triton_3_1_0/E=8,N=7168,device_name=NVIDIA_A100-SXM4-80GB.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=8,N=7168,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8.json → triton_3_1_0/E=8,N=7168,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=8,N=7168,device_name=NVIDIA_H100_80GB_HBM3.json → triton_3_1_0/E=8,N=7168,device_name=NVIDIA_H100_80GB_HBM3.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=8,N=7168,device_name=NVIDIA_H200,dtype=fp8_w8a8.json → triton_3_1_0/E=8,N=7168,device_name=NVIDIA_H200,dtype=fp8_w8a8.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=8,N=7168,device_name=NVIDIA_H200.json → triton_3_1_0/E=8,N=7168,device_name=NVIDIA_H200.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=8,N=8192,device_name=AMD_Instinct_MI300X,dtype=fp8_w8a8.json → triton_3_1_0/E=8,N=8192,device_name=AMD_Instinct_MI300X,dtype=fp8_w8a8.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=8,N=8192,device_name=AMD_Instinct_MI325X,dtype=fp8_w8a8.json → triton_3_1_0/E=8,N=8192,device_name=AMD_Instinct_MI325X,dtype=fp8_w8a8.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=8,N=8192,device_name=AMD_Radeon_Graphics,dtype=fp8_w8a8.json → triton_3_1_0/E=8,N=8192,device_name=AMD_Radeon_Graphics,dtype=fp8_w8a8.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=8,N=8192,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8.json → triton_3_1_0/E=8,N=8192,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=8,N=8192,device_name=NVIDIA_H200,dtype=fp8_w8a8.json → triton_3_1_0/E=8,N=8192,device_name=NVIDIA_H200,dtype=fp8_w8a8.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=128,N=192,device_name=NVIDIA_A800-SXM4-80GB.json → triton_3_2_0/E=128,N=192,device_name=NVIDIA_A800-SXM4-80GB.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=128,N=192,device_name=NVIDIA_H100_80GB_HBM3.json → triton_3_2_0/E=128,N=192,device_name=NVIDIA_H100_80GB_HBM3.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=128,N=192,device_name=NVIDIA_H20.json → triton_3_2_0/E=128,N=192,device_name=NVIDIA_H20.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=128,N=192,device_name=NVIDIA_H200.json → triton_3_2_0/E=128,N=192,device_name=NVIDIA_H200.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=128,N=384,device_name=NVIDIA_H100_80GB_HBM3.json → triton_3_2_0/E=128,N=384,device_name=NVIDIA_H100_80GB_HBM3.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=128,N=384,device_name=NVIDIA_H20,dtype=fp8_w8a8,block_shape=[128, 128].json → triton_3_2_0/E=128,N=384,device_name=NVIDIA_H20,dtype=fp8_w8a8,block_shape=[128, 128].json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=128,N=384,device_name=NVIDIA_H20.json → triton_3_2_0/E=128,N=384,device_name=NVIDIA_H20.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=128,N=384,device_name=NVIDIA_H200,dtype=fp8_w8a8,block_shape=[128, 128].json → triton_3_2_0/E=128,N=384,device_name=NVIDIA_H200,dtype=fp8_w8a8,block_shape=[128, 128].json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=128,N=384,device_name=NVIDIA_H200.json → triton_3_2_0/E=128,N=384,device_name=NVIDIA_H200.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=128,N=512,device_name=NVIDIA_H100_80GB_HBM3.json → triton_3_2_0/E=128,N=512,device_name=NVIDIA_H100_80GB_HBM3.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=128,N=768,device_name=NVIDIA_A800-SXM4-80GB.json → triton_3_2_0/E=128,N=768,device_name=NVIDIA_A800-SXM4-80GB.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=128,N=768,device_name=NVIDIA_H100_80GB_HBM3.json → triton_3_2_0/E=128,N=768,device_name=NVIDIA_H100_80GB_HBM3.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=128,N=768,device_name=NVIDIA_H20,dtype=fp8_w8a8,block_shape=[128, 128].json → triton_3_2_0/E=128,N=768,device_name=NVIDIA_H20,dtype=fp8_w8a8,block_shape=[128, 128].json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=128,N=768,device_name=NVIDIA_H20.json → triton_3_2_0/E=128,N=768,device_name=NVIDIA_H20.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=128,N=768,device_name=NVIDIA_H200,dtype=fp8_w8a8,block_shape=[128, 128].json → triton_3_2_0/E=128,N=768,device_name=NVIDIA_H200,dtype=fp8_w8a8,block_shape=[128, 128].json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=128,N=768,device_name=NVIDIA_H200.json → triton_3_2_0/E=128,N=768,device_name=NVIDIA_H200.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=128,N=96,device_name=NVIDIA_H20.json → triton_3_2_0/E=128,N=96,device_name=NVIDIA_H20.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=264,N=128,device_name=NVIDIA_A800-SXM4-80GB,dtype=int8_w8a8.json → triton_3_2_0/E=264,N=128,device_name=NVIDIA_A800-SXM4-80GB,dtype=int8_w8a8.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=264,N=256,device_name=NVIDIA_B200,dtype=fp8_w8a8,block_shape=[128, 128].json → triton_3_2_0/E=264,N=256,device_name=NVIDIA_B200,dtype=fp8_w8a8,block_shape=[128, 128].json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=264,N=256,device_name=NVIDIA_H20,dtype=fp8_w8a8,block_shape=[128, 128].json → triton_3_2_0/E=264,N=256,device_name=NVIDIA_H20,dtype=fp8_w8a8,block_shape=[128, 128].json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=264,N=256,device_name=NVIDIA_H200,dtype=fp8_w8a8,block_shape=[128, 128].json → triton_3_2_0/E=264,N=256,device_name=NVIDIA_H200,dtype=fp8_w8a8,block_shape=[128, 128].json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=272,N=128,device_name=NVIDIA_A800-SXM4-80GB,dtype=int8_w8a8.json → triton_3_2_0/E=272,N=128,device_name=NVIDIA_A800-SXM4-80GB,dtype=int8_w8a8.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=272,N=128,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8,block_shape=[128, 128].json → triton_3_2_0/E=272,N=128,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8,block_shape=[128, 128].json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=272,N=128,device_name=NVIDIA_H20,dtype=fp8_w8a8,block_shape=[128, 128].json → triton_3_2_0/E=272,N=128,device_name=NVIDIA_H20,dtype=fp8_w8a8,block_shape=[128, 128].json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=272,N=64,device_name=NVIDIA_A800-SXM4-80GB.json → triton_3_2_0/E=272,N=64,device_name=NVIDIA_A800-SXM4-80GB.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=288,N=64,device_name=NVIDIA_A800-SXM4-80GB.json → triton_3_2_0/E=288,N=64,device_name=NVIDIA_A800-SXM4-80GB.json} +0 -0
- /sglang/srt/models/{xiaomi_mimo.py → mimo.py} +0 -0
- {sglang-0.4.6.post4.dist-info → sglang-0.4.7.dist-info}/licenses/LICENSE +0 -0
- {sglang-0.4.6.post4.dist-info → sglang-0.4.7.dist-info}/top_level.txt +0 -0
@@ -1,81 +1,903 @@
|
|
1
|
-
|
1
|
+
# Copyright 2023-2024 SGLang Team
|
2
|
+
# Licensed under the Apache License, Version 2.0 (the "License");
|
3
|
+
# you may not use this file except in compliance with the License.
|
4
|
+
# You may obtain a copy of the License at
|
5
|
+
#
|
6
|
+
# http://www.apache.org/licenses/LICENSE-2.0
|
7
|
+
#
|
8
|
+
# Unless required by applicable law or agreed to in writing, software
|
9
|
+
# distributed under the License is distributed on an "AS IS" BASIS,
|
10
|
+
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
11
|
+
# See the License for the specific language governing permissions and
|
12
|
+
# limitations under the License.
|
13
|
+
# ==============================================================================
|
2
14
|
import logging
|
15
|
+
import os
|
3
16
|
import time
|
4
|
-
from
|
5
|
-
from
|
17
|
+
from abc import ABC
|
18
|
+
from collections import deque
|
19
|
+
from contextlib import contextmanager
|
20
|
+
from pathlib import Path
|
21
|
+
from typing import Any, Dict, List, Literal, Optional, Tuple, Type
|
6
22
|
|
23
|
+
import einops
|
7
24
|
import torch
|
25
|
+
import torch.distributed
|
26
|
+
|
27
|
+
from sglang.srt.managers.expert_location import ExpertLocationMetadata
|
28
|
+
from sglang.srt.managers.schedule_batch import global_server_args_dict
|
29
|
+
from sglang.srt.model_executor.forward_batch_info import ForwardBatch
|
30
|
+
from sglang.srt.server_args import ServerArgs
|
31
|
+
from sglang.srt.utils import Withable, get_bool_env_var
|
8
32
|
|
9
33
|
logger = logging.getLogger(__name__)
|
10
34
|
|
35
|
+
# --------------------------------------- Entrypoint -----------------------------------------
|
36
|
+
|
37
|
+
# Values accepted by the `output_mode` argument of `dump_record` (which forwards
# it to the accumulator's `dump`). What each mode produces is decided by the
# accumulator implementation, not here.
_OutputMode = Literal["file", "object"]
|
38
|
+
|
39
|
+
|
40
|
+
class ExpertDistributionRecorder(ABC):
    """Global expert distribution recording.

    This base class is the interface used by callers. Its context managers and
    ``on_*`` hooks do nothing, ``recording`` is always ``False``, and the
    record-control methods raise until a recorder mode is configured.
    """

    @staticmethod
    def init_new(
        server_args: ServerArgs,
        expert_location_metadata: "ExpertLocationMetadata",
        rank: int,
    ):
        """Create a recorder for the given server configuration.

        Returns the no-op recorder when
        ``server_args.expert_distribution_recorder_mode`` is ``None`` and the
        real recorder otherwise.
        """
        if server_args.expert_distribution_recorder_mode is None:
            return _ExpertDistributionRecorderNoop()
        return _ExpertDistributionRecorderReal(
            server_args, expert_location_metadata, rank
        )

    # ----- scoping helpers (no-ops in the base class) -----

    @contextmanager
    def with_current_layer(self, layer_idx):
        """Context manager scoping work to ``layer_idx``; does nothing here."""
        yield

    @contextmanager
    def with_debug_name(self, debug_name):
        """Context manager scoping work to ``debug_name``; does nothing here."""
        yield

    @contextmanager
    def with_forward_pass(self, forward_pass_id: int, forward_batch: ForwardBatch):
        """Context manager wrapping one forward pass; does nothing here."""
        yield

    # ----- data hooks (no-ops in the base class) -----

    def on_select_experts(self, topk_ids: torch.Tensor):
        """Hook receiving the selected expert ids; ignored here."""

    def on_deepep_dispatch_normal(
        self,
        local_physical_count_of_layer: List[int],
        num_tokens_per_rank,
        num_tokens_per_rdma_rank,
        num_tokens_per_expert,
    ):
        """Hook for the DeepEP "normal" dispatch path; ignored here."""

    def on_deepep_dispatch_low_latency(
        self, local_physical_count_of_layer: torch.Tensor
    ):
        """Hook for the DeepEP "low_latency" dispatch path; ignored here."""

    # ----- record control (unsupported in the base class) -----

    def start_record(self):
        """Unsupported without a recorder mode; always raises."""
        self._on_not_implemented()

    def stop_record(self):
        """Unsupported without a recorder mode; always raises."""
        self._on_not_implemented()

    def dump_record(self, output_mode: _OutputMode = "file"):
        """Unsupported without a recorder mode; always raises."""
        self._on_not_implemented()

    @property
    def recording(self):
        """Whether recording is active; the base class never records."""
        return False

    def _on_not_implemented(self):
        """Raise the error telling the user to configure a recorder mode."""
        raise Exception(
            "Please set ServerArgs.expert_distribution_recorder_mode to use ExpertDistributionRecorder."
        )
|
102
|
+
|
103
|
+
|
104
|
+
class _ExpertDistributionRecorderNoop(ExpertDistributionRecorder):
    """Recorder that adds nothing to the base class.

    It inherits every no-op hook and the raising record-control methods
    unchanged.
    """
|
106
|
+
|
107
|
+
|
108
|
+
class _ExpertDistributionRecorderReal(ExpertDistributionRecorder):
    """Recorder that actually gathers data.

    Each hook call is routed to a per-pass gatherer. At the end of every
    forward pass the gathered data is handed to the accumulator, which is also
    what produces the dump.
    """

    def __init__(
        self,
        server_args: ServerArgs,
        expert_location_metadata: "ExpertLocationMetadata",
        rank: int,
    ):
        """Create the accumulator and one gatherer per accumulator key.

        Recording starts immediately when
        ``server_args.enable_expert_distribution_metrics`` is truthy.
        """
        self._server_args = server_args
        self._expert_location_metadata = expert_location_metadata

        self._recording = False
        self._current_forward_pass_id = Withable()
        self._current_layer_idx = Withable()
        self._current_debug_name = Withable()
        self._accumulator = _Accumulator.init_new(
            server_args, expert_location_metadata, rank
        )

        # One gatherer for every key the accumulator announces.
        gatherers = {}
        for key in self._accumulator.get_single_pass_gatherer_keys():
            gatherers[key] = _SinglePassGatherer.init_new(
                server_args, expert_location_metadata, rank
            )
        self._single_pass_gatherers = gatherers

        if server_args.enable_expert_distribution_metrics:
            logger.info(
                "ExpertDistributionRecorder auto start record since enable_expert_distribution_metrics"
            )
            self.start_record()

    def with_current_layer(self, layer_idx):
        """Return a context manager that sets the current layer index."""
        return self._current_layer_idx.with_value(layer_idx)

    def with_debug_name(self, debug_name):
        """Return a context manager that sets the current debug name."""
        return self._current_debug_name.with_value(debug_name)

    @contextmanager
    def with_forward_pass(self, forward_pass_id: int, forward_batch: ForwardBatch):
        """Wrap one forward pass.

        The start hook runs before the body. The end hook runs in a
        ``finally`` clause, so it also fires when the body raises.
        """
        with self._current_forward_pass_id.with_value(forward_pass_id):
            self._on_forward_pass_start(forward_batch)
            try:
                yield
            finally:
                self._on_forward_pass_end(forward_pass_id)

    def _on_forward_pass_start(self, forward_batch: ForwardBatch):
        """Reset every gatherer and notify it of the new pass (only while recording)."""
        if self._recording:
            for gatherer in self._single_pass_gatherers.values():
                gatherer.reset()
                gatherer.on_forward_pass_start(forward_batch)

    def _on_forward_pass_end(self, forward_pass_id: int):
        """Collect every gatherer's data into the accumulator (only while recording)."""
        if self._recording:
            for gatherer_key, gatherer in self._single_pass_gatherers.items():
                self._accumulator.append(
                    forward_pass_id, gatherer_key, gatherer.collect()
                )

    def on_select_experts(self, topk_ids: torch.Tensor):
        """Forward the selected expert ids to the active gatherer."""
        self._on_hook("on_select_experts", topk_ids=topk_ids)

    def on_deepep_dispatch_normal(
        self,
        local_physical_count_of_layer: List[int],
        num_tokens_per_rank,
        num_tokens_per_rdma_rank,
        num_tokens_per_expert,
    ):
        """Forward the DeepEP "normal" dispatch data to the active gatherer."""
        self._on_hook(
            "on_deepep_dispatch_normal",
            local_physical_count_of_layer=local_physical_count_of_layer,
            num_tokens_per_rank=num_tokens_per_rank,
            num_tokens_per_rdma_rank=num_tokens_per_rdma_rank,
            num_tokens_per_expert=num_tokens_per_expert,
        )

    def on_deepep_dispatch_low_latency(
        self, local_physical_count_of_layer: torch.Tensor
    ):
        """Forward the DeepEP "low_latency" dispatch data to the active gatherer."""
        self._on_hook(
            "on_deepep_dispatch_low_latency",
            local_physical_count_of_layer=local_physical_count_of_layer,
        )

    def _on_hook(self, hook_name: str, **kwargs):
        """Call ``hook_name`` on the gatherer chosen for the current debug name.

        The call is skipped only when the recorder is not recording and the
        current CUDA stream is not being captured. The current layer index is
        passed to the gatherer as ``layer_idx``.
        """
        if not self._recording and not torch.cuda.is_current_stream_capturing():
            return
        gatherer_key = self._accumulator.get_single_pass_gatherer_key(
            self._current_debug_name.value
        )
        hook = getattr(self._single_pass_gatherers[gatherer_key], hook_name)
        hook(layer_idx=self._current_layer_idx.value, **kwargs)

    def _reset(self):
        """Clear all gatherers and the accumulator; must run outside any layer scope."""
        logger.info("Resetting ExpertDistributionRecorder...")
        assert (
            self._current_layer_idx.value is None
        ), f"{self._current_layer_idx.value=}"
        for gatherer in self._single_pass_gatherers.values():
            gatherer.reset()
        self._accumulator.reset()

    def start_record(self):
        """Reset all state and begin recording; warns if already recording."""
        if self._recording:
            logger.warning(
                "SGLang server is already recording expert ids. Did you forget to dump the expert ids recorded so far by sending requests to the `/stop_expert_distribution_record` and `/dump_expert_distribution_record` endpoints?"
            )
        self._reset()
        self._recording = True

    def stop_record(self):
        """Stop recording; warns if recording was not active."""
        if not self._recording:
            logger.warning(
                "SGLang server has not been recording expert ids. Did you forget to start recording by sending request to the `/start_expert_distribution_record` endpoint?"
            )
        self._recording = False

    def dump_record(self, output_mode: _OutputMode = "file"):
        """Return the accumulator's dump, then reset the recorder."""
        output = self._accumulator.dump(output_mode=output_mode)
        self._reset()
        return output

    @property
    def recording(self):
        """Whether recording is currently active."""
        return self._recording
|
237
|
+
|
238
|
+
|
239
|
+
# Module-level recorder instance. It starts as the no-op recorder and is
# replaced through `set_global_expert_distribution_recorder`.
_global_expert_distribution_recorder: Optional[ExpertDistributionRecorder] = (
    _ExpertDistributionRecorderNoop()
)


def get_global_expert_distribution_recorder():
    """Return the recorder currently installed at module level."""
    return _global_expert_distribution_recorder


def set_global_expert_distribution_recorder(value) -> None:
    """Install ``value`` as the module-level recorder."""
    global _global_expert_distribution_recorder
    _global_expert_distribution_recorder = value
|
251
|
+
|
252
|
+
|
253
|
+
# --------------------------------------- SinglePassGatherer -----------------------------------------
|
254
|
+
|
255
|
+
|
256
|
+
class _SinglePassGatherer(ABC):
    """Base class for objects that gather data during one forward pass.

    The hooks are no-ops by default. ``reset`` and ``collect`` must be
    provided by subclasses.
    """

    @staticmethod
    def init_new(
        server_args: ServerArgs,
        expert_location_metadata: "ExpertLocationMetadata",
        rank: int,
    ) -> "_SinglePassGatherer":
        """Pick the gatherer implementation matching the server configuration.

        Raises:
            NotImplementedError: for "stat_approx" mode without DeepEP in
                "normal" mode, or for a DeepEP mode other than "normal" or
                "low_latency".
        """
        mode = server_args.expert_distribution_recorder_mode

        if mode == "per_token":
            return _DetailSinglePassGatherer(
                server_args, expert_location_metadata, rank
            )

        if mode == "stat_approx":
            if not (server_args.enable_deepep_moe and server_args.deepep_mode == "normal"):
                raise NotImplementedError
            return _DeepepNormalSinglePassGatherer(expert_location_metadata, rank)

        if not server_args.enable_deepep_moe:
            return _SelectExpertsSinglePassGatherer(expert_location_metadata, rank)

        if server_args.deepep_mode == "normal":
            return _SelectExpertsSinglePassGatherer(expert_location_metadata, rank)
        if server_args.deepep_mode == "low_latency":
            return _DeepepLowLatencySinglePassGatherer(
                expert_location_metadata, rank
            )
        raise NotImplementedError

    def __init__(self, expert_location_metadata: "ExpertLocationMetadata", rank: int):
        """Store the expert location metadata and this process's rank."""
        self._expert_location_metadata = expert_location_metadata
        self._rank = rank

    def on_forward_pass_start(self, forward_batch: ForwardBatch):
        """Called when a forward pass begins; ignored by default."""

    def on_select_experts(self, layer_idx: int, topk_ids: torch.Tensor):
        """Receive the selected expert ids of a layer; ignored by default."""

    def on_deepep_dispatch_normal(
        self,
        layer_idx: int,
        local_physical_count_of_layer: List[int],
        num_tokens_per_rank,
        num_tokens_per_rdma_rank,
        num_tokens_per_expert,
    ):
        """Receive DeepEP "normal" dispatch data of a layer; ignored by default."""

    def on_deepep_dispatch_low_latency(
        self, layer_idx: int, local_physical_count_of_layer: torch.Tensor
    ):
        """Receive DeepEP "low_latency" dispatch data of a layer; ignored by default."""

    def reset(self):
        """Discard gathered data; must be implemented by subclasses."""
        raise NotImplementedError

    def collect(self) -> Dict:
        """Return the data gathered during the pass; must be implemented by subclasses."""
        raise NotImplementedError
|
316
|
+
|
317
|
+
|
318
|
+
class _DetailSinglePassGatherer(_SinglePassGatherer):
    """Gatherer that keeps per-token detail for one forward pass.

    It records the batch metadata captured at the start of the pass, the
    selected expert ids of every layer, and the token-count data reported by
    the DeepEP "normal" dispatch hook.
    """

    # DeepSeek V3 has this value; should generalize later
    _TOP_K_NUM = 8

    def __init__(
        self,
        server_args: ServerArgs,
        expert_location_metadata: "ExpertLocationMetadata",
        rank: int,
    ):
        """Allocate the per-layer buffer for selected expert ids.

        Raises:
            AssertionError: if ``server_args.enable_two_batch_overlap`` is set.
        """
        super().__init__(expert_location_metadata, rank)
        self._metadata: Optional[Dict[str, Any]] = None
        # Buffer shape: (num_layers, token capacity, top-k).
        self._topk_ids_of_layer = torch.zeros(
            (
                expert_location_metadata.num_layers,
                # TODO determine the max number
                server_args.chunked_prefill_size * 8,
                self._TOP_K_NUM,
            ),
            dtype=torch.int32,
            device=server_args.device,
        )
        self._misc_objects: List[Dict[str, Any]] = []
        assert (
            not server_args.enable_two_batch_overlap
        ), "DetailSinglePassGatherer does not support TBO yet"
        # TODO assert shared experts fusion is disabled, o/w data is wrong

    def on_forward_pass_start(self, forward_batch: ForwardBatch):
        """Snapshot the batch metadata; requires a preceding ``reset()``."""
        assert self._metadata is None
        self._metadata = dict(
            # TODO pr-chain
            # rids=forward_batch.rids,
            input_ids=forward_batch.input_ids.cpu().tolist(),
            positions=forward_batch.positions.cpu().tolist(),
            extend_seq_lens=forward_batch.extend_seq_lens_cpu,
            forward_mode=forward_batch.forward_mode.value,
        )

    def on_select_experts(self, layer_idx: int, topk_ids: torch.Tensor):
        """Copy ``topk_ids`` into the leading corner of the layer's buffer slice.

        ``topk_ids`` is indexed by its first two dimensions, so it is expected
        to be 2-D and to fit inside the pre-allocated buffer.
        """
        self._topk_ids_of_layer[layer_idx, : topk_ids.shape[0], : topk_ids.shape[1]] = (
            topk_ids
        )

    def on_deepep_dispatch_normal(
        self,
        layer_idx: int,
        local_physical_count_of_layer: List[int],
        num_tokens_per_rank,
        num_tokens_per_rdma_rank,
        num_tokens_per_expert,
    ):
        """Append the per-rank / per-RDMA-rank / per-expert token counts.

        ``local_physical_count_of_layer`` is accepted for interface
        compatibility but is not stored.
        """
        self._misc_objects.append(
            dict(
                layer_id=layer_idx,
                num_tokens_per_rank=num_tokens_per_rank.cpu().tolist(),
                num_tokens_per_rdma_rank=num_tokens_per_rdma_rank.cpu().tolist(),
                num_tokens_per_expert=num_tokens_per_expert.cpu().tolist(),
            )
        )

    def reset(self):
        """Mark every buffer slot as unused (-1) and drop the pass-local state."""
        self._topk_ids_of_layer[...] = -1
        self._misc_objects.clear()
        self._metadata = None

    def collect(self) -> Dict:
        """Return the data gathered during the current pass.

        The result contains the metadata entries, ``topk_ids_of_layer``
        truncated to the number of input ids and copied to CPU, and
        ``misc_objects``.
        """
        num_tokens = len(self._metadata["input_ids"])
        return dict(
            **self._metadata,
            topk_ids_of_layer=self._topk_ids_of_layer[:, :num_tokens, :].clone().cpu(),
            # Hand out a copy: `reset()` empties `self._misc_objects` in place,
            # which would otherwise also empty the list inside a result that a
            # consumer has kept from an earlier pass.
            misc_objects=list(self._misc_objects),
        )
|
391
|
+
|
392
|
+
|
393
|
+
class _LayerBasedCpuSinglePassGatherer(_SinglePassGatherer):
    """Gatherer that keeps one Python list of counts per layer."""

    def __init__(self, *args, **kwargs):
        super().__init__(*args, **kwargs)
        # Maps layer index -> list accumulated for that layer.
        self._objects_of_layer = {}

    def _on_layer_data(self, layer_idx: int, objects: List[int]):
        """Store ``objects`` for ``layer_idx``, adding element-wise to earlier data."""
        assert 0 <= layer_idx < self._expert_location_metadata.num_layers
        per_layer = self._objects_of_layer
        per_layer[layer_idx] = (
            _list_sum(per_layer[layer_idx], objects)
            if layer_idx in per_layer
            else objects
        )

    def reset(self):
        """Forget all per-layer data."""
        self._objects_of_layer.clear()

    def _collect_objects(self, pad_len: int) -> torch.Tensor:
        """Stack the per-layer lists into one tensor.

        Layers with no (or empty) data contribute a row of ``pad_len`` zeros.
        """
        rows = []
        for layer_index in range(self._expert_location_metadata.num_layers):
            row = self._objects_of_layer.get(layer_index)
            if not row:
                row = [0] * pad_len
            rows.append(row)
        return torch.tensor(rows)
|
416
|
+
|
417
|
+
|
418
|
+
def _list_sum(a: List, b: List) -> List:
|
419
|
+
return [x + y for x, y in zip(a, b, strict=True)]
|
420
|
+
|
421
|
+
|
422
|
+
class _LayerBasedGpuSinglePassGatherer(_SinglePassGatherer):
    """Gatherer that keeps a (num_layers, num_experts) integer count tensor on CUDA."""

    def __init__(self, *args, enable_global_physical_experts: bool, **kwargs):
        """Allocate the count tensor.

        Its second dimension is the number of all physical experts when
        ``enable_global_physical_experts`` is set, and the number of local
        physical experts otherwise.
        """
        super().__init__(*args, **kwargs)
        self._enable_global_physical_experts = enable_global_physical_experts

        metadata = self._expert_location_metadata
        if enable_global_physical_experts:
            num_columns = metadata.num_physical_experts
        else:
            num_columns = metadata.num_local_physical_experts
        self._data = torch.zeros(
            (metadata.num_layers, num_columns),
            dtype=torch.int,
            device="cuda",
        )

    def reset(self):
        """Zero the count tensor in place."""
        self._data[...] = 0

    def collect(self) -> Dict:
        """Return ``{"global_physical_count": tensor}`` for this pass.

        In global mode the internal tensor itself is returned. In local mode
        the local counts are first placed into a global-width tensor.
        """
        if self._enable_global_physical_experts:
            return dict(global_physical_count=self._data)

        # Can optimize if bottleneck
        metadata = self._expert_location_metadata
        global_physical_count = _convert_local_to_global_physical_count(
            self._data,
            rank=self._rank,
            num_local_physical_experts=metadata.num_local_physical_experts,
            num_physical_experts=metadata.num_physical_experts,
        )
        return dict(global_physical_count=global_physical_count)
|
455
|
+
|
456
|
+
|
457
|
+
class _SelectExpertsSinglePassGatherer(_LayerBasedGpuSinglePassGatherer):
    """Counts expert selections per layer from ``topk_ids`` (global-width tensor)."""

    def __init__(self, *args, **kwargs):
        super().__init__(*args, **kwargs, enable_global_physical_experts=True)

    # can optimize (e.g. fuse / compile)
    def on_select_experts(self, layer_idx: int, topk_ids: torch.Tensor):
        """Add one to the layer's counter for every selected expert id.

        Entries equal to -1 are treated as "no expert": they are redirected to
        index 0 with an increment of 0, so they do not change any counter.
        """
        flat_ids = topk_ids.flatten()
        is_valid = flat_ids != -1
        safe_index = flat_ids.masked_fill(~is_valid, 0).long()
        increments = is_valid.int()
        self._data[layer_idx, :].scatter_add_(dim=0, index=safe_index, src=increments)
|
468
|
+
|
469
|
+
|
470
|
+
class _DeepepNormalSinglePassGatherer(_LayerBasedCpuSinglePassGatherer):
    """Gathers local per-expert counts reported by the DeepEP "normal" dispatch hook."""

    def __init__(self, *args, **kwargs):
        super().__init__(*args, **kwargs)
        # Log the accuracy caveat once, from rank 0 only.
        is_rank_zero = torch.distributed.get_rank() == 0
        if is_rank_zero:
            logger.info(
                "DeepepNormalSinglePassGatherer gathers approximate statistics. "
                "If used with small batch size, consider using expert_distribution_recorder_mode=stat."
            )

    def on_deepep_dispatch_normal(
        self,
        layer_idx: int,
        local_physical_count_of_layer: List[int],
        num_tokens_per_rank,
        num_tokens_per_rdma_rank,
        num_tokens_per_expert,
    ):
        """Accumulate the layer's local counts; the token-count arguments are unused."""
        assert isinstance(local_physical_count_of_layer, list)
        self._on_layer_data(layer_idx, local_physical_count_of_layer)

    def collect(self) -> Dict:
        """Return ``{"global_physical_count": tensor}`` built from the local counts."""
        metadata = self._expert_location_metadata
        local_physical_count = super()._collect_objects(
            pad_len=metadata.num_local_physical_experts
        )
        return dict(
            global_physical_count=_convert_local_to_global_physical_count(
                local_physical_count,
                rank=self._rank,
                num_local_physical_experts=metadata.num_local_physical_experts,
                num_physical_experts=metadata.num_physical_experts,
            )
        )
|
501
|
+
|
502
|
+
|
503
|
+
class _DeepepLowLatencySinglePassGatherer(_LayerBasedGpuSinglePassGatherer):
    """Accumulates local counts from the DeepEP "low_latency" hook (local-width tensor)."""

    def __init__(self, *args, **kwargs):
        super().__init__(*args, **kwargs, enable_global_physical_experts=False)

    def on_deepep_dispatch_low_latency(
        self, layer_idx: int, local_physical_count_of_layer: torch.Tensor
    ):
        """Add the reported counts onto the layer's row of the count tensor."""
        # Most naive implementation, can optimize later
        layer_row = self._data[layer_idx, :]
        layer_row += local_physical_count_of_layer
|
512
|
+
|
513
|
+
|
514
|
+
def _convert_local_to_global_physical_count(
|
515
|
+
local_physical_count: torch.Tensor,
|
516
|
+
rank: int,
|
517
|
+
num_local_physical_experts: int,
|
518
|
+
num_physical_experts: int,
|
519
|
+
) -> torch.Tensor:
|
520
|
+
dtype = local_physical_count.dtype
|
521
|
+
device = local_physical_count.device
|
522
|
+
num_layers, _ = local_physical_count.shape
|
523
|
+
|
524
|
+
ans = torch.zeros((num_layers, num_physical_experts), dtype=dtype, device=device)
|
525
|
+
ans[
|
526
|
+
:, num_local_physical_experts * rank : num_local_physical_experts * (rank + 1)
|
527
|
+
] = local_physical_count
|
528
|
+
return ans
|
529
|
+
|
530
|
+
|
531
|
+
# --------------------------------------- Accumulator -----------------------------------------
|
532
|
+
|
533
|
+
# The only gatherer key the base `_Accumulator` announces and resolves to,
# whatever debug name is passed in.
_SINGLE_PASS_GATHERER_KEY_PRIMARY = "primary"
|
534
|
+
|
535
|
+
|
536
|
+
class _Accumulator(ABC):
    """Base class for objects that accumulate per-pass data across forward passes.

    ``append``, ``reset`` and ``dump`` do nothing here; subclasses supply the
    behaviour.
    """

    @staticmethod
    def init_new(
        server_args: ServerArgs,
        expert_location_metadata: "ExpertLocationMetadata",
        rank: int,
    ) -> "_Accumulator":
        """Instantiate the accumulator class selected by the recorder mode."""
        accumulator_cls = _Accumulator.get_class(server_args)
        return accumulator_cls(server_args, expert_location_metadata, rank)

    @staticmethod
    def get_class(server_args: ServerArgs) -> Type["_Accumulator"]:
        """Map ``server_args.expert_distribution_recorder_mode`` to a class.

        Raises:
            KeyError: if the mode is not one of the four listed below.
        """
        class_of_mode = {
            "stat": _StatAccumulator,
            "stat_approx": _StatAccumulator,
            "per_pass": _DetailAccumulator,
            "per_token": _DetailAccumulator,
        }
        return class_of_mode[server_args.expert_distribution_recorder_mode]

    def __init__(
        self,
        server_args: ServerArgs,
        expert_location_metadata: "ExpertLocationMetadata",
        rank: int,
    ):
        """Store the configuration, expert location metadata and rank."""
        self._server_args = server_args
        self._expert_location_metadata = expert_location_metadata
        self._rank = rank

    def get_single_pass_gatherer_keys(self):
        """Return the keys for which a single-pass gatherer is needed."""
        return [_SINGLE_PASS_GATHERER_KEY_PRIMARY]

    def get_single_pass_gatherer_key(self, debug_name: Optional[str]):
        """Return the gatherer key for ``debug_name``; always the primary key here."""
        return _SINGLE_PASS_GATHERER_KEY_PRIMARY

    def append(
        self,
        forward_pass_id: int,
        gatherer_key: str,
        single_pass_data: Dict,
    ):
        """Receive the data of one forward pass; ignored in the base class."""

    def reset(self):
        """Discard accumulated data; nothing to do in the base class."""

    def dump(self, output_mode: _OutputMode):
        """Produce the accumulated output; the base class returns ``None``."""
|
585
|
+
|
586
|
+
|
587
|
+
class _UtilizationRateAccumulatorMixin(_Accumulator):
    """Optionally logs an expert-balancedness figure for every appended pass.

    The feature is active only when
    ``server_args.enable_expert_distribution_metrics`` is truthy.
    """

    def __init__(self, *args, **kwargs):
        super().__init__(*args, **kwargs)

        self._enable = self._server_args.enable_expert_distribution_metrics
        if not self._enable:
            return

        window_sizes = [10, 100, 1000]
        self._history = _DequeCollection(maxlens=window_sizes)
        # Replaces the rank stored by the base class with the process-group rank.
        self._rank = torch.distributed.get_rank()

    def append(
        self,
        forward_pass_id: int,
        gatherer_key: str,
        single_pass_data: Dict,
    ):
        """Forward to the parent, then record the pass's utilization rate if enabled."""
        super().append(forward_pass_id, gatherer_key, single_pass_data)
        if not self._enable:
            return
        self._append_utilization_rate(
            forward_pass_id, single_pass_data["global_physical_count"]
        )

    def reset(self):
        """Reset the parent and, if enabled, the utilization history."""
        super().reset()
        if not self._enable:
            return
        self._history.clear()

    def _append_utilization_rate(
        self, forward_pass_id: int, single_pass_global_physical_count: torch.Tensor
    ):
        """Sum the per-GPU counts onto rank 0 and log balancedness there.

        Every rank takes part in the reduce. Only rank 0 updates the history
        and writes the log line.
        """
        gpu_physical_count = compute_gpu_physical_count(
            single_pass_global_physical_count,
            num_gpu=self._expert_location_metadata.ep_size,
        ).to(self._server_args.device)
        torch.distributed.reduce(
            gpu_physical_count, dst=0, op=torch.distributed.ReduceOp.SUM
        )

        if self._rank != 0:
            return

        utilization_rate_tensor = compute_utilization_rate(gpu_physical_count)
        utilization_rate = torch.mean(utilization_rate_tensor).item()
        self._history.append(utilization_rate)

        gpu_physical_count_sum = gpu_physical_count.sum().item()

        logger.info(
            f"[Expert Balancedness] "
            f"forward_pass_id={forward_pass_id} "
            f"current_pass_balancedness={utilization_rate:.03f} "
            f"{''.join(f'last_{size}_average_balancedness={value:.03f} ' for size, value in self._history.mean().items())} "
            f"gpu_physical_count_sum={gpu_physical_count_sum}"
            # f"current_pass_per_layer={[round(x, 2) for x in utilization_rate_tensor.cpu().tolist()]}"
        )
|
642
|
+
|
643
|
+
|
644
|
+
class _DequeCollection:
    """A group of bounded deques that all receive the same values.

    One deque is created per entry of ``maxlens``; each keeps only its most
    recent ``maxlen`` values, so together they act as rolling windows of
    different sizes over one stream.
    """

    def __init__(self, maxlens: List[int]):
        """Create one empty ``deque(maxlen=m)`` for every ``m`` in ``maxlens``."""
        self._dequeues = [deque(maxlen=maxlen) for maxlen in maxlens]

    def append(self, value):
        """Push ``value`` into every window."""
        for d in self._dequeues:
            d.append(value)

    def clear(self):
        """Empty every window."""
        for d in self._dequeues:
            d.clear()

    def mean(self) -> Dict[int, float]:
        """Return ``{maxlen: average of that window}``.

        A window that currently holds no values (nothing appended yet, just
        cleared, or ``maxlen == 0``) is reported as NaN instead of raising
        ``ZeroDivisionError``, so the result always has one entry per window.
        """
        return {
            d.maxlen: (sum(d) / len(d) if d else float("nan"))
            for d in self._dequeues
        }
|
658
|
+
|
659
|
+
|
660
|
+
class _DetailAccumulator(_UtilizationRateAccumulatorMixin):
    """Accumulator that keeps one full record per appended pass.

    Each record carries the pass id, the rank, the gatherer key and the
    pass data, with every tensor in the data copied to CPU. ``dump`` writes
    all records to a file.
    """

    def __init__(self, *args, **kwargs):
        super().__init__(*args, **kwargs)
        self._records = []

    def get_single_pass_gatherer_keys(self):
        """Return the gatherer keys; currently always the parent's answer."""
        if False:  # TODO `server_args.enable_two_batch_overlap`
            return [_SINGLE_PASS_GATHERER_KEY_PRIMARY, "child_a", "child_b"]
        return super().get_single_pass_gatherer_keys()

    def get_single_pass_gatherer_key(self, debug_name: Optional[str]):
        """Return the key for a pass; currently always the parent's answer."""
        if False:  # TODO `server_args.enable_two_batch_overlap`
            return debug_name or _SINGLE_PASS_GATHERER_KEY_PRIMARY
        return super().get_single_pass_gatherer_key(debug_name)

    def append(
        self,
        forward_pass_id: int,
        gatherer_key: str,
        single_pass_data: Dict,
    ):
        """Forward to the parent, then store a record of this pass."""
        super().append(forward_pass_id, gatherer_key, single_pass_data)

        # Tensors are copied to CPU (`.cpu().clone()`); any other value is
        # stored as the very same object.
        detached_data = {
            key: (item.cpu().clone() if isinstance(item, torch.Tensor) else item)
            for key, item in single_pass_data.items()
        }

        # Built with dict(**...) on purpose: a data key that collides with
        # one of the fixed fields raises instead of silently overwriting it.
        record = dict(
            forward_pass_id=forward_pass_id,
            rank=self._rank,
            gatherer_key=gatherer_key,
            **detached_data,
        )
        self._records.append(record)

    def reset(self):
        """Forward to the parent and drop all stored records."""
        super().reset()
        self._records.clear()

    def dump(self, output_mode: _OutputMode):
        """Write the stored records to a ``.pt`` file.

        Only the "file" mode is accepted (enforced with an ``assert``). The
        file name embeds ``time.time()`` and this instance's rank.
        """
        assert output_mode == "file"
        output = {
            "records": self._records,
            # NOTE: This may change during recording, so here we say it is the "last" one
            "last_physical_to_logical_map": self._expert_location_metadata.physical_to_logical_map,
        }
        file_name = f"expert_distribution_recorder_{time.time()}_{self._rank}.pt"
        _dump_to_file(file_name, output)
|
715
|
+
|
716
|
+
|
717
|
+
class _StatAccumulator(_UtilizationRateAccumulatorMixin):
    """Accumulator that buffers per-pass ``global_physical_count`` tensors.

    ``dump`` converts the buffered physical counts to logical counts, sums
    them across ranks with an all-reduce, and either writes the result to a
    file (rank 0 only) or returns it.
    """

    def __init__(self, *args, **kwargs):
        super().__init__(*args, **kwargs)
        metadata = self._expert_location_metadata
        settings = self._server_args
        self._global_physical_count_of_buffered_step = _Buffer.init_new(
            item_shape=(
                metadata.num_layers,
                # Cannot use local_physical_count to support select_experts
                metadata.num_physical_experts,
            ),
            buffer_size=settings.expert_distribution_recorder_buffer_size,
            dtype=torch.int32,
            device=settings.device,
        )
        self._first_dump = True

    def append(
        self,
        forward_pass_id: int,
        gatherer_key: str,
        single_pass_data: Dict,
    ):
        """Forward to the parent, then buffer this pass's global physical count."""
        super().append(forward_pass_id, gatherer_key, single_pass_data)
        # Can optimize if overhead here is large
        count_of_pass = single_pass_data["global_physical_count"]
        self._global_physical_count_of_buffered_step.append(count_of_pass)

    def reset(self):
        """Forward to the parent and reset the count buffer."""
        super().reset()
        self._global_physical_count_of_buffered_step.reset()

    def dump(self, output_mode: _OutputMode):
        """Aggregate the buffered counts and emit them.

        Returns:
            For ``"object"``, a dict with ``rank`` and ``logical_count``.
            For ``"file"``, ``None``; the dict is written to disk only when
            ``self._rank == 0``.

        Raises:
            NotImplementedError: for any other ``output_mode``. This is
                raised only after the all-reduce has run.
        """
        metadata = self._expert_location_metadata
        logical_count_of_buffered_step = _convert_global_physical_count_to_logical_count(
            self._global_physical_count_of_buffered_step.get_all(),
            num_layers=metadata.num_layers,
            num_logical_experts=metadata.num_logical_experts,
            physical_to_logical_map=metadata.physical_to_logical_map,
        )

        # The CUDA cache is emptied once, on the first dump of this instance.
        if self._first_dump:
            self._first_dump = False
            torch.cuda.empty_cache()

        torch.distributed.all_reduce(
            logical_count_of_buffered_step, op=torch.distributed.ReduceOp.SUM
        )

        output = {
            "rank": self._rank,
            "logical_count": logical_count_of_buffered_step,
        }

        if output_mode == "object":
            return output
        if output_mode != "file":
            raise NotImplementedError
        if self._rank == 0:
            _dump_to_file(f"expert_distribution_recorder_{time.time()}.pt", output)
|
776
|
+
|
777
|
+
|
778
|
+
def _dump_to_file(name, data):
    """Save ``data`` with ``torch.save`` under the recorder output directory.

    The directory comes from the ``SGLANG_EXPERT_DISTRIBUTION_RECORDER_DIR``
    environment variable and defaults to ``/tmp``; it is created (including
    parents) when it does not exist yet. The target path is logged before
    anything is written.

    Args:
        name: File name, joined onto the output directory.
        data: Object handed to ``torch.save``.
    """
    save_dir = Path(os.getenv("SGLANG_EXPERT_DISTRIBUTION_RECORDER_DIR", "/tmp"))
    path_output = save_dir / name
    logger.info(f"Write expert distribution to {path_output}")

    directory_missing = not save_dir.exists()
    if directory_missing:
        save_dir.mkdir(parents=True, exist_ok=True)

    torch.save(data, str(path_output))
|
785
|
+
|
786
|
+
|
787
|
+
class _Buffer:
    """Interface for tensor buffers, plus a factory choosing the implementation.

    The three instance methods are placeholders that raise
    ``NotImplementedError``; concrete buffers override them.
    """

    @staticmethod
    def init_new(item_shape: Tuple, buffer_size: int, dtype, device):
        """Build a buffer for items of ``item_shape``.

        A negative ``buffer_size`` selects ``_InfiniteBuffer``; any other
        value selects a ``_CircularBuffer`` of that size.
        """
        if buffer_size < 0:
            return _InfiniteBuffer(item_shape, dtype=dtype, device=device)
        return _CircularBuffer(item_shape, buffer_size, dtype=dtype, device=device)

    def append(self, value: torch.Tensor):
        """Store one item (to be provided by subclasses)."""
        raise NotImplementedError

    def get_all(self) -> torch.Tensor:
        """Return the buffered items as a tensor (to be provided by subclasses)."""
        raise NotImplementedError

    def reset(self):
        """Clear the buffer (to be provided by subclasses)."""
        raise NotImplementedError
|
803
|
+
|
804
|
+
|
805
|
+
class _CircularBuffer(_Buffer):
    """Fixed-size buffer whose writes wrap around and overwrite old rows.

    Storage is a zero-initialised tensor of shape
    ``(buffer_size, *item_shape)``.
    """

    def __init__(self, item_shape: Tuple, buffer_size: int, dtype, device):
        storage_shape = (buffer_size, *item_shape)
        self._buffer = torch.zeros(storage_shape, dtype=dtype, device=device)
        self._curr_index = 0

    def append(self, value: torch.Tensor):
        """Write ``value`` at the cursor, then advance the cursor with wrap-around."""
        slot = self._curr_index
        self._buffer[slot] = value
        self._curr_index = (slot + 1) % len(self._buffer)

    def get_all(self) -> torch.Tensor:
        """Return the whole storage tensor (not a copy), unwritten rows included."""
        return self._buffer

    def reset(self):
        """Zero the storage in place.

        NOTE(review): the write cursor is left where it is, so the next
        append continues from the previous position -- confirm this is
        intended.
        """
        self._buffer.zero_()
|
821
|
+
|
822
|
+
|
823
|
+
class _InfiniteBuffer(_Buffer):
    """Growable buffer: storage doubles whenever it is full.

    Storage starts as a zero tensor with 128 rows; ``_size`` counts the rows
    actually written.
    """

    def __init__(self, item_shape: Tuple, dtype, device):
        self._item_shape = item_shape
        self._buffer = torch.zeros((128, *item_shape), dtype=dtype, device=device)
        self._size = 0

    def append(self, value: torch.Tensor):
        """Store ``value`` in the next free row, growing the storage first if needed."""
        capacity = len(self._buffer)

        if self._size == capacity:
            # Full: allocate twice the rows with the same dtype/device and
            # copy the existing rows into the front.
            enlarged = torch.zeros(
                (2 * capacity, *self._item_shape),
                dtype=self._buffer.dtype,
                device=self._buffer.device,
            )
            enlarged[:capacity] = self._buffer
            self._buffer = enlarged

        self._buffer[self._size] = value
        self._size += 1

    def get_all(self) -> torch.Tensor:
        """Return a slice of the storage covering only the rows written so far."""
        return self._buffer[: self._size]

    def reset(self):
        """Zero the storage in place and mark it empty; capacity is kept."""
        self._buffer.zero_()
        self._size = 0
|
850
|
+
|
851
|
+
|
852
|
+
def _convert_global_physical_count_to_logical_count(
    # (whatever, num_layers, num_physical_experts)
    global_physical_count: torch.Tensor,
    num_layers: int,
    num_logical_experts: int,
    physical_to_logical_map: torch.Tensor,
):
    """Sum physical-expert counts into their logical-expert slots.

    For every leading index and layer, each entry of the last axis of
    ``global_physical_count`` is added to the output position named by the
    matching entry of ``physical_to_logical_map``.

    Args:
        global_physical_count: 3-D tensor; the unpacking below requires
            exactly three dimensions.
        num_layers: Size of the output's second axis.
        num_logical_experts: Size of the output's last axis.
        physical_to_logical_map: Index tensor, broadcast along a new leading
            axis and cast to int64 before use.

    Returns:
        A new tensor of shape ``(leading, num_layers, num_logical_experts)``
        with the dtype and device of ``global_physical_count``.
    """
    dim_extra, _, _ = global_physical_count.shape

    # The same mapping is reused for every entry of the leading axis.
    scatter_index = (
        physical_to_logical_map.unsqueeze(0).expand(dim_extra, -1, -1).to(torch.int64)
    )
    accumulator = torch.zeros(
        (dim_extra, num_layers, num_logical_experts),
        dtype=global_physical_count.dtype,
        device=global_physical_count.device,
    )
    # scatter_add_ works in place and returns the tensor it was called on.
    return accumulator.scatter_add_(
        dim=2, index=scatter_index, src=global_physical_count
    )
|
873
|
+
|
874
|
+
|
875
|
+
def compute_gpu_physical_count(
    physical_count_of_whatever: torch.Tensor,  # (..., num_layer, num_physical_expert)
    num_gpu: int,
):
    """Sum per-expert counts into per-GPU counts.

    The last axis is split into ``num_gpu`` groups and each group is summed,
    as described by the einops pattern below.

    output: gpu_physical_count_of_batch (..., num_layer, num_gpu)
    """
    group_and_sum = (
        "... num_layer (num_gpu num_expert_per_gpu) -> ... num_layer num_gpu"
    )
    return einops.reduce(
        physical_count_of_whatever, group_and_sum, "sum", num_gpu=num_gpu
    )
|
886
|
+
|
887
|
+
|
888
|
+
def compute_utilization_rate(
    gpu_physical_count_of_batch: torch.Tensor,  # (..., num_layer, num_gpu)
):
    """Return, per layer, the mean-to-max ratio of the per-GPU counts.

    The input is cast to float, then the last axis is reduced once with
    "mean" and once with "max". ``1e-5`` is added to both numerator and
    denominator before dividing.

    output: utilization_rate (..., num_layer)
    """
    counts = gpu_physical_count_of_batch.float()
    drop_gpu_axis = "... num_layer num_gpu -> ... num_layer"

    busiest = einops.reduce(counts, drop_gpu_axis, "max")
    average = einops.reduce(counts, drop_gpu_axis, "mean")

    return (average + 1e-5) / (busiest + 1e-5)
|