sglang 0.4.6.post5__py3-none-any.whl → 0.4.7__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- sglang/bench_offline_throughput.py +10 -4
- sglang/bench_one_batch_server.py +67 -11
- sglang/bench_serving.py +85 -74
- sglang/lang/backend/runtime_endpoint.py +24 -1
- sglang/profiler.py +167 -0
- sglang/srt/_custom_ops.py +34 -0
- sglang/srt/configs/internvl.py +8 -12
- sglang/srt/configs/model_config.py +27 -1
- sglang/srt/constrained/base_grammar_backend.py +5 -2
- sglang/srt/constrained/llguidance_backend.py +9 -8
- sglang/srt/constrained/outlines_backend.py +5 -4
- sglang/srt/constrained/xgrammar_backend.py +18 -18
- sglang/srt/conversation.py +46 -8
- sglang/srt/custom_op.py +38 -3
- sglang/srt/debug_utils.py +74 -0
- sglang/srt/disaggregation/common/__init__.py +1 -0
- sglang/srt/disaggregation/common/conn.py +407 -0
- sglang/srt/disaggregation/decode.py +67 -3
- sglang/srt/disaggregation/fake/conn.py +1 -0
- sglang/srt/disaggregation/kv_events.py +60 -5
- sglang/srt/disaggregation/launch_lb.py +140 -0
- sglang/srt/disaggregation/mini_lb.py +29 -48
- sglang/srt/disaggregation/mooncake/conn.py +432 -140
- sglang/srt/disaggregation/mooncake/transfer_engine.py +32 -16
- sglang/srt/disaggregation/nixl/conn.py +124 -432
- sglang/srt/disaggregation/prefill.py +2 -0
- sglang/srt/disaggregation/utils.py +38 -1
- sglang/srt/distributed/device_communicators/pymscclpp.py +315 -0
- sglang/srt/distributed/parallel_state.py +52 -5
- sglang/srt/entrypoints/EngineBase.py +6 -0
- sglang/srt/entrypoints/engine.py +102 -5
- sglang/srt/entrypoints/http_server.py +15 -2
- sglang/srt/function_call/base_format_detector.py +138 -86
- sglang/srt/function_call/deepseekv3_detector.py +54 -6
- sglang/srt/function_call/ebnf_composer.py +33 -19
- sglang/srt/function_call/function_call_parser.py +27 -0
- sglang/srt/function_call/llama32_detector.py +33 -14
- sglang/srt/function_call/mistral_detector.py +73 -26
- sglang/srt/function_call/pythonic_detector.py +86 -20
- sglang/srt/function_call/qwen25_detector.py +64 -10
- sglang/srt/function_call/utils.py +17 -0
- sglang/srt/hf_transformers_utils.py +4 -0
- sglang/srt/layers/attention/aiter_backend.py +488 -123
- sglang/srt/layers/attention/base_attn_backend.py +4 -0
- sglang/srt/layers/attention/cutlass_mla_backend.py +2 -19
- sglang/srt/layers/attention/flashattention_backend.py +103 -18
- sglang/srt/layers/attention/flashinfer_backend.py +45 -1
- sglang/srt/layers/attention/flashinfer_mla_backend.py +37 -1
- sglang/srt/layers/attention/intel_amx_backend.py +128 -0
- sglang/srt/layers/attention/tbo_backend.py +232 -0
- sglang/srt/layers/attention/torch_native_backend.py +3 -0
- sglang/srt/layers/attention/triton_backend.py +244 -5
- sglang/srt/layers/attention/triton_ops/extend_attention.py +12 -4
- sglang/srt/layers/communicator.py +260 -194
- sglang/srt/layers/dp_attention.py +6 -5
- sglang/srt/layers/layernorm.py +30 -19
- sglang/srt/layers/moe/cutlass_moe.py +170 -7
- sglang/srt/layers/moe/cutlass_moe_params.py +169 -0
- sglang/srt/layers/moe/ep_moe/kernels.py +27 -6
- sglang/srt/layers/moe/ep_moe/layer.py +94 -40
- sglang/srt/layers/moe/ep_moe/token_dispatcher.py +13 -8
- sglang/srt/layers/moe/fused_moe_native.py +4 -0
- sglang/srt/layers/moe/fused_moe_triton/configs/triton_3_2_0/E=257,N=128,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8,block_shape=[128, 128].json +146 -0
- sglang/srt/layers/moe/fused_moe_triton/configs/triton_3_2_0/E=257,N=128,device_name=NVIDIA_H20,dtype=fp8_w8a8,block_shape=[128, 128].json +146 -0
- sglang/srt/layers/moe/fused_moe_triton/configs/triton_3_2_0/E=257,N=256,device_name=NVIDIA_B200,dtype=fp8_w8a8,block_shape=[128, 128].json +146 -0
- sglang/srt/layers/moe/fused_moe_triton/configs/triton_3_2_0/E=257,N=256,device_name=NVIDIA_H20,dtype=fp8_w8a8,block_shape=[128, 128].json +146 -0
- sglang/srt/layers/moe/fused_moe_triton/configs/triton_3_2_0/E=257,N=256,device_name=NVIDIA_H200,dtype=fp8_w8a8,block_shape=[128, 128].json +146 -0
- sglang/srt/layers/moe/fused_moe_triton/configs/triton_3_2_0/E=8,N=7168,device_name=NVIDIA_H100_80GB_HBM3.json +146 -0
- sglang/srt/layers/moe/fused_moe_triton/configs/triton_3_3_1/E=257,N=256,device_name=NVIDIA_B200,dtype=fp8_w8a8,block_shape=[128, 128].json +146 -0
- sglang/srt/layers/moe/fused_moe_triton/configs/triton_3_3_1/E=8,N=7168,device_name=NVIDIA_H100_80GB_HBM3.json +146 -0
- sglang/srt/layers/moe/fused_moe_triton/fused_moe.py +220 -25
- sglang/srt/layers/moe/fused_moe_triton/layer.py +34 -4
- sglang/srt/layers/moe/topk.py +44 -18
- sglang/srt/layers/multimodal.py +3 -3
- sglang/srt/layers/quantization/__init__.py +3 -2
- sglang/srt/layers/quantization/blockwise_int8.py +3 -0
- sglang/srt/layers/quantization/compressed_tensors/compressed_tensors_moe.py +5 -0
- sglang/srt/layers/quantization/deep_gemm.py +55 -56
- sglang/srt/layers/quantization/fp8.py +28 -23
- sglang/srt/layers/quantization/fp8_kernel.py +118 -66
- sglang/srt/layers/quantization/fp8_utils.py +165 -49
- sglang/srt/layers/quantization/modelopt_quant.py +334 -7
- sglang/srt/layers/quantization/moe_wna16.py +3 -0
- sglang/srt/layers/quantization/w8a8_fp8.py +3 -0
- sglang/srt/layers/quantization/w8a8_int8.py +3 -0
- sglang/srt/layers/rotary_embedding.py +6 -12
- sglang/srt/layers/sampler.py +80 -79
- sglang/srt/layers/utils.py +6 -0
- sglang/srt/lora/layers.py +12 -15
- sglang/srt/lora/lora.py +49 -5
- sglang/srt/lora/lora_manager.py +19 -5
- sglang/srt/lora/mem_pool.py +24 -16
- sglang/srt/lora/utils.py +17 -13
- sglang/srt/managers/data_parallel_controller.py +13 -5
- sglang/srt/managers/eplb_algorithms/__init__.py +63 -0
- sglang/srt/managers/eplb_algorithms/deepseek.py +223 -0
- sglang/srt/managers/{deepseek_eplb.py → eplb_algorithms/deepseek_vec.py} +5 -7
- sglang/srt/managers/eplb_manager.py +55 -14
- sglang/srt/managers/expert_distribution.py +220 -46
- sglang/srt/managers/expert_location.py +110 -56
- sglang/srt/managers/expert_location_dispatch.py +23 -6
- sglang/srt/managers/io_struct.py +15 -4
- sglang/srt/managers/mm_utils.py +88 -38
- sglang/srt/managers/multimodal_processors/base_processor.py +188 -16
- sglang/srt/managers/multimodal_processors/gemma3.py +4 -31
- sglang/srt/managers/multimodal_processors/internvl.py +4 -0
- sglang/srt/managers/multimodal_processors/kimi_vl.py +15 -34
- sglang/srt/managers/multimodal_processors/minicpm.py +2 -1
- sglang/srt/managers/multimodal_processors/phi4mm.py +87 -0
- sglang/srt/managers/multimodal_processors/qwen_vl.py +22 -64
- sglang/srt/managers/schedule_batch.py +140 -38
- sglang/srt/managers/scheduler.py +305 -112
- sglang/srt/managers/tokenizer_manager.py +134 -17
- sglang/srt/managers/utils.py +0 -4
- sglang/srt/metrics/collector.py +9 -0
- sglang/srt/model_executor/cuda_graph_runner.py +72 -61
- sglang/srt/model_executor/expert_location_updater.py +157 -22
- sglang/srt/model_executor/forward_batch_info.py +38 -17
- sglang/srt/model_executor/model_runner.py +96 -56
- sglang/srt/model_loader/utils.py +67 -1
- sglang/srt/models/deepseek_nextn.py +1 -1
- sglang/srt/models/deepseek_v2.py +609 -234
- sglang/srt/models/gemma3_causal.py +7 -0
- sglang/srt/models/gemma3_mm.py +19 -14
- sglang/srt/models/idefics2.py +342 -0
- sglang/srt/models/kimi_vl.py +4 -4
- sglang/srt/models/llama.py +1 -1
- sglang/srt/models/minicpmo.py +2 -5
- sglang/srt/models/minicpmv.py +3 -295
- sglang/srt/models/phi4mm.py +512 -0
- sglang/srt/models/qwen2.py +38 -9
- sglang/srt/models/qwen2_5_vl.py +3 -9
- sglang/srt/models/qwen2_eagle.py +4 -1
- sglang/srt/models/qwen2_moe.py +58 -191
- sglang/srt/models/qwen2_vl.py +3 -9
- sglang/srt/models/qwen3.py +41 -10
- sglang/srt/models/qwen3_moe.py +230 -191
- sglang/srt/models/registry.py +9 -1
- sglang/srt/models/transformers.py +291 -0
- sglang/srt/openai_api/adapter.py +86 -24
- sglang/srt/openai_api/protocol.py +31 -2
- sglang/srt/openai_api/utils.py +172 -0
- sglang/srt/operations.py +37 -2
- sglang/srt/operations_strategy.py +200 -24
- sglang/srt/sampling/sampling_batch_info.py +13 -1
- sglang/srt/sampling/sampling_params.py +2 -1
- sglang/srt/server_args.py +114 -27
- sglang/srt/speculative/build_eagle_tree.py +8 -8
- sglang/srt/speculative/eagle_draft_cuda_graph_runner.py +8 -11
- sglang/srt/speculative/eagle_draft_extend_cuda_graph_runner.py +253 -0
- sglang/srt/speculative/eagle_utils.py +51 -91
- sglang/srt/speculative/eagle_worker.py +101 -21
- sglang/srt/two_batch_overlap.py +635 -0
- sglang/srt/utils.py +129 -7
- sglang/test/runners.py +16 -7
- sglang/test/send_one.py +4 -0
- sglang/test/test_cutlass_moe.py +3 -3
- sglang/test/test_fp4_moe.py +248 -0
- sglang/test/test_utils.py +79 -6
- sglang/version.py +1 -1
- {sglang-0.4.6.post5.dist-info → sglang-0.4.7.dist-info}/METADATA +14 -11
- {sglang-0.4.6.post5.dist-info → sglang-0.4.7.dist-info}/RECORD +318 -291
- {sglang-0.4.6.post5.dist-info → sglang-0.4.7.dist-info}/WHEEL +1 -1
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=1,N=14336,device_name=NVIDIA_A100-SXM4-80GB,dtype=int8_w8a16.json → triton_3_1_0/E=1,N=14336,device_name=NVIDIA_A100-SXM4-80GB,dtype=int8_w8a16.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=1,N=14336,device_name=NVIDIA_A100-SXM4-80GB.json → triton_3_1_0/E=1,N=14336,device_name=NVIDIA_A100-SXM4-80GB.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=1,N=1792,device_name=NVIDIA_A100-SXM4-80GB,dtype=int8_w8a16.json → triton_3_1_0/E=1,N=1792,device_name=NVIDIA_A100-SXM4-80GB,dtype=int8_w8a16.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=1,N=1792,device_name=NVIDIA_A100-SXM4-80GB.json → triton_3_1_0/E=1,N=1792,device_name=NVIDIA_A100-SXM4-80GB.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=1,N=3072,device_name=NVIDIA_A100-SXM4-80GB,dtype=int8_w8a16.json → triton_3_1_0/E=1,N=3072,device_name=NVIDIA_A100-SXM4-80GB,dtype=int8_w8a16.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=1,N=3072,device_name=NVIDIA_H100_80GB_HBM3,dtype=int8_w8a16.json → triton_3_1_0/E=1,N=3072,device_name=NVIDIA_H100_80GB_HBM3,dtype=int8_w8a16.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=1,N=3072,device_name=NVIDIA_H100_80GB_HBM3.json → triton_3_1_0/E=1,N=3072,device_name=NVIDIA_H100_80GB_HBM3.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=1,N=3584,device_name=NVIDIA_A100-SXM4-80GB,dtype=int8_w8a16.json → triton_3_1_0/E=1,N=3584,device_name=NVIDIA_A100-SXM4-80GB,dtype=int8_w8a16.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=1,N=3584,device_name=NVIDIA_A100-SXM4-80GB.json → triton_3_1_0/E=1,N=3584,device_name=NVIDIA_A100-SXM4-80GB.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=1,N=7168,device_name=NVIDIA_A100-SXM4-80GB,dtype=int8_w8a16.json → triton_3_1_0/E=1,N=7168,device_name=NVIDIA_A100-SXM4-80GB,dtype=int8_w8a16.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=1,N=7168,device_name=NVIDIA_A100-SXM4-80GB.json → triton_3_1_0/E=1,N=7168,device_name=NVIDIA_A100-SXM4-80GB.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=144,N=512,device_name=NVIDIA_H100_80GB_HBM3.json → triton_3_1_0/E=144,N=512,device_name=NVIDIA_H100_80GB_HBM3.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=16,N=1024,device_name=NVIDIA_H100_80GB_HBM3.json → triton_3_1_0/E=16,N=1024,device_name=NVIDIA_H100_80GB_HBM3.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=16,N=1024,device_name=NVIDIA_H200.json → triton_3_1_0/E=16,N=1024,device_name=NVIDIA_H200.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=16,N=1344,device_name=NVIDIA_A100-SXM4-40GB.json → triton_3_1_0/E=16,N=1344,device_name=NVIDIA_A100-SXM4-40GB.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=16,N=1344,device_name=NVIDIA_A100-SXM4-80GB.json → triton_3_1_0/E=16,N=1344,device_name=NVIDIA_A100-SXM4-80GB.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=16,N=1344,device_name=NVIDIA_H100_80GB_HBM3.json → triton_3_1_0/E=16,N=1344,device_name=NVIDIA_H100_80GB_HBM3.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=16,N=14336,device_name=NVIDIA_A100-SXM4-80GB,dtype=int8_w8a16.json → triton_3_1_0/E=16,N=14336,device_name=NVIDIA_A100-SXM4-80GB,dtype=int8_w8a16.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=16,N=14336,device_name=NVIDIA_A100-SXM4-80GB.json → triton_3_1_0/E=16,N=14336,device_name=NVIDIA_A100-SXM4-80GB.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=16,N=1792,device_name=NVIDIA_A100-SXM4-80GB,dtype=int8_w8a16.json → triton_3_1_0/E=16,N=1792,device_name=NVIDIA_A100-SXM4-80GB,dtype=int8_w8a16.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=16,N=1792,device_name=NVIDIA_A100-SXM4-80GB.json → triton_3_1_0/E=16,N=1792,device_name=NVIDIA_A100-SXM4-80GB.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=16,N=2048,device_name=NVIDIA_H100_80GB_HBM3.json → triton_3_1_0/E=16,N=2048,device_name=NVIDIA_H100_80GB_HBM3.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=16,N=2688,device_name=NVIDIA_A100-SXM4-80GB.json → triton_3_1_0/E=16,N=2688,device_name=NVIDIA_A100-SXM4-80GB.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=16,N=2688,device_name=NVIDIA_H100_80GB_HBM3.json → triton_3_1_0/E=16,N=2688,device_name=NVIDIA_H100_80GB_HBM3.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=16,N=3072,device_name=NVIDIA_A100-SXM4-80GB,dtype=int8_w8a16.json → triton_3_1_0/E=16,N=3072,device_name=NVIDIA_A100-SXM4-80GB,dtype=int8_w8a16.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=16,N=3072,device_name=NVIDIA_H100_80GB_HBM3,dtype=int8_w8a16.json → triton_3_1_0/E=16,N=3072,device_name=NVIDIA_H100_80GB_HBM3,dtype=int8_w8a16.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=16,N=3200,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8.json → triton_3_1_0/E=16,N=3200,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=16,N=3584,device_name=NVIDIA_A100-SXM4-80GB,dtype=int8_w8a16.json → triton_3_1_0/E=16,N=3584,device_name=NVIDIA_A100-SXM4-80GB,dtype=int8_w8a16.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=16,N=3584,device_name=NVIDIA_A100-SXM4-80GB.json → triton_3_1_0/E=16,N=3584,device_name=NVIDIA_A100-SXM4-80GB.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=16,N=6400,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8.json → triton_3_1_0/E=16,N=6400,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=16,N=7168,device_name=NVIDIA_A100-SXM4-80GB,dtype=int8_w8a16.json → triton_3_1_0/E=16,N=7168,device_name=NVIDIA_A100-SXM4-80GB,dtype=int8_w8a16.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=16,N=7168,device_name=NVIDIA_A100-SXM4-80GB.json → triton_3_1_0/E=16,N=7168,device_name=NVIDIA_A100-SXM4-80GB.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=16,N=7168,device_name=NVIDIA_H100_80GB_HBM3,dtype=int8_w8a16.json → triton_3_1_0/E=16,N=7168,device_name=NVIDIA_H100_80GB_HBM3,dtype=int8_w8a16.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=16,N=800,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8.json → triton_3_1_0/E=16,N=800,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=160,N=192,device_name=NVIDIA_A800-SXM4-80GB.json → triton_3_1_0/E=160,N=192,device_name=NVIDIA_A800-SXM4-80GB.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=20,N=2048,device_name=NVIDIA_H100_80GB_HBM3.json → triton_3_1_0/E=20,N=2048,device_name=NVIDIA_H100_80GB_HBM3.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=24,N=1024,device_name=NVIDIA_H100_80GB_HBM3.json → triton_3_1_0/E=24,N=1024,device_name=NVIDIA_H100_80GB_HBM3.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=256,N=128,device_name=NVIDIA_A100-SXM4-80GB,dtype=int8_w8a8,block_shape=[128, 128].json → triton_3_1_0/E=256,N=128,device_name=NVIDIA_A100-SXM4-80GB,dtype=int8_w8a8,block_shape=[128, 128].json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=256,N=128,device_name=NVIDIA_A100-SXM4-80GB,dtype=int8_w8a8.json → triton_3_1_0/E=256,N=128,device_name=NVIDIA_A100-SXM4-80GB,dtype=int8_w8a8.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=256,N=128,device_name=NVIDIA_A800-SXM4-80GB,dtype=int8_w8a8,block_shape=[128, 128].json → triton_3_1_0/E=256,N=128,device_name=NVIDIA_A800-SXM4-80GB,dtype=int8_w8a8,block_shape=[128, 128].json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=256,N=128,device_name=NVIDIA_A800-SXM4-80GB,dtype=int8_w8a8.json → triton_3_1_0/E=256,N=128,device_name=NVIDIA_A800-SXM4-80GB,dtype=int8_w8a8.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=256,N=128,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8,block_shape=[128, 128].json → triton_3_1_0/E=256,N=128,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8,block_shape=[128, 128].json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=256,N=128,device_name=NVIDIA_H20,block_shape=[128, 128].json → triton_3_1_0/E=256,N=128,device_name=NVIDIA_H20,block_shape=[128, 128].json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=256,N=128,device_name=NVIDIA_H20,dtype=fp8_w8a8,block_shape=[128, 128].json → triton_3_1_0/E=256,N=128,device_name=NVIDIA_H20,dtype=fp8_w8a8,block_shape=[128, 128].json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=256,N=128,device_name=NVIDIA_L20Y,dtype=fp8_w8a8,block_shape=[128, 128].json → triton_3_1_0/E=256,N=128,device_name=NVIDIA_L20Y,dtype=fp8_w8a8,block_shape=[128, 128].json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=256,N=256,device_name=AMD_Instinct_MI300X,dtype=fp8_w8a8,block_shape=[128, 128].json → triton_3_1_0/E=256,N=256,device_name=AMD_Instinct_MI300X,dtype=fp8_w8a8,block_shape=[128, 128].json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=256,N=256,device_name=AMD_Instinct_MI325X,dtype=fp8_w8a8,block_shape=[128, 128].json → triton_3_1_0/E=256,N=256,device_name=AMD_Instinct_MI325X,dtype=fp8_w8a8,block_shape=[128, 128].json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=256,N=256,device_name=AMD_Radeon_Graphics,dtype=fp8_w8a8,block_shape=[128, 128].json → triton_3_1_0/E=256,N=256,device_name=AMD_Radeon_Graphics,dtype=fp8_w8a8,block_shape=[128, 128].json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=256,N=256,device_name=NVIDIA_B200,dtype=fp8_w8a8,block_shape=[128, 128].json → triton_3_1_0/E=256,N=256,device_name=NVIDIA_B200,dtype=fp8_w8a8,block_shape=[128, 128].json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=256,N=256,device_name=NVIDIA_H20,dtype=int8_w8a8,block_shape=[128, 128].json → triton_3_1_0/E=256,N=256,device_name=NVIDIA_H20,dtype=int8_w8a8,block_shape=[128, 128].json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=256,N=256,device_name=NVIDIA_H200,dtype=fp8_w8a8,block_shape=[128, 128].json → triton_3_1_0/E=256,N=256,device_name=NVIDIA_H200,dtype=fp8_w8a8,block_shape=[128, 128].json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=256,N=64,device_name=NVIDIA_A800-SXM4-80GB.json → triton_3_1_0/E=256,N=64,device_name=NVIDIA_A800-SXM4-80GB.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=256,N=64,device_name=NVIDIA_L20,dtype=int8_w8a8.json → triton_3_1_0/E=256,N=64,device_name=NVIDIA_L20,dtype=int8_w8a8.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=256,N=64,device_name=NVIDIA_L40S,dtype=int8_w8a8.json → triton_3_1_0/E=256,N=64,device_name=NVIDIA_L40S,dtype=int8_w8a8.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=64,N=1024,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8,block_shape=[128, 128].json → triton_3_1_0/E=64,N=1024,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8,block_shape=[128, 128].json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=64,N=1280,device_name=NVIDIA_A100-SXM4-80GB.json → triton_3_1_0/E=64,N=1280,device_name=NVIDIA_A100-SXM4-80GB.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=64,N=1280,device_name=NVIDIA_A800-SXM4-80GB.json → triton_3_1_0/E=64,N=1280,device_name=NVIDIA_A800-SXM4-80GB.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=64,N=1280,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8.json → triton_3_1_0/E=64,N=1280,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=64,N=1280,device_name=NVIDIA_H100_80GB_HBM3.json → triton_3_1_0/E=64,N=1280,device_name=NVIDIA_H100_80GB_HBM3.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=64,N=1280,device_name=NVIDIA_H200,dtype=fp8_w8a8.json → triton_3_1_0/E=64,N=1280,device_name=NVIDIA_H200,dtype=fp8_w8a8.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=64,N=1280,device_name=NVIDIA_H200.json → triton_3_1_0/E=64,N=1280,device_name=NVIDIA_H200.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=64,N=2560,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8.json → triton_3_1_0/E=64,N=2560,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=64,N=2560,device_name=NVIDIA_H200,dtype=fp8_w8a8.json → triton_3_1_0/E=64,N=2560,device_name=NVIDIA_H200,dtype=fp8_w8a8.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=64,N=2560,device_name=NVIDIA_H200.json → triton_3_1_0/E=64,N=2560,device_name=NVIDIA_H200.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=64,N=320,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8.json → triton_3_1_0/E=64,N=320,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=64,N=320,device_name=NVIDIA_H100_80GB_HBM3.json → triton_3_1_0/E=64,N=320,device_name=NVIDIA_H100_80GB_HBM3.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=64,N=320,device_name=NVIDIA_H200,dtype=fp8_w8a8.json → triton_3_1_0/E=64,N=320,device_name=NVIDIA_H200,dtype=fp8_w8a8.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=64,N=320,device_name=NVIDIA_H200.json → triton_3_1_0/E=64,N=320,device_name=NVIDIA_H200.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=64,N=512,device_name=NVIDIA_H200,dtype=fp8_w8a8,block_shape=[128, 128].json → triton_3_1_0/E=64,N=512,device_name=NVIDIA_H200,dtype=fp8_w8a8,block_shape=[128, 128].json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=64,N=640,device_name=NVIDIA_A100-SXM4-80GB.json → triton_3_1_0/E=64,N=640,device_name=NVIDIA_A100-SXM4-80GB.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=64,N=640,device_name=NVIDIA_A800-SXM4-80GB.json → triton_3_1_0/E=64,N=640,device_name=NVIDIA_A800-SXM4-80GB.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=64,N=640,device_name=NVIDIA_GeForce_RTX_4090,dtype=fp8_w8a8.json → triton_3_1_0/E=64,N=640,device_name=NVIDIA_GeForce_RTX_4090,dtype=fp8_w8a8.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=64,N=640,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8.json → triton_3_1_0/E=64,N=640,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=64,N=640,device_name=NVIDIA_H100_80GB_HBM3.json → triton_3_1_0/E=64,N=640,device_name=NVIDIA_H100_80GB_HBM3.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=64,N=640,device_name=NVIDIA_H200,dtype=fp8_w8a8.json → triton_3_1_0/E=64,N=640,device_name=NVIDIA_H200,dtype=fp8_w8a8.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=64,N=640,device_name=NVIDIA_H200.json → triton_3_1_0/E=64,N=640,device_name=NVIDIA_H200.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=8,N=14336,device_name=AMD_Instinct_MI300X.json → triton_3_1_0/E=8,N=14336,device_name=AMD_Instinct_MI300X.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=8,N=14336,device_name=AMD_Instinct_MI325X.json → triton_3_1_0/E=8,N=14336,device_name=AMD_Instinct_MI325X.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=8,N=14336,device_name=AMD_Radeon_Graphics.json → triton_3_1_0/E=8,N=14336,device_name=AMD_Radeon_Graphics.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=8,N=14336,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8.json → triton_3_1_0/E=8,N=14336,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=8,N=14336,device_name=NVIDIA_H200,dtype=fp8_w8a8.json → triton_3_1_0/E=8,N=14336,device_name=NVIDIA_H200,dtype=fp8_w8a8.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=8,N=14336,device_name=NVIDIA_H200.json → triton_3_1_0/E=8,N=14336,device_name=NVIDIA_H200.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=8,N=1792,device_name=AMD_Instinct_MI300X.json → triton_3_1_0/E=8,N=1792,device_name=AMD_Instinct_MI300X.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=8,N=1792,device_name=AMD_Instinct_MI325X.json → triton_3_1_0/E=8,N=1792,device_name=AMD_Instinct_MI325X.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=8,N=1792,device_name=AMD_Radeon_Graphics.json → triton_3_1_0/E=8,N=1792,device_name=AMD_Radeon_Graphics.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=8,N=1792,device_name=NVIDIA_A100-SXM4-40GB.json → triton_3_1_0/E=8,N=1792,device_name=NVIDIA_A100-SXM4-40GB.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=8,N=1792,device_name=NVIDIA_A100-SXM4-80GB.json → triton_3_1_0/E=8,N=1792,device_name=NVIDIA_A100-SXM4-80GB.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=8,N=1792,device_name=NVIDIA_H100_80GB_HBM3.json → triton_3_1_0/E=8,N=1792,device_name=NVIDIA_H100_80GB_HBM3.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=8,N=1792,device_name=NVIDIA_H200,dtype=fp8_w8a8.json → triton_3_1_0/E=8,N=1792,device_name=NVIDIA_H200,dtype=fp8_w8a8.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=8,N=1792,device_name=NVIDIA_H200.json → triton_3_1_0/E=8,N=1792,device_name=NVIDIA_H200.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=8,N=2048,device_name=NVIDIA_A100-SXM4-80GB.json → triton_3_1_0/E=8,N=2048,device_name=NVIDIA_A100-SXM4-80GB.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=8,N=2048,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8.json → triton_3_1_0/E=8,N=2048,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=8,N=2048,device_name=NVIDIA_H100_80GB_HBM3.json → triton_3_1_0/E=8,N=2048,device_name=NVIDIA_H100_80GB_HBM3.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=8,N=2048,device_name=NVIDIA_H200,dtype=fp8_w8a8.json → triton_3_1_0/E=8,N=2048,device_name=NVIDIA_H200,dtype=fp8_w8a8.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=8,N=2048,device_name=NVIDIA_H200.json → triton_3_1_0/E=8,N=2048,device_name=NVIDIA_H200.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=8,N=3584,device_name=AMD_Instinct_MI300X.json → triton_3_1_0/E=8,N=3584,device_name=AMD_Instinct_MI300X.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=8,N=3584,device_name=AMD_Instinct_MI325X.json → triton_3_1_0/E=8,N=3584,device_name=AMD_Instinct_MI325X.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=8,N=3584,device_name=AMD_Radeon_Graphics.json → triton_3_1_0/E=8,N=3584,device_name=AMD_Radeon_Graphics.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=8,N=3584,device_name=NVIDIA_A100-SXM4-40GB.json → triton_3_1_0/E=8,N=3584,device_name=NVIDIA_A100-SXM4-40GB.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=8,N=3584,device_name=NVIDIA_A100-SXM4-80GB.json → triton_3_1_0/E=8,N=3584,device_name=NVIDIA_A100-SXM4-80GB.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=8,N=3584,device_name=NVIDIA_GeForce_RTX_4090,dtype=fp8_w8a8.json → triton_3_1_0/E=8,N=3584,device_name=NVIDIA_GeForce_RTX_4090,dtype=fp8_w8a8.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=8,N=3584,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8.json → triton_3_1_0/E=8,N=3584,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=8,N=3584,device_name=NVIDIA_H100_80GB_HBM3.json → triton_3_1_0/E=8,N=3584,device_name=NVIDIA_H100_80GB_HBM3.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=8,N=3584,device_name=NVIDIA_H200,dtype=fp8_w8a8.json → triton_3_1_0/E=8,N=3584,device_name=NVIDIA_H200,dtype=fp8_w8a8.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=8,N=3584,device_name=NVIDIA_H200.json → triton_3_1_0/E=8,N=3584,device_name=NVIDIA_H200.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=8,N=3584,device_name=NVIDIA_L40S.json → triton_3_1_0/E=8,N=3584,device_name=NVIDIA_L40S.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=8,N=4096,device_name=AMD_Instinct_MI300X,dtype=fp8_w8a8.json → triton_3_1_0/E=8,N=4096,device_name=AMD_Instinct_MI300X,dtype=fp8_w8a8.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=8,N=4096,device_name=AMD_Instinct_MI325X,dtype=fp8_w8a8.json → triton_3_1_0/E=8,N=4096,device_name=AMD_Instinct_MI325X,dtype=fp8_w8a8.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=8,N=4096,device_name=AMD_Radeon_Graphics,dtype=fp8_w8a8.json → triton_3_1_0/E=8,N=4096,device_name=AMD_Radeon_Graphics,dtype=fp8_w8a8.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=8,N=4096,device_name=NVIDIA_A100-SXM4-80GB.json → triton_3_1_0/E=8,N=4096,device_name=NVIDIA_A100-SXM4-80GB.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=8,N=4096,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8.json → triton_3_1_0/E=8,N=4096,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=8,N=4096,device_name=NVIDIA_H100_80GB_HBM3.json → triton_3_1_0/E=8,N=4096,device_name=NVIDIA_H100_80GB_HBM3.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=8,N=4096,device_name=NVIDIA_H200,dtype=fp8_w8a8.json → triton_3_1_0/E=8,N=4096,device_name=NVIDIA_H200,dtype=fp8_w8a8.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=8,N=4096,device_name=NVIDIA_H200.json → triton_3_1_0/E=8,N=4096,device_name=NVIDIA_H200.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=8,N=7168,device_name=AMD_Instinct_MI300X.json → triton_3_1_0/E=8,N=7168,device_name=AMD_Instinct_MI300X.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=8,N=7168,device_name=AMD_Instinct_MI325X.json → triton_3_1_0/E=8,N=7168,device_name=AMD_Instinct_MI325X.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=8,N=7168,device_name=AMD_Radeon_Graphics.json → triton_3_1_0/E=8,N=7168,device_name=AMD_Radeon_Graphics.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=8,N=7168,device_name=NVIDIA_A100-SXM4-80GB.json → triton_3_1_0/E=8,N=7168,device_name=NVIDIA_A100-SXM4-80GB.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=8,N=7168,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8.json → triton_3_1_0/E=8,N=7168,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=8,N=7168,device_name=NVIDIA_H100_80GB_HBM3.json → triton_3_1_0/E=8,N=7168,device_name=NVIDIA_H100_80GB_HBM3.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=8,N=7168,device_name=NVIDIA_H200,dtype=fp8_w8a8.json → triton_3_1_0/E=8,N=7168,device_name=NVIDIA_H200,dtype=fp8_w8a8.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=8,N=7168,device_name=NVIDIA_H200.json → triton_3_1_0/E=8,N=7168,device_name=NVIDIA_H200.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=8,N=8192,device_name=AMD_Instinct_MI300X,dtype=fp8_w8a8.json → triton_3_1_0/E=8,N=8192,device_name=AMD_Instinct_MI300X,dtype=fp8_w8a8.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=8,N=8192,device_name=AMD_Instinct_MI325X,dtype=fp8_w8a8.json → triton_3_1_0/E=8,N=8192,device_name=AMD_Instinct_MI325X,dtype=fp8_w8a8.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=8,N=8192,device_name=AMD_Radeon_Graphics,dtype=fp8_w8a8.json → triton_3_1_0/E=8,N=8192,device_name=AMD_Radeon_Graphics,dtype=fp8_w8a8.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=8,N=8192,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8.json → triton_3_1_0/E=8,N=8192,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=8,N=8192,device_name=NVIDIA_H200,dtype=fp8_w8a8.json → triton_3_1_0/E=8,N=8192,device_name=NVIDIA_H200,dtype=fp8_w8a8.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=128,N=192,device_name=NVIDIA_A800-SXM4-80GB.json → triton_3_2_0/E=128,N=192,device_name=NVIDIA_A800-SXM4-80GB.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=128,N=192,device_name=NVIDIA_H100_80GB_HBM3.json → triton_3_2_0/E=128,N=192,device_name=NVIDIA_H100_80GB_HBM3.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=128,N=192,device_name=NVIDIA_H20.json → triton_3_2_0/E=128,N=192,device_name=NVIDIA_H20.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=128,N=192,device_name=NVIDIA_H200.json → triton_3_2_0/E=128,N=192,device_name=NVIDIA_H200.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=128,N=384,device_name=NVIDIA_H100_80GB_HBM3.json → triton_3_2_0/E=128,N=384,device_name=NVIDIA_H100_80GB_HBM3.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=128,N=384,device_name=NVIDIA_H20,dtype=fp8_w8a8,block_shape=[128, 128].json → triton_3_2_0/E=128,N=384,device_name=NVIDIA_H20,dtype=fp8_w8a8,block_shape=[128, 128].json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=128,N=384,device_name=NVIDIA_H20.json → triton_3_2_0/E=128,N=384,device_name=NVIDIA_H20.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=128,N=384,device_name=NVIDIA_H200,dtype=fp8_w8a8,block_shape=[128, 128].json → triton_3_2_0/E=128,N=384,device_name=NVIDIA_H200,dtype=fp8_w8a8,block_shape=[128, 128].json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=128,N=384,device_name=NVIDIA_H200.json → triton_3_2_0/E=128,N=384,device_name=NVIDIA_H200.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=128,N=512,device_name=NVIDIA_H100_80GB_HBM3.json → triton_3_2_0/E=128,N=512,device_name=NVIDIA_H100_80GB_HBM3.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=128,N=768,device_name=NVIDIA_A800-SXM4-80GB.json → triton_3_2_0/E=128,N=768,device_name=NVIDIA_A800-SXM4-80GB.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=128,N=768,device_name=NVIDIA_H100_80GB_HBM3.json → triton_3_2_0/E=128,N=768,device_name=NVIDIA_H100_80GB_HBM3.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=128,N=768,device_name=NVIDIA_H20,dtype=fp8_w8a8,block_shape=[128, 128].json → triton_3_2_0/E=128,N=768,device_name=NVIDIA_H20,dtype=fp8_w8a8,block_shape=[128, 128].json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=128,N=768,device_name=NVIDIA_H20.json → triton_3_2_0/E=128,N=768,device_name=NVIDIA_H20.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=128,N=768,device_name=NVIDIA_H200,dtype=fp8_w8a8,block_shape=[128, 128].json → triton_3_2_0/E=128,N=768,device_name=NVIDIA_H200,dtype=fp8_w8a8,block_shape=[128, 128].json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=128,N=768,device_name=NVIDIA_H200.json → triton_3_2_0/E=128,N=768,device_name=NVIDIA_H200.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=128,N=96,device_name=NVIDIA_H20.json → triton_3_2_0/E=128,N=96,device_name=NVIDIA_H20.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=264,N=128,device_name=NVIDIA_A800-SXM4-80GB,dtype=int8_w8a8.json → triton_3_2_0/E=264,N=128,device_name=NVIDIA_A800-SXM4-80GB,dtype=int8_w8a8.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=264,N=256,device_name=NVIDIA_B200,dtype=fp8_w8a8,block_shape=[128, 128].json → triton_3_2_0/E=264,N=256,device_name=NVIDIA_B200,dtype=fp8_w8a8,block_shape=[128, 128].json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=264,N=256,device_name=NVIDIA_H20,dtype=fp8_w8a8,block_shape=[128, 128].json → triton_3_2_0/E=264,N=256,device_name=NVIDIA_H20,dtype=fp8_w8a8,block_shape=[128, 128].json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=264,N=256,device_name=NVIDIA_H200,dtype=fp8_w8a8,block_shape=[128, 128].json → triton_3_2_0/E=264,N=256,device_name=NVIDIA_H200,dtype=fp8_w8a8,block_shape=[128, 128].json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=272,N=128,device_name=NVIDIA_A800-SXM4-80GB,dtype=int8_w8a8.json → triton_3_2_0/E=272,N=128,device_name=NVIDIA_A800-SXM4-80GB,dtype=int8_w8a8.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=272,N=128,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8,block_shape=[128, 128].json → triton_3_2_0/E=272,N=128,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8,block_shape=[128, 128].json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=272,N=128,device_name=NVIDIA_H20,dtype=fp8_w8a8,block_shape=[128, 128].json → triton_3_2_0/E=272,N=128,device_name=NVIDIA_H20,dtype=fp8_w8a8,block_shape=[128, 128].json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=272,N=64,device_name=NVIDIA_A800-SXM4-80GB.json → triton_3_2_0/E=272,N=64,device_name=NVIDIA_A800-SXM4-80GB.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=288,N=64,device_name=NVIDIA_A800-SXM4-80GB.json → triton_3_2_0/E=288,N=64,device_name=NVIDIA_A800-SXM4-80GB.json} +0 -0
- {sglang-0.4.6.post5.dist-info → sglang-0.4.7.dist-info}/licenses/LICENSE +0 -0
- {sglang-0.4.6.post5.dist-info → sglang-0.4.7.dist-info}/top_level.txt +0 -0
sglang/srt/disaggregation/prefill.py

```diff
@@ -417,6 +417,8 @@ class SchedulerDisaggregationPrefillMixin:
                 self.tree_cache.cache_finished_req(req)  # unlock the tree
                 req.finished_reason = FINISH_LENGTH(length=0)
                 # FIXME: clean up req's data in transfer engine
+                if hasattr(req.disagg_kv_sender, "clear"):
+                    req.disagg_kv_sender.clear()
                 done_reqs.append(req)
             elif poll == KVPoll.Failed:
                 error_message = f"Prefill transfer failed for request rank={self.tp_rank} {req.rid=} {req.bootstrap_room=}"
```
sglang/srt/disaggregation/utils.py

```diff
@@ -3,6 +3,7 @@ from __future__ import annotations
 import dataclasses
 import os
 import random
+import threading
 import warnings
 from collections import deque
 from enum import Enum
@@ -13,7 +14,7 @@ import requests
 import torch
 import torch.distributed as dist
 
-from sglang.srt.utils import get_ip
+from sglang.srt.utils import get_ip, get_local_ip_by_remote
 
 if TYPE_CHECKING:
     from sglang.srt.managers.schedule_batch import Req
@@ -279,3 +280,39 @@ class MetadataBuffers:
             ] = torch.tensor(
                 req.output_top_logprobs_idx[0], dtype=torch.int32, device="cpu"
             )
+
+
+class FastQueue:
+    def __init__(self):
+        self._buf = deque()
+        self._cond = threading.Condition()
+
+    def put(self, item):
+        with self._cond:
+            self._buf.append(item)
+            # wake up a thread of wait()
+            self._cond.notify()
+
+    def get(self):
+        with self._cond:
+            # if queue is empty ,block until is notified()
+            while not self._buf:
+                self._cond.wait()
+            return self._buf.popleft()
+
+
+def group_concurrent_contiguous(
+    src_indices: npt.NDArray[np.int64], dst_indices: npt.NDArray[np.int64]
+) -> Tuple[List[npt.NDArray[np.int64]], List[npt.NDArray[np.int64]]]:
+    """Vectorised NumPy implementation."""
+    if src_indices.size == 0:
+        return [], []
+
+    brk = np.where((np.diff(src_indices) != 1) | (np.diff(dst_indices) != 1))[0] + 1
+    src_groups = np.split(src_indices, brk)
+    dst_groups = np.split(dst_indices, brk)
+
+    src_groups = [g.tolist() for g in src_groups]
+    dst_groups = [g.tolist() for g in dst_groups]
+
+    return src_groups, dst_groups
```
sglang/srt/distributed/device_communicators/pymscclpp.py (new file)

```diff
@@ -0,0 +1,315 @@
+import bisect
+import logging
+import math
+import os
+from contextlib import contextmanager
+from enum import IntEnum
+from typing import Any, Callable, List, Optional, TypeVar, Union
+
+import torch
+import torch.distributed as dist
+from torch.distributed import ProcessGroup, ReduceOp
+
+from sglang.srt import _custom_ops as ops
+from sglang.srt.utils import is_cuda, is_hip
+
+logger = logging.getLogger(__name__)
+
+_is_cuda = is_cuda()
+_is_hip = is_hip()
+
+mscclpp_is_available = False
+if _is_hip:
+    # TODO(zyksir): mscclpp is untested on AMD and therefore disabled.
+    mscclpp_is_available = False
+if _is_cuda:
+    try:
+        import sgl_kernel
+
+        mscclpp_is_available = True
+    except:
+        mscclpp_is_available = False
+
+
+class MscclContextSelection(IntEnum):
+    MSCCL1SHOT1NODELL = 1
+    MSCCL1SHOT2NODELL = 2
+
+
+def mscclpp_is_weak_contiguous(inp: torch.Tensor):
+    return inp.is_contiguous() or (
+        inp.storage().nbytes() - inp.storage_offset() * inp.element_size()
+        == inp.numel() * inp.element_size()
+    )
+
+
+def mscclpp_convert_to_bytes(size_str):
+    """
+    Converts a human-readable size string (e.g., "1MB", "2.5kb", "3 GB")
+    into the equivalent number of bytes using binary units.
+
+    Args:
+        size_str (str): A string representing size with unit (KB, MB, GB).
+
+    Returns:
+        int: Number of bytes.
+    """
+    size_str = size_str.strip().lower()
+
+    if not size_str:
+        raise ValueError("Empty input string")
+
+    # Extract numeric part and unit
+    for i in range(len(size_str)):
+        if not size_str[i].isdigit() and size_str[i] != ".":
+            break
+    num_str = size_str[:i]
+    unit = size_str[i:].strip()
+
+    try:
+        num = float(num_str)
+    except ValueError:
+        raise ValueError(f"Invalid numeric value in '{size_str}'")
+
+    # Conversion factors
+    if unit == "b":
+        return int(num)
+    elif unit == "kb":
+        return int(num * 1024)
+    elif unit == "mb":
+        return int(num * 1024 * 1024)
+    elif unit == "gb":
+        return int(num * 1024 * 1024 * 1024)
+    else:
+        raise ValueError(f"Unsupported unit: {unit}, support B, KB, MB, GB only")
+
+
+def mscclpp_bench_time(func, test_niter: int = 10, warmup_niter: int = 2):
+    # warmup
+    for _ in range(warmup_niter):
+        func()
+    start_event = torch.cuda.Event(enable_timing=True)
+    end_event = torch.cuda.Event(enable_timing=True)
+    torch.cuda.synchronize()
+    dist.barrier()
+    start_event.record()
+    for _ in range(test_niter):
+        func()
+    end_event.record()
+    end_event.synchronize()
+    func_cost_us = start_event.elapsed_time(end_event) / test_niter * 1000
+    return func_cost_us
+
+
+class PyMscclppCommunicator:
+    _SUPPORTED_WORLD_SIZES = [8, 16]
+    _MAX_BYTES = mscclpp_convert_to_bytes(os.getenv("SGLANG_MSCCLPP_MAX_BYTES", "1MB"))
+    _SUPPORTED_DTYPE = [torch.float, torch.float16, torch.bfloat16]
+
+    # max_bytes: max supported mscclpp allreduce size
+    # in A100 mscclpp is faster than nccl only under condition of msg size smaller than1MB
+    def __init__(
+        self,
+        group: ProcessGroup,
+        device: Union[int, str, torch.device],
+        max_bytes=_MAX_BYTES,
+    ) -> None:
+        """
+        Args:
+            group: the process group to work on. If None, it will use the
+                default process group.
+            device: the device to bind the CustomAllreduce to. If None,
+                it will be bind to f"cuda:{local_rank}".
+        It is the caller's responsibility to make sure each communicator
+        is bind to a unique device, and all communicators in this group
+        are in the same node.
+        """
+        self._IS_CAPTURING = False
+        self.disabled = True
+
+        if not mscclpp_is_available:
+            # disable because of missing mscclpp library
+            # e.g. in a non-cuda environment
+            return
+
+        self.group = group
+
+        assert (
+            dist.get_backend(group) != dist.Backend.NCCL
+        ), "CustomAllreduce should be attached to a non-NCCL group."
+
+        rank = dist.get_rank(group=self.group)
+        world_size = dist.get_world_size(group=self.group)
+        if world_size == 1:
+            # No need to initialize mscclpp for single GPU case.
+            return
+
+        if world_size not in PyMscclppCommunicator._SUPPORTED_WORLD_SIZES:
+            logger.warning(
+                "PyMscclpp is disabled due to an unsupported world"
+                " size: %d. Supported world sizes: %s. To silence this "
+                "warning, specify disable_mscclpp=True explicitly.",
+                world_size,
+                str(PyMscclppCommunicator._SUPPORTED_WORLD_SIZES),
+            )
+            return
+
+        self.ranks = torch.distributed.get_process_group_ranks(group)
+        self.nranks_per_node = torch.cuda.device_count()
+        # for now mscclpp with stride in the communicator is not tested
+        if not (abs(self.ranks[-1] - self.ranks[0]) == world_size - 1):
+            logger.warning(
+                "PyMscclpp is disabled due to an unsupported group %s."
+                "Please ensure all ranks in the group are consecutive."
+                "To silence this warning, specify disable_mscclpp=True explicitly.",
+                str(self.ranks),
+            )
+            return
+
+        if isinstance(device, int):
+            device = torch.device(f"cuda:{device}")
+        elif isinstance(device, str):
+            device = torch.device(device)
+        # now `device` is a `torch.device` object
+        assert isinstance(device, torch.device)
+        self.device = device
+
+        self.max_bytes = max_bytes
+        self.rank = rank
+        self.world_size = world_size
+
+        if dist.get_rank(group) == 0:
+            unique_id = [ops.mscclpp_generate_unique_id()]
+        else:
+            unique_id = [None]
+        dist.broadcast_object_list(unique_id, src=self.ranks[0], group=self.group)
+        self.unique_id = unique_id[0]
+        self.rank_to_node, self.rank_to_ib = list(range(world_size)), list(
+            range(world_size)
+        )
+        for r in range(world_size):
+            self.rank_to_node[r] = r // 8
+            self.rank_to_ib[r] = self.rank % 8
+
+        self._context = None
+        self.context_selection = None
+        self.msg_size_for_finetune = [
+            2**i for i in range(10, math.floor(math.log2(self.max_bytes)) + 1)
+        ]
+        self.msg_size2best_config = {}
+        if world_size == 8:
+            self.context_selection = MscclContextSelection.MSCCL1SHOT1NODELL
+        elif world_size == 16:
+            self.context_selection = MscclContextSelection.MSCCL1SHOT2NODELL
+        if not _is_hip:
+            self.scratch = torch.empty(
+                self.max_bytes * 8,
+                dtype=torch.uint8,
+                device=self.device,
+            )
+            self.put_buffer = torch.empty(
+                self.max_bytes * 8 // self.nranks_per_node,
+                dtype=torch.uint8,
+                device=self.device,
+            )
+            self._context = ops.mscclpp_init_context(
+                self.unique_id,
+                self.rank,
+                self.world_size,
+                self.scratch,
+                self.put_buffer,
+                self.nranks_per_node,
+                self.rank_to_node,
+                self.rank_to_ib,
+                int(self.context_selection),
+            )
+        else:
+            raise NotImplementedError("HIP Mscclpp is not supported yet.")
+
+        self.msg_size2best_config = {}
+        self.pre_tune_config()
+        if dist.get_rank(group) == 0:
+            msg_size2best_config = [self.msg_size2best_config]
+        else:
+            msg_size2best_config = [None]
+        dist.broadcast_object_list(
+            msg_size2best_config, src=self.ranks[0], group=self.group
+        )
+        self.msg_size2best_config = msg_size2best_config[0]
+
+        # PyMscclpp is enabled only in cuda graph
+        self.disabled = True
+
+    def pre_tune_config(self, dtype=torch.bfloat16) -> bool:
+        logger.debug(f"start to pre-tune configs for rank {self.rank}")
+        nthreads_to_try = [256, 512, 1024]
+        nblocks_to_try = [21, 42, 84]
+        inp_randn = torch.ones(
+            self.msg_size_for_finetune[-1] // dtype.itemsize, dtype=dtype, device="cuda"
+        )
+        oup_randn = torch.empty_like(inp_randn)
+        for msg_size in self.msg_size_for_finetune:
+            mock_inp, mock_outp = (
+                inp_randn[: msg_size // dtype.itemsize],
+                oup_randn[: msg_size // dtype.itemsize],
+            )
+            best_config, best_time = None, None
+            for nthreads in nthreads_to_try:
+                for nblocks in nblocks_to_try:
+                    cur_cost = mscclpp_bench_time(
+                        lambda: ops.mscclpp_allreduce(
+                            self._context, mock_inp, mock_outp, nthreads, nblocks
+                        )
+                    )
+                    if best_time is None or cur_cost < best_time:
+                        best_config = (nthreads, nblocks)
+                        best_time = cur_cost
+            self.msg_size2best_config[msg_size] = best_config
+            if self.rank == 0:
+                logger.debug(
+                    f"for msg_size {msg_size}, best_config: {best_config}, best_time: {best_time}us"
+                )
+
+    def should_mscclpp_allreduce(
+        self, inp: torch.Tensor, op: ReduceOp = ReduceOp.SUM
+    ) -> bool:
+        if self.disabled or self._context is None:
+            return False
+        if inp.dtype not in PyMscclppCommunicator._SUPPORTED_DTYPE:
+            return False
+        if not mscclpp_is_weak_contiguous(inp):
+            return False
+        # only support sum op
+        if op != ReduceOp.SUM:
+            return False
+        if inp.numel() * inp.element_size() > self.max_bytes:
+            return False
+        return True
+
+    def all_reduce(self, tensor: torch.Tensor, op: ReduceOp = ReduceOp.SUM):
+        if self._IS_CAPTURING:
+            if torch.cuda.is_current_stream_capturing():
+                self.graph_input_set.add((tensor.dtype, tensor.numel()))
+        msg_size = tensor.numel() * tensor.itemsize
+        index = bisect.bisect_left(self.msg_size_for_finetune, msg_size)
+        msg_size_finetune = self.msg_size_for_finetune[index]
+        nthreads, nblocks = self.msg_size2best_config[msg_size_finetune]
+        result = torch.empty_like(tensor)
+        ops.mscclpp_allreduce(self._context, tensor, result, nthreads, nblocks)
+        return result
+
+    @contextmanager
+    def change_state(
+        self,
+        enable: Optional[bool] = None,
+    ):
+        if enable is None:
+            # guess a default value when not specified
+            enable = self.available
+
+        old_disable = self.disabled
+        self.disabled = not enable
+
+        yield
+
+        self.disabled = old_disable
```
sglang/srt/distributed/parallel_state.py

```diff
@@ -41,6 +41,7 @@ from torch.distributed import Backend, ProcessGroup
 
 from sglang.srt.utils import (
     direct_register_custom_op,
+    get_bool_env_var,
     is_cuda_alike,
     is_npu,
     supports_custom_op,
@@ -189,6 +190,7 @@ class GroupCoordinator:
     cpu_group: ProcessGroup  # group for CPU communication
     device_group: ProcessGroup  # group for device communication
     use_pynccl: bool  # a hint of whether to use PyNccl
+    use_pymscclpp: bool  # a hint of whether to use PyMsccl
     use_custom_allreduce: bool  # a hint of whether to use CustomAllreduce
     use_message_queue_broadcaster: (
         bool  # a hint of whether to use message queue broadcaster
@@ -204,6 +206,7 @@ class GroupCoordinator:
         local_rank: int,
         torch_distributed_backend: Union[str, Backend],
         use_pynccl: bool,
+        use_pymscclpp: bool,
         use_custom_allreduce: bool,
         use_hpu_communicator: bool,
         use_xpu_communicator: bool,
@@ -243,6 +246,7 @@ class GroupCoordinator:
             self.device = torch.device("cpu")
 
         self.use_pynccl = use_pynccl
+        self.use_pymscclpp = use_pymscclpp
         self.use_custom_allreduce = use_custom_allreduce
         self.use_hpu_communicator = use_hpu_communicator
         self.use_xpu_communicator = use_xpu_communicator
@@ -264,6 +268,17 @@ class GroupCoordinator:
                 device=self.device,
             )
 
+        from sglang.srt.distributed.device_communicators.pymscclpp import (
+            PyMscclppCommunicator,
+        )
+
+        self.pymscclpp_comm: Optional[PyMscclppCommunicator] = None
+        if use_pymscclpp and self.world_size > 1:
+            self.pymscclpp_comm = PyMscclppCommunicator(
+                group=self.cpu_group,
+                device=self.device,
+            )
+
         self.ca_comm: Optional[CustomAllreduce] = None
         if use_custom_allreduce and self.world_size > 1:
             # Initialize a custom fast all-reduce implementation.
@@ -372,11 +387,15 @@ class GroupCoordinator:
         # --------------------------------------------
         # custom allreduce  | enabled | enabled |
         # PyNccl            | disabled| enabled |
+        # PyMscclpp         | disabled| enabled |
         # torch.distributed | enabled | disabled|
         #
         # Note that custom allreduce will have a runtime check, if the
         # tensor size is too large, it will fallback to the next
         # available option.
+        # Note that the PyMsccl needs to register the tensor in ahead,
+        # which will introduce large overhead in the eager case,
+        # therefore it is only supported in the graph case.
         # In summary: When using CUDA graph, we use
         # either custom all-reduce kernel or pynccl. When not using
         # CUDA graph, we use either custom all-reduce kernel or
@@ -391,7 +410,14 @@ class GroupCoordinator:
             maybe_pynccl_context = pynccl_comm.change_state(
                 enable=True, stream=torch.cuda.current_stream()
             )
-        with maybe_pynccl_context:
+
+        pymscclpp_comm = self.pymscclpp_comm
+        maybe_pymscclpp_context: Any
+        if not pymscclpp_comm:
+            maybe_pymscclpp_context = nullcontext()
+        else:
+            maybe_pymscclpp_context = pymscclpp_comm.change_state(enable=True)
+        with maybe_pynccl_context, maybe_pymscclpp_context:
             yield graph_capture_context
 
     def all_reduce(self, input_: torch.Tensor) -> torch.Tensor:
@@ -436,6 +462,10 @@ class GroupCoordinator:
             self.ca_comm is not None
             and not self.ca_comm.disabled
             and self.ca_comm.should_custom_ar(input_)
+        ) or (
+            self.pymscclpp_comm is not None
+            and not self.pymscclpp_comm.disabled
+            and self.pymscclpp_comm.should_mscclpp_allreduce(input_)
         ):
             return torch.ops.sglang.outplace_all_reduce(
                 input_, group_name=self.unique_name
@@ -446,9 +476,13 @@ class GroupCoordinator:
 
     def _all_reduce_out_place(self, input_: torch.Tensor) -> torch.Tensor:
         ca_comm = self.ca_comm
-        assert ca_comm is not None
-        assert not ca_comm.disabled
-        out = ca_comm.custom_all_reduce(input_)
+        pymscclpp_comm = self.pymscclpp_comm
+        assert ca_comm is not None or pymscclpp_comm is not None
+        if ca_comm is not None and not ca_comm.disabled:
+            out = ca_comm.custom_all_reduce(input_)
+        else:
+            assert not pymscclpp_comm.disabled
+            out = pymscclpp_comm.all_reduce(input_)
         assert out is not None
         return out
 
@@ -957,6 +991,7 @@ def init_world_group(
         local_rank=local_rank,
         torch_distributed_backend=backend,
         use_pynccl=False,
+        use_pymscclpp=False,
         use_custom_allreduce=False,
         use_hpu_communicator=False,
         use_xpu_communicator=False,
@@ -972,14 +1007,18 @@ def init_model_parallel_group(
     use_custom_allreduce: Optional[bool] = None,
     use_message_queue_broadcaster: bool = False,
     group_name: Optional[str] = None,
+    use_mscclpp_allreduce: Optional[bool] = None,
 ) -> GroupCoordinator:
     if use_custom_allreduce is None:
         use_custom_allreduce = _ENABLE_CUSTOM_ALL_REDUCE
+    if use_mscclpp_allreduce is None:
+        use_mscclpp_allreduce = _ENABLE_MSCCLPP_ALL_REDUCE
     return GroupCoordinator(
         group_ranks=group_ranks,
         local_rank=local_rank,
         torch_distributed_backend=backend,
         use_pynccl=not is_npu(),
+        use_pymscclpp=use_mscclpp_allreduce,
         use_custom_allreduce=use_custom_allreduce,
         use_hpu_communicator=True,
         use_xpu_communicator=True,
@@ -1036,6 +1075,7 @@ def graph_capture():
 logger = logging.getLogger(__name__)
 
 _ENABLE_CUSTOM_ALL_REDUCE = True
+_ENABLE_MSCCLPP_ALL_REDUCE = False
 
 
 def set_custom_all_reduce(enable: bool):
@@ -1043,6 +1083,11 @@ def set_custom_all_reduce(enable: bool):
     _ENABLE_CUSTOM_ALL_REDUCE = enable
 
 
+def set_mscclpp_all_reduce(enable: bool):
+    global _ENABLE_MSCCLPP_ALL_REDUCE
+    _ENABLE_MSCCLPP_ALL_REDUCE = enable
+
+
 def init_distributed_environment(
     world_size: int = -1,
     rank: int = -1,
@@ -1153,7 +1198,9 @@
         group_ranks,
         get_world_group().local_rank,
         backend,
-        use_message_queue_broadcaster=True,
+        use_message_queue_broadcaster=get_bool_env_var(
+            "SGLANG_USE_MESSAGE_QUEUE_BROADCASTER", "true"
+        ),
         group_name="tp",
     )
 
```
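
The comment block updated above describes the intended ordering of all-reduce backends: the custom all-reduce kernel is tried first, PyMscclpp is used only under CUDA graph capture and only for small messages (its buffers must be registered ahead of time), and PyNccl or plain `torch.distributed` serve as fallbacks. The sketch below is a simplified, hypothetical model of that decision order; the helper name and the byte threshold are illustrative assumptions, and the real `GroupCoordinator` also checks dtype, contiguity, and device-specific communicators.

```python
def pick_all_reduce_path(
    msg_bytes: int,
    in_cuda_graph: bool,
    custom_ar_ok: bool,
    mscclpp_max_bytes: int = 1024 * 1024,  # default SGLANG_MSCCLPP_MAX_BYTES ("1MB")
) -> str:
    """Simplified mirror of the dispatch order in all_reduce/_all_reduce_out_place."""
    # The custom all-reduce kernel is preferred whenever its own runtime
    # checks (size, dtype, contiguity) pass.
    if custom_ar_ok:
        return "custom_allreduce"
    # PyMscclpp pre-registers buffers, so it is only enabled under CUDA graph
    # capture, and only up to its tuned maximum message size.
    if in_cuda_graph and msg_bytes <= mscclpp_max_bytes:
        return "pymscclpp"
    # Otherwise fall back to PyNccl inside graphs, torch.distributed in eager mode.
    return "pynccl" if in_cuda_graph else "torch.distributed"


if __name__ == "__main__":
    print(pick_all_reduce_path(256 * 1024, in_cuda_graph=True, custom_ar_ok=True))    # custom_allreduce
    print(pick_all_reduce_path(256 * 1024, in_cuda_graph=True, custom_ar_ok=False))   # pymscclpp
    print(pick_all_reduce_path(256 * 1024, in_cuda_graph=False, custom_ar_ok=False))  # torch.distributed
```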
sglang/srt/entrypoints/EngineBase.py

```diff
@@ -23,6 +23,12 @@ class EngineBase(ABC):
         token_ids_logprob: Optional[Union[List[List[int]], List[int]]] = None,
         lora_path: Optional[Union[List[Optional[str]], Optional[str]]] = None,
         custom_logit_processor: Optional[Union[List[str], str]] = None,
+        return_hidden_states: Optional[bool] = None,
+        stream: Optional[bool] = None,
+        bootstrap_host: Optional[Union[List[str], str]] = None,
+        bootstrap_port: Optional[Union[List[int], int]] = None,
+        bootstrap_room: Optional[Union[List[int], int]] = None,
+        data_parallel_rank: Optional[int] = None,
     ) -> Union[Dict, Iterator[Dict]]:
         """Generate outputs based on given inputs."""
         pass
```
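
The new keyword arguments on `EngineBase.generate` expose the prefill/decode-disaggregation bootstrap fields and data-parallel routing through the offline engine API. Below is a hedged usage sketch, assuming the concrete `sglang.Engine.generate` mirrors this abstract signature; the model path, port, and room id are placeholders, and the bootstrap fields only make sense in a PD-disaggregated deployment.

```python
# Not executed here: requires an sglang install, a GPU, and model weights.
import sglang as sgl

if __name__ == "__main__":
    engine = sgl.Engine(model_path="meta-llama/Llama-3.1-8B-Instruct")  # placeholder model
    out = engine.generate(
        prompt="The capital of France is",
        sampling_params={"max_new_tokens": 8, "temperature": 0.0},
        # New in 0.4.7 per the diff above; placeholders for illustration only.
        bootstrap_host="127.0.0.1",
        bootstrap_port=8998,
        bootstrap_room=0,
        data_parallel_rank=0,
        stream=False,
    )
    print(out)
```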