sglang 0.4.6.post5__py3-none-any.whl → 0.4.7.post1__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- sglang/__init__.py +2 -0
- sglang/api.py +7 -0
- sglang/bench_offline_throughput.py +10 -4
- sglang/bench_one_batch_server.py +67 -11
- sglang/bench_serving.py +86 -75
- sglang/lang/backend/runtime_endpoint.py +24 -1
- sglang/lang/interpreter.py +40 -1
- sglang/lang/ir.py +27 -0
- sglang/math_utils.py +8 -0
- sglang/profiler.py +167 -0
- sglang/srt/_custom_ops.py +34 -0
- sglang/srt/configs/internvl.py +8 -12
- sglang/srt/configs/model_config.py +33 -1
- sglang/srt/constrained/base_grammar_backend.py +5 -2
- sglang/srt/constrained/llguidance_backend.py +9 -8
- sglang/srt/constrained/outlines_backend.py +5 -4
- sglang/srt/constrained/xgrammar_backend.py +18 -18
- sglang/srt/conversation.py +52 -8
- sglang/srt/custom_op.py +38 -3
- sglang/srt/debug_utils.py +74 -0
- sglang/srt/disaggregation/base/__init__.py +1 -1
- sglang/srt/disaggregation/base/conn.py +25 -11
- sglang/srt/disaggregation/common/__init__.py +5 -0
- sglang/srt/disaggregation/common/conn.py +407 -0
- sglang/srt/disaggregation/common/utils.py +42 -0
- sglang/srt/disaggregation/decode.py +261 -52
- sglang/srt/disaggregation/fake/__init__.py +1 -1
- sglang/srt/disaggregation/fake/conn.py +16 -9
- sglang/srt/disaggregation/kv_events.py +60 -5
- sglang/srt/disaggregation/launch_lb.py +140 -0
- sglang/srt/disaggregation/mini_lb.py +29 -48
- sglang/srt/disaggregation/mooncake/__init__.py +1 -1
- sglang/srt/disaggregation/mooncake/conn.py +446 -149
- sglang/srt/disaggregation/mooncake/transfer_engine.py +32 -16
- sglang/srt/disaggregation/nixl/__init__.py +6 -1
- sglang/srt/disaggregation/nixl/conn.py +134 -437
- sglang/srt/disaggregation/prefill.py +130 -43
- sglang/srt/disaggregation/utils.py +127 -86
- sglang/srt/distributed/device_communicators/pymscclpp.py +315 -0
- sglang/srt/distributed/parallel_state.py +52 -5
- sglang/srt/entrypoints/EngineBase.py +6 -0
- sglang/srt/entrypoints/engine.py +116 -5
- sglang/srt/entrypoints/http_server.py +28 -4
- sglang/srt/eplb_simulator/__init__.py +1 -0
- sglang/srt/eplb_simulator/reader.py +51 -0
- sglang/srt/function_call/base_format_detector.py +138 -86
- sglang/srt/function_call/deepseekv3_detector.py +54 -6
- sglang/srt/function_call/ebnf_composer.py +33 -19
- sglang/srt/function_call/function_call_parser.py +27 -0
- sglang/srt/function_call/llama32_detector.py +33 -14
- sglang/srt/function_call/mistral_detector.py +73 -26
- sglang/srt/function_call/pythonic_detector.py +86 -20
- sglang/srt/function_call/qwen25_detector.py +64 -10
- sglang/srt/function_call/utils.py +17 -0
- sglang/srt/hf_transformers_utils.py +4 -0
- sglang/srt/layers/activation.py +19 -0
- sglang/srt/layers/attention/aiter_backend.py +503 -125
- sglang/srt/layers/attention/base_attn_backend.py +4 -0
- sglang/srt/layers/attention/cutlass_mla_backend.py +40 -34
- sglang/srt/layers/attention/flashattention_backend.py +137 -63
- sglang/srt/layers/attention/flashinfer_backend.py +46 -3
- sglang/srt/layers/attention/flashinfer_mla_backend.py +59 -25
- sglang/srt/layers/attention/flashmla_backend.py +2 -10
- sglang/srt/layers/attention/intel_amx_backend.py +128 -0
- sglang/srt/layers/attention/tbo_backend.py +232 -0
- sglang/srt/layers/attention/torch_native_backend.py +3 -0
- sglang/srt/layers/attention/triton_backend.py +304 -65
- sglang/srt/layers/attention/triton_ops/decode_attention.py +2 -7
- sglang/srt/layers/attention/triton_ops/extend_attention.py +12 -4
- sglang/srt/layers/attention/vision.py +51 -24
- sglang/srt/layers/communicator.py +281 -197
- sglang/srt/layers/dp_attention.py +6 -5
- sglang/srt/layers/layernorm.py +30 -19
- sglang/srt/layers/linear.py +0 -4
- sglang/srt/layers/logits_processor.py +0 -12
- sglang/srt/layers/moe/cutlass_moe.py +170 -7
- sglang/srt/layers/moe/cutlass_moe_params.py +169 -0
- sglang/srt/layers/moe/ep_moe/kernels.py +33 -11
- sglang/srt/layers/moe/ep_moe/layer.py +136 -72
- sglang/srt/layers/moe/ep_moe/token_dispatcher.py +24 -45
- sglang/srt/layers/moe/fused_moe_native.py +4 -0
- sglang/srt/layers/moe/fused_moe_triton/configs/triton_3_2_0/E=257,N=128,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8,block_shape=[128, 128].json +146 -0
- sglang/srt/layers/moe/fused_moe_triton/configs/triton_3_2_0/E=257,N=128,device_name=NVIDIA_H20,dtype=fp8_w8a8,block_shape=[128, 128].json +146 -0
- sglang/srt/layers/moe/fused_moe_triton/configs/triton_3_2_0/E=257,N=256,device_name=NVIDIA_B200,dtype=fp8_w8a8,block_shape=[128, 128].json +146 -0
- sglang/srt/layers/moe/fused_moe_triton/configs/triton_3_2_0/E=257,N=256,device_name=NVIDIA_H20,dtype=fp8_w8a8,block_shape=[128, 128].json +146 -0
- sglang/srt/layers/moe/fused_moe_triton/configs/triton_3_2_0/E=257,N=256,device_name=NVIDIA_H200,dtype=fp8_w8a8,block_shape=[128, 128].json +146 -0
- sglang/srt/layers/moe/fused_moe_triton/configs/triton_3_2_0/E=8,N=7168,device_name=NVIDIA_H100_80GB_HBM3.json +146 -0
- sglang/srt/layers/moe/fused_moe_triton/configs/triton_3_3_1/E=257,N=256,device_name=NVIDIA_B200,dtype=fp8_w8a8,block_shape=[128, 128].json +146 -0
- sglang/srt/layers/moe/fused_moe_triton/configs/triton_3_3_1/E=8,N=7168,device_name=NVIDIA_H100_80GB_HBM3.json +146 -0
- sglang/srt/layers/moe/fused_moe_triton/fused_moe.py +221 -29
- sglang/srt/layers/moe/fused_moe_triton/layer.py +34 -4
- sglang/srt/layers/moe/topk.py +60 -26
- sglang/srt/layers/multimodal.py +3 -3
- sglang/srt/layers/pooler.py +56 -0
- sglang/srt/layers/quantization/__init__.py +3 -2
- sglang/srt/layers/quantization/blockwise_int8.py +3 -0
- sglang/srt/layers/quantization/compressed_tensors/compressed_tensors_moe.py +5 -0
- sglang/srt/layers/quantization/deep_gemm_wrapper/__init__.py +1 -0
- sglang/srt/layers/quantization/{deep_gemm.py → deep_gemm_wrapper/compile_utils.py} +69 -127
- sglang/srt/layers/quantization/deep_gemm_wrapper/configurer.py +32 -0
- sglang/srt/layers/quantization/deep_gemm_wrapper/entrypoint.py +110 -0
- sglang/srt/layers/quantization/fp8.py +28 -23
- sglang/srt/layers/quantization/fp8_kernel.py +156 -75
- sglang/srt/layers/quantization/fp8_utils.py +250 -69
- sglang/srt/layers/quantization/modelopt_quant.py +334 -7
- sglang/srt/layers/quantization/moe_wna16.py +3 -0
- sglang/srt/layers/quantization/w8a8_fp8.py +3 -0
- sglang/srt/layers/quantization/w8a8_int8.py +3 -0
- sglang/srt/layers/radix_attention.py +2 -3
- sglang/srt/layers/rotary_embedding.py +6 -12
- sglang/srt/layers/sampler.py +80 -79
- sglang/srt/layers/utils.py +6 -0
- sglang/srt/lora/layers.py +12 -15
- sglang/srt/lora/lora.py +49 -5
- sglang/srt/lora/lora_manager.py +98 -39
- sglang/srt/lora/mem_pool.py +28 -21
- sglang/srt/lora/utils.py +17 -13
- sglang/srt/managers/cache_controller.py +2 -1
- sglang/srt/managers/data_parallel_controller.py +13 -5
- sglang/srt/managers/eplb_algorithms/__init__.py +63 -0
- sglang/srt/managers/eplb_algorithms/deepseek.py +223 -0
- sglang/srt/managers/{deepseek_eplb.py → eplb_algorithms/deepseek_vec.py} +5 -7
- sglang/srt/managers/eplb_manager.py +55 -14
- sglang/srt/managers/expert_distribution.py +220 -46
- sglang/srt/managers/expert_location.py +110 -56
- sglang/srt/managers/expert_location_dispatch.py +23 -6
- sglang/srt/managers/io_struct.py +43 -8
- sglang/srt/managers/mm_utils.py +88 -38
- sglang/srt/managers/multimodal_processors/base_processor.py +190 -18
- sglang/srt/managers/multimodal_processors/gemma3.py +4 -31
- sglang/srt/managers/multimodal_processors/internvl.py +4 -0
- sglang/srt/managers/multimodal_processors/kimi_vl.py +15 -34
- sglang/srt/managers/multimodal_processors/minicpm.py +2 -1
- sglang/srt/managers/multimodal_processors/phi4mm.py +87 -0
- sglang/srt/managers/multimodal_processors/qwen_vl.py +22 -64
- sglang/srt/managers/multimodal_processors/vila.py +85 -0
- sglang/srt/managers/schedule_batch.py +173 -38
- sglang/srt/managers/scheduler.py +376 -127
- sglang/srt/managers/tokenizer_manager.py +163 -19
- sglang/srt/managers/utils.py +0 -4
- sglang/srt/mem_cache/chunk_cache.py +1 -0
- sglang/srt/mem_cache/hiradix_cache.py +4 -2
- sglang/srt/mem_cache/memory_pool.py +111 -407
- sglang/srt/mem_cache/memory_pool_host.py +380 -0
- sglang/srt/mem_cache/radix_cache.py +36 -12
- sglang/srt/metrics/collector.py +9 -0
- sglang/srt/model_executor/cuda_graph_runner.py +191 -113
- sglang/srt/model_executor/expert_location_updater.py +157 -22
- sglang/srt/model_executor/forward_batch_info.py +52 -22
- sglang/srt/model_executor/model_runner.py +102 -62
- sglang/srt/model_loader/loader.py +8 -1
- sglang/srt/model_loader/utils.py +67 -1
- sglang/srt/models/bert.py +113 -13
- sglang/srt/models/deepseek_nextn.py +1 -1
- sglang/srt/models/deepseek_v2.py +623 -290
- sglang/srt/models/gemma3_causal.py +7 -0
- sglang/srt/models/gemma3_mm.py +19 -14
- sglang/srt/models/idefics2.py +342 -0
- sglang/srt/models/internvl.py +46 -102
- sglang/srt/models/kimi_vl.py +4 -4
- sglang/srt/models/llama.py +1 -1
- sglang/srt/models/minicpmo.py +2 -5
- sglang/srt/models/minicpmv.py +3 -295
- sglang/srt/models/phi4mm.py +512 -0
- sglang/srt/models/qwen2.py +38 -9
- sglang/srt/models/qwen2_5_vl.py +3 -9
- sglang/srt/models/qwen2_eagle.py +4 -1
- sglang/srt/models/qwen2_moe.py +58 -191
- sglang/srt/models/qwen2_vl.py +3 -9
- sglang/srt/models/qwen3.py +41 -10
- sglang/srt/models/qwen3_moe.py +230 -191
- sglang/srt/models/registry.py +9 -1
- sglang/srt/models/roberta.py +117 -9
- sglang/srt/models/transformers.py +291 -0
- sglang/srt/models/vila.py +305 -0
- sglang/srt/openai_api/adapter.py +248 -28
- sglang/srt/openai_api/protocol.py +68 -3
- sglang/srt/openai_api/utils.py +172 -0
- sglang/srt/operations.py +37 -2
- sglang/srt/operations_strategy.py +200 -24
- sglang/srt/sampling/sampling_batch_info.py +37 -1
- sglang/srt/sampling/sampling_params.py +4 -1
- sglang/srt/server_args.py +381 -209
- sglang/srt/speculative/build_eagle_tree.py +9 -9
- sglang/srt/speculative/eagle_draft_cuda_graph_runner.py +12 -14
- sglang/srt/speculative/eagle_draft_extend_cuda_graph_runner.py +256 -0
- sglang/srt/speculative/eagle_utils.py +440 -200
- sglang/srt/speculative/eagle_worker.py +234 -63
- sglang/srt/two_batch_overlap.py +637 -0
- sglang/srt/utils.py +187 -7
- sglang/test/attention/test_prefix_chunk_info.py +2 -0
- sglang/test/runners.py +54 -10
- sglang/test/send_one.py +4 -0
- sglang/test/test_block_fp8.py +1 -0
- sglang/test/test_block_fp8_deep_gemm_blackwell.py +252 -0
- sglang/test/test_block_fp8_ep.py +1 -0
- sglang/test/test_cutlass_moe.py +3 -3
- sglang/test/test_fp4_moe.py +248 -0
- sglang/test/test_utils.py +82 -7
- sglang/utils.py +9 -0
- sglang/version.py +1 -1
- {sglang-0.4.6.post5.dist-info → sglang-0.4.7.post1.dist-info}/METADATA +17 -14
- {sglang-0.4.6.post5.dist-info → sglang-0.4.7.post1.dist-info}/RECORD +359 -321
- {sglang-0.4.6.post5.dist-info → sglang-0.4.7.post1.dist-info}/WHEEL +1 -1
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=1,N=14336,device_name=NVIDIA_A100-SXM4-80GB,dtype=int8_w8a16.json → triton_3_1_0/E=1,N=14336,device_name=NVIDIA_A100-SXM4-80GB,dtype=int8_w8a16.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=1,N=14336,device_name=NVIDIA_A100-SXM4-80GB.json → triton_3_1_0/E=1,N=14336,device_name=NVIDIA_A100-SXM4-80GB.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=1,N=1792,device_name=NVIDIA_A100-SXM4-80GB,dtype=int8_w8a16.json → triton_3_1_0/E=1,N=1792,device_name=NVIDIA_A100-SXM4-80GB,dtype=int8_w8a16.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=1,N=1792,device_name=NVIDIA_A100-SXM4-80GB.json → triton_3_1_0/E=1,N=1792,device_name=NVIDIA_A100-SXM4-80GB.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=1,N=3072,device_name=NVIDIA_A100-SXM4-80GB,dtype=int8_w8a16.json → triton_3_1_0/E=1,N=3072,device_name=NVIDIA_A100-SXM4-80GB,dtype=int8_w8a16.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=1,N=3072,device_name=NVIDIA_H100_80GB_HBM3,dtype=int8_w8a16.json → triton_3_1_0/E=1,N=3072,device_name=NVIDIA_H100_80GB_HBM3,dtype=int8_w8a16.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=1,N=3072,device_name=NVIDIA_H100_80GB_HBM3.json → triton_3_1_0/E=1,N=3072,device_name=NVIDIA_H100_80GB_HBM3.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=1,N=3584,device_name=NVIDIA_A100-SXM4-80GB,dtype=int8_w8a16.json → triton_3_1_0/E=1,N=3584,device_name=NVIDIA_A100-SXM4-80GB,dtype=int8_w8a16.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=1,N=3584,device_name=NVIDIA_A100-SXM4-80GB.json → triton_3_1_0/E=1,N=3584,device_name=NVIDIA_A100-SXM4-80GB.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=1,N=7168,device_name=NVIDIA_A100-SXM4-80GB,dtype=int8_w8a16.json → triton_3_1_0/E=1,N=7168,device_name=NVIDIA_A100-SXM4-80GB,dtype=int8_w8a16.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=1,N=7168,device_name=NVIDIA_A100-SXM4-80GB.json → triton_3_1_0/E=1,N=7168,device_name=NVIDIA_A100-SXM4-80GB.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=144,N=512,device_name=NVIDIA_H100_80GB_HBM3.json → triton_3_1_0/E=144,N=512,device_name=NVIDIA_H100_80GB_HBM3.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=16,N=1024,device_name=NVIDIA_H100_80GB_HBM3.json → triton_3_1_0/E=16,N=1024,device_name=NVIDIA_H100_80GB_HBM3.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=16,N=1024,device_name=NVIDIA_H200.json → triton_3_1_0/E=16,N=1024,device_name=NVIDIA_H200.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=16,N=1344,device_name=NVIDIA_A100-SXM4-40GB.json → triton_3_1_0/E=16,N=1344,device_name=NVIDIA_A100-SXM4-40GB.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=16,N=1344,device_name=NVIDIA_A100-SXM4-80GB.json → triton_3_1_0/E=16,N=1344,device_name=NVIDIA_A100-SXM4-80GB.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=16,N=1344,device_name=NVIDIA_H100_80GB_HBM3.json → triton_3_1_0/E=16,N=1344,device_name=NVIDIA_H100_80GB_HBM3.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=16,N=14336,device_name=NVIDIA_A100-SXM4-80GB,dtype=int8_w8a16.json → triton_3_1_0/E=16,N=14336,device_name=NVIDIA_A100-SXM4-80GB,dtype=int8_w8a16.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=16,N=14336,device_name=NVIDIA_A100-SXM4-80GB.json → triton_3_1_0/E=16,N=14336,device_name=NVIDIA_A100-SXM4-80GB.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=16,N=1792,device_name=NVIDIA_A100-SXM4-80GB,dtype=int8_w8a16.json → triton_3_1_0/E=16,N=1792,device_name=NVIDIA_A100-SXM4-80GB,dtype=int8_w8a16.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=16,N=1792,device_name=NVIDIA_A100-SXM4-80GB.json → triton_3_1_0/E=16,N=1792,device_name=NVIDIA_A100-SXM4-80GB.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=16,N=2048,device_name=NVIDIA_H100_80GB_HBM3.json → triton_3_1_0/E=16,N=2048,device_name=NVIDIA_H100_80GB_HBM3.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=16,N=2688,device_name=NVIDIA_A100-SXM4-80GB.json → triton_3_1_0/E=16,N=2688,device_name=NVIDIA_A100-SXM4-80GB.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=16,N=2688,device_name=NVIDIA_H100_80GB_HBM3.json → triton_3_1_0/E=16,N=2688,device_name=NVIDIA_H100_80GB_HBM3.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=16,N=3072,device_name=NVIDIA_A100-SXM4-80GB,dtype=int8_w8a16.json → triton_3_1_0/E=16,N=3072,device_name=NVIDIA_A100-SXM4-80GB,dtype=int8_w8a16.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=16,N=3072,device_name=NVIDIA_H100_80GB_HBM3,dtype=int8_w8a16.json → triton_3_1_0/E=16,N=3072,device_name=NVIDIA_H100_80GB_HBM3,dtype=int8_w8a16.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=16,N=3200,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8.json → triton_3_1_0/E=16,N=3200,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=16,N=3584,device_name=NVIDIA_A100-SXM4-80GB,dtype=int8_w8a16.json → triton_3_1_0/E=16,N=3584,device_name=NVIDIA_A100-SXM4-80GB,dtype=int8_w8a16.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=16,N=3584,device_name=NVIDIA_A100-SXM4-80GB.json → triton_3_1_0/E=16,N=3584,device_name=NVIDIA_A100-SXM4-80GB.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=16,N=6400,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8.json → triton_3_1_0/E=16,N=6400,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=16,N=7168,device_name=NVIDIA_A100-SXM4-80GB,dtype=int8_w8a16.json → triton_3_1_0/E=16,N=7168,device_name=NVIDIA_A100-SXM4-80GB,dtype=int8_w8a16.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=16,N=7168,device_name=NVIDIA_A100-SXM4-80GB.json → triton_3_1_0/E=16,N=7168,device_name=NVIDIA_A100-SXM4-80GB.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=16,N=7168,device_name=NVIDIA_H100_80GB_HBM3,dtype=int8_w8a16.json → triton_3_1_0/E=16,N=7168,device_name=NVIDIA_H100_80GB_HBM3,dtype=int8_w8a16.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=16,N=800,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8.json → triton_3_1_0/E=16,N=800,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=160,N=192,device_name=NVIDIA_A800-SXM4-80GB.json → triton_3_1_0/E=160,N=192,device_name=NVIDIA_A800-SXM4-80GB.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=20,N=2048,device_name=NVIDIA_H100_80GB_HBM3.json → triton_3_1_0/E=20,N=2048,device_name=NVIDIA_H100_80GB_HBM3.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=24,N=1024,device_name=NVIDIA_H100_80GB_HBM3.json → triton_3_1_0/E=24,N=1024,device_name=NVIDIA_H100_80GB_HBM3.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=256,N=128,device_name=NVIDIA_A100-SXM4-80GB,dtype=int8_w8a8,block_shape=[128, 128].json → triton_3_1_0/E=256,N=128,device_name=NVIDIA_A100-SXM4-80GB,dtype=int8_w8a8,block_shape=[128, 128].json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=256,N=128,device_name=NVIDIA_A100-SXM4-80GB,dtype=int8_w8a8.json → triton_3_1_0/E=256,N=128,device_name=NVIDIA_A100-SXM4-80GB,dtype=int8_w8a8.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=256,N=128,device_name=NVIDIA_A800-SXM4-80GB,dtype=int8_w8a8,block_shape=[128, 128].json → triton_3_1_0/E=256,N=128,device_name=NVIDIA_A800-SXM4-80GB,dtype=int8_w8a8,block_shape=[128, 128].json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=256,N=128,device_name=NVIDIA_A800-SXM4-80GB,dtype=int8_w8a8.json → triton_3_1_0/E=256,N=128,device_name=NVIDIA_A800-SXM4-80GB,dtype=int8_w8a8.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=256,N=128,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8,block_shape=[128, 128].json → triton_3_1_0/E=256,N=128,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8,block_shape=[128, 128].json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=256,N=128,device_name=NVIDIA_H20,block_shape=[128, 128].json → triton_3_1_0/E=256,N=128,device_name=NVIDIA_H20,block_shape=[128, 128].json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=256,N=128,device_name=NVIDIA_H20,dtype=fp8_w8a8,block_shape=[128, 128].json → triton_3_1_0/E=256,N=128,device_name=NVIDIA_H20,dtype=fp8_w8a8,block_shape=[128, 128].json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=256,N=128,device_name=NVIDIA_L20Y,dtype=fp8_w8a8,block_shape=[128, 128].json → triton_3_1_0/E=256,N=128,device_name=NVIDIA_L20Y,dtype=fp8_w8a8,block_shape=[128, 128].json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=256,N=256,device_name=AMD_Instinct_MI300X,dtype=fp8_w8a8,block_shape=[128, 128].json → triton_3_1_0/E=256,N=256,device_name=AMD_Instinct_MI300X,dtype=fp8_w8a8,block_shape=[128, 128].json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=256,N=256,device_name=AMD_Instinct_MI325X,dtype=fp8_w8a8,block_shape=[128, 128].json → triton_3_1_0/E=256,N=256,device_name=AMD_Instinct_MI325X,dtype=fp8_w8a8,block_shape=[128, 128].json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=256,N=256,device_name=AMD_Radeon_Graphics,dtype=fp8_w8a8,block_shape=[128, 128].json → triton_3_1_0/E=256,N=256,device_name=AMD_Radeon_Graphics,dtype=fp8_w8a8,block_shape=[128, 128].json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=256,N=256,device_name=NVIDIA_B200,dtype=fp8_w8a8,block_shape=[128, 128].json → triton_3_1_0/E=256,N=256,device_name=NVIDIA_B200,dtype=fp8_w8a8,block_shape=[128, 128].json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=256,N=256,device_name=NVIDIA_H20,dtype=int8_w8a8,block_shape=[128, 128].json → triton_3_1_0/E=256,N=256,device_name=NVIDIA_H20,dtype=int8_w8a8,block_shape=[128, 128].json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=256,N=256,device_name=NVIDIA_H200,dtype=fp8_w8a8,block_shape=[128, 128].json → triton_3_1_0/E=256,N=256,device_name=NVIDIA_H200,dtype=fp8_w8a8,block_shape=[128, 128].json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=256,N=64,device_name=NVIDIA_A800-SXM4-80GB.json → triton_3_1_0/E=256,N=64,device_name=NVIDIA_A800-SXM4-80GB.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=256,N=64,device_name=NVIDIA_L20,dtype=int8_w8a8.json → triton_3_1_0/E=256,N=64,device_name=NVIDIA_L20,dtype=int8_w8a8.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=256,N=64,device_name=NVIDIA_L40S,dtype=int8_w8a8.json → triton_3_1_0/E=256,N=64,device_name=NVIDIA_L40S,dtype=int8_w8a8.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=64,N=1024,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8,block_shape=[128, 128].json → triton_3_1_0/E=64,N=1024,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8,block_shape=[128, 128].json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=64,N=1280,device_name=NVIDIA_A100-SXM4-80GB.json → triton_3_1_0/E=64,N=1280,device_name=NVIDIA_A100-SXM4-80GB.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=64,N=1280,device_name=NVIDIA_A800-SXM4-80GB.json → triton_3_1_0/E=64,N=1280,device_name=NVIDIA_A800-SXM4-80GB.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=64,N=1280,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8.json → triton_3_1_0/E=64,N=1280,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=64,N=1280,device_name=NVIDIA_H100_80GB_HBM3.json → triton_3_1_0/E=64,N=1280,device_name=NVIDIA_H100_80GB_HBM3.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=64,N=1280,device_name=NVIDIA_H200,dtype=fp8_w8a8.json → triton_3_1_0/E=64,N=1280,device_name=NVIDIA_H200,dtype=fp8_w8a8.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=64,N=1280,device_name=NVIDIA_H200.json → triton_3_1_0/E=64,N=1280,device_name=NVIDIA_H200.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=64,N=2560,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8.json → triton_3_1_0/E=64,N=2560,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=64,N=2560,device_name=NVIDIA_H200,dtype=fp8_w8a8.json → triton_3_1_0/E=64,N=2560,device_name=NVIDIA_H200,dtype=fp8_w8a8.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=64,N=2560,device_name=NVIDIA_H200.json → triton_3_1_0/E=64,N=2560,device_name=NVIDIA_H200.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=64,N=320,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8.json → triton_3_1_0/E=64,N=320,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=64,N=320,device_name=NVIDIA_H100_80GB_HBM3.json → triton_3_1_0/E=64,N=320,device_name=NVIDIA_H100_80GB_HBM3.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=64,N=320,device_name=NVIDIA_H200,dtype=fp8_w8a8.json → triton_3_1_0/E=64,N=320,device_name=NVIDIA_H200,dtype=fp8_w8a8.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=64,N=320,device_name=NVIDIA_H200.json → triton_3_1_0/E=64,N=320,device_name=NVIDIA_H200.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=64,N=512,device_name=NVIDIA_H200,dtype=fp8_w8a8,block_shape=[128, 128].json → triton_3_1_0/E=64,N=512,device_name=NVIDIA_H200,dtype=fp8_w8a8,block_shape=[128, 128].json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=64,N=640,device_name=NVIDIA_A100-SXM4-80GB.json → triton_3_1_0/E=64,N=640,device_name=NVIDIA_A100-SXM4-80GB.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=64,N=640,device_name=NVIDIA_A800-SXM4-80GB.json → triton_3_1_0/E=64,N=640,device_name=NVIDIA_A800-SXM4-80GB.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=64,N=640,device_name=NVIDIA_GeForce_RTX_4090,dtype=fp8_w8a8.json → triton_3_1_0/E=64,N=640,device_name=NVIDIA_GeForce_RTX_4090,dtype=fp8_w8a8.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=64,N=640,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8.json → triton_3_1_0/E=64,N=640,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=64,N=640,device_name=NVIDIA_H100_80GB_HBM3.json → triton_3_1_0/E=64,N=640,device_name=NVIDIA_H100_80GB_HBM3.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=64,N=640,device_name=NVIDIA_H200,dtype=fp8_w8a8.json → triton_3_1_0/E=64,N=640,device_name=NVIDIA_H200,dtype=fp8_w8a8.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=64,N=640,device_name=NVIDIA_H200.json → triton_3_1_0/E=64,N=640,device_name=NVIDIA_H200.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=8,N=14336,device_name=AMD_Instinct_MI300X.json → triton_3_1_0/E=8,N=14336,device_name=AMD_Instinct_MI300X.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=8,N=14336,device_name=AMD_Instinct_MI325X.json → triton_3_1_0/E=8,N=14336,device_name=AMD_Instinct_MI325X.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=8,N=14336,device_name=AMD_Radeon_Graphics.json → triton_3_1_0/E=8,N=14336,device_name=AMD_Radeon_Graphics.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=8,N=14336,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8.json → triton_3_1_0/E=8,N=14336,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=8,N=14336,device_name=NVIDIA_H200,dtype=fp8_w8a8.json → triton_3_1_0/E=8,N=14336,device_name=NVIDIA_H200,dtype=fp8_w8a8.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=8,N=14336,device_name=NVIDIA_H200.json → triton_3_1_0/E=8,N=14336,device_name=NVIDIA_H200.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=8,N=1792,device_name=AMD_Instinct_MI300X.json → triton_3_1_0/E=8,N=1792,device_name=AMD_Instinct_MI300X.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=8,N=1792,device_name=AMD_Instinct_MI325X.json → triton_3_1_0/E=8,N=1792,device_name=AMD_Instinct_MI325X.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=8,N=1792,device_name=AMD_Radeon_Graphics.json → triton_3_1_0/E=8,N=1792,device_name=AMD_Radeon_Graphics.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=8,N=1792,device_name=NVIDIA_A100-SXM4-40GB.json → triton_3_1_0/E=8,N=1792,device_name=NVIDIA_A100-SXM4-40GB.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=8,N=1792,device_name=NVIDIA_A100-SXM4-80GB.json → triton_3_1_0/E=8,N=1792,device_name=NVIDIA_A100-SXM4-80GB.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=8,N=1792,device_name=NVIDIA_H100_80GB_HBM3.json → triton_3_1_0/E=8,N=1792,device_name=NVIDIA_H100_80GB_HBM3.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=8,N=1792,device_name=NVIDIA_H200,dtype=fp8_w8a8.json → triton_3_1_0/E=8,N=1792,device_name=NVIDIA_H200,dtype=fp8_w8a8.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=8,N=1792,device_name=NVIDIA_H200.json → triton_3_1_0/E=8,N=1792,device_name=NVIDIA_H200.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=8,N=2048,device_name=NVIDIA_A100-SXM4-80GB.json → triton_3_1_0/E=8,N=2048,device_name=NVIDIA_A100-SXM4-80GB.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=8,N=2048,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8.json → triton_3_1_0/E=8,N=2048,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=8,N=2048,device_name=NVIDIA_H100_80GB_HBM3.json → triton_3_1_0/E=8,N=2048,device_name=NVIDIA_H100_80GB_HBM3.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=8,N=2048,device_name=NVIDIA_H200,dtype=fp8_w8a8.json → triton_3_1_0/E=8,N=2048,device_name=NVIDIA_H200,dtype=fp8_w8a8.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=8,N=2048,device_name=NVIDIA_H200.json → triton_3_1_0/E=8,N=2048,device_name=NVIDIA_H200.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=8,N=3584,device_name=AMD_Instinct_MI300X.json → triton_3_1_0/E=8,N=3584,device_name=AMD_Instinct_MI300X.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=8,N=3584,device_name=AMD_Instinct_MI325X.json → triton_3_1_0/E=8,N=3584,device_name=AMD_Instinct_MI325X.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=8,N=3584,device_name=AMD_Radeon_Graphics.json → triton_3_1_0/E=8,N=3584,device_name=AMD_Radeon_Graphics.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=8,N=3584,device_name=NVIDIA_A100-SXM4-40GB.json → triton_3_1_0/E=8,N=3584,device_name=NVIDIA_A100-SXM4-40GB.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=8,N=3584,device_name=NVIDIA_A100-SXM4-80GB.json → triton_3_1_0/E=8,N=3584,device_name=NVIDIA_A100-SXM4-80GB.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=8,N=3584,device_name=NVIDIA_GeForce_RTX_4090,dtype=fp8_w8a8.json → triton_3_1_0/E=8,N=3584,device_name=NVIDIA_GeForce_RTX_4090,dtype=fp8_w8a8.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=8,N=3584,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8.json → triton_3_1_0/E=8,N=3584,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=8,N=3584,device_name=NVIDIA_H100_80GB_HBM3.json → triton_3_1_0/E=8,N=3584,device_name=NVIDIA_H100_80GB_HBM3.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=8,N=3584,device_name=NVIDIA_H200,dtype=fp8_w8a8.json → triton_3_1_0/E=8,N=3584,device_name=NVIDIA_H200,dtype=fp8_w8a8.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=8,N=3584,device_name=NVIDIA_H200.json → triton_3_1_0/E=8,N=3584,device_name=NVIDIA_H200.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=8,N=3584,device_name=NVIDIA_L40S.json → triton_3_1_0/E=8,N=3584,device_name=NVIDIA_L40S.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=8,N=4096,device_name=AMD_Instinct_MI300X,dtype=fp8_w8a8.json → triton_3_1_0/E=8,N=4096,device_name=AMD_Instinct_MI300X,dtype=fp8_w8a8.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=8,N=4096,device_name=AMD_Instinct_MI325X,dtype=fp8_w8a8.json → triton_3_1_0/E=8,N=4096,device_name=AMD_Instinct_MI325X,dtype=fp8_w8a8.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=8,N=4096,device_name=AMD_Radeon_Graphics,dtype=fp8_w8a8.json → triton_3_1_0/E=8,N=4096,device_name=AMD_Radeon_Graphics,dtype=fp8_w8a8.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=8,N=4096,device_name=NVIDIA_A100-SXM4-80GB.json → triton_3_1_0/E=8,N=4096,device_name=NVIDIA_A100-SXM4-80GB.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=8,N=4096,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8.json → triton_3_1_0/E=8,N=4096,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=8,N=4096,device_name=NVIDIA_H100_80GB_HBM3.json → triton_3_1_0/E=8,N=4096,device_name=NVIDIA_H100_80GB_HBM3.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=8,N=4096,device_name=NVIDIA_H200,dtype=fp8_w8a8.json → triton_3_1_0/E=8,N=4096,device_name=NVIDIA_H200,dtype=fp8_w8a8.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=8,N=4096,device_name=NVIDIA_H200.json → triton_3_1_0/E=8,N=4096,device_name=NVIDIA_H200.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=8,N=7168,device_name=AMD_Instinct_MI300X.json → triton_3_1_0/E=8,N=7168,device_name=AMD_Instinct_MI300X.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=8,N=7168,device_name=AMD_Instinct_MI325X.json → triton_3_1_0/E=8,N=7168,device_name=AMD_Instinct_MI325X.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=8,N=7168,device_name=AMD_Radeon_Graphics.json → triton_3_1_0/E=8,N=7168,device_name=AMD_Radeon_Graphics.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=8,N=7168,device_name=NVIDIA_A100-SXM4-80GB.json → triton_3_1_0/E=8,N=7168,device_name=NVIDIA_A100-SXM4-80GB.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=8,N=7168,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8.json → triton_3_1_0/E=8,N=7168,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=8,N=7168,device_name=NVIDIA_H100_80GB_HBM3.json → triton_3_1_0/E=8,N=7168,device_name=NVIDIA_H100_80GB_HBM3.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=8,N=7168,device_name=NVIDIA_H200,dtype=fp8_w8a8.json → triton_3_1_0/E=8,N=7168,device_name=NVIDIA_H200,dtype=fp8_w8a8.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=8,N=7168,device_name=NVIDIA_H200.json → triton_3_1_0/E=8,N=7168,device_name=NVIDIA_H200.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=8,N=8192,device_name=AMD_Instinct_MI300X,dtype=fp8_w8a8.json → triton_3_1_0/E=8,N=8192,device_name=AMD_Instinct_MI300X,dtype=fp8_w8a8.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=8,N=8192,device_name=AMD_Instinct_MI325X,dtype=fp8_w8a8.json → triton_3_1_0/E=8,N=8192,device_name=AMD_Instinct_MI325X,dtype=fp8_w8a8.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=8,N=8192,device_name=AMD_Radeon_Graphics,dtype=fp8_w8a8.json → triton_3_1_0/E=8,N=8192,device_name=AMD_Radeon_Graphics,dtype=fp8_w8a8.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=8,N=8192,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8.json → triton_3_1_0/E=8,N=8192,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=8,N=8192,device_name=NVIDIA_H200,dtype=fp8_w8a8.json → triton_3_1_0/E=8,N=8192,device_name=NVIDIA_H200,dtype=fp8_w8a8.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=128,N=192,device_name=NVIDIA_A800-SXM4-80GB.json → triton_3_2_0/E=128,N=192,device_name=NVIDIA_A800-SXM4-80GB.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=128,N=192,device_name=NVIDIA_H100_80GB_HBM3.json → triton_3_2_0/E=128,N=192,device_name=NVIDIA_H100_80GB_HBM3.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=128,N=192,device_name=NVIDIA_H20.json → triton_3_2_0/E=128,N=192,device_name=NVIDIA_H20.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=128,N=192,device_name=NVIDIA_H200.json → triton_3_2_0/E=128,N=192,device_name=NVIDIA_H200.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=128,N=384,device_name=NVIDIA_H100_80GB_HBM3.json → triton_3_2_0/E=128,N=384,device_name=NVIDIA_H100_80GB_HBM3.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=128,N=384,device_name=NVIDIA_H20,dtype=fp8_w8a8,block_shape=[128, 128].json → triton_3_2_0/E=128,N=384,device_name=NVIDIA_H20,dtype=fp8_w8a8,block_shape=[128, 128].json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=128,N=384,device_name=NVIDIA_H20.json → triton_3_2_0/E=128,N=384,device_name=NVIDIA_H20.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=128,N=384,device_name=NVIDIA_H200,dtype=fp8_w8a8,block_shape=[128, 128].json → triton_3_2_0/E=128,N=384,device_name=NVIDIA_H200,dtype=fp8_w8a8,block_shape=[128, 128].json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=128,N=384,device_name=NVIDIA_H200.json → triton_3_2_0/E=128,N=384,device_name=NVIDIA_H200.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=128,N=512,device_name=NVIDIA_H100_80GB_HBM3.json → triton_3_2_0/E=128,N=512,device_name=NVIDIA_H100_80GB_HBM3.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=128,N=768,device_name=NVIDIA_A800-SXM4-80GB.json → triton_3_2_0/E=128,N=768,device_name=NVIDIA_A800-SXM4-80GB.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=128,N=768,device_name=NVIDIA_H100_80GB_HBM3.json → triton_3_2_0/E=128,N=768,device_name=NVIDIA_H100_80GB_HBM3.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=128,N=768,device_name=NVIDIA_H20,dtype=fp8_w8a8,block_shape=[128, 128].json → triton_3_2_0/E=128,N=768,device_name=NVIDIA_H20,dtype=fp8_w8a8,block_shape=[128, 128].json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=128,N=768,device_name=NVIDIA_H20.json → triton_3_2_0/E=128,N=768,device_name=NVIDIA_H20.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=128,N=768,device_name=NVIDIA_H200,dtype=fp8_w8a8,block_shape=[128, 128].json → triton_3_2_0/E=128,N=768,device_name=NVIDIA_H200,dtype=fp8_w8a8,block_shape=[128, 128].json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=128,N=768,device_name=NVIDIA_H200.json → triton_3_2_0/E=128,N=768,device_name=NVIDIA_H200.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=128,N=96,device_name=NVIDIA_H20.json → triton_3_2_0/E=128,N=96,device_name=NVIDIA_H20.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=264,N=128,device_name=NVIDIA_A800-SXM4-80GB,dtype=int8_w8a8.json → triton_3_2_0/E=264,N=128,device_name=NVIDIA_A800-SXM4-80GB,dtype=int8_w8a8.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=264,N=256,device_name=NVIDIA_B200,dtype=fp8_w8a8,block_shape=[128, 128].json → triton_3_2_0/E=264,N=256,device_name=NVIDIA_B200,dtype=fp8_w8a8,block_shape=[128, 128].json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=264,N=256,device_name=NVIDIA_H20,dtype=fp8_w8a8,block_shape=[128, 128].json → triton_3_2_0/E=264,N=256,device_name=NVIDIA_H20,dtype=fp8_w8a8,block_shape=[128, 128].json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=264,N=256,device_name=NVIDIA_H200,dtype=fp8_w8a8,block_shape=[128, 128].json → triton_3_2_0/E=264,N=256,device_name=NVIDIA_H200,dtype=fp8_w8a8,block_shape=[128, 128].json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=272,N=128,device_name=NVIDIA_A800-SXM4-80GB,dtype=int8_w8a8.json → triton_3_2_0/E=272,N=128,device_name=NVIDIA_A800-SXM4-80GB,dtype=int8_w8a8.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=272,N=128,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8,block_shape=[128, 128].json → triton_3_2_0/E=272,N=128,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8,block_shape=[128, 128].json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=272,N=128,device_name=NVIDIA_H20,dtype=fp8_w8a8,block_shape=[128, 128].json → triton_3_2_0/E=272,N=128,device_name=NVIDIA_H20,dtype=fp8_w8a8,block_shape=[128, 128].json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=272,N=64,device_name=NVIDIA_A800-SXM4-80GB.json → triton_3_2_0/E=272,N=64,device_name=NVIDIA_A800-SXM4-80GB.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=288,N=64,device_name=NVIDIA_A800-SXM4-80GB.json → triton_3_2_0/E=288,N=64,device_name=NVIDIA_A800-SXM4-80GB.json} +0 -0
- {sglang-0.4.6.post5.dist-info → sglang-0.4.7.post1.dist-info}/licenses/LICENSE +0 -0
- {sglang-0.4.6.post5.dist-info → sglang-0.4.7.post1.dist-info}/top_level.txt +0 -0
sglang/srt/layers/layernorm.py
CHANGED
@@ -20,10 +20,11 @@ import torch
 import torch.nn as nn
 
 from sglang.srt.custom_op import CustomOp
-from sglang.srt.utils import is_cuda, is_hip
+from sglang.srt.utils import get_bool_env_var, is_cuda, is_hip
 
 _is_cuda = is_cuda()
 _is_hip = is_hip()
+_use_aiter = get_bool_env_var("SGLANG_USE_AITER") and _is_hip
 
 if _is_cuda:
     from sgl_kernel import (
@@ -33,7 +34,10 @@ if _is_cuda:
         rmsnorm,
     )
 
-if _is_hip:
+if _use_aiter:
+    from aiter import rmsnorm2d_fwd as rms_norm
+    from aiter import rmsnorm2d_fwd_with_add as fused_add_rms_norm
+elif _is_hip:
     from vllm._custom_ops import fused_add_rms_norm, rms_norm
 
 logger = logging.getLogger(__name__)
@@ -48,16 +52,8 @@ class RMSNorm(CustomOp):
         super().__init__()
         self.weight = nn.Parameter(torch.ones(hidden_size))
         self.variance_epsilon = eps
-
-
-        if torch.compiler.is_compiling():
-            return self.forward_native(*args, **kwargs)
-        if _is_cuda:
-            return self.forward_cuda(*args, **kwargs)
-        elif _is_hip:
-            return self.forward_hip(*args, **kwargs)
-        else:
-            return self.forward_native(*args, **kwargs)
+        if _use_aiter:
+            self._forward_method = self.forward_aiter
 
     def forward_cuda(
         self,
@@ -70,6 +66,25 @@ class RMSNorm(CustomOp):
         out = rmsnorm(x, self.weight.data, self.variance_epsilon)
         return out
 
+    def forward_aiter(
+        self,
+        x: torch.Tensor,
+        residual: Optional[torch.Tensor] = None,
+    ) -> Union[torch.Tensor, Tuple[torch.Tensor, torch.Tensor]]:
+        if residual is not None:
+            residual_out = torch.empty_like(x)
+            output = torch.empty_like(x)
+            fused_add_rms_norm(
+                output,
+                x,
+                residual,
+                residual_out,
+                self.weight.data,
+                self.variance_epsilon,
+            )
+            return output, residual_out
+        return rms_norm(x, self.weight.data, self.variance_epsilon)
+
     def forward_hip(
         self,
         x: torch.Tensor,
@@ -117,13 +132,9 @@ class GemmaRMSNorm(CustomOp):
         self.weight = nn.Parameter(torch.zeros(hidden_size))
         self.variance_epsilon = eps
 
-
-        if
-
-        if _is_cuda:
-            return self.forward_cuda(*args, **kwargs)
-        else:
-            return self.forward_native(*args, **kwargs)
+        # Re-dispatch
+        if _is_hip:
+            self._forward_method = self.forward_native
 
     def forward_native(
         self,
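For readers skimming the diff: the new `_use_aiter` flag only changes which forward method gets bound at construction time. Below is a minimal, standalone sketch of that dispatch pattern; `TinyRMSNorm`, `_get_bool_env_var`, and the pure-PyTorch fallback are illustrative stand-ins rather than the actual sglang classes, and the real `forward_aiter` calls `aiter.rmsnorm2d_fwd` / `rmsnorm2d_fwd_with_add` as shown above.

```python
import os

import torch
import torch.nn as nn


def _get_bool_env_var(name: str, default: str = "false") -> bool:
    # Stand-in for sglang's get_bool_env_var helper (assumed semantics).
    return os.getenv(name, default).strip().lower() in ("1", "true", "yes")


class TinyRMSNorm(nn.Module):
    """Illustration only: bind the backend once in __init__ instead of branching per call."""

    def __init__(self, hidden_size: int, eps: float = 1e-6) -> None:
        super().__init__()
        self.weight = nn.Parameter(torch.ones(hidden_size))
        self.variance_epsilon = eps
        # Mirrors `self._forward_method = self.forward_aiter` from the diff:
        # the environment picks the kernel backend at construction time.
        use_aiter = _get_bool_env_var("SGLANG_USE_AITER") and torch.version.hip is not None
        self._forward_method = self.forward_aiter if use_aiter else self.forward_native

    def forward(self, x: torch.Tensor) -> torch.Tensor:
        return self._forward_method(x)

    def forward_native(self, x: torch.Tensor) -> torch.Tensor:
        variance = x.float().pow(2).mean(-1, keepdim=True)
        x = x * torch.rsqrt(variance + self.variance_epsilon).to(x.dtype)
        return x * self.weight

    def forward_aiter(self, x: torch.Tensor) -> torch.Tensor:
        # Placeholder: the real method calls aiter.rmsnorm2d_fwd(_with_add).
        return self.forward_native(x)


if __name__ == "__main__":
    print(TinyRMSNorm(8)(torch.randn(2, 8)).shape)  # torch.Size([2, 8])
```

Binding the method once also avoids a Python-level branch on every call, which is presumably why the per-call `if _is_cuda / elif _is_hip` chain was removed in favor of the re-dispatch hook.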
sglang/srt/layers/linear.py
CHANGED
@@ -546,8 +546,6 @@ class MergedColumnParallelLinear(ColumnParallelLinear):
             param.shard_id.append(loaded_shard_id)
             param.shard_id_map[loaded_shard_id] = len(param.data_container)
             param.data_container.append(loaded_weight)
-            if len(param.data_container) == 2:
-                self.qweight = param.materialize_nested()
             return
 
         param_data = param.data
@@ -961,8 +959,6 @@ class QKVParallelLinear(ColumnParallelLinear):
             param.shard_id.append(loaded_shard_id)
             param.shard_id_map[loaded_shard_id] = len(param.data_container)
             param.data_container.append(loaded_weight)
-            if len(param.data_container) == 3:
-                self.qweight = param.materialize_nested()
             return
 
         param_data = param.data
sglang/srt/layers/logits_processor.py
CHANGED
@@ -47,18 +47,6 @@ from sglang.srt.utils import dump_to_file
 logger = logging.getLogger(__name__)
 
 
-from sglang.srt.layers.vocab_parallel_embedding import VocabParallelEmbedding
-from sglang.srt.managers.schedule_batch import global_server_args_dict
-from sglang.srt.model_executor.forward_batch_info import (
-    CaptureHiddenMode,
-    ForwardBatch,
-    ForwardMode,
-)
-from sglang.srt.utils import dump_to_file
-
-logger = logging.getLogger(__name__)
-
-
 @dataclasses.dataclass
 class LogitsProcessorOutput:
     ## Part 1: This part will be assigned in python/sglang/srt/layers/logits_processor.py::LogitsProcessor
sglang/srt/layers/moe/cutlass_moe.py
CHANGED
@@ -1,4 +1,4 @@
-"""
+"""CUTLASS based Fused MoE kernels."""
 
 import functools
 import json
@@ -8,19 +8,24 @@ from typing import Any, Callable, Dict, List, Optional, Tuple
 
 import torch
 
+from sglang.srt.layers.moe.cutlass_moe_params import CutlassMoEParams
 from sglang.srt.utils import is_cuda
 
 _is_cuda = is_cuda()
 if _is_cuda:
     import sgl_kernel
     from sgl_kernel import (
+        apply_shuffle_mul_sum,
+        cutlass_fp4_group_mm,
         fp8_blockwise_scaled_grouped_mm,
        prepare_moe_input,
+        scaled_fp4_experts_quant,
+        shuffle_rows,
         silu_and_mul,
     )
 
 
-def cutlass_fused_experts(
+def cutlass_fused_experts_fp8(
     a: torch.Tensor,
     w1_q: torch.Tensor,
     w2_q: torch.Tensor,
@@ -147,8 +152,8 @@ def cutlass_fused_experts(
         k,
     )
 
-    rep_a_q = a_q
-    rep_a1_scales = a1_scale
+    rep_a_q = shuffle_rows(a_q, a_map, (m * topk, k))
+    rep_a1_scales = shuffle_rows(a1_scale, a_map, (m * topk, int(k / 128)))
 
     c1 = torch.empty((m * topk, n * 2), device=device, dtype=out_dtype)
     c2 = torch.empty((m * topk, k), device=device, dtype=out_dtype)
@@ -202,6 +207,164 @@ def cutlass_fused_experts(
         expert_offsets[:-1],
         workspace,
     )
-
-
-
+
+    result = torch.empty((m, k), device=device, dtype=out_dtype)
+    return apply_shuffle_mul_sum(c2, result, c_map, topk_weights)
+
+
+FLOAT4_E2M1_MAX = 6.0
+FLOAT8_E4M3_MAX = 448.0
+
+
+def cutlass_moe_fp4(
+    a: torch.Tensor,
+    a1_gscale: torch.Tensor,
+    w1_fp4: torch.Tensor,
+    w1_blockscale: torch.Tensor,
+    w1_alphas: torch.Tensor,
+    a2_gscale: torch.Tensor,
+    w2_fp4: torch.Tensor,
+    w2_blockscale: torch.Tensor,
+    w2_alphas: torch.Tensor,
+    topk_weights: torch.Tensor,
+    topk_ids: torch.Tensor,
+    params: CutlassMoEParams,
+    apply_router_weight_on_input: bool = False,
+):
+    """
+    MoE implementation for FP4 Inputs
+
+    # Gemm 1
+    a: Input tensor: [m, k] (half/bfloat16)
+    a1_gscale: Activation scale per expert: [e] (float32)
+    w1(gate up) (not an argument to cutlass_moe_fp4): [e, 2 * n, k]
+    w1_fp4: [e, 2 * n, k // 2], dtype: torch.uint8 (stacked fp4: E2M1)
+    (Note: `n` is the up projection output dim, `k` is the input dim in
+     full precision)
+    w1_blockscale: [e, 2 * n, k // block_size] (float8_e4m3)
+                   (Block size = 16 for NVFP4)
+
+    # Gemm 2
+    a2_gscale: Activation scale per expert: [e]
+    w2(down projection) (not an argument to cutlass_moe_fp4): [e, k, n]
+    w2_fp4: [e, k, n // 2], dtype: torch.uint8 (stacked E2M1)
+    w2_blockscale: [e, k, n // block_size], dtype: float8_e4m3
+
+    Strides for activations, weights and output in logical number of elements.
+    The activations & output stride is the number of elements to the next row.
+    The weights stride is the number of elements to the next row per expert.
+    For example, if the weight is [e, n, k], then the b_stride is a tensor of
+    shape [e] with each element being k. Similarly for activations, if the
+    shape is [m, k], then the a_stride has shape [e] with each value k.
+    Similarly for output, if the output is [m, n], then the c_stride is a
+    tensor of shape [e] with each element being k.
+
+    Note: cutlass_fp4_group_mm is designed to accept the strides of
+    activations and weights to be the same, so it is passed in as a single
+    tensor.
+    ab_strides_13: [e] dtype: int64 [Gemm 1: Activation / Weight strides]
+    ab_strides_2: [e] dtype: int64 [Gemm 2: Activation / Weight strides]
+    c_strides_13: [e] dtype: int64 [Gemm 1: Output Strides]
+    c_strides_2: [e] dtype: int64 [Gemm 1: Output Strides]
+
+    topk_weights: [m, topk] dtype: float8
+    topk_ids: [m, topk] dtype: float8
+
+    m, n, k: Unquantized weight shapes, dtype: int
+    e: number of experts for the current rank, dtype: int
+    assumes that topk < k < n to satisfy - up/down projection expectations.
+    """
+    assert topk_weights.shape == topk_ids.shape, "topk shape mismatch"
+    assert w1_fp4.dtype == torch.uint8, "weight 1 must be uint8"
+    assert w2_fp4.dtype == torch.uint8, "weight 2 must be uint8"
+    assert (
+        w1_fp4.ndim == 3
+        and w2_fp4.ndim == 3
+        and w1_blockscale.ndim == 3
+        and w2_blockscale.ndim == 3
+    ), "All Weights must be of rank 3 for cutlass_moe_fp4"
+    m_a, k_a = a.shape
+    e_w1, nx2_w1, half_k_w1 = w1_fp4.shape
+    e_w2, k_w2, half_n_w2 = w2_fp4.shape
+
+    assert e_w1 == e_w2 and e_w1 == params.num_experts, (
+        "Number of experts must match",
+        " between weights.",
+    )
+    assert (
+        k_a // 2 == half_k_w1 and params.hidden_size == k_w2
+    ), "Hidden size mismatch between a, w1 and w2"
+    assert (
+        nx2_w1 == params.intermediate_size_per_partition * 2
+        and half_n_w2 == params.intermediate_size_per_partition // 2
+    ), ("mismatch in " "expected `n`")
+    assert 2 * half_k_w1 == k_w2, "Hidden size mismatch w2 and w1"
+    assert a.dtype in [torch.half, torch.bfloat16], "Invalid input dtype"
+
+    out_dtype = a.dtype
+    num_topk = topk_ids.shape[1]
+    device = a.device
+    a_map = torch.empty((topk_ids.numel()), dtype=torch.int32, device=device)
+    c_map = torch.empty((topk_ids.numel()), dtype=torch.int32, device=device)
+    prepare_moe_input(
+        topk_ids,
+        params.expert_offsets,
+        params.problem_sizes1,
+        params.problem_sizes2,
+        a_map,
+        c_map,
+        params.num_experts,
+        params.intermediate_size_per_partition,
+        params.hidden_size,
+        params.blockscale_offsets,
+    )
+
+    rep_a_fp4, rep_a_blockscale = scaled_fp4_experts_quant(
+        a,
+        a1_gscale,
+        params.expert_offsets,
+        params.blockscale_offsets,
+        num_topk,
+        expert_map=a_map,
+    )
+    c1 = cutlass_fp4_group_mm(
+        rep_a_fp4,
+        w1_fp4,
+        rep_a_blockscale,
+        w1_blockscale,
+        w1_alphas,
+        out_dtype,
+        device,
+        params.to_gemm1_args(),
+    )
+    del rep_a_fp4, rep_a_blockscale
+
+    # hidden size dimension is split to one half sized tensor.
+    intermediate = torch.empty(
+        (m_a * num_topk, w1_fp4.shape[1] // 2), device=device, dtype=out_dtype
+    )
+    silu_and_mul(c1, intermediate)
+
+    int_fp4, int_blockscale = scaled_fp4_experts_quant(
+        intermediate,
+        a2_gscale,
+        params.expert_offsets,
+        params.blockscale_offsets,
+        num_topk,
+    )
+    c2 = cutlass_fp4_group_mm(
+        int_fp4,
+        w2_fp4,
+        int_blockscale,
+        w2_blockscale,
+        w2_alphas,
+        out_dtype,
+        device,
+        params.to_gemm2_args(),
+    )
+    del int_fp4, int_blockscale
+    c2 = shuffle_rows(c2, c_map, (m_a * num_topk, params.hidden_size))
+    c2 = c2.view(m_a, num_topk, params.hidden_size)
+    if not apply_router_weight_on_input:
+        c2 = c2 * topk_weights.view(m_a, num_topk, 1).to(out_dtype)
+    return c2.sum(dim=1).to(out_dtype)
@@ -0,0 +1,169 @@
|
|
1
|
+
from dataclasses import dataclass
|
2
|
+
from enum import Enum, auto
|
3
|
+
from typing import Optional
|
4
|
+
|
5
|
+
import torch
|
6
|
+
|
7
|
+
|
8
|
+
class CutlassMoEType(Enum):
|
9
|
+
"""
|
10
|
+
Enum for the different types of cutlass moe operations
|
11
|
+
that are currently supported in SGLang.
|
12
|
+
"""
|
13
|
+
|
14
|
+
BlockscaledFP8 = auto()
|
15
|
+
BlockscaledFP4 = auto()
|
16
|
+
|
17
|
+
|
18
|
+
@dataclass
|
19
|
+
class CutlassMoEParams:
|
20
|
+
"""
|
21
|
+
Parameters for the cutlass moe operation.
|
22
|
+
"""
|
23
|
+
|
24
|
+
# Type as defined above
|
25
|
+
cutlass_moe_type: CutlassMoEType
|
26
|
+
|
27
|
+
+    # Strides for activations, weights and output in logical number of elements.
+    # The activations & output stride is the number of elements to the next row.
+    # The weights stride is the number of elements to the next row per expert.
+    # For example, if the weight is [e, n, k], then the b_stride is a tensor of
+    # shape [e] with each element being k. Similarly for activations, if the
+    # shape is [m, k], then the a_stride has shape [e] with each value k.
+    # Similarly for output, if the output is [m, n], then the c_stride is a
+    # tensor of shape [e] with each element being k.
+
+    # Note: cutlass_fp4_group_mm is designed to accept the strides of
+    # activations and weights to be the same, so it is passed in as a single
+    # tensor.
+    # ab_strides_13: [e] dtype: int64 [Gemm 1: Activation / Weight strides]
+    # ab_strides_2: [e] dtype: int64 [Gemm 2: Activation / Weight strides]
+    # c_strides_13: [e] dtype: int64 [Gemm 1: Output Strides]
+    # c_strides_2: [e] dtype: int64 [Gemm 2: Output Strides]
+    ab_strides_13: torch.Tensor
+    ab_strides_2: torch.Tensor
+    c_strides_13: torch.Tensor
+    c_strides_2: torch.Tensor
+
+    # m: Total number of tokens
+    # n: intermediate size per partition
+    # k: hidden size per expert
+    # e: Number of experts
+    # device: Device to run computation on and store tensors
+    m: int
+    intermediate_size_per_partition: int
+    hidden_size: int
+    num_experts: int
+    device: torch.device
+
+    # Pointers container for calculating offsets of the input activations for each expert
+    # a_ptrs: [e] dtype: int64
+    a_ptrs: torch.Tensor
+
+    # Pointers container for calculating offsets of the input weights for each expert
+    # b_ptrs: [e] dtype: int64
+    b_ptrs: torch.Tensor
+
+    # Pointers container for calculating offsets of the output activations for each expert
+    # out_ptrs: [e] dtype: int64
+    out_ptrs: torch.Tensor
+    # Pointers container for calculating offsets of the input scales for each expert
+    # a_scales_ptrs: [e] dtype: int64
+    # b_scales_ptrs: [e] dtype: int64
+    a_scales_ptrs: torch.Tensor
+    b_scales_ptrs: torch.Tensor
+
+    # Offsets that mark at which token index each expert begins its computation
+    # The number of tokens computed with expert E is expert_offsets[E + 1] - expert_offsets[E]
+    # expert_offsets: [e+1] dtype: int32
+    expert_offsets: torch.Tensor
+
+    # Problem size: (num_experts, (m,2n,k)) for first GEMM
+    # problem_sizes1: [e, 3] dtype: int32
+    # Problem size: (num_experts, (m,n,k)) for second GEMM
+    # problem_sizes2: [e, 3] dtype: int32
+    problem_sizes1: torch.Tensor
+    problem_sizes2: torch.Tensor
+    # Similar to expert_offsets, but for blockscales for FP4 blockscaled Group GEMM
+    blockscale_offsets: Optional[torch.Tensor] = None
+
+    def __init__(
+        self,
+        cutlass_moe_type: CutlassMoEType,
+        device: torch.device,
+        num_experts: int,
+        intermediate_size_per_partition: int,
+        hidden_size: int,
+    ):
+        self.cutlass_moe_type = cutlass_moe_type
+        self.device = device
+        self.num_experts = num_experts
+        self.intermediate_size_per_partition = intermediate_size_per_partition
+        self.hidden_size = hidden_size
+        self.n = self.intermediate_size_per_partition
+        self.k = self.hidden_size
+        self.e = self.num_experts
+        self.ab_strides_13 = torch.full(
+            (self.e,), self.k, dtype=torch.int64, device=self.device
+        )
+        self.ab_strides_2 = torch.full(
+            (self.e,), self.n, dtype=torch.int64, device=self.device
+        )
+        self.c_strides_13 = torch.full(
+            (self.e,), 2 * self.n, dtype=torch.int64, device=self.device
+        )
+        self.c_strides_2 = torch.full(
+            (self.e,), self.k, dtype=torch.int64, device=self.device
+        )
+        self.expert_offsets = torch.empty(
+            (self.e + 1,), dtype=torch.int32, device=self.device
+        )
+        self.problem_sizes1 = torch.empty(
+            (self.e, 3), dtype=torch.int32, device=self.device
+        )
+        self.problem_sizes2 = torch.empty(
+            (self.e, 3), dtype=torch.int32, device=self.device
+        )
+        if self.cutlass_moe_type == CutlassMoEType.BlockscaledFP4:
+            self.blockscale_offsets = torch.empty(
+                (self.e + 1,), dtype=torch.int32, device=self.device
+            )
+        else:
+            self.blockscale_offsets = None
+        self.a_ptrs = torch.empty((self.e,), dtype=torch.int64, device=self.device)
+        self.b_ptrs = torch.empty((self.e,), dtype=torch.int64, device=self.device)
+        self.out_ptrs = torch.empty((self.e,), dtype=torch.int64, device=self.device)
+        self.a_scales_ptrs = torch.empty(
+            (self.e,), dtype=torch.int64, device=self.device
+        )
+        self.b_scales_ptrs = torch.empty(
+            (self.e,), dtype=torch.int64, device=self.device
+        )
+
+    def to_gemm1_args(self) -> dict:
+        return {
+            "ab_strides": self.ab_strides_13,
+            "c_strides": self.c_strides_13,
+            "problem_sizes": self.problem_sizes1,
+            "expert_offsets": self.expert_offsets[:-1],
+            "blockscale_offsets": self.blockscale_offsets[:-1],
+            # "a_ptrs": self.a_ptrs,
+            # "b_ptrs": self.b_ptrs,
+            # "out_ptrs": self.out_ptrs,
+            # "a_scales_ptrs": self.a_scales_ptrs,
+            # "b_scales_ptrs": self.b_scales_ptrs,
+        }
+
+    def to_gemm2_args(self) -> dict:
+        return {
+            "ab_strides": self.ab_strides_2,
+            "c_strides": self.c_strides_2,
+            "problem_sizes": self.problem_sizes2,
+            "expert_offsets": self.expert_offsets[:-1],
+            "blockscale_offsets": self.blockscale_offsets[:-1],
+            # "a_ptrs": self.a_ptrs,
+            # "b_ptrs": self.b_ptrs,
+            # "out_ptrs": self.out_ptrs,
+            # "a_scales_ptrs": self.a_scales_ptrs,
+            # "b_scales_ptrs": self.b_scales_ptrs,
+        }
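As a side note on the layout documented above: expert_offsets is an exclusive prefix sum of the per-expert token counts, and the two problem-size tensors follow the (m, 2n, k) / (m, n, k) convention from the comments. The host-side sketch below is illustrative only (the expert count and token counts are made up) and is not part of the package:

import torch

# Hypothetical sizes, for illustration only.
e, n, k = 4, 1024, 2048  # experts, intermediate size per partition, hidden size
tokens_per_expert = torch.tensor([3, 0, 5, 2], dtype=torch.int32)

# expert_offsets[E + 1] - expert_offsets[E] == number of tokens routed to expert E.
expert_offsets = torch.zeros(e + 1, dtype=torch.int32)
expert_offsets[1:] = torch.cumsum(tokens_per_expert, dim=0, dtype=torch.int32)

# Per-expert problem sizes: (m, 2n, k) for the first GEMM, (m, n, k) for the second.
problem_sizes1 = torch.stack(
    [
        tokens_per_expert,
        torch.full((e,), 2 * n, dtype=torch.int32),
        torch.full((e,), k, dtype=torch.int32),
    ],
    dim=1,
)
problem_sizes2 = torch.stack(
    [
        tokens_per_expert,
        torch.full((e,), n, dtype=torch.int32),
        torch.full((e,), k, dtype=torch.int32),
    ],
    dim=1,
)

print(expert_offsets)                              # [0, 3, 3, 8, 10]
print(problem_sizes1.shape, problem_sizes2.shape)  # both [4, 3]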
@@ -4,6 +4,7 @@ from typing import List, Optional
 import torch
 import triton
 
+from sglang.math_utils import ceil_div
 from sglang.srt.layers.quantization.fp8_kernel import per_token_group_quant_fp8
 from sglang.srt.utils import dispose_tensor, is_cuda
 
@@ -15,11 +16,6 @@ if _is_cuda:
         sglang_per_token_group_quant_fp8 as per_token_group_quant_fp8,
     )
 
-try:
-    from deep_gemm import ceil_div
-except ImportError:
-    logger.error(f"Failed to import ceil_div from deep_gemm.")
-
 import triton.language as tl
 
 
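With this change the kernels no longer depend on deep_gemm for ceil_div and take it from sglang.math_utils instead. For orientation only, a ceil_div helper of this kind is ordinarily plain integer ceiling division; the sketch below is an assumption about the helper's behaviour, not a copy of sglang/math_utils.py:

def ceil_div(a: int, b: int) -> int:
    # Smallest integer >= a / b for positive b, using only integer arithmetic.
    return (a + b - 1) // b


assert ceil_div(10, 4) == 3
assert ceil_div(8, 4) == 2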
@@ -178,26 +174,33 @@ def pre_reorder_triton_kernel(
     topk,
     hidden_size,
     BLOCK_SIZE: tl.constexpr,
+    use_per_token_if_dynamic: tl.constexpr,
 ):
     OutDtype = gateup_input_ptr.dtype.element_ty
 
     src_idx = tl.program_id(0)
     src2dst_ptr = src2dst_ptr + src_idx * topk
     topk_ids_ptr = topk_ids_ptr + src_idx * topk
-
     src_ptr = input_ptr + src_idx * hidden_size
+
+    vec = tl.arange(0, BLOCK_SIZE)
+
+    if a1_scales_ptr is not None and use_per_token_if_dynamic:
+        scale = 1.0 / tl.load(a1_scales_ptr + src_idx)
+
     for idx in range(topk):
         expert_id = tl.load(topk_ids_ptr + idx)
         if expert_id >= start_expert_id and expert_id <= end_expert_id:
             if a1_scales_ptr is not None:
-                scale = 1.0 / tl.load(a1_scales_ptr + expert_id - start_expert_id)
+                if not use_per_token_if_dynamic:
+                    scale = 1.0 / tl.load(a1_scales_ptr + expert_id - start_expert_id)
             else:
                 scale = 1.0
 
             dst_idx = tl.load(src2dst_ptr + idx)
             dst_ptr = gateup_input_ptr + dst_idx * hidden_size
             for start_offset in tl.range(0, hidden_size, BLOCK_SIZE):
-                offset = start_offset + tl.arange(0, BLOCK_SIZE)
+                offset = start_offset + vec
                 mask = offset < hidden_size
                 in_data = tl.load(src_ptr + offset, mask=mask).to(tl.float32)
                 out_data = (in_data * scale).to(OutDtype)
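The new use_per_token_if_dynamic flag switches the dequantization scale from one value per local expert to one value per source token, loaded once per token outside the top-k loop. A plain-PyTorch sketch of the same selection logic, with made-up shapes and a hypothetical helper name:

import torch

# Hypothetical sizes for illustration: 6 tokens, 4 local experts.
num_tokens, num_local_experts, start_expert_id = 6, 4, 0
a1_scales_per_token = torch.rand(num_tokens)          # one scale per token
a1_scales_per_expert = torch.rand(num_local_experts)  # one scale per local expert


def dequant_scale(src_idx: int, expert_id: int, use_per_token_if_dynamic: bool) -> float:
    # Mirrors the kernel's branch: per-token scales are indexed by the source token,
    # per-expert scales by the local expert id.
    if use_per_token_if_dynamic:
        return 1.0 / a1_scales_per_token[src_idx].item()
    return 1.0 / a1_scales_per_expert[expert_id - start_expert_id].item()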
@@ -271,6 +274,7 @@ def _silu_and_mul_post_quant_kernel(
     fp8_min,
     BLOCK_N: tl.constexpr,
     NUM_STAGE: tl.constexpr,
+    SCALE_UE8M0: tl.constexpr,
 ):
     expert_id = tl.program_id(2)
     token_id = tl.program_id(1)
@@ -312,6 +316,8 @@ def _silu_and_mul_post_quant_kernel(
         gate_up = up * gate
         _absmax = tl.maximum(tl.max(tl.abs(gate_up)), 1e-10)
         output_s = _absmax / fp8_max
+        if SCALE_UE8M0:
+            output_s = tl.exp2(tl.ceil(tl.log2(tl.abs(output_s))))
         output_q = tl.clamp(gate_up / output_s, fp8_min, fp8_max).to(
             output_ptr.dtype.element_ty
         )
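When SCALE_UE8M0 is set, the per-group scale is rounded up to the next power of two before quantization, so it can be represented exactly by an exponent-only (UE8M0-style) scale format. The host-side equivalent of the added tl.exp2(tl.ceil(tl.log2(...))) expression, shown only as an illustration:

import math


def round_scale_up_to_pow2(scale: float) -> float:
    # Same arithmetic as tl.exp2(tl.ceil(tl.log2(tl.abs(scale)))).
    return 2.0 ** math.ceil(math.log2(abs(scale)))


print(round_scale_up_to_pow2(0.013))  # 0.015625 == 2**-6
print(round_scale_up_to_pow2(3.0))    # 4.0 == 2**2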
@@ -332,6 +338,7 @@ def silu_and_mul_masked_post_quant_fwd(
     output_scale: torch.Tensor,
     quant_group_size: int,
     masked_m: torch.Tensor,
+    scale_ue8m0: bool = False,
 ):
     """
     input shape [expert_num, token_num_padded, hidden_dim]
@@ -388,6 +395,7 @@ def silu_and_mul_masked_post_quant_fwd(
         BLOCK_N=BLOCK_N,
         NUM_STAGE=NUM_STAGES,
         num_warps=num_warps,
+        SCALE_UE8M0=scale_ue8m0,
     )
     return
 
@@ -481,8 +489,11 @@ def post_reorder_triton_kernel(
 
     computed = False
     store_ptr = output_ptr + src_idx * hidden_size
+
+    vec = tl.arange(0, BLOCK_SIZE)
+
     for start_offset in tl.range(0, hidden_size, BLOCK_SIZE):
-        offset = start_offset + tl.arange(0, BLOCK_SIZE)
+        offset = start_offset + vec
         mask = offset < hidden_size
 
         sum_vec = tl.zeros([BLOCK_SIZE], dtype=InDtype)
@@ -499,7 +510,7 @@ def post_reorder_triton_kernel(
 
     if computed == False:
         for start_offset in tl.range(0, hidden_size, BLOCK_SIZE):
-            offset = start_offset + tl.arange(0, BLOCK_SIZE)
+            offset = start_offset + vec
            mask = offset < hidden_size
             tl.store(
                 store_ptr + offset, tl.zeros([BLOCK_SIZE], dtype=InDtype), mask=mask
@@ -553,6 +564,7 @@ def grouped_gemm_triton_kernel(
     bs_stride_0: tl.constexpr,
     bs_stride_2: tl.constexpr,
     bs_stride_1: tl.constexpr,
+    use_per_token_if_dynamic: tl.constexpr,
     BLOCK_SIZE_M: tl.constexpr,
     BLOCK_SIZE_N: tl.constexpr,
     BLOCK_SIZE_K: tl.constexpr,
@@ -616,7 +628,10 @@ def grouped_gemm_triton_kernel(
         b_ptr += BLOCK_SIZE_K
 
     if use_fp8_w8a8 and not (group_k > 0 and group_n > 0):
-        scale_a_value = tl.load(scale_a + expert_id)
+        if use_per_token_if_dynamic:
+            scale_a_value = tl.load(scale_a + (m_range_start + offs_am[:, None]))
+        else:
+            scale_a_value = tl.load(scale_a + expert_id)
         scale_b_value = tl.load(scale_b + expert_id)
         accumulator *= scale_a_value * scale_b_value
 
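In the per-token case, scale_a holds one entry per activation row rather than one per expert, so the kernel indexes it with the output row indices (m_range_start + offs_am) and broadcasts over the N dimension. A dense reference of the rescaling step, with hypothetical shapes:

import torch

# Hypothetical sizes: 8 rows (tokens) routed to one expert, 16 output columns.
m, n = 8, 16
accumulator = torch.randn(m, n)  # GEMM accumulator for this expert's tile
scale_a = torch.rand(m)          # per-token activation scales
scale_b = torch.rand(1)          # per-expert weight scale

# Broadcast the per-row scales over the columns, as the kernel does with
# scale_a[m_range_start + offs_am[:, None]].
out = accumulator * scale_a[:, None] * scale_b
assert out.shape == (m, n)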
@@ -653,6 +668,7 @@ def grouped_gemm_triton(
     scale_b: torch.Tensor = None,
     block_shape: Optional[List[int]] = None,
     c_dtype=None,
+    use_per_token_if_dynamic: bool = True,
 ):
     assert weight_column_major == True  # TODO: more
     if use_fp8_w8a8 and block_shape is None:
@@ -693,6 +709,11 @@ def grouped_gemm_triton(
         triton.cdiv(b.size(1), META["BLOCK_SIZE_N"]),
     )
 
+    if use_fp8_w8a8 and block_shape is None and use_per_token_if_dynamic:
+        assert (
+            scale_a.shape[0] == a.shape[0]
+        ), f"scale_a.shape: {scale_a.shape}, a.shape: {a.shape}"
+
     grouped_gemm_triton_kernel[grid](
         a,
         b,
@@ -716,6 +737,7 @@ def grouped_gemm_triton(
         scale_b.stride(0) if scale_b is not None and scale_b.ndim >= 2 else 0,
         scale_b.stride(2) if scale_b is not None and scale_b.ndim == 3 else 0,
         scale_b.stride(1) if scale_b is not None and scale_b.ndim >= 2 else 0,
+        use_per_token_if_dynamic,
         **config,
     )
     return c