sglang 0.4.6.post5__py3-none-any.whl → 0.4.7.post1__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- sglang/__init__.py +2 -0
- sglang/api.py +7 -0
- sglang/bench_offline_throughput.py +10 -4
- sglang/bench_one_batch_server.py +67 -11
- sglang/bench_serving.py +86 -75
- sglang/lang/backend/runtime_endpoint.py +24 -1
- sglang/lang/interpreter.py +40 -1
- sglang/lang/ir.py +27 -0
- sglang/math_utils.py +8 -0
- sglang/profiler.py +167 -0
- sglang/srt/_custom_ops.py +34 -0
- sglang/srt/configs/internvl.py +8 -12
- sglang/srt/configs/model_config.py +33 -1
- sglang/srt/constrained/base_grammar_backend.py +5 -2
- sglang/srt/constrained/llguidance_backend.py +9 -8
- sglang/srt/constrained/outlines_backend.py +5 -4
- sglang/srt/constrained/xgrammar_backend.py +18 -18
- sglang/srt/conversation.py +52 -8
- sglang/srt/custom_op.py +38 -3
- sglang/srt/debug_utils.py +74 -0
- sglang/srt/disaggregation/base/__init__.py +1 -1
- sglang/srt/disaggregation/base/conn.py +25 -11
- sglang/srt/disaggregation/common/__init__.py +5 -0
- sglang/srt/disaggregation/common/conn.py +407 -0
- sglang/srt/disaggregation/common/utils.py +42 -0
- sglang/srt/disaggregation/decode.py +261 -52
- sglang/srt/disaggregation/fake/__init__.py +1 -1
- sglang/srt/disaggregation/fake/conn.py +16 -9
- sglang/srt/disaggregation/kv_events.py +60 -5
- sglang/srt/disaggregation/launch_lb.py +140 -0
- sglang/srt/disaggregation/mini_lb.py +29 -48
- sglang/srt/disaggregation/mooncake/__init__.py +1 -1
- sglang/srt/disaggregation/mooncake/conn.py +446 -149
- sglang/srt/disaggregation/mooncake/transfer_engine.py +32 -16
- sglang/srt/disaggregation/nixl/__init__.py +6 -1
- sglang/srt/disaggregation/nixl/conn.py +134 -437
- sglang/srt/disaggregation/prefill.py +130 -43
- sglang/srt/disaggregation/utils.py +127 -86
- sglang/srt/distributed/device_communicators/pymscclpp.py +315 -0
- sglang/srt/distributed/parallel_state.py +52 -5
- sglang/srt/entrypoints/EngineBase.py +6 -0
- sglang/srt/entrypoints/engine.py +116 -5
- sglang/srt/entrypoints/http_server.py +28 -4
- sglang/srt/eplb_simulator/__init__.py +1 -0
- sglang/srt/eplb_simulator/reader.py +51 -0
- sglang/srt/function_call/base_format_detector.py +138 -86
- sglang/srt/function_call/deepseekv3_detector.py +54 -6
- sglang/srt/function_call/ebnf_composer.py +33 -19
- sglang/srt/function_call/function_call_parser.py +27 -0
- sglang/srt/function_call/llama32_detector.py +33 -14
- sglang/srt/function_call/mistral_detector.py +73 -26
- sglang/srt/function_call/pythonic_detector.py +86 -20
- sglang/srt/function_call/qwen25_detector.py +64 -10
- sglang/srt/function_call/utils.py +17 -0
- sglang/srt/hf_transformers_utils.py +4 -0
- sglang/srt/layers/activation.py +19 -0
- sglang/srt/layers/attention/aiter_backend.py +503 -125
- sglang/srt/layers/attention/base_attn_backend.py +4 -0
- sglang/srt/layers/attention/cutlass_mla_backend.py +40 -34
- sglang/srt/layers/attention/flashattention_backend.py +137 -63
- sglang/srt/layers/attention/flashinfer_backend.py +46 -3
- sglang/srt/layers/attention/flashinfer_mla_backend.py +59 -25
- sglang/srt/layers/attention/flashmla_backend.py +2 -10
- sglang/srt/layers/attention/intel_amx_backend.py +128 -0
- sglang/srt/layers/attention/tbo_backend.py +232 -0
- sglang/srt/layers/attention/torch_native_backend.py +3 -0
- sglang/srt/layers/attention/triton_backend.py +304 -65
- sglang/srt/layers/attention/triton_ops/decode_attention.py +2 -7
- sglang/srt/layers/attention/triton_ops/extend_attention.py +12 -4
- sglang/srt/layers/attention/vision.py +51 -24
- sglang/srt/layers/communicator.py +281 -197
- sglang/srt/layers/dp_attention.py +6 -5
- sglang/srt/layers/layernorm.py +30 -19
- sglang/srt/layers/linear.py +0 -4
- sglang/srt/layers/logits_processor.py +0 -12
- sglang/srt/layers/moe/cutlass_moe.py +170 -7
- sglang/srt/layers/moe/cutlass_moe_params.py +169 -0
- sglang/srt/layers/moe/ep_moe/kernels.py +33 -11
- sglang/srt/layers/moe/ep_moe/layer.py +136 -72
- sglang/srt/layers/moe/ep_moe/token_dispatcher.py +24 -45
- sglang/srt/layers/moe/fused_moe_native.py +4 -0
- sglang/srt/layers/moe/fused_moe_triton/configs/triton_3_2_0/E=257,N=128,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8,block_shape=[128, 128].json +146 -0
- sglang/srt/layers/moe/fused_moe_triton/configs/triton_3_2_0/E=257,N=128,device_name=NVIDIA_H20,dtype=fp8_w8a8,block_shape=[128, 128].json +146 -0
- sglang/srt/layers/moe/fused_moe_triton/configs/triton_3_2_0/E=257,N=256,device_name=NVIDIA_B200,dtype=fp8_w8a8,block_shape=[128, 128].json +146 -0
- sglang/srt/layers/moe/fused_moe_triton/configs/triton_3_2_0/E=257,N=256,device_name=NVIDIA_H20,dtype=fp8_w8a8,block_shape=[128, 128].json +146 -0
- sglang/srt/layers/moe/fused_moe_triton/configs/triton_3_2_0/E=257,N=256,device_name=NVIDIA_H200,dtype=fp8_w8a8,block_shape=[128, 128].json +146 -0
- sglang/srt/layers/moe/fused_moe_triton/configs/triton_3_2_0/E=8,N=7168,device_name=NVIDIA_H100_80GB_HBM3.json +146 -0
- sglang/srt/layers/moe/fused_moe_triton/configs/triton_3_3_1/E=257,N=256,device_name=NVIDIA_B200,dtype=fp8_w8a8,block_shape=[128, 128].json +146 -0
- sglang/srt/layers/moe/fused_moe_triton/configs/triton_3_3_1/E=8,N=7168,device_name=NVIDIA_H100_80GB_HBM3.json +146 -0
- sglang/srt/layers/moe/fused_moe_triton/fused_moe.py +221 -29
- sglang/srt/layers/moe/fused_moe_triton/layer.py +34 -4
- sglang/srt/layers/moe/topk.py +60 -26
- sglang/srt/layers/multimodal.py +3 -3
- sglang/srt/layers/pooler.py +56 -0
- sglang/srt/layers/quantization/__init__.py +3 -2
- sglang/srt/layers/quantization/blockwise_int8.py +3 -0
- sglang/srt/layers/quantization/compressed_tensors/compressed_tensors_moe.py +5 -0
- sglang/srt/layers/quantization/deep_gemm_wrapper/__init__.py +1 -0
- sglang/srt/layers/quantization/{deep_gemm.py → deep_gemm_wrapper/compile_utils.py} +69 -127
- sglang/srt/layers/quantization/deep_gemm_wrapper/configurer.py +32 -0
- sglang/srt/layers/quantization/deep_gemm_wrapper/entrypoint.py +110 -0
- sglang/srt/layers/quantization/fp8.py +28 -23
- sglang/srt/layers/quantization/fp8_kernel.py +156 -75
- sglang/srt/layers/quantization/fp8_utils.py +250 -69
- sglang/srt/layers/quantization/modelopt_quant.py +334 -7
- sglang/srt/layers/quantization/moe_wna16.py +3 -0
- sglang/srt/layers/quantization/w8a8_fp8.py +3 -0
- sglang/srt/layers/quantization/w8a8_int8.py +3 -0
- sglang/srt/layers/radix_attention.py +2 -3
- sglang/srt/layers/rotary_embedding.py +6 -12
- sglang/srt/layers/sampler.py +80 -79
- sglang/srt/layers/utils.py +6 -0
- sglang/srt/lora/layers.py +12 -15
- sglang/srt/lora/lora.py +49 -5
- sglang/srt/lora/lora_manager.py +98 -39
- sglang/srt/lora/mem_pool.py +28 -21
- sglang/srt/lora/utils.py +17 -13
- sglang/srt/managers/cache_controller.py +2 -1
- sglang/srt/managers/data_parallel_controller.py +13 -5
- sglang/srt/managers/eplb_algorithms/__init__.py +63 -0
- sglang/srt/managers/eplb_algorithms/deepseek.py +223 -0
- sglang/srt/managers/{deepseek_eplb.py → eplb_algorithms/deepseek_vec.py} +5 -7
- sglang/srt/managers/eplb_manager.py +55 -14
- sglang/srt/managers/expert_distribution.py +220 -46
- sglang/srt/managers/expert_location.py +110 -56
- sglang/srt/managers/expert_location_dispatch.py +23 -6
- sglang/srt/managers/io_struct.py +43 -8
- sglang/srt/managers/mm_utils.py +88 -38
- sglang/srt/managers/multimodal_processors/base_processor.py +190 -18
- sglang/srt/managers/multimodal_processors/gemma3.py +4 -31
- sglang/srt/managers/multimodal_processors/internvl.py +4 -0
- sglang/srt/managers/multimodal_processors/kimi_vl.py +15 -34
- sglang/srt/managers/multimodal_processors/minicpm.py +2 -1
- sglang/srt/managers/multimodal_processors/phi4mm.py +87 -0
- sglang/srt/managers/multimodal_processors/qwen_vl.py +22 -64
- sglang/srt/managers/multimodal_processors/vila.py +85 -0
- sglang/srt/managers/schedule_batch.py +173 -38
- sglang/srt/managers/scheduler.py +376 -127
- sglang/srt/managers/tokenizer_manager.py +163 -19
- sglang/srt/managers/utils.py +0 -4
- sglang/srt/mem_cache/chunk_cache.py +1 -0
- sglang/srt/mem_cache/hiradix_cache.py +4 -2
- sglang/srt/mem_cache/memory_pool.py +111 -407
- sglang/srt/mem_cache/memory_pool_host.py +380 -0
- sglang/srt/mem_cache/radix_cache.py +36 -12
- sglang/srt/metrics/collector.py +9 -0
- sglang/srt/model_executor/cuda_graph_runner.py +191 -113
- sglang/srt/model_executor/expert_location_updater.py +157 -22
- sglang/srt/model_executor/forward_batch_info.py +52 -22
- sglang/srt/model_executor/model_runner.py +102 -62
- sglang/srt/model_loader/loader.py +8 -1
- sglang/srt/model_loader/utils.py +67 -1
- sglang/srt/models/bert.py +113 -13
- sglang/srt/models/deepseek_nextn.py +1 -1
- sglang/srt/models/deepseek_v2.py +623 -290
- sglang/srt/models/gemma3_causal.py +7 -0
- sglang/srt/models/gemma3_mm.py +19 -14
- sglang/srt/models/idefics2.py +342 -0
- sglang/srt/models/internvl.py +46 -102
- sglang/srt/models/kimi_vl.py +4 -4
- sglang/srt/models/llama.py +1 -1
- sglang/srt/models/minicpmo.py +2 -5
- sglang/srt/models/minicpmv.py +3 -295
- sglang/srt/models/phi4mm.py +512 -0
- sglang/srt/models/qwen2.py +38 -9
- sglang/srt/models/qwen2_5_vl.py +3 -9
- sglang/srt/models/qwen2_eagle.py +4 -1
- sglang/srt/models/qwen2_moe.py +58 -191
- sglang/srt/models/qwen2_vl.py +3 -9
- sglang/srt/models/qwen3.py +41 -10
- sglang/srt/models/qwen3_moe.py +230 -191
- sglang/srt/models/registry.py +9 -1
- sglang/srt/models/roberta.py +117 -9
- sglang/srt/models/transformers.py +291 -0
- sglang/srt/models/vila.py +305 -0
- sglang/srt/openai_api/adapter.py +248 -28
- sglang/srt/openai_api/protocol.py +68 -3
- sglang/srt/openai_api/utils.py +172 -0
- sglang/srt/operations.py +37 -2
- sglang/srt/operations_strategy.py +200 -24
- sglang/srt/sampling/sampling_batch_info.py +37 -1
- sglang/srt/sampling/sampling_params.py +4 -1
- sglang/srt/server_args.py +381 -209
- sglang/srt/speculative/build_eagle_tree.py +9 -9
- sglang/srt/speculative/eagle_draft_cuda_graph_runner.py +12 -14
- sglang/srt/speculative/eagle_draft_extend_cuda_graph_runner.py +256 -0
- sglang/srt/speculative/eagle_utils.py +440 -200
- sglang/srt/speculative/eagle_worker.py +234 -63
- sglang/srt/two_batch_overlap.py +637 -0
- sglang/srt/utils.py +187 -7
- sglang/test/attention/test_prefix_chunk_info.py +2 -0
- sglang/test/runners.py +54 -10
- sglang/test/send_one.py +4 -0
- sglang/test/test_block_fp8.py +1 -0
- sglang/test/test_block_fp8_deep_gemm_blackwell.py +252 -0
- sglang/test/test_block_fp8_ep.py +1 -0
- sglang/test/test_cutlass_moe.py +3 -3
- sglang/test/test_fp4_moe.py +248 -0
- sglang/test/test_utils.py +82 -7
- sglang/utils.py +9 -0
- sglang/version.py +1 -1
- {sglang-0.4.6.post5.dist-info → sglang-0.4.7.post1.dist-info}/METADATA +17 -14
- {sglang-0.4.6.post5.dist-info → sglang-0.4.7.post1.dist-info}/RECORD +359 -321
- {sglang-0.4.6.post5.dist-info → sglang-0.4.7.post1.dist-info}/WHEEL +1 -1
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=1,N=14336,device_name=NVIDIA_A100-SXM4-80GB,dtype=int8_w8a16.json → triton_3_1_0/E=1,N=14336,device_name=NVIDIA_A100-SXM4-80GB,dtype=int8_w8a16.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=1,N=14336,device_name=NVIDIA_A100-SXM4-80GB.json → triton_3_1_0/E=1,N=14336,device_name=NVIDIA_A100-SXM4-80GB.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=1,N=1792,device_name=NVIDIA_A100-SXM4-80GB,dtype=int8_w8a16.json → triton_3_1_0/E=1,N=1792,device_name=NVIDIA_A100-SXM4-80GB,dtype=int8_w8a16.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=1,N=1792,device_name=NVIDIA_A100-SXM4-80GB.json → triton_3_1_0/E=1,N=1792,device_name=NVIDIA_A100-SXM4-80GB.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=1,N=3072,device_name=NVIDIA_A100-SXM4-80GB,dtype=int8_w8a16.json → triton_3_1_0/E=1,N=3072,device_name=NVIDIA_A100-SXM4-80GB,dtype=int8_w8a16.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=1,N=3072,device_name=NVIDIA_H100_80GB_HBM3,dtype=int8_w8a16.json → triton_3_1_0/E=1,N=3072,device_name=NVIDIA_H100_80GB_HBM3,dtype=int8_w8a16.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=1,N=3072,device_name=NVIDIA_H100_80GB_HBM3.json → triton_3_1_0/E=1,N=3072,device_name=NVIDIA_H100_80GB_HBM3.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=1,N=3584,device_name=NVIDIA_A100-SXM4-80GB,dtype=int8_w8a16.json → triton_3_1_0/E=1,N=3584,device_name=NVIDIA_A100-SXM4-80GB,dtype=int8_w8a16.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=1,N=3584,device_name=NVIDIA_A100-SXM4-80GB.json → triton_3_1_0/E=1,N=3584,device_name=NVIDIA_A100-SXM4-80GB.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=1,N=7168,device_name=NVIDIA_A100-SXM4-80GB,dtype=int8_w8a16.json → triton_3_1_0/E=1,N=7168,device_name=NVIDIA_A100-SXM4-80GB,dtype=int8_w8a16.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=1,N=7168,device_name=NVIDIA_A100-SXM4-80GB.json → triton_3_1_0/E=1,N=7168,device_name=NVIDIA_A100-SXM4-80GB.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=144,N=512,device_name=NVIDIA_H100_80GB_HBM3.json → triton_3_1_0/E=144,N=512,device_name=NVIDIA_H100_80GB_HBM3.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=16,N=1024,device_name=NVIDIA_H100_80GB_HBM3.json → triton_3_1_0/E=16,N=1024,device_name=NVIDIA_H100_80GB_HBM3.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=16,N=1024,device_name=NVIDIA_H200.json → triton_3_1_0/E=16,N=1024,device_name=NVIDIA_H200.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=16,N=1344,device_name=NVIDIA_A100-SXM4-40GB.json → triton_3_1_0/E=16,N=1344,device_name=NVIDIA_A100-SXM4-40GB.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=16,N=1344,device_name=NVIDIA_A100-SXM4-80GB.json → triton_3_1_0/E=16,N=1344,device_name=NVIDIA_A100-SXM4-80GB.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=16,N=1344,device_name=NVIDIA_H100_80GB_HBM3.json → triton_3_1_0/E=16,N=1344,device_name=NVIDIA_H100_80GB_HBM3.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=16,N=14336,device_name=NVIDIA_A100-SXM4-80GB,dtype=int8_w8a16.json → triton_3_1_0/E=16,N=14336,device_name=NVIDIA_A100-SXM4-80GB,dtype=int8_w8a16.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=16,N=14336,device_name=NVIDIA_A100-SXM4-80GB.json → triton_3_1_0/E=16,N=14336,device_name=NVIDIA_A100-SXM4-80GB.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=16,N=1792,device_name=NVIDIA_A100-SXM4-80GB,dtype=int8_w8a16.json → triton_3_1_0/E=16,N=1792,device_name=NVIDIA_A100-SXM4-80GB,dtype=int8_w8a16.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=16,N=1792,device_name=NVIDIA_A100-SXM4-80GB.json → triton_3_1_0/E=16,N=1792,device_name=NVIDIA_A100-SXM4-80GB.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=16,N=2048,device_name=NVIDIA_H100_80GB_HBM3.json → triton_3_1_0/E=16,N=2048,device_name=NVIDIA_H100_80GB_HBM3.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=16,N=2688,device_name=NVIDIA_A100-SXM4-80GB.json → triton_3_1_0/E=16,N=2688,device_name=NVIDIA_A100-SXM4-80GB.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=16,N=2688,device_name=NVIDIA_H100_80GB_HBM3.json → triton_3_1_0/E=16,N=2688,device_name=NVIDIA_H100_80GB_HBM3.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=16,N=3072,device_name=NVIDIA_A100-SXM4-80GB,dtype=int8_w8a16.json → triton_3_1_0/E=16,N=3072,device_name=NVIDIA_A100-SXM4-80GB,dtype=int8_w8a16.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=16,N=3072,device_name=NVIDIA_H100_80GB_HBM3,dtype=int8_w8a16.json → triton_3_1_0/E=16,N=3072,device_name=NVIDIA_H100_80GB_HBM3,dtype=int8_w8a16.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=16,N=3200,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8.json → triton_3_1_0/E=16,N=3200,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=16,N=3584,device_name=NVIDIA_A100-SXM4-80GB,dtype=int8_w8a16.json → triton_3_1_0/E=16,N=3584,device_name=NVIDIA_A100-SXM4-80GB,dtype=int8_w8a16.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=16,N=3584,device_name=NVIDIA_A100-SXM4-80GB.json → triton_3_1_0/E=16,N=3584,device_name=NVIDIA_A100-SXM4-80GB.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=16,N=6400,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8.json → triton_3_1_0/E=16,N=6400,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=16,N=7168,device_name=NVIDIA_A100-SXM4-80GB,dtype=int8_w8a16.json → triton_3_1_0/E=16,N=7168,device_name=NVIDIA_A100-SXM4-80GB,dtype=int8_w8a16.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=16,N=7168,device_name=NVIDIA_A100-SXM4-80GB.json → triton_3_1_0/E=16,N=7168,device_name=NVIDIA_A100-SXM4-80GB.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=16,N=7168,device_name=NVIDIA_H100_80GB_HBM3,dtype=int8_w8a16.json → triton_3_1_0/E=16,N=7168,device_name=NVIDIA_H100_80GB_HBM3,dtype=int8_w8a16.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=16,N=800,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8.json → triton_3_1_0/E=16,N=800,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=160,N=192,device_name=NVIDIA_A800-SXM4-80GB.json → triton_3_1_0/E=160,N=192,device_name=NVIDIA_A800-SXM4-80GB.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=20,N=2048,device_name=NVIDIA_H100_80GB_HBM3.json → triton_3_1_0/E=20,N=2048,device_name=NVIDIA_H100_80GB_HBM3.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=24,N=1024,device_name=NVIDIA_H100_80GB_HBM3.json → triton_3_1_0/E=24,N=1024,device_name=NVIDIA_H100_80GB_HBM3.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=256,N=128,device_name=NVIDIA_A100-SXM4-80GB,dtype=int8_w8a8,block_shape=[128, 128].json → triton_3_1_0/E=256,N=128,device_name=NVIDIA_A100-SXM4-80GB,dtype=int8_w8a8,block_shape=[128, 128].json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=256,N=128,device_name=NVIDIA_A100-SXM4-80GB,dtype=int8_w8a8.json → triton_3_1_0/E=256,N=128,device_name=NVIDIA_A100-SXM4-80GB,dtype=int8_w8a8.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=256,N=128,device_name=NVIDIA_A800-SXM4-80GB,dtype=int8_w8a8,block_shape=[128, 128].json → triton_3_1_0/E=256,N=128,device_name=NVIDIA_A800-SXM4-80GB,dtype=int8_w8a8,block_shape=[128, 128].json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=256,N=128,device_name=NVIDIA_A800-SXM4-80GB,dtype=int8_w8a8.json → triton_3_1_0/E=256,N=128,device_name=NVIDIA_A800-SXM4-80GB,dtype=int8_w8a8.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=256,N=128,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8,block_shape=[128, 128].json → triton_3_1_0/E=256,N=128,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8,block_shape=[128, 128].json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=256,N=128,device_name=NVIDIA_H20,block_shape=[128, 128].json → triton_3_1_0/E=256,N=128,device_name=NVIDIA_H20,block_shape=[128, 128].json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=256,N=128,device_name=NVIDIA_H20,dtype=fp8_w8a8,block_shape=[128, 128].json → triton_3_1_0/E=256,N=128,device_name=NVIDIA_H20,dtype=fp8_w8a8,block_shape=[128, 128].json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=256,N=128,device_name=NVIDIA_L20Y,dtype=fp8_w8a8,block_shape=[128, 128].json → triton_3_1_0/E=256,N=128,device_name=NVIDIA_L20Y,dtype=fp8_w8a8,block_shape=[128, 128].json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=256,N=256,device_name=AMD_Instinct_MI300X,dtype=fp8_w8a8,block_shape=[128, 128].json → triton_3_1_0/E=256,N=256,device_name=AMD_Instinct_MI300X,dtype=fp8_w8a8,block_shape=[128, 128].json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=256,N=256,device_name=AMD_Instinct_MI325X,dtype=fp8_w8a8,block_shape=[128, 128].json → triton_3_1_0/E=256,N=256,device_name=AMD_Instinct_MI325X,dtype=fp8_w8a8,block_shape=[128, 128].json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=256,N=256,device_name=AMD_Radeon_Graphics,dtype=fp8_w8a8,block_shape=[128, 128].json → triton_3_1_0/E=256,N=256,device_name=AMD_Radeon_Graphics,dtype=fp8_w8a8,block_shape=[128, 128].json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=256,N=256,device_name=NVIDIA_B200,dtype=fp8_w8a8,block_shape=[128, 128].json → triton_3_1_0/E=256,N=256,device_name=NVIDIA_B200,dtype=fp8_w8a8,block_shape=[128, 128].json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=256,N=256,device_name=NVIDIA_H20,dtype=int8_w8a8,block_shape=[128, 128].json → triton_3_1_0/E=256,N=256,device_name=NVIDIA_H20,dtype=int8_w8a8,block_shape=[128, 128].json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=256,N=256,device_name=NVIDIA_H200,dtype=fp8_w8a8,block_shape=[128, 128].json → triton_3_1_0/E=256,N=256,device_name=NVIDIA_H200,dtype=fp8_w8a8,block_shape=[128, 128].json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=256,N=64,device_name=NVIDIA_A800-SXM4-80GB.json → triton_3_1_0/E=256,N=64,device_name=NVIDIA_A800-SXM4-80GB.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=256,N=64,device_name=NVIDIA_L20,dtype=int8_w8a8.json → triton_3_1_0/E=256,N=64,device_name=NVIDIA_L20,dtype=int8_w8a8.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=256,N=64,device_name=NVIDIA_L40S,dtype=int8_w8a8.json → triton_3_1_0/E=256,N=64,device_name=NVIDIA_L40S,dtype=int8_w8a8.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=64,N=1024,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8,block_shape=[128, 128].json → triton_3_1_0/E=64,N=1024,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8,block_shape=[128, 128].json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=64,N=1280,device_name=NVIDIA_A100-SXM4-80GB.json → triton_3_1_0/E=64,N=1280,device_name=NVIDIA_A100-SXM4-80GB.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=64,N=1280,device_name=NVIDIA_A800-SXM4-80GB.json → triton_3_1_0/E=64,N=1280,device_name=NVIDIA_A800-SXM4-80GB.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=64,N=1280,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8.json → triton_3_1_0/E=64,N=1280,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=64,N=1280,device_name=NVIDIA_H100_80GB_HBM3.json → triton_3_1_0/E=64,N=1280,device_name=NVIDIA_H100_80GB_HBM3.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=64,N=1280,device_name=NVIDIA_H200,dtype=fp8_w8a8.json → triton_3_1_0/E=64,N=1280,device_name=NVIDIA_H200,dtype=fp8_w8a8.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=64,N=1280,device_name=NVIDIA_H200.json → triton_3_1_0/E=64,N=1280,device_name=NVIDIA_H200.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=64,N=2560,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8.json → triton_3_1_0/E=64,N=2560,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=64,N=2560,device_name=NVIDIA_H200,dtype=fp8_w8a8.json → triton_3_1_0/E=64,N=2560,device_name=NVIDIA_H200,dtype=fp8_w8a8.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=64,N=2560,device_name=NVIDIA_H200.json → triton_3_1_0/E=64,N=2560,device_name=NVIDIA_H200.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=64,N=320,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8.json → triton_3_1_0/E=64,N=320,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=64,N=320,device_name=NVIDIA_H100_80GB_HBM3.json → triton_3_1_0/E=64,N=320,device_name=NVIDIA_H100_80GB_HBM3.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=64,N=320,device_name=NVIDIA_H200,dtype=fp8_w8a8.json → triton_3_1_0/E=64,N=320,device_name=NVIDIA_H200,dtype=fp8_w8a8.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=64,N=320,device_name=NVIDIA_H200.json → triton_3_1_0/E=64,N=320,device_name=NVIDIA_H200.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=64,N=512,device_name=NVIDIA_H200,dtype=fp8_w8a8,block_shape=[128, 128].json → triton_3_1_0/E=64,N=512,device_name=NVIDIA_H200,dtype=fp8_w8a8,block_shape=[128, 128].json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=64,N=640,device_name=NVIDIA_A100-SXM4-80GB.json → triton_3_1_0/E=64,N=640,device_name=NVIDIA_A100-SXM4-80GB.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=64,N=640,device_name=NVIDIA_A800-SXM4-80GB.json → triton_3_1_0/E=64,N=640,device_name=NVIDIA_A800-SXM4-80GB.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=64,N=640,device_name=NVIDIA_GeForce_RTX_4090,dtype=fp8_w8a8.json → triton_3_1_0/E=64,N=640,device_name=NVIDIA_GeForce_RTX_4090,dtype=fp8_w8a8.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=64,N=640,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8.json → triton_3_1_0/E=64,N=640,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=64,N=640,device_name=NVIDIA_H100_80GB_HBM3.json → triton_3_1_0/E=64,N=640,device_name=NVIDIA_H100_80GB_HBM3.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=64,N=640,device_name=NVIDIA_H200,dtype=fp8_w8a8.json → triton_3_1_0/E=64,N=640,device_name=NVIDIA_H200,dtype=fp8_w8a8.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=64,N=640,device_name=NVIDIA_H200.json → triton_3_1_0/E=64,N=640,device_name=NVIDIA_H200.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=8,N=14336,device_name=AMD_Instinct_MI300X.json → triton_3_1_0/E=8,N=14336,device_name=AMD_Instinct_MI300X.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=8,N=14336,device_name=AMD_Instinct_MI325X.json → triton_3_1_0/E=8,N=14336,device_name=AMD_Instinct_MI325X.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=8,N=14336,device_name=AMD_Radeon_Graphics.json → triton_3_1_0/E=8,N=14336,device_name=AMD_Radeon_Graphics.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=8,N=14336,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8.json → triton_3_1_0/E=8,N=14336,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=8,N=14336,device_name=NVIDIA_H200,dtype=fp8_w8a8.json → triton_3_1_0/E=8,N=14336,device_name=NVIDIA_H200,dtype=fp8_w8a8.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=8,N=14336,device_name=NVIDIA_H200.json → triton_3_1_0/E=8,N=14336,device_name=NVIDIA_H200.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=8,N=1792,device_name=AMD_Instinct_MI300X.json → triton_3_1_0/E=8,N=1792,device_name=AMD_Instinct_MI300X.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=8,N=1792,device_name=AMD_Instinct_MI325X.json → triton_3_1_0/E=8,N=1792,device_name=AMD_Instinct_MI325X.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=8,N=1792,device_name=AMD_Radeon_Graphics.json → triton_3_1_0/E=8,N=1792,device_name=AMD_Radeon_Graphics.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=8,N=1792,device_name=NVIDIA_A100-SXM4-40GB.json → triton_3_1_0/E=8,N=1792,device_name=NVIDIA_A100-SXM4-40GB.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=8,N=1792,device_name=NVIDIA_A100-SXM4-80GB.json → triton_3_1_0/E=8,N=1792,device_name=NVIDIA_A100-SXM4-80GB.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=8,N=1792,device_name=NVIDIA_H100_80GB_HBM3.json → triton_3_1_0/E=8,N=1792,device_name=NVIDIA_H100_80GB_HBM3.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=8,N=1792,device_name=NVIDIA_H200,dtype=fp8_w8a8.json → triton_3_1_0/E=8,N=1792,device_name=NVIDIA_H200,dtype=fp8_w8a8.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=8,N=1792,device_name=NVIDIA_H200.json → triton_3_1_0/E=8,N=1792,device_name=NVIDIA_H200.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=8,N=2048,device_name=NVIDIA_A100-SXM4-80GB.json → triton_3_1_0/E=8,N=2048,device_name=NVIDIA_A100-SXM4-80GB.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=8,N=2048,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8.json → triton_3_1_0/E=8,N=2048,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=8,N=2048,device_name=NVIDIA_H100_80GB_HBM3.json → triton_3_1_0/E=8,N=2048,device_name=NVIDIA_H100_80GB_HBM3.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=8,N=2048,device_name=NVIDIA_H200,dtype=fp8_w8a8.json → triton_3_1_0/E=8,N=2048,device_name=NVIDIA_H200,dtype=fp8_w8a8.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=8,N=2048,device_name=NVIDIA_H200.json → triton_3_1_0/E=8,N=2048,device_name=NVIDIA_H200.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=8,N=3584,device_name=AMD_Instinct_MI300X.json → triton_3_1_0/E=8,N=3584,device_name=AMD_Instinct_MI300X.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=8,N=3584,device_name=AMD_Instinct_MI325X.json → triton_3_1_0/E=8,N=3584,device_name=AMD_Instinct_MI325X.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=8,N=3584,device_name=AMD_Radeon_Graphics.json → triton_3_1_0/E=8,N=3584,device_name=AMD_Radeon_Graphics.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=8,N=3584,device_name=NVIDIA_A100-SXM4-40GB.json → triton_3_1_0/E=8,N=3584,device_name=NVIDIA_A100-SXM4-40GB.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=8,N=3584,device_name=NVIDIA_A100-SXM4-80GB.json → triton_3_1_0/E=8,N=3584,device_name=NVIDIA_A100-SXM4-80GB.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=8,N=3584,device_name=NVIDIA_GeForce_RTX_4090,dtype=fp8_w8a8.json → triton_3_1_0/E=8,N=3584,device_name=NVIDIA_GeForce_RTX_4090,dtype=fp8_w8a8.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=8,N=3584,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8.json → triton_3_1_0/E=8,N=3584,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=8,N=3584,device_name=NVIDIA_H100_80GB_HBM3.json → triton_3_1_0/E=8,N=3584,device_name=NVIDIA_H100_80GB_HBM3.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=8,N=3584,device_name=NVIDIA_H200,dtype=fp8_w8a8.json → triton_3_1_0/E=8,N=3584,device_name=NVIDIA_H200,dtype=fp8_w8a8.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=8,N=3584,device_name=NVIDIA_H200.json → triton_3_1_0/E=8,N=3584,device_name=NVIDIA_H200.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=8,N=3584,device_name=NVIDIA_L40S.json → triton_3_1_0/E=8,N=3584,device_name=NVIDIA_L40S.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=8,N=4096,device_name=AMD_Instinct_MI300X,dtype=fp8_w8a8.json → triton_3_1_0/E=8,N=4096,device_name=AMD_Instinct_MI300X,dtype=fp8_w8a8.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=8,N=4096,device_name=AMD_Instinct_MI325X,dtype=fp8_w8a8.json → triton_3_1_0/E=8,N=4096,device_name=AMD_Instinct_MI325X,dtype=fp8_w8a8.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=8,N=4096,device_name=AMD_Radeon_Graphics,dtype=fp8_w8a8.json → triton_3_1_0/E=8,N=4096,device_name=AMD_Radeon_Graphics,dtype=fp8_w8a8.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=8,N=4096,device_name=NVIDIA_A100-SXM4-80GB.json → triton_3_1_0/E=8,N=4096,device_name=NVIDIA_A100-SXM4-80GB.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=8,N=4096,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8.json → triton_3_1_0/E=8,N=4096,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=8,N=4096,device_name=NVIDIA_H100_80GB_HBM3.json → triton_3_1_0/E=8,N=4096,device_name=NVIDIA_H100_80GB_HBM3.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=8,N=4096,device_name=NVIDIA_H200,dtype=fp8_w8a8.json → triton_3_1_0/E=8,N=4096,device_name=NVIDIA_H200,dtype=fp8_w8a8.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=8,N=4096,device_name=NVIDIA_H200.json → triton_3_1_0/E=8,N=4096,device_name=NVIDIA_H200.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=8,N=7168,device_name=AMD_Instinct_MI300X.json → triton_3_1_0/E=8,N=7168,device_name=AMD_Instinct_MI300X.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=8,N=7168,device_name=AMD_Instinct_MI325X.json → triton_3_1_0/E=8,N=7168,device_name=AMD_Instinct_MI325X.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=8,N=7168,device_name=AMD_Radeon_Graphics.json → triton_3_1_0/E=8,N=7168,device_name=AMD_Radeon_Graphics.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=8,N=7168,device_name=NVIDIA_A100-SXM4-80GB.json → triton_3_1_0/E=8,N=7168,device_name=NVIDIA_A100-SXM4-80GB.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=8,N=7168,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8.json → triton_3_1_0/E=8,N=7168,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=8,N=7168,device_name=NVIDIA_H100_80GB_HBM3.json → triton_3_1_0/E=8,N=7168,device_name=NVIDIA_H100_80GB_HBM3.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=8,N=7168,device_name=NVIDIA_H200,dtype=fp8_w8a8.json → triton_3_1_0/E=8,N=7168,device_name=NVIDIA_H200,dtype=fp8_w8a8.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=8,N=7168,device_name=NVIDIA_H200.json → triton_3_1_0/E=8,N=7168,device_name=NVIDIA_H200.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=8,N=8192,device_name=AMD_Instinct_MI300X,dtype=fp8_w8a8.json → triton_3_1_0/E=8,N=8192,device_name=AMD_Instinct_MI300X,dtype=fp8_w8a8.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=8,N=8192,device_name=AMD_Instinct_MI325X,dtype=fp8_w8a8.json → triton_3_1_0/E=8,N=8192,device_name=AMD_Instinct_MI325X,dtype=fp8_w8a8.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=8,N=8192,device_name=AMD_Radeon_Graphics,dtype=fp8_w8a8.json → triton_3_1_0/E=8,N=8192,device_name=AMD_Radeon_Graphics,dtype=fp8_w8a8.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=8,N=8192,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8.json → triton_3_1_0/E=8,N=8192,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=8,N=8192,device_name=NVIDIA_H200,dtype=fp8_w8a8.json → triton_3_1_0/E=8,N=8192,device_name=NVIDIA_H200,dtype=fp8_w8a8.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=128,N=192,device_name=NVIDIA_A800-SXM4-80GB.json → triton_3_2_0/E=128,N=192,device_name=NVIDIA_A800-SXM4-80GB.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=128,N=192,device_name=NVIDIA_H100_80GB_HBM3.json → triton_3_2_0/E=128,N=192,device_name=NVIDIA_H100_80GB_HBM3.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=128,N=192,device_name=NVIDIA_H20.json → triton_3_2_0/E=128,N=192,device_name=NVIDIA_H20.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=128,N=192,device_name=NVIDIA_H200.json → triton_3_2_0/E=128,N=192,device_name=NVIDIA_H200.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=128,N=384,device_name=NVIDIA_H100_80GB_HBM3.json → triton_3_2_0/E=128,N=384,device_name=NVIDIA_H100_80GB_HBM3.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=128,N=384,device_name=NVIDIA_H20,dtype=fp8_w8a8,block_shape=[128, 128].json → triton_3_2_0/E=128,N=384,device_name=NVIDIA_H20,dtype=fp8_w8a8,block_shape=[128, 128].json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=128,N=384,device_name=NVIDIA_H20.json → triton_3_2_0/E=128,N=384,device_name=NVIDIA_H20.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=128,N=384,device_name=NVIDIA_H200,dtype=fp8_w8a8,block_shape=[128, 128].json → triton_3_2_0/E=128,N=384,device_name=NVIDIA_H200,dtype=fp8_w8a8,block_shape=[128, 128].json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=128,N=384,device_name=NVIDIA_H200.json → triton_3_2_0/E=128,N=384,device_name=NVIDIA_H200.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=128,N=512,device_name=NVIDIA_H100_80GB_HBM3.json → triton_3_2_0/E=128,N=512,device_name=NVIDIA_H100_80GB_HBM3.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=128,N=768,device_name=NVIDIA_A800-SXM4-80GB.json → triton_3_2_0/E=128,N=768,device_name=NVIDIA_A800-SXM4-80GB.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=128,N=768,device_name=NVIDIA_H100_80GB_HBM3.json → triton_3_2_0/E=128,N=768,device_name=NVIDIA_H100_80GB_HBM3.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=128,N=768,device_name=NVIDIA_H20,dtype=fp8_w8a8,block_shape=[128, 128].json → triton_3_2_0/E=128,N=768,device_name=NVIDIA_H20,dtype=fp8_w8a8,block_shape=[128, 128].json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=128,N=768,device_name=NVIDIA_H20.json → triton_3_2_0/E=128,N=768,device_name=NVIDIA_H20.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=128,N=768,device_name=NVIDIA_H200,dtype=fp8_w8a8,block_shape=[128, 128].json → triton_3_2_0/E=128,N=768,device_name=NVIDIA_H200,dtype=fp8_w8a8,block_shape=[128, 128].json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=128,N=768,device_name=NVIDIA_H200.json → triton_3_2_0/E=128,N=768,device_name=NVIDIA_H200.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=128,N=96,device_name=NVIDIA_H20.json → triton_3_2_0/E=128,N=96,device_name=NVIDIA_H20.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=264,N=128,device_name=NVIDIA_A800-SXM4-80GB,dtype=int8_w8a8.json → triton_3_2_0/E=264,N=128,device_name=NVIDIA_A800-SXM4-80GB,dtype=int8_w8a8.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=264,N=256,device_name=NVIDIA_B200,dtype=fp8_w8a8,block_shape=[128, 128].json → triton_3_2_0/E=264,N=256,device_name=NVIDIA_B200,dtype=fp8_w8a8,block_shape=[128, 128].json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=264,N=256,device_name=NVIDIA_H20,dtype=fp8_w8a8,block_shape=[128, 128].json → triton_3_2_0/E=264,N=256,device_name=NVIDIA_H20,dtype=fp8_w8a8,block_shape=[128, 128].json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=264,N=256,device_name=NVIDIA_H200,dtype=fp8_w8a8,block_shape=[128, 128].json → triton_3_2_0/E=264,N=256,device_name=NVIDIA_H200,dtype=fp8_w8a8,block_shape=[128, 128].json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=272,N=128,device_name=NVIDIA_A800-SXM4-80GB,dtype=int8_w8a8.json → triton_3_2_0/E=272,N=128,device_name=NVIDIA_A800-SXM4-80GB,dtype=int8_w8a8.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=272,N=128,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8,block_shape=[128, 128].json → triton_3_2_0/E=272,N=128,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8,block_shape=[128, 128].json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=272,N=128,device_name=NVIDIA_H20,dtype=fp8_w8a8,block_shape=[128, 128].json → triton_3_2_0/E=272,N=128,device_name=NVIDIA_H20,dtype=fp8_w8a8,block_shape=[128, 128].json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=272,N=64,device_name=NVIDIA_A800-SXM4-80GB.json → triton_3_2_0/E=272,N=64,device_name=NVIDIA_A800-SXM4-80GB.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=288,N=64,device_name=NVIDIA_A800-SXM4-80GB.json → triton_3_2_0/E=288,N=64,device_name=NVIDIA_A800-SXM4-80GB.json} +0 -0
- {sglang-0.4.6.post5.dist-info → sglang-0.4.7.post1.dist-info}/licenses/LICENSE +0 -0
- {sglang-0.4.6.post5.dist-info → sglang-0.4.7.post1.dist-info}/top_level.txt +0 -0
```diff
@@ -23,7 +23,8 @@ import torch
 import triton
 import triton.language as tl
 
-from sglang.
+from sglang.math_utils import align
+from sglang.srt.layers.quantization import deep_gemm_wrapper
 from sglang.srt.utils import (
     direct_register_custom_op,
     get_device_core_count,
@@ -44,10 +45,6 @@ if _is_cuda:
         sgl_per_token_quant_fp8,
     )
 
-from sglang.srt.layers.quantization.deep_gemm import (
-    gemm_nt_f8f8bf16 as deep_gemm_gemm_nt_f8f8bf16,
-)
-
 logger = logging.getLogger(__name__)
 
 
@@ -67,7 +64,6 @@ else:
     fp8_max = torch.finfo(fp8_dtype).max
     fp8_min = -fp8_max
 
-
 if supports_custom_op():
 
     def deep_gemm_fp8_fp8_bf16_nt(
@@ -77,7 +73,7 @@ if supports_custom_op():
         Bs: torch.Tensor,
        C: torch.Tensor,
     ) -> None:
-
+        deep_gemm_wrapper.gemm_nt_f8f8bf16((A, As), (B, Bs), C)
 
     def deep_gemm_fp8_fp8_bf16_nt_fake(
         A: torch.Tensor,
```
```diff
@@ -280,6 +276,7 @@ def sglang_per_token_group_quant_fp8(
     eps: float = 1e-10,
     column_major_scales: bool = False,
     scale_tma_aligned: bool = False,
+    scale_ue8m0: bool = False,
 ):
     assert (
         x.shape[-1] % group_size == 0
@@ -287,8 +284,21 @@ def sglang_per_token_group_quant_fp8(
     assert x.is_contiguous(), "`x` is not contiguous"
 
     x_q = torch.empty_like(x, device=x.device, dtype=fp8_dtype)
-    if
+    if scale_ue8m0:
+        assert column_major_scales and scale_tma_aligned
+        x_q_mn, x_q_k = x.shape
+        x_s_mn, x_s_k = x_q_mn, x_q_k // 128
+        aligned_mn = align(x_s_mn, 4)
+        aligned_k = align(x_s_k, 4)
+        # TODO(FIXME): Fix cuda kernel and recover here to empty.
+        x_s = torch.zeros(
+            (aligned_k // 4, aligned_mn),
+            device=x.device,
+            dtype=torch.int,
+        ).transpose(0, 1)[:x_s_mn, :]
+    elif column_major_scales:
         if scale_tma_aligned:
+            # TODO extract "align" function
             # aligned to 4 * sizeof(float)
             aligned_size = (x.shape[-2] + 3) // 4 * 4
             x_s = torch.empty(
@@ -309,7 +319,9 @@ def sglang_per_token_group_quant_fp8(
             dtype=torch.float32,
         )
     if x.shape[0] > 0:
-        sgl_per_token_group_quant_fp8(
+        sgl_per_token_group_quant_fp8(
+            x, x_q, x_s, group_size, eps, fp8_min, fp8_max, scale_ue8m0
+        )
 
     return x_q, x_s
 
```
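For orientation, the following is a minimal pure-PyTorch sketch of what per-token-group FP8 quantization computes: one float32 scale per `group_size` contiguous elements of the last dimension. It deliberately ignores the CUDA kernel and the column-major / TMA-aligned / UE8M0-packed scale layouts handled above; the function name and the simple row-major scale layout are illustrative, not sglang's API.

```python
import torch

def per_token_group_quant_fp8_reference(x: torch.Tensor, group_size: int, eps: float = 1e-10):
    """Quantize each group of `group_size` elements (last dim) to float8_e4m3fn,
    returning the quantized tensor and one float32 scale per group (row-major)."""
    assert x.shape[-1] % group_size == 0
    finfo = torch.finfo(torch.float8_e4m3fn)
    x_grouped = x.reshape(*x.shape[:-1], -1, group_size).to(torch.float32)
    amax = x_grouped.abs().amax(dim=-1, keepdim=True).clamp(min=eps)
    scale = amax / finfo.max  # chosen so the largest element maps to the fp8 max
    x_q = (x_grouped / scale).clamp(finfo.min, finfo.max).to(torch.float8_e4m3fn)
    return x_q.reshape(x.shape), scale.squeeze(-1)

x = torch.randn(4, 256)
x_q, x_s = per_token_group_quant_fp8_reference(x, group_size=128)
print(x_q.shape, x_s.shape)  # torch.Size([4, 256]) torch.Size([4, 2])
```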
```diff
@@ -740,7 +752,76 @@ if _is_hip:
         return _w8a8_block_fp8_matmul
 
 
-def
+def prepare_block_fp8_matmul_inputs(
+    A: torch.Tensor,
+    B: torch.Tensor,
+    As: torch.Tensor,
+    Bs: torch.Tensor,
+    block_size: List[int],
+    output_dtype: torch.dtype = torch.float16,
+) -> Tuple[int, int, int]:
+    assert len(block_size) == 2
+    block_n, block_k = block_size[0], block_size[1]
+
+    assert A.shape[-1] == B.shape[-1]
+    assert A.shape[:-1] == As.shape[:-1]
+    assert A.is_contiguous()
+
+    if As.dtype == torch.float:
+        assert triton.cdiv(A.shape[-1], block_k) == As.shape[-1]
+    elif As.dtype == torch.int:
+        assert (
+            triton.cdiv(triton.cdiv(A.shape[-1], block_k), 4) == As.shape[-1]
+        ), f"{A.shape=} {As.shape=} {block_size=}"
+    else:
+        raise NotImplementedError
+
+    M = A.numel() // A.shape[-1]
+
+    assert B.ndim == 2
+    assert B.is_contiguous()
+    assert Bs.ndim == 2
+    N, K = B.shape
+
+    if Bs.dtype == torch.float:
+        assert triton.cdiv(N, block_n) == Bs.shape[0]
+        assert triton.cdiv(K, block_k) == Bs.shape[1]
+    elif Bs.dtype == torch.int:
+        assert N == Bs.shape[0], f"{B.shape=} {Bs.shape=} {block_size=}"
+        assert (
+            triton.cdiv(triton.cdiv(K, block_k), 4) == Bs.shape[1]
+        ), f"{B.shape=} {Bs.shape=} {block_size=}"
+    else:
+        raise NotImplementedError
+
+    C_shape = A.shape[:-1] + (N,)
+    C = A.new_empty(C_shape, dtype=output_dtype)
+
+    return M, N, K, C
+
+
+def w8a8_block_fp8_matmul_deepgemm(
+    A: torch.Tensor,
+    B: torch.Tensor,
+    As: torch.Tensor,
+    Bs: torch.Tensor,
+    block_size: List[int],
+    output_dtype: torch.dtype,
+) -> torch.Tensor:
+    M, N, K, C = prepare_block_fp8_matmul_inputs(A, B, As, Bs, block_size, output_dtype)
+
+    # Deepgemm only supports output tensor type as bfloat16
+    assert C.dtype == torch.bfloat16 and deep_gemm_wrapper.ENABLE_JIT_DEEPGEMM
+
+    if supports_custom_op():
+        torch.ops.sglang.deep_gemm_fp8_fp8_bf16_nt(A, As, B, Bs, C)
+    else:
+        deep_gemm_wrapper.gemm_nt_f8f8bf16((A, As), (B, Bs), C)
+
+    return C
+
+
+def w8a8_block_fp8_matmul_triton(
     A: torch.Tensor,
     B: torch.Tensor,
     As: torch.Tensor,
```
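The asserts in `prepare_block_fp8_matmul_inputs` encode the scale-tensor contract: float32 scales carry one value per quantization group, while int32 scales (the UE8M0 path) pack four exponents per element along K. A small self-contained sketch of that arithmetic for a 2-D `A`, using plain ceil-division in place of `triton.cdiv` (the helper names are illustrative, not sglang APIs):

```python
def cdiv(a: int, b: int) -> int:
    # same rounding as triton.cdiv
    return -(-a // b)

def expected_scale_shapes(M: int, N: int, K: int,
                          block_n: int = 128, block_k: int = 128,
                          packed_int_scales: bool = False):
    """Shapes the asserts above require for As (activation scales) and Bs (weight scales)."""
    k_groups = cdiv(K, block_k)
    if packed_int_scales:  # torch.int scales: four UE8M0 exponents per int32 along K
        return (M, cdiv(k_groups, 4)), (N, cdiv(k_groups, 4))
    return (M, k_groups), (cdiv(N, block_n), k_groups)

print(expected_scale_shapes(16, 4096, 7168))
# ((16, 56), (32, 56))
print(expected_scale_shapes(16, 4096, 7168, packed_int_scales=True))
# ((16, 14), (4096, 14))
```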
```diff
@@ -764,81 +845,81 @@ def w8a8_block_fp8_matmul(
     Returns:
         torch.Tensor: The result of matmul.
     """
-    assert len(block_size) == 2
-    block_n, block_k = block_size[0], block_size[1]
 
-
-    assert A.shape[:-1] == As.shape[:-1] and A.is_contiguous()
-    assert triton.cdiv(A.shape[-1], block_k) == As.shape[-1]
-    M = A.numel() // A.shape[-1]
+    M, N, K, C = prepare_block_fp8_matmul_inputs(A, B, As, Bs, block_size, output_dtype)
 
-
-    N, K = B.shape
-    assert triton.cdiv(N, block_n) == Bs.shape[0]
-    assert triton.cdiv(K, block_k) == Bs.shape[1]
+    block_n, block_k = block_size
 
-
-
-
-
-
-    if supports_custom_op():
-        torch.ops.sglang.deep_gemm_fp8_fp8_bf16_nt(A, As, B, Bs, C)
-    else:
-        deep_gemm_gemm_nt_f8f8bf16((A, As), (B, Bs), C)
+    configs = get_w8a8_block_fp8_configs(N, K, block_size[0], block_size[1])
+    if configs:
+        # If an optimal configuration map has been found, look up the
+        # optimal config
+        config = configs[min(configs.keys(), key=lambda x: abs(x - M))]
     else:
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-        }
-
-    def grid(META):
-        return (
-            triton.cdiv(M, META["BLOCK_SIZE_M"])
-            * triton.cdiv(N, META["BLOCK_SIZE_N"]),
-        )
+        # Default config
+        # Block-wise quant: BLOCK_SIZE_K must be divisible by block_size[1]
+        config = {
+            "BLOCK_SIZE_M": 64,
+            "BLOCK_SIZE_N": block_size[0],
+            "BLOCK_SIZE_K": block_size[1],
+            "GROUP_SIZE_M": 32,
+            "num_warps": 4,
+            "num_stages": 3,
+        }
+
+    def grid(META):
+        return (
+            triton.cdiv(M, META["BLOCK_SIZE_M"]) * triton.cdiv(N, META["BLOCK_SIZE_N"]),
+        )
 
-
+    kernel = select_w8a8_block_fp8_matmul_kernel(M, N, config)
 
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
+    kernel[grid](
+        A,
+        B,
+        C,
+        As,
+        Bs,
+        M,
+        N,
+        K,
+        block_n,
+        block_k,
+        A.stride(-2),
+        A.stride(-1),
+        B.stride(1),
+        B.stride(0),
+        C.stride(-2),
+        C.stride(-1),
+        As.stride(-2),
+        As.stride(-1),
+        Bs.stride(1),
+        Bs.stride(0),
+        **config,
+    )
 
     return C
 
 
+# universal entry point, for testing purposes
+def w8a8_block_fp8_matmul(
+    A: torch.Tensor,
+    B: torch.Tensor,
+    As: torch.Tensor,
+    Bs: torch.Tensor,
+    block_size: List[int],
+    output_dtype: torch.dtype = torch.float16,
+) -> torch.Tensor:
+    if output_dtype == torch.bfloat16 and deep_gemm_wrapper.ENABLE_JIT_DEEPGEMM:
+        return w8a8_block_fp8_matmul_deepgemm(
+            A, B, As, Bs, block_size, output_dtype=output_dtype
+        )
+
+    return w8a8_block_fp8_matmul_triton(
+        A, B, As, Bs, block_size, output_dtype=output_dtype
+    )
+
+
 @triton.jit
 def _per_tensor_quant_mla_fp8_stage1(
     x_ptr,
```
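The new `w8a8_block_fp8_matmul` entry point simply routes bf16 outputs to DeepGEMM when JIT DeepGEMM is enabled and everything else to the Triton kernel. As a reference for what either backend computes, here is a hedged dequantize-then-matmul sketch in plain PyTorch (not sglang code, and slow by design): `A` is quantized per `(1, block_k)` group with one scale per group, `B` per `(block_n, block_k)` block.

```python
import torch

def w8a8_block_fp8_matmul_reference(A_q, A_s, B_q, B_s, block_size, output_dtype=torch.bfloat16):
    block_n, block_k = block_size
    # Dequantize activations: one scale per (row, K-group).
    A = A_q.to(torch.float32) * A_s.repeat_interleave(block_k, dim=-1)[..., : A_q.shape[-1]]
    # Dequantize weights: one scale per (block_n, block_k) tile.
    B_s_full = B_s.repeat_interleave(block_n, dim=0).repeat_interleave(block_k, dim=1)
    B = B_q.to(torch.float32) * B_s_full[: B_q.shape[0], : B_q.shape[1]]
    return (A @ B.t()).to(output_dtype)

M, N, K, bs = 4, 256, 512, [128, 128]
A_q = torch.randn(M, K).to(torch.float8_e4m3fn)
A_s = torch.rand(M, K // bs[1]) * 0.01
B_q = torch.randn(N, K).to(torch.float8_e4m3fn)
B_s = torch.rand(N // bs[0], K // bs[1]) * 0.01
print(w8a8_block_fp8_matmul_reference(A_q, A_s, B_q, B_s, bs).shape)  # torch.Size([4, 256])
```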
```diff
@@ -1,9 +1,12 @@
-import
-from typing import List, Optional, Tuple
+from typing import Callable, List, Optional, Tuple
 
+import einops
 import torch
 
+from sglang.math_utils import align
+from sglang.srt.layers.quantization import deep_gemm_wrapper
 from sglang.srt.layers.quantization.fp8_kernel import sglang_per_token_group_quant_fp8
+from sglang.srt.layers.utils import is_sm100_supported
 
 try:
     from vllm import _custom_ops as ops
@@ -12,7 +15,6 @@ try:
 except ImportError:
     VLLM_AVAILABLE = False
 
-from sglang.srt.layers.quantization.deep_gemm import _ENABLE_JIT_DEEPGEMM
 from sglang.srt.layers.quantization.fp8_kernel import (
     fp8_dtype,
     fp8_max,
@@ -21,13 +23,15 @@ from sglang.srt.layers.quantization.fp8_kernel import (
     scaled_fp8_quant,
     sglang_per_token_quant_fp8,
     static_quant_fp8,
-
+    w8a8_block_fp8_matmul_deepgemm,
+    w8a8_block_fp8_matmul_triton,
 )
 from sglang.srt.utils import (
     get_bool_env_var,
     get_cuda_version,
     get_device_capability,
     is_cuda,
+    is_flashinfer_available,
     is_hip,
 )
 
@@ -35,10 +39,10 @@ _is_hip = is_hip()
 _is_cuda = is_cuda()
 _is_fp8_fnuz = is_fp8_fnuz()
 
-
+_use_aiter = get_bool_env_var("SGLANG_USE_AITER") and _is_hip
 
-if
-    from aiter import
+if _use_aiter:
+    from aiter import gemm_a8w8_blockscale_CK
 
 if _is_cuda:
     from sgl_kernel import fp8_blockwise_scaled_mm, fp8_scaled_mm
@@ -80,12 +84,6 @@ def cutlass_fp8_supported():
     return False
 
 
-def is_sm100_supported(device=None) -> bool:
-    return (torch.cuda.get_device_capability(device)[0] == 10) and (
-        torch.version.cuda >= "12.8"
-    )
-
-
 def normalize_e4m3fn_to_e4m3fnuz(
     weight: torch.Tensor,
     weight_scale: torch.Tensor,
@@ -111,7 +109,7 @@ def normalize_e4m3fn_to_e4m3fnuz(
 
 
 def cutlass_block_fp8_supported() -> bool:
-    if not get_bool_env_var("
+    if not get_bool_env_var("SGLANG_SUPPORT_CUTLASS_BLOCK_FP8"):
         return False
     if _is_cuda:
         major, minor = torch.cuda.get_device_capability()
@@ -123,9 +121,29 @@ def cutlass_block_fp8_supported() -> bool:
 
 
 CUTLASS_BLOCK_FP8_SUPPORTED = cutlass_block_fp8_supported()
+ENABLE_FLASHINFER_GEMM = (
+    get_bool_env_var("SGLANG_ENABLE_FLASHINFER_GEMM")
+    and is_sm100_supported()
+    and is_flashinfer_available()
+)
+if ENABLE_FLASHINFER_GEMM:
+    from flashinfer.gemm import gemm_fp8_nt_groupwise
+
+
+def dispatch_w8a8_block_fp8_linear() -> Callable:
+    if ENABLE_FLASHINFER_GEMM:
+        return flashinfer_gemm_w8a8_block_fp8_linear
+    elif CUTLASS_BLOCK_FP8_SUPPORTED:
+        return cutlass_w8a8_block_fp8_linear_with_fallback
+    elif _use_aiter:
+        return aiter_w8a8_block_fp8_linear
+    elif deep_gemm_wrapper.ENABLE_JIT_DEEPGEMM:
+        return deepgemm_w8a8_block_fp8_linear_with_fallback
+    else:
+        return triton_w8a8_block_fp8_linear
 
 
-def
+def flashinfer_gemm_w8a8_block_fp8_linear(
     input: torch.Tensor,
     weight: torch.Tensor,
     block_size: List[int],
```
```diff
@@ -134,49 +152,159 @@ def apply_w8a8_block_fp8_linear(
     bias: Optional[torch.Tensor] = None,
 ) -> torch.Tensor:
     assert input_scale is None
-
+
     input_2d = input.view(-1, input.shape[-1])
     output_shape = [*input.shape[:-1], weight.shape[0]]
-
-
-
+
+    q_input, x_scale = sglang_per_token_group_quant_fp8(
+        input_2d, block_size[1], column_major_scales=False
     )
-
-
-
-
-
-
-
-
-
-
-
-    output
-
-
-
+
+    output = gemm_fp8_nt_groupwise(
+        q_input,
+        weight,
+        x_scale,
+        weight_scale,
+        scale_major_mode="K",
+        out_dtype=input_2d.dtype,
+    )
+
+    if bias is not None:
+        output += bias
+
+    return output.to(dtype=input_2d.dtype).view(*output_shape)
+
+
+def cutlass_w8a8_block_fp8_linear_with_fallback(
+    input: torch.Tensor,
+    weight: torch.Tensor,
+    block_size: List[int],
+    weight_scale: torch.Tensor,
+    input_scale: Optional[torch.Tensor] = None,
+    bias: Optional[torch.Tensor] = None,
+) -> torch.Tensor:
+    assert input_scale is None
+
+    # TODO: add more robust shape check here
+    shape_supported = weight.shape[0] % 128 == 0 and weight.shape[1] % 128 == 0
+
+    if not shape_supported:
+        # fallback to triton
+        return triton_w8a8_block_fp8_linear(
+            input, weight, block_size, weight_scale, input_scale, bias
         )
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
+
+    input_2d = input.view(-1, input.shape[-1])
+    output_shape = [*input.shape[:-1], weight.shape[0]]
+
+    q_input, x_scale = per_token_group_quant_fp8(
+        input_2d, block_size[1], column_major_scales=True
+    )
+    output = fp8_blockwise_scaled_mm(
+        q_input, weight.T, x_scale, weight_scale.T, out_dtype=input_2d.dtype
+    )
+    if bias is not None:
+        output += bias
+    return output.to(dtype=input_2d.dtype).view(*output_shape)
+
+
+def deepgemm_w8a8_block_fp8_linear_with_fallback(
+    input: torch.Tensor,
+    weight: torch.Tensor,
+    block_size: List[int],
+    weight_scale: torch.Tensor,
+    input_scale: Optional[torch.Tensor] = None,
+    bias: Optional[torch.Tensor] = None,
+) -> torch.Tensor:
+    assert input_scale is None
+
+    output_dtype = input.dtype
+    dtype_supported = output_dtype == torch.bfloat16
+
+    # TODO: https://github.com/sgl-project/sglang/pull/6890#issuecomment-2943395737
+    shape_supported = weight.shape[0] % 64 == 0 and weight.shape[1] % 128 == 0
+
+    if not (shape_supported and dtype_supported):
+        # fall back to triton
+        return triton_w8a8_block_fp8_linear(
+            input, weight, block_size, weight_scale, input_scale, bias
         )
 
+    input_2d = input.view(-1, input.shape[-1])
+    output_shape = [*input.shape[:-1], weight.shape[0]]
+
+    q_input, x_scale = sglang_per_token_group_quant_fp8(
+        input_2d,
+        block_size[1],
+        column_major_scales=True,
+        scale_tma_aligned=True,
+        scale_ue8m0=deep_gemm_wrapper.DEEPGEMM_SCALE_UE8M0,
+    )
+
+    # NOTE(alcanderian): Useless when scale is packed to int32
+    # if get_bool_env_var("SGLANG_W8A8_DEEPGEMM_SANITY_CHECK_UE8M0"):
+    #     _check_ue8m0("x_scale", x_scale)
+    #     _check_ue8m0("weight_scale", ws)
+
+    output = w8a8_block_fp8_matmul_deepgemm(
+        q_input, weight, x_scale, weight_scale, block_size, output_dtype=output_dtype
+    )
     if bias is not None:
-        output
-    return output.to(dtype=
+        output += bias
+    return output.to(dtype=output_dtype).view(*output_shape)
+
+
+def _check_ue8m0(name, x):
+    x_ceil = ceil_to_ue8m0(x)
+    assert torch.all(x == x_ceil), f"{name=} {x=} {x_ceil=}"
+
+
+def aiter_w8a8_block_fp8_linear(
+    input: torch.Tensor,
+    weight: torch.Tensor,
+    block_size: List[int],
+    weight_scale: torch.Tensor,
+    input_scale: Optional[torch.Tensor] = None,
+    bias: Optional[torch.Tensor] = None,
+) -> torch.Tensor:
+    assert input_scale is None
+    input_2d = input.view(-1, input.shape[-1])
+    output_shape = [*input.shape[:-1], weight.shape[0]]
+
+    q_input, x_scale = per_token_group_quant_fp8(
+        input_2d, block_size[1], column_major_scales=False
+    )
+    output = gemm_a8w8_blockscale_CK(
+        q_input, weight, x_scale, weight_scale, dtype=input.dtype
+    )
+
+    if bias is not None:
+        output += bias
+
+    return output.to(dtype=input_2d.dtype).view(*output_shape)
+
+
+def triton_w8a8_block_fp8_linear(
+    input: torch.Tensor,
+    weight: torch.Tensor,
+    block_size: List[int],
+    weight_scale: torch.Tensor,
+    input_scale: Optional[torch.Tensor] = None,
+    bias: Optional[torch.Tensor] = None,
+) -> torch.Tensor:
+    assert input_scale is None
+    input_2d = input.view(-1, input.shape[-1])
+    output_shape = [*input.shape[:-1], weight.shape[0]]
+
+    q_input, x_scale = per_token_group_quant_fp8(
+        input_2d, block_size[1], column_major_scales=False
+    )
+    output = w8a8_block_fp8_matmul_triton(
+        q_input, weight, x_scale, weight_scale, block_size, output_dtype=input_2d.dtype
+    )
+    if bias is not None:
+        output += bias
+    return output.to(dtype=input_2d.dtype).view(*output_shape)
 
 
 def input_to_float8(
```
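Both `_with_fallback` variants guard on weight shape (and, for DeepGEMM, on a bf16 output dtype) before taking the fast path, dropping back to the Triton kernel otherwise. A tiny sketch of those guards as pure predicates, assuming `weight.shape == (N, K)` as in the functions above:

```python
def deepgemm_path_ok(n: int, k: int, out_dtype_is_bf16: bool) -> bool:
    # deepgemm_w8a8_block_fp8_linear_with_fallback: N % 64 == 0, K % 128 == 0, bf16 output
    return out_dtype_is_bf16 and n % 64 == 0 and k % 128 == 0

def cutlass_path_ok(n: int, k: int) -> bool:
    # cutlass_w8a8_block_fp8_linear_with_fallback: both dims 128-aligned
    return n % 128 == 0 and k % 128 == 0

print(deepgemm_path_ok(576, 7168, True), cutlass_path_ok(576, 7168))  # True False
```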
```diff
@@ -253,27 +381,80 @@ def block_quant_dequant(
     The output is an unquantized tensor with dtype.
     """
     block_n, block_k = block_size[0], block_size[1]
-    n, k = x_q_block.shape
-    n_tiles = (n + block_n - 1) // block_n
-    k_tiles = (k + block_k - 1) // block_k
-    assert n_tiles == x_s.shape[0]
-    assert k_tiles == x_s.shape[1]
+    *_, n, k = x_q_block.shape
 
-
+    # ... n_scale k_scale -> ... (n_scale block_n) (k_scale block_k)
+    x_scale_repeat = x_s.repeat_interleave(block_n, dim=-2).repeat_interleave(
+        block_k, dim=-1
+    )
+    x_scale_repeat = x_scale_repeat[..., :n, :k]
+
+    return (x_q_block.to(torch.float32) * x_scale_repeat).to(dtype)
+
+
+def requant_weight_ue8m0_inplace(weight, weight_scale_inv, weight_block_size):
+    assert isinstance(weight, torch.nn.Parameter)
+    assert isinstance(weight_scale_inv, torch.nn.Parameter)
+    weight.data, weight_scale_inv.data = _requant_weight_ue8m0(
+        weight, weight_scale_inv, weight_block_size
+    )
+
+
+def _requant_weight_ue8m0(
+    weight: torch.Tensor,
+    weight_scale_inv: torch.Tensor,
+    weight_block_size: List[int],
+):
+    assert weight_block_size == [128, 128]
+
+    *_, n, k = weight.shape
+
+    weight_dequant = block_quant_dequant(
+        weight,
+        weight_scale_inv,
+        weight_block_size,
+        torch.bfloat16,
+    )
+
+    weight_dequant_flat = weight_dequant.view((-1, k))
+    out_w_flat, out_s_flat = per_block_cast_to_fp8(weight_dequant_flat)
+
+    out_w = out_w_flat.view(weight.shape)
+    out_s = out_s_flat.view(weight_scale_inv.shape)
+
+    # NOTE copy and modified from DeepGEMM
+    def _transform_scale(sf, mn: int):
+        import deep_gemm.utils.layout
+
+        sf = sf.index_select(-2, torch.arange(mn, device=sf.device) // 128)
+        sf = deep_gemm.utils.layout.get_col_major_tma_aligned_packed_tensor(sf)
+        return sf
+
+    out_s = _transform_scale(out_s, mn=out_w.shape[-2])
+
+    return out_w, out_s
+
+
+# COPIED FROM DeepGEMM
+def per_block_cast_to_fp8(x: torch.Tensor) -> Tuple[torch.Tensor, torch.Tensor]:
+    assert x.dim() == 2
+    m, n = x.shape
+    x_padded = torch.zeros(
+        (align(m, 128), align(n, 128)), dtype=x.dtype, device=x.device
+    )
+    x_padded[:m, :n] = x
+    x_view = x_padded.view(-1, 128, x_padded.size(1) // 128, 128)
+    x_amax = x_view.abs().float().amax(dim=(1, 3), keepdim=True).clamp(1e-4)
+    sf = ceil_to_ue8m0(x_amax / 448.0)
+    x_scaled = (x_view * (1.0 / sf)).to(torch.float8_e4m3fn)
+    return x_scaled.view_as(x_padded)[:m, :n].contiguous(), sf.view(
+        x_view.size(0), x_view.size(2)
+    )
 
-    for j in range(n_tiles):
-        for i in range(k_tiles):
-            x_q_block_tile = x_q_block[
-                j * block_n : min((j + 1) * block_n, n),
-                i * block_k : min((i + 1) * block_k, k),
-            ]
-            x_dq_block_tile = x_dq_block[
-                j * block_n : min((j + 1) * block_n, n),
-                i * block_k : min((i + 1) * block_k, k),
-            ]
-            x_dq_block_tile[:, :] = x_q_block_tile.to(torch.float32) * x_s[j][i]
 
-
+# COPIED FROM DeepGEMM
+def ceil_to_ue8m0(x: torch.Tensor):
+    return torch.pow(2.0, torch.ceil(torch.log2(x.abs())))
 
 
 def channel_quant_to_tensor_quant(
```