sglang 0.4.6.post5__py3-none-any.whl → 0.4.7.post1__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- sglang/__init__.py +2 -0
- sglang/api.py +7 -0
- sglang/bench_offline_throughput.py +10 -4
- sglang/bench_one_batch_server.py +67 -11
- sglang/bench_serving.py +86 -75
- sglang/lang/backend/runtime_endpoint.py +24 -1
- sglang/lang/interpreter.py +40 -1
- sglang/lang/ir.py +27 -0
- sglang/math_utils.py +8 -0
- sglang/profiler.py +167 -0
- sglang/srt/_custom_ops.py +34 -0
- sglang/srt/configs/internvl.py +8 -12
- sglang/srt/configs/model_config.py +33 -1
- sglang/srt/constrained/base_grammar_backend.py +5 -2
- sglang/srt/constrained/llguidance_backend.py +9 -8
- sglang/srt/constrained/outlines_backend.py +5 -4
- sglang/srt/constrained/xgrammar_backend.py +18 -18
- sglang/srt/conversation.py +52 -8
- sglang/srt/custom_op.py +38 -3
- sglang/srt/debug_utils.py +74 -0
- sglang/srt/disaggregation/base/__init__.py +1 -1
- sglang/srt/disaggregation/base/conn.py +25 -11
- sglang/srt/disaggregation/common/__init__.py +5 -0
- sglang/srt/disaggregation/common/conn.py +407 -0
- sglang/srt/disaggregation/common/utils.py +42 -0
- sglang/srt/disaggregation/decode.py +261 -52
- sglang/srt/disaggregation/fake/__init__.py +1 -1
- sglang/srt/disaggregation/fake/conn.py +16 -9
- sglang/srt/disaggregation/kv_events.py +60 -5
- sglang/srt/disaggregation/launch_lb.py +140 -0
- sglang/srt/disaggregation/mini_lb.py +29 -48
- sglang/srt/disaggregation/mooncake/__init__.py +1 -1
- sglang/srt/disaggregation/mooncake/conn.py +446 -149
- sglang/srt/disaggregation/mooncake/transfer_engine.py +32 -16
- sglang/srt/disaggregation/nixl/__init__.py +6 -1
- sglang/srt/disaggregation/nixl/conn.py +134 -437
- sglang/srt/disaggregation/prefill.py +130 -43
- sglang/srt/disaggregation/utils.py +127 -86
- sglang/srt/distributed/device_communicators/pymscclpp.py +315 -0
- sglang/srt/distributed/parallel_state.py +52 -5
- sglang/srt/entrypoints/EngineBase.py +6 -0
- sglang/srt/entrypoints/engine.py +116 -5
- sglang/srt/entrypoints/http_server.py +28 -4
- sglang/srt/eplb_simulator/__init__.py +1 -0
- sglang/srt/eplb_simulator/reader.py +51 -0
- sglang/srt/function_call/base_format_detector.py +138 -86
- sglang/srt/function_call/deepseekv3_detector.py +54 -6
- sglang/srt/function_call/ebnf_composer.py +33 -19
- sglang/srt/function_call/function_call_parser.py +27 -0
- sglang/srt/function_call/llama32_detector.py +33 -14
- sglang/srt/function_call/mistral_detector.py +73 -26
- sglang/srt/function_call/pythonic_detector.py +86 -20
- sglang/srt/function_call/qwen25_detector.py +64 -10
- sglang/srt/function_call/utils.py +17 -0
- sglang/srt/hf_transformers_utils.py +4 -0
- sglang/srt/layers/activation.py +19 -0
- sglang/srt/layers/attention/aiter_backend.py +503 -125
- sglang/srt/layers/attention/base_attn_backend.py +4 -0
- sglang/srt/layers/attention/cutlass_mla_backend.py +40 -34
- sglang/srt/layers/attention/flashattention_backend.py +137 -63
- sglang/srt/layers/attention/flashinfer_backend.py +46 -3
- sglang/srt/layers/attention/flashinfer_mla_backend.py +59 -25
- sglang/srt/layers/attention/flashmla_backend.py +2 -10
- sglang/srt/layers/attention/intel_amx_backend.py +128 -0
- sglang/srt/layers/attention/tbo_backend.py +232 -0
- sglang/srt/layers/attention/torch_native_backend.py +3 -0
- sglang/srt/layers/attention/triton_backend.py +304 -65
- sglang/srt/layers/attention/triton_ops/decode_attention.py +2 -7
- sglang/srt/layers/attention/triton_ops/extend_attention.py +12 -4
- sglang/srt/layers/attention/vision.py +51 -24
- sglang/srt/layers/communicator.py +281 -197
- sglang/srt/layers/dp_attention.py +6 -5
- sglang/srt/layers/layernorm.py +30 -19
- sglang/srt/layers/linear.py +0 -4
- sglang/srt/layers/logits_processor.py +0 -12
- sglang/srt/layers/moe/cutlass_moe.py +170 -7
- sglang/srt/layers/moe/cutlass_moe_params.py +169 -0
- sglang/srt/layers/moe/ep_moe/kernels.py +33 -11
- sglang/srt/layers/moe/ep_moe/layer.py +136 -72
- sglang/srt/layers/moe/ep_moe/token_dispatcher.py +24 -45
- sglang/srt/layers/moe/fused_moe_native.py +4 -0
- sglang/srt/layers/moe/fused_moe_triton/configs/triton_3_2_0/E=257,N=128,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8,block_shape=[128, 128].json +146 -0
- sglang/srt/layers/moe/fused_moe_triton/configs/triton_3_2_0/E=257,N=128,device_name=NVIDIA_H20,dtype=fp8_w8a8,block_shape=[128, 128].json +146 -0
- sglang/srt/layers/moe/fused_moe_triton/configs/triton_3_2_0/E=257,N=256,device_name=NVIDIA_B200,dtype=fp8_w8a8,block_shape=[128, 128].json +146 -0
- sglang/srt/layers/moe/fused_moe_triton/configs/triton_3_2_0/E=257,N=256,device_name=NVIDIA_H20,dtype=fp8_w8a8,block_shape=[128, 128].json +146 -0
- sglang/srt/layers/moe/fused_moe_triton/configs/triton_3_2_0/E=257,N=256,device_name=NVIDIA_H200,dtype=fp8_w8a8,block_shape=[128, 128].json +146 -0
- sglang/srt/layers/moe/fused_moe_triton/configs/triton_3_2_0/E=8,N=7168,device_name=NVIDIA_H100_80GB_HBM3.json +146 -0
- sglang/srt/layers/moe/fused_moe_triton/configs/triton_3_3_1/E=257,N=256,device_name=NVIDIA_B200,dtype=fp8_w8a8,block_shape=[128, 128].json +146 -0
- sglang/srt/layers/moe/fused_moe_triton/configs/triton_3_3_1/E=8,N=7168,device_name=NVIDIA_H100_80GB_HBM3.json +146 -0
- sglang/srt/layers/moe/fused_moe_triton/fused_moe.py +221 -29
- sglang/srt/layers/moe/fused_moe_triton/layer.py +34 -4
- sglang/srt/layers/moe/topk.py +60 -26
- sglang/srt/layers/multimodal.py +3 -3
- sglang/srt/layers/pooler.py +56 -0
- sglang/srt/layers/quantization/__init__.py +3 -2
- sglang/srt/layers/quantization/blockwise_int8.py +3 -0
- sglang/srt/layers/quantization/compressed_tensors/compressed_tensors_moe.py +5 -0
- sglang/srt/layers/quantization/deep_gemm_wrapper/__init__.py +1 -0
- sglang/srt/layers/quantization/{deep_gemm.py → deep_gemm_wrapper/compile_utils.py} +69 -127
- sglang/srt/layers/quantization/deep_gemm_wrapper/configurer.py +32 -0
- sglang/srt/layers/quantization/deep_gemm_wrapper/entrypoint.py +110 -0
- sglang/srt/layers/quantization/fp8.py +28 -23
- sglang/srt/layers/quantization/fp8_kernel.py +156 -75
- sglang/srt/layers/quantization/fp8_utils.py +250 -69
- sglang/srt/layers/quantization/modelopt_quant.py +334 -7
- sglang/srt/layers/quantization/moe_wna16.py +3 -0
- sglang/srt/layers/quantization/w8a8_fp8.py +3 -0
- sglang/srt/layers/quantization/w8a8_int8.py +3 -0
- sglang/srt/layers/radix_attention.py +2 -3
- sglang/srt/layers/rotary_embedding.py +6 -12
- sglang/srt/layers/sampler.py +80 -79
- sglang/srt/layers/utils.py +6 -0
- sglang/srt/lora/layers.py +12 -15
- sglang/srt/lora/lora.py +49 -5
- sglang/srt/lora/lora_manager.py +98 -39
- sglang/srt/lora/mem_pool.py +28 -21
- sglang/srt/lora/utils.py +17 -13
- sglang/srt/managers/cache_controller.py +2 -1
- sglang/srt/managers/data_parallel_controller.py +13 -5
- sglang/srt/managers/eplb_algorithms/__init__.py +63 -0
- sglang/srt/managers/eplb_algorithms/deepseek.py +223 -0
- sglang/srt/managers/{deepseek_eplb.py → eplb_algorithms/deepseek_vec.py} +5 -7
- sglang/srt/managers/eplb_manager.py +55 -14
- sglang/srt/managers/expert_distribution.py +220 -46
- sglang/srt/managers/expert_location.py +110 -56
- sglang/srt/managers/expert_location_dispatch.py +23 -6
- sglang/srt/managers/io_struct.py +43 -8
- sglang/srt/managers/mm_utils.py +88 -38
- sglang/srt/managers/multimodal_processors/base_processor.py +190 -18
- sglang/srt/managers/multimodal_processors/gemma3.py +4 -31
- sglang/srt/managers/multimodal_processors/internvl.py +4 -0
- sglang/srt/managers/multimodal_processors/kimi_vl.py +15 -34
- sglang/srt/managers/multimodal_processors/minicpm.py +2 -1
- sglang/srt/managers/multimodal_processors/phi4mm.py +87 -0
- sglang/srt/managers/multimodal_processors/qwen_vl.py +22 -64
- sglang/srt/managers/multimodal_processors/vila.py +85 -0
- sglang/srt/managers/schedule_batch.py +173 -38
- sglang/srt/managers/scheduler.py +376 -127
- sglang/srt/managers/tokenizer_manager.py +163 -19
- sglang/srt/managers/utils.py +0 -4
- sglang/srt/mem_cache/chunk_cache.py +1 -0
- sglang/srt/mem_cache/hiradix_cache.py +4 -2
- sglang/srt/mem_cache/memory_pool.py +111 -407
- sglang/srt/mem_cache/memory_pool_host.py +380 -0
- sglang/srt/mem_cache/radix_cache.py +36 -12
- sglang/srt/metrics/collector.py +9 -0
- sglang/srt/model_executor/cuda_graph_runner.py +191 -113
- sglang/srt/model_executor/expert_location_updater.py +157 -22
- sglang/srt/model_executor/forward_batch_info.py +52 -22
- sglang/srt/model_executor/model_runner.py +102 -62
- sglang/srt/model_loader/loader.py +8 -1
- sglang/srt/model_loader/utils.py +67 -1
- sglang/srt/models/bert.py +113 -13
- sglang/srt/models/deepseek_nextn.py +1 -1
- sglang/srt/models/deepseek_v2.py +623 -290
- sglang/srt/models/gemma3_causal.py +7 -0
- sglang/srt/models/gemma3_mm.py +19 -14
- sglang/srt/models/idefics2.py +342 -0
- sglang/srt/models/internvl.py +46 -102
- sglang/srt/models/kimi_vl.py +4 -4
- sglang/srt/models/llama.py +1 -1
- sglang/srt/models/minicpmo.py +2 -5
- sglang/srt/models/minicpmv.py +3 -295
- sglang/srt/models/phi4mm.py +512 -0
- sglang/srt/models/qwen2.py +38 -9
- sglang/srt/models/qwen2_5_vl.py +3 -9
- sglang/srt/models/qwen2_eagle.py +4 -1
- sglang/srt/models/qwen2_moe.py +58 -191
- sglang/srt/models/qwen2_vl.py +3 -9
- sglang/srt/models/qwen3.py +41 -10
- sglang/srt/models/qwen3_moe.py +230 -191
- sglang/srt/models/registry.py +9 -1
- sglang/srt/models/roberta.py +117 -9
- sglang/srt/models/transformers.py +291 -0
- sglang/srt/models/vila.py +305 -0
- sglang/srt/openai_api/adapter.py +248 -28
- sglang/srt/openai_api/protocol.py +68 -3
- sglang/srt/openai_api/utils.py +172 -0
- sglang/srt/operations.py +37 -2
- sglang/srt/operations_strategy.py +200 -24
- sglang/srt/sampling/sampling_batch_info.py +37 -1
- sglang/srt/sampling/sampling_params.py +4 -1
- sglang/srt/server_args.py +381 -209
- sglang/srt/speculative/build_eagle_tree.py +9 -9
- sglang/srt/speculative/eagle_draft_cuda_graph_runner.py +12 -14
- sglang/srt/speculative/eagle_draft_extend_cuda_graph_runner.py +256 -0
- sglang/srt/speculative/eagle_utils.py +440 -200
- sglang/srt/speculative/eagle_worker.py +234 -63
- sglang/srt/two_batch_overlap.py +637 -0
- sglang/srt/utils.py +187 -7
- sglang/test/attention/test_prefix_chunk_info.py +2 -0
- sglang/test/runners.py +54 -10
- sglang/test/send_one.py +4 -0
- sglang/test/test_block_fp8.py +1 -0
- sglang/test/test_block_fp8_deep_gemm_blackwell.py +252 -0
- sglang/test/test_block_fp8_ep.py +1 -0
- sglang/test/test_cutlass_moe.py +3 -3
- sglang/test/test_fp4_moe.py +248 -0
- sglang/test/test_utils.py +82 -7
- sglang/utils.py +9 -0
- sglang/version.py +1 -1
- {sglang-0.4.6.post5.dist-info → sglang-0.4.7.post1.dist-info}/METADATA +17 -14
- {sglang-0.4.6.post5.dist-info → sglang-0.4.7.post1.dist-info}/RECORD +359 -321
- {sglang-0.4.6.post5.dist-info → sglang-0.4.7.post1.dist-info}/WHEEL +1 -1
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=1,N=14336,device_name=NVIDIA_A100-SXM4-80GB,dtype=int8_w8a16.json → triton_3_1_0/E=1,N=14336,device_name=NVIDIA_A100-SXM4-80GB,dtype=int8_w8a16.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=1,N=14336,device_name=NVIDIA_A100-SXM4-80GB.json → triton_3_1_0/E=1,N=14336,device_name=NVIDIA_A100-SXM4-80GB.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=1,N=1792,device_name=NVIDIA_A100-SXM4-80GB,dtype=int8_w8a16.json → triton_3_1_0/E=1,N=1792,device_name=NVIDIA_A100-SXM4-80GB,dtype=int8_w8a16.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=1,N=1792,device_name=NVIDIA_A100-SXM4-80GB.json → triton_3_1_0/E=1,N=1792,device_name=NVIDIA_A100-SXM4-80GB.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=1,N=3072,device_name=NVIDIA_A100-SXM4-80GB,dtype=int8_w8a16.json → triton_3_1_0/E=1,N=3072,device_name=NVIDIA_A100-SXM4-80GB,dtype=int8_w8a16.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=1,N=3072,device_name=NVIDIA_H100_80GB_HBM3,dtype=int8_w8a16.json → triton_3_1_0/E=1,N=3072,device_name=NVIDIA_H100_80GB_HBM3,dtype=int8_w8a16.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=1,N=3072,device_name=NVIDIA_H100_80GB_HBM3.json → triton_3_1_0/E=1,N=3072,device_name=NVIDIA_H100_80GB_HBM3.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=1,N=3584,device_name=NVIDIA_A100-SXM4-80GB,dtype=int8_w8a16.json → triton_3_1_0/E=1,N=3584,device_name=NVIDIA_A100-SXM4-80GB,dtype=int8_w8a16.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=1,N=3584,device_name=NVIDIA_A100-SXM4-80GB.json → triton_3_1_0/E=1,N=3584,device_name=NVIDIA_A100-SXM4-80GB.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=1,N=7168,device_name=NVIDIA_A100-SXM4-80GB,dtype=int8_w8a16.json → triton_3_1_0/E=1,N=7168,device_name=NVIDIA_A100-SXM4-80GB,dtype=int8_w8a16.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=1,N=7168,device_name=NVIDIA_A100-SXM4-80GB.json → triton_3_1_0/E=1,N=7168,device_name=NVIDIA_A100-SXM4-80GB.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=144,N=512,device_name=NVIDIA_H100_80GB_HBM3.json → triton_3_1_0/E=144,N=512,device_name=NVIDIA_H100_80GB_HBM3.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=16,N=1024,device_name=NVIDIA_H100_80GB_HBM3.json → triton_3_1_0/E=16,N=1024,device_name=NVIDIA_H100_80GB_HBM3.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=16,N=1024,device_name=NVIDIA_H200.json → triton_3_1_0/E=16,N=1024,device_name=NVIDIA_H200.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=16,N=1344,device_name=NVIDIA_A100-SXM4-40GB.json → triton_3_1_0/E=16,N=1344,device_name=NVIDIA_A100-SXM4-40GB.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=16,N=1344,device_name=NVIDIA_A100-SXM4-80GB.json → triton_3_1_0/E=16,N=1344,device_name=NVIDIA_A100-SXM4-80GB.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=16,N=1344,device_name=NVIDIA_H100_80GB_HBM3.json → triton_3_1_0/E=16,N=1344,device_name=NVIDIA_H100_80GB_HBM3.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=16,N=14336,device_name=NVIDIA_A100-SXM4-80GB,dtype=int8_w8a16.json → triton_3_1_0/E=16,N=14336,device_name=NVIDIA_A100-SXM4-80GB,dtype=int8_w8a16.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=16,N=14336,device_name=NVIDIA_A100-SXM4-80GB.json → triton_3_1_0/E=16,N=14336,device_name=NVIDIA_A100-SXM4-80GB.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=16,N=1792,device_name=NVIDIA_A100-SXM4-80GB,dtype=int8_w8a16.json → triton_3_1_0/E=16,N=1792,device_name=NVIDIA_A100-SXM4-80GB,dtype=int8_w8a16.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=16,N=1792,device_name=NVIDIA_A100-SXM4-80GB.json → triton_3_1_0/E=16,N=1792,device_name=NVIDIA_A100-SXM4-80GB.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=16,N=2048,device_name=NVIDIA_H100_80GB_HBM3.json → triton_3_1_0/E=16,N=2048,device_name=NVIDIA_H100_80GB_HBM3.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=16,N=2688,device_name=NVIDIA_A100-SXM4-80GB.json → triton_3_1_0/E=16,N=2688,device_name=NVIDIA_A100-SXM4-80GB.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=16,N=2688,device_name=NVIDIA_H100_80GB_HBM3.json → triton_3_1_0/E=16,N=2688,device_name=NVIDIA_H100_80GB_HBM3.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=16,N=3072,device_name=NVIDIA_A100-SXM4-80GB,dtype=int8_w8a16.json → triton_3_1_0/E=16,N=3072,device_name=NVIDIA_A100-SXM4-80GB,dtype=int8_w8a16.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=16,N=3072,device_name=NVIDIA_H100_80GB_HBM3,dtype=int8_w8a16.json → triton_3_1_0/E=16,N=3072,device_name=NVIDIA_H100_80GB_HBM3,dtype=int8_w8a16.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=16,N=3200,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8.json → triton_3_1_0/E=16,N=3200,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=16,N=3584,device_name=NVIDIA_A100-SXM4-80GB,dtype=int8_w8a16.json → triton_3_1_0/E=16,N=3584,device_name=NVIDIA_A100-SXM4-80GB,dtype=int8_w8a16.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=16,N=3584,device_name=NVIDIA_A100-SXM4-80GB.json → triton_3_1_0/E=16,N=3584,device_name=NVIDIA_A100-SXM4-80GB.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=16,N=6400,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8.json → triton_3_1_0/E=16,N=6400,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=16,N=7168,device_name=NVIDIA_A100-SXM4-80GB,dtype=int8_w8a16.json → triton_3_1_0/E=16,N=7168,device_name=NVIDIA_A100-SXM4-80GB,dtype=int8_w8a16.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=16,N=7168,device_name=NVIDIA_A100-SXM4-80GB.json → triton_3_1_0/E=16,N=7168,device_name=NVIDIA_A100-SXM4-80GB.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=16,N=7168,device_name=NVIDIA_H100_80GB_HBM3,dtype=int8_w8a16.json → triton_3_1_0/E=16,N=7168,device_name=NVIDIA_H100_80GB_HBM3,dtype=int8_w8a16.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=16,N=800,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8.json → triton_3_1_0/E=16,N=800,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=160,N=192,device_name=NVIDIA_A800-SXM4-80GB.json → triton_3_1_0/E=160,N=192,device_name=NVIDIA_A800-SXM4-80GB.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=20,N=2048,device_name=NVIDIA_H100_80GB_HBM3.json → triton_3_1_0/E=20,N=2048,device_name=NVIDIA_H100_80GB_HBM3.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=24,N=1024,device_name=NVIDIA_H100_80GB_HBM3.json → triton_3_1_0/E=24,N=1024,device_name=NVIDIA_H100_80GB_HBM3.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=256,N=128,device_name=NVIDIA_A100-SXM4-80GB,dtype=int8_w8a8,block_shape=[128, 128].json → triton_3_1_0/E=256,N=128,device_name=NVIDIA_A100-SXM4-80GB,dtype=int8_w8a8,block_shape=[128, 128].json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=256,N=128,device_name=NVIDIA_A100-SXM4-80GB,dtype=int8_w8a8.json → triton_3_1_0/E=256,N=128,device_name=NVIDIA_A100-SXM4-80GB,dtype=int8_w8a8.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=256,N=128,device_name=NVIDIA_A800-SXM4-80GB,dtype=int8_w8a8,block_shape=[128, 128].json → triton_3_1_0/E=256,N=128,device_name=NVIDIA_A800-SXM4-80GB,dtype=int8_w8a8,block_shape=[128, 128].json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=256,N=128,device_name=NVIDIA_A800-SXM4-80GB,dtype=int8_w8a8.json → triton_3_1_0/E=256,N=128,device_name=NVIDIA_A800-SXM4-80GB,dtype=int8_w8a8.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=256,N=128,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8,block_shape=[128, 128].json → triton_3_1_0/E=256,N=128,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8,block_shape=[128, 128].json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=256,N=128,device_name=NVIDIA_H20,block_shape=[128, 128].json → triton_3_1_0/E=256,N=128,device_name=NVIDIA_H20,block_shape=[128, 128].json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=256,N=128,device_name=NVIDIA_H20,dtype=fp8_w8a8,block_shape=[128, 128].json → triton_3_1_0/E=256,N=128,device_name=NVIDIA_H20,dtype=fp8_w8a8,block_shape=[128, 128].json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=256,N=128,device_name=NVIDIA_L20Y,dtype=fp8_w8a8,block_shape=[128, 128].json → triton_3_1_0/E=256,N=128,device_name=NVIDIA_L20Y,dtype=fp8_w8a8,block_shape=[128, 128].json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=256,N=256,device_name=AMD_Instinct_MI300X,dtype=fp8_w8a8,block_shape=[128, 128].json → triton_3_1_0/E=256,N=256,device_name=AMD_Instinct_MI300X,dtype=fp8_w8a8,block_shape=[128, 128].json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=256,N=256,device_name=AMD_Instinct_MI325X,dtype=fp8_w8a8,block_shape=[128, 128].json → triton_3_1_0/E=256,N=256,device_name=AMD_Instinct_MI325X,dtype=fp8_w8a8,block_shape=[128, 128].json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=256,N=256,device_name=AMD_Radeon_Graphics,dtype=fp8_w8a8,block_shape=[128, 128].json → triton_3_1_0/E=256,N=256,device_name=AMD_Radeon_Graphics,dtype=fp8_w8a8,block_shape=[128, 128].json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=256,N=256,device_name=NVIDIA_B200,dtype=fp8_w8a8,block_shape=[128, 128].json → triton_3_1_0/E=256,N=256,device_name=NVIDIA_B200,dtype=fp8_w8a8,block_shape=[128, 128].json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=256,N=256,device_name=NVIDIA_H20,dtype=int8_w8a8,block_shape=[128, 128].json → triton_3_1_0/E=256,N=256,device_name=NVIDIA_H20,dtype=int8_w8a8,block_shape=[128, 128].json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=256,N=256,device_name=NVIDIA_H200,dtype=fp8_w8a8,block_shape=[128, 128].json → triton_3_1_0/E=256,N=256,device_name=NVIDIA_H200,dtype=fp8_w8a8,block_shape=[128, 128].json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=256,N=64,device_name=NVIDIA_A800-SXM4-80GB.json → triton_3_1_0/E=256,N=64,device_name=NVIDIA_A800-SXM4-80GB.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=256,N=64,device_name=NVIDIA_L20,dtype=int8_w8a8.json → triton_3_1_0/E=256,N=64,device_name=NVIDIA_L20,dtype=int8_w8a8.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=256,N=64,device_name=NVIDIA_L40S,dtype=int8_w8a8.json → triton_3_1_0/E=256,N=64,device_name=NVIDIA_L40S,dtype=int8_w8a8.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=64,N=1024,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8,block_shape=[128, 128].json → triton_3_1_0/E=64,N=1024,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8,block_shape=[128, 128].json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=64,N=1280,device_name=NVIDIA_A100-SXM4-80GB.json → triton_3_1_0/E=64,N=1280,device_name=NVIDIA_A100-SXM4-80GB.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=64,N=1280,device_name=NVIDIA_A800-SXM4-80GB.json → triton_3_1_0/E=64,N=1280,device_name=NVIDIA_A800-SXM4-80GB.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=64,N=1280,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8.json → triton_3_1_0/E=64,N=1280,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=64,N=1280,device_name=NVIDIA_H100_80GB_HBM3.json → triton_3_1_0/E=64,N=1280,device_name=NVIDIA_H100_80GB_HBM3.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=64,N=1280,device_name=NVIDIA_H200,dtype=fp8_w8a8.json → triton_3_1_0/E=64,N=1280,device_name=NVIDIA_H200,dtype=fp8_w8a8.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=64,N=1280,device_name=NVIDIA_H200.json → triton_3_1_0/E=64,N=1280,device_name=NVIDIA_H200.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=64,N=2560,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8.json → triton_3_1_0/E=64,N=2560,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=64,N=2560,device_name=NVIDIA_H200,dtype=fp8_w8a8.json → triton_3_1_0/E=64,N=2560,device_name=NVIDIA_H200,dtype=fp8_w8a8.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=64,N=2560,device_name=NVIDIA_H200.json → triton_3_1_0/E=64,N=2560,device_name=NVIDIA_H200.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=64,N=320,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8.json → triton_3_1_0/E=64,N=320,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=64,N=320,device_name=NVIDIA_H100_80GB_HBM3.json → triton_3_1_0/E=64,N=320,device_name=NVIDIA_H100_80GB_HBM3.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=64,N=320,device_name=NVIDIA_H200,dtype=fp8_w8a8.json → triton_3_1_0/E=64,N=320,device_name=NVIDIA_H200,dtype=fp8_w8a8.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=64,N=320,device_name=NVIDIA_H200.json → triton_3_1_0/E=64,N=320,device_name=NVIDIA_H200.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=64,N=512,device_name=NVIDIA_H200,dtype=fp8_w8a8,block_shape=[128, 128].json → triton_3_1_0/E=64,N=512,device_name=NVIDIA_H200,dtype=fp8_w8a8,block_shape=[128, 128].json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=64,N=640,device_name=NVIDIA_A100-SXM4-80GB.json → triton_3_1_0/E=64,N=640,device_name=NVIDIA_A100-SXM4-80GB.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=64,N=640,device_name=NVIDIA_A800-SXM4-80GB.json → triton_3_1_0/E=64,N=640,device_name=NVIDIA_A800-SXM4-80GB.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=64,N=640,device_name=NVIDIA_GeForce_RTX_4090,dtype=fp8_w8a8.json → triton_3_1_0/E=64,N=640,device_name=NVIDIA_GeForce_RTX_4090,dtype=fp8_w8a8.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=64,N=640,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8.json → triton_3_1_0/E=64,N=640,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=64,N=640,device_name=NVIDIA_H100_80GB_HBM3.json → triton_3_1_0/E=64,N=640,device_name=NVIDIA_H100_80GB_HBM3.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=64,N=640,device_name=NVIDIA_H200,dtype=fp8_w8a8.json → triton_3_1_0/E=64,N=640,device_name=NVIDIA_H200,dtype=fp8_w8a8.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=64,N=640,device_name=NVIDIA_H200.json → triton_3_1_0/E=64,N=640,device_name=NVIDIA_H200.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=8,N=14336,device_name=AMD_Instinct_MI300X.json → triton_3_1_0/E=8,N=14336,device_name=AMD_Instinct_MI300X.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=8,N=14336,device_name=AMD_Instinct_MI325X.json → triton_3_1_0/E=8,N=14336,device_name=AMD_Instinct_MI325X.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=8,N=14336,device_name=AMD_Radeon_Graphics.json → triton_3_1_0/E=8,N=14336,device_name=AMD_Radeon_Graphics.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=8,N=14336,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8.json → triton_3_1_0/E=8,N=14336,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=8,N=14336,device_name=NVIDIA_H200,dtype=fp8_w8a8.json → triton_3_1_0/E=8,N=14336,device_name=NVIDIA_H200,dtype=fp8_w8a8.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=8,N=14336,device_name=NVIDIA_H200.json → triton_3_1_0/E=8,N=14336,device_name=NVIDIA_H200.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=8,N=1792,device_name=AMD_Instinct_MI300X.json → triton_3_1_0/E=8,N=1792,device_name=AMD_Instinct_MI300X.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=8,N=1792,device_name=AMD_Instinct_MI325X.json → triton_3_1_0/E=8,N=1792,device_name=AMD_Instinct_MI325X.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=8,N=1792,device_name=AMD_Radeon_Graphics.json → triton_3_1_0/E=8,N=1792,device_name=AMD_Radeon_Graphics.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=8,N=1792,device_name=NVIDIA_A100-SXM4-40GB.json → triton_3_1_0/E=8,N=1792,device_name=NVIDIA_A100-SXM4-40GB.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=8,N=1792,device_name=NVIDIA_A100-SXM4-80GB.json → triton_3_1_0/E=8,N=1792,device_name=NVIDIA_A100-SXM4-80GB.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=8,N=1792,device_name=NVIDIA_H100_80GB_HBM3.json → triton_3_1_0/E=8,N=1792,device_name=NVIDIA_H100_80GB_HBM3.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=8,N=1792,device_name=NVIDIA_H200,dtype=fp8_w8a8.json → triton_3_1_0/E=8,N=1792,device_name=NVIDIA_H200,dtype=fp8_w8a8.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=8,N=1792,device_name=NVIDIA_H200.json → triton_3_1_0/E=8,N=1792,device_name=NVIDIA_H200.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=8,N=2048,device_name=NVIDIA_A100-SXM4-80GB.json → triton_3_1_0/E=8,N=2048,device_name=NVIDIA_A100-SXM4-80GB.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=8,N=2048,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8.json → triton_3_1_0/E=8,N=2048,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=8,N=2048,device_name=NVIDIA_H100_80GB_HBM3.json → triton_3_1_0/E=8,N=2048,device_name=NVIDIA_H100_80GB_HBM3.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=8,N=2048,device_name=NVIDIA_H200,dtype=fp8_w8a8.json → triton_3_1_0/E=8,N=2048,device_name=NVIDIA_H200,dtype=fp8_w8a8.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=8,N=2048,device_name=NVIDIA_H200.json → triton_3_1_0/E=8,N=2048,device_name=NVIDIA_H200.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=8,N=3584,device_name=AMD_Instinct_MI300X.json → triton_3_1_0/E=8,N=3584,device_name=AMD_Instinct_MI300X.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=8,N=3584,device_name=AMD_Instinct_MI325X.json → triton_3_1_0/E=8,N=3584,device_name=AMD_Instinct_MI325X.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=8,N=3584,device_name=AMD_Radeon_Graphics.json → triton_3_1_0/E=8,N=3584,device_name=AMD_Radeon_Graphics.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=8,N=3584,device_name=NVIDIA_A100-SXM4-40GB.json → triton_3_1_0/E=8,N=3584,device_name=NVIDIA_A100-SXM4-40GB.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=8,N=3584,device_name=NVIDIA_A100-SXM4-80GB.json → triton_3_1_0/E=8,N=3584,device_name=NVIDIA_A100-SXM4-80GB.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=8,N=3584,device_name=NVIDIA_GeForce_RTX_4090,dtype=fp8_w8a8.json → triton_3_1_0/E=8,N=3584,device_name=NVIDIA_GeForce_RTX_4090,dtype=fp8_w8a8.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=8,N=3584,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8.json → triton_3_1_0/E=8,N=3584,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=8,N=3584,device_name=NVIDIA_H100_80GB_HBM3.json → triton_3_1_0/E=8,N=3584,device_name=NVIDIA_H100_80GB_HBM3.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=8,N=3584,device_name=NVIDIA_H200,dtype=fp8_w8a8.json → triton_3_1_0/E=8,N=3584,device_name=NVIDIA_H200,dtype=fp8_w8a8.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=8,N=3584,device_name=NVIDIA_H200.json → triton_3_1_0/E=8,N=3584,device_name=NVIDIA_H200.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=8,N=3584,device_name=NVIDIA_L40S.json → triton_3_1_0/E=8,N=3584,device_name=NVIDIA_L40S.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=8,N=4096,device_name=AMD_Instinct_MI300X,dtype=fp8_w8a8.json → triton_3_1_0/E=8,N=4096,device_name=AMD_Instinct_MI300X,dtype=fp8_w8a8.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=8,N=4096,device_name=AMD_Instinct_MI325X,dtype=fp8_w8a8.json → triton_3_1_0/E=8,N=4096,device_name=AMD_Instinct_MI325X,dtype=fp8_w8a8.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=8,N=4096,device_name=AMD_Radeon_Graphics,dtype=fp8_w8a8.json → triton_3_1_0/E=8,N=4096,device_name=AMD_Radeon_Graphics,dtype=fp8_w8a8.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=8,N=4096,device_name=NVIDIA_A100-SXM4-80GB.json → triton_3_1_0/E=8,N=4096,device_name=NVIDIA_A100-SXM4-80GB.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=8,N=4096,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8.json → triton_3_1_0/E=8,N=4096,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=8,N=4096,device_name=NVIDIA_H100_80GB_HBM3.json → triton_3_1_0/E=8,N=4096,device_name=NVIDIA_H100_80GB_HBM3.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=8,N=4096,device_name=NVIDIA_H200,dtype=fp8_w8a8.json → triton_3_1_0/E=8,N=4096,device_name=NVIDIA_H200,dtype=fp8_w8a8.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=8,N=4096,device_name=NVIDIA_H200.json → triton_3_1_0/E=8,N=4096,device_name=NVIDIA_H200.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=8,N=7168,device_name=AMD_Instinct_MI300X.json → triton_3_1_0/E=8,N=7168,device_name=AMD_Instinct_MI300X.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=8,N=7168,device_name=AMD_Instinct_MI325X.json → triton_3_1_0/E=8,N=7168,device_name=AMD_Instinct_MI325X.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=8,N=7168,device_name=AMD_Radeon_Graphics.json → triton_3_1_0/E=8,N=7168,device_name=AMD_Radeon_Graphics.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=8,N=7168,device_name=NVIDIA_A100-SXM4-80GB.json → triton_3_1_0/E=8,N=7168,device_name=NVIDIA_A100-SXM4-80GB.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=8,N=7168,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8.json → triton_3_1_0/E=8,N=7168,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=8,N=7168,device_name=NVIDIA_H100_80GB_HBM3.json → triton_3_1_0/E=8,N=7168,device_name=NVIDIA_H100_80GB_HBM3.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=8,N=7168,device_name=NVIDIA_H200,dtype=fp8_w8a8.json → triton_3_1_0/E=8,N=7168,device_name=NVIDIA_H200,dtype=fp8_w8a8.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=8,N=7168,device_name=NVIDIA_H200.json → triton_3_1_0/E=8,N=7168,device_name=NVIDIA_H200.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=8,N=8192,device_name=AMD_Instinct_MI300X,dtype=fp8_w8a8.json → triton_3_1_0/E=8,N=8192,device_name=AMD_Instinct_MI300X,dtype=fp8_w8a8.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=8,N=8192,device_name=AMD_Instinct_MI325X,dtype=fp8_w8a8.json → triton_3_1_0/E=8,N=8192,device_name=AMD_Instinct_MI325X,dtype=fp8_w8a8.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=8,N=8192,device_name=AMD_Radeon_Graphics,dtype=fp8_w8a8.json → triton_3_1_0/E=8,N=8192,device_name=AMD_Radeon_Graphics,dtype=fp8_w8a8.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=8,N=8192,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8.json → triton_3_1_0/E=8,N=8192,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=8,N=8192,device_name=NVIDIA_H200,dtype=fp8_w8a8.json → triton_3_1_0/E=8,N=8192,device_name=NVIDIA_H200,dtype=fp8_w8a8.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=128,N=192,device_name=NVIDIA_A800-SXM4-80GB.json → triton_3_2_0/E=128,N=192,device_name=NVIDIA_A800-SXM4-80GB.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=128,N=192,device_name=NVIDIA_H100_80GB_HBM3.json → triton_3_2_0/E=128,N=192,device_name=NVIDIA_H100_80GB_HBM3.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=128,N=192,device_name=NVIDIA_H20.json → triton_3_2_0/E=128,N=192,device_name=NVIDIA_H20.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=128,N=192,device_name=NVIDIA_H200.json → triton_3_2_0/E=128,N=192,device_name=NVIDIA_H200.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=128,N=384,device_name=NVIDIA_H100_80GB_HBM3.json → triton_3_2_0/E=128,N=384,device_name=NVIDIA_H100_80GB_HBM3.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=128,N=384,device_name=NVIDIA_H20,dtype=fp8_w8a8,block_shape=[128, 128].json → triton_3_2_0/E=128,N=384,device_name=NVIDIA_H20,dtype=fp8_w8a8,block_shape=[128, 128].json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=128,N=384,device_name=NVIDIA_H20.json → triton_3_2_0/E=128,N=384,device_name=NVIDIA_H20.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=128,N=384,device_name=NVIDIA_H200,dtype=fp8_w8a8,block_shape=[128, 128].json → triton_3_2_0/E=128,N=384,device_name=NVIDIA_H200,dtype=fp8_w8a8,block_shape=[128, 128].json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=128,N=384,device_name=NVIDIA_H200.json → triton_3_2_0/E=128,N=384,device_name=NVIDIA_H200.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=128,N=512,device_name=NVIDIA_H100_80GB_HBM3.json → triton_3_2_0/E=128,N=512,device_name=NVIDIA_H100_80GB_HBM3.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=128,N=768,device_name=NVIDIA_A800-SXM4-80GB.json → triton_3_2_0/E=128,N=768,device_name=NVIDIA_A800-SXM4-80GB.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=128,N=768,device_name=NVIDIA_H100_80GB_HBM3.json → triton_3_2_0/E=128,N=768,device_name=NVIDIA_H100_80GB_HBM3.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=128,N=768,device_name=NVIDIA_H20,dtype=fp8_w8a8,block_shape=[128, 128].json → triton_3_2_0/E=128,N=768,device_name=NVIDIA_H20,dtype=fp8_w8a8,block_shape=[128, 128].json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=128,N=768,device_name=NVIDIA_H20.json → triton_3_2_0/E=128,N=768,device_name=NVIDIA_H20.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=128,N=768,device_name=NVIDIA_H200,dtype=fp8_w8a8,block_shape=[128, 128].json → triton_3_2_0/E=128,N=768,device_name=NVIDIA_H200,dtype=fp8_w8a8,block_shape=[128, 128].json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=128,N=768,device_name=NVIDIA_H200.json → triton_3_2_0/E=128,N=768,device_name=NVIDIA_H200.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=128,N=96,device_name=NVIDIA_H20.json → triton_3_2_0/E=128,N=96,device_name=NVIDIA_H20.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=264,N=128,device_name=NVIDIA_A800-SXM4-80GB,dtype=int8_w8a8.json → triton_3_2_0/E=264,N=128,device_name=NVIDIA_A800-SXM4-80GB,dtype=int8_w8a8.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=264,N=256,device_name=NVIDIA_B200,dtype=fp8_w8a8,block_shape=[128, 128].json → triton_3_2_0/E=264,N=256,device_name=NVIDIA_B200,dtype=fp8_w8a8,block_shape=[128, 128].json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=264,N=256,device_name=NVIDIA_H20,dtype=fp8_w8a8,block_shape=[128, 128].json → triton_3_2_0/E=264,N=256,device_name=NVIDIA_H20,dtype=fp8_w8a8,block_shape=[128, 128].json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=264,N=256,device_name=NVIDIA_H200,dtype=fp8_w8a8,block_shape=[128, 128].json → triton_3_2_0/E=264,N=256,device_name=NVIDIA_H200,dtype=fp8_w8a8,block_shape=[128, 128].json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=272,N=128,device_name=NVIDIA_A800-SXM4-80GB,dtype=int8_w8a8.json → triton_3_2_0/E=272,N=128,device_name=NVIDIA_A800-SXM4-80GB,dtype=int8_w8a8.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=272,N=128,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8,block_shape=[128, 128].json → triton_3_2_0/E=272,N=128,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8,block_shape=[128, 128].json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=272,N=128,device_name=NVIDIA_H20,dtype=fp8_w8a8,block_shape=[128, 128].json → triton_3_2_0/E=272,N=128,device_name=NVIDIA_H20,dtype=fp8_w8a8,block_shape=[128, 128].json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=272,N=64,device_name=NVIDIA_A800-SXM4-80GB.json → triton_3_2_0/E=272,N=64,device_name=NVIDIA_A800-SXM4-80GB.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=288,N=64,device_name=NVIDIA_A800-SXM4-80GB.json → triton_3_2_0/E=288,N=64,device_name=NVIDIA_A800-SXM4-80GB.json} +0 -0
- {sglang-0.4.6.post5.dist-info → sglang-0.4.7.post1.dist-info}/licenses/LICENSE +0 -0
- {sglang-0.4.6.post5.dist-info → sglang-0.4.7.post1.dist-info}/top_level.txt +0 -0
sglang/srt/layers/quantization/{deep_gemm.py → deep_gemm_wrapper/compile_utils.py}

@@ -5,33 +5,22 @@ from dataclasses import dataclass
 from enum import IntEnum, auto
 from typing import Callable, Dict, List, Optional, Tuple
 
-import torch
 from tqdm.contrib.concurrent import thread_map
 
+from sglang.srt.layers.quantization.deep_gemm_wrapper.configurer import (
+    DEEPGEMM_BLACKWELL,
+    ENABLE_JIT_DEEPGEMM,
+)
 from sglang.srt.server_args import ServerArgs
-from sglang.srt.utils import get_bool_env_var,
+from sglang.srt.utils import get_bool_env_var, get_int_env_var
 
 logger = logging.getLogger(__name__)
-_ENABLE_JIT_DEEPGEMM = False
 
-
-    import deep_gemm
+if ENABLE_JIT_DEEPGEMM and not DEEPGEMM_BLACKWELL:
     from deep_gemm import get_num_sms
-    from deep_gemm.jit
+    from deep_gemm.jit import build
     from deep_gemm.jit_kernels.gemm import get_best_configs
     from deep_gemm.jit_kernels.runtime import FP8GemmRuntime, GemmType
-    from deep_gemm.jit_kernels.tuner import jit_tuner
-
-    sm_version = get_device_sm()
-    if sm_version == 90:
-        if get_bool_env_var("SGL_ENABLE_JIT_DEEPGEMM", default="true"):
-            _ENABLE_JIT_DEEPGEMM = True
-except ImportError:
-    logger.warning("Failed to import deepgemm, disable _ENABLE_JIT_DEEPGEMM.")
-
-
-def get_enable_jit_deepgemm():
-    return _ENABLE_JIT_DEEPGEMM
 
 
 _BUILTIN_M_LIST = list(range(1, 1024 * 16 + 1))
@@ -52,8 +41,10 @@ os.environ["DG_JIT_CACHE_DIR"] = os.getenv(
 # NVRTC may have performance loss with some cases.
 # And NVCC JIT speed is also 9x faster in the ref commit
 _USE_NVRTC_DEFAULT = "0"
-if
+if ENABLE_JIT_DEEPGEMM:
     try:
+        from deep_gemm.jit.compiler import get_nvcc_compiler
+
         get_nvcc_compiler()
     except:
         logger.warning(
@@ -114,11 +105,12 @@ class DeepGemmKernelHelper:
 _INITIALIZATION_DICT: Dict[Tuple[DeepGemmKernelType, int, int, int], bool] = dict()
 
 
+# TODO improve naming
 def _compile_warning_1():
     if not _IN_PRECOMPILE_STAGE and _IS_FIRST_RANK_ON_NODE:
         logger.warning(
             "Entering DeepGEMM JIT Pre-Compile session. "
-            "
+            "It may takes a long time (typically 10-20 mins) "
             "if you have not run `sglang.compile_deep_gemm`. "
             "It is recommended to run `sglang.compile_deep_gemm` with same args as `sglang.launch_server`"
             " for pre-compilation to reduce the overhead if you have not run it before. "
@@ -127,6 +119,7 @@ def _compile_warning_1():
         )
 
 
+# TODO improve naming
 def _compile_warning_2():
     logger.warning(
         "Entering DeepGEMM JIT Single Kernel Compile session. "
@@ -148,32 +141,28 @@ def _compile_grouped_gemm_nt_f8f8bf16_masked_one(
     block_k = 128
     num_tma_threads = 128
     num_math_threads_per_group = 128
+
     kwargs = {
+        "GEMM_TYPE": GemmType.GroupedMasked,
         "NUM_TMA_THREADS": num_tma_threads,
         "NUM_MATH_THREADS_PER_GROUP": num_math_threads_per_group,
+        "N": n,
+        "K": k,
+        "NUM_GROUPS": 1,
+        "BLOCK_M": block_m,
+        "BLOCK_N": block_n,
         "BLOCK_K": block_k,
+        "SWIZZLE_D_MODE": smem_config[1],
+        "BLOCK_N_PADDING": smem_config[2],
+        "NUM_STAGES": num_stages,
+        "NUM_TMA_MULTICAST": tma_multicast_config[0],
+        "IS_TMA_MULTICAST_ON_A": tma_multicast_config[1],
         "NUM_SMS": num_sms,
         "SMEM_SIZE": smem_config[0],
     }
-
-
-
-            "N": n,
-            "K": k,
-            "BLOCK_M": block_m,
-            "BLOCK_N": block_n,
-            "SWIZZLE_D_MODE": smem_config[1],
-            "BLOCK_N_PADDING": smem_config[2],
-            "NUM_GROUPS": num_groups,
-            "NUM_STAGES": num_stages,
-            "NUM_TMA_MULTICAST": tma_multicast_config[0],
-            "IS_TMA_MULTICAST_ON_A": tma_multicast_config[1],
-            "GEMM_TYPE": GemmType.GroupedMasked,
-        },
-        space=(),
-        kwargs=kwargs,
-        runtime_cls=FP8GemmRuntime,
-    )
+
+    code = FP8GemmRuntime.generate(kwargs)
+    _ = build("m_grouped_gemm_fp8_fp8_bf16_nt", code, FP8GemmRuntime, kwargs)
 
 
 def _compile_grouped_gemm_nt_f8f8bf16_contig_one(
@@ -187,31 +176,26 @@ def _compile_grouped_gemm_nt_f8f8bf16_contig_one(
     num_tma_threads = 128
     num_math_threads_per_group = 128
     kwargs = {
+        "GEMM_TYPE": GemmType.GroupedContiguous,
         "NUM_TMA_THREADS": num_tma_threads,
         "NUM_MATH_THREADS_PER_GROUP": num_math_threads_per_group,
+        "N": n,
+        "K": k,
+        "NUM_GROUPS": 1,
+        "BLOCK_M": block_m,
+        "BLOCK_N": block_n,
         "BLOCK_K": block_k,
+        "SWIZZLE_D_MODE": smem_config[1],
+        "BLOCK_N_PADDING": smem_config[2],
+        "NUM_STAGES": num_stages,
+        "NUM_TMA_MULTICAST": tma_multicast_config[0],
+        "IS_TMA_MULTICAST_ON_A": tma_multicast_config[1],
         "NUM_SMS": num_sms,
         "SMEM_SIZE": smem_config[0],
     }
-
-
-
-            "N": n,
-            "K": k,
-            "BLOCK_M": block_m,
-            "BLOCK_N": block_n,
-            "SWIZZLE_D_MODE": smem_config[1],
-            "BLOCK_N_PADDING": smem_config[2],
-            "NUM_GROUPS": num_groups,
-            "NUM_STAGES": num_stages,
-            "NUM_TMA_MULTICAST": tma_multicast_config[0],
-            "IS_TMA_MULTICAST_ON_A": tma_multicast_config[1],
-            "GEMM_TYPE": GemmType.GroupedContiguous,
-        },
-        space=(),
-        kwargs=kwargs,
-        runtime_cls=FP8GemmRuntime,
-    )
+
+    code = FP8GemmRuntime.generate(kwargs)
+    _ = build("m_grouped_gemm_fp8_fp8_bf16_nt", code, FP8GemmRuntime, kwargs)
 
 
 def _compile_gemm_nt_f8f8bf16_one(
@@ -228,30 +212,26 @@ def _compile_gemm_nt_f8f8bf16_one(
         "GEMM_TYPE": GemmType.Normal,
         "NUM_TMA_THREADS": num_tma_threads,
         "NUM_MATH_THREADS_PER_GROUP": num_math_threads_per_group,
+        "N": n,
+        "K": k,
         "NUM_GROUPS": 1,
+        "BLOCK_M": block_m,
+        "BLOCK_N": block_n,
         "BLOCK_K": block_k,
+        "SWIZZLE_D_MODE": smem_config[1],
+        "BLOCK_N_PADDING": smem_config[2],
+        "NUM_STAGES": num_stages,
+        "NUM_TMA_MULTICAST": tma_multicast_config[0],
+        "IS_TMA_MULTICAST_ON_A": tma_multicast_config[1],
         "NUM_SMS": num_sms,
         "SMEM_SIZE": smem_config[0],
     }
-    _, _ = jit_tuner.compile_and_tune(
-        name="gemm_fp8_fp8_bf16_nt",
-        keys={
-            "N": n,
-            "K": k,
-            "BLOCK_M": block_m,
-            "BLOCK_N": block_n,
-            "SWIZZLE_D_MODE": smem_config[1],
-            "BLOCK_N_PADDING": smem_config[2],
-            "NUM_STAGES": num_stages,
-            "NUM_TMA_MULTICAST": tma_multicast_config[0],
-            "IS_TMA_MULTICAST_ON_A": tma_multicast_config[1],
-        },
-        space=(),
-        kwargs=kwargs,
-        runtime_cls=FP8GemmRuntime,
-    )
 
+    code = FP8GemmRuntime.generate(kwargs)
+    _ = build("gemm_fp8_fp8_bf16_nt", code, FP8GemmRuntime, kwargs)
 
+
+# TODO further refactor warmup-related
 _KERNEL_HELPER_DICT: Dict[DeepGemmKernelType, DeepGemmKernelHelper] = {
     DeepGemmKernelType.GROUPED_GEMM_NT_F8F8BF16_MASKED: DeepGemmKernelHelper(
         name="m_grouped_gemm_fp8_fp8_bf16_nt_masked",
@@ -284,7 +264,6 @@ def _maybe_compile_deep_gemm_one_type_all(
     num_groups: int,
     m_list: Optional[List[int]] = None,
 ) -> None:
-
     global _INITIALIZATION_DICT
     global _BUILTIN_M_LIST
 
@@ -318,56 +297,6 @@ def _maybe_compile_deep_gemm_one_type_all(
     thread_map(compile_func, collected_configs, max_workers=_COMPILE_WORKERS)
 
 
-def grouped_gemm_nt_f8f8bf16_masked(
-    lhs: Tuple[torch.Tensor, torch.Tensor],
-    rhs: Tuple[torch.Tensor, torch.Tensor],
-    out: torch.Tensor,
-    masked_m: torch.Tensor,
-    expected_m: int,
-):
-    num_groups, _, k = lhs[0].shape
-    _, n, _ = rhs[0].shape
-
-    kernel_type = DeepGemmKernelType.GROUPED_GEMM_NT_F8F8BF16_MASKED
-    _maybe_compile_deep_gemm_one_type_all(kernel_type, n, k, num_groups)
-
-    with _log_jit_build(expected_m, n, k, kernel_type):
-        deep_gemm.m_grouped_gemm_fp8_fp8_bf16_nt_masked(
-            lhs, rhs, out, masked_m, expected_m
-        )
-
-
-def grouped_gemm_nt_f8f8bf16_contig(
-    lhs: Tuple[torch.Tensor, torch.Tensor],
-    rhs: Tuple[torch.Tensor, torch.Tensor],
-    out: torch.Tensor,
-    m_indices: torch.Tensor,
-):
-    m, k = lhs[0].shape
-    num_groups, n, _ = rhs[0].shape
-
-    kernel_type = DeepGemmKernelType.GROUPED_GEMM_NT_F8F8BF16_CONTIG
-    _maybe_compile_deep_gemm_one_type_all(kernel_type, n, k, num_groups)
-
-    with _log_jit_build(m, n, k, kernel_type):
-        deep_gemm.m_grouped_gemm_fp8_fp8_bf16_nt_contiguous(lhs, rhs, out, m_indices)
-
-
-def gemm_nt_f8f8bf16(
-    lhs: Tuple[torch.Tensor, torch.Tensor],
-    rhs: Tuple[torch.Tensor, torch.Tensor],
-    out: torch.Tensor,
-):
-    m, k = lhs[0].shape
-    n, _ = rhs[0].shape
-
-    kernel_type = DeepGemmKernelType.GEMM_NT_F8F8BF16
-    _maybe_compile_deep_gemm_one_type_all(kernel_type, n, k, 1)
-
-    with _log_jit_build(m, n, k, kernel_type):
-        deep_gemm.gemm_fp8_fp8_bf16_nt(lhs, rhs, out)
-
-
 @contextmanager
 def _log_jit_build(M: int, N: int, K: int, kernel_type: DeepGemmKernelType):
     if _IN_PRECOMPILE_STAGE:
@@ -382,7 +311,8 @@ def _log_jit_build(M: int, N: int, K: int, kernel_type: DeepGemmKernelType):
         ret = origin_func(self, *args, **kwargs)
         if ret is None:
             kernel_helper = _KERNEL_HELPER_DICT[kernel_type]
-
+            if not DEEPGEMM_BLACKWELL:
+                _compile_warning_2()
             logger.warning(
                 f"DeepGEMM JIT Compiling for <{kernel_helper.name}> M={M}, N={N}, K={K}. Please wait."
             )
@@ -391,3 +321,15 @@ def _log_jit_build(M: int, N: int, K: int, kernel_type: DeepGemmKernelType):
     RuntimeCache.get = __patched_func
     yield
     RuntimeCache.get = origin_func
+
+
+@contextmanager
+def deep_gemm_execution_hook(
+    m: int, n: int, k: int, num_groups: int, kernel_type: DeepGemmKernelType
+):
+    # not supported yet
+    if not DEEPGEMM_BLACKWELL:
+        _maybe_compile_deep_gemm_one_type_all(kernel_type, n, k, num_groups)
+
+    with _log_jit_build(m, n, k, kernel_type):
+        yield
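With these hunks, compile_utils keeps only the pre-compilation bookkeeping and hands it to callers through a single deep_gemm_execution_hook context manager; the public GEMM entry points move to the new entrypoint module shown further below. A minimal, self-contained analog of that hook pattern (illustrative names only, not sglang code):

from contextlib import contextmanager

_compiled_families = set()

def _maybe_compile_all(n: int, k: int, num_groups: int) -> None:
    # Analog of _maybe_compile_deep_gemm_one_type_all: the first request for a
    # (N, K, num_groups) family triggers compilation of every M variant at once.
    key = (n, k, num_groups)
    if key not in _compiled_families:
        _compiled_families.add(key)
        print(f"pre-compiling kernel family N={n} K={k} groups={num_groups}")

@contextmanager
def execution_hook(m: int, n: int, k: int, num_groups: int):
    # Analog of deep_gemm_execution_hook: maybe-compile first (skipped on
    # Blackwell in the real code), then log any JIT build around the launch.
    _maybe_compile_all(n, k, num_groups)
    print(f"launching GEMM M={m} N={n} K={k}")
    yield

with execution_hook(64, 4096, 7168, num_groups=1):
    pass  # the actual deep_gemm kernel call would go here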
sglang/srt/layers/quantization/deep_gemm_wrapper/configurer.py (new file)

@@ -0,0 +1,32 @@
+import logging
+
+from sglang.srt.utils import get_bool_env_var, get_device_sm
+
+logger = logging.getLogger(__name__)
+
+
+def _compute_enable_deep_gemm():
+    sm_version = get_device_sm()
+    if sm_version < 90:
+        return False
+
+    try:
+        import deep_gemm
+    except ImportError:
+        logger.warning("Failed to import deep_gemm, disable ENABLE_JIT_DEEPGEMM.")
+        return False
+
+    return get_bool_env_var("SGL_ENABLE_JIT_DEEPGEMM", default="true")
+
+
+ENABLE_JIT_DEEPGEMM = _compute_enable_deep_gemm()
+
+try:
+    from deep_gemm import fp8_gemm_nt
+
+    # They have not given a name to this breaking change
+    DEEPGEMM_BLACKWELL = True
+except ImportError:
+    DEEPGEMM_BLACKWELL = False
+
+DEEPGEMM_SCALE_UE8M0 = DEEPGEMM_BLACKWELL
sglang/srt/layers/quantization/deep_gemm_wrapper/entrypoint.py (new file)

@@ -0,0 +1,110 @@
+import logging
+from contextlib import contextmanager
+from typing import Tuple
+
+import torch
+
+from sglang.srt.layers.quantization.deep_gemm_wrapper import compile_utils
+from sglang.srt.layers.quantization.deep_gemm_wrapper.configurer import (
+    DEEPGEMM_BLACKWELL,
+    DEEPGEMM_SCALE_UE8M0,
+    ENABLE_JIT_DEEPGEMM,
+)
+from sglang.srt.server_args import ServerArgs
+
+logger = logging.getLogger(__name__)
+
+if ENABLE_JIT_DEEPGEMM:
+    import deep_gemm
+
+    if DEEPGEMM_BLACKWELL:
+        from deep_gemm import fp8_gemm_nt as _gemm_nt_f8f8bf16_raw
+        from deep_gemm import (
+            fp8_m_grouped_gemm_nt_masked as _grouped_gemm_nt_f8f8bf16_masked_raw,
+        )
+        from deep_gemm import (
+            m_grouped_fp8_gemm_nt_contiguous as _grouped_gemm_nt_f8f8bf16_contig_raw,
+        )
+    else:
+        from deep_gemm import gemm_fp8_fp8_bf16_nt as _gemm_nt_f8f8bf16_raw
+        from deep_gemm import get_col_major_tma_aligned_tensor
+        from deep_gemm import (
+            m_grouped_gemm_fp8_fp8_bf16_nt_contiguous as _grouped_gemm_nt_f8f8bf16_contig_raw,
+        )
+        from deep_gemm import (
+            m_grouped_gemm_fp8_fp8_bf16_nt_masked as _grouped_gemm_nt_f8f8bf16_masked_raw,
+        )
+
+
+def grouped_gemm_nt_f8f8bf16_masked(
+    lhs: Tuple[torch.Tensor, torch.Tensor],
+    rhs: Tuple[torch.Tensor, torch.Tensor],
+    out: torch.Tensor,
+    masked_m: torch.Tensor,
+    expected_m: int,
+    recipe=None,
+):
+    num_groups, _, k = lhs[0].shape
+    _, n, _ = rhs[0].shape
+    kernel_type = compile_utils.DeepGemmKernelType.GROUPED_GEMM_NT_F8F8BF16_MASKED
+
+    with compile_utils.deep_gemm_execution_hook(
+        expected_m, n, k, num_groups, kernel_type
+    ):
+        _grouped_gemm_nt_f8f8bf16_masked_raw(
+            lhs,
+            rhs,
+            out,
+            masked_m,
+            expected_m,
+            **({"recipe": recipe} if DEEPGEMM_BLACKWELL else {})
+        )
+
+
+def grouped_gemm_nt_f8f8bf16_contig(
+    lhs: Tuple[torch.Tensor, torch.Tensor],
+    rhs: Tuple[torch.Tensor, torch.Tensor],
+    out: torch.Tensor,
+    m_indices: torch.Tensor,
+):
+    m, k = lhs[0].shape
+    num_groups, n, _ = rhs[0].shape
+    kernel_type = compile_utils.DeepGemmKernelType.GROUPED_GEMM_NT_F8F8BF16_CONTIG
+
+    with compile_utils.deep_gemm_execution_hook(m, n, k, num_groups, kernel_type):
+        _grouped_gemm_nt_f8f8bf16_contig_raw(lhs, rhs, out, m_indices)
+
+
+def gemm_nt_f8f8bf16(
+    lhs: Tuple[torch.Tensor, torch.Tensor],
+    rhs: Tuple[torch.Tensor, torch.Tensor],
+    out: torch.Tensor,
+):
+    m, k = lhs[0].shape
+    n, _ = rhs[0].shape
+    num_groups = 1
+    kernel_type = compile_utils.DeepGemmKernelType.GEMM_NT_F8F8BF16
+
+    with compile_utils.deep_gemm_execution_hook(m, n, k, num_groups, kernel_type):
+        _gemm_nt_f8f8bf16_raw(
+            lhs,
+            rhs,
+            out,
+        )
+
+
+def update_deep_gemm_config(gpu_id: int, server_args: ServerArgs):
+    compile_utils.update_deep_gemm_config(gpu_id, server_args)
+
+
+@contextmanager
+def configure_deep_gemm_num_sms(num_sms):
+    if num_sms is None:
+        yield
+    else:
+        original_num_sms = deep_gemm.get_num_sms()
+        deep_gemm.set_num_sms(num_sms)
+        try:
+            yield
+        finally:
+            deep_gemm.set_num_sms(original_num_sms)
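For orientation, a hedged sketch of how a caller might use the new wrapper. The direct entrypoint import, the placeholder tensors, and the scale shapes are assumptions not shown in this diff (any re-export in deep_gemm_wrapper/__init__.py and DeepGEMM's scale-alignment requirements are glossed over here), so treat this as an illustration of the call signatures rather than production code.

import torch

from sglang.srt.layers.quantization.deep_gemm_wrapper import entrypoint as deep_gemm_wrapper

m, k, n = 128, 7168, 4096
# (tensor, scales) pairs: assumed per-token x 128-channel scales for lhs and
# 128x128 block scales for rhs; values are placeholders, not a real workload.
lhs = (
    torch.randn(m, k, device="cuda").to(torch.float8_e4m3fn),
    torch.ones(m, k // 128, device="cuda", dtype=torch.float32),
)
rhs = (
    torch.randn(n, k, device="cuda").to(torch.float8_e4m3fn),
    torch.ones(n // 128, k // 128, device="cuda", dtype=torch.float32),
)
out = torch.empty(m, n, device="cuda", dtype=torch.bfloat16)

# Optionally cap the SM count for the duration of the call, e.g. to leave SMs
# free for an overlapping communication kernel.
with deep_gemm_wrapper.configure_deep_gemm_num_sms(112):
    deep_gemm_wrapper.gemm_nt_f8f8bf16(lhs, rhs, out)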
@@ -49,10 +49,9 @@ from sglang.srt.layers.quantization.fp8_kernel import (
 )
 from sglang.srt.layers.quantization.fp8_utils import (
     apply_fp8_linear,
-    apply_w8a8_block_fp8_linear,
     cutlass_fp8_supported,
+    dispatch_w8a8_block_fp8_linear,
     input_to_float8,
-    is_sm100_supported,
     normalize_e4m3fn_to_e4m3fnuz,
 )
 from sglang.srt.layers.quantization.kv_cache import BaseKVCacheMethod
@@ -63,6 +62,7 @@ from sglang.srt.layers.quantization.utils import (
     per_tensor_dequantize,
     requantize_with_max_scale,
 )
+from sglang.srt.layers.utils import is_sm100_supported
 from sglang.srt.utils import (
     get_bool_env_var,
     is_cuda,
@@ -77,8 +77,8 @@ _is_cuda = is_cuda()

 _is_fp8_fnuz = is_fp8_fnuz()

-
-
+_use_hip_int4 = get_bool_env_var("SGLANG_INT4_WEIGHT")
+_use_aiter = get_bool_env_var("SGLANG_USE_AITER") and _is_hip

 if _is_hip:
     from aiter import ActivationType, QuantType
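The two new module-level flags are plain environment-variable reads, so they are fixed when the quantization module is imported. A hedged sketch of one way a ROCm deployment could opt in; only the variable names SGLANG_INT4_WEIGHT and SGLANG_USE_AITER come from the diff, the rest is an assumption about how they are set:

    # Sketch only: setting the variables from Python before importing sglang is just
    # one way to make sure they are visible when fp8.py is imported.
    import os

    os.environ["SGLANG_INT4_WEIGHT"] = "1"  # enable the INT4-packed MoE weight path
    os.environ["SGLANG_USE_AITER"] = "1"    # enable aiter kernels; only honored on HIP (_is_hip)

    import sglang  # _use_hip_int4 / _use_aiter are evaluated during this import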
@@ -209,6 +209,8 @@ class Fp8LinearMethod(LinearMethodBase):
             # Marlin doesn't support block-wise fp8
             self.use_marlin = False

+        self.w8a8_block_fp8_linear = dispatch_w8a8_block_fp8_linear()
+
     def create_weights(
         self,
         layer: torch.nn.Module,
@@ -417,7 +419,7 @@ class Fp8LinearMethod(LinearMethodBase):
             )

         if self.block_quant:
-            return
+            return self.w8a8_block_fp8_linear(
                 input=x,
                 weight=layer.weight,
                 block_size=self.quant_config.weight_block_size,
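Fp8LinearMethod now resolves the block-FP8 GEMM implementation once in its constructor via dispatch_w8a8_block_fp8_linear() and calls the returned function in apply. A generic sketch of that dispatch-once pattern; the candidate backends and the selection rule below are invented placeholders, not sglang's actual logic:

    # Placeholder illustration of the "dispatch once, call many times" pattern.
    from typing import Callable

    import torch


    def _triton_w8a8_block_fp8(input: torch.Tensor, weight: torch.Tensor, **kw) -> torch.Tensor:
        ...  # placeholder backend


    def _cutlass_w8a8_block_fp8(input: torch.Tensor, weight: torch.Tensor, **kw) -> torch.Tensor:
        ...  # placeholder backend


    def dispatch_w8a8_block_fp8_linear_sketch() -> Callable[..., torch.Tensor]:
        # Pick a backend once at layer construction time instead of re-checking
        # hardware capabilities on every forward pass.
        if torch.cuda.is_available() and torch.cuda.get_device_capability()[0] >= 9:
            return _cutlass_w8a8_block_fp8
        return _triton_w8a8_block_fp8


    class LinearMethodSketch:
        def __init__(self):
            self.w8a8_block_fp8_linear = dispatch_w8a8_block_fp8_linear_sketch()

        def apply(self, layer, x):
            return self.w8a8_block_fp8_linear(input=x, weight=layer.weight)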
@@ -485,7 +487,7 @@ class Fp8MoEMethod:
         from sglang.srt.layers.moe.fused_moe_triton import FusedMoeWeightScaleSupported

         if self.quant_config.is_checkpoint_fp8_serialized:
-            params_dtype = torch.uint32 if
+            params_dtype = torch.uint32 if _use_hip_int4 else torch.float8_e4m3fn
         tp_size = get_tensor_model_parallel_world_size()
         if self.block_quant:
             block_n, block_k = (
@@ -510,7 +512,7 @@ class Fp8MoEMethod:
         )

         # WEIGHTS
-        if _is_hip and
+        if _is_hip and _use_hip_int4:
             # INT4 MoE weight - INT32 packed
             w13_weight = torch.nn.Parameter(
                 torch.empty(
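The _use_hip_int4 branch stores MoE weights as INT4 values packed into 32-bit words (params_dtype = torch.uint32). As a rough illustration of that storage idea only, eight 4-bit values can be carried in one int32; the packing order and layout below are invented for the example and are not aiter's actual scheme:

    # Illustration: pack eight unsigned 4-bit values per 32-bit word, so a [..., K]
    # int4 tensor can be carried in a [..., K // 8] int32 parameter.
    import torch


    def pack_int4(nibbles: torch.Tensor) -> torch.Tensor:
        # nibbles: integer tensor with values in [0, 15], last dim divisible by 8
        assert nibbles.shape[-1] % 8 == 0
        n = nibbles.to(torch.int64).reshape(*nibbles.shape[:-1], -1, 8)
        shifts = torch.arange(0, 32, 4, dtype=torch.int64)          # 0, 4, ..., 28
        words = (n << shifts).sum(dim=-1)                           # exact in int64
        words = torch.where(words >= 2**31, words - 2**32, words)   # reinterpret as signed
        return words.to(torch.int32)                                # [..., K // 8]


    def unpack_int4(packed: torch.Tensor) -> torch.Tensor:
        p = (packed.to(torch.int64) & 0xFFFFFFFF).unsqueeze(-1)
        shifts = torch.arange(0, 32, 4, dtype=torch.int64)
        nibbles = (p >> shifts) & 0xF
        return nibbles.reshape(*packed.shape[:-1], -1)               # [..., K]


    w = torch.randint(0, 16, (4, 16), dtype=torch.int64)
    assert torch.equal(unpack_int4(pack_int4(w)), w)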
@@ -571,7 +573,7 @@ class Fp8MoEMethod:
             layer.register_parameter("w2_weight_scale_inv", w2_weight_scale)
             assert self.quant_config.activation_scheme == "dynamic"
             if (
-                get_bool_env_var("
+                get_bool_env_var("SGLANG_CUTLASS_MOE")
                 and self.cutlass_fp8_supported
                 and is_sm100_supported()
             ):
@@ -639,7 +641,7 @@ class Fp8MoEMethod:
             layer.register_parameter("w13_weight_scale", w13_weight_scale)
             layer.register_parameter("w2_weight_scale", w2_weight_scale)

-            if _is_hip:  #
+            if _is_hip:  # _use_aiter: TODO: add check back after triton kernel
                 # ROCm - using column scaling, duplicate scaling numbers in case per tensor scaling
                 w13_weight_scale1 = torch.nn.Parameter(
                     torch.ones(num_experts, 2 * intermediate_size, dtype=torch.float32),
@@ -666,7 +668,7 @@ class Fp8MoEMethod:
         set_weight_attrs(w13_weight_scale, extra_weight_attrs)
         set_weight_attrs(w2_weight_scale, extra_weight_attrs)

-        if _is_hip and
+        if _is_hip and _use_hip_int4:
             extra_weight_attrs.update(
                 {"quant_method": FusedMoeWeightScaleSupported.CHANNEL.value}
             )
@@ -698,7 +700,7 @@ class Fp8MoEMethod:
             layer.w2_input_scale = None

     def process_weights_after_loading(self, layer: Module) -> None:
-        if _is_hip and
+        if _is_hip and _use_hip_int4:
             self.process_weights_hip_int4(layer)
             return

@@ -729,7 +731,7 @@ class Fp8MoEMethod:
                 )
                 layer.w2_input_scale = None

-            if
+            if _use_aiter:
                 # Pre-shuffle weights
                 layer.w13_weight.data = shuffle_weight(
                     layer.w13_weight.contiguous(), (16, 16)
@@ -851,7 +853,7 @@ class Fp8MoEMethod:
             return

     def process_weights_hip_int4(self, layer: Module):
-        # TODO:
+        # TODO: _use_aiter: add after triton kernel added
         # INT4-FP8 (INT4 MoE Weight, FP8 Compute)
         # Weight Permutation
         layer.w13_weight = torch.nn.Parameter(
@@ -898,7 +900,7 @@ class Fp8MoEMethod:
             padding_size,  # Avoid circular import
         )

-        if
+        if _use_aiter:
            layer.w13_weight = torch.nn.Parameter(
                shuffle_weight(layer.w13_weight.data, (16, 16)),
                requires_grad=False,
@@ -909,7 +911,7 @@ class Fp8MoEMethod:
                requires_grad=False,
            )
            torch.cuda.empty_cache()
-        # ROCm (
+           # ROCm (_use_aiter): using column-wise scaling
            layer.w13_weight_scale1 *= layer.w13_weight_scale.unsqueeze(-1)
            layer.w2_weight_scale1 *= layer.w2_weight_scale.unsqueeze(-1)
        elif get_bool_env_var("SGLANG_MOE_PADDING"):
@@ -935,6 +937,7 @@ class Fp8MoEMethod:
         use_grouped_topk: bool,
         topk_group: Optional[int] = None,
         num_expert_group: Optional[int] = None,
+        num_fused_shared_experts: int = 0,
         custom_routing_function: Optional[Callable] = None,
         correction_bias: Optional[torch.Tensor] = None,
         activation: str = "silu",
@@ -955,6 +958,7 @@ class Fp8MoEMethod:
             renormalize=renormalize,
             topk_group=topk_group,
             num_expert_group=num_expert_group,
+            num_fused_shared_experts=num_fused_shared_experts,
             custom_routing_function=custom_routing_function,
             correction_bias=correction_bias,
             routed_scaling_factor=routed_scaling_factor,
@@ -973,14 +977,14 @@ class Fp8MoEMethod:
             return ret

         if (
-            get_bool_env_var("
+            get_bool_env_var("SGLANG_CUTLASS_MOE")
             and self.cutlass_fp8_supported
             and self.block_quant
             and is_sm100_supported()
         ):
-            from sglang.srt.layers.moe.cutlass_moe import
+            from sglang.srt.layers.moe.cutlass_moe import cutlass_fused_experts_fp8

-            return
+            return cutlass_fused_experts_fp8(
                 x,
                 layer.w13_weight.transpose(1, 2),
                 layer.w2_weight.transpose(1, 2),
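The CUTLASS fused-experts path is opt-in: it is only taken when SGLANG_CUTLASS_MOE is set and the deployment uses block-quantized FP8 on SM100-class hardware with CUTLASS FP8 support. A small sketch of the opt-in; only the environment variable name is taken from the diff, and the commented checks simply restate the condition shown above:

    # Sketch: request the cutlass_fused_experts_fp8 path before the server starts.
    import os

    os.environ["SGLANG_CUTLASS_MOE"] = "1"

    # At runtime the method still verifies (see the hunk above):
    #   get_bool_env_var("SGLANG_CUTLASS_MOE")
    #   and self.cutlass_fp8_supported   # CUTLASS FP8 kernels are usable
    #   and self.block_quant             # weights are block-quantized FP8
    #   and is_sm100_supported()         # SM100-class GPU
    # Otherwise it falls back to the default fused-experts implementation.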
@@ -1026,6 +1030,7 @@ class Fp8MoEMethod:
             a2_scale=layer.w2_input_scale,
             block_shape=self.quant_config.weight_block_size,
             no_combine=no_combine,
+            routed_scaling_factor=routed_scaling_factor,
         )

     def maybe_apply_hip_fused_experts(
@@ -1037,8 +1042,8 @@ class Fp8MoEMethod:
         activation: str = "silu",
         no_combine: bool = False,
     ) -> Optional[torch.Tensor]:
-        if
-            # TODO: add triton kernel and add check
+        if _use_hip_int4:
+            # TODO: add triton kernel and add check _use_aiter
             assert not no_combine, f"{no_combine=} is not supported."
             return ck_moe_2stages(
                 x,
@@ -1054,13 +1059,13 @@ class Fp8MoEMethod:
             ),
         )

-        if
+        if _use_aiter:
             assert not no_combine, f"{no_combine=} is not supported."
             if self.block_quant:
-                # TODO(
+                # TODO(_use_aiter): FP8 block_quant only supports 'silu' for the time-being.
                 assert (
                     activation == "silu"
-                ), f"
+                ), f"_use_aiter: FP8 bloack_quant {activation=} will be supported later, unset _use_aiter"
             return asm_moe(
                 x,
                 layer.w13_weight,