sglang 0.4.6.post5__py3-none-any.whl → 0.4.7.post1__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- sglang/__init__.py +2 -0
- sglang/api.py +7 -0
- sglang/bench_offline_throughput.py +10 -4
- sglang/bench_one_batch_server.py +67 -11
- sglang/bench_serving.py +86 -75
- sglang/lang/backend/runtime_endpoint.py +24 -1
- sglang/lang/interpreter.py +40 -1
- sglang/lang/ir.py +27 -0
- sglang/math_utils.py +8 -0
- sglang/profiler.py +167 -0
- sglang/srt/_custom_ops.py +34 -0
- sglang/srt/configs/internvl.py +8 -12
- sglang/srt/configs/model_config.py +33 -1
- sglang/srt/constrained/base_grammar_backend.py +5 -2
- sglang/srt/constrained/llguidance_backend.py +9 -8
- sglang/srt/constrained/outlines_backend.py +5 -4
- sglang/srt/constrained/xgrammar_backend.py +18 -18
- sglang/srt/conversation.py +52 -8
- sglang/srt/custom_op.py +38 -3
- sglang/srt/debug_utils.py +74 -0
- sglang/srt/disaggregation/base/__init__.py +1 -1
- sglang/srt/disaggregation/base/conn.py +25 -11
- sglang/srt/disaggregation/common/__init__.py +5 -0
- sglang/srt/disaggregation/common/conn.py +407 -0
- sglang/srt/disaggregation/common/utils.py +42 -0
- sglang/srt/disaggregation/decode.py +261 -52
- sglang/srt/disaggregation/fake/__init__.py +1 -1
- sglang/srt/disaggregation/fake/conn.py +16 -9
- sglang/srt/disaggregation/kv_events.py +60 -5
- sglang/srt/disaggregation/launch_lb.py +140 -0
- sglang/srt/disaggregation/mini_lb.py +29 -48
- sglang/srt/disaggregation/mooncake/__init__.py +1 -1
- sglang/srt/disaggregation/mooncake/conn.py +446 -149
- sglang/srt/disaggregation/mooncake/transfer_engine.py +32 -16
- sglang/srt/disaggregation/nixl/__init__.py +6 -1
- sglang/srt/disaggregation/nixl/conn.py +134 -437
- sglang/srt/disaggregation/prefill.py +130 -43
- sglang/srt/disaggregation/utils.py +127 -86
- sglang/srt/distributed/device_communicators/pymscclpp.py +315 -0
- sglang/srt/distributed/parallel_state.py +52 -5
- sglang/srt/entrypoints/EngineBase.py +6 -0
- sglang/srt/entrypoints/engine.py +116 -5
- sglang/srt/entrypoints/http_server.py +28 -4
- sglang/srt/eplb_simulator/__init__.py +1 -0
- sglang/srt/eplb_simulator/reader.py +51 -0
- sglang/srt/function_call/base_format_detector.py +138 -86
- sglang/srt/function_call/deepseekv3_detector.py +54 -6
- sglang/srt/function_call/ebnf_composer.py +33 -19
- sglang/srt/function_call/function_call_parser.py +27 -0
- sglang/srt/function_call/llama32_detector.py +33 -14
- sglang/srt/function_call/mistral_detector.py +73 -26
- sglang/srt/function_call/pythonic_detector.py +86 -20
- sglang/srt/function_call/qwen25_detector.py +64 -10
- sglang/srt/function_call/utils.py +17 -0
- sglang/srt/hf_transformers_utils.py +4 -0
- sglang/srt/layers/activation.py +19 -0
- sglang/srt/layers/attention/aiter_backend.py +503 -125
- sglang/srt/layers/attention/base_attn_backend.py +4 -0
- sglang/srt/layers/attention/cutlass_mla_backend.py +40 -34
- sglang/srt/layers/attention/flashattention_backend.py +137 -63
- sglang/srt/layers/attention/flashinfer_backend.py +46 -3
- sglang/srt/layers/attention/flashinfer_mla_backend.py +59 -25
- sglang/srt/layers/attention/flashmla_backend.py +2 -10
- sglang/srt/layers/attention/intel_amx_backend.py +128 -0
- sglang/srt/layers/attention/tbo_backend.py +232 -0
- sglang/srt/layers/attention/torch_native_backend.py +3 -0
- sglang/srt/layers/attention/triton_backend.py +304 -65
- sglang/srt/layers/attention/triton_ops/decode_attention.py +2 -7
- sglang/srt/layers/attention/triton_ops/extend_attention.py +12 -4
- sglang/srt/layers/attention/vision.py +51 -24
- sglang/srt/layers/communicator.py +281 -197
- sglang/srt/layers/dp_attention.py +6 -5
- sglang/srt/layers/layernorm.py +30 -19
- sglang/srt/layers/linear.py +0 -4
- sglang/srt/layers/logits_processor.py +0 -12
- sglang/srt/layers/moe/cutlass_moe.py +170 -7
- sglang/srt/layers/moe/cutlass_moe_params.py +169 -0
- sglang/srt/layers/moe/ep_moe/kernels.py +33 -11
- sglang/srt/layers/moe/ep_moe/layer.py +136 -72
- sglang/srt/layers/moe/ep_moe/token_dispatcher.py +24 -45
- sglang/srt/layers/moe/fused_moe_native.py +4 -0
- sglang/srt/layers/moe/fused_moe_triton/configs/triton_3_2_0/E=257,N=128,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8,block_shape=[128, 128].json +146 -0
- sglang/srt/layers/moe/fused_moe_triton/configs/triton_3_2_0/E=257,N=128,device_name=NVIDIA_H20,dtype=fp8_w8a8,block_shape=[128, 128].json +146 -0
- sglang/srt/layers/moe/fused_moe_triton/configs/triton_3_2_0/E=257,N=256,device_name=NVIDIA_B200,dtype=fp8_w8a8,block_shape=[128, 128].json +146 -0
- sglang/srt/layers/moe/fused_moe_triton/configs/triton_3_2_0/E=257,N=256,device_name=NVIDIA_H20,dtype=fp8_w8a8,block_shape=[128, 128].json +146 -0
- sglang/srt/layers/moe/fused_moe_triton/configs/triton_3_2_0/E=257,N=256,device_name=NVIDIA_H200,dtype=fp8_w8a8,block_shape=[128, 128].json +146 -0
- sglang/srt/layers/moe/fused_moe_triton/configs/triton_3_2_0/E=8,N=7168,device_name=NVIDIA_H100_80GB_HBM3.json +146 -0
- sglang/srt/layers/moe/fused_moe_triton/configs/triton_3_3_1/E=257,N=256,device_name=NVIDIA_B200,dtype=fp8_w8a8,block_shape=[128, 128].json +146 -0
- sglang/srt/layers/moe/fused_moe_triton/configs/triton_3_3_1/E=8,N=7168,device_name=NVIDIA_H100_80GB_HBM3.json +146 -0
- sglang/srt/layers/moe/fused_moe_triton/fused_moe.py +221 -29
- sglang/srt/layers/moe/fused_moe_triton/layer.py +34 -4
- sglang/srt/layers/moe/topk.py +60 -26
- sglang/srt/layers/multimodal.py +3 -3
- sglang/srt/layers/pooler.py +56 -0
- sglang/srt/layers/quantization/__init__.py +3 -2
- sglang/srt/layers/quantization/blockwise_int8.py +3 -0
- sglang/srt/layers/quantization/compressed_tensors/compressed_tensors_moe.py +5 -0
- sglang/srt/layers/quantization/deep_gemm_wrapper/__init__.py +1 -0
- sglang/srt/layers/quantization/{deep_gemm.py → deep_gemm_wrapper/compile_utils.py} +69 -127
- sglang/srt/layers/quantization/deep_gemm_wrapper/configurer.py +32 -0
- sglang/srt/layers/quantization/deep_gemm_wrapper/entrypoint.py +110 -0
- sglang/srt/layers/quantization/fp8.py +28 -23
- sglang/srt/layers/quantization/fp8_kernel.py +156 -75
- sglang/srt/layers/quantization/fp8_utils.py +250 -69
- sglang/srt/layers/quantization/modelopt_quant.py +334 -7
- sglang/srt/layers/quantization/moe_wna16.py +3 -0
- sglang/srt/layers/quantization/w8a8_fp8.py +3 -0
- sglang/srt/layers/quantization/w8a8_int8.py +3 -0
- sglang/srt/layers/radix_attention.py +2 -3
- sglang/srt/layers/rotary_embedding.py +6 -12
- sglang/srt/layers/sampler.py +80 -79
- sglang/srt/layers/utils.py +6 -0
- sglang/srt/lora/layers.py +12 -15
- sglang/srt/lora/lora.py +49 -5
- sglang/srt/lora/lora_manager.py +98 -39
- sglang/srt/lora/mem_pool.py +28 -21
- sglang/srt/lora/utils.py +17 -13
- sglang/srt/managers/cache_controller.py +2 -1
- sglang/srt/managers/data_parallel_controller.py +13 -5
- sglang/srt/managers/eplb_algorithms/__init__.py +63 -0
- sglang/srt/managers/eplb_algorithms/deepseek.py +223 -0
- sglang/srt/managers/{deepseek_eplb.py → eplb_algorithms/deepseek_vec.py} +5 -7
- sglang/srt/managers/eplb_manager.py +55 -14
- sglang/srt/managers/expert_distribution.py +220 -46
- sglang/srt/managers/expert_location.py +110 -56
- sglang/srt/managers/expert_location_dispatch.py +23 -6
- sglang/srt/managers/io_struct.py +43 -8
- sglang/srt/managers/mm_utils.py +88 -38
- sglang/srt/managers/multimodal_processors/base_processor.py +190 -18
- sglang/srt/managers/multimodal_processors/gemma3.py +4 -31
- sglang/srt/managers/multimodal_processors/internvl.py +4 -0
- sglang/srt/managers/multimodal_processors/kimi_vl.py +15 -34
- sglang/srt/managers/multimodal_processors/minicpm.py +2 -1
- sglang/srt/managers/multimodal_processors/phi4mm.py +87 -0
- sglang/srt/managers/multimodal_processors/qwen_vl.py +22 -64
- sglang/srt/managers/multimodal_processors/vila.py +85 -0
- sglang/srt/managers/schedule_batch.py +173 -38
- sglang/srt/managers/scheduler.py +376 -127
- sglang/srt/managers/tokenizer_manager.py +163 -19
- sglang/srt/managers/utils.py +0 -4
- sglang/srt/mem_cache/chunk_cache.py +1 -0
- sglang/srt/mem_cache/hiradix_cache.py +4 -2
- sglang/srt/mem_cache/memory_pool.py +111 -407
- sglang/srt/mem_cache/memory_pool_host.py +380 -0
- sglang/srt/mem_cache/radix_cache.py +36 -12
- sglang/srt/metrics/collector.py +9 -0
- sglang/srt/model_executor/cuda_graph_runner.py +191 -113
- sglang/srt/model_executor/expert_location_updater.py +157 -22
- sglang/srt/model_executor/forward_batch_info.py +52 -22
- sglang/srt/model_executor/model_runner.py +102 -62
- sglang/srt/model_loader/loader.py +8 -1
- sglang/srt/model_loader/utils.py +67 -1
- sglang/srt/models/bert.py +113 -13
- sglang/srt/models/deepseek_nextn.py +1 -1
- sglang/srt/models/deepseek_v2.py +623 -290
- sglang/srt/models/gemma3_causal.py +7 -0
- sglang/srt/models/gemma3_mm.py +19 -14
- sglang/srt/models/idefics2.py +342 -0
- sglang/srt/models/internvl.py +46 -102
- sglang/srt/models/kimi_vl.py +4 -4
- sglang/srt/models/llama.py +1 -1
- sglang/srt/models/minicpmo.py +2 -5
- sglang/srt/models/minicpmv.py +3 -295
- sglang/srt/models/phi4mm.py +512 -0
- sglang/srt/models/qwen2.py +38 -9
- sglang/srt/models/qwen2_5_vl.py +3 -9
- sglang/srt/models/qwen2_eagle.py +4 -1
- sglang/srt/models/qwen2_moe.py +58 -191
- sglang/srt/models/qwen2_vl.py +3 -9
- sglang/srt/models/qwen3.py +41 -10
- sglang/srt/models/qwen3_moe.py +230 -191
- sglang/srt/models/registry.py +9 -1
- sglang/srt/models/roberta.py +117 -9
- sglang/srt/models/transformers.py +291 -0
- sglang/srt/models/vila.py +305 -0
- sglang/srt/openai_api/adapter.py +248 -28
- sglang/srt/openai_api/protocol.py +68 -3
- sglang/srt/openai_api/utils.py +172 -0
- sglang/srt/operations.py +37 -2
- sglang/srt/operations_strategy.py +200 -24
- sglang/srt/sampling/sampling_batch_info.py +37 -1
- sglang/srt/sampling/sampling_params.py +4 -1
- sglang/srt/server_args.py +381 -209
- sglang/srt/speculative/build_eagle_tree.py +9 -9
- sglang/srt/speculative/eagle_draft_cuda_graph_runner.py +12 -14
- sglang/srt/speculative/eagle_draft_extend_cuda_graph_runner.py +256 -0
- sglang/srt/speculative/eagle_utils.py +440 -200
- sglang/srt/speculative/eagle_worker.py +234 -63
- sglang/srt/two_batch_overlap.py +637 -0
- sglang/srt/utils.py +187 -7
- sglang/test/attention/test_prefix_chunk_info.py +2 -0
- sglang/test/runners.py +54 -10
- sglang/test/send_one.py +4 -0
- sglang/test/test_block_fp8.py +1 -0
- sglang/test/test_block_fp8_deep_gemm_blackwell.py +252 -0
- sglang/test/test_block_fp8_ep.py +1 -0
- sglang/test/test_cutlass_moe.py +3 -3
- sglang/test/test_fp4_moe.py +248 -0
- sglang/test/test_utils.py +82 -7
- sglang/utils.py +9 -0
- sglang/version.py +1 -1
- {sglang-0.4.6.post5.dist-info → sglang-0.4.7.post1.dist-info}/METADATA +17 -14
- {sglang-0.4.6.post5.dist-info → sglang-0.4.7.post1.dist-info}/RECORD +359 -321
- {sglang-0.4.6.post5.dist-info → sglang-0.4.7.post1.dist-info}/WHEEL +1 -1
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=1,N=14336,device_name=NVIDIA_A100-SXM4-80GB,dtype=int8_w8a16.json → triton_3_1_0/E=1,N=14336,device_name=NVIDIA_A100-SXM4-80GB,dtype=int8_w8a16.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=1,N=14336,device_name=NVIDIA_A100-SXM4-80GB.json → triton_3_1_0/E=1,N=14336,device_name=NVIDIA_A100-SXM4-80GB.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=1,N=1792,device_name=NVIDIA_A100-SXM4-80GB,dtype=int8_w8a16.json → triton_3_1_0/E=1,N=1792,device_name=NVIDIA_A100-SXM4-80GB,dtype=int8_w8a16.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=1,N=1792,device_name=NVIDIA_A100-SXM4-80GB.json → triton_3_1_0/E=1,N=1792,device_name=NVIDIA_A100-SXM4-80GB.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=1,N=3072,device_name=NVIDIA_A100-SXM4-80GB,dtype=int8_w8a16.json → triton_3_1_0/E=1,N=3072,device_name=NVIDIA_A100-SXM4-80GB,dtype=int8_w8a16.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=1,N=3072,device_name=NVIDIA_H100_80GB_HBM3,dtype=int8_w8a16.json → triton_3_1_0/E=1,N=3072,device_name=NVIDIA_H100_80GB_HBM3,dtype=int8_w8a16.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=1,N=3072,device_name=NVIDIA_H100_80GB_HBM3.json → triton_3_1_0/E=1,N=3072,device_name=NVIDIA_H100_80GB_HBM3.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=1,N=3584,device_name=NVIDIA_A100-SXM4-80GB,dtype=int8_w8a16.json → triton_3_1_0/E=1,N=3584,device_name=NVIDIA_A100-SXM4-80GB,dtype=int8_w8a16.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=1,N=3584,device_name=NVIDIA_A100-SXM4-80GB.json → triton_3_1_0/E=1,N=3584,device_name=NVIDIA_A100-SXM4-80GB.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=1,N=7168,device_name=NVIDIA_A100-SXM4-80GB,dtype=int8_w8a16.json → triton_3_1_0/E=1,N=7168,device_name=NVIDIA_A100-SXM4-80GB,dtype=int8_w8a16.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=1,N=7168,device_name=NVIDIA_A100-SXM4-80GB.json → triton_3_1_0/E=1,N=7168,device_name=NVIDIA_A100-SXM4-80GB.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=144,N=512,device_name=NVIDIA_H100_80GB_HBM3.json → triton_3_1_0/E=144,N=512,device_name=NVIDIA_H100_80GB_HBM3.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=16,N=1024,device_name=NVIDIA_H100_80GB_HBM3.json → triton_3_1_0/E=16,N=1024,device_name=NVIDIA_H100_80GB_HBM3.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=16,N=1024,device_name=NVIDIA_H200.json → triton_3_1_0/E=16,N=1024,device_name=NVIDIA_H200.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=16,N=1344,device_name=NVIDIA_A100-SXM4-40GB.json → triton_3_1_0/E=16,N=1344,device_name=NVIDIA_A100-SXM4-40GB.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=16,N=1344,device_name=NVIDIA_A100-SXM4-80GB.json → triton_3_1_0/E=16,N=1344,device_name=NVIDIA_A100-SXM4-80GB.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=16,N=1344,device_name=NVIDIA_H100_80GB_HBM3.json → triton_3_1_0/E=16,N=1344,device_name=NVIDIA_H100_80GB_HBM3.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=16,N=14336,device_name=NVIDIA_A100-SXM4-80GB,dtype=int8_w8a16.json → triton_3_1_0/E=16,N=14336,device_name=NVIDIA_A100-SXM4-80GB,dtype=int8_w8a16.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=16,N=14336,device_name=NVIDIA_A100-SXM4-80GB.json → triton_3_1_0/E=16,N=14336,device_name=NVIDIA_A100-SXM4-80GB.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=16,N=1792,device_name=NVIDIA_A100-SXM4-80GB,dtype=int8_w8a16.json → triton_3_1_0/E=16,N=1792,device_name=NVIDIA_A100-SXM4-80GB,dtype=int8_w8a16.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=16,N=1792,device_name=NVIDIA_A100-SXM4-80GB.json → triton_3_1_0/E=16,N=1792,device_name=NVIDIA_A100-SXM4-80GB.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=16,N=2048,device_name=NVIDIA_H100_80GB_HBM3.json → triton_3_1_0/E=16,N=2048,device_name=NVIDIA_H100_80GB_HBM3.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=16,N=2688,device_name=NVIDIA_A100-SXM4-80GB.json → triton_3_1_0/E=16,N=2688,device_name=NVIDIA_A100-SXM4-80GB.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=16,N=2688,device_name=NVIDIA_H100_80GB_HBM3.json → triton_3_1_0/E=16,N=2688,device_name=NVIDIA_H100_80GB_HBM3.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=16,N=3072,device_name=NVIDIA_A100-SXM4-80GB,dtype=int8_w8a16.json → triton_3_1_0/E=16,N=3072,device_name=NVIDIA_A100-SXM4-80GB,dtype=int8_w8a16.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=16,N=3072,device_name=NVIDIA_H100_80GB_HBM3,dtype=int8_w8a16.json → triton_3_1_0/E=16,N=3072,device_name=NVIDIA_H100_80GB_HBM3,dtype=int8_w8a16.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=16,N=3200,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8.json → triton_3_1_0/E=16,N=3200,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=16,N=3584,device_name=NVIDIA_A100-SXM4-80GB,dtype=int8_w8a16.json → triton_3_1_0/E=16,N=3584,device_name=NVIDIA_A100-SXM4-80GB,dtype=int8_w8a16.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=16,N=3584,device_name=NVIDIA_A100-SXM4-80GB.json → triton_3_1_0/E=16,N=3584,device_name=NVIDIA_A100-SXM4-80GB.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=16,N=6400,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8.json → triton_3_1_0/E=16,N=6400,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=16,N=7168,device_name=NVIDIA_A100-SXM4-80GB,dtype=int8_w8a16.json → triton_3_1_0/E=16,N=7168,device_name=NVIDIA_A100-SXM4-80GB,dtype=int8_w8a16.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=16,N=7168,device_name=NVIDIA_A100-SXM4-80GB.json → triton_3_1_0/E=16,N=7168,device_name=NVIDIA_A100-SXM4-80GB.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=16,N=7168,device_name=NVIDIA_H100_80GB_HBM3,dtype=int8_w8a16.json → triton_3_1_0/E=16,N=7168,device_name=NVIDIA_H100_80GB_HBM3,dtype=int8_w8a16.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=16,N=800,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8.json → triton_3_1_0/E=16,N=800,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=160,N=192,device_name=NVIDIA_A800-SXM4-80GB.json → triton_3_1_0/E=160,N=192,device_name=NVIDIA_A800-SXM4-80GB.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=20,N=2048,device_name=NVIDIA_H100_80GB_HBM3.json → triton_3_1_0/E=20,N=2048,device_name=NVIDIA_H100_80GB_HBM3.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=24,N=1024,device_name=NVIDIA_H100_80GB_HBM3.json → triton_3_1_0/E=24,N=1024,device_name=NVIDIA_H100_80GB_HBM3.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=256,N=128,device_name=NVIDIA_A100-SXM4-80GB,dtype=int8_w8a8,block_shape=[128, 128].json → triton_3_1_0/E=256,N=128,device_name=NVIDIA_A100-SXM4-80GB,dtype=int8_w8a8,block_shape=[128, 128].json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=256,N=128,device_name=NVIDIA_A100-SXM4-80GB,dtype=int8_w8a8.json → triton_3_1_0/E=256,N=128,device_name=NVIDIA_A100-SXM4-80GB,dtype=int8_w8a8.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=256,N=128,device_name=NVIDIA_A800-SXM4-80GB,dtype=int8_w8a8,block_shape=[128, 128].json → triton_3_1_0/E=256,N=128,device_name=NVIDIA_A800-SXM4-80GB,dtype=int8_w8a8,block_shape=[128, 128].json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=256,N=128,device_name=NVIDIA_A800-SXM4-80GB,dtype=int8_w8a8.json → triton_3_1_0/E=256,N=128,device_name=NVIDIA_A800-SXM4-80GB,dtype=int8_w8a8.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=256,N=128,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8,block_shape=[128, 128].json → triton_3_1_0/E=256,N=128,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8,block_shape=[128, 128].json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=256,N=128,device_name=NVIDIA_H20,block_shape=[128, 128].json → triton_3_1_0/E=256,N=128,device_name=NVIDIA_H20,block_shape=[128, 128].json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=256,N=128,device_name=NVIDIA_H20,dtype=fp8_w8a8,block_shape=[128, 128].json → triton_3_1_0/E=256,N=128,device_name=NVIDIA_H20,dtype=fp8_w8a8,block_shape=[128, 128].json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=256,N=128,device_name=NVIDIA_L20Y,dtype=fp8_w8a8,block_shape=[128, 128].json → triton_3_1_0/E=256,N=128,device_name=NVIDIA_L20Y,dtype=fp8_w8a8,block_shape=[128, 128].json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=256,N=256,device_name=AMD_Instinct_MI300X,dtype=fp8_w8a8,block_shape=[128, 128].json → triton_3_1_0/E=256,N=256,device_name=AMD_Instinct_MI300X,dtype=fp8_w8a8,block_shape=[128, 128].json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=256,N=256,device_name=AMD_Instinct_MI325X,dtype=fp8_w8a8,block_shape=[128, 128].json → triton_3_1_0/E=256,N=256,device_name=AMD_Instinct_MI325X,dtype=fp8_w8a8,block_shape=[128, 128].json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=256,N=256,device_name=AMD_Radeon_Graphics,dtype=fp8_w8a8,block_shape=[128, 128].json → triton_3_1_0/E=256,N=256,device_name=AMD_Radeon_Graphics,dtype=fp8_w8a8,block_shape=[128, 128].json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=256,N=256,device_name=NVIDIA_B200,dtype=fp8_w8a8,block_shape=[128, 128].json → triton_3_1_0/E=256,N=256,device_name=NVIDIA_B200,dtype=fp8_w8a8,block_shape=[128, 128].json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=256,N=256,device_name=NVIDIA_H20,dtype=int8_w8a8,block_shape=[128, 128].json → triton_3_1_0/E=256,N=256,device_name=NVIDIA_H20,dtype=int8_w8a8,block_shape=[128, 128].json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=256,N=256,device_name=NVIDIA_H200,dtype=fp8_w8a8,block_shape=[128, 128].json → triton_3_1_0/E=256,N=256,device_name=NVIDIA_H200,dtype=fp8_w8a8,block_shape=[128, 128].json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=256,N=64,device_name=NVIDIA_A800-SXM4-80GB.json → triton_3_1_0/E=256,N=64,device_name=NVIDIA_A800-SXM4-80GB.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=256,N=64,device_name=NVIDIA_L20,dtype=int8_w8a8.json → triton_3_1_0/E=256,N=64,device_name=NVIDIA_L20,dtype=int8_w8a8.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=256,N=64,device_name=NVIDIA_L40S,dtype=int8_w8a8.json → triton_3_1_0/E=256,N=64,device_name=NVIDIA_L40S,dtype=int8_w8a8.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=64,N=1024,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8,block_shape=[128, 128].json → triton_3_1_0/E=64,N=1024,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8,block_shape=[128, 128].json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=64,N=1280,device_name=NVIDIA_A100-SXM4-80GB.json → triton_3_1_0/E=64,N=1280,device_name=NVIDIA_A100-SXM4-80GB.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=64,N=1280,device_name=NVIDIA_A800-SXM4-80GB.json → triton_3_1_0/E=64,N=1280,device_name=NVIDIA_A800-SXM4-80GB.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=64,N=1280,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8.json → triton_3_1_0/E=64,N=1280,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=64,N=1280,device_name=NVIDIA_H100_80GB_HBM3.json → triton_3_1_0/E=64,N=1280,device_name=NVIDIA_H100_80GB_HBM3.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=64,N=1280,device_name=NVIDIA_H200,dtype=fp8_w8a8.json → triton_3_1_0/E=64,N=1280,device_name=NVIDIA_H200,dtype=fp8_w8a8.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=64,N=1280,device_name=NVIDIA_H200.json → triton_3_1_0/E=64,N=1280,device_name=NVIDIA_H200.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=64,N=2560,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8.json → triton_3_1_0/E=64,N=2560,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=64,N=2560,device_name=NVIDIA_H200,dtype=fp8_w8a8.json → triton_3_1_0/E=64,N=2560,device_name=NVIDIA_H200,dtype=fp8_w8a8.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=64,N=2560,device_name=NVIDIA_H200.json → triton_3_1_0/E=64,N=2560,device_name=NVIDIA_H200.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=64,N=320,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8.json → triton_3_1_0/E=64,N=320,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=64,N=320,device_name=NVIDIA_H100_80GB_HBM3.json → triton_3_1_0/E=64,N=320,device_name=NVIDIA_H100_80GB_HBM3.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=64,N=320,device_name=NVIDIA_H200,dtype=fp8_w8a8.json → triton_3_1_0/E=64,N=320,device_name=NVIDIA_H200,dtype=fp8_w8a8.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=64,N=320,device_name=NVIDIA_H200.json → triton_3_1_0/E=64,N=320,device_name=NVIDIA_H200.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=64,N=512,device_name=NVIDIA_H200,dtype=fp8_w8a8,block_shape=[128, 128].json → triton_3_1_0/E=64,N=512,device_name=NVIDIA_H200,dtype=fp8_w8a8,block_shape=[128, 128].json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=64,N=640,device_name=NVIDIA_A100-SXM4-80GB.json → triton_3_1_0/E=64,N=640,device_name=NVIDIA_A100-SXM4-80GB.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=64,N=640,device_name=NVIDIA_A800-SXM4-80GB.json → triton_3_1_0/E=64,N=640,device_name=NVIDIA_A800-SXM4-80GB.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=64,N=640,device_name=NVIDIA_GeForce_RTX_4090,dtype=fp8_w8a8.json → triton_3_1_0/E=64,N=640,device_name=NVIDIA_GeForce_RTX_4090,dtype=fp8_w8a8.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=64,N=640,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8.json → triton_3_1_0/E=64,N=640,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=64,N=640,device_name=NVIDIA_H100_80GB_HBM3.json → triton_3_1_0/E=64,N=640,device_name=NVIDIA_H100_80GB_HBM3.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=64,N=640,device_name=NVIDIA_H200,dtype=fp8_w8a8.json → triton_3_1_0/E=64,N=640,device_name=NVIDIA_H200,dtype=fp8_w8a8.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=64,N=640,device_name=NVIDIA_H200.json → triton_3_1_0/E=64,N=640,device_name=NVIDIA_H200.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=8,N=14336,device_name=AMD_Instinct_MI300X.json → triton_3_1_0/E=8,N=14336,device_name=AMD_Instinct_MI300X.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=8,N=14336,device_name=AMD_Instinct_MI325X.json → triton_3_1_0/E=8,N=14336,device_name=AMD_Instinct_MI325X.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=8,N=14336,device_name=AMD_Radeon_Graphics.json → triton_3_1_0/E=8,N=14336,device_name=AMD_Radeon_Graphics.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=8,N=14336,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8.json → triton_3_1_0/E=8,N=14336,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=8,N=14336,device_name=NVIDIA_H200,dtype=fp8_w8a8.json → triton_3_1_0/E=8,N=14336,device_name=NVIDIA_H200,dtype=fp8_w8a8.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=8,N=14336,device_name=NVIDIA_H200.json → triton_3_1_0/E=8,N=14336,device_name=NVIDIA_H200.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=8,N=1792,device_name=AMD_Instinct_MI300X.json → triton_3_1_0/E=8,N=1792,device_name=AMD_Instinct_MI300X.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=8,N=1792,device_name=AMD_Instinct_MI325X.json → triton_3_1_0/E=8,N=1792,device_name=AMD_Instinct_MI325X.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=8,N=1792,device_name=AMD_Radeon_Graphics.json → triton_3_1_0/E=8,N=1792,device_name=AMD_Radeon_Graphics.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=8,N=1792,device_name=NVIDIA_A100-SXM4-40GB.json → triton_3_1_0/E=8,N=1792,device_name=NVIDIA_A100-SXM4-40GB.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=8,N=1792,device_name=NVIDIA_A100-SXM4-80GB.json → triton_3_1_0/E=8,N=1792,device_name=NVIDIA_A100-SXM4-80GB.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=8,N=1792,device_name=NVIDIA_H100_80GB_HBM3.json → triton_3_1_0/E=8,N=1792,device_name=NVIDIA_H100_80GB_HBM3.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=8,N=1792,device_name=NVIDIA_H200,dtype=fp8_w8a8.json → triton_3_1_0/E=8,N=1792,device_name=NVIDIA_H200,dtype=fp8_w8a8.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=8,N=1792,device_name=NVIDIA_H200.json → triton_3_1_0/E=8,N=1792,device_name=NVIDIA_H200.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=8,N=2048,device_name=NVIDIA_A100-SXM4-80GB.json → triton_3_1_0/E=8,N=2048,device_name=NVIDIA_A100-SXM4-80GB.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=8,N=2048,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8.json → triton_3_1_0/E=8,N=2048,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=8,N=2048,device_name=NVIDIA_H100_80GB_HBM3.json → triton_3_1_0/E=8,N=2048,device_name=NVIDIA_H100_80GB_HBM3.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=8,N=2048,device_name=NVIDIA_H200,dtype=fp8_w8a8.json → triton_3_1_0/E=8,N=2048,device_name=NVIDIA_H200,dtype=fp8_w8a8.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=8,N=2048,device_name=NVIDIA_H200.json → triton_3_1_0/E=8,N=2048,device_name=NVIDIA_H200.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=8,N=3584,device_name=AMD_Instinct_MI300X.json → triton_3_1_0/E=8,N=3584,device_name=AMD_Instinct_MI300X.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=8,N=3584,device_name=AMD_Instinct_MI325X.json → triton_3_1_0/E=8,N=3584,device_name=AMD_Instinct_MI325X.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=8,N=3584,device_name=AMD_Radeon_Graphics.json → triton_3_1_0/E=8,N=3584,device_name=AMD_Radeon_Graphics.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=8,N=3584,device_name=NVIDIA_A100-SXM4-40GB.json → triton_3_1_0/E=8,N=3584,device_name=NVIDIA_A100-SXM4-40GB.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=8,N=3584,device_name=NVIDIA_A100-SXM4-80GB.json → triton_3_1_0/E=8,N=3584,device_name=NVIDIA_A100-SXM4-80GB.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=8,N=3584,device_name=NVIDIA_GeForce_RTX_4090,dtype=fp8_w8a8.json → triton_3_1_0/E=8,N=3584,device_name=NVIDIA_GeForce_RTX_4090,dtype=fp8_w8a8.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=8,N=3584,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8.json → triton_3_1_0/E=8,N=3584,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=8,N=3584,device_name=NVIDIA_H100_80GB_HBM3.json → triton_3_1_0/E=8,N=3584,device_name=NVIDIA_H100_80GB_HBM3.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=8,N=3584,device_name=NVIDIA_H200,dtype=fp8_w8a8.json → triton_3_1_0/E=8,N=3584,device_name=NVIDIA_H200,dtype=fp8_w8a8.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=8,N=3584,device_name=NVIDIA_H200.json → triton_3_1_0/E=8,N=3584,device_name=NVIDIA_H200.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=8,N=3584,device_name=NVIDIA_L40S.json → triton_3_1_0/E=8,N=3584,device_name=NVIDIA_L40S.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=8,N=4096,device_name=AMD_Instinct_MI300X,dtype=fp8_w8a8.json → triton_3_1_0/E=8,N=4096,device_name=AMD_Instinct_MI300X,dtype=fp8_w8a8.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=8,N=4096,device_name=AMD_Instinct_MI325X,dtype=fp8_w8a8.json → triton_3_1_0/E=8,N=4096,device_name=AMD_Instinct_MI325X,dtype=fp8_w8a8.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=8,N=4096,device_name=AMD_Radeon_Graphics,dtype=fp8_w8a8.json → triton_3_1_0/E=8,N=4096,device_name=AMD_Radeon_Graphics,dtype=fp8_w8a8.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=8,N=4096,device_name=NVIDIA_A100-SXM4-80GB.json → triton_3_1_0/E=8,N=4096,device_name=NVIDIA_A100-SXM4-80GB.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=8,N=4096,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8.json → triton_3_1_0/E=8,N=4096,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=8,N=4096,device_name=NVIDIA_H100_80GB_HBM3.json → triton_3_1_0/E=8,N=4096,device_name=NVIDIA_H100_80GB_HBM3.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=8,N=4096,device_name=NVIDIA_H200,dtype=fp8_w8a8.json → triton_3_1_0/E=8,N=4096,device_name=NVIDIA_H200,dtype=fp8_w8a8.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=8,N=4096,device_name=NVIDIA_H200.json → triton_3_1_0/E=8,N=4096,device_name=NVIDIA_H200.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=8,N=7168,device_name=AMD_Instinct_MI300X.json → triton_3_1_0/E=8,N=7168,device_name=AMD_Instinct_MI300X.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=8,N=7168,device_name=AMD_Instinct_MI325X.json → triton_3_1_0/E=8,N=7168,device_name=AMD_Instinct_MI325X.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=8,N=7168,device_name=AMD_Radeon_Graphics.json → triton_3_1_0/E=8,N=7168,device_name=AMD_Radeon_Graphics.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=8,N=7168,device_name=NVIDIA_A100-SXM4-80GB.json → triton_3_1_0/E=8,N=7168,device_name=NVIDIA_A100-SXM4-80GB.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=8,N=7168,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8.json → triton_3_1_0/E=8,N=7168,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=8,N=7168,device_name=NVIDIA_H100_80GB_HBM3.json → triton_3_1_0/E=8,N=7168,device_name=NVIDIA_H100_80GB_HBM3.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=8,N=7168,device_name=NVIDIA_H200,dtype=fp8_w8a8.json → triton_3_1_0/E=8,N=7168,device_name=NVIDIA_H200,dtype=fp8_w8a8.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=8,N=7168,device_name=NVIDIA_H200.json → triton_3_1_0/E=8,N=7168,device_name=NVIDIA_H200.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=8,N=8192,device_name=AMD_Instinct_MI300X,dtype=fp8_w8a8.json → triton_3_1_0/E=8,N=8192,device_name=AMD_Instinct_MI300X,dtype=fp8_w8a8.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=8,N=8192,device_name=AMD_Instinct_MI325X,dtype=fp8_w8a8.json → triton_3_1_0/E=8,N=8192,device_name=AMD_Instinct_MI325X,dtype=fp8_w8a8.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=8,N=8192,device_name=AMD_Radeon_Graphics,dtype=fp8_w8a8.json → triton_3_1_0/E=8,N=8192,device_name=AMD_Radeon_Graphics,dtype=fp8_w8a8.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=8,N=8192,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8.json → triton_3_1_0/E=8,N=8192,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=8,N=8192,device_name=NVIDIA_H200,dtype=fp8_w8a8.json → triton_3_1_0/E=8,N=8192,device_name=NVIDIA_H200,dtype=fp8_w8a8.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=128,N=192,device_name=NVIDIA_A800-SXM4-80GB.json → triton_3_2_0/E=128,N=192,device_name=NVIDIA_A800-SXM4-80GB.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=128,N=192,device_name=NVIDIA_H100_80GB_HBM3.json → triton_3_2_0/E=128,N=192,device_name=NVIDIA_H100_80GB_HBM3.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=128,N=192,device_name=NVIDIA_H20.json → triton_3_2_0/E=128,N=192,device_name=NVIDIA_H20.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=128,N=192,device_name=NVIDIA_H200.json → triton_3_2_0/E=128,N=192,device_name=NVIDIA_H200.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=128,N=384,device_name=NVIDIA_H100_80GB_HBM3.json → triton_3_2_0/E=128,N=384,device_name=NVIDIA_H100_80GB_HBM3.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=128,N=384,device_name=NVIDIA_H20,dtype=fp8_w8a8,block_shape=[128, 128].json → triton_3_2_0/E=128,N=384,device_name=NVIDIA_H20,dtype=fp8_w8a8,block_shape=[128, 128].json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=128,N=384,device_name=NVIDIA_H20.json → triton_3_2_0/E=128,N=384,device_name=NVIDIA_H20.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=128,N=384,device_name=NVIDIA_H200,dtype=fp8_w8a8,block_shape=[128, 128].json → triton_3_2_0/E=128,N=384,device_name=NVIDIA_H200,dtype=fp8_w8a8,block_shape=[128, 128].json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=128,N=384,device_name=NVIDIA_H200.json → triton_3_2_0/E=128,N=384,device_name=NVIDIA_H200.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=128,N=512,device_name=NVIDIA_H100_80GB_HBM3.json → triton_3_2_0/E=128,N=512,device_name=NVIDIA_H100_80GB_HBM3.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=128,N=768,device_name=NVIDIA_A800-SXM4-80GB.json → triton_3_2_0/E=128,N=768,device_name=NVIDIA_A800-SXM4-80GB.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=128,N=768,device_name=NVIDIA_H100_80GB_HBM3.json → triton_3_2_0/E=128,N=768,device_name=NVIDIA_H100_80GB_HBM3.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=128,N=768,device_name=NVIDIA_H20,dtype=fp8_w8a8,block_shape=[128, 128].json → triton_3_2_0/E=128,N=768,device_name=NVIDIA_H20,dtype=fp8_w8a8,block_shape=[128, 128].json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=128,N=768,device_name=NVIDIA_H20.json → triton_3_2_0/E=128,N=768,device_name=NVIDIA_H20.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=128,N=768,device_name=NVIDIA_H200,dtype=fp8_w8a8,block_shape=[128, 128].json → triton_3_2_0/E=128,N=768,device_name=NVIDIA_H200,dtype=fp8_w8a8,block_shape=[128, 128].json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=128,N=768,device_name=NVIDIA_H200.json → triton_3_2_0/E=128,N=768,device_name=NVIDIA_H200.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=128,N=96,device_name=NVIDIA_H20.json → triton_3_2_0/E=128,N=96,device_name=NVIDIA_H20.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=264,N=128,device_name=NVIDIA_A800-SXM4-80GB,dtype=int8_w8a8.json → triton_3_2_0/E=264,N=128,device_name=NVIDIA_A800-SXM4-80GB,dtype=int8_w8a8.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=264,N=256,device_name=NVIDIA_B200,dtype=fp8_w8a8,block_shape=[128, 128].json → triton_3_2_0/E=264,N=256,device_name=NVIDIA_B200,dtype=fp8_w8a8,block_shape=[128, 128].json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=264,N=256,device_name=NVIDIA_H20,dtype=fp8_w8a8,block_shape=[128, 128].json → triton_3_2_0/E=264,N=256,device_name=NVIDIA_H20,dtype=fp8_w8a8,block_shape=[128, 128].json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=264,N=256,device_name=NVIDIA_H200,dtype=fp8_w8a8,block_shape=[128, 128].json → triton_3_2_0/E=264,N=256,device_name=NVIDIA_H200,dtype=fp8_w8a8,block_shape=[128, 128].json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=272,N=128,device_name=NVIDIA_A800-SXM4-80GB,dtype=int8_w8a8.json → triton_3_2_0/E=272,N=128,device_name=NVIDIA_A800-SXM4-80GB,dtype=int8_w8a8.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=272,N=128,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8,block_shape=[128, 128].json → triton_3_2_0/E=272,N=128,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8,block_shape=[128, 128].json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=272,N=128,device_name=NVIDIA_H20,dtype=fp8_w8a8,block_shape=[128, 128].json → triton_3_2_0/E=272,N=128,device_name=NVIDIA_H20,dtype=fp8_w8a8,block_shape=[128, 128].json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=272,N=64,device_name=NVIDIA_A800-SXM4-80GB.json → triton_3_2_0/E=272,N=64,device_name=NVIDIA_A800-SXM4-80GB.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=288,N=64,device_name=NVIDIA_A800-SXM4-80GB.json → triton_3_2_0/E=288,N=64,device_name=NVIDIA_A800-SXM4-80GB.json} +0 -0
- {sglang-0.4.6.post5.dist-info → sglang-0.4.7.post1.dist-info}/licenses/LICENSE +0 -0
- {sglang-0.4.6.post5.dist-info → sglang-0.4.7.post1.dist-info}/top_level.txt +0 -0
sglang/srt/openai_api/utils.py
ADDED
```diff
@@ -0,0 +1,172 @@
+"""
+Utility functions for OpenAI API adapter.
+"""
+
+import logging
+from typing import Dict, List
+
+import jinja2.nodes
+import transformers.utils.chat_template_utils as hf_chat_utils
+
+logger = logging.getLogger(__name__)
+
+# ============================================================================
+# JINJA TEMPLATE CONTENT FORMAT DETECTION
+# ============================================================================
+#
+# This adapts vLLM's approach for detecting chat template content format:
+# https://github.com/vllm-project/vllm/blob/02f0c7b220422792f5e53de2a7d51d2d3ff2df28/vllm/entrypoints/chat_utils.py#L296-L313
+# - Analyzes Jinja template AST to detect content iteration patterns
+# - 'openai' format: templates with {%- for content in message['content'] -%} loops
+# - 'string' format: templates that expect simple string content
+# - Processes content accordingly to match template expectations
+
+
+def _is_var_access(node: jinja2.nodes.Node, varname: str) -> bool:
+    """Check if node is a variable access like {{ varname }}"""
+    if isinstance(node, jinja2.nodes.Name):
+        return node.ctx == "load" and node.name == varname
+    return False
+
+
+def _is_attr_access(node: jinja2.nodes.Node, varname: str, key: str) -> bool:
+    """Check if node is an attribute access like {{ varname['key'] }} or {{ varname.key }}"""
+    if isinstance(node, jinja2.nodes.Getitem):
+        return (
+            _is_var_access(node.node, varname)
+            and isinstance(node.arg, jinja2.nodes.Const)
+            and node.arg.value == key
+        )
+
+    if isinstance(node, jinja2.nodes.Getattr):
+        return _is_var_access(node.node, varname) and node.attr == key
+
+    return False
+
+
+def _is_var_or_elems_access(
+    node: jinja2.nodes.Node,
+    varname: str,
+    key: str = None,
+) -> bool:
+    """Check if node accesses varname or varname[key] with filters/tests"""
+    if isinstance(node, jinja2.nodes.Filter):
+        return node.node is not None and _is_var_or_elems_access(
+            node.node, varname, key
+        )
+    if isinstance(node, jinja2.nodes.Test):
+        return _is_var_or_elems_access(node.node, varname, key)
+
+    if isinstance(node, jinja2.nodes.Getitem) and isinstance(
+        node.arg, jinja2.nodes.Slice
+    ):
+        return _is_var_or_elems_access(node.node, varname, key)
+
+    return _is_attr_access(node, varname, key) if key else _is_var_access(node, varname)
+
+
+def _try_extract_ast(chat_template: str):
+    """Try to parse the Jinja template into an AST"""
+    try:
+        jinja_compiled = hf_chat_utils._compile_jinja_template(chat_template)
+        return jinja_compiled.environment.parse(chat_template)
+    except Exception as e:
+        logger.debug(f"Error when compiling Jinja template: {e}")
+        return None
+
+
+def detect_template_content_format(chat_template: str) -> str:
+    """
+    Detect whether a chat template expects 'string' or 'openai' content format.
+
+    - 'string': content is a simple string (like DeepSeek templates)
+    - 'openai': content is a list of structured dicts (like Llama4 templates)
+
+    Detection logic:
+    - If template has loops like {%- for content in message['content'] -%} → 'openai'
+    - Otherwise → 'string'
+    """
+    jinja_ast = _try_extract_ast(chat_template)
+    if jinja_ast is None:
+        return "string"
+
+    try:
+        # Look for patterns like: {%- for content in message['content'] -%}
+        for loop_ast in jinja_ast.find_all(jinja2.nodes.For):
+            loop_iter = loop_ast.iter
+
+            # Check if iterating over message['content'] or similar
+            if _is_var_or_elems_access(loop_iter, "message", "content"):
+                return "openai"  # Found content iteration → openai format
+
+        return "string"  # No content loops found → string format
+    except Exception as e:
+        logger.debug(f"Error when parsing AST of Jinja template: {e}")
+        return "string"
+
+
+def process_content_for_template_format(
+    msg_dict: dict,
+    content_format: str,
+    image_data: list,
+    audio_data: list,
+    modalities: list,
+) -> dict:
+    """
+    Process message content based on detected template format.
+
+    Args:
+        msg_dict: Message dictionary with content
+        content_format: 'string' or 'openai' (detected via AST analysis)
+        image_data: List to append extracted image URLs
+        audio_data: List to append extracted audio URLs
+        modalities: List to append modalities
+
+    Returns:
+        Processed message dictionary
+    """
+    if not isinstance(msg_dict.get("content"), list):
+        # Already a string or None, no processing needed
+        return {k: v for k, v in msg_dict.items() if v is not None}
+
+    if content_format == "openai":
+        # OpenAI format: preserve structured content list, normalize types
+        processed_content_parts = []
+        for chunk in msg_dict["content"]:
+            if isinstance(chunk, dict):
+                chunk_type = chunk.get("type")
+
+                if chunk_type == "image_url":
+                    image_data.append(chunk["image_url"]["url"])
+                    if chunk.get("modalities"):
+                        modalities.append(chunk.get("modalities"))
+                    # Normalize to simple 'image' type for template compatibility
+                    processed_content_parts.append({"type": "image"})
+                elif chunk_type == "audio_url":
+                    audio_data.append(chunk["audio_url"]["url"])
+                    # Normalize to simple 'audio' type
+                    processed_content_parts.append({"type": "audio"})
+                else:
+                    # Keep other content as-is (text, etc.)
+                    processed_content_parts.append(chunk)
+
+        new_msg = {
+            k: v for k, v in msg_dict.items() if v is not None and k != "content"
+        }
+        new_msg["content"] = processed_content_parts
+        return new_msg
+
+    else:  # content_format == "string"
+        # String format: flatten to text only (for templates like DeepSeek)
+        text_parts = []
+        for chunk in msg_dict["content"]:
+            if isinstance(chunk, dict) and chunk.get("type") == "text":
+                text_parts.append(chunk["text"])
+            # Note: For string format, we ignore images/audio since the template
+            # doesn't expect structured content - multimodal placeholders would
+            # need to be inserted differently
+
+        new_msg = msg_dict.copy()
+        new_msg["content"] = " ".join(text_parts) if text_parts else ""
+        new_msg = {k: v for k, v in new_msg.items() if v is not None}
+        return new_msg
```
sglang/srt/operations.py
CHANGED
```diff
@@ -12,7 +12,7 @@ if _ENABLE_PROFILE:
 
 
 def execute_operations(inputs, operations):
-    stages = _convert_operations_to_stages(
+    stages = _convert_operations_to_stages(operations)
     executor = _StageExecutor("primary", stages, inputs=inputs)
     for _ in range(executor.num_stages):
         executor.next()
@@ -20,6 +20,37 @@ def execute_operations(inputs, operations):
     return executor.output
 
 
+def execute_overlapped_operations(
+    inputs_arr: Sequence,
+    operations_arr: Sequence,
+    delta_stages: Sequence[int],
+) -> Sequence:
+    # Make it explicit for clarity; if we need multi-batch overlap, this can be generalized
+    inputs_a, inputs_b = inputs_arr
+    operations_a, operations_b = operations_arr
+    delta_stage_a, delta_stage_b = delta_stages
+    assert delta_stage_a == 0
+    delta_stage = delta_stage_b
+
+    stages_a = _convert_operations_to_stages(operations_a)
+    stages_b = _convert_operations_to_stages(operations_b)
+    executor_a = _StageExecutor("a", stages_a, inputs=inputs_a)
+    executor_b = _StageExecutor("b", stages_b, inputs=inputs_b)
+
+    for _ in range(delta_stage):
+        executor_a.next()
+
+    for _ in range(executor_a.num_stages - delta_stage):
+        executor_a.next()
+        executor_b.next()
+
+    for _ in range(delta_stage):
+        executor_b.next()
+
+    assert executor_a.done and executor_b.done
+    return [executor_a.output, executor_b.output]
+
+
 class YieldOperation:
     pass
 
@@ -109,6 +140,9 @@ class _StateDict:
         for k, v in values.items():
             setattr(self, k, v)
 
+    def get(self, item):
+        return self._data.get(item)
+
     def clear(self, expect_keys: Sequence[str]):
         if set(self._data.keys()) != set(expect_keys):
             raise Exception(
@@ -119,6 +153,7 @@ class _StateDict:
 
 
 def _convert_operations_to_stages(operations: List[Operation]) -> List[Stage]:
+    operations = _decorate_operations(operations)
     operation_chunks = list(
         _chunk_by_separator(operations, lambda op: isinstance(op, YieldOperation))
    )
@@ -140,7 +175,7 @@ def _chunk_by_separator(
         yield pending_items
 
 
-def
+def _decorate_operations(operations: List[Operation], debug_name_prefix: str = ""):
     return [_decorate_operation(op, debug_name_prefix) for op in operations]
 
 
```
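`execute_overlapped_operations` is the scheduling core of the new two-batch-overlap (TBO) path: two micro-batches walk their stage lists, with batch "b" lagging batch "a" by `delta_stages`, so compute of one batch can hide communication of the other. The toy below mirrors only that warm-up / steady-state / drain loop; the string-producing stages are stand-ins for sglang's real `Operation` and `_StageExecutor` machinery.

```python
# Hedged illustration of the overlapped schedule above: while "a" runs stage i,
# "b" runs stage i - delta_stage. Stages here are plain callables, not sglang ops.
def run_overlapped(stages_a, stages_b, delta_stage):
    assert len(stages_a) == len(stages_b)
    trace = []

    it_a, it_b = iter(stages_a), iter(stages_b)
    for _ in range(delta_stage):                  # warm-up: only "a" advances
        trace.append(("a", next(it_a)()))
    for _ in range(len(stages_a) - delta_stage):  # steady state: both advance
        trace.append(("a", next(it_a)()))
        trace.append(("b", next(it_b)()))
    for _ in range(delta_stage):                  # drain: only "b" advances
        trace.append(("b", next(it_b)()))
    return trace


stages = [lambda i=i: f"stage{i}" for i in range(4)]
print(run_overlapped(stages, stages, delta_stage=2))
```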
sglang/srt/operations_strategy.py
CHANGED
```diff
@@ -1,31 +1,207 @@
+from dataclasses import dataclass
+from typing import List, Optional
+
 import torch
 
+from sglang.srt import operations
+from sglang.srt.layers.moe.ep_moe.token_dispatcher import DeepEPConfig
+from sglang.srt.model_executor.forward_batch_info import ForwardMode
+from sglang.srt.operations import Operation
+
+
+@dataclass
+class OperationsStrategy:
+    operations: List[Operation]
+    deep_gemm_num_sms: Optional[int] = None
+    tbo_delta_stages: Optional[int] = None
+
+    @classmethod
+    def concat(cls, items: List["OperationsStrategy"]) -> "OperationsStrategy":
+        return OperationsStrategy(
+            operations=[x for item in items for x in item.operations],
+            deep_gemm_num_sms=_assert_all_same(
+                [item.deep_gemm_num_sms for item in items]
+            ),
+            tbo_delta_stages=_assert_all_same(
+                [item.tbo_delta_stages for item in items]
+            ),
+        )
+
+    @staticmethod
+    def init_new_tbo(
+        layers: torch.nn.ModuleList,
+        forward_mode: ForwardMode,
+    ) -> "OperationsStrategy":
+        layer_name = layers[0].__class__.__name__
+        if layer_name == "DeepseekV2DecoderLayer":
+            return OperationsStrategy.concat(
+                [
+                    _compute_moe_deepseek_layer_operations_strategy_tbo(
+                        layer, forward_mode
+                    )
+                    for layer in layers
+                ]
+            )
+        elif layer_name == "Qwen3MoeDecoderLayer":
+            return OperationsStrategy.concat(
+                [
+                    _compute_moe_qwen3_layer_operations_strategy_tbo(
+                        layer, forward_mode
+                    )
+                    for layer in layers
+                ]
+            )
+        else:
+            raise NotImplementedError
+
+
+def _assert_all_same(items: List):
+    assert all(item == items[0] for item in items)
+    return items[0]
+
+
+# -------------------------------- Strategy for DeepSeek ---------------------------------------
+
+
+# TODO can refactor to make it more fancy if we have more complex strategies
+def _compute_moe_deepseek_layer_operations_strategy_tbo(
+    layer: torch.nn.Module,
+    forward_mode: ForwardMode,
+) -> OperationsStrategy:
+    assert layer.is_layer_sparse, "dense layer TBO not yet implemented"
+    if forward_mode == ForwardMode.EXTEND:
+        return _compute_moe_deepseek_blog_prefill(layer)
+    elif forward_mode == ForwardMode.DECODE:
+        return _compute_moe_deepseek_blog_decode(layer)
+    else:
+        raise NotImplementedError(f"Unsupported {forward_mode=}")
+
+
+def _compute_moe_deepseek_blog_prefill(layer):
+    device_properties = torch.cuda.get_device_properties(device="cuda")
+    total_num_sms = device_properties.multi_processor_count
+    deep_gemm_num_sms = total_num_sms - DeepEPConfig.get_instance().num_sms
+
+    return OperationsStrategy(
+        deep_gemm_num_sms=deep_gemm_num_sms,
+        tbo_delta_stages=0,
+        operations=[
+            layer.op_comm_prepare_attn,
+            layer.self_attn.op_prepare,
+            layer.self_attn.op_core,
+            layer.op_comm_prepare_mlp,
+            layer.mlp.op_gate,
+            layer.mlp.op_select_experts,
+            layer.mlp.op_dispatch_a,
+            operations.YieldOperation(),
+            layer.mlp.op_dispatch_b,
+            layer.mlp.op_experts,
+            layer.mlp.op_combine_a,
+            operations.YieldOperation(),
+            layer.mlp.op_shared_experts,
+            layer.mlp.op_combine_b,
+            layer.mlp.op_output,
+            layer.op_comm_postprocess_layer,
+        ],
+    )
+
+
+def _compute_moe_deepseek_blog_decode(layer):
+    return OperationsStrategy(
+        deep_gemm_num_sms=None,
+        tbo_delta_stages=2,
+        operations=[
+            layer.op_comm_prepare_attn,
+            layer.self_attn.op_prepare,
+            operations.YieldOperation(),
+            layer.self_attn.op_core,
+            layer.op_comm_prepare_mlp,
+            layer.mlp.op_gate,
+            layer.mlp.op_select_experts,
+            operations.YieldOperation(),
+            layer.mlp.op_dispatch_a,
+            layer.mlp.op_shared_experts,
+            operations.YieldOperation(),
+            layer.mlp.op_dispatch_b,
+            layer.mlp.op_experts,
+            layer.mlp.op_combine_a,
+            operations.YieldOperation(),
+            layer.mlp.op_combine_b,
+            operations.YieldOperation(),
+            layer.mlp.op_output,
+            layer.op_comm_postprocess_layer,
+        ],
+    )
+
+
+# -------------------------------- Strategy for Qwen3 ---------------------------------------
+
 
-
+# TODO: unstable, current strategy is almost the same as DeepSeek, keep redundant code here for
+# convenience to adjust strategy
+def _compute_moe_qwen3_layer_operations_strategy_tbo(
     layer: torch.nn.Module,
-
-
-
+    forward_mode: ForwardMode,
+) -> OperationsStrategy:
+    assert layer.is_layer_sparse, "qwen3 moe only support sparse layers"
+    if forward_mode == ForwardMode.EXTEND:
+        return _compute_moe_qwen3_prefill(layer)
+    elif forward_mode == ForwardMode.DECODE:
+        return _compute_moe_qwen3_decode(layer)
+    else:
+        raise NotImplementedError(f"Unsupported {forward_mode=}")
+
+
+def _compute_moe_qwen3_prefill(layer):
+    device_properties = torch.cuda.get_device_properties(device="cuda")
+    total_num_sms = device_properties.multi_processor_count
+    deep_gemm_num_sms = total_num_sms - DeepEPConfig.get_instance().num_sms
+
+    return OperationsStrategy(
+        deep_gemm_num_sms=deep_gemm_num_sms,
+        tbo_delta_stages=0,
+        operations=[
+            layer.op_comm_prepare_attn,
+            layer.self_attn.op_prepare,
+            layer.self_attn.op_core,
+            layer.op_comm_prepare_mlp,
+            layer.mlp.op_gate,
+            layer.mlp.op_select_experts,
+            layer.mlp.op_dispatch_a,
+            operations.YieldOperation(),
+            layer.mlp.op_dispatch_b,
+            layer.mlp.op_experts,
+            layer.mlp.op_combine_a,
+            operations.YieldOperation(),
+            layer.mlp.op_combine_b,
+            layer.mlp.op_output,
+            layer.op_comm_postprocess_layer,
+        ],
+    )
+
+
+def _compute_moe_qwen3_decode(layer):
+    return OperationsStrategy(
+        deep_gemm_num_sms=None,
+        tbo_delta_stages=2,
+        operations=[
             layer.op_comm_prepare_attn,
-            layer.
+            layer.self_attn.op_prepare,
+            operations.YieldOperation(),
+            layer.self_attn.op_core,
             layer.op_comm_prepare_mlp,
-            layer.
+            layer.mlp.op_gate,
+            layer.mlp.op_select_experts,
+            operations.YieldOperation(),
+            layer.mlp.op_dispatch_a,
+            operations.YieldOperation(),
+            layer.mlp.op_dispatch_b,
+            layer.mlp.op_experts,
+            layer.mlp.op_combine_a,
+            operations.YieldOperation(),
+            layer.mlp.op_combine_b,
+            layer.mlp.op_output,
             layer.op_comm_postprocess_layer,
-
-
-
-    return [
-        layer.op_comm_prepare_attn,
-        layer.op_attn,
-        layer.op_comm_prepare_mlp,
-        layer.mlp.op_gate,
-        layer.mlp.op_shared_experts,
-        layer.mlp.op_select_experts,
-        layer.mlp.op_dispatch_a,
-        layer.mlp.op_dispatch_b,
-        layer.mlp.op_experts,
-        layer.mlp.op_combine_a,
-        layer.mlp.op_combine_b,
-        layer.mlp.op_output,
-        layer.op_comm_postprocess_layer,
-    ]
+            operations.YieldOperation(),
+        ],
+    )
```
sglang/srt/sampling/sampling_batch_info.py
CHANGED
```diff
@@ -9,10 +9,13 @@ import torch
 
 import sglang.srt.sampling.penaltylib as penaltylib
 from sglang.srt.sampling.custom_logit_processor import CustomLogitProcessor
+from sglang.srt.sampling.sampling_params import TOP_K_ALL
+from sglang.srt.utils import merge_bias_tensor
 
 if TYPE_CHECKING:
     from sglang.srt.managers.schedule_batch import ScheduleBatch
 
+
 logger = logging.getLogger(__name__)
 
 
@@ -27,6 +30,12 @@ class SamplingBatchInfo:
     # Whether all requests use greedy sampling
     is_all_greedy: bool
 
+    # Whether any requests use top_p sampling
+    need_top_p_sampling: bool
+
+    # Whether any requests use top_k sampling
+    need_top_k_sampling: bool
+
     # Whether any request needs min_p sampling
     need_min_p_sampling: bool
 
@@ -55,6 +64,9 @@ class SamplingBatchInfo:
     # Device
     device: str = "cuda"
 
+    # Handle logit bias
+    logit_bias: Optional[torch.Tensor] = None
+
     @classmethod
     def from_schedule_batch(cls, batch: ScheduleBatch, vocab_size: int):
         reqs = batch.reqs
@@ -77,6 +89,14 @@ class SamplingBatchInfo:
             [r.sampling_params.min_p for r in reqs], dtype=torch.float
         ).to(device, non_blocking=True)
 
+        logit_bias = None
+        if any(r.sampling_params.logit_bias is not None for r in reqs):
+            logit_bias = torch.zeros(len(reqs), vocab_size, device=device)
+            for i, r in enumerate(reqs):
+                if r.sampling_params.logit_bias is not None:
+                    for key, value in r.sampling_params.logit_bias.items():
+                        logit_bias[i, int(key)] = value
+
         # Check if any request has custom logit processor
         has_custom_logit_processor = (
             batch.enable_custom_logit_processor  # check the flag first.
@@ -133,6 +153,8 @@ class SamplingBatchInfo:
             top_ks=top_ks,
             min_ps=min_ps,
             is_all_greedy=all(r.sampling_params.top_k <= 1 for r in reqs),
+            need_top_p_sampling=any(r.sampling_params.top_p != 1.0 for r in reqs),
+            need_top_k_sampling=any(r.sampling_params.top_k != TOP_K_ALL for r in reqs),
             need_min_p_sampling=any(r.sampling_params.min_p > 0 for r in reqs),
             vocab_size=vocab_size,
             penalizer_orchestrator=penalizer_orchestrator,
@@ -140,6 +162,7 @@ class SamplingBatchInfo:
             custom_params=custom_params,
             custom_logit_processor=merged_custom_logit_processor,
             device=device,
+            logit_bias=logit_bias,
         )
         return ret
 
@@ -167,7 +190,7 @@ class SamplingBatchInfo:
 
         # Apply the mask
         for i, grammar in enumerate(self.grammars):
-            if grammar and not grammar.finished:
+            if grammar and not grammar.finished and not grammar.is_terminated():
                 grammar.fill_vocab_mask(self.vocab_mask, i)
 
         # Move the mask to the device if needed
@@ -196,6 +219,9 @@ class SamplingBatchInfo:
         if self.vocab_mask is not None:
             self.apply_mask_func(logits=logits, vocab_mask=self.vocab_mask)
 
+        if self.logit_bias is not None:
+            logits.add_(self.logit_bias)
+
     def filter_batch(self, keep_indices: List[int], keep_indices_device: torch.Tensor):
         self.penalizer_orchestrator.filter(keep_indices_device)
 
@@ -211,6 +237,9 @@ class SamplingBatchInfo:
             value = getattr(self, item, None)
             setattr(self, item, value[keep_indices_device])
 
+        if self.logit_bias is not None:
+            self.logit_bias = self.logit_bias[keep_indices_device]
+
     def _filter_batch_custom_logit_processor(
         self, keep_indices: List[int], keep_indices_device: torch.Tensor
     ):
@@ -308,4 +337,11 @@ class SamplingBatchInfo:
             setattr(self, item, torch.cat([self_val, other_val]))
 
         self.is_all_greedy &= other.is_all_greedy
+        self.need_top_p_sampling |= other.need_top_p_sampling
+        self.need_top_k_sampling |= other.need_top_k_sampling
         self.need_min_p_sampling |= other.need_min_p_sampling
+
+        # Merge logit bias
+        self.logit_bias = merge_bias_tensor(
+            self.logit_bias, other.logit_bias, len(self), len(other), self.device, 0.0
+        )
```
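The `logit_bias` handling added above boils down to building one dense `[batch, vocab]` tensor from the per-request `{token_id: bias}` dicts and adding it to the logits in place before sampling. A small self-contained sketch of that path (vocab size and bias values are made up):

```python
# Hedged sketch of the logit_bias path: dict keys are token-id strings, values are biases.
import torch

vocab_size = 8
requests_logit_bias = [{"2": 5.0, "5": -100.0}, None]  # second request has no bias

logit_bias = torch.zeros(len(requests_logit_bias), vocab_size)
for i, bias in enumerate(requests_logit_bias):
    if bias is not None:
        for key, value in bias.items():
            logit_bias[i, int(key)] = value

logits = torch.randn(len(requests_logit_bias), vocab_size)
logits.add_(logit_bias)  # token 2 boosted, token 5 effectively banned for request 0
```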
sglang/srt/sampling/sampling_params.py
CHANGED
```diff
@@ -16,6 +16,7 @@
 from typing import Any, Dict, List, Optional, Union
 
 _SAMPLING_EPS = 1e-6
+TOP_K_ALL = 1 << 30
 
 
 class SamplingParams:
@@ -51,6 +52,7 @@ class SamplingParams:
         no_stop_trim: bool = False,
         custom_params: Optional[Dict[str, Any]] = None,
         stream_interval: Optional[int] = None,
+        logit_bias: Optional[Dict[str, float]] = None,
     ) -> None:
         self.max_new_tokens = max_new_tokens
         self.stop_strs = stop
@@ -77,6 +79,7 @@ class SamplingParams:
         self.no_stop_trim = no_stop_trim
         self.custom_params = custom_params
         self.stream_interval = stream_interval
+        self.logit_bias = logit_bias
 
         # Process some special cases
         if 0 <= self.temperature < _SAMPLING_EPS:
@@ -84,7 +87,7 @@ class SamplingParams:
             self.temperature = 1.0
             self.top_k = 1
         if self.top_k == -1:
-            self.top_k =
+            self.top_k = TOP_K_ALL  # whole vocabulary
 
     def verify(self):
         if self.temperature < 0.0:
```