sglang 0.4.6.post5__py3-none-any.whl → 0.4.7__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- sglang/bench_offline_throughput.py +10 -4
- sglang/bench_one_batch_server.py +67 -11
- sglang/bench_serving.py +85 -74
- sglang/lang/backend/runtime_endpoint.py +24 -1
- sglang/profiler.py +167 -0
- sglang/srt/_custom_ops.py +34 -0
- sglang/srt/configs/internvl.py +8 -12
- sglang/srt/configs/model_config.py +27 -1
- sglang/srt/constrained/base_grammar_backend.py +5 -2
- sglang/srt/constrained/llguidance_backend.py +9 -8
- sglang/srt/constrained/outlines_backend.py +5 -4
- sglang/srt/constrained/xgrammar_backend.py +18 -18
- sglang/srt/conversation.py +46 -8
- sglang/srt/custom_op.py +38 -3
- sglang/srt/debug_utils.py +74 -0
- sglang/srt/disaggregation/common/__init__.py +1 -0
- sglang/srt/disaggregation/common/conn.py +407 -0
- sglang/srt/disaggregation/decode.py +67 -3
- sglang/srt/disaggregation/fake/conn.py +1 -0
- sglang/srt/disaggregation/kv_events.py +60 -5
- sglang/srt/disaggregation/launch_lb.py +140 -0
- sglang/srt/disaggregation/mini_lb.py +29 -48
- sglang/srt/disaggregation/mooncake/conn.py +432 -140
- sglang/srt/disaggregation/mooncake/transfer_engine.py +32 -16
- sglang/srt/disaggregation/nixl/conn.py +124 -432
- sglang/srt/disaggregation/prefill.py +2 -0
- sglang/srt/disaggregation/utils.py +38 -1
- sglang/srt/distributed/device_communicators/pymscclpp.py +315 -0
- sglang/srt/distributed/parallel_state.py +52 -5
- sglang/srt/entrypoints/EngineBase.py +6 -0
- sglang/srt/entrypoints/engine.py +102 -5
- sglang/srt/entrypoints/http_server.py +15 -2
- sglang/srt/function_call/base_format_detector.py +138 -86
- sglang/srt/function_call/deepseekv3_detector.py +54 -6
- sglang/srt/function_call/ebnf_composer.py +33 -19
- sglang/srt/function_call/function_call_parser.py +27 -0
- sglang/srt/function_call/llama32_detector.py +33 -14
- sglang/srt/function_call/mistral_detector.py +73 -26
- sglang/srt/function_call/pythonic_detector.py +86 -20
- sglang/srt/function_call/qwen25_detector.py +64 -10
- sglang/srt/function_call/utils.py +17 -0
- sglang/srt/hf_transformers_utils.py +4 -0
- sglang/srt/layers/attention/aiter_backend.py +488 -123
- sglang/srt/layers/attention/base_attn_backend.py +4 -0
- sglang/srt/layers/attention/cutlass_mla_backend.py +2 -19
- sglang/srt/layers/attention/flashattention_backend.py +103 -18
- sglang/srt/layers/attention/flashinfer_backend.py +45 -1
- sglang/srt/layers/attention/flashinfer_mla_backend.py +37 -1
- sglang/srt/layers/attention/intel_amx_backend.py +128 -0
- sglang/srt/layers/attention/tbo_backend.py +232 -0
- sglang/srt/layers/attention/torch_native_backend.py +3 -0
- sglang/srt/layers/attention/triton_backend.py +244 -5
- sglang/srt/layers/attention/triton_ops/extend_attention.py +12 -4
- sglang/srt/layers/communicator.py +260 -194
- sglang/srt/layers/dp_attention.py +6 -5
- sglang/srt/layers/layernorm.py +30 -19
- sglang/srt/layers/moe/cutlass_moe.py +170 -7
- sglang/srt/layers/moe/cutlass_moe_params.py +169 -0
- sglang/srt/layers/moe/ep_moe/kernels.py +27 -6
- sglang/srt/layers/moe/ep_moe/layer.py +94 -40
- sglang/srt/layers/moe/ep_moe/token_dispatcher.py +13 -8
- sglang/srt/layers/moe/fused_moe_native.py +4 -0
- sglang/srt/layers/moe/fused_moe_triton/configs/triton_3_2_0/E=257,N=128,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8,block_shape=[128, 128].json +146 -0
- sglang/srt/layers/moe/fused_moe_triton/configs/triton_3_2_0/E=257,N=128,device_name=NVIDIA_H20,dtype=fp8_w8a8,block_shape=[128, 128].json +146 -0
- sglang/srt/layers/moe/fused_moe_triton/configs/triton_3_2_0/E=257,N=256,device_name=NVIDIA_B200,dtype=fp8_w8a8,block_shape=[128, 128].json +146 -0
- sglang/srt/layers/moe/fused_moe_triton/configs/triton_3_2_0/E=257,N=256,device_name=NVIDIA_H20,dtype=fp8_w8a8,block_shape=[128, 128].json +146 -0
- sglang/srt/layers/moe/fused_moe_triton/configs/triton_3_2_0/E=257,N=256,device_name=NVIDIA_H200,dtype=fp8_w8a8,block_shape=[128, 128].json +146 -0
- sglang/srt/layers/moe/fused_moe_triton/configs/triton_3_2_0/E=8,N=7168,device_name=NVIDIA_H100_80GB_HBM3.json +146 -0
- sglang/srt/layers/moe/fused_moe_triton/configs/triton_3_3_1/E=257,N=256,device_name=NVIDIA_B200,dtype=fp8_w8a8,block_shape=[128, 128].json +146 -0
- sglang/srt/layers/moe/fused_moe_triton/configs/triton_3_3_1/E=8,N=7168,device_name=NVIDIA_H100_80GB_HBM3.json +146 -0
- sglang/srt/layers/moe/fused_moe_triton/fused_moe.py +220 -25
- sglang/srt/layers/moe/fused_moe_triton/layer.py +34 -4
- sglang/srt/layers/moe/topk.py +44 -18
- sglang/srt/layers/multimodal.py +3 -3
- sglang/srt/layers/quantization/__init__.py +3 -2
- sglang/srt/layers/quantization/blockwise_int8.py +3 -0
- sglang/srt/layers/quantization/compressed_tensors/compressed_tensors_moe.py +5 -0
- sglang/srt/layers/quantization/deep_gemm.py +55 -56
- sglang/srt/layers/quantization/fp8.py +28 -23
- sglang/srt/layers/quantization/fp8_kernel.py +118 -66
- sglang/srt/layers/quantization/fp8_utils.py +165 -49
- sglang/srt/layers/quantization/modelopt_quant.py +334 -7
- sglang/srt/layers/quantization/moe_wna16.py +3 -0
- sglang/srt/layers/quantization/w8a8_fp8.py +3 -0
- sglang/srt/layers/quantization/w8a8_int8.py +3 -0
- sglang/srt/layers/rotary_embedding.py +6 -12
- sglang/srt/layers/sampler.py +80 -79
- sglang/srt/layers/utils.py +6 -0
- sglang/srt/lora/layers.py +12 -15
- sglang/srt/lora/lora.py +49 -5
- sglang/srt/lora/lora_manager.py +19 -5
- sglang/srt/lora/mem_pool.py +24 -16
- sglang/srt/lora/utils.py +17 -13
- sglang/srt/managers/data_parallel_controller.py +13 -5
- sglang/srt/managers/eplb_algorithms/__init__.py +63 -0
- sglang/srt/managers/eplb_algorithms/deepseek.py +223 -0
- sglang/srt/managers/{deepseek_eplb.py → eplb_algorithms/deepseek_vec.py} +5 -7
- sglang/srt/managers/eplb_manager.py +55 -14
- sglang/srt/managers/expert_distribution.py +220 -46
- sglang/srt/managers/expert_location.py +110 -56
- sglang/srt/managers/expert_location_dispatch.py +23 -6
- sglang/srt/managers/io_struct.py +15 -4
- sglang/srt/managers/mm_utils.py +88 -38
- sglang/srt/managers/multimodal_processors/base_processor.py +188 -16
- sglang/srt/managers/multimodal_processors/gemma3.py +4 -31
- sglang/srt/managers/multimodal_processors/internvl.py +4 -0
- sglang/srt/managers/multimodal_processors/kimi_vl.py +15 -34
- sglang/srt/managers/multimodal_processors/minicpm.py +2 -1
- sglang/srt/managers/multimodal_processors/phi4mm.py +87 -0
- sglang/srt/managers/multimodal_processors/qwen_vl.py +22 -64
- sglang/srt/managers/schedule_batch.py +140 -38
- sglang/srt/managers/scheduler.py +305 -112
- sglang/srt/managers/tokenizer_manager.py +134 -17
- sglang/srt/managers/utils.py +0 -4
- sglang/srt/metrics/collector.py +9 -0
- sglang/srt/model_executor/cuda_graph_runner.py +72 -61
- sglang/srt/model_executor/expert_location_updater.py +157 -22
- sglang/srt/model_executor/forward_batch_info.py +38 -17
- sglang/srt/model_executor/model_runner.py +96 -56
- sglang/srt/model_loader/utils.py +67 -1
- sglang/srt/models/deepseek_nextn.py +1 -1
- sglang/srt/models/deepseek_v2.py +609 -234
- sglang/srt/models/gemma3_causal.py +7 -0
- sglang/srt/models/gemma3_mm.py +19 -14
- sglang/srt/models/idefics2.py +342 -0
- sglang/srt/models/kimi_vl.py +4 -4
- sglang/srt/models/llama.py +1 -1
- sglang/srt/models/minicpmo.py +2 -5
- sglang/srt/models/minicpmv.py +3 -295
- sglang/srt/models/phi4mm.py +512 -0
- sglang/srt/models/qwen2.py +38 -9
- sglang/srt/models/qwen2_5_vl.py +3 -9
- sglang/srt/models/qwen2_eagle.py +4 -1
- sglang/srt/models/qwen2_moe.py +58 -191
- sglang/srt/models/qwen2_vl.py +3 -9
- sglang/srt/models/qwen3.py +41 -10
- sglang/srt/models/qwen3_moe.py +230 -191
- sglang/srt/models/registry.py +9 -1
- sglang/srt/models/transformers.py +291 -0
- sglang/srt/openai_api/adapter.py +86 -24
- sglang/srt/openai_api/protocol.py +31 -2
- sglang/srt/openai_api/utils.py +172 -0
- sglang/srt/operations.py +37 -2
- sglang/srt/operations_strategy.py +200 -24
- sglang/srt/sampling/sampling_batch_info.py +13 -1
- sglang/srt/sampling/sampling_params.py +2 -1
- sglang/srt/server_args.py +114 -27
- sglang/srt/speculative/build_eagle_tree.py +8 -8
- sglang/srt/speculative/eagle_draft_cuda_graph_runner.py +8 -11
- sglang/srt/speculative/eagle_draft_extend_cuda_graph_runner.py +253 -0
- sglang/srt/speculative/eagle_utils.py +51 -91
- sglang/srt/speculative/eagle_worker.py +101 -21
- sglang/srt/two_batch_overlap.py +635 -0
- sglang/srt/utils.py +129 -7
- sglang/test/runners.py +16 -7
- sglang/test/send_one.py +4 -0
- sglang/test/test_cutlass_moe.py +3 -3
- sglang/test/test_fp4_moe.py +248 -0
- sglang/test/test_utils.py +79 -6
- sglang/version.py +1 -1
- {sglang-0.4.6.post5.dist-info → sglang-0.4.7.dist-info}/METADATA +14 -11
- {sglang-0.4.6.post5.dist-info → sglang-0.4.7.dist-info}/RECORD +318 -291
- {sglang-0.4.6.post5.dist-info → sglang-0.4.7.dist-info}/WHEEL +1 -1
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=1,N=14336,device_name=NVIDIA_A100-SXM4-80GB,dtype=int8_w8a16.json → triton_3_1_0/E=1,N=14336,device_name=NVIDIA_A100-SXM4-80GB,dtype=int8_w8a16.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=1,N=14336,device_name=NVIDIA_A100-SXM4-80GB.json → triton_3_1_0/E=1,N=14336,device_name=NVIDIA_A100-SXM4-80GB.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=1,N=1792,device_name=NVIDIA_A100-SXM4-80GB,dtype=int8_w8a16.json → triton_3_1_0/E=1,N=1792,device_name=NVIDIA_A100-SXM4-80GB,dtype=int8_w8a16.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=1,N=1792,device_name=NVIDIA_A100-SXM4-80GB.json → triton_3_1_0/E=1,N=1792,device_name=NVIDIA_A100-SXM4-80GB.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=1,N=3072,device_name=NVIDIA_A100-SXM4-80GB,dtype=int8_w8a16.json → triton_3_1_0/E=1,N=3072,device_name=NVIDIA_A100-SXM4-80GB,dtype=int8_w8a16.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=1,N=3072,device_name=NVIDIA_H100_80GB_HBM3,dtype=int8_w8a16.json → triton_3_1_0/E=1,N=3072,device_name=NVIDIA_H100_80GB_HBM3,dtype=int8_w8a16.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=1,N=3072,device_name=NVIDIA_H100_80GB_HBM3.json → triton_3_1_0/E=1,N=3072,device_name=NVIDIA_H100_80GB_HBM3.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=1,N=3584,device_name=NVIDIA_A100-SXM4-80GB,dtype=int8_w8a16.json → triton_3_1_0/E=1,N=3584,device_name=NVIDIA_A100-SXM4-80GB,dtype=int8_w8a16.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=1,N=3584,device_name=NVIDIA_A100-SXM4-80GB.json → triton_3_1_0/E=1,N=3584,device_name=NVIDIA_A100-SXM4-80GB.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=1,N=7168,device_name=NVIDIA_A100-SXM4-80GB,dtype=int8_w8a16.json → triton_3_1_0/E=1,N=7168,device_name=NVIDIA_A100-SXM4-80GB,dtype=int8_w8a16.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=1,N=7168,device_name=NVIDIA_A100-SXM4-80GB.json → triton_3_1_0/E=1,N=7168,device_name=NVIDIA_A100-SXM4-80GB.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=144,N=512,device_name=NVIDIA_H100_80GB_HBM3.json → triton_3_1_0/E=144,N=512,device_name=NVIDIA_H100_80GB_HBM3.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=16,N=1024,device_name=NVIDIA_H100_80GB_HBM3.json → triton_3_1_0/E=16,N=1024,device_name=NVIDIA_H100_80GB_HBM3.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=16,N=1024,device_name=NVIDIA_H200.json → triton_3_1_0/E=16,N=1024,device_name=NVIDIA_H200.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=16,N=1344,device_name=NVIDIA_A100-SXM4-40GB.json → triton_3_1_0/E=16,N=1344,device_name=NVIDIA_A100-SXM4-40GB.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=16,N=1344,device_name=NVIDIA_A100-SXM4-80GB.json → triton_3_1_0/E=16,N=1344,device_name=NVIDIA_A100-SXM4-80GB.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=16,N=1344,device_name=NVIDIA_H100_80GB_HBM3.json → triton_3_1_0/E=16,N=1344,device_name=NVIDIA_H100_80GB_HBM3.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=16,N=14336,device_name=NVIDIA_A100-SXM4-80GB,dtype=int8_w8a16.json → triton_3_1_0/E=16,N=14336,device_name=NVIDIA_A100-SXM4-80GB,dtype=int8_w8a16.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=16,N=14336,device_name=NVIDIA_A100-SXM4-80GB.json → triton_3_1_0/E=16,N=14336,device_name=NVIDIA_A100-SXM4-80GB.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=16,N=1792,device_name=NVIDIA_A100-SXM4-80GB,dtype=int8_w8a16.json → triton_3_1_0/E=16,N=1792,device_name=NVIDIA_A100-SXM4-80GB,dtype=int8_w8a16.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=16,N=1792,device_name=NVIDIA_A100-SXM4-80GB.json → triton_3_1_0/E=16,N=1792,device_name=NVIDIA_A100-SXM4-80GB.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=16,N=2048,device_name=NVIDIA_H100_80GB_HBM3.json → triton_3_1_0/E=16,N=2048,device_name=NVIDIA_H100_80GB_HBM3.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=16,N=2688,device_name=NVIDIA_A100-SXM4-80GB.json → triton_3_1_0/E=16,N=2688,device_name=NVIDIA_A100-SXM4-80GB.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=16,N=2688,device_name=NVIDIA_H100_80GB_HBM3.json → triton_3_1_0/E=16,N=2688,device_name=NVIDIA_H100_80GB_HBM3.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=16,N=3072,device_name=NVIDIA_A100-SXM4-80GB,dtype=int8_w8a16.json → triton_3_1_0/E=16,N=3072,device_name=NVIDIA_A100-SXM4-80GB,dtype=int8_w8a16.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=16,N=3072,device_name=NVIDIA_H100_80GB_HBM3,dtype=int8_w8a16.json → triton_3_1_0/E=16,N=3072,device_name=NVIDIA_H100_80GB_HBM3,dtype=int8_w8a16.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=16,N=3200,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8.json → triton_3_1_0/E=16,N=3200,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=16,N=3584,device_name=NVIDIA_A100-SXM4-80GB,dtype=int8_w8a16.json → triton_3_1_0/E=16,N=3584,device_name=NVIDIA_A100-SXM4-80GB,dtype=int8_w8a16.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=16,N=3584,device_name=NVIDIA_A100-SXM4-80GB.json → triton_3_1_0/E=16,N=3584,device_name=NVIDIA_A100-SXM4-80GB.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=16,N=6400,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8.json → triton_3_1_0/E=16,N=6400,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=16,N=7168,device_name=NVIDIA_A100-SXM4-80GB,dtype=int8_w8a16.json → triton_3_1_0/E=16,N=7168,device_name=NVIDIA_A100-SXM4-80GB,dtype=int8_w8a16.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=16,N=7168,device_name=NVIDIA_A100-SXM4-80GB.json → triton_3_1_0/E=16,N=7168,device_name=NVIDIA_A100-SXM4-80GB.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=16,N=7168,device_name=NVIDIA_H100_80GB_HBM3,dtype=int8_w8a16.json → triton_3_1_0/E=16,N=7168,device_name=NVIDIA_H100_80GB_HBM3,dtype=int8_w8a16.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=16,N=800,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8.json → triton_3_1_0/E=16,N=800,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=160,N=192,device_name=NVIDIA_A800-SXM4-80GB.json → triton_3_1_0/E=160,N=192,device_name=NVIDIA_A800-SXM4-80GB.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=20,N=2048,device_name=NVIDIA_H100_80GB_HBM3.json → triton_3_1_0/E=20,N=2048,device_name=NVIDIA_H100_80GB_HBM3.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=24,N=1024,device_name=NVIDIA_H100_80GB_HBM3.json → triton_3_1_0/E=24,N=1024,device_name=NVIDIA_H100_80GB_HBM3.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=256,N=128,device_name=NVIDIA_A100-SXM4-80GB,dtype=int8_w8a8,block_shape=[128, 128].json → triton_3_1_0/E=256,N=128,device_name=NVIDIA_A100-SXM4-80GB,dtype=int8_w8a8,block_shape=[128, 128].json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=256,N=128,device_name=NVIDIA_A100-SXM4-80GB,dtype=int8_w8a8.json → triton_3_1_0/E=256,N=128,device_name=NVIDIA_A100-SXM4-80GB,dtype=int8_w8a8.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=256,N=128,device_name=NVIDIA_A800-SXM4-80GB,dtype=int8_w8a8,block_shape=[128, 128].json → triton_3_1_0/E=256,N=128,device_name=NVIDIA_A800-SXM4-80GB,dtype=int8_w8a8,block_shape=[128, 128].json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=256,N=128,device_name=NVIDIA_A800-SXM4-80GB,dtype=int8_w8a8.json → triton_3_1_0/E=256,N=128,device_name=NVIDIA_A800-SXM4-80GB,dtype=int8_w8a8.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=256,N=128,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8,block_shape=[128, 128].json → triton_3_1_0/E=256,N=128,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8,block_shape=[128, 128].json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=256,N=128,device_name=NVIDIA_H20,block_shape=[128, 128].json → triton_3_1_0/E=256,N=128,device_name=NVIDIA_H20,block_shape=[128, 128].json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=256,N=128,device_name=NVIDIA_H20,dtype=fp8_w8a8,block_shape=[128, 128].json → triton_3_1_0/E=256,N=128,device_name=NVIDIA_H20,dtype=fp8_w8a8,block_shape=[128, 128].json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=256,N=128,device_name=NVIDIA_L20Y,dtype=fp8_w8a8,block_shape=[128, 128].json → triton_3_1_0/E=256,N=128,device_name=NVIDIA_L20Y,dtype=fp8_w8a8,block_shape=[128, 128].json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=256,N=256,device_name=AMD_Instinct_MI300X,dtype=fp8_w8a8,block_shape=[128, 128].json → triton_3_1_0/E=256,N=256,device_name=AMD_Instinct_MI300X,dtype=fp8_w8a8,block_shape=[128, 128].json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=256,N=256,device_name=AMD_Instinct_MI325X,dtype=fp8_w8a8,block_shape=[128, 128].json → triton_3_1_0/E=256,N=256,device_name=AMD_Instinct_MI325X,dtype=fp8_w8a8,block_shape=[128, 128].json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=256,N=256,device_name=AMD_Radeon_Graphics,dtype=fp8_w8a8,block_shape=[128, 128].json → triton_3_1_0/E=256,N=256,device_name=AMD_Radeon_Graphics,dtype=fp8_w8a8,block_shape=[128, 128].json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=256,N=256,device_name=NVIDIA_B200,dtype=fp8_w8a8,block_shape=[128, 128].json → triton_3_1_0/E=256,N=256,device_name=NVIDIA_B200,dtype=fp8_w8a8,block_shape=[128, 128].json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=256,N=256,device_name=NVIDIA_H20,dtype=int8_w8a8,block_shape=[128, 128].json → triton_3_1_0/E=256,N=256,device_name=NVIDIA_H20,dtype=int8_w8a8,block_shape=[128, 128].json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=256,N=256,device_name=NVIDIA_H200,dtype=fp8_w8a8,block_shape=[128, 128].json → triton_3_1_0/E=256,N=256,device_name=NVIDIA_H200,dtype=fp8_w8a8,block_shape=[128, 128].json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=256,N=64,device_name=NVIDIA_A800-SXM4-80GB.json → triton_3_1_0/E=256,N=64,device_name=NVIDIA_A800-SXM4-80GB.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=256,N=64,device_name=NVIDIA_L20,dtype=int8_w8a8.json → triton_3_1_0/E=256,N=64,device_name=NVIDIA_L20,dtype=int8_w8a8.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=256,N=64,device_name=NVIDIA_L40S,dtype=int8_w8a8.json → triton_3_1_0/E=256,N=64,device_name=NVIDIA_L40S,dtype=int8_w8a8.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=64,N=1024,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8,block_shape=[128, 128].json → triton_3_1_0/E=64,N=1024,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8,block_shape=[128, 128].json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=64,N=1280,device_name=NVIDIA_A100-SXM4-80GB.json → triton_3_1_0/E=64,N=1280,device_name=NVIDIA_A100-SXM4-80GB.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=64,N=1280,device_name=NVIDIA_A800-SXM4-80GB.json → triton_3_1_0/E=64,N=1280,device_name=NVIDIA_A800-SXM4-80GB.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=64,N=1280,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8.json → triton_3_1_0/E=64,N=1280,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=64,N=1280,device_name=NVIDIA_H100_80GB_HBM3.json → triton_3_1_0/E=64,N=1280,device_name=NVIDIA_H100_80GB_HBM3.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=64,N=1280,device_name=NVIDIA_H200,dtype=fp8_w8a8.json → triton_3_1_0/E=64,N=1280,device_name=NVIDIA_H200,dtype=fp8_w8a8.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=64,N=1280,device_name=NVIDIA_H200.json → triton_3_1_0/E=64,N=1280,device_name=NVIDIA_H200.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=64,N=2560,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8.json → triton_3_1_0/E=64,N=2560,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=64,N=2560,device_name=NVIDIA_H200,dtype=fp8_w8a8.json → triton_3_1_0/E=64,N=2560,device_name=NVIDIA_H200,dtype=fp8_w8a8.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=64,N=2560,device_name=NVIDIA_H200.json → triton_3_1_0/E=64,N=2560,device_name=NVIDIA_H200.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=64,N=320,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8.json → triton_3_1_0/E=64,N=320,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=64,N=320,device_name=NVIDIA_H100_80GB_HBM3.json → triton_3_1_0/E=64,N=320,device_name=NVIDIA_H100_80GB_HBM3.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=64,N=320,device_name=NVIDIA_H200,dtype=fp8_w8a8.json → triton_3_1_0/E=64,N=320,device_name=NVIDIA_H200,dtype=fp8_w8a8.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=64,N=320,device_name=NVIDIA_H200.json → triton_3_1_0/E=64,N=320,device_name=NVIDIA_H200.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=64,N=512,device_name=NVIDIA_H200,dtype=fp8_w8a8,block_shape=[128, 128].json → triton_3_1_0/E=64,N=512,device_name=NVIDIA_H200,dtype=fp8_w8a8,block_shape=[128, 128].json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=64,N=640,device_name=NVIDIA_A100-SXM4-80GB.json → triton_3_1_0/E=64,N=640,device_name=NVIDIA_A100-SXM4-80GB.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=64,N=640,device_name=NVIDIA_A800-SXM4-80GB.json → triton_3_1_0/E=64,N=640,device_name=NVIDIA_A800-SXM4-80GB.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=64,N=640,device_name=NVIDIA_GeForce_RTX_4090,dtype=fp8_w8a8.json → triton_3_1_0/E=64,N=640,device_name=NVIDIA_GeForce_RTX_4090,dtype=fp8_w8a8.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=64,N=640,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8.json → triton_3_1_0/E=64,N=640,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=64,N=640,device_name=NVIDIA_H100_80GB_HBM3.json → triton_3_1_0/E=64,N=640,device_name=NVIDIA_H100_80GB_HBM3.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=64,N=640,device_name=NVIDIA_H200,dtype=fp8_w8a8.json → triton_3_1_0/E=64,N=640,device_name=NVIDIA_H200,dtype=fp8_w8a8.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=64,N=640,device_name=NVIDIA_H200.json → triton_3_1_0/E=64,N=640,device_name=NVIDIA_H200.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=8,N=14336,device_name=AMD_Instinct_MI300X.json → triton_3_1_0/E=8,N=14336,device_name=AMD_Instinct_MI300X.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=8,N=14336,device_name=AMD_Instinct_MI325X.json → triton_3_1_0/E=8,N=14336,device_name=AMD_Instinct_MI325X.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=8,N=14336,device_name=AMD_Radeon_Graphics.json → triton_3_1_0/E=8,N=14336,device_name=AMD_Radeon_Graphics.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=8,N=14336,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8.json → triton_3_1_0/E=8,N=14336,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=8,N=14336,device_name=NVIDIA_H200,dtype=fp8_w8a8.json → triton_3_1_0/E=8,N=14336,device_name=NVIDIA_H200,dtype=fp8_w8a8.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=8,N=14336,device_name=NVIDIA_H200.json → triton_3_1_0/E=8,N=14336,device_name=NVIDIA_H200.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=8,N=1792,device_name=AMD_Instinct_MI300X.json → triton_3_1_0/E=8,N=1792,device_name=AMD_Instinct_MI300X.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=8,N=1792,device_name=AMD_Instinct_MI325X.json → triton_3_1_0/E=8,N=1792,device_name=AMD_Instinct_MI325X.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=8,N=1792,device_name=AMD_Radeon_Graphics.json → triton_3_1_0/E=8,N=1792,device_name=AMD_Radeon_Graphics.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=8,N=1792,device_name=NVIDIA_A100-SXM4-40GB.json → triton_3_1_0/E=8,N=1792,device_name=NVIDIA_A100-SXM4-40GB.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=8,N=1792,device_name=NVIDIA_A100-SXM4-80GB.json → triton_3_1_0/E=8,N=1792,device_name=NVIDIA_A100-SXM4-80GB.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=8,N=1792,device_name=NVIDIA_H100_80GB_HBM3.json → triton_3_1_0/E=8,N=1792,device_name=NVIDIA_H100_80GB_HBM3.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=8,N=1792,device_name=NVIDIA_H200,dtype=fp8_w8a8.json → triton_3_1_0/E=8,N=1792,device_name=NVIDIA_H200,dtype=fp8_w8a8.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=8,N=1792,device_name=NVIDIA_H200.json → triton_3_1_0/E=8,N=1792,device_name=NVIDIA_H200.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=8,N=2048,device_name=NVIDIA_A100-SXM4-80GB.json → triton_3_1_0/E=8,N=2048,device_name=NVIDIA_A100-SXM4-80GB.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=8,N=2048,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8.json → triton_3_1_0/E=8,N=2048,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=8,N=2048,device_name=NVIDIA_H100_80GB_HBM3.json → triton_3_1_0/E=8,N=2048,device_name=NVIDIA_H100_80GB_HBM3.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=8,N=2048,device_name=NVIDIA_H200,dtype=fp8_w8a8.json → triton_3_1_0/E=8,N=2048,device_name=NVIDIA_H200,dtype=fp8_w8a8.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=8,N=2048,device_name=NVIDIA_H200.json → triton_3_1_0/E=8,N=2048,device_name=NVIDIA_H200.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=8,N=3584,device_name=AMD_Instinct_MI300X.json → triton_3_1_0/E=8,N=3584,device_name=AMD_Instinct_MI300X.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=8,N=3584,device_name=AMD_Instinct_MI325X.json → triton_3_1_0/E=8,N=3584,device_name=AMD_Instinct_MI325X.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=8,N=3584,device_name=AMD_Radeon_Graphics.json → triton_3_1_0/E=8,N=3584,device_name=AMD_Radeon_Graphics.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=8,N=3584,device_name=NVIDIA_A100-SXM4-40GB.json → triton_3_1_0/E=8,N=3584,device_name=NVIDIA_A100-SXM4-40GB.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=8,N=3584,device_name=NVIDIA_A100-SXM4-80GB.json → triton_3_1_0/E=8,N=3584,device_name=NVIDIA_A100-SXM4-80GB.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=8,N=3584,device_name=NVIDIA_GeForce_RTX_4090,dtype=fp8_w8a8.json → triton_3_1_0/E=8,N=3584,device_name=NVIDIA_GeForce_RTX_4090,dtype=fp8_w8a8.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=8,N=3584,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8.json → triton_3_1_0/E=8,N=3584,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=8,N=3584,device_name=NVIDIA_H100_80GB_HBM3.json → triton_3_1_0/E=8,N=3584,device_name=NVIDIA_H100_80GB_HBM3.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=8,N=3584,device_name=NVIDIA_H200,dtype=fp8_w8a8.json → triton_3_1_0/E=8,N=3584,device_name=NVIDIA_H200,dtype=fp8_w8a8.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=8,N=3584,device_name=NVIDIA_H200.json → triton_3_1_0/E=8,N=3584,device_name=NVIDIA_H200.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=8,N=3584,device_name=NVIDIA_L40S.json → triton_3_1_0/E=8,N=3584,device_name=NVIDIA_L40S.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=8,N=4096,device_name=AMD_Instinct_MI300X,dtype=fp8_w8a8.json → triton_3_1_0/E=8,N=4096,device_name=AMD_Instinct_MI300X,dtype=fp8_w8a8.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=8,N=4096,device_name=AMD_Instinct_MI325X,dtype=fp8_w8a8.json → triton_3_1_0/E=8,N=4096,device_name=AMD_Instinct_MI325X,dtype=fp8_w8a8.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=8,N=4096,device_name=AMD_Radeon_Graphics,dtype=fp8_w8a8.json → triton_3_1_0/E=8,N=4096,device_name=AMD_Radeon_Graphics,dtype=fp8_w8a8.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=8,N=4096,device_name=NVIDIA_A100-SXM4-80GB.json → triton_3_1_0/E=8,N=4096,device_name=NVIDIA_A100-SXM4-80GB.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=8,N=4096,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8.json → triton_3_1_0/E=8,N=4096,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=8,N=4096,device_name=NVIDIA_H100_80GB_HBM3.json → triton_3_1_0/E=8,N=4096,device_name=NVIDIA_H100_80GB_HBM3.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=8,N=4096,device_name=NVIDIA_H200,dtype=fp8_w8a8.json → triton_3_1_0/E=8,N=4096,device_name=NVIDIA_H200,dtype=fp8_w8a8.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=8,N=4096,device_name=NVIDIA_H200.json → triton_3_1_0/E=8,N=4096,device_name=NVIDIA_H200.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=8,N=7168,device_name=AMD_Instinct_MI300X.json → triton_3_1_0/E=8,N=7168,device_name=AMD_Instinct_MI300X.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=8,N=7168,device_name=AMD_Instinct_MI325X.json → triton_3_1_0/E=8,N=7168,device_name=AMD_Instinct_MI325X.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=8,N=7168,device_name=AMD_Radeon_Graphics.json → triton_3_1_0/E=8,N=7168,device_name=AMD_Radeon_Graphics.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=8,N=7168,device_name=NVIDIA_A100-SXM4-80GB.json → triton_3_1_0/E=8,N=7168,device_name=NVIDIA_A100-SXM4-80GB.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=8,N=7168,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8.json → triton_3_1_0/E=8,N=7168,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=8,N=7168,device_name=NVIDIA_H100_80GB_HBM3.json → triton_3_1_0/E=8,N=7168,device_name=NVIDIA_H100_80GB_HBM3.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=8,N=7168,device_name=NVIDIA_H200,dtype=fp8_w8a8.json → triton_3_1_0/E=8,N=7168,device_name=NVIDIA_H200,dtype=fp8_w8a8.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=8,N=7168,device_name=NVIDIA_H200.json → triton_3_1_0/E=8,N=7168,device_name=NVIDIA_H200.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=8,N=8192,device_name=AMD_Instinct_MI300X,dtype=fp8_w8a8.json → triton_3_1_0/E=8,N=8192,device_name=AMD_Instinct_MI300X,dtype=fp8_w8a8.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=8,N=8192,device_name=AMD_Instinct_MI325X,dtype=fp8_w8a8.json → triton_3_1_0/E=8,N=8192,device_name=AMD_Instinct_MI325X,dtype=fp8_w8a8.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=8,N=8192,device_name=AMD_Radeon_Graphics,dtype=fp8_w8a8.json → triton_3_1_0/E=8,N=8192,device_name=AMD_Radeon_Graphics,dtype=fp8_w8a8.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=8,N=8192,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8.json → triton_3_1_0/E=8,N=8192,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=8,N=8192,device_name=NVIDIA_H200,dtype=fp8_w8a8.json → triton_3_1_0/E=8,N=8192,device_name=NVIDIA_H200,dtype=fp8_w8a8.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=128,N=192,device_name=NVIDIA_A800-SXM4-80GB.json → triton_3_2_0/E=128,N=192,device_name=NVIDIA_A800-SXM4-80GB.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=128,N=192,device_name=NVIDIA_H100_80GB_HBM3.json → triton_3_2_0/E=128,N=192,device_name=NVIDIA_H100_80GB_HBM3.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=128,N=192,device_name=NVIDIA_H20.json → triton_3_2_0/E=128,N=192,device_name=NVIDIA_H20.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=128,N=192,device_name=NVIDIA_H200.json → triton_3_2_0/E=128,N=192,device_name=NVIDIA_H200.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=128,N=384,device_name=NVIDIA_H100_80GB_HBM3.json → triton_3_2_0/E=128,N=384,device_name=NVIDIA_H100_80GB_HBM3.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=128,N=384,device_name=NVIDIA_H20,dtype=fp8_w8a8,block_shape=[128, 128].json → triton_3_2_0/E=128,N=384,device_name=NVIDIA_H20,dtype=fp8_w8a8,block_shape=[128, 128].json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=128,N=384,device_name=NVIDIA_H20.json → triton_3_2_0/E=128,N=384,device_name=NVIDIA_H20.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=128,N=384,device_name=NVIDIA_H200,dtype=fp8_w8a8,block_shape=[128, 128].json → triton_3_2_0/E=128,N=384,device_name=NVIDIA_H200,dtype=fp8_w8a8,block_shape=[128, 128].json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=128,N=384,device_name=NVIDIA_H200.json → triton_3_2_0/E=128,N=384,device_name=NVIDIA_H200.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=128,N=512,device_name=NVIDIA_H100_80GB_HBM3.json → triton_3_2_0/E=128,N=512,device_name=NVIDIA_H100_80GB_HBM3.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=128,N=768,device_name=NVIDIA_A800-SXM4-80GB.json → triton_3_2_0/E=128,N=768,device_name=NVIDIA_A800-SXM4-80GB.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=128,N=768,device_name=NVIDIA_H100_80GB_HBM3.json → triton_3_2_0/E=128,N=768,device_name=NVIDIA_H100_80GB_HBM3.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=128,N=768,device_name=NVIDIA_H20,dtype=fp8_w8a8,block_shape=[128, 128].json → triton_3_2_0/E=128,N=768,device_name=NVIDIA_H20,dtype=fp8_w8a8,block_shape=[128, 128].json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=128,N=768,device_name=NVIDIA_H20.json → triton_3_2_0/E=128,N=768,device_name=NVIDIA_H20.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=128,N=768,device_name=NVIDIA_H200,dtype=fp8_w8a8,block_shape=[128, 128].json → triton_3_2_0/E=128,N=768,device_name=NVIDIA_H200,dtype=fp8_w8a8,block_shape=[128, 128].json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=128,N=768,device_name=NVIDIA_H200.json → triton_3_2_0/E=128,N=768,device_name=NVIDIA_H200.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=128,N=96,device_name=NVIDIA_H20.json → triton_3_2_0/E=128,N=96,device_name=NVIDIA_H20.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=264,N=128,device_name=NVIDIA_A800-SXM4-80GB,dtype=int8_w8a8.json → triton_3_2_0/E=264,N=128,device_name=NVIDIA_A800-SXM4-80GB,dtype=int8_w8a8.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=264,N=256,device_name=NVIDIA_B200,dtype=fp8_w8a8,block_shape=[128, 128].json → triton_3_2_0/E=264,N=256,device_name=NVIDIA_B200,dtype=fp8_w8a8,block_shape=[128, 128].json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=264,N=256,device_name=NVIDIA_H20,dtype=fp8_w8a8,block_shape=[128, 128].json → triton_3_2_0/E=264,N=256,device_name=NVIDIA_H20,dtype=fp8_w8a8,block_shape=[128, 128].json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=264,N=256,device_name=NVIDIA_H200,dtype=fp8_w8a8,block_shape=[128, 128].json → triton_3_2_0/E=264,N=256,device_name=NVIDIA_H200,dtype=fp8_w8a8,block_shape=[128, 128].json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=272,N=128,device_name=NVIDIA_A800-SXM4-80GB,dtype=int8_w8a8.json → triton_3_2_0/E=272,N=128,device_name=NVIDIA_A800-SXM4-80GB,dtype=int8_w8a8.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=272,N=128,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8,block_shape=[128, 128].json → triton_3_2_0/E=272,N=128,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8,block_shape=[128, 128].json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=272,N=128,device_name=NVIDIA_H20,dtype=fp8_w8a8,block_shape=[128, 128].json → triton_3_2_0/E=272,N=128,device_name=NVIDIA_H20,dtype=fp8_w8a8,block_shape=[128, 128].json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=272,N=64,device_name=NVIDIA_A800-SXM4-80GB.json → triton_3_2_0/E=272,N=64,device_name=NVIDIA_A800-SXM4-80GB.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=288,N=64,device_name=NVIDIA_A800-SXM4-80GB.json → triton_3_2_0/E=288,N=64,device_name=NVIDIA_A800-SXM4-80GB.json} +0 -0
- {sglang-0.4.6.post5.dist-info → sglang-0.4.7.dist-info}/licenses/LICENSE +0 -0
- {sglang-0.4.6.post5.dist-info → sglang-0.4.7.dist-info}/top_level.txt +0 -0
sglang/srt/managers/tokenizer_manager.py
CHANGED
@@ -18,6 +18,7 @@ import copy
 import dataclasses
 import json
 import logging
+import math
 import os
 import pickle
 import signal
@@ -42,6 +43,7 @@ from typing import (
 )
 
 import fastapi
+import torch
 import uvloop
 import zmq
 import zmq.asyncio
@@ -114,6 +116,7 @@ from sglang.srt.sampling.sampling_params import SamplingParams
 from sglang.srt.server_args import PortArgs, ServerArgs
 from sglang.srt.utils import (
     dataclass_to_string_truncated,
+    get_bool_env_var,
     get_zmq_socket,
     kill_process_tree,
 )
@@ -221,7 +224,7 @@ class TokenizerManager:
             self.tokenizer = get_tokenizer_from_processor(self.processor)
             os.environ["TOKENIZERS_PARALLELISM"] = "false"
         else:
-            self.mm_processor =
+            self.mm_processor = None
 
         if server_args.skip_tokenizer_init:
             self.tokenizer = self.processor = None
@@ -395,6 +398,9 @@ class TokenizerManager:
             self.server_args.disaggregation_bootstrap_port
         )
 
+        self.current_load = 0
+        self.current_load_lock = asyncio.Lock()
+
     async def generate_request(
         self,
         obj: Union[GenerateReqInput, EmbeddingReqInput],
@@ -422,8 +428,8 @@ class TokenizerManager:
         is_single = obj.is_single
         if is_single:
             tokenized_obj = await self._tokenize_one_request(obj)
-            self._send_one_request(obj, tokenized_obj, created_time)
-            async for response in self._wait_one_response(obj, request):
+            state = self._send_one_request(obj, tokenized_obj, created_time)
+            async for response in self._wait_one_response(obj, state, request):
                 yield response
         else:
             async for response in self._handle_batch_request(
@@ -459,8 +465,7 @@ class TokenizerManager:
            )
            input_ids = self.tokenizer.encode(input_text)
 
-
-        if obj.contains_mm_input():
+        if self.mm_processor and obj.contains_mm_input():
            image_inputs = await self.mm_processor.process_mm_data_async(
                image_data=obj.image_data,
                input_text=input_text or input_ids,
@@ -469,6 +474,8 @@ class TokenizerManager:
            )
            if image_inputs and "input_ids" in image_inputs:
                input_ids = image_inputs["input_ids"]
+        else:
+            image_inputs: Optional[Dict] = None
 
        self._validate_token_len(obj, input_ids)
        return self._create_tokenized_object(
@@ -563,6 +570,7 @@ class TokenizerManager:
                session_params=session_params,
                custom_logit_processor=obj.custom_logit_processor,
                return_hidden_states=obj.return_hidden_states,
+                data_parallel_rank=obj.data_parallel_rank,
            )
        elif isinstance(obj, EmbeddingReqInput):
            tokenized_obj = TokenizedEmbeddingReqInput(
@@ -628,15 +636,15 @@ class TokenizerManager:
        self.send_to_scheduler.send_pyobj(tokenized_obj)
        state = ReqState([], False, asyncio.Event(), obj, created_time=created_time)
        self.rid_to_state[obj.rid] = state
+        return state
 
    async def _wait_one_response(
        self,
        obj: Union[GenerateReqInput, EmbeddingReqInput],
+        state: ReqState,
        request: Optional[fastapi.Request] = None,
    ):
        """Wait for the response of one request."""
-        state = self.rid_to_state[obj.rid]
-
        while True:
            try:
                await asyncio.wait_for(state.event.wait(), timeout=4)
@@ -706,16 +714,16 @@ class TokenizerManager:
 
                for i, tokenized_obj in enumerate(tokenized_objs):
                    tmp_obj = obj[i]
-                    self._send_one_request(tmp_obj, tokenized_obj, created_time)
-                    generators.append(self._wait_one_response(tmp_obj, request))
+                    state = self._send_one_request(tmp_obj, tokenized_obj, created_time)
+                    generators.append(self._wait_one_response(tmp_obj, state, request))
                    rids.append(tmp_obj.rid)
            else:
                # Sequential tokenization and processing
                for i in range(batch_size):
                    tmp_obj = obj[i]
                    tokenized_obj = await self._tokenize_one_request(tmp_obj)
-                    self._send_one_request(tmp_obj, tokenized_obj, created_time)
-                    generators.append(self._wait_one_response(tmp_obj, request))
+                    state = self._send_one_request(tmp_obj, tokenized_obj, created_time)
+                    generators.append(self._wait_one_response(tmp_obj, state, request))
                    rids.append(tmp_obj.rid)
        else:
            # FIXME: When using batch and parallel_sample_num together, the perf is not optimal.
@@ -740,8 +748,8 @@ class TokenizerManager:
                tokenized_obj.sampling_params = copy.copy(tokenized_obj.sampling_params)
                tokenized_obj.sampling_params.max_new_tokens = 0
                tokenized_obj.stream = False
-                self._send_one_request(tmp_obj, tokenized_obj, created_time)
-                await self._wait_one_response(tmp_obj, request).__anext__()
+                state = self._send_one_request(tmp_obj, tokenized_obj, created_time)
+                await self._wait_one_response(tmp_obj, state, request).__anext__()
 
            # Expand requests, assign new rids for them, and send them
            for i in range(batch_size):
@@ -749,8 +757,8 @@ class TokenizerManager:
                tmp_obj = copy.copy(objs[i])
                tokenized_obj = copy.copy(tokenized_objs[i])
                tokenized_obj.rid = tmp_obj.regenerate_rid()
-                self._send_one_request(tmp_obj, tokenized_obj, created_time)
-                generators.append(self._wait_one_response(tmp_obj, request))
+                state = self._send_one_request(tmp_obj, tokenized_obj, created_time)
+                generators.append(self._wait_one_response(tmp_obj, state, request))
                rids.append(tmp_obj.rid)
 
        # Wait for all requests
@@ -786,6 +794,9 @@ class TokenizerManager:
        req = AbortReq(rid)
        self.send_to_scheduler.send_pyobj(req)
 
+        if self.enable_metrics:
+            self.metrics_collector.observe_one_aborted_request()
+
    async def start_profile(
        self,
        output_dir: Optional[str] = None,
@@ -793,8 +804,11 @@ class TokenizerManager:
        activities: Optional[List[str]] = None,
        with_stack: Optional[bool] = None,
        record_shapes: Optional[bool] = None,
+        profile_by_stage: bool = False,
    ):
        self.auto_create_handle_loop()
+        env_with_stack: bool = get_bool_env_var("SGLANG_PROFILE_WITH_STACK", "true")
+        with_stack = False if with_stack is False or env_with_stack is False else True
        req = ProfileReq(
            type=ProfileReqType.START_PROFILE,
            output_dir=output_dir,
@@ -802,6 +816,7 @@ class TokenizerManager:
            activities=activities,
            with_stack=with_stack,
            record_shapes=record_shapes,
+            profile_by_stage=profile_by_stage,
            profile_id=str(time.time()),
        )
        return await self._execute_profile(req)
@@ -841,7 +856,7 @@ class TokenizerManager:
            obj.load_format = self.server_args.load_format
        logger.info("Start update_weights. Load format=%s", obj.load_format)
 
-        if True:
+        if True:  # Keep this redundant check to simplify some internal code sync
            # Hold the lock if it is not async. This means that weight sync
            # cannot run while requests are in progress.
            async with self.model_update_lock.writer_lock:
@@ -983,6 +998,14 @@ class TokenizerManager:
            # Many DP ranks
            return [res.internal_state for res in responses]
 
+    async def get_load(self) -> dict:
+        # TODO(lsyin): fake load report server
+        if not self.current_load_lock.locked():
+            async with self.current_load_lock:
+                internal_state = await self.get_internal_state()
+                self.current_load = internal_state[0]["load"]
+        return {"load": self.current_load}
+
    async def set_internal_state(
        self, obj: SetInternalStateReq
    ) -> SetInternalStateReqOutput:
@@ -1400,7 +1423,7 @@ class TokenizerManager:
        asyncio.create_task(asyncio.to_thread(background_task))
 
    def _handle_abort_req(self, recv_obj):
-        self.rid_to_state.pop(recv_obj.rid)
+        self.rid_to_state.pop(recv_obj.rid, None)
 
    def _handle_open_session_req_output(self, recv_obj):
        self.session_futures[recv_obj.session_id].set_result(
@@ -1416,6 +1439,100 @@ class TokenizerManager:
        if len(self.model_update_tmp) == self.server_args.dp_size:
            self.model_update_result.set_result(self.model_update_tmp)
 
+    async def score_request(
+        self,
+        query: Optional[Union[str, List[int]]] = None,
+        items: Optional[Union[str, List[str], List[List[int]]]] = None,
+        label_token_ids: Optional[List[int]] = None,
+        apply_softmax: bool = False,
+        item_first: bool = False,
+        request: Optional[Any] = None,
+    ) -> List[List[float]]:
+        """
+        See Engine.score() for more details.
+        """
+        if label_token_ids is None:
+            raise ValueError("label_token_ids must be provided")
+
+        if self.tokenizer is not None:
+            vocab_size = self.tokenizer.vocab_size
+            for token_id in label_token_ids:
+                if token_id >= vocab_size:
+                    raise ValueError(
+                        f"Token ID {token_id} is out of vocabulary (vocab size: {vocab_size})"
+                    )
+
+        # Handle string or tokenized query/items
+        if isinstance(query, str) and (
+            isinstance(items, str)
+            or (isinstance(items, list) and (not items or isinstance(items[0], str)))
+        ):
+            # Both query and items are text
+            items_list = [items] if isinstance(items, str) else items
+            if item_first:
+                prompts = [f"{item}{query}" for item in items_list]
+            else:
+                prompts = [f"{query}{item}" for item in items_list]
+            batch_request = GenerateReqInput(
+                text=prompts,
+                return_logprob=True,
+                token_ids_logprob=label_token_ids,
+                stream=False,
+                sampling_params={"max_new_tokens": 1},
+            )
+        elif (
+            isinstance(query, list)
+            and isinstance(items, list)
+            and items
+            and isinstance(items[0], list)
+        ):
+            # Both query and items are token IDs
+            if item_first:
+                input_ids_list = [item + query for item in items]
+            else:
+                input_ids_list = [query + item for item in items]
+            batch_request = GenerateReqInput(
+                input_ids=input_ids_list,
+                return_logprob=True,
+                token_ids_logprob=label_token_ids,
+                stream=False,
+                sampling_params={"max_new_tokens": 1},
+            )
+        else:
+            raise ValueError(
+                "Invalid combination of query/items types for score_request."
+            )
+
+        results = await self.generate_request(batch_request, request).__anext__()
+        scores = []
+
+        for result in results:
+            # Get logprobs for each token
+            logprobs = {}
+            for logprob, token_id, _ in result["meta_info"].get(
+                "output_token_ids_logprobs", []
+            )[0]:
+                if token_id in label_token_ids:
+                    logprobs[token_id] = logprob
+
+            # Get scores in order of label_token_ids
+            score_list = [
+                logprobs.get(token_id, float("-inf")) for token_id in label_token_ids
+            ]
+
+            # Apply softmax to logprobs if needed
+            if apply_softmax:
+                score_list = torch.softmax(torch.tensor(score_list), dim=0).tolist()
+            else:
+                # Convert logprobs to probabilities if not using softmax
+                score_list = [
+                    math.exp(x) if x != float("-inf") else 0.0 for x in score_list
+                ]
+
+            scores.append(score_list)
+
+        return scores
+
 
 async def print_exception_wrapper(func):
    """
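The new `score_request` method turns each (query, item) pair into a one-token generation request and then converts the returned label-token logprobs into scores. Below is a minimal standalone sketch of that conversion step only; the plain dict input is a simplification of the `meta_info["output_token_ids_logprobs"]` payload, not the exact internal structure.

```python
# Sketch of the score post-processing added in score_request(): label-token
# logprobs become probabilities either via a softmax over the label set
# (apply_softmax=True) or an independent exp() per token.
import math
from typing import Dict, List

import torch


def logprobs_to_scores(
    logprobs: Dict[int, float], label_token_ids: List[int], apply_softmax: bool
) -> List[float]:
    # Keep scores in the order of label_token_ids; missing tokens get -inf.
    score_list = [logprobs.get(t, float("-inf")) for t in label_token_ids]
    if apply_softmax:
        # Normalize across the label set so the scores sum to 1.
        return torch.softmax(torch.tensor(score_list), dim=0).tolist()
    # Otherwise exponentiate each logprob independently.
    return [math.exp(x) if x != float("-inf") else 0.0 for x in score_list]


# Hypothetical label tokens; token 103 was never observed, so it scores ~0.
print(logprobs_to_scores({101: -0.1, 102: -2.3}, [101, 102, 103], apply_softmax=True))
```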
sglang/srt/managers/utils.py
CHANGED
@@ -35,10 +35,6 @@ def validate_input_length(
            f"the maximum allowed length ({max_req_input_len} tokens). "
            f"Use a shorter input or enable --allow-auto-truncate."
        )
-        logger.error(error_msg)
-        req.finished_reason = FINISH_ABORT(
-            error_msg, HTTPStatus.BAD_REQUEST, "BadRequestError"
-        )
        return error_msg
 
    return None
sglang/srt/metrics/collector.py
CHANGED
@@ -402,6 +402,12 @@ class TokenizerMetricsCollector:
            labelnames=labels.keys(),
        )
 
+        self.num_aborted_requests_total = Counter(
+            name="sglang:num_aborted_requests",
+            documentation="Number of requests aborted.",
+            labelnames=labels.keys(),
+        )
+
        if bucket_time_to_first_token is None:
            bucket_time_to_first_token = [
                0.1,
@@ -533,3 +539,6 @@ class TokenizerMetricsCollector:
            if adjusted_interval <= bound:
                his._buckets[i].inc(num_new_tokens)
                break
+
+    def observe_one_aborted_request(self):
+        self.num_aborted_requests_total.labels(**self.labels).inc(1)
@@ -28,7 +28,6 @@ from sglang.srt.custom_op import CustomOp
|
|
28
28
|
from sglang.srt.distributed import get_tensor_model_parallel_rank
|
29
29
|
from sglang.srt.distributed.parallel_state import GroupCoordinator, graph_capture
|
30
30
|
from sglang.srt.layers.logits_processor import LogitsProcessorOutput
|
31
|
-
from sglang.srt.layers.moe.fused_moe_native import fused_moe_forward_native
|
32
31
|
from sglang.srt.layers.torchao_utils import save_gemlite_cache
|
33
32
|
from sglang.srt.managers.schedule_batch import global_server_args_dict
|
34
33
|
from sglang.srt.model_executor.forward_batch_info import (
|
@@ -36,8 +35,10 @@ from sglang.srt.model_executor.forward_batch_info import (
|
|
36
35
|
ForwardBatch,
|
37
36
|
ForwardMode,
|
38
37
|
PPProxyTensors,
|
38
|
+
enable_num_token_non_padded,
|
39
39
|
)
|
40
40
|
from sglang.srt.patch_torch import monkey_patch_torch_compile
|
41
|
+
from sglang.srt.two_batch_overlap import TboCudaGraphRunnerPlugin
|
41
42
|
from sglang.srt.utils import (
|
42
43
|
get_available_gpu_memory,
|
43
44
|
get_device_memory_capacity,
|
@@ -55,22 +56,23 @@ def get_is_capture_mode():
|
|
55
56
|
return is_capture_mode
|
56
57
|
|
57
58
|
|
59
|
+
@contextmanager
|
60
|
+
def model_capture_mode():
|
61
|
+
global is_capture_mode
|
62
|
+
is_capture_mode = True
|
63
|
+
|
64
|
+
yield
|
65
|
+
|
66
|
+
is_capture_mode = False
|
67
|
+
|
68
|
+
|
58
69
|
def _to_torch(model: torch.nn.Module, reverse: bool, num_tokens: int):
|
59
70
|
for sub in model._modules.values():
|
60
71
|
if isinstance(sub, CustomOp):
|
61
72
|
if reverse:
|
62
|
-
sub.
|
63
|
-
setattr(sub, "is_torch_compile", False)
|
73
|
+
sub.leave_torch_compile()
|
64
74
|
else:
|
65
|
-
|
66
|
-
if "FusedMoE" in sub.__class__.__name__:
|
67
|
-
if num_tokens == 1:
|
68
|
-
# The performance of torch.compile on this layer is not always good when bs > 1,
|
69
|
-
# so we decide to only use torch.compile when bs =1
|
70
|
-
sub._forward_method = fused_moe_forward_native
|
71
|
-
else:
|
72
|
-
sub._forward_method = sub.forward_native
|
73
|
-
setattr(sub, "is_torch_compile", True)
|
75
|
+
sub.enter_torch_compile(num_tokens=num_tokens)
|
74
76
|
if isinstance(sub, torch.nn.Module):
|
75
77
|
_to_torch(sub, reverse, num_tokens)
|
76
78
|
|
@@ -131,26 +133,32 @@ def get_batch_sizes_to_capture(model_runner: ModelRunner):
|
|
131
133
|
if capture_bs is None:
|
132
134
|
if server_args.speculative_algorithm is None:
|
133
135
|
if server_args.disable_cuda_graph_padding:
|
134
|
-
capture_bs = list(range(1, 33)) + list(range(
|
136
|
+
capture_bs = list(range(1, 33)) + list(range(48, 161, 16))
|
135
137
|
else:
|
136
138
|
capture_bs = [1, 2, 4, 8] + list(range(16, 161, 8))
|
137
139
|
else:
|
138
140
|
# Since speculative decoding requires more cuda graph memory, we
|
139
141
|
# capture less.
|
140
142
|
capture_bs = (
|
141
|
-
list(range(1, 9))
|
143
|
+
list(range(1, 9))
|
144
|
+
+ list(range(10, 33, 2))
|
145
|
+
+ list(range(40, 64, 8))
|
146
|
+
+ list(range(80, 161, 16))
|
142
147
|
)
|
143
148
|
|
144
149
|
gpu_mem = get_device_memory_capacity()
|
145
150
|
if gpu_mem is not None and gpu_mem > 96 * 1024:
|
146
151
|
capture_bs += list(range(160, 257, 8))
|
152
|
+
if gpu_mem is not None and gpu_mem > 180 * 1000:
|
153
|
+
capture_bs += list(range(256, 513, 16))
|
147
154
|
|
148
155
|
if max(capture_bs) > model_runner.req_to_token_pool.size:
|
149
|
-
# In some
|
156
|
+
# In some cases (e.g., with a small GPU or --max-running-requests), the #max-running-requests
|
150
157
|
# is very small. We add more values here to make sure we capture the maximum bs.
|
151
|
-
capture_bs += [model_runner.req_to_token_pool.size
|
152
|
-
|
153
|
-
|
158
|
+
capture_bs += [model_runner.req_to_token_pool.size]
|
159
|
+
|
160
|
+
if server_args.enable_two_batch_overlap:
|
161
|
+
capture_bs = [bs for bs in capture_bs if bs >= 2]
|
154
162
|
|
155
163
|
if server_args.cuda_graph_max_bs:
|
156
164
|
capture_bs = [bs for bs in capture_bs if bs <= server_args.cuda_graph_max_bs]
|
@@ -160,7 +168,7 @@ def get_batch_sizes_to_capture(model_runner: ModelRunner):
|
|
160
168
|
)
|
161
169
|
capture_bs = [bs for bs in capture_bs if bs <= model_runner.req_to_token_pool.size]
|
162
170
|
capture_bs = list(sorted(set(capture_bs)))
|
163
|
-
assert len(capture_bs) > 0 and capture_bs[0] > 0
|
171
|
+
assert len(capture_bs) > 0 and capture_bs[0] > 0, f"{capture_bs=}"
|
164
172
|
compile_bs = (
|
165
173
|
[bs for bs in capture_bs if bs <= server_args.torch_compile_max_bs]
|
166
174
|
if server_args.enable_torch_compile
|
@@ -195,6 +203,9 @@ class CudaGraphRunner:
         self.is_encoder_decoder = model_runner.model_config.is_encoder_decoder
         self.enable_dp_attention = model_runner.server_args.enable_dp_attention
         self.enable_sp_layernorm = model_runner.server_args.enable_sp_layernorm
+        self.enable_two_batch_overlap = (
+            model_runner.server_args.enable_two_batch_overlap
+        )
         self.speculative_algorithm = model_runner.server_args.speculative_algorithm
         self.tp_size = model_runner.server_args.tp_size
         self.dp_size = model_runner.server_args.dp_size
@@ -248,6 +259,7 @@ class CudaGraphRunner:
         self.positions = torch.zeros((self.max_num_token,), dtype=torch.int64)
         self.mrope_positions = torch.zeros((3, self.max_bs), dtype=torch.int64)
         self.num_token_non_padded = torch.zeros((1,), dtype=torch.int32)
+        self.tbo_plugin = TboCudaGraphRunnerPlugin()

         # pipeline parallelism
         if self.pp_size > 1:
@@ -263,23 +275,8 @@ class CudaGraphRunner:
         }

         # Speculative_inference
-        if (
-            model_runner.spec_algorithm.is_eagle3()
-            and not model_runner.is_draft_worker
-        ):
-            self.hidden_states = torch.zeros(
-                (
-                    self.max_num_token,
-                    3 * self.model_runner.model_config.hidden_size,
-                ),
-                dtype=self.model_runner.dtype,
-            )
+        if model_runner.spec_algorithm.is_eagle3():
             self.model_runner.model.set_eagle3_layers_to_capture()
-        elif model_runner.spec_algorithm.is_eagle():
-            self.hidden_states = torch.zeros(
-                (self.max_num_token, self.model_runner.model_config.hidden_size),
-                dtype=self.model_runner.dtype,
-            )

         if self.is_encoder_decoder:
             # NOTE: encoder_lens can influence the full_text_row_masked_out_mask tensor when doing mixed batch
@@ -288,6 +285,7 @@ class CudaGraphRunner:
             )
         else:
             self.encoder_lens = None
+
         if self.enable_dp_attention or self.enable_sp_layernorm:
             # TODO(ch-wan): SP layernorm should use a different logic to manage gathered_buffer
             self.gathered_buffer = torch.zeros(
@@ -303,28 +301,13 @@ class CudaGraphRunner:

         # Capture
         try:
-            with self.model_capture_mode():
+            with model_capture_mode():
                 self.capture()
         except RuntimeError as e:
             raise Exception(
-                f"Capture cuda graph failed: {e}\n"
-                "Possible solutions:\n"
-                "1. set --mem-fraction-static to a smaller value (e.g., 0.8 or 0.7)\n"
-                "2. set --cuda-graph-max-bs to a smaller value (e.g., 16)\n"
-                "3. disable torch compile by not using --enable-torch-compile\n"
-                "4. disable CUDA graph by --disable-cuda-graph. (Not recommended. Huge performance loss)\n"
-                "Open an issue on GitHub https://github.com/sgl-project/sglang/issues/new/choose \n"
+                f"Capture cuda graph failed: {e}\n{CUDA_GRAPH_CAPTURE_FAILED_MSG}"
             )

-    @contextmanager
-    def model_capture_mode(self):
-        global is_capture_mode
-        is_capture_mode = True
-
-        yield
-
-        is_capture_mode = False
-
     def can_run(self, forward_batch: ForwardBatch):
         if self.enable_dp_attention or self.enable_sp_layernorm:
             total_global_tokens = sum(forward_batch.global_num_tokens_cpu)
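Editor's note: with the capture flag promoted to module level, the try/except above simply wraps `self.capture()` in the new context manager. A minimal usage sketch of the two module-level helpers shown in this diff; the import path and the assumption that the flag starts as False are illustrative, not confirmed by the diff.

```python
# Usage sketch (assumed import path for this module and an initial flag of False):
# from sglang.srt.model_executor.cuda_graph_runner import get_is_capture_mode, model_capture_mode

assert get_is_capture_mode() is False
with model_capture_mode():
    # Anything running here (e.g., CudaGraphRunner.capture()) observes the flag as True.
    assert get_is_capture_mode() is True
assert get_is_capture_mode() is False
```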
@@ -349,7 +332,12 @@ class CudaGraphRunner:
             if self.is_encoder_decoder
             else True
         )
-        return is_bs_supported and is_encoder_lens_supported
+
+        is_tbo_supported = (
+            forward_batch.can_run_tbo if self.enable_two_batch_overlap else True
+        )
+
+        return is_bs_supported and is_encoder_lens_supported and is_tbo_supported

     def capture(self):
         with graph_capture() as graph_capture_context:
@@ -436,6 +424,7 @@ class CudaGraphRunner:
         self.capture_hidden_mode = (
             spec_info.capture_hidden_mode if spec_info else CaptureHiddenMode.NULL
         )
+
         if self.model_runner.server_args.lora_paths is not None:
             # Currently, if the lora_path in `lora_paths` is None, the lora backend will use a
             # different logic to handle lora, so we need to set `lora_paths` to a list of non-None
@@ -464,9 +453,11 @@ class CudaGraphRunner:
             spec_algorithm=self.model_runner.spec_algorithm,
             spec_info=spec_info,
             capture_hidden_mode=self.capture_hidden_mode,
-            lora_paths=lora_paths,
             num_token_non_padded=self.num_token_non_padded,
+            global_forward_mode=self.capture_forward_mode,
+            lora_paths=lora_paths,
         )
+        self.tbo_plugin.capture_one_batch_size(forward_batch, num_tokens=num_tokens)

         if lora_paths is not None:
             self.model_runner.lora_manager.prepare_lora_batch(forward_batch)
@@ -492,7 +483,9 @@ class CudaGraphRunner:
             self.pp_size > 1
             and "pp_proxy_tensors" in inspect.signature(forward).parameters
         ):
-            kwargs["pp_proxy_tensors"] = pp_proxy_tensors
+            kwargs["pp_proxy_tensors"] = PPProxyTensors(
+                {k: v.clone() for k, v in pp_proxy_tensors.tensors.items()}
+            )

         logits_output_or_pp_proxy_tensors = forward(
             input_ids,
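Editor's note: the forward path now clones each pipeline-parallel proxy tensor into a fresh `PPProxyTensors` instead of passing the caller's object through, so the captured graph reads from buffers it owns. A small standalone illustration of the clone-per-tensor idiom; a plain dict stands in for `PPProxyTensors`, which the comprehension above suggests wraps a {name: tensor} mapping.

```python
import torch

# Clone each tensor so later in-place writes by the caller cannot alias the
# buffers referenced during CUDA graph capture/replay (illustration only).
proxy = {"hidden_states": torch.zeros(4, 8), "residual": torch.ones(4, 8)}
cloned = {k: v.clone() for k, v in proxy.items()}

proxy["hidden_states"].fill_(42.0)           # caller mutates its own buffer
print(cloned["hidden_states"].sum().item())  # 0.0 -- the clone is unaffected
```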
@@ -561,7 +554,7 @@ class CudaGraphRunner:
         self.seq_lens[:raw_bs].copy_(forward_batch.seq_lens)
         self.out_cache_loc[:raw_num_token].copy_(forward_batch.out_cache_loc)
         self.positions[:raw_num_token].copy_(forward_batch.positions)
-
+
         if forward_batch.seq_lens_cpu is not None:
             if bs != raw_bs:
                 self.seq_lens_cpu.fill_(1)
@@ -578,9 +571,14 @@ class CudaGraphRunner:
         self.mrope_positions[:, :raw_bs].copy_(forward_batch.mrope_positions)
         if self.enable_dp_attention or self.enable_sp_layernorm:
             self.global_num_tokens_gpu.copy_(forward_batch.global_num_tokens_gpu)
-
-
-
+        if enable_num_token_non_padded(self.model_runner.server_args):
+            self.num_token_non_padded.copy_(forward_batch.num_token_non_padded)
+        if self.enable_two_batch_overlap:
+            self.tbo_plugin.replay_prepare(
+                forward_mode=forward_batch.forward_mode,
+                bs=bs,
+                num_token_non_padded=len(forward_batch.input_ids),
+            )

         # Attention backend
         self.model_runner.attn_backend.init_forward_metadata_replay_cuda_graph(
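Editor's note: for readers following the two-batch-overlap thread through this file, the runner only touches the plugin at two points: once per captured batch size (after the `ForwardBatch` for that graph is built) and once before each replay. Below is a hedged interface sketch with the call signatures used in this diff; the real `TboCudaGraphRunnerPlugin` ships elsewhere in sglang, and the bodies here are placeholders, not its implementation.

```python
# Interface sketch inferred from the two call sites in this diff; bodies are
# placeholders, not sglang's implementation.
class TboCudaGraphRunnerPluginSketch:
    def capture_one_batch_size(self, forward_batch, num_tokens: int) -> None:
        # Called once per captured batch size: record whatever split metadata
        # the overlapped replay will need for this graph.
        ...

    def replay_prepare(self, forward_mode, bs: int, num_token_non_padded: int) -> None:
        # Called before each replay when server_args.enable_two_batch_overlap is set:
        # recompute how the incoming batch maps onto the two overlapped halves.
        ...
```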
@@ -639,7 +637,7 @@ class CudaGraphRunner:
         else:
             spec_info = EagleVerifyInput(
                 draft_token=None,
-                custom_mask=torch.
+                custom_mask=torch.ones(
                     (num_tokens * self.model_runner.model_config.context_len),
                     dtype=torch.bool,
                     device="cuda",
@@ -649,9 +647,22 @@ class CudaGraphRunner:
                 retrive_next_token=None,
                 retrive_next_sibling=None,
                 retrive_cum_len=None,
-                draft_token_num=self.model_runner.server_args.speculative_num_draft_tokens,
                 spec_steps=self.model_runner.server_args.speculative_num_steps,
+                topk=self.model_runner.server_args.speculative_eagle_topk,
+                draft_token_num=self.model_runner.server_args.speculative_num_draft_tokens,
                 capture_hidden_mode=CaptureHiddenMode.FULL,
+                seq_lens_sum=None,
+                seq_lens_cpu=None,
             )

         return spec_info
+
+
+CUDA_GRAPH_CAPTURE_FAILED_MSG = (
+    "Possible solutions:\n"
+    "1. set --mem-fraction-static to a smaller value (e.g., 0.8 or 0.7)\n"
+    "2. set --cuda-graph-max-bs to a smaller value (e.g., 16)\n"
+    "3. disable torch compile by not using --enable-torch-compile\n"
+    "4. disable CUDA graph by --disable-cuda-graph. (Not recommended. Huge performance loss)\n"
+    "Open an issue on GitHub https://github.com/sgl-project/sglang/issues/new/choose \n"
+)
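Editor's note: the troubleshooting text is now a module-level constant so the capture error path earlier in this file can stay on one line. A minimal sketch of how the message is meant to be composed; `capture()` below is an illustrative stand-in for `CudaGraphRunner.capture()`, not the real method.

```python
# Composition sketch only; assumes CUDA_GRAPH_CAPTURE_FAILED_MSG is in scope
# (defined at module level as in the hunk above).
def capture():
    raise RuntimeError("out of memory during graph capture")  # illustrative failure

try:
    capture()
except RuntimeError as e:
    print(f"Capture cuda graph failed: {e}\n{CUDA_GRAPH_CAPTURE_FAILED_MSG}")
```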