sglang 0.4.6.post5__py3-none-any.whl → 0.4.7__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- sglang/bench_offline_throughput.py +10 -4
- sglang/bench_one_batch_server.py +67 -11
- sglang/bench_serving.py +85 -74
- sglang/lang/backend/runtime_endpoint.py +24 -1
- sglang/profiler.py +167 -0
- sglang/srt/_custom_ops.py +34 -0
- sglang/srt/configs/internvl.py +8 -12
- sglang/srt/configs/model_config.py +27 -1
- sglang/srt/constrained/base_grammar_backend.py +5 -2
- sglang/srt/constrained/llguidance_backend.py +9 -8
- sglang/srt/constrained/outlines_backend.py +5 -4
- sglang/srt/constrained/xgrammar_backend.py +18 -18
- sglang/srt/conversation.py +46 -8
- sglang/srt/custom_op.py +38 -3
- sglang/srt/debug_utils.py +74 -0
- sglang/srt/disaggregation/common/__init__.py +1 -0
- sglang/srt/disaggregation/common/conn.py +407 -0
- sglang/srt/disaggregation/decode.py +67 -3
- sglang/srt/disaggregation/fake/conn.py +1 -0
- sglang/srt/disaggregation/kv_events.py +60 -5
- sglang/srt/disaggregation/launch_lb.py +140 -0
- sglang/srt/disaggregation/mini_lb.py +29 -48
- sglang/srt/disaggregation/mooncake/conn.py +432 -140
- sglang/srt/disaggregation/mooncake/transfer_engine.py +32 -16
- sglang/srt/disaggregation/nixl/conn.py +124 -432
- sglang/srt/disaggregation/prefill.py +2 -0
- sglang/srt/disaggregation/utils.py +38 -1
- sglang/srt/distributed/device_communicators/pymscclpp.py +315 -0
- sglang/srt/distributed/parallel_state.py +52 -5
- sglang/srt/entrypoints/EngineBase.py +6 -0
- sglang/srt/entrypoints/engine.py +102 -5
- sglang/srt/entrypoints/http_server.py +15 -2
- sglang/srt/function_call/base_format_detector.py +138 -86
- sglang/srt/function_call/deepseekv3_detector.py +54 -6
- sglang/srt/function_call/ebnf_composer.py +33 -19
- sglang/srt/function_call/function_call_parser.py +27 -0
- sglang/srt/function_call/llama32_detector.py +33 -14
- sglang/srt/function_call/mistral_detector.py +73 -26
- sglang/srt/function_call/pythonic_detector.py +86 -20
- sglang/srt/function_call/qwen25_detector.py +64 -10
- sglang/srt/function_call/utils.py +17 -0
- sglang/srt/hf_transformers_utils.py +4 -0
- sglang/srt/layers/attention/aiter_backend.py +488 -123
- sglang/srt/layers/attention/base_attn_backend.py +4 -0
- sglang/srt/layers/attention/cutlass_mla_backend.py +2 -19
- sglang/srt/layers/attention/flashattention_backend.py +103 -18
- sglang/srt/layers/attention/flashinfer_backend.py +45 -1
- sglang/srt/layers/attention/flashinfer_mla_backend.py +37 -1
- sglang/srt/layers/attention/intel_amx_backend.py +128 -0
- sglang/srt/layers/attention/tbo_backend.py +232 -0
- sglang/srt/layers/attention/torch_native_backend.py +3 -0
- sglang/srt/layers/attention/triton_backend.py +244 -5
- sglang/srt/layers/attention/triton_ops/extend_attention.py +12 -4
- sglang/srt/layers/communicator.py +260 -194
- sglang/srt/layers/dp_attention.py +6 -5
- sglang/srt/layers/layernorm.py +30 -19
- sglang/srt/layers/moe/cutlass_moe.py +170 -7
- sglang/srt/layers/moe/cutlass_moe_params.py +169 -0
- sglang/srt/layers/moe/ep_moe/kernels.py +27 -6
- sglang/srt/layers/moe/ep_moe/layer.py +94 -40
- sglang/srt/layers/moe/ep_moe/token_dispatcher.py +13 -8
- sglang/srt/layers/moe/fused_moe_native.py +4 -0
- sglang/srt/layers/moe/fused_moe_triton/configs/triton_3_2_0/E=257,N=128,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8,block_shape=[128, 128].json +146 -0
- sglang/srt/layers/moe/fused_moe_triton/configs/triton_3_2_0/E=257,N=128,device_name=NVIDIA_H20,dtype=fp8_w8a8,block_shape=[128, 128].json +146 -0
- sglang/srt/layers/moe/fused_moe_triton/configs/triton_3_2_0/E=257,N=256,device_name=NVIDIA_B200,dtype=fp8_w8a8,block_shape=[128, 128].json +146 -0
- sglang/srt/layers/moe/fused_moe_triton/configs/triton_3_2_0/E=257,N=256,device_name=NVIDIA_H20,dtype=fp8_w8a8,block_shape=[128, 128].json +146 -0
- sglang/srt/layers/moe/fused_moe_triton/configs/triton_3_2_0/E=257,N=256,device_name=NVIDIA_H200,dtype=fp8_w8a8,block_shape=[128, 128].json +146 -0
- sglang/srt/layers/moe/fused_moe_triton/configs/triton_3_2_0/E=8,N=7168,device_name=NVIDIA_H100_80GB_HBM3.json +146 -0
- sglang/srt/layers/moe/fused_moe_triton/configs/triton_3_3_1/E=257,N=256,device_name=NVIDIA_B200,dtype=fp8_w8a8,block_shape=[128, 128].json +146 -0
- sglang/srt/layers/moe/fused_moe_triton/configs/triton_3_3_1/E=8,N=7168,device_name=NVIDIA_H100_80GB_HBM3.json +146 -0
- sglang/srt/layers/moe/fused_moe_triton/fused_moe.py +220 -25
- sglang/srt/layers/moe/fused_moe_triton/layer.py +34 -4
- sglang/srt/layers/moe/topk.py +44 -18
- sglang/srt/layers/multimodal.py +3 -3
- sglang/srt/layers/quantization/__init__.py +3 -2
- sglang/srt/layers/quantization/blockwise_int8.py +3 -0
- sglang/srt/layers/quantization/compressed_tensors/compressed_tensors_moe.py +5 -0
- sglang/srt/layers/quantization/deep_gemm.py +55 -56
- sglang/srt/layers/quantization/fp8.py +28 -23
- sglang/srt/layers/quantization/fp8_kernel.py +118 -66
- sglang/srt/layers/quantization/fp8_utils.py +165 -49
- sglang/srt/layers/quantization/modelopt_quant.py +334 -7
- sglang/srt/layers/quantization/moe_wna16.py +3 -0
- sglang/srt/layers/quantization/w8a8_fp8.py +3 -0
- sglang/srt/layers/quantization/w8a8_int8.py +3 -0
- sglang/srt/layers/rotary_embedding.py +6 -12
- sglang/srt/layers/sampler.py +80 -79
- sglang/srt/layers/utils.py +6 -0
- sglang/srt/lora/layers.py +12 -15
- sglang/srt/lora/lora.py +49 -5
- sglang/srt/lora/lora_manager.py +19 -5
- sglang/srt/lora/mem_pool.py +24 -16
- sglang/srt/lora/utils.py +17 -13
- sglang/srt/managers/data_parallel_controller.py +13 -5
- sglang/srt/managers/eplb_algorithms/__init__.py +63 -0
- sglang/srt/managers/eplb_algorithms/deepseek.py +223 -0
- sglang/srt/managers/{deepseek_eplb.py → eplb_algorithms/deepseek_vec.py} +5 -7
- sglang/srt/managers/eplb_manager.py +55 -14
- sglang/srt/managers/expert_distribution.py +220 -46
- sglang/srt/managers/expert_location.py +110 -56
- sglang/srt/managers/expert_location_dispatch.py +23 -6
- sglang/srt/managers/io_struct.py +15 -4
- sglang/srt/managers/mm_utils.py +88 -38
- sglang/srt/managers/multimodal_processors/base_processor.py +188 -16
- sglang/srt/managers/multimodal_processors/gemma3.py +4 -31
- sglang/srt/managers/multimodal_processors/internvl.py +4 -0
- sglang/srt/managers/multimodal_processors/kimi_vl.py +15 -34
- sglang/srt/managers/multimodal_processors/minicpm.py +2 -1
- sglang/srt/managers/multimodal_processors/phi4mm.py +87 -0
- sglang/srt/managers/multimodal_processors/qwen_vl.py +22 -64
- sglang/srt/managers/schedule_batch.py +140 -38
- sglang/srt/managers/scheduler.py +305 -112
- sglang/srt/managers/tokenizer_manager.py +134 -17
- sglang/srt/managers/utils.py +0 -4
- sglang/srt/metrics/collector.py +9 -0
- sglang/srt/model_executor/cuda_graph_runner.py +72 -61
- sglang/srt/model_executor/expert_location_updater.py +157 -22
- sglang/srt/model_executor/forward_batch_info.py +38 -17
- sglang/srt/model_executor/model_runner.py +96 -56
- sglang/srt/model_loader/utils.py +67 -1
- sglang/srt/models/deepseek_nextn.py +1 -1
- sglang/srt/models/deepseek_v2.py +609 -234
- sglang/srt/models/gemma3_causal.py +7 -0
- sglang/srt/models/gemma3_mm.py +19 -14
- sglang/srt/models/idefics2.py +342 -0
- sglang/srt/models/kimi_vl.py +4 -4
- sglang/srt/models/llama.py +1 -1
- sglang/srt/models/minicpmo.py +2 -5
- sglang/srt/models/minicpmv.py +3 -295
- sglang/srt/models/phi4mm.py +512 -0
- sglang/srt/models/qwen2.py +38 -9
- sglang/srt/models/qwen2_5_vl.py +3 -9
- sglang/srt/models/qwen2_eagle.py +4 -1
- sglang/srt/models/qwen2_moe.py +58 -191
- sglang/srt/models/qwen2_vl.py +3 -9
- sglang/srt/models/qwen3.py +41 -10
- sglang/srt/models/qwen3_moe.py +230 -191
- sglang/srt/models/registry.py +9 -1
- sglang/srt/models/transformers.py +291 -0
- sglang/srt/openai_api/adapter.py +86 -24
- sglang/srt/openai_api/protocol.py +31 -2
- sglang/srt/openai_api/utils.py +172 -0
- sglang/srt/operations.py +37 -2
- sglang/srt/operations_strategy.py +200 -24
- sglang/srt/sampling/sampling_batch_info.py +13 -1
- sglang/srt/sampling/sampling_params.py +2 -1
- sglang/srt/server_args.py +114 -27
- sglang/srt/speculative/build_eagle_tree.py +8 -8
- sglang/srt/speculative/eagle_draft_cuda_graph_runner.py +8 -11
- sglang/srt/speculative/eagle_draft_extend_cuda_graph_runner.py +253 -0
- sglang/srt/speculative/eagle_utils.py +51 -91
- sglang/srt/speculative/eagle_worker.py +101 -21
- sglang/srt/two_batch_overlap.py +635 -0
- sglang/srt/utils.py +129 -7
- sglang/test/runners.py +16 -7
- sglang/test/send_one.py +4 -0
- sglang/test/test_cutlass_moe.py +3 -3
- sglang/test/test_fp4_moe.py +248 -0
- sglang/test/test_utils.py +79 -6
- sglang/version.py +1 -1
- {sglang-0.4.6.post5.dist-info → sglang-0.4.7.dist-info}/METADATA +14 -11
- {sglang-0.4.6.post5.dist-info → sglang-0.4.7.dist-info}/RECORD +318 -291
- {sglang-0.4.6.post5.dist-info → sglang-0.4.7.dist-info}/WHEEL +1 -1
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=1,N=14336,device_name=NVIDIA_A100-SXM4-80GB,dtype=int8_w8a16.json → triton_3_1_0/E=1,N=14336,device_name=NVIDIA_A100-SXM4-80GB,dtype=int8_w8a16.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=1,N=14336,device_name=NVIDIA_A100-SXM4-80GB.json → triton_3_1_0/E=1,N=14336,device_name=NVIDIA_A100-SXM4-80GB.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=1,N=1792,device_name=NVIDIA_A100-SXM4-80GB,dtype=int8_w8a16.json → triton_3_1_0/E=1,N=1792,device_name=NVIDIA_A100-SXM4-80GB,dtype=int8_w8a16.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=1,N=1792,device_name=NVIDIA_A100-SXM4-80GB.json → triton_3_1_0/E=1,N=1792,device_name=NVIDIA_A100-SXM4-80GB.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=1,N=3072,device_name=NVIDIA_A100-SXM4-80GB,dtype=int8_w8a16.json → triton_3_1_0/E=1,N=3072,device_name=NVIDIA_A100-SXM4-80GB,dtype=int8_w8a16.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=1,N=3072,device_name=NVIDIA_H100_80GB_HBM3,dtype=int8_w8a16.json → triton_3_1_0/E=1,N=3072,device_name=NVIDIA_H100_80GB_HBM3,dtype=int8_w8a16.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=1,N=3072,device_name=NVIDIA_H100_80GB_HBM3.json → triton_3_1_0/E=1,N=3072,device_name=NVIDIA_H100_80GB_HBM3.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=1,N=3584,device_name=NVIDIA_A100-SXM4-80GB,dtype=int8_w8a16.json → triton_3_1_0/E=1,N=3584,device_name=NVIDIA_A100-SXM4-80GB,dtype=int8_w8a16.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=1,N=3584,device_name=NVIDIA_A100-SXM4-80GB.json → triton_3_1_0/E=1,N=3584,device_name=NVIDIA_A100-SXM4-80GB.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=1,N=7168,device_name=NVIDIA_A100-SXM4-80GB,dtype=int8_w8a16.json → triton_3_1_0/E=1,N=7168,device_name=NVIDIA_A100-SXM4-80GB,dtype=int8_w8a16.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=1,N=7168,device_name=NVIDIA_A100-SXM4-80GB.json → triton_3_1_0/E=1,N=7168,device_name=NVIDIA_A100-SXM4-80GB.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=144,N=512,device_name=NVIDIA_H100_80GB_HBM3.json → triton_3_1_0/E=144,N=512,device_name=NVIDIA_H100_80GB_HBM3.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=16,N=1024,device_name=NVIDIA_H100_80GB_HBM3.json → triton_3_1_0/E=16,N=1024,device_name=NVIDIA_H100_80GB_HBM3.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=16,N=1024,device_name=NVIDIA_H200.json → triton_3_1_0/E=16,N=1024,device_name=NVIDIA_H200.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=16,N=1344,device_name=NVIDIA_A100-SXM4-40GB.json → triton_3_1_0/E=16,N=1344,device_name=NVIDIA_A100-SXM4-40GB.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=16,N=1344,device_name=NVIDIA_A100-SXM4-80GB.json → triton_3_1_0/E=16,N=1344,device_name=NVIDIA_A100-SXM4-80GB.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=16,N=1344,device_name=NVIDIA_H100_80GB_HBM3.json → triton_3_1_0/E=16,N=1344,device_name=NVIDIA_H100_80GB_HBM3.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=16,N=14336,device_name=NVIDIA_A100-SXM4-80GB,dtype=int8_w8a16.json → triton_3_1_0/E=16,N=14336,device_name=NVIDIA_A100-SXM4-80GB,dtype=int8_w8a16.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=16,N=14336,device_name=NVIDIA_A100-SXM4-80GB.json → triton_3_1_0/E=16,N=14336,device_name=NVIDIA_A100-SXM4-80GB.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=16,N=1792,device_name=NVIDIA_A100-SXM4-80GB,dtype=int8_w8a16.json → triton_3_1_0/E=16,N=1792,device_name=NVIDIA_A100-SXM4-80GB,dtype=int8_w8a16.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=16,N=1792,device_name=NVIDIA_A100-SXM4-80GB.json → triton_3_1_0/E=16,N=1792,device_name=NVIDIA_A100-SXM4-80GB.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=16,N=2048,device_name=NVIDIA_H100_80GB_HBM3.json → triton_3_1_0/E=16,N=2048,device_name=NVIDIA_H100_80GB_HBM3.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=16,N=2688,device_name=NVIDIA_A100-SXM4-80GB.json → triton_3_1_0/E=16,N=2688,device_name=NVIDIA_A100-SXM4-80GB.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=16,N=2688,device_name=NVIDIA_H100_80GB_HBM3.json → triton_3_1_0/E=16,N=2688,device_name=NVIDIA_H100_80GB_HBM3.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=16,N=3072,device_name=NVIDIA_A100-SXM4-80GB,dtype=int8_w8a16.json → triton_3_1_0/E=16,N=3072,device_name=NVIDIA_A100-SXM4-80GB,dtype=int8_w8a16.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=16,N=3072,device_name=NVIDIA_H100_80GB_HBM3,dtype=int8_w8a16.json → triton_3_1_0/E=16,N=3072,device_name=NVIDIA_H100_80GB_HBM3,dtype=int8_w8a16.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=16,N=3200,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8.json → triton_3_1_0/E=16,N=3200,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=16,N=3584,device_name=NVIDIA_A100-SXM4-80GB,dtype=int8_w8a16.json → triton_3_1_0/E=16,N=3584,device_name=NVIDIA_A100-SXM4-80GB,dtype=int8_w8a16.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=16,N=3584,device_name=NVIDIA_A100-SXM4-80GB.json → triton_3_1_0/E=16,N=3584,device_name=NVIDIA_A100-SXM4-80GB.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=16,N=6400,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8.json → triton_3_1_0/E=16,N=6400,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=16,N=7168,device_name=NVIDIA_A100-SXM4-80GB,dtype=int8_w8a16.json → triton_3_1_0/E=16,N=7168,device_name=NVIDIA_A100-SXM4-80GB,dtype=int8_w8a16.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=16,N=7168,device_name=NVIDIA_A100-SXM4-80GB.json → triton_3_1_0/E=16,N=7168,device_name=NVIDIA_A100-SXM4-80GB.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=16,N=7168,device_name=NVIDIA_H100_80GB_HBM3,dtype=int8_w8a16.json → triton_3_1_0/E=16,N=7168,device_name=NVIDIA_H100_80GB_HBM3,dtype=int8_w8a16.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=16,N=800,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8.json → triton_3_1_0/E=16,N=800,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=160,N=192,device_name=NVIDIA_A800-SXM4-80GB.json → triton_3_1_0/E=160,N=192,device_name=NVIDIA_A800-SXM4-80GB.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=20,N=2048,device_name=NVIDIA_H100_80GB_HBM3.json → triton_3_1_0/E=20,N=2048,device_name=NVIDIA_H100_80GB_HBM3.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=24,N=1024,device_name=NVIDIA_H100_80GB_HBM3.json → triton_3_1_0/E=24,N=1024,device_name=NVIDIA_H100_80GB_HBM3.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=256,N=128,device_name=NVIDIA_A100-SXM4-80GB,dtype=int8_w8a8,block_shape=[128, 128].json → triton_3_1_0/E=256,N=128,device_name=NVIDIA_A100-SXM4-80GB,dtype=int8_w8a8,block_shape=[128, 128].json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=256,N=128,device_name=NVIDIA_A100-SXM4-80GB,dtype=int8_w8a8.json → triton_3_1_0/E=256,N=128,device_name=NVIDIA_A100-SXM4-80GB,dtype=int8_w8a8.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=256,N=128,device_name=NVIDIA_A800-SXM4-80GB,dtype=int8_w8a8,block_shape=[128, 128].json → triton_3_1_0/E=256,N=128,device_name=NVIDIA_A800-SXM4-80GB,dtype=int8_w8a8,block_shape=[128, 128].json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=256,N=128,device_name=NVIDIA_A800-SXM4-80GB,dtype=int8_w8a8.json → triton_3_1_0/E=256,N=128,device_name=NVIDIA_A800-SXM4-80GB,dtype=int8_w8a8.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=256,N=128,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8,block_shape=[128, 128].json → triton_3_1_0/E=256,N=128,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8,block_shape=[128, 128].json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=256,N=128,device_name=NVIDIA_H20,block_shape=[128, 128].json → triton_3_1_0/E=256,N=128,device_name=NVIDIA_H20,block_shape=[128, 128].json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=256,N=128,device_name=NVIDIA_H20,dtype=fp8_w8a8,block_shape=[128, 128].json → triton_3_1_0/E=256,N=128,device_name=NVIDIA_H20,dtype=fp8_w8a8,block_shape=[128, 128].json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=256,N=128,device_name=NVIDIA_L20Y,dtype=fp8_w8a8,block_shape=[128, 128].json → triton_3_1_0/E=256,N=128,device_name=NVIDIA_L20Y,dtype=fp8_w8a8,block_shape=[128, 128].json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=256,N=256,device_name=AMD_Instinct_MI300X,dtype=fp8_w8a8,block_shape=[128, 128].json → triton_3_1_0/E=256,N=256,device_name=AMD_Instinct_MI300X,dtype=fp8_w8a8,block_shape=[128, 128].json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=256,N=256,device_name=AMD_Instinct_MI325X,dtype=fp8_w8a8,block_shape=[128, 128].json → triton_3_1_0/E=256,N=256,device_name=AMD_Instinct_MI325X,dtype=fp8_w8a8,block_shape=[128, 128].json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=256,N=256,device_name=AMD_Radeon_Graphics,dtype=fp8_w8a8,block_shape=[128, 128].json → triton_3_1_0/E=256,N=256,device_name=AMD_Radeon_Graphics,dtype=fp8_w8a8,block_shape=[128, 128].json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=256,N=256,device_name=NVIDIA_B200,dtype=fp8_w8a8,block_shape=[128, 128].json → triton_3_1_0/E=256,N=256,device_name=NVIDIA_B200,dtype=fp8_w8a8,block_shape=[128, 128].json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=256,N=256,device_name=NVIDIA_H20,dtype=int8_w8a8,block_shape=[128, 128].json → triton_3_1_0/E=256,N=256,device_name=NVIDIA_H20,dtype=int8_w8a8,block_shape=[128, 128].json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=256,N=256,device_name=NVIDIA_H200,dtype=fp8_w8a8,block_shape=[128, 128].json → triton_3_1_0/E=256,N=256,device_name=NVIDIA_H200,dtype=fp8_w8a8,block_shape=[128, 128].json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=256,N=64,device_name=NVIDIA_A800-SXM4-80GB.json → triton_3_1_0/E=256,N=64,device_name=NVIDIA_A800-SXM4-80GB.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=256,N=64,device_name=NVIDIA_L20,dtype=int8_w8a8.json → triton_3_1_0/E=256,N=64,device_name=NVIDIA_L20,dtype=int8_w8a8.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=256,N=64,device_name=NVIDIA_L40S,dtype=int8_w8a8.json → triton_3_1_0/E=256,N=64,device_name=NVIDIA_L40S,dtype=int8_w8a8.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=64,N=1024,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8,block_shape=[128, 128].json → triton_3_1_0/E=64,N=1024,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8,block_shape=[128, 128].json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=64,N=1280,device_name=NVIDIA_A100-SXM4-80GB.json → triton_3_1_0/E=64,N=1280,device_name=NVIDIA_A100-SXM4-80GB.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=64,N=1280,device_name=NVIDIA_A800-SXM4-80GB.json → triton_3_1_0/E=64,N=1280,device_name=NVIDIA_A800-SXM4-80GB.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=64,N=1280,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8.json → triton_3_1_0/E=64,N=1280,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=64,N=1280,device_name=NVIDIA_H100_80GB_HBM3.json → triton_3_1_0/E=64,N=1280,device_name=NVIDIA_H100_80GB_HBM3.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=64,N=1280,device_name=NVIDIA_H200,dtype=fp8_w8a8.json → triton_3_1_0/E=64,N=1280,device_name=NVIDIA_H200,dtype=fp8_w8a8.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=64,N=1280,device_name=NVIDIA_H200.json → triton_3_1_0/E=64,N=1280,device_name=NVIDIA_H200.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=64,N=2560,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8.json → triton_3_1_0/E=64,N=2560,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=64,N=2560,device_name=NVIDIA_H200,dtype=fp8_w8a8.json → triton_3_1_0/E=64,N=2560,device_name=NVIDIA_H200,dtype=fp8_w8a8.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=64,N=2560,device_name=NVIDIA_H200.json → triton_3_1_0/E=64,N=2560,device_name=NVIDIA_H200.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=64,N=320,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8.json → triton_3_1_0/E=64,N=320,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=64,N=320,device_name=NVIDIA_H100_80GB_HBM3.json → triton_3_1_0/E=64,N=320,device_name=NVIDIA_H100_80GB_HBM3.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=64,N=320,device_name=NVIDIA_H200,dtype=fp8_w8a8.json → triton_3_1_0/E=64,N=320,device_name=NVIDIA_H200,dtype=fp8_w8a8.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=64,N=320,device_name=NVIDIA_H200.json → triton_3_1_0/E=64,N=320,device_name=NVIDIA_H200.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=64,N=512,device_name=NVIDIA_H200,dtype=fp8_w8a8,block_shape=[128, 128].json → triton_3_1_0/E=64,N=512,device_name=NVIDIA_H200,dtype=fp8_w8a8,block_shape=[128, 128].json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=64,N=640,device_name=NVIDIA_A100-SXM4-80GB.json → triton_3_1_0/E=64,N=640,device_name=NVIDIA_A100-SXM4-80GB.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=64,N=640,device_name=NVIDIA_A800-SXM4-80GB.json → triton_3_1_0/E=64,N=640,device_name=NVIDIA_A800-SXM4-80GB.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=64,N=640,device_name=NVIDIA_GeForce_RTX_4090,dtype=fp8_w8a8.json → triton_3_1_0/E=64,N=640,device_name=NVIDIA_GeForce_RTX_4090,dtype=fp8_w8a8.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=64,N=640,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8.json → triton_3_1_0/E=64,N=640,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=64,N=640,device_name=NVIDIA_H100_80GB_HBM3.json → triton_3_1_0/E=64,N=640,device_name=NVIDIA_H100_80GB_HBM3.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=64,N=640,device_name=NVIDIA_H200,dtype=fp8_w8a8.json → triton_3_1_0/E=64,N=640,device_name=NVIDIA_H200,dtype=fp8_w8a8.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=64,N=640,device_name=NVIDIA_H200.json → triton_3_1_0/E=64,N=640,device_name=NVIDIA_H200.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=8,N=14336,device_name=AMD_Instinct_MI300X.json → triton_3_1_0/E=8,N=14336,device_name=AMD_Instinct_MI300X.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=8,N=14336,device_name=AMD_Instinct_MI325X.json → triton_3_1_0/E=8,N=14336,device_name=AMD_Instinct_MI325X.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=8,N=14336,device_name=AMD_Radeon_Graphics.json → triton_3_1_0/E=8,N=14336,device_name=AMD_Radeon_Graphics.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=8,N=14336,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8.json → triton_3_1_0/E=8,N=14336,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=8,N=14336,device_name=NVIDIA_H200,dtype=fp8_w8a8.json → triton_3_1_0/E=8,N=14336,device_name=NVIDIA_H200,dtype=fp8_w8a8.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=8,N=14336,device_name=NVIDIA_H200.json → triton_3_1_0/E=8,N=14336,device_name=NVIDIA_H200.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=8,N=1792,device_name=AMD_Instinct_MI300X.json → triton_3_1_0/E=8,N=1792,device_name=AMD_Instinct_MI300X.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=8,N=1792,device_name=AMD_Instinct_MI325X.json → triton_3_1_0/E=8,N=1792,device_name=AMD_Instinct_MI325X.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=8,N=1792,device_name=AMD_Radeon_Graphics.json → triton_3_1_0/E=8,N=1792,device_name=AMD_Radeon_Graphics.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=8,N=1792,device_name=NVIDIA_A100-SXM4-40GB.json → triton_3_1_0/E=8,N=1792,device_name=NVIDIA_A100-SXM4-40GB.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=8,N=1792,device_name=NVIDIA_A100-SXM4-80GB.json → triton_3_1_0/E=8,N=1792,device_name=NVIDIA_A100-SXM4-80GB.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=8,N=1792,device_name=NVIDIA_H100_80GB_HBM3.json → triton_3_1_0/E=8,N=1792,device_name=NVIDIA_H100_80GB_HBM3.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=8,N=1792,device_name=NVIDIA_H200,dtype=fp8_w8a8.json → triton_3_1_0/E=8,N=1792,device_name=NVIDIA_H200,dtype=fp8_w8a8.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=8,N=1792,device_name=NVIDIA_H200.json → triton_3_1_0/E=8,N=1792,device_name=NVIDIA_H200.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=8,N=2048,device_name=NVIDIA_A100-SXM4-80GB.json → triton_3_1_0/E=8,N=2048,device_name=NVIDIA_A100-SXM4-80GB.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=8,N=2048,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8.json → triton_3_1_0/E=8,N=2048,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=8,N=2048,device_name=NVIDIA_H100_80GB_HBM3.json → triton_3_1_0/E=8,N=2048,device_name=NVIDIA_H100_80GB_HBM3.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=8,N=2048,device_name=NVIDIA_H200,dtype=fp8_w8a8.json → triton_3_1_0/E=8,N=2048,device_name=NVIDIA_H200,dtype=fp8_w8a8.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=8,N=2048,device_name=NVIDIA_H200.json → triton_3_1_0/E=8,N=2048,device_name=NVIDIA_H200.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=8,N=3584,device_name=AMD_Instinct_MI300X.json → triton_3_1_0/E=8,N=3584,device_name=AMD_Instinct_MI300X.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=8,N=3584,device_name=AMD_Instinct_MI325X.json → triton_3_1_0/E=8,N=3584,device_name=AMD_Instinct_MI325X.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=8,N=3584,device_name=AMD_Radeon_Graphics.json → triton_3_1_0/E=8,N=3584,device_name=AMD_Radeon_Graphics.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=8,N=3584,device_name=NVIDIA_A100-SXM4-40GB.json → triton_3_1_0/E=8,N=3584,device_name=NVIDIA_A100-SXM4-40GB.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=8,N=3584,device_name=NVIDIA_A100-SXM4-80GB.json → triton_3_1_0/E=8,N=3584,device_name=NVIDIA_A100-SXM4-80GB.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=8,N=3584,device_name=NVIDIA_GeForce_RTX_4090,dtype=fp8_w8a8.json → triton_3_1_0/E=8,N=3584,device_name=NVIDIA_GeForce_RTX_4090,dtype=fp8_w8a8.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=8,N=3584,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8.json → triton_3_1_0/E=8,N=3584,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=8,N=3584,device_name=NVIDIA_H100_80GB_HBM3.json → triton_3_1_0/E=8,N=3584,device_name=NVIDIA_H100_80GB_HBM3.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=8,N=3584,device_name=NVIDIA_H200,dtype=fp8_w8a8.json → triton_3_1_0/E=8,N=3584,device_name=NVIDIA_H200,dtype=fp8_w8a8.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=8,N=3584,device_name=NVIDIA_H200.json → triton_3_1_0/E=8,N=3584,device_name=NVIDIA_H200.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=8,N=3584,device_name=NVIDIA_L40S.json → triton_3_1_0/E=8,N=3584,device_name=NVIDIA_L40S.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=8,N=4096,device_name=AMD_Instinct_MI300X,dtype=fp8_w8a8.json → triton_3_1_0/E=8,N=4096,device_name=AMD_Instinct_MI300X,dtype=fp8_w8a8.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=8,N=4096,device_name=AMD_Instinct_MI325X,dtype=fp8_w8a8.json → triton_3_1_0/E=8,N=4096,device_name=AMD_Instinct_MI325X,dtype=fp8_w8a8.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=8,N=4096,device_name=AMD_Radeon_Graphics,dtype=fp8_w8a8.json → triton_3_1_0/E=8,N=4096,device_name=AMD_Radeon_Graphics,dtype=fp8_w8a8.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=8,N=4096,device_name=NVIDIA_A100-SXM4-80GB.json → triton_3_1_0/E=8,N=4096,device_name=NVIDIA_A100-SXM4-80GB.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=8,N=4096,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8.json → triton_3_1_0/E=8,N=4096,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=8,N=4096,device_name=NVIDIA_H100_80GB_HBM3.json → triton_3_1_0/E=8,N=4096,device_name=NVIDIA_H100_80GB_HBM3.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=8,N=4096,device_name=NVIDIA_H200,dtype=fp8_w8a8.json → triton_3_1_0/E=8,N=4096,device_name=NVIDIA_H200,dtype=fp8_w8a8.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=8,N=4096,device_name=NVIDIA_H200.json → triton_3_1_0/E=8,N=4096,device_name=NVIDIA_H200.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=8,N=7168,device_name=AMD_Instinct_MI300X.json → triton_3_1_0/E=8,N=7168,device_name=AMD_Instinct_MI300X.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=8,N=7168,device_name=AMD_Instinct_MI325X.json → triton_3_1_0/E=8,N=7168,device_name=AMD_Instinct_MI325X.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=8,N=7168,device_name=AMD_Radeon_Graphics.json → triton_3_1_0/E=8,N=7168,device_name=AMD_Radeon_Graphics.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=8,N=7168,device_name=NVIDIA_A100-SXM4-80GB.json → triton_3_1_0/E=8,N=7168,device_name=NVIDIA_A100-SXM4-80GB.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=8,N=7168,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8.json → triton_3_1_0/E=8,N=7168,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=8,N=7168,device_name=NVIDIA_H100_80GB_HBM3.json → triton_3_1_0/E=8,N=7168,device_name=NVIDIA_H100_80GB_HBM3.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=8,N=7168,device_name=NVIDIA_H200,dtype=fp8_w8a8.json → triton_3_1_0/E=8,N=7168,device_name=NVIDIA_H200,dtype=fp8_w8a8.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=8,N=7168,device_name=NVIDIA_H200.json → triton_3_1_0/E=8,N=7168,device_name=NVIDIA_H200.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=8,N=8192,device_name=AMD_Instinct_MI300X,dtype=fp8_w8a8.json → triton_3_1_0/E=8,N=8192,device_name=AMD_Instinct_MI300X,dtype=fp8_w8a8.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=8,N=8192,device_name=AMD_Instinct_MI325X,dtype=fp8_w8a8.json → triton_3_1_0/E=8,N=8192,device_name=AMD_Instinct_MI325X,dtype=fp8_w8a8.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=8,N=8192,device_name=AMD_Radeon_Graphics,dtype=fp8_w8a8.json → triton_3_1_0/E=8,N=8192,device_name=AMD_Radeon_Graphics,dtype=fp8_w8a8.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=8,N=8192,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8.json → triton_3_1_0/E=8,N=8192,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=8,N=8192,device_name=NVIDIA_H200,dtype=fp8_w8a8.json → triton_3_1_0/E=8,N=8192,device_name=NVIDIA_H200,dtype=fp8_w8a8.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=128,N=192,device_name=NVIDIA_A800-SXM4-80GB.json → triton_3_2_0/E=128,N=192,device_name=NVIDIA_A800-SXM4-80GB.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=128,N=192,device_name=NVIDIA_H100_80GB_HBM3.json → triton_3_2_0/E=128,N=192,device_name=NVIDIA_H100_80GB_HBM3.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=128,N=192,device_name=NVIDIA_H20.json → triton_3_2_0/E=128,N=192,device_name=NVIDIA_H20.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=128,N=192,device_name=NVIDIA_H200.json → triton_3_2_0/E=128,N=192,device_name=NVIDIA_H200.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=128,N=384,device_name=NVIDIA_H100_80GB_HBM3.json → triton_3_2_0/E=128,N=384,device_name=NVIDIA_H100_80GB_HBM3.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=128,N=384,device_name=NVIDIA_H20,dtype=fp8_w8a8,block_shape=[128, 128].json → triton_3_2_0/E=128,N=384,device_name=NVIDIA_H20,dtype=fp8_w8a8,block_shape=[128, 128].json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=128,N=384,device_name=NVIDIA_H20.json → triton_3_2_0/E=128,N=384,device_name=NVIDIA_H20.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=128,N=384,device_name=NVIDIA_H200,dtype=fp8_w8a8,block_shape=[128, 128].json → triton_3_2_0/E=128,N=384,device_name=NVIDIA_H200,dtype=fp8_w8a8,block_shape=[128, 128].json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=128,N=384,device_name=NVIDIA_H200.json → triton_3_2_0/E=128,N=384,device_name=NVIDIA_H200.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=128,N=512,device_name=NVIDIA_H100_80GB_HBM3.json → triton_3_2_0/E=128,N=512,device_name=NVIDIA_H100_80GB_HBM3.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=128,N=768,device_name=NVIDIA_A800-SXM4-80GB.json → triton_3_2_0/E=128,N=768,device_name=NVIDIA_A800-SXM4-80GB.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=128,N=768,device_name=NVIDIA_H100_80GB_HBM3.json → triton_3_2_0/E=128,N=768,device_name=NVIDIA_H100_80GB_HBM3.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=128,N=768,device_name=NVIDIA_H20,dtype=fp8_w8a8,block_shape=[128, 128].json → triton_3_2_0/E=128,N=768,device_name=NVIDIA_H20,dtype=fp8_w8a8,block_shape=[128, 128].json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=128,N=768,device_name=NVIDIA_H20.json → triton_3_2_0/E=128,N=768,device_name=NVIDIA_H20.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=128,N=768,device_name=NVIDIA_H200,dtype=fp8_w8a8,block_shape=[128, 128].json → triton_3_2_0/E=128,N=768,device_name=NVIDIA_H200,dtype=fp8_w8a8,block_shape=[128, 128].json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=128,N=768,device_name=NVIDIA_H200.json → triton_3_2_0/E=128,N=768,device_name=NVIDIA_H200.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=128,N=96,device_name=NVIDIA_H20.json → triton_3_2_0/E=128,N=96,device_name=NVIDIA_H20.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=264,N=128,device_name=NVIDIA_A800-SXM4-80GB,dtype=int8_w8a8.json → triton_3_2_0/E=264,N=128,device_name=NVIDIA_A800-SXM4-80GB,dtype=int8_w8a8.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=264,N=256,device_name=NVIDIA_B200,dtype=fp8_w8a8,block_shape=[128, 128].json → triton_3_2_0/E=264,N=256,device_name=NVIDIA_B200,dtype=fp8_w8a8,block_shape=[128, 128].json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=264,N=256,device_name=NVIDIA_H20,dtype=fp8_w8a8,block_shape=[128, 128].json → triton_3_2_0/E=264,N=256,device_name=NVIDIA_H20,dtype=fp8_w8a8,block_shape=[128, 128].json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=264,N=256,device_name=NVIDIA_H200,dtype=fp8_w8a8,block_shape=[128, 128].json → triton_3_2_0/E=264,N=256,device_name=NVIDIA_H200,dtype=fp8_w8a8,block_shape=[128, 128].json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=272,N=128,device_name=NVIDIA_A800-SXM4-80GB,dtype=int8_w8a8.json → triton_3_2_0/E=272,N=128,device_name=NVIDIA_A800-SXM4-80GB,dtype=int8_w8a8.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=272,N=128,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8,block_shape=[128, 128].json → triton_3_2_0/E=272,N=128,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8,block_shape=[128, 128].json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=272,N=128,device_name=NVIDIA_H20,dtype=fp8_w8a8,block_shape=[128, 128].json → triton_3_2_0/E=272,N=128,device_name=NVIDIA_H20,dtype=fp8_w8a8,block_shape=[128, 128].json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=272,N=64,device_name=NVIDIA_A800-SXM4-80GB.json → triton_3_2_0/E=272,N=64,device_name=NVIDIA_A800-SXM4-80GB.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=288,N=64,device_name=NVIDIA_A800-SXM4-80GB.json → triton_3_2_0/E=288,N=64,device_name=NVIDIA_A800-SXM4-80GB.json} +0 -0
- {sglang-0.4.6.post5.dist-info → sglang-0.4.7.dist-info}/licenses/LICENSE +0 -0
- {sglang-0.4.6.post5.dist-info → sglang-0.4.7.dist-info}/top_level.txt +0 -0
sglang/srt/managers/scheduler.py
CHANGED
@@ -24,6 +24,7 @@ from collections import defaultdict, deque
|
|
24
24
|
from concurrent import futures
|
25
25
|
from dataclasses import dataclass
|
26
26
|
from http import HTTPStatus
|
27
|
+
from pathlib import Path
|
27
28
|
from types import SimpleNamespace
|
28
29
|
from typing import Dict, List, Optional, Tuple, Union
|
29
30
|
|
@@ -35,7 +36,10 @@ from torch.distributed import barrier
|
|
35
36
|
|
36
37
|
from sglang.global_config import global_config
|
37
38
|
from sglang.srt.configs.model_config import ModelConfig
|
38
|
-
from sglang.srt.constrained.base_grammar_backend import
|
39
|
+
from sglang.srt.constrained.base_grammar_backend import (
|
40
|
+
INVALID_GRAMMAR_OBJ,
|
41
|
+
create_grammar_backend,
|
42
|
+
)
|
39
43
|
from sglang.srt.disaggregation.decode import (
|
40
44
|
DecodePreallocQueue,
|
41
45
|
DecodeTransferQueue,
|
@@ -62,7 +66,6 @@ from sglang.srt.hf_transformers_utils import (
|
|
62
66
|
from sglang.srt.layers.dp_attention import compute_dp_attention_world_info
|
63
67
|
from sglang.srt.layers.logits_processor import LogitsProcessorOutput
|
64
68
|
from sglang.srt.managers.expert_distribution import (
|
65
|
-
ExpertDistributionRecorder,
|
66
69
|
get_global_expert_distribution_recorder,
|
67
70
|
)
|
68
71
|
from sglang.srt.managers.io_struct import (
|
@@ -132,11 +135,14 @@ from sglang.srt.reasoning_parser import ReasoningParser
|
|
132
135
|
from sglang.srt.server_args import PortArgs, ServerArgs
|
133
136
|
from sglang.srt.speculative.spec_info import SpeculativeAlgorithm
|
134
137
|
from sglang.srt.torch_memory_saver_adapter import TorchMemorySaverAdapter
|
138
|
+
from sglang.srt.two_batch_overlap import TboDPAttentionPreparer
|
135
139
|
from sglang.srt.utils import (
|
140
|
+
DeepEPMode,
|
136
141
|
DynamicGradMode,
|
137
142
|
broadcast_pyobj,
|
138
143
|
configure_logger,
|
139
144
|
disable_request_logging,
|
145
|
+
get_available_gpu_memory,
|
140
146
|
get_bool_env_var,
|
141
147
|
get_zmq_socket,
|
142
148
|
kill_itself_when_parent_died,
|
@@ -210,7 +216,6 @@ class Scheduler(
|
|
210
216
|
self.gpu_id = gpu_id
|
211
217
|
self.enable_hierarchical_cache = server_args.enable_hierarchical_cache
|
212
218
|
self.page_size = server_args.page_size
|
213
|
-
# Distributed rank info
|
214
219
|
self.dp_size = server_args.dp_size
|
215
220
|
self.attn_tp_rank, self.attn_tp_size, self.attn_dp_rank = (
|
216
221
|
compute_dp_attention_world_info(
|
@@ -330,12 +335,16 @@ class Scheduler(
|
|
330
335
|
|
331
336
|
# Print debug info
|
332
337
|
if tp_rank == 0:
|
338
|
+
avail_mem = get_available_gpu_memory(
|
339
|
+
self.device, self.gpu_id, empty_cache=False
|
340
|
+
)
|
333
341
|
logger.info(
|
334
342
|
f"max_total_num_tokens={self.max_total_num_tokens}, "
|
335
343
|
f"chunked_prefill_size={server_args.chunked_prefill_size}, "
|
336
344
|
f"max_prefill_tokens={self.max_prefill_tokens}, "
|
337
345
|
f"max_running_requests={self.max_running_requests}, "
|
338
|
-
f"context_len={self.model_config.context_len}"
|
346
|
+
f"context_len={self.model_config.context_len}, "
|
347
|
+
f"available_gpu_mem={avail_mem:.2f} GB"
|
339
348
|
)
|
340
349
|
|
341
350
|
# Init memory pool and cache
|
@@ -359,6 +368,7 @@ class Scheduler(
|
|
359
368
|
self.current_stream = torch.get_device_module(self.device).current_stream()
|
360
369
|
if self.device == "cpu":
|
361
370
|
self.current_stream.synchronize = lambda: None # No-op for CPU
|
371
|
+
self.forward_sleep_time = None
|
362
372
|
|
363
373
|
# Init session info
|
364
374
|
self.sessions: Dict[str, Session] = {}
|
@@ -420,10 +430,16 @@ class Scheduler(
|
|
420
430
|
self.torch_profiler = None
|
421
431
|
self.torch_profiler_output_dir: Optional[str] = None
|
422
432
|
self.profiler_activities: Optional[List[str]] = None
|
423
|
-
self.
|
433
|
+
self.profile_id: Optional[str] = None
|
424
434
|
self.profiler_target_forward_ct: Optional[int] = None
|
425
|
-
|
426
|
-
self.
|
435
|
+
self.profiler_target_prefill_ct: Optional[int] = None
|
436
|
+
self.profiler_target_decode_ct: Optional[int] = None
|
437
|
+
self.profiler_prefill_ct: Optional[int] = None
|
438
|
+
self.profiler_decode_ct: Optional[int] = None
|
439
|
+
self.profile_by_stage: bool = False
|
440
|
+
self.profile_steps: Optional[int] = None
|
441
|
+
self.profile_in_progress: bool = False
|
442
|
+
self.rpd_profiler = None
|
427
443
|
|
428
444
|
# Init metrics stats
|
429
445
|
self.init_metrics()
|
@@ -556,7 +572,9 @@ class Scheduler(
|
|
556
572
|
|
557
573
|
def init_kv_events(self, kv_events_config: Optional[str]):
|
558
574
|
if self.enable_kv_cache_events:
|
559
|
-
self.kv_event_publisher = EventPublisherFactory.create(
|
575
|
+
self.kv_event_publisher = EventPublisherFactory.create(
|
576
|
+
kv_events_config, self.attn_dp_rank
|
577
|
+
)
|
560
578
|
|
561
579
|
def init_disaggregation(self):
|
562
580
|
self.transfer_backend = TransferBackend(
|
@@ -931,18 +949,19 @@ class Scheduler(
|
|
931
949
|
bootstrap_host=recv_req.bootstrap_host,
|
932
950
|
bootstrap_port=recv_req.bootstrap_port,
|
933
951
|
bootstrap_room=recv_req.bootstrap_room,
|
952
|
+
data_parallel_rank=recv_req.data_parallel_rank,
|
934
953
|
)
|
935
954
|
req.tokenizer = self.tokenizer
|
936
955
|
|
937
956
|
if self.disaggregation_mode != DisaggregationMode.NULL:
|
938
957
|
# Invalid request for disaggregated mode
|
939
958
|
if recv_req.bootstrap_room is None:
|
940
|
-
|
959
|
+
error_msg = (
|
941
960
|
f"Invalid request: Disaggregated request received without "
|
942
961
|
f"boostrap room id. {req.rid=}"
|
943
962
|
)
|
944
|
-
logger.error(
|
945
|
-
prepare_abort(req,
|
963
|
+
logger.error(error_msg)
|
964
|
+
prepare_abort(req, error_msg)
|
946
965
|
self.stream_output([req], req.return_logprob)
|
947
966
|
return
|
948
967
|
|
@@ -973,29 +992,23 @@ class Scheduler(
|
|
973
992
|
req.extend_image_inputs(image_inputs)
|
974
993
|
|
975
994
|
if len(req.origin_input_ids) >= self.max_req_input_len:
|
976
|
-
|
977
|
-
|
978
|
-
|
979
|
-
|
980
|
-
|
981
|
-
req.origin_input_ids = [0]
|
982
|
-
req.multimodal_inputs = None
|
983
|
-
req.sampling_params.max_new_tokens = 0
|
984
|
-
req.finished_reason = FINISH_ABORT(
|
985
|
-
error_msg, HTTPStatus.BAD_REQUEST, "BadRequestError"
|
995
|
+
req.set_finish_with_abort(
|
996
|
+
error_msg=(
|
997
|
+
"Multimodal prompt is too long after expanding multimodal tokens. "
|
998
|
+
f"After expanding {len(req.origin_input_ids_unpadded)=} => {len(req.origin_input_ids)} >= {self.max_req_input_len}."
|
999
|
+
)
|
986
1000
|
)
|
987
1001
|
self._add_request_to_queue(req)
|
988
1002
|
return
|
989
1003
|
|
990
|
-
# Validate
|
1004
|
+
# Validate prompt length
|
991
1005
|
error_msg = validate_input_length(
|
992
1006
|
req,
|
993
1007
|
self.max_req_input_len,
|
994
1008
|
self.server_args.allow_auto_truncate,
|
995
1009
|
)
|
996
1010
|
if error_msg:
|
997
|
-
req.
|
998
|
-
req.sampling_params.max_new_tokens = 0
|
1011
|
+
req.set_finish_with_abort(error_msg)
|
999
1012
|
self._add_request_to_queue(req)
|
1000
1013
|
return
|
1001
1014
|
|
@@ -1007,12 +1020,9 @@ class Scheduler(
|
|
1007
1020
|
req.logprob_start_len = recv_req.logprob_start_len
|
1008
1021
|
|
1009
1022
|
if req.logprob_start_len >= len(req.origin_input_ids):
|
1010
|
-
req.
|
1011
|
-
f"logprob_start_len, ({req.logprob_start_len}) is higher than the number of input tokens ({len(req.origin_input_ids)}). Request with a lower logprob_start_len.",
|
1012
|
-
HTTPStatus.BAD_REQUEST,
|
1013
|
-
"BadRequestError",
|
1014
|
-
)
|
1023
|
+
error_msg = f"{req.logprob_start_len=} is higher than the number of input tokens {len(req.origin_input_ids)=}. Please use a smaller logprob_start_len."
|
1015
1024
|
req.logprob_start_len = len(req.origin_input_ids) - 1
|
1025
|
+
req.set_finish_with_abort(error_msg)
|
1016
1026
|
self._add_request_to_queue(req)
|
1017
1027
|
return
|
1018
1028
|
|
@@ -1049,6 +1059,10 @@ class Scheduler(
|
|
1049
1059
|
if not cache_hit:
|
1050
1060
|
req.grammar_key = key
|
1051
1061
|
add_to_grammar_queue = True
|
1062
|
+
else:
|
1063
|
+
if value is INVALID_GRAMMAR_OBJ: # We hit a cached invalid grammar.
|
1064
|
+
error_msg = f"Invalid grammar request with cache hit: {key=}"
|
1065
|
+
req.set_finish_with_abort(error_msg)
|
1052
1066
|
|
1053
1067
|
if add_to_grammar_queue:
|
1054
1068
|
req.queue_time_start = time.perf_counter()
|
@@ -1096,19 +1110,13 @@ class Scheduler(
|
|
1096
1110
|
req.extend_image_inputs(image_inputs)
|
1097
1111
|
|
1098
1112
|
if len(req.origin_input_ids) >= self.max_req_input_len:
|
1099
|
-
|
1100
|
-
|
1101
|
-
|
1102
|
-
|
1103
|
-
|
1104
|
-
req.origin_input_ids = [0]
|
1105
|
-
req.multimodal_inputs = None
|
1106
|
-
req.sampling_params.max_new_tokens = 0
|
1107
|
-
req.finished_reason = FINISH_ABORT(
|
1108
|
-
error_msg, HTTPStatus.BAD_REQUEST, "BadRequestError"
|
1113
|
+
req.set_finish_with_abort(
|
1114
|
+
error_msg=(
|
1115
|
+
"Multimodal prompt is too long after expanding multimodal tokens. "
|
1116
|
+
f"After expanding {len(req.origin_input_ids_unpadded)=} => {len(req.origin_input_ids)} >= {self.max_req_input_len}."
|
1117
|
+
)
|
1109
1118
|
)
|
1110
|
-
|
1111
|
-
self.waiting_queue.append(req)
|
1119
|
+
self._add_request_to_queue(req)
|
1112
1120
|
return
|
1113
1121
|
|
1114
1122
|
# Validate prompts length
|
@@ -1154,7 +1162,8 @@ class Scheduler(
|
|
1154
1162
|
if self.disaggregation_mode == DisaggregationMode.PREFILL:
|
1155
1163
|
f += f"#unbootstrapped-req: {len(self.disagg_prefill_bootstrap_queue.queue)}, "
|
1156
1164
|
f += f"#queue-req: {len(self.waiting_queue)}, "
|
1157
|
-
f += f"#transferring-req: {len(self.disagg_prefill_inflight_queue)} "
|
1165
|
+
f += f"#transferring-req: {len(self.disagg_prefill_inflight_queue)}, "
|
1166
|
+
f += f"time: {gap_latency:.2f} "
|
1158
1167
|
else:
|
1159
1168
|
f += f"#queue-req: {len(self.waiting_queue)}"
|
1160
1169
|
|
@@ -1515,7 +1524,7 @@ class Scheduler(
|
|
1515
1524
|
self.new_token_ratio = new_token_ratio
|
1516
1525
|
|
1517
1526
|
logger.info(
|
1518
|
-
"
|
1527
|
+
"KV cache pool is full. Retract requests. "
|
1519
1528
|
f"#retracted_reqs: {len(retracted_reqs)}, "
|
1520
1529
|
f"#new_token_ratio: {old_ratio:.4f} -> {self.new_token_ratio:.4f}"
|
1521
1530
|
)
|
@@ -1539,13 +1548,8 @@ class Scheduler(
|
|
1539
1548
|
"""Run a batch."""
|
1540
1549
|
self.forward_ct += 1
|
1541
1550
|
|
1542
|
-
#
|
1543
|
-
|
1544
|
-
self.profiler_target_forward_ct
|
1545
|
-
and self.profiler_target_forward_ct <= self.forward_ct
|
1546
|
-
):
|
1547
|
-
self.send_to_tokenizer.send_pyobj(self.stop_profile())
|
1548
|
-
|
1551
|
+
# Whether to run the profiler
|
1552
|
+
self._profile_batch_predicate(batch)
|
1549
1553
|
if self.forward_sleep_time is not None:
|
1550
1554
|
logger.info(f"Scheduler.run_batch sleep {self.forward_sleep_time}s")
|
1551
1555
|
time.sleep(self.forward_sleep_time)
|
@@ -1571,10 +1575,9 @@ class Scheduler(
|
|
1571
1575
|
num_accepted_tokens,
|
1572
1576
|
can_run_cuda_graph,
|
1573
1577
|
) = self.draft_worker.forward_batch_speculative_generation(batch)
|
1574
|
-
|
1575
|
-
|
1576
|
-
|
1577
|
-
self.spec_num_total_forward_ct += batch.batch_size()
|
1578
|
+
bs = batch.batch_size()
|
1579
|
+
self.spec_num_total_accepted_tokens += num_accepted_tokens + bs
|
1580
|
+
self.spec_num_total_forward_ct += bs
|
1578
1581
|
self.num_generated_tokens += num_accepted_tokens
|
1579
1582
|
|
1580
1583
|
if self.pp_group.is_last_rank:
|
@@ -1648,6 +1651,9 @@ class Scheduler(
|
|
1648
1651
|
disable_cuda_graph=self.server_args.disable_cuda_graph,
|
1649
1652
|
spec_algorithm=self.spec_algorithm,
|
1650
1653
|
speculative_num_draft_tokens=self.server_args.speculative_num_draft_tokens,
|
1654
|
+
enable_two_batch_overlap=self.server_args.enable_two_batch_overlap,
|
1655
|
+
enable_deepep_moe=self.server_args.enable_deepep_moe,
|
1656
|
+
deepep_mode=DeepEPMode[self.server_args.deepep_mode],
|
1651
1657
|
)
|
1652
1658
|
|
1653
1659
|
@staticmethod
|
@@ -1661,6 +1667,9 @@ class Scheduler(
|
|
1661
1667
|
disable_cuda_graph: bool,
|
1662
1668
|
spec_algorithm,
|
1663
1669
|
speculative_num_draft_tokens,
|
1670
|
+
enable_two_batch_overlap: bool,
|
1671
|
+
enable_deepep_moe: bool,
|
1672
|
+
deepep_mode: DeepEPMode,
|
1664
1673
|
):
|
1665
1674
|
# Check if other DP workers have running batches
|
1666
1675
|
if local_batch is None:
|
@@ -1696,17 +1705,26 @@ class Scheduler(
|
|
1696
1705
|
is_extend_in_batch = (
|
1697
1706
|
local_batch.forward_mode.is_extend() if local_batch else False
|
1698
1707
|
)
|
1708
|
+
|
1709
|
+
tbo_preparer = TboDPAttentionPreparer()
|
1710
|
+
|
1699
1711
|
local_info = torch.tensor(
|
1700
1712
|
[
|
1701
1713
|
num_tokens,
|
1702
1714
|
can_cuda_graph,
|
1703
1715
|
num_tokens_for_logprob,
|
1704
1716
|
is_extend_in_batch,
|
1717
|
+
*tbo_preparer.prepare_all_gather(
|
1718
|
+
local_batch,
|
1719
|
+
deepep_mode,
|
1720
|
+
enable_deepep_moe,
|
1721
|
+
enable_two_batch_overlap,
|
1722
|
+
),
|
1705
1723
|
],
|
1706
1724
|
dtype=torch.int64,
|
1707
1725
|
)
|
1708
1726
|
global_info = torch.empty(
|
1709
|
-
(dp_size, attn_tp_size,
|
1727
|
+
(dp_size, attn_tp_size, 6),
|
1710
1728
|
dtype=torch.int64,
|
1711
1729
|
)
|
1712
1730
|
torch.distributed.all_gather_into_tensor(
|
@@ -1719,6 +1737,10 @@ class Scheduler(
|
|
1719
1737
|
global_num_tokens_for_logprob = global_info[:, 0, 2].tolist()
|
1720
1738
|
is_extend_in_batch = global_info[:, 0, 3].tolist()
|
1721
1739
|
|
1740
|
+
tbo_split_seq_index, global_forward_mode = tbo_preparer.compute_output(
|
1741
|
+
global_info[:, :, 4:6]
|
1742
|
+
)
|
1743
|
+
|
1722
1744
|
if local_batch is None and max(global_num_tokens) > 0:
|
1723
1745
|
local_batch = get_idle_batch()
|
1724
1746
|
|
@@ -1732,6 +1754,8 @@ class Scheduler(
|
|
1732
1754
|
local_batch.global_num_tokens_for_logprob = (
|
1733
1755
|
global_num_tokens_for_logprob
|
1734
1756
|
)
|
1757
|
+
local_batch.tbo_split_seq_index = tbo_split_seq_index
|
1758
|
+
local_batch.global_forward_mode = global_forward_mode
|
1735
1759
|
|
1736
1760
|
# Check forward mode for cuda graph
|
1737
1761
|
if not disable_cuda_graph:
|
@@ -1757,17 +1781,25 @@ class Scheduler(
|
|
1757
1781
|
"""Move requests whose grammar objects are ready from grammar_queue to waiting_queue."""
|
1758
1782
|
|
1759
1783
|
num_ready_reqs = 0
|
1760
|
-
|
1784
|
+
num_timeout_reqs = 0
|
1761
1785
|
for req in self.grammar_queue:
|
1762
1786
|
try:
|
1787
|
+
if req.finished(): # It is aborted by AbortReq
|
1788
|
+
num_ready_reqs += 1
|
1789
|
+
continue
|
1763
1790
|
req.grammar = req.grammar.result(timeout=0.03)
|
1764
|
-
|
1765
|
-
|
1791
|
+
self.grammar_backend.set_cache(req.grammar_key, req.grammar.copy())
|
1792
|
+
if req.grammar is INVALID_GRAMMAR_OBJ:
|
1793
|
+
req.set_finish_with_abort(
|
1794
|
+
f"Invalid grammar request: {req.grammar_key=}"
|
1795
|
+
)
|
1766
1796
|
num_ready_reqs += 1
|
1767
1797
|
except futures._base.TimeoutError:
|
1768
1798
|
req.grammar_wait_ct += 1
|
1799
|
+
# NOTE(lianmin): this timeout is the waiting time of the above line. It is
|
1800
|
+
# not the waiting time from it enters the grammar queue.
|
1769
1801
|
if req.grammar_wait_ct > GRAMMAR_TIMEOUT / 0.03:
|
1770
|
-
|
1802
|
+
num_timeout_reqs = 1
|
1771
1803
|
break
|
1772
1804
|
|
1773
1805
|
if self.server_args.enable_dp_attention:
|
@@ -1779,28 +1811,33 @@ class Scheduler(
|
|
1779
1811
|
|
1780
1812
|
if tp_size > 1:
|
1781
1813
|
# Sync across TP ranks to make sure they have the same number of ready requests
|
1782
|
-
tensor = torch.tensor([num_ready_reqs,
|
1814
|
+
tensor = torch.tensor([num_ready_reqs, num_timeout_reqs], dtype=torch.int32)
|
1783
1815
|
torch.distributed.all_reduce(
|
1784
1816
|
tensor, op=torch.distributed.ReduceOp.MAX, group=tp_group
|
1785
1817
|
)
|
1786
|
-
num_ready_reqs_max,
|
1818
|
+
num_ready_reqs_max, num_timeout_reqs_max = tensor.tolist()
|
1787
1819
|
|
1788
1820
|
for i in range(num_ready_reqs, num_ready_reqs_max):
|
1789
1821
|
req = self.grammar_queue[i]
|
1822
|
+
if req.finished(): # It is aborted by AbortReq
|
1823
|
+
continue
|
1790
1824
|
req.grammar = req.grammar.result()
|
1791
|
-
|
1792
|
-
|
1825
|
+
self.grammar_backend.set_cache(req.grammar_key, req.grammar.copy())
|
1826
|
+
if req.grammar is INVALID_GRAMMAR_OBJ:
|
1827
|
+
req.set_finish_with_abort(
|
1828
|
+
f"Invalid grammar request: {req.grammar_key=}"
|
1829
|
+
)
|
1830
|
+
else:
|
1831
|
+
num_ready_reqs_max = num_ready_reqs
|
1832
|
+
num_timeout_reqs_max = num_timeout_reqs
|
1793
1833
|
|
1794
|
-
|
1795
|
-
|
1796
|
-
|
1797
|
-
|
1798
|
-
|
1799
|
-
|
1800
|
-
|
1801
|
-
error_msg, HTTPStatus.BAD_REQUEST, "BadRequestError"
|
1802
|
-
)
|
1803
|
-
num_ready_reqs = num_ready_reqs_max + num_abort_reqs_max
|
1834
|
+
for i in range(num_ready_reqs, num_ready_reqs + num_timeout_reqs_max):
|
1835
|
+
req = self.grammar_queue[i]
|
1836
|
+
req.grammar.cancel()
|
1837
|
+
error_msg = f"Grammar preprocessing timed out for {req.grammar_key=}"
|
1838
|
+
req.set_finish_with_abort(error_msg)
|
1839
|
+
self.grammar_backend.set_cache(req.grammar_key, INVALID_GRAMMAR_OBJ)
|
1840
|
+
num_ready_reqs = num_ready_reqs_max + num_timeout_reqs_max
|
1804
1841
|
|
1805
1842
|
self._extend_requests_to_queue(self.grammar_queue[:num_ready_reqs])
|
1806
1843
|
self.grammar_queue = self.grammar_queue[num_ready_reqs:]
|
@@ -1887,6 +1924,27 @@ class Scheduler(
|
|
1887
1924
|
if_success = False
|
1888
1925
|
return if_success
|
1889
1926
|
|
1927
|
+
def get_load(self):
|
1928
|
+
# TODO(lsyin): use dynamically maintained num_waiting_tokens
|
1929
|
+
load = (
|
1930
|
+
self.max_total_num_tokens
|
1931
|
+
- self.token_to_kv_pool_allocator.available_size()
|
1932
|
+
- self.tree_cache.evictable_size()
|
1933
|
+
)
|
1934
|
+
load += sum(len(req.origin_input_ids) for req in self.waiting_queue)
|
1935
|
+
if self.disaggregation_mode == DisaggregationMode.PREFILL:
|
1936
|
+
load += sum(
|
1937
|
+
len(req.origin_input_ids)
|
1938
|
+
for req in self.disagg_prefill_bootstrap_queue.queue
|
1939
|
+
)
|
1940
|
+
elif self.disaggregation_mode == DisaggregationMode.DECODE:
|
1941
|
+
load += sum(
|
1942
|
+
len(req.req.origin_input_ids)
|
1943
|
+
for req in self.disagg_decode_prealloc_queue.queue
|
1944
|
+
)
|
1945
|
+
|
1946
|
+
return load
|
1947
|
+
|
1890
1948
|
def get_internal_state(self, recv_req: GetInternalStateReq):
|
1891
1949
|
ret = dict(global_server_args_dict)
|
1892
1950
|
ret["last_gen_throughput"] = self.last_gen_throughput
|
@@ -1896,9 +1954,10 @@ class Scheduler(
|
|
1896
1954
|
)
|
1897
1955
|
if RECORD_STEP_TIME:
|
1898
1956
|
ret["step_time_dict"] = self.step_time_dict
|
1899
|
-
|
1900
|
-
|
1901
|
-
|
1957
|
+
|
1958
|
+
ret["load"] = self.get_load()
|
1959
|
+
|
1960
|
+
return GetInternalStateReqOutput(internal_state=ret)
|
1902
1961
|
|
1903
1962
|
def set_internal_state(self, recv_req: SetInternalStateReq):
|
1904
1963
|
server_args_dict = recv_req.server_args
|
@@ -1932,7 +1991,7 @@ class Scheduler(
|
|
1932
1991
|
self.cum_spec_accept_length = self.cum_spec_accept_count = 0
|
1933
1992
|
for k, v in server_args_dict.items():
|
1934
1993
|
global_server_args_dict[k] = v
|
1935
|
-
logger.info(f"Global server args updated!
|
1994
|
+
logger.info(f"Global server args updated! {global_server_args_dict=}")
|
1936
1995
|
return SetInternalStateReqOutput(
|
1937
1996
|
updated=True,
|
1938
1997
|
server_args=global_server_args_dict,
|
@@ -1974,8 +2033,6 @@ class Scheduler(
|
|
1974
2033
|
)
|
1975
2034
|
|
1976
2035
|
def abort_request(self, recv_req: AbortReq):
|
1977
|
-
# TODO(lmzheng): abort the requests in the grammar queue.
|
1978
|
-
|
1979
2036
|
# Delete requests in the waiting queue
|
1980
2037
|
to_del = []
|
1981
2038
|
for i, req in enumerate(self.waiting_queue):
|
@@ -1984,10 +2041,23 @@ class Scheduler(
|
|
1984
2041
|
|
1985
2042
|
# Sort in reverse order to avoid index issues when deleting
|
1986
2043
|
for i in reversed(to_del):
|
2044
|
+
# Abort method 1: directly pop from the queue
|
2045
|
+
# This only works for requests that have not started anything.
|
2046
|
+
# We still need to send something back to TokenizerManager to clean up the state.
|
1987
2047
|
req = self.waiting_queue.pop(i)
|
1988
2048
|
self.send_to_tokenizer.send_pyobj(AbortReq(req.rid))
|
1989
2049
|
logger.debug(f"Abort queued request. {req.rid=}")
|
1990
2050
|
|
2051
|
+
# Delete the requests in the grammar queue
|
2052
|
+
for req in self.grammar_queue:
|
2053
|
+
# Abort method 2: call `set_finish_with_abort`
|
2054
|
+
# The request will still run one prefill forward pass.
|
2055
|
+
# In this case, we change the input_ids to be only one token to make this prefill cheap.
|
2056
|
+
if req.rid.startswith(recv_req.rid):
|
2057
|
+
logger.debug(f"Abort grammar queue request. {req.rid=}")
|
2058
|
+
req.grammar.cancel()
|
2059
|
+
req.set_finish_with_abort("Aborted by AbortReq.")
|
2060
|
+
|
1991
2061
|
# Delete requests in the running batch
|
1992
2062
|
if self.cur_batch is self.running_batch or self.cur_batch is None:
|
1993
2063
|
reqs = self.running_batch.reqs
|
@@ -1996,6 +2066,9 @@ class Scheduler(
|
|
1996
2066
|
|
1997
2067
|
for req in reqs:
|
1998
2068
|
if req.rid.startswith(recv_req.rid) and not req.finished():
|
2069
|
+
# Abort method 3: set `to_abort=True`
|
2070
|
+
# The request will still run one decode forward pass.
|
2071
|
+
# Then we reuse all existing code to clean up the KV cache allocation.
|
1999
2072
|
logger.debug(f"Abort running request. {req.rid=}")
|
2000
2073
|
req.to_abort = True
|
2001
2074
|
|
@@ -2075,46 +2148,86 @@ class Scheduler(
|
|
2075
2148
|
|
2076
2149
|
def profile(self, recv_req: ProfileReq):
|
2077
2150
|
if recv_req.type == ProfileReqType.START_PROFILE:
|
2078
|
-
|
2079
|
-
|
2080
|
-
|
2081
|
-
|
2082
|
-
|
2083
|
-
|
2084
|
-
|
2085
|
-
|
2151
|
+
if recv_req.profile_by_stage:
|
2152
|
+
return self.init_profile(
|
2153
|
+
recv_req.output_dir,
|
2154
|
+
recv_req.num_steps,
|
2155
|
+
recv_req.activities,
|
2156
|
+
recv_req.with_stack,
|
2157
|
+
recv_req.record_shapes,
|
2158
|
+
recv_req.profile_by_stage,
|
2159
|
+
recv_req.profile_id,
|
2160
|
+
)
|
2161
|
+
else:
|
2162
|
+
self.init_profile(
|
2163
|
+
recv_req.output_dir,
|
2164
|
+
recv_req.num_steps,
|
2165
|
+
recv_req.activities,
|
2166
|
+
recv_req.with_stack,
|
2167
|
+
recv_req.record_shapes,
|
2168
|
+
recv_req.profile_by_stage,
|
2169
|
+
recv_req.profile_id,
|
2170
|
+
)
|
2171
|
+
return self.start_profile(True)
|
2086
2172
|
else:
|
2087
2173
|
return self.stop_profile()
|
2088
2174
|
|
2089
|
-
def
|
2175
|
+
def init_profile(
|
2090
2176
|
self,
|
2091
2177
|
output_dir: Optional[str],
|
2092
2178
|
num_steps: Optional[int],
|
2093
2179
|
activities: Optional[List[str]],
|
2094
2180
|
with_stack: Optional[bool],
|
2095
2181
|
record_shapes: Optional[bool],
|
2096
|
-
|
2097
|
-
|
2098
|
-
|
2182
|
+
profile_by_stage: bool,
|
2183
|
+
profile_id: str,
|
2184
|
+
) -> ProfileReqOutput:
|
2185
|
+
if self.profile_in_progress:
|
2099
2186
|
return ProfileReqOutput(
|
2100
2187
|
success=False,
|
2101
2188
|
message="Profiling is already in progress. Call /stop_profile first.",
|
2102
2189
|
)
|
2103
2190
|
|
2191
|
+
self.profile_by_stage = profile_by_stage
|
2192
|
+
|
2104
2193
|
if output_dir is None:
|
2105
2194
|
output_dir = os.getenv("SGLANG_TORCH_PROFILER_DIR", "/tmp")
|
2106
2195
|
if activities is None:
|
2107
2196
|
activities = ["CPU", "GPU"]
|
2108
2197
|
|
2109
2198
|
self.torch_profiler_output_dir = output_dir
|
2199
|
+
self.torch_profiler_with_stack = with_stack
|
2200
|
+
self.torch_profiler_record_shapes = record_shapes
|
2110
2201
|
self.profiler_activities = activities
|
2111
|
-
self.
|
2202
|
+
self.profile_id = profile_id
|
2203
|
+
|
2204
|
+
if num_steps:
|
2205
|
+
self.profile_steps = num_steps
|
2206
|
+
if self.profile_by_stage:
|
2207
|
+
self.profiler_target_prefill_ct = num_steps
|
2208
|
+
self.profiler_target_decode_ct = num_steps
|
2209
|
+
self.profiler_prefill_ct = 0
|
2210
|
+
self.profiler_decode_ct = 0
|
2211
|
+
else:
|
2212
|
+
self.profiler_target_forward_ct = self.forward_ct + num_steps
|
2213
|
+
# The caller will be notified when reaching profiler_target_forward_ct
|
2214
|
+
else:
|
2215
|
+
self.profiler_target_forward_ct = None
|
2216
|
+
|
2217
|
+
return ProfileReqOutput(success=True, message="Succeeded")
|
2218
|
+
|
2219
|
+
def start_profile(
|
2220
|
+
self, stage: Optional[ForwardMode] = None
|
2221
|
+
) -> ProfileReqOutput | None:
|
2222
|
+
stage_str = f" for {stage.__str__()}" if stage else ""
|
2112
2223
|
logger.info(
|
2113
|
-
"Profiling starts. Traces will be saved to:
|
2114
|
-
self.torch_profiler_output_dir,
|
2115
|
-
self.profiler_id,
|
2224
|
+
f"Profiling starts{stage_str}. Traces will be saved to: {self.torch_profiler_output_dir} (with profile id: {self.profile_id})",
|
2116
2225
|
)
|
2117
2226
|
|
2227
|
+
activities = self.profiler_activities
|
2228
|
+
with_stack = self.torch_profiler_with_stack
|
2229
|
+
record_shapes = self.torch_profiler_record_shapes
|
2230
|
+
|
2118
2231
|
activity_map = {
|
2119
2232
|
"CPU": torch.profiler.ProfilerActivity.CPU,
|
2120
2233
|
"GPU": torch.profiler.ProfilerActivity.CUDA,
|
@@ -2123,48 +2236,100 @@ class Scheduler(
|
|
2123
2236
|
activity_map[a] for a in activities if a in activity_map
|
2124
2237
|
]
|
2125
2238
|
|
2126
|
-
if
|
2239
|
+
if "RPD" in activities:
|
2240
|
+
from rpdTracerControl import rpdTracerControl
|
2241
|
+
|
2242
|
+
rpdTracerControl.skipCreate()
|
2243
|
+
|
2244
|
+
self.rpd_profile_path = os.path.join(
|
2245
|
+
self.torch_profiler_output_dir,
|
2246
|
+
"rpd-" + str(time.time()) + f"-TP-{self.tp_rank}" + ".trace.json.gz",
|
2247
|
+
)
|
2248
|
+
|
2249
|
+
if self.tp_rank == 0:
|
2250
|
+
import sqlite3
|
2251
|
+
|
2252
|
+
from rocpd.schema import RocpdSchema
|
2253
|
+
|
2254
|
+
if os.path.exists("trace.rpd"):
|
2255
|
+
os.unlink("trace.rpd")
|
2256
|
+
schema = RocpdSchema()
|
2257
|
+
connection = sqlite3.connect("trace.rpd")
|
2258
|
+
schema.writeSchema(connection)
|
2259
|
+
connection.commit()
|
2260
|
+
del connection
|
2261
|
+
torch.distributed.barrier(self.tp_cpu_group)
|
2262
|
+
|
2263
|
+
self.rpd_profiler = rpdTracerControl()
|
2264
|
+
self.rpd_profiler.setPythonTrace(True)
|
2265
|
+
self.rpd_profiler.start()
|
2266
|
+
self.rpd_profiler.rangePush("", "rpd profile range", "")
|
2267
|
+
self.profile_in_progress = True
|
2268
|
+
elif torchprof_activities:
|
2127
2269
|
self.torch_profiler = torch.profiler.profile(
|
2128
2270
|
activities=torchprof_activities,
|
2129
2271
|
with_stack=with_stack if with_stack is not None else True,
|
2130
2272
|
record_shapes=record_shapes if record_shapes is not None else False,
|
2131
2273
|
)
|
2132
2274
|
self.torch_profiler.start()
|
2275
|
+
self.profile_in_progress = True
|
2133
2276
|
|
2134
2277
|
if "MEM" in activities:
|
2135
2278
|
torch.cuda.memory._record_memory_history(max_entries=100000)
|
2279
|
+
self.profile_in_progress = True
|
2136
2280
|
|
2137
2281
|
if "CUDA_PROFILER" in activities:
|
2138
2282
|
torch.cuda.cudart().cudaProfilerStart()
|
2139
2283
|
|
2140
|
-
|
2141
|
-
self.profiler_target_forward_ct = self.forward_ct + num_steps
|
2142
|
-
# The caller will be notified when reaching profiler_target_forward_ct
|
2143
|
-
else:
|
2144
|
-
self.profiler_target_forward_ct = None
|
2145
|
-
return ProfileReqOutput(success=True, message="Succeeded")
|
2284
|
+
return ProfileReqOutput(success=True, message="Succeeded")
|
2146
2285
|
|
2147
|
-
def stop_profile(
|
2148
|
-
|
2286
|
+
def stop_profile(
|
2287
|
+
self, stage: Optional[ForwardMode] = None
|
2288
|
+
) -> ProfileReqOutput | None:
|
2289
|
+
if not self.profile_in_progress:
|
2149
2290
|
return ProfileReqOutput(
|
2150
2291
|
success=False,
|
2151
2292
|
message="Profiling is not in progress. Call /start_profile first.",
|
2152
2293
|
)
|
2153
2294
|
|
2154
|
-
|
2295
|
+
if not Path(self.torch_profiler_output_dir).exists():
|
2296
|
+
Path(self.torch_profiler_output_dir).mkdir(parents=True, exist_ok=True)
|
2297
|
+
|
2298
|
+
stage_suffix = f"-{stage.__str__()}" if stage else ""
|
2299
|
+
logger.info("Stop profiling" + stage_suffix + "...")
|
2155
2300
|
if self.torch_profiler is not None:
|
2156
2301
|
self.torch_profiler.stop()
|
2157
2302
|
self.torch_profiler.export_chrome_trace(
|
2158
2303
|
os.path.join(
|
2159
2304
|
self.torch_profiler_output_dir,
|
2160
|
-
self.
|
2305
|
+
self.profile_id
|
2306
|
+
+ f"-TP-{self.tp_rank}"
|
2307
|
+
+ stage_suffix
|
2308
|
+
+ ".trace.json.gz",
|
2161
2309
|
)
|
2162
2310
|
)
|
2311
|
+
torch.distributed.barrier(self.tp_cpu_group)
|
2312
|
+
|
2313
|
+
if self.rpd_profiler is not None:
|
2314
|
+
self.rpd_profiler.rangePop()
|
2315
|
+
self.rpd_profiler.stop()
|
2316
|
+
self.rpd_profiler.flush()
|
2317
|
+
|
2318
|
+
torch.distributed.barrier(self.tp_cpu_group)
|
2319
|
+
if self.tp_rank == 0:
|
2320
|
+
from sglang.srt.utils import rpd_to_chrome_trace
|
2163
2321
|
|
2164
|
-
|
2322
|
+
rpd_to_chrome_trace("trace.rpd", self.rpd_profile_path)
|
2323
|
+
self.rpd_profiler = None
|
2324
|
+
self.rpd_profiler_path = None
|
2325
|
+
|
2326
|
+
if self.profiler_activities is not None and "MEM" in self.profiler_activities:
|
2165
2327
|
memory_profile_path = os.path.join(
|
2166
2328
|
self.torch_profiler_output_dir,
|
2167
|
-
|
2329
|
+
str(time.time())
|
2330
|
+
+ f"-TP-{self.tp_rank}-memory"
|
2331
|
+
+ stage_suffix
|
2332
|
+
+ ".pickle",
|
2168
2333
|
)
|
2169
2334
|
torch.cuda.memory._dump_snapshot(memory_profile_path)
|
2170
2335
|
torch.cuda.memory._record_memory_history(enabled=None)
|
@@ -2177,10 +2342,38 @@ class Scheduler(
|
|
2177
2342
|
self.torch_profiler_output_dir,
|
2178
2343
|
)
|
2179
2344
|
self.torch_profiler = None
|
2180
|
-
self.
|
2181
|
-
|
2182
|
-
|
2183
|
-
|
2345
|
+
self.profile_in_progress = False
|
2346
|
+
|
2347
|
+
return ProfileReqOutput(success=True, message="Succeeded.")
|
2348
|
+
|
2349
|
+
def _profile_batch_predicate(self, batch):
    """Start or stop profiling according to ``batch`` and the profiling counters.

    Two modes are handled, selected by ``self.profile_by_stage``:

    * Per-stage mode: prefill and decode batches are counted separately.
      The first batch seen for a stage starts a profile for that stage, and
      a running profile is stopped once the stage's counter exceeds its
      target (``profiler_target_prefill_ct`` / ``profiler_target_decode_ct``).
    * Step-count mode: a running profile is stopped once ``self.forward_ct``
      has reached ``self.profiler_target_forward_ct``.

    Args:
        batch: The batch being considered; only ``batch.forward_mode`` is read.

    Raises:
        RuntimeError: In per-stage mode, when ``batch.forward_mode`` reports
            neither prefill nor decode.
    """
    if not self.profile_by_stage:
        # Check profiler: a falsy target (e.g. None) means there is nothing
        # to stop.
        target_ct = self.profiler_target_forward_ct
        if target_ct and target_ct <= self.forward_ct:
            self.stop_profile()
        return

    mode = batch.forward_mode

    if mode.is_prefill():
        # The first prefill batch opens the prefill-stage profile.
        if self.profiler_prefill_ct == 0:
            self.start_profile(mode)
        self.profiler_prefill_ct += 1

        past_target = self.profiler_prefill_ct > self.profiler_target_prefill_ct
        if past_target and self.profile_in_progress:
            self.stop_profile(stage=ForwardMode.EXTEND)
    elif mode.is_decode():
        # The first decode batch opens the decode-stage profile.
        if self.profiler_decode_ct == 0:
            if self.profile_in_progress:
                # force trace flush
                self.stop_profile(ForwardMode.EXTEND)
            self.start_profile(mode)
        self.profiler_decode_ct += 1

        past_target = self.profiler_decode_ct > self.profiler_target_decode_ct
        if past_target and self.profile_in_progress:
            self.stop_profile(stage=ForwardMode.DECODE)
    else:
        raise RuntimeError("unsupported profile stage")
|
2184
2377
|
|
2185
2378
|
def expert_distribution_handle(self, recv_req: ExpertDistributionReq):
|
2186
2379
|
if recv_req == ExpertDistributionReq.START_RECORD:
|