sglang 0.4.6.post4__py3-none-any.whl → 0.4.7__py3-none-any.whl
This diff shows the changes between two publicly released versions of the package, as published to their respective public registries. It is provided for informational purposes only.
- sglang/bench_offline_throughput.py +16 -10
- sglang/bench_one_batch.py +5 -4
- sglang/bench_one_batch_server.py +86 -22
- sglang/bench_serving.py +197 -110
- sglang/compile_deep_gemm.py +4 -4
- sglang/lang/backend/runtime_endpoint.py +24 -1
- sglang/profiler.py +167 -0
- sglang/srt/_custom_ops.py +34 -0
- sglang/srt/configs/internvl.py +8 -12
- sglang/srt/configs/model_config.py +66 -29
- sglang/srt/constrained/base_grammar_backend.py +5 -2
- sglang/srt/constrained/llguidance_backend.py +9 -8
- sglang/srt/constrained/outlines_backend.py +5 -4
- sglang/srt/constrained/xgrammar_backend.py +18 -18
- sglang/srt/conversation.py +47 -9
- sglang/srt/custom_op.py +38 -3
- sglang/srt/debug_utils.py +74 -0
- sglang/srt/disaggregation/common/__init__.py +1 -0
- sglang/srt/disaggregation/common/conn.py +407 -0
- sglang/srt/disaggregation/decode.py +187 -134
- sglang/srt/disaggregation/decode_schedule_batch_mixin.py +142 -0
- sglang/srt/disaggregation/fake/conn.py +4 -13
- sglang/srt/disaggregation/kv_events.py +412 -0
- sglang/srt/disaggregation/launch_lb.py +140 -0
- sglang/srt/disaggregation/mini_lb.py +84 -70
- sglang/srt/disaggregation/mooncake/conn.py +441 -140
- sglang/srt/disaggregation/mooncake/transfer_engine.py +31 -14
- sglang/srt/disaggregation/nixl/conn.py +124 -442
- sglang/srt/disaggregation/prefill.py +128 -44
- sglang/srt/disaggregation/utils.py +154 -6
- sglang/srt/distributed/device_communicators/pymscclpp.py +315 -0
- sglang/srt/distributed/parallel_state.py +52 -5
- sglang/srt/distributed/utils.py +3 -3
- sglang/srt/entrypoints/EngineBase.py +11 -0
- sglang/srt/entrypoints/engine.py +129 -12
- sglang/srt/entrypoints/http_server.py +21 -6
- sglang/srt/entrypoints/http_server_engine.py +5 -2
- sglang/srt/function_call/base_format_detector.py +302 -0
- sglang/srt/function_call/core_types.py +34 -0
- sglang/srt/function_call/deepseekv3_detector.py +205 -0
- sglang/srt/function_call/ebnf_composer.py +248 -0
- sglang/srt/function_call/function_call_parser.py +202 -0
- sglang/srt/function_call/llama32_detector.py +93 -0
- sglang/srt/function_call/mistral_detector.py +131 -0
- sglang/srt/function_call/pythonic_detector.py +229 -0
- sglang/srt/function_call/qwen25_detector.py +121 -0
- sglang/srt/function_call/utils.py +52 -0
- sglang/srt/hf_transformers_utils.py +50 -7
- sglang/srt/layers/attention/aiter_backend.py +878 -0
- sglang/srt/layers/attention/base_attn_backend.py +4 -0
- sglang/srt/layers/attention/cutlass_mla_backend.py +2 -19
- sglang/srt/layers/attention/flashattention_backend.py +166 -35
- sglang/srt/layers/attention/flashinfer_backend.py +45 -1
- sglang/srt/layers/attention/flashinfer_mla_backend.py +45 -5
- sglang/srt/layers/attention/flashmla_backend.py +340 -78
- sglang/srt/layers/attention/intel_amx_backend.py +128 -0
- sglang/srt/layers/attention/tbo_backend.py +232 -0
- sglang/srt/layers/attention/torch_native_backend.py +3 -0
- sglang/srt/layers/attention/triton_backend.py +247 -5
- sglang/srt/layers/attention/triton_ops/extend_attention.py +12 -4
- sglang/srt/layers/attention/utils.py +2 -2
- sglang/srt/layers/attention/vision.py +1 -1
- sglang/srt/layers/communicator.py +517 -0
- sglang/srt/layers/dp_attention.py +6 -15
- sglang/srt/layers/layernorm.py +30 -19
- sglang/srt/layers/moe/cutlass_moe.py +370 -0
- sglang/srt/layers/moe/cutlass_moe_params.py +169 -0
- sglang/srt/layers/moe/ep_moe/kernels.py +60 -17
- sglang/srt/layers/moe/ep_moe/layer.py +195 -87
- sglang/srt/layers/moe/ep_moe/token_dispatcher.py +88 -8
- sglang/srt/layers/moe/fused_moe_native.py +4 -0
- sglang/srt/layers/moe/fused_moe_triton/configs/triton_3_2_0/E=257,N=128,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8,block_shape=[128, 128].json +146 -0
- sglang/srt/layers/moe/fused_moe_triton/configs/triton_3_2_0/E=257,N=128,device_name=NVIDIA_H20,dtype=fp8_w8a8,block_shape=[128, 128].json +146 -0
- sglang/srt/layers/moe/fused_moe_triton/configs/triton_3_2_0/E=257,N=256,device_name=NVIDIA_B200,dtype=fp8_w8a8,block_shape=[128, 128].json +146 -0
- sglang/srt/layers/moe/fused_moe_triton/configs/triton_3_2_0/E=257,N=256,device_name=NVIDIA_H20,dtype=fp8_w8a8,block_shape=[128, 128].json +146 -0
- sglang/srt/layers/moe/fused_moe_triton/configs/triton_3_2_0/E=257,N=256,device_name=NVIDIA_H200,dtype=fp8_w8a8,block_shape=[128, 128].json +146 -0
- sglang/srt/layers/moe/fused_moe_triton/configs/triton_3_2_0/E=8,N=7168,device_name=NVIDIA_H100_80GB_HBM3.json +146 -0
- sglang/srt/layers/moe/fused_moe_triton/configs/triton_3_3_1/E=257,N=256,device_name=NVIDIA_B200,dtype=fp8_w8a8,block_shape=[128, 128].json +146 -0
- sglang/srt/layers/moe/fused_moe_triton/configs/triton_3_3_1/E=8,N=7168,device_name=NVIDIA_H100_80GB_HBM3.json +146 -0
- sglang/srt/layers/moe/fused_moe_triton/fused_moe.py +220 -25
- sglang/srt/layers/moe/fused_moe_triton/layer.py +48 -4
- sglang/srt/layers/moe/topk.py +107 -24
- sglang/srt/layers/multimodal.py +70 -0
- sglang/srt/layers/quantization/__init__.py +10 -4
- sglang/srt/layers/quantization/blockwise_int8.py +3 -0
- sglang/srt/layers/quantization/compressed_tensors/compressed_tensors_moe.py +5 -0
- sglang/srt/layers/quantization/deep_gemm.py +60 -59
- sglang/srt/layers/quantization/fp8.py +113 -18
- sglang/srt/layers/quantization/fp8_kernel.py +118 -66
- sglang/srt/layers/quantization/fp8_utils.py +165 -43
- sglang/srt/layers/quantization/gptq.py +298 -6
- sglang/srt/layers/quantization/int8_kernel.py +18 -5
- sglang/srt/layers/quantization/modelopt_quant.py +334 -7
- sglang/srt/layers/quantization/moe_wna16.py +3 -0
- sglang/srt/layers/quantization/qoq.py +244 -0
- sglang/srt/layers/quantization/w8a8_fp8.py +3 -0
- sglang/srt/layers/quantization/w8a8_int8.py +3 -0
- sglang/srt/layers/rotary_embedding.py +6 -12
- sglang/srt/layers/sampler.py +80 -79
- sglang/srt/layers/utils.py +6 -0
- sglang/srt/lora/layers.py +12 -15
- sglang/srt/lora/lora.py +49 -5
- sglang/srt/lora/lora_manager.py +20 -8
- sglang/srt/lora/mem_pool.py +24 -16
- sglang/srt/lora/utils.py +17 -13
- sglang/srt/managers/data_parallel_controller.py +13 -5
- sglang/srt/managers/eplb_algorithms/__init__.py +63 -0
- sglang/srt/managers/eplb_algorithms/deepseek.py +223 -0
- sglang/srt/managers/eplb_algorithms/deepseek_vec.py +276 -0
- sglang/srt/managers/eplb_manager.py +96 -0
- sglang/srt/managers/expert_distribution.py +878 -56
- sglang/srt/managers/expert_location.py +448 -0
- sglang/srt/managers/expert_location_dispatch.py +108 -0
- sglang/srt/managers/io_struct.py +29 -5
- sglang/srt/managers/mm_utils.py +355 -151
- sglang/srt/managers/multimodal_processors/base_processor.py +299 -42
- sglang/srt/managers/multimodal_processors/deepseek_vl_v2.py +6 -1
- sglang/srt/managers/multimodal_processors/gemma3.py +15 -17
- sglang/srt/managers/multimodal_processors/internvl.py +18 -5
- sglang/srt/managers/multimodal_processors/janus_pro.py +7 -1
- sglang/srt/managers/multimodal_processors/kimi_vl.py +14 -32
- sglang/srt/managers/multimodal_processors/llava.py +3 -3
- sglang/srt/managers/multimodal_processors/minicpm.py +27 -32
- sglang/srt/managers/multimodal_processors/mllama4.py +6 -0
- sglang/srt/managers/multimodal_processors/phi4mm.py +87 -0
- sglang/srt/managers/multimodal_processors/pixtral.py +9 -9
- sglang/srt/managers/multimodal_processors/qwen_vl.py +35 -35
- sglang/srt/managers/schedule_batch.py +185 -55
- sglang/srt/managers/schedule_policy.py +4 -5
- sglang/srt/managers/scheduler.py +389 -154
- sglang/srt/managers/session_controller.py +1 -1
- sglang/srt/managers/tokenizer_manager.py +231 -39
- sglang/srt/managers/utils.py +0 -4
- sglang/srt/mem_cache/base_prefix_cache.py +3 -0
- sglang/srt/mem_cache/chunk_cache.py +3 -1
- sglang/srt/mem_cache/hiradix_cache.py +4 -4
- sglang/srt/mem_cache/memory_pool.py +74 -52
- sglang/srt/mem_cache/multimodal_cache.py +45 -0
- sglang/srt/mem_cache/radix_cache.py +58 -5
- sglang/srt/metrics/collector.py +11 -2
- sglang/srt/mm_utils.py +10 -0
- sglang/srt/model_executor/cuda_graph_runner.py +87 -65
- sglang/srt/model_executor/expert_location_updater.py +557 -0
- sglang/srt/model_executor/forward_batch_info.py +39 -14
- sglang/srt/model_executor/model_runner.py +231 -101
- sglang/srt/model_loader/loader.py +10 -6
- sglang/srt/model_loader/utils.py +67 -1
- sglang/srt/models/clip.py +5 -1
- sglang/srt/models/deepseek_nextn.py +1 -1
- sglang/srt/models/deepseek_v2.py +732 -403
- sglang/srt/models/exaone.py +8 -3
- sglang/srt/models/gemma3_causal.py +7 -0
- sglang/srt/models/gemma3_mm.py +75 -33
- sglang/srt/models/idefics2.py +342 -0
- sglang/srt/models/kimi_vl.py +4 -4
- sglang/srt/models/llama.py +1 -1
- sglang/srt/models/llama4.py +10 -2
- sglang/srt/models/llava.py +26 -18
- sglang/srt/models/mimo_mtp.py +220 -0
- sglang/srt/models/minicpmo.py +7 -17
- sglang/srt/models/minicpmv.py +3 -295
- sglang/srt/models/mistral.py +71 -1
- sglang/srt/models/mllama.py +3 -3
- sglang/srt/models/phi4mm.py +512 -0
- sglang/srt/models/qwen2.py +133 -35
- sglang/srt/models/qwen2_5_vl.py +5 -3
- sglang/srt/models/qwen2_eagle.py +4 -1
- sglang/srt/models/qwen2_moe.py +206 -69
- sglang/srt/models/qwen2_vl.py +3 -3
- sglang/srt/models/qwen3.py +92 -19
- sglang/srt/models/qwen3_moe.py +457 -55
- sglang/srt/models/registry.py +9 -1
- sglang/srt/models/siglip.py +294 -0
- sglang/srt/models/transformers.py +291 -0
- sglang/srt/openai_api/adapter.py +114 -40
- sglang/srt/openai_api/protocol.py +37 -2
- sglang/srt/openai_api/utils.py +172 -0
- sglang/srt/operations.py +189 -0
- sglang/srt/operations_strategy.py +207 -0
- sglang/srt/sampling/sampling_batch_info.py +13 -1
- sglang/srt/sampling/sampling_params.py +2 -1
- sglang/srt/server_args.py +235 -38
- sglang/srt/speculative/build_eagle_tree.py +8 -8
- sglang/srt/speculative/eagle_draft_cuda_graph_runner.py +8 -11
- sglang/srt/speculative/eagle_draft_extend_cuda_graph_runner.py +253 -0
- sglang/srt/speculative/eagle_utils.py +181 -90
- sglang/srt/speculative/eagle_worker.py +146 -21
- sglang/srt/two_batch_overlap.py +635 -0
- sglang/srt/utils.py +197 -19
- sglang/test/runners.py +16 -7
- sglang/test/send_one.py +4 -0
- sglang/test/test_cutlass_moe.py +278 -0
- sglang/test/test_fp4_moe.py +248 -0
- sglang/test/test_utils.py +81 -42
- sglang/utils.py +2 -2
- sglang/version.py +1 -1
- {sglang-0.4.6.post4.dist-info → sglang-0.4.7.dist-info}/METADATA +31 -19
- sglang-0.4.7.dist-info/RECORD +699 -0
- {sglang-0.4.6.post4.dist-info → sglang-0.4.7.dist-info}/WHEEL +1 -1
- sglang/srt/function_call_parser.py +0 -858
- sglang/srt/platforms/interface.py +0 -371
- sglang-0.4.6.post4.dist-info/RECORD +0 -646
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=1,N=14336,device_name=NVIDIA_A100-SXM4-80GB,dtype=int8_w8a16.json → triton_3_1_0/E=1,N=14336,device_name=NVIDIA_A100-SXM4-80GB,dtype=int8_w8a16.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=1,N=14336,device_name=NVIDIA_A100-SXM4-80GB.json → triton_3_1_0/E=1,N=14336,device_name=NVIDIA_A100-SXM4-80GB.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=1,N=1792,device_name=NVIDIA_A100-SXM4-80GB,dtype=int8_w8a16.json → triton_3_1_0/E=1,N=1792,device_name=NVIDIA_A100-SXM4-80GB,dtype=int8_w8a16.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=1,N=1792,device_name=NVIDIA_A100-SXM4-80GB.json → triton_3_1_0/E=1,N=1792,device_name=NVIDIA_A100-SXM4-80GB.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=1,N=3072,device_name=NVIDIA_A100-SXM4-80GB,dtype=int8_w8a16.json → triton_3_1_0/E=1,N=3072,device_name=NVIDIA_A100-SXM4-80GB,dtype=int8_w8a16.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=1,N=3072,device_name=NVIDIA_H100_80GB_HBM3,dtype=int8_w8a16.json → triton_3_1_0/E=1,N=3072,device_name=NVIDIA_H100_80GB_HBM3,dtype=int8_w8a16.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=1,N=3072,device_name=NVIDIA_H100_80GB_HBM3.json → triton_3_1_0/E=1,N=3072,device_name=NVIDIA_H100_80GB_HBM3.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=1,N=3584,device_name=NVIDIA_A100-SXM4-80GB,dtype=int8_w8a16.json → triton_3_1_0/E=1,N=3584,device_name=NVIDIA_A100-SXM4-80GB,dtype=int8_w8a16.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=1,N=3584,device_name=NVIDIA_A100-SXM4-80GB.json → triton_3_1_0/E=1,N=3584,device_name=NVIDIA_A100-SXM4-80GB.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=1,N=7168,device_name=NVIDIA_A100-SXM4-80GB,dtype=int8_w8a16.json → triton_3_1_0/E=1,N=7168,device_name=NVIDIA_A100-SXM4-80GB,dtype=int8_w8a16.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=1,N=7168,device_name=NVIDIA_A100-SXM4-80GB.json → triton_3_1_0/E=1,N=7168,device_name=NVIDIA_A100-SXM4-80GB.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=144,N=512,device_name=NVIDIA_H100_80GB_HBM3.json → triton_3_1_0/E=144,N=512,device_name=NVIDIA_H100_80GB_HBM3.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=16,N=1024,device_name=NVIDIA_H100_80GB_HBM3.json → triton_3_1_0/E=16,N=1024,device_name=NVIDIA_H100_80GB_HBM3.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=16,N=1024,device_name=NVIDIA_H200.json → triton_3_1_0/E=16,N=1024,device_name=NVIDIA_H200.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=16,N=1344,device_name=NVIDIA_A100-SXM4-40GB.json → triton_3_1_0/E=16,N=1344,device_name=NVIDIA_A100-SXM4-40GB.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=16,N=1344,device_name=NVIDIA_A100-SXM4-80GB.json → triton_3_1_0/E=16,N=1344,device_name=NVIDIA_A100-SXM4-80GB.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=16,N=1344,device_name=NVIDIA_H100_80GB_HBM3.json → triton_3_1_0/E=16,N=1344,device_name=NVIDIA_H100_80GB_HBM3.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=16,N=14336,device_name=NVIDIA_A100-SXM4-80GB,dtype=int8_w8a16.json → triton_3_1_0/E=16,N=14336,device_name=NVIDIA_A100-SXM4-80GB,dtype=int8_w8a16.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=16,N=14336,device_name=NVIDIA_A100-SXM4-80GB.json → triton_3_1_0/E=16,N=14336,device_name=NVIDIA_A100-SXM4-80GB.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=16,N=1792,device_name=NVIDIA_A100-SXM4-80GB,dtype=int8_w8a16.json → triton_3_1_0/E=16,N=1792,device_name=NVIDIA_A100-SXM4-80GB,dtype=int8_w8a16.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=16,N=1792,device_name=NVIDIA_A100-SXM4-80GB.json → triton_3_1_0/E=16,N=1792,device_name=NVIDIA_A100-SXM4-80GB.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=16,N=2048,device_name=NVIDIA_H100_80GB_HBM3.json → triton_3_1_0/E=16,N=2048,device_name=NVIDIA_H100_80GB_HBM3.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=16,N=2688,device_name=NVIDIA_A100-SXM4-80GB.json → triton_3_1_0/E=16,N=2688,device_name=NVIDIA_A100-SXM4-80GB.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=16,N=2688,device_name=NVIDIA_H100_80GB_HBM3.json → triton_3_1_0/E=16,N=2688,device_name=NVIDIA_H100_80GB_HBM3.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=16,N=3072,device_name=NVIDIA_A100-SXM4-80GB,dtype=int8_w8a16.json → triton_3_1_0/E=16,N=3072,device_name=NVIDIA_A100-SXM4-80GB,dtype=int8_w8a16.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=16,N=3072,device_name=NVIDIA_H100_80GB_HBM3,dtype=int8_w8a16.json → triton_3_1_0/E=16,N=3072,device_name=NVIDIA_H100_80GB_HBM3,dtype=int8_w8a16.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=16,N=3200,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8.json → triton_3_1_0/E=16,N=3200,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=16,N=3584,device_name=NVIDIA_A100-SXM4-80GB,dtype=int8_w8a16.json → triton_3_1_0/E=16,N=3584,device_name=NVIDIA_A100-SXM4-80GB,dtype=int8_w8a16.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=16,N=3584,device_name=NVIDIA_A100-SXM4-80GB.json → triton_3_1_0/E=16,N=3584,device_name=NVIDIA_A100-SXM4-80GB.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=16,N=6400,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8.json → triton_3_1_0/E=16,N=6400,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=16,N=7168,device_name=NVIDIA_A100-SXM4-80GB,dtype=int8_w8a16.json → triton_3_1_0/E=16,N=7168,device_name=NVIDIA_A100-SXM4-80GB,dtype=int8_w8a16.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=16,N=7168,device_name=NVIDIA_A100-SXM4-80GB.json → triton_3_1_0/E=16,N=7168,device_name=NVIDIA_A100-SXM4-80GB.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=16,N=7168,device_name=NVIDIA_H100_80GB_HBM3,dtype=int8_w8a16.json → triton_3_1_0/E=16,N=7168,device_name=NVIDIA_H100_80GB_HBM3,dtype=int8_w8a16.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=16,N=800,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8.json → triton_3_1_0/E=16,N=800,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=160,N=192,device_name=NVIDIA_A800-SXM4-80GB.json → triton_3_1_0/E=160,N=192,device_name=NVIDIA_A800-SXM4-80GB.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=20,N=2048,device_name=NVIDIA_H100_80GB_HBM3.json → triton_3_1_0/E=20,N=2048,device_name=NVIDIA_H100_80GB_HBM3.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=24,N=1024,device_name=NVIDIA_H100_80GB_HBM3.json → triton_3_1_0/E=24,N=1024,device_name=NVIDIA_H100_80GB_HBM3.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=256,N=128,device_name=NVIDIA_A100-SXM4-80GB,dtype=int8_w8a8,block_shape=[128, 128].json → triton_3_1_0/E=256,N=128,device_name=NVIDIA_A100-SXM4-80GB,dtype=int8_w8a8,block_shape=[128, 128].json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=256,N=128,device_name=NVIDIA_A100-SXM4-80GB,dtype=int8_w8a8.json → triton_3_1_0/E=256,N=128,device_name=NVIDIA_A100-SXM4-80GB,dtype=int8_w8a8.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=256,N=128,device_name=NVIDIA_A800-SXM4-80GB,dtype=int8_w8a8,block_shape=[128, 128].json → triton_3_1_0/E=256,N=128,device_name=NVIDIA_A800-SXM4-80GB,dtype=int8_w8a8,block_shape=[128, 128].json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=256,N=128,device_name=NVIDIA_A800-SXM4-80GB,dtype=int8_w8a8.json → triton_3_1_0/E=256,N=128,device_name=NVIDIA_A800-SXM4-80GB,dtype=int8_w8a8.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=256,N=128,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8,block_shape=[128, 128].json → triton_3_1_0/E=256,N=128,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8,block_shape=[128, 128].json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=256,N=128,device_name=NVIDIA_H20,block_shape=[128, 128].json → triton_3_1_0/E=256,N=128,device_name=NVIDIA_H20,block_shape=[128, 128].json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=256,N=128,device_name=NVIDIA_H20,dtype=fp8_w8a8,block_shape=[128, 128].json → triton_3_1_0/E=256,N=128,device_name=NVIDIA_H20,dtype=fp8_w8a8,block_shape=[128, 128].json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=256,N=128,device_name=NVIDIA_L20Y,dtype=fp8_w8a8,block_shape=[128, 128].json → triton_3_1_0/E=256,N=128,device_name=NVIDIA_L20Y,dtype=fp8_w8a8,block_shape=[128, 128].json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=256,N=256,device_name=AMD_Instinct_MI300X,dtype=fp8_w8a8,block_shape=[128, 128].json → triton_3_1_0/E=256,N=256,device_name=AMD_Instinct_MI300X,dtype=fp8_w8a8,block_shape=[128, 128].json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=256,N=256,device_name=AMD_Instinct_MI325X,dtype=fp8_w8a8,block_shape=[128, 128].json → triton_3_1_0/E=256,N=256,device_name=AMD_Instinct_MI325X,dtype=fp8_w8a8,block_shape=[128, 128].json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=256,N=256,device_name=AMD_Radeon_Graphics,dtype=fp8_w8a8,block_shape=[128, 128].json → triton_3_1_0/E=256,N=256,device_name=AMD_Radeon_Graphics,dtype=fp8_w8a8,block_shape=[128, 128].json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=256,N=256,device_name=NVIDIA_B200,dtype=fp8_w8a8,block_shape=[128, 128].json → triton_3_1_0/E=256,N=256,device_name=NVIDIA_B200,dtype=fp8_w8a8,block_shape=[128, 128].json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=256,N=256,device_name=NVIDIA_H20,dtype=int8_w8a8,block_shape=[128, 128].json → triton_3_1_0/E=256,N=256,device_name=NVIDIA_H20,dtype=int8_w8a8,block_shape=[128, 128].json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=256,N=256,device_name=NVIDIA_H200,dtype=fp8_w8a8,block_shape=[128, 128].json → triton_3_1_0/E=256,N=256,device_name=NVIDIA_H200,dtype=fp8_w8a8,block_shape=[128, 128].json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=256,N=64,device_name=NVIDIA_A800-SXM4-80GB.json → triton_3_1_0/E=256,N=64,device_name=NVIDIA_A800-SXM4-80GB.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=256,N=64,device_name=NVIDIA_L20,dtype=int8_w8a8.json → triton_3_1_0/E=256,N=64,device_name=NVIDIA_L20,dtype=int8_w8a8.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=256,N=64,device_name=NVIDIA_L40S,dtype=int8_w8a8.json → triton_3_1_0/E=256,N=64,device_name=NVIDIA_L40S,dtype=int8_w8a8.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=64,N=1024,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8,block_shape=[128, 128].json → triton_3_1_0/E=64,N=1024,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8,block_shape=[128, 128].json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=64,N=1280,device_name=NVIDIA_A100-SXM4-80GB.json → triton_3_1_0/E=64,N=1280,device_name=NVIDIA_A100-SXM4-80GB.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=64,N=1280,device_name=NVIDIA_A800-SXM4-80GB.json → triton_3_1_0/E=64,N=1280,device_name=NVIDIA_A800-SXM4-80GB.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=64,N=1280,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8.json → triton_3_1_0/E=64,N=1280,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=64,N=1280,device_name=NVIDIA_H100_80GB_HBM3.json → triton_3_1_0/E=64,N=1280,device_name=NVIDIA_H100_80GB_HBM3.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=64,N=1280,device_name=NVIDIA_H200,dtype=fp8_w8a8.json → triton_3_1_0/E=64,N=1280,device_name=NVIDIA_H200,dtype=fp8_w8a8.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=64,N=1280,device_name=NVIDIA_H200.json → triton_3_1_0/E=64,N=1280,device_name=NVIDIA_H200.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=64,N=2560,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8.json → triton_3_1_0/E=64,N=2560,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=64,N=2560,device_name=NVIDIA_H200,dtype=fp8_w8a8.json → triton_3_1_0/E=64,N=2560,device_name=NVIDIA_H200,dtype=fp8_w8a8.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=64,N=2560,device_name=NVIDIA_H200.json → triton_3_1_0/E=64,N=2560,device_name=NVIDIA_H200.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=64,N=320,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8.json → triton_3_1_0/E=64,N=320,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=64,N=320,device_name=NVIDIA_H100_80GB_HBM3.json → triton_3_1_0/E=64,N=320,device_name=NVIDIA_H100_80GB_HBM3.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=64,N=320,device_name=NVIDIA_H200,dtype=fp8_w8a8.json → triton_3_1_0/E=64,N=320,device_name=NVIDIA_H200,dtype=fp8_w8a8.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=64,N=320,device_name=NVIDIA_H200.json → triton_3_1_0/E=64,N=320,device_name=NVIDIA_H200.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=64,N=512,device_name=NVIDIA_H200,dtype=fp8_w8a8,block_shape=[128, 128].json → triton_3_1_0/E=64,N=512,device_name=NVIDIA_H200,dtype=fp8_w8a8,block_shape=[128, 128].json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=64,N=640,device_name=NVIDIA_A100-SXM4-80GB.json → triton_3_1_0/E=64,N=640,device_name=NVIDIA_A100-SXM4-80GB.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=64,N=640,device_name=NVIDIA_A800-SXM4-80GB.json → triton_3_1_0/E=64,N=640,device_name=NVIDIA_A800-SXM4-80GB.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=64,N=640,device_name=NVIDIA_GeForce_RTX_4090,dtype=fp8_w8a8.json → triton_3_1_0/E=64,N=640,device_name=NVIDIA_GeForce_RTX_4090,dtype=fp8_w8a8.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=64,N=640,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8.json → triton_3_1_0/E=64,N=640,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=64,N=640,device_name=NVIDIA_H100_80GB_HBM3.json → triton_3_1_0/E=64,N=640,device_name=NVIDIA_H100_80GB_HBM3.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=64,N=640,device_name=NVIDIA_H200,dtype=fp8_w8a8.json → triton_3_1_0/E=64,N=640,device_name=NVIDIA_H200,dtype=fp8_w8a8.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=64,N=640,device_name=NVIDIA_H200.json → triton_3_1_0/E=64,N=640,device_name=NVIDIA_H200.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=8,N=14336,device_name=AMD_Instinct_MI300X.json → triton_3_1_0/E=8,N=14336,device_name=AMD_Instinct_MI300X.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=8,N=14336,device_name=AMD_Instinct_MI325X.json → triton_3_1_0/E=8,N=14336,device_name=AMD_Instinct_MI325X.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=8,N=14336,device_name=AMD_Radeon_Graphics.json → triton_3_1_0/E=8,N=14336,device_name=AMD_Radeon_Graphics.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=8,N=14336,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8.json → triton_3_1_0/E=8,N=14336,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=8,N=14336,device_name=NVIDIA_H200,dtype=fp8_w8a8.json → triton_3_1_0/E=8,N=14336,device_name=NVIDIA_H200,dtype=fp8_w8a8.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=8,N=14336,device_name=NVIDIA_H200.json → triton_3_1_0/E=8,N=14336,device_name=NVIDIA_H200.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=8,N=1792,device_name=AMD_Instinct_MI300X.json → triton_3_1_0/E=8,N=1792,device_name=AMD_Instinct_MI300X.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=8,N=1792,device_name=AMD_Instinct_MI325X.json → triton_3_1_0/E=8,N=1792,device_name=AMD_Instinct_MI325X.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=8,N=1792,device_name=AMD_Radeon_Graphics.json → triton_3_1_0/E=8,N=1792,device_name=AMD_Radeon_Graphics.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=8,N=1792,device_name=NVIDIA_A100-SXM4-40GB.json → triton_3_1_0/E=8,N=1792,device_name=NVIDIA_A100-SXM4-40GB.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=8,N=1792,device_name=NVIDIA_A100-SXM4-80GB.json → triton_3_1_0/E=8,N=1792,device_name=NVIDIA_A100-SXM4-80GB.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=8,N=1792,device_name=NVIDIA_H100_80GB_HBM3.json → triton_3_1_0/E=8,N=1792,device_name=NVIDIA_H100_80GB_HBM3.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=8,N=1792,device_name=NVIDIA_H200,dtype=fp8_w8a8.json → triton_3_1_0/E=8,N=1792,device_name=NVIDIA_H200,dtype=fp8_w8a8.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=8,N=1792,device_name=NVIDIA_H200.json → triton_3_1_0/E=8,N=1792,device_name=NVIDIA_H200.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=8,N=2048,device_name=NVIDIA_A100-SXM4-80GB.json → triton_3_1_0/E=8,N=2048,device_name=NVIDIA_A100-SXM4-80GB.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=8,N=2048,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8.json → triton_3_1_0/E=8,N=2048,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=8,N=2048,device_name=NVIDIA_H100_80GB_HBM3.json → triton_3_1_0/E=8,N=2048,device_name=NVIDIA_H100_80GB_HBM3.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=8,N=2048,device_name=NVIDIA_H200,dtype=fp8_w8a8.json → triton_3_1_0/E=8,N=2048,device_name=NVIDIA_H200,dtype=fp8_w8a8.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=8,N=2048,device_name=NVIDIA_H200.json → triton_3_1_0/E=8,N=2048,device_name=NVIDIA_H200.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=8,N=3584,device_name=AMD_Instinct_MI300X.json → triton_3_1_0/E=8,N=3584,device_name=AMD_Instinct_MI300X.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=8,N=3584,device_name=AMD_Instinct_MI325X.json → triton_3_1_0/E=8,N=3584,device_name=AMD_Instinct_MI325X.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=8,N=3584,device_name=AMD_Radeon_Graphics.json → triton_3_1_0/E=8,N=3584,device_name=AMD_Radeon_Graphics.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=8,N=3584,device_name=NVIDIA_A100-SXM4-40GB.json → triton_3_1_0/E=8,N=3584,device_name=NVIDIA_A100-SXM4-40GB.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=8,N=3584,device_name=NVIDIA_A100-SXM4-80GB.json → triton_3_1_0/E=8,N=3584,device_name=NVIDIA_A100-SXM4-80GB.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=8,N=3584,device_name=NVIDIA_GeForce_RTX_4090,dtype=fp8_w8a8.json → triton_3_1_0/E=8,N=3584,device_name=NVIDIA_GeForce_RTX_4090,dtype=fp8_w8a8.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=8,N=3584,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8.json → triton_3_1_0/E=8,N=3584,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=8,N=3584,device_name=NVIDIA_H100_80GB_HBM3.json → triton_3_1_0/E=8,N=3584,device_name=NVIDIA_H100_80GB_HBM3.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=8,N=3584,device_name=NVIDIA_H200,dtype=fp8_w8a8.json → triton_3_1_0/E=8,N=3584,device_name=NVIDIA_H200,dtype=fp8_w8a8.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=8,N=3584,device_name=NVIDIA_H200.json → triton_3_1_0/E=8,N=3584,device_name=NVIDIA_H200.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=8,N=3584,device_name=NVIDIA_L40S.json → triton_3_1_0/E=8,N=3584,device_name=NVIDIA_L40S.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=8,N=4096,device_name=AMD_Instinct_MI300X,dtype=fp8_w8a8.json → triton_3_1_0/E=8,N=4096,device_name=AMD_Instinct_MI300X,dtype=fp8_w8a8.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=8,N=4096,device_name=AMD_Instinct_MI325X,dtype=fp8_w8a8.json → triton_3_1_0/E=8,N=4096,device_name=AMD_Instinct_MI325X,dtype=fp8_w8a8.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=8,N=4096,device_name=AMD_Radeon_Graphics,dtype=fp8_w8a8.json → triton_3_1_0/E=8,N=4096,device_name=AMD_Radeon_Graphics,dtype=fp8_w8a8.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=8,N=4096,device_name=NVIDIA_A100-SXM4-80GB.json → triton_3_1_0/E=8,N=4096,device_name=NVIDIA_A100-SXM4-80GB.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=8,N=4096,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8.json → triton_3_1_0/E=8,N=4096,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=8,N=4096,device_name=NVIDIA_H100_80GB_HBM3.json → triton_3_1_0/E=8,N=4096,device_name=NVIDIA_H100_80GB_HBM3.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=8,N=4096,device_name=NVIDIA_H200,dtype=fp8_w8a8.json → triton_3_1_0/E=8,N=4096,device_name=NVIDIA_H200,dtype=fp8_w8a8.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=8,N=4096,device_name=NVIDIA_H200.json → triton_3_1_0/E=8,N=4096,device_name=NVIDIA_H200.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=8,N=7168,device_name=AMD_Instinct_MI300X.json → triton_3_1_0/E=8,N=7168,device_name=AMD_Instinct_MI300X.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=8,N=7168,device_name=AMD_Instinct_MI325X.json → triton_3_1_0/E=8,N=7168,device_name=AMD_Instinct_MI325X.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=8,N=7168,device_name=AMD_Radeon_Graphics.json → triton_3_1_0/E=8,N=7168,device_name=AMD_Radeon_Graphics.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=8,N=7168,device_name=NVIDIA_A100-SXM4-80GB.json → triton_3_1_0/E=8,N=7168,device_name=NVIDIA_A100-SXM4-80GB.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=8,N=7168,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8.json → triton_3_1_0/E=8,N=7168,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=8,N=7168,device_name=NVIDIA_H100_80GB_HBM3.json → triton_3_1_0/E=8,N=7168,device_name=NVIDIA_H100_80GB_HBM3.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=8,N=7168,device_name=NVIDIA_H200,dtype=fp8_w8a8.json → triton_3_1_0/E=8,N=7168,device_name=NVIDIA_H200,dtype=fp8_w8a8.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=8,N=7168,device_name=NVIDIA_H200.json → triton_3_1_0/E=8,N=7168,device_name=NVIDIA_H200.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=8,N=8192,device_name=AMD_Instinct_MI300X,dtype=fp8_w8a8.json → triton_3_1_0/E=8,N=8192,device_name=AMD_Instinct_MI300X,dtype=fp8_w8a8.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=8,N=8192,device_name=AMD_Instinct_MI325X,dtype=fp8_w8a8.json → triton_3_1_0/E=8,N=8192,device_name=AMD_Instinct_MI325X,dtype=fp8_w8a8.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=8,N=8192,device_name=AMD_Radeon_Graphics,dtype=fp8_w8a8.json → triton_3_1_0/E=8,N=8192,device_name=AMD_Radeon_Graphics,dtype=fp8_w8a8.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=8,N=8192,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8.json → triton_3_1_0/E=8,N=8192,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=8,N=8192,device_name=NVIDIA_H200,dtype=fp8_w8a8.json → triton_3_1_0/E=8,N=8192,device_name=NVIDIA_H200,dtype=fp8_w8a8.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=128,N=192,device_name=NVIDIA_A800-SXM4-80GB.json → triton_3_2_0/E=128,N=192,device_name=NVIDIA_A800-SXM4-80GB.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=128,N=192,device_name=NVIDIA_H100_80GB_HBM3.json → triton_3_2_0/E=128,N=192,device_name=NVIDIA_H100_80GB_HBM3.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=128,N=192,device_name=NVIDIA_H20.json → triton_3_2_0/E=128,N=192,device_name=NVIDIA_H20.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=128,N=192,device_name=NVIDIA_H200.json → triton_3_2_0/E=128,N=192,device_name=NVIDIA_H200.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=128,N=384,device_name=NVIDIA_H100_80GB_HBM3.json → triton_3_2_0/E=128,N=384,device_name=NVIDIA_H100_80GB_HBM3.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=128,N=384,device_name=NVIDIA_H20,dtype=fp8_w8a8,block_shape=[128, 128].json → triton_3_2_0/E=128,N=384,device_name=NVIDIA_H20,dtype=fp8_w8a8,block_shape=[128, 128].json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=128,N=384,device_name=NVIDIA_H20.json → triton_3_2_0/E=128,N=384,device_name=NVIDIA_H20.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=128,N=384,device_name=NVIDIA_H200,dtype=fp8_w8a8,block_shape=[128, 128].json → triton_3_2_0/E=128,N=384,device_name=NVIDIA_H200,dtype=fp8_w8a8,block_shape=[128, 128].json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=128,N=384,device_name=NVIDIA_H200.json → triton_3_2_0/E=128,N=384,device_name=NVIDIA_H200.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=128,N=512,device_name=NVIDIA_H100_80GB_HBM3.json → triton_3_2_0/E=128,N=512,device_name=NVIDIA_H100_80GB_HBM3.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=128,N=768,device_name=NVIDIA_A800-SXM4-80GB.json → triton_3_2_0/E=128,N=768,device_name=NVIDIA_A800-SXM4-80GB.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=128,N=768,device_name=NVIDIA_H100_80GB_HBM3.json → triton_3_2_0/E=128,N=768,device_name=NVIDIA_H100_80GB_HBM3.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=128,N=768,device_name=NVIDIA_H20,dtype=fp8_w8a8,block_shape=[128, 128].json → triton_3_2_0/E=128,N=768,device_name=NVIDIA_H20,dtype=fp8_w8a8,block_shape=[128, 128].json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=128,N=768,device_name=NVIDIA_H20.json → triton_3_2_0/E=128,N=768,device_name=NVIDIA_H20.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=128,N=768,device_name=NVIDIA_H200,dtype=fp8_w8a8,block_shape=[128, 128].json → triton_3_2_0/E=128,N=768,device_name=NVIDIA_H200,dtype=fp8_w8a8,block_shape=[128, 128].json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=128,N=768,device_name=NVIDIA_H200.json → triton_3_2_0/E=128,N=768,device_name=NVIDIA_H200.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=128,N=96,device_name=NVIDIA_H20.json → triton_3_2_0/E=128,N=96,device_name=NVIDIA_H20.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=264,N=128,device_name=NVIDIA_A800-SXM4-80GB,dtype=int8_w8a8.json → triton_3_2_0/E=264,N=128,device_name=NVIDIA_A800-SXM4-80GB,dtype=int8_w8a8.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=264,N=256,device_name=NVIDIA_B200,dtype=fp8_w8a8,block_shape=[128, 128].json → triton_3_2_0/E=264,N=256,device_name=NVIDIA_B200,dtype=fp8_w8a8,block_shape=[128, 128].json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=264,N=256,device_name=NVIDIA_H20,dtype=fp8_w8a8,block_shape=[128, 128].json → triton_3_2_0/E=264,N=256,device_name=NVIDIA_H20,dtype=fp8_w8a8,block_shape=[128, 128].json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=264,N=256,device_name=NVIDIA_H200,dtype=fp8_w8a8,block_shape=[128, 128].json → triton_3_2_0/E=264,N=256,device_name=NVIDIA_H200,dtype=fp8_w8a8,block_shape=[128, 128].json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=272,N=128,device_name=NVIDIA_A800-SXM4-80GB,dtype=int8_w8a8.json → triton_3_2_0/E=272,N=128,device_name=NVIDIA_A800-SXM4-80GB,dtype=int8_w8a8.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=272,N=128,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8,block_shape=[128, 128].json → triton_3_2_0/E=272,N=128,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8,block_shape=[128, 128].json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=272,N=128,device_name=NVIDIA_H20,dtype=fp8_w8a8,block_shape=[128, 128].json → triton_3_2_0/E=272,N=128,device_name=NVIDIA_H20,dtype=fp8_w8a8,block_shape=[128, 128].json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=272,N=64,device_name=NVIDIA_A800-SXM4-80GB.json → triton_3_2_0/E=272,N=64,device_name=NVIDIA_A800-SXM4-80GB.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=288,N=64,device_name=NVIDIA_A800-SXM4-80GB.json → triton_3_2_0/E=288,N=64,device_name=NVIDIA_A800-SXM4-80GB.json} +0 -0
- /sglang/srt/models/{xiaomi_mimo.py → mimo.py} +0 -0
- {sglang-0.4.6.post4.dist-info → sglang-0.4.7.dist-info}/licenses/LICENSE +0 -0
- {sglang-0.4.6.post4.dist-info → sglang-0.4.7.dist-info}/top_level.txt +0 -0
sglang/srt/managers/scheduler.py
CHANGED
@@ -24,6 +24,7 @@ from collections import defaultdict, deque
 from concurrent import futures
 from dataclasses import dataclass
 from http import HTTPStatus
+from pathlib import Path
 from types import SimpleNamespace
 from typing import Dict, List, Optional, Tuple, Union
 
@@ -35,20 +36,26 @@ from torch.distributed import barrier
 
 from sglang.global_config import global_config
 from sglang.srt.configs.model_config import ModelConfig
-from sglang.srt.constrained.base_grammar_backend import create_grammar_backend
+from sglang.srt.constrained.base_grammar_backend import (
+    INVALID_GRAMMAR_OBJ,
+    create_grammar_backend,
+)
 from sglang.srt.disaggregation.decode import (
     DecodePreallocQueue,
     DecodeTransferQueue,
     SchedulerDisaggregationDecodeMixin,
 )
+from sglang.srt.disaggregation.kv_events import EventPublisherFactory, KVEventBatch
 from sglang.srt.disaggregation.prefill import (
     PrefillBootstrapQueue,
     SchedulerDisaggregationPrefillMixin,
 )
 from sglang.srt.disaggregation.utils import (
     DisaggregationMode,
+    MetadataBuffers,
     ReqToMetadataIdxAllocator,
     TransferBackend,
+    prepare_abort,
 )
 from sglang.srt.distributed import get_pp_group, get_world_group
 from sglang.srt.hf_transformers_utils import (
@@ -58,7 +65,9 @@ from sglang.srt.hf_transformers_utils import (
 )
 from sglang.srt.layers.dp_attention import compute_dp_attention_world_info
 from sglang.srt.layers.logits_processor import LogitsProcessorOutput
-from sglang.srt.managers.expert_distribution import ExpertDistributionRecorder
+from sglang.srt.managers.expert_distribution import (
+    get_global_expert_distribution_recorder,
+)
 from sglang.srt.managers.io_struct import (
     AbortReq,
     CloseSessionReqInput,
@@ -97,6 +106,7 @@ from sglang.srt.managers.io_struct import (
     UpdateWeightsFromTensorReqInput,
     UpdateWeightsFromTensorReqOutput,
 )
+from sglang.srt.managers.mm_utils import init_embedding_cache
 from sglang.srt.managers.schedule_batch import (
     FINISH_ABORT,
     MultimodalInputs,
@@ -125,12 +135,14 @@ from sglang.srt.reasoning_parser import ReasoningParser
 from sglang.srt.server_args import PortArgs, ServerArgs
 from sglang.srt.speculative.spec_info import SpeculativeAlgorithm
 from sglang.srt.torch_memory_saver_adapter import TorchMemorySaverAdapter
+from sglang.srt.two_batch_overlap import TboDPAttentionPreparer
 from sglang.srt.utils import (
+    DeepEPMode,
     DynamicGradMode,
     broadcast_pyobj,
     configure_logger,
-    crash_on_warnings,
     disable_request_logging,
+    get_available_gpu_memory,
     get_bool_env_var,
     get_zmq_socket,
     kill_itself_when_parent_died,
@@ -142,8 +154,6 @@ from sglang.srt.utils import (
 )
 from sglang.utils import TypeBasedDispatcher, get_exception_traceback
 
-expert_distribution_recorder = ExpertDistributionRecorder()
-
 logger = logging.getLogger(__name__)
 
 # Test retract decode for debugging purposes
@@ -198,6 +208,7 @@ class Scheduler(
         self.enable_overlap = not server_args.disable_overlap_schedule
         self.skip_tokenizer_init = server_args.skip_tokenizer_init
         self.enable_metrics = server_args.enable_metrics
+        self.enable_kv_cache_events = server_args.kv_events_config is not None
         self.stream_interval = server_args.stream_interval
         self.spec_algorithm = SpeculativeAlgorithm.from_string(
             server_args.speculative_algorithm
@@ -205,8 +216,6 @@ class Scheduler(
         self.gpu_id = gpu_id
         self.enable_hierarchical_cache = server_args.enable_hierarchical_cache
         self.page_size = server_args.page_size
-
-        # Distributed rank info
         self.dp_size = server_args.dp_size
         self.attn_tp_rank, self.attn_tp_size, self.attn_dp_rank = (
             compute_dp_attention_world_info(
@@ -326,12 +335,16 @@ class Scheduler(
 
         # Print debug info
         if tp_rank == 0:
+            avail_mem = get_available_gpu_memory(
+                self.device, self.gpu_id, empty_cache=False
+            )
             logger.info(
                 f"max_total_num_tokens={self.max_total_num_tokens}, "
                 f"chunked_prefill_size={server_args.chunked_prefill_size}, "
                 f"max_prefill_tokens={self.max_prefill_tokens}, "
                 f"max_running_requests={self.max_running_requests}, "
-                f"context_len={self.model_config.context_len}"
+                f"context_len={self.model_config.context_len}, "
+                f"available_gpu_mem={avail_mem:.2f} GB"
            )
 
         # Init memory pool and cache
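The startup log gains an `available_gpu_mem` field, read via sglang's own `get_available_gpu_memory` helper. A rough stand-alone equivalent of that number, assuming a CUDA device (the helper itself also covers non-CUDA backends):

```python
import torch

# Free device memory in GB, comparable to the new "available_gpu_mem" field.
# torch.cuda.mem_get_info returns (free_bytes, total_bytes) for the device.
free_bytes, total_bytes = torch.cuda.mem_get_info()
print(f"available_gpu_mem={free_bytes / 1024**3:.2f} GB")
```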
@@ -349,12 +362,13 @@ class Scheduler(
         self.forward_ct_decode = 0
         self.num_generated_tokens = 0
         self.num_prefill_tokens = 0
-        self.last_decode_stats_tic = time.time()
-        self.last_prefill_stats_tic = time.time()
+        self.last_decode_stats_tic = time.perf_counter()
+        self.last_prefill_stats_tic = time.perf_counter()
         self.return_health_check_ct = 0
         self.current_stream = torch.get_device_module(self.device).current_stream()
         if self.device == "cpu":
             self.current_stream.synchronize = lambda: None  # No-op for CPU
+        self.forward_sleep_time = None
 
         # Init session info
         self.sessions: Dict[str, Session] = {}
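This release moves the scheduler's interval timestamps from `time.time()` to `time.perf_counter()`. Since these values feed throughput and latency stats, the monotonic, high-resolution `perf_counter` clock is the safer choice: `time.time()` can jump (e.g. under NTP adjustment), which could yield negative or skewed intervals. A minimal illustration of the pattern:

```python
import time

start = time.perf_counter()
time.sleep(0.1)
# perf_counter() is monotonic and high-resolution, so the measured gap can
# never go negative and is unaffected by wall-clock (NTP) adjustments.
gap_latency = time.perf_counter() - start
print(f"gap_latency={gap_latency:.4f}s")
```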
@@ -416,13 +430,20 @@ class Scheduler(
         self.torch_profiler = None
         self.torch_profiler_output_dir: Optional[str] = None
         self.profiler_activities: Optional[List[str]] = None
-        self.profiler_id: Optional[str] = None
+        self.profile_id: Optional[str] = None
         self.profiler_target_forward_ct: Optional[int] = None
-
-        self.forward_sleep_time = None
+        self.profiler_target_prefill_ct: Optional[int] = None
+        self.profiler_target_decode_ct: Optional[int] = None
+        self.profiler_prefill_ct: Optional[int] = None
+        self.profiler_decode_ct: Optional[int] = None
+        self.profile_by_stage: bool = False
+        self.profile_steps: Optional[int] = None
+        self.profile_in_progress: bool = False
+        self.rpd_profiler = None
 
         # Init metrics stats
         self.init_metrics()
+        self.init_kv_events(server_args.kv_events_config)
 
         # Init request dispatcher
         self._request_dispatcher = TypeBasedDispatcher(
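The request dispatcher built here routes each incoming IPC message to a handler based on its type. A minimal sketch of that idea (the real `TypeBasedDispatcher` lives in `sglang.utils`; its exact interface may differ):

```python
from typing import Any, Callable, List, Tuple

class TypeBasedDispatcher:
    """Route an object to the handler registered for its type (sketch)."""

    def __init__(self, mapping: List[Tuple[type, Callable[[Any], Any]]]):
        self._mapping = mapping

    def __call__(self, obj: Any) -> Any:
        # Linear scan with isinstance keeps subclass dispatch working.
        for ty, handler in self._mapping:
            if isinstance(obj, ty):
                return handler(obj)
        raise ValueError(f"Invalid object: {obj}")
```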
@@ -516,6 +537,7 @@ class Scheduler(
             token_to_kv_pool_allocator=self.token_to_kv_pool_allocator,
             page_size=self.page_size,
             disable=server_args.disable_radix_cache,
+            enable_kv_cache_events=self.enable_kv_cache_events,
         )
 
         self.decode_mem_cache_buf_multiplier = (
@@ -548,6 +570,12 @@ class Scheduler(
             },
         )
 
+    def init_kv_events(self, kv_events_config: Optional[str]):
+        if self.enable_kv_cache_events:
+            self.kv_event_publisher = EventPublisherFactory.create(
+                kv_events_config, self.attn_dp_rank
+            )
+
     def init_disaggregation(self):
         self.transfer_backend = TransferBackend(
             self.server_args.disaggregation_transfer_backend
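Together with the `_publish_kv_events()` calls added to the logging paths below, the new plumbing follows a guard-then-publish pattern: a publisher is created only when `kv_events_config` is set, and publishing is a cheap no-op otherwise. A sketch of the pattern; the publisher class below is an illustrative stand-in, not the real `EventPublisherFactory`/`KVEventBatch` API from `sglang.srt.disaggregation.kv_events`:

```python
from typing import Any, List, Optional


class PrintPublisher:
    """Illustrative stand-in for whatever EventPublisherFactory.create returns."""

    def publish(self, events: List[Any]) -> None:
        print(f"publishing {len(events)} KV cache events")


class KVEventsSketch:
    def __init__(self, kv_events_config: Optional[str]):
        # Mirrors `enable_kv_cache_events = server_args.kv_events_config is not None`.
        self.enable_kv_cache_events = kv_events_config is not None
        self.kv_event_publisher = (
            PrintPublisher() if self.enable_kv_cache_events else None
        )
        self._pending: List[Any] = []

    def _publish_kv_events(self) -> None:
        # Called from the stats-logging paths; no-op when the feature is off.
        if not self.enable_kv_cache_events:
            return
        events, self._pending = self._pending, []
        if events:
            self.kv_event_publisher.publish(events)
```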
@@ -560,29 +588,28 @@ class Scheduler(
         req_to_metadata_buffer_idx_allocator = ReqToMetadataIdxAllocator(
             buffer_size
         )
-
-        # A list of metadata buffers. The shape is (b, metadata_size) where
-        # b corresponds to a max running requests. The last shape * dtype.itemsize
-        # should be larger than 64 bytes to work with RDMA, so we pad it.
-        output_id_buffer = torch.zeros(
-            (buffer_size, 16), dtype=aux_dtype, device="cpu"
-        )
-        metadata_buffers = [output_id_buffer]
+        self.disagg_metadata_buffers = MetadataBuffers(buffer_size)
 
         # The decode requests polling kv cache
         self.disagg_decode_transfer_queue = DecodeTransferQueue(
             gloo_group=self.attn_tp_cpu_group,
             req_to_metadata_buffer_idx_allocator=req_to_metadata_buffer_idx_allocator,
-            metadata_buffers=metadata_buffers,
+            metadata_buffers=self.disagg_metadata_buffers,
+            scheduler=self,
+            tree_cache=self.tree_cache,
         )
 
         # The decode requests pending for pre-allocation
         self.disagg_decode_prealloc_queue = DecodePreallocQueue(
             req_to_token_pool=self.req_to_token_pool,
             token_to_kv_pool_allocator=self.token_to_kv_pool_allocator,
+            draft_token_to_kv_pool=(
+                None
+                if self.draft_worker is None
+                else self.draft_worker.model_runner.token_to_kv_pool
+            ),
             req_to_metadata_buffer_idx_allocator=req_to_metadata_buffer_idx_allocator,
-            metadata_buffers=metadata_buffers,
-            aux_dtype=aux_dtype,
+            metadata_buffers=self.disagg_metadata_buffers,
             scheduler=self,
             transfer_queue=self.disagg_decode_transfer_queue,
             tree_cache=self.tree_cache,
@@ -602,20 +629,17 @@ class Scheduler(
         req_to_metadata_buffer_idx_allocator = ReqToMetadataIdxAllocator(
             buffer_size
         )
-
-        # A list of metadata buffers. The shape is (b, metadata_size) where
-        # b corresponds to a max running requests. The last shape * dtype.itemsize
-        # should be larger than 64 bytes to work with RDMA, so we pad it.
-        output_id_buffer = torch.zeros(
-            (buffer_size, 16), dtype=aux_dtype, device="cpu"
-        )
-        metadata_buffers = [output_id_buffer]
+        self.disagg_metadata_buffers = MetadataBuffers(buffer_size)
 
         self.disagg_prefill_bootstrap_queue = PrefillBootstrapQueue(
             token_to_kv_pool=self.token_to_kv_pool_allocator.get_kvcache(),
+            draft_token_to_kv_pool=(
+                None
+                if self.draft_worker is None
+                else self.draft_worker.model_runner.token_to_kv_pool
+            ),
             req_to_metadata_buffer_idx_allocator=req_to_metadata_buffer_idx_allocator,
-            metadata_buffers=metadata_buffers,
-            aux_dtype=aux_dtype,
+            metadata_buffers=self.disagg_metadata_buffers,
             tp_rank=self.tp_rank,
             tp_size=self.tp_size,
             bootstrap_port=self.server_args.disaggregation_bootstrap_port,
@@ -925,9 +949,22 @@ class Scheduler(
             bootstrap_host=recv_req.bootstrap_host,
             bootstrap_port=recv_req.bootstrap_port,
             bootstrap_room=recv_req.bootstrap_room,
+            data_parallel_rank=recv_req.data_parallel_rank,
         )
         req.tokenizer = self.tokenizer
 
+        if self.disaggregation_mode != DisaggregationMode.NULL:
+            # Invalid request for disaggregated mode
+            if recv_req.bootstrap_room is None:
+                error_msg = (
+                    f"Invalid request: Disaggregated request received without "
+                    f"boostrap room id. {req.rid=}"
+                )
+                logger.error(error_msg)
+                prepare_abort(req, error_msg)
+                self.stream_output([req], req.return_logprob)
+                return
+
         if (
             recv_req.session_params is not None
             and recv_req.session_params.id is not None
@@ -955,29 +992,23 @@ class Scheduler(
             req.extend_image_inputs(image_inputs)
 
             if len(req.origin_input_ids) >= self.max_req_input_len:
-                error_msg = (
-                    "Multimodal prompt is too long after expanding multimodal tokens. "
-                    f"After expanding {len(req.origin_input_ids_unpadded)=} => {len(req.origin_input_ids)} >= {self.max_req_input_len}."
-                )
-                logger.error(error_msg)
-                req.origin_input_ids = [0]
-                req.multimodal_inputs = None
-                req.sampling_params.max_new_tokens = 0
-                req.finished_reason = FINISH_ABORT(
-                    error_msg, HTTPStatus.BAD_REQUEST, "BadRequestError"
+                req.set_finish_with_abort(
+                    error_msg=(
+                        "Multimodal prompt is too long after expanding multimodal tokens. "
+                        f"After expanding {len(req.origin_input_ids_unpadded)=} => {len(req.origin_input_ids)} >= {self.max_req_input_len}."
+                    )
                 )
                 self._add_request_to_queue(req)
                 return
 
-        # Validate prompts length
+        # Validate prompt length
         error_msg = validate_input_length(
             req,
             self.max_req_input_len,
             self.server_args.allow_auto_truncate,
         )
         if error_msg:
-            req.origin_input_ids = [0]
-            req.sampling_params.max_new_tokens = 0
+            req.set_finish_with_abort(error_msg)
             self._add_request_to_queue(req)
             return
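Several inline abort sequences are folded into one `Req.set_finish_with_abort(error_msg)` helper, and aborted requests now flow through `_add_request_to_queue` so the error reaches the client via the normal output path. Based on the inline code it replaces, the helper's effect is roughly the following (a self-contained sketch with stand-in types, not the actual method in `schedule_batch.py`):

```python
from dataclasses import dataclass
from http import HTTPStatus
from typing import Any, List, Optional


@dataclass
class FinishAbort:
    """Stand-in for sglang's FINISH_ABORT finish reason."""
    message: str
    status_code: HTTPStatus
    err_type: str


@dataclass
class Req:
    """Minimal stand-in for sglang's Req."""
    origin_input_ids: List[int]
    multimodal_inputs: Optional[Any] = None
    max_new_tokens: int = 128
    finished_reason: Optional[FinishAbort] = None

    def set_finish_with_abort(self, error_msg: str) -> None:
        # Mirrors the inline abort code this release removes.
        self.origin_input_ids = [0]    # drop the (possibly oversized) prompt
        self.multimodal_inputs = None  # release any multimodal payloads
        self.max_new_tokens = 0        # generate nothing for this request
        self.finished_reason = FinishAbort(
            error_msg, HTTPStatus.BAD_REQUEST, "BadRequestError"
        )
```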
@@ -989,12 +1020,9 @@ class Scheduler(
             req.logprob_start_len = recv_req.logprob_start_len
 
         if req.logprob_start_len >= len(req.origin_input_ids):
-            req.finished_reason = FINISH_ABORT(
-                f"logprob_start_len, ({req.logprob_start_len}) is higher than the number of input tokens ({len(req.origin_input_ids)}). Request with a lower logprob_start_len.",
-                HTTPStatus.BAD_REQUEST,
-                "BadRequestError",
-            )
+            error_msg = f"{req.logprob_start_len=} is higher than the number of input tokens {len(req.origin_input_ids)=}. Please use a smaller logprob_start_len."
             req.logprob_start_len = len(req.origin_input_ids) - 1
+            req.set_finish_with_abort(error_msg)
             self._add_request_to_queue(req)
             return
 
@@ -1031,15 +1059,19 @@ class Scheduler(
             if not cache_hit:
                 req.grammar_key = key
                 add_to_grammar_queue = True
+            else:
+                if value is INVALID_GRAMMAR_OBJ:  # We hit a cached invalid grammar.
+                    error_msg = f"Invalid grammar request with cache hit: {key=}"
+                    req.set_finish_with_abort(error_msg)
 
         if add_to_grammar_queue:
-            req.queue_time_start = time.time()
+            req.queue_time_start = time.perf_counter()
             self.grammar_queue.append(req)
         else:
             self._add_request_to_queue(req)
 
     def _add_request_to_queue(self, req: Req):
-        req.queue_time_start = time.time()
+        req.queue_time_start = time.perf_counter()
         if self.disaggregation_mode == DisaggregationMode.PREFILL:
             self.disagg_prefill_bootstrap_queue.add(req)
         elif self.disaggregation_mode == DisaggregationMode.DECODE:
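Grammar compilation results are now cached even when compilation fails: a sentinel `INVALID_GRAMMAR_OBJ` marks the key as known-bad, so a repeated request with the same broken grammar is aborted immediately instead of recompiled. A generic sketch of this negative-caching pattern (names are illustrative, not the actual grammar-backend API):

```python
from typing import Any, Callable, Dict

INVALID_GRAMMAR_OBJ = object()  # sentinel marking a known-bad cache entry
_grammar_cache: Dict[str, Any] = {}


def get_cached_grammar(key: str, compile_fn: Callable[[str], Any]) -> Any:
    if key in _grammar_cache:
        value = _grammar_cache[key]
        if value is INVALID_GRAMMAR_OBJ:  # we hit a cached invalid grammar
            raise ValueError(f"Invalid grammar request with cache hit: {key=}")
        return value
    try:
        value = compile_fn(key)
    except Exception:
        _grammar_cache[key] = INVALID_GRAMMAR_OBJ  # cache the failure too
        raise
    _grammar_cache[key] = value
    return value
```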
@@ -1047,8 +1079,11 @@ class Scheduler(
|
|
1047
1079
|
else:
|
1048
1080
|
self.waiting_queue.append(req)
|
1049
1081
|
|
1050
|
-
def _extend_requests_to_queue(self, reqs: List[Req]
|
1051
|
-
if self.disaggregation_mode == DisaggregationMode.
|
1082
|
+
def _extend_requests_to_queue(self, reqs: List[Req]):
|
1083
|
+
if self.disaggregation_mode == DisaggregationMode.PREFILL:
|
1084
|
+
self.disagg_prefill_bootstrap_queue.extend(reqs)
|
1085
|
+
elif self.disaggregation_mode == DisaggregationMode.DECODE:
|
1086
|
+
# If this is a decode server, we put the request to the decode pending prealloc queue
|
1052
1087
|
self.disagg_decode_prealloc_queue.extend(reqs)
|
1053
1088
|
else:
|
1054
1089
|
self.waiting_queue.extend(reqs)
|
@@ -1075,19 +1110,13 @@ class Scheduler(
|
|
1075
1110
|
req.extend_image_inputs(image_inputs)
|
1076
1111
|
|
1077
1112
|
if len(req.origin_input_ids) >= self.max_req_input_len:
|
1078
|
-
|
1079
|
-
|
1080
|
-
|
1081
|
-
|
1082
|
-
|
1083
|
-
req.origin_input_ids = [0]
|
1084
|
-
req.multimodal_inputs = None
|
1085
|
-
req.sampling_params.max_new_tokens = 0
|
1086
|
-
req.finished_reason = FINISH_ABORT(
|
1087
|
-
error_msg, HTTPStatus.BAD_REQUEST, "BadRequestError"
|
1113
|
+
req.set_finish_with_abort(
|
1114
|
+
error_msg=(
|
1115
|
+
"Multimodal prompt is too long after expanding multimodal tokens. "
|
1116
|
+
f"After expanding {len(req.origin_input_ids_unpadded)=} => {len(req.origin_input_ids)} >= {self.max_req_input_len}."
|
1117
|
+
)
|
1088
1118
|
)
|
1089
|
-
|
1090
|
-
self.waiting_queue.append(req)
|
1119
|
+
self._add_request_to_queue(req)
|
1091
1120
|
return
|
1092
1121
|
|
1093
1122
|
# Validate prompts length
|
@@ -1110,8 +1139,8 @@ class Scheduler(
         can_run_list: List[Req],
         running_bs: int,
     ):
-        gap_latency = time.time() - self.last_prefill_stats_tic
-        self.last_prefill_stats_tic = time.time()
+        gap_latency = time.perf_counter() - self.last_prefill_stats_tic
+        self.last_prefill_stats_tic = time.perf_counter()
         self.last_input_throughput = self.num_prefill_tokens / gap_latency
         self.num_prefill_tokens = 0

@@ -1133,7 +1162,8 @@ class Scheduler(
         if self.disaggregation_mode == DisaggregationMode.PREFILL:
             f += f"#unbootstrapped-req: {len(self.disagg_prefill_bootstrap_queue.queue)}, "
             f += f"#queue-req: {len(self.waiting_queue)}, "
-            f += f"#transferring-req: {len(self.disagg_prefill_inflight_queue)} "
+            f += f"#transferring-req: {len(self.disagg_prefill_inflight_queue)}, "
+            f += f"time: {gap_latency:.2f} "
         else:
             f += f"#queue-req: {len(self.waiting_queue)}"

@@ -1155,14 +1185,15 @@ class Scheduler(
             self.stats.avg_request_queue_latency = total_queue_latency / num_new_seq

             self.metrics_collector.log_stats(self.stats)
+        self._publish_kv_events()

     def log_decode_stats(
         self, can_run_cuda_graph: bool, running_batch: ScheduleBatch = None
     ):
         batch = running_batch or self.running_batch

-        gap_latency = time.time() - self.last_decode_stats_tic
-        self.last_decode_stats_tic = time.time()
+        gap_latency = time.perf_counter() - self.last_decode_stats_tic
+        self.last_decode_stats_tic = time.perf_counter()
         self.last_gen_throughput = self.num_generated_tokens / gap_latency
         self.num_generated_tokens = 0
         num_running_reqs = len(batch.reqs)
@@ -1214,6 +1245,7 @@ class Scheduler(
             self.stats.num_grammar_queue_reqs = len(self.grammar_queue)
             self.stats.spec_accept_length = spec_accept_length
             self.metrics_collector.log_stats(self.stats)
+        self._publish_kv_events()

     def check_memory(self):
         available_size = (
@@ -1246,7 +1278,7 @@ class Scheduler(
         if (
             self.enable_metrics
             and self.attn_tp_rank == 0
-            and time.time() > self.metrics_collector.last_log_time + 30
+            and time.perf_counter() > self.metrics_collector.last_log_time + 30
         ):
             # During idle time, also collect metrics every 30 seconds.
             num_used = self.max_total_num_tokens - (
@@ -1261,6 +1293,7 @@ class Scheduler(
             self.stats.num_queue_reqs = len(self.waiting_queue)
             self.stats.num_grammar_queue_reqs = len(self.grammar_queue)
             self.metrics_collector.log_stats(self.stats)
+            self._publish_kv_events()

     def get_next_batch_to_run(self) -> Optional[ScheduleBatch]:
         # Merge the prefill batch into the running batch
@@ -1383,6 +1416,13 @@ class Scheduler(
                 self.running_batch.batch_is_full = True
                 break

+            if self.disaggregation_mode == DisaggregationMode.PREFILL:
+                # In prefill mode, prealloc queue and transfer queue can also take memory,
+                # so we need to check if the available size for the actual available size.
+                if len(adder.can_run_list) >= self.req_to_token_pool.available_size():
+                    self.running_batch.batch_is_full = True
+                    break
+
             req.init_next_round_input(
                 None if prefix_computed else self.tree_cache,
                 self.enable_hierarchical_cache,
@@ -1411,7 +1451,7 @@ class Scheduler(
         if self.enable_metrics:
             # only record queue time when enable_metrics is True to avoid overhead
             for req in can_run_list:
-                req.queue_time_end = time.time()
+                req.queue_time_end = time.perf_counter()

         self.waiting_queue = [
             x for x in self.waiting_queue if x not in set(can_run_list)
@@ -1484,7 +1524,7 @@ class Scheduler(
         self.new_token_ratio = new_token_ratio

         logger.info(
-            "Decode out of memory happened. "
+            "KV cache pool is full. Retract requests. "
             f"#retracted_reqs: {len(retracted_reqs)}, "
             f"#new_token_ratio: {old_ratio:.4f} -> {self.new_token_ratio:.4f}"
         )
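The reworded log line reports retraction volume and the token-ratio back-off. A toy version of the bookkeeping; the 1.5x update rule below is assumed for illustration only (the real ratio comes out of `retract_decode`):

```python
# Toy accounting for the retraction log line above; numbers are invented.
old_ratio, num_retracted = 0.30, 3
new_token_ratio = min(old_ratio * 1.5, 1.0)  # hypothetical update rule
print(
    "KV cache pool is full. Retract requests. "
    f"#retracted_reqs: {num_retracted}, "
    f"#new_token_ratio: {old_ratio:.4f} -> {new_token_ratio:.4f}"
)
```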
@@ -1508,13 +1548,8 @@ class Scheduler(
         """Run a batch."""
         self.forward_ct += 1

-        # Check profiler
-        if (
-            self.profiler_target_forward_ct
-            and self.profiler_target_forward_ct <= self.forward_ct
-        ):
-            self.stop_profile()
-
+        # Whether to run the profiler
+        self._profile_batch_predicate(batch)
         if self.forward_sleep_time is not None:
             logger.info(f"Scheduler.run_batch sleep {self.forward_sleep_time}s")
             time.sleep(self.forward_sleep_time)
@@ -1540,10 +1575,9 @@ class Scheduler(
                 num_accepted_tokens,
                 can_run_cuda_graph,
             ) = self.draft_worker.forward_batch_speculative_generation(batch)
-            self.spec_num_total_accepted_tokens += (
-                num_accepted_tokens + batch.batch_size()
-            )
-            self.spec_num_total_forward_ct += batch.batch_size()
+            bs = batch.batch_size()
+            self.spec_num_total_accepted_tokens += num_accepted_tokens + bs
+            self.spec_num_total_forward_ct += bs
             self.num_generated_tokens += num_accepted_tokens

             if self.pp_group.is_last_rank:
@@ -1617,6 +1651,9 @@ class Scheduler(
             disable_cuda_graph=self.server_args.disable_cuda_graph,
             spec_algorithm=self.spec_algorithm,
             speculative_num_draft_tokens=self.server_args.speculative_num_draft_tokens,
+            enable_two_batch_overlap=self.server_args.enable_two_batch_overlap,
+            enable_deepep_moe=self.server_args.enable_deepep_moe,
+            deepep_mode=DeepEPMode[self.server_args.deepep_mode],
         )

     @staticmethod
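`DeepEPMode[self.server_args.deepep_mode]` converts the string server argument into an enum member by name. A minimal sketch; the member names below are assumed for illustration:

```python
from enum import Enum

# Enum.__getitem__ maps a member name string to the member itself.
class DeepEPMode(Enum):
    normal = "normal"
    low_latency = "low_latency"
    auto = "auto"

print(DeepEPMode["auto"])  # DeepEPMode.auto
```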
@@ -1630,6 +1667,9 @@ class Scheduler(
         disable_cuda_graph: bool,
         spec_algorithm,
         speculative_num_draft_tokens,
+        enable_two_batch_overlap: bool,
+        enable_deepep_moe: bool,
+        deepep_mode: DeepEPMode,
     ):
         # Check if other DP workers have running batches
         if local_batch is None:
@@ -1665,17 +1705,26 @@ class Scheduler(
         is_extend_in_batch = (
             local_batch.forward_mode.is_extend() if local_batch else False
         )
+
+        tbo_preparer = TboDPAttentionPreparer()
+
         local_info = torch.tensor(
             [
                 num_tokens,
                 can_cuda_graph,
                 num_tokens_for_logprob,
                 is_extend_in_batch,
+                *tbo_preparer.prepare_all_gather(
+                    local_batch,
+                    deepep_mode,
+                    enable_deepep_moe,
+                    enable_two_batch_overlap,
+                ),
             ],
             dtype=torch.int64,
         )
         global_info = torch.empty(
-            (dp_size, attn_tp_size, 4),
+            (dp_size, attn_tp_size, 6),
             dtype=torch.int64,
         )
         torch.distributed.all_gather_into_tensor(
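The all-gathered metadata vector grows from 4 to 6 slots so the two-batch-overlap preparer can ride along in the same collective. A single-process sketch of the packing pattern; the trailing slots stand in for `prepare_all_gather`'s outputs:

```python
import torch

# Per-rank scalars are packed into one int64 vector so a single
# all_gather_into_tensor call syncs several fields at once.
num_tokens, can_cuda_graph, num_tokens_for_logprob, is_extend = 17, 1, 17, 1
local_info = torch.tensor(
    [num_tokens, can_cuda_graph, num_tokens_for_logprob, is_extend, 0, 0],
    dtype=torch.int64,
)
world_size = 1  # dp_size * attn_tp_size collapses to 1 in this sketch
global_info = torch.empty((world_size, 6), dtype=torch.int64)
global_info[0] = local_info  # with a process group: all_gather_into_tensor
print(global_info[:, 0].tolist())  # every rank's num_tokens
```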
@@ -1688,6 +1737,10 @@ class Scheduler(
         global_num_tokens_for_logprob = global_info[:, 0, 2].tolist()
         is_extend_in_batch = global_info[:, 0, 3].tolist()

+        tbo_split_seq_index, global_forward_mode = tbo_preparer.compute_output(
+            global_info[:, :, 4:6]
+        )
+
         if local_batch is None and max(global_num_tokens) > 0:
             local_batch = get_idle_batch()

@@ -1701,6 +1754,8 @@ class Scheduler(
             local_batch.global_num_tokens_for_logprob = (
                 global_num_tokens_for_logprob
             )
+            local_batch.tbo_split_seq_index = tbo_split_seq_index
+            local_batch.global_forward_mode = global_forward_mode

             # Check forward mode for cuda graph
             if not disable_cuda_graph:
@@ -1726,17 +1781,25 @@ class Scheduler(
         """Move requests whose grammar objects are ready from grammar_queue to waiting_queue."""

         num_ready_reqs = 0
-        num_abort_reqs = 0
+        num_timeout_reqs = 0
         for req in self.grammar_queue:
             try:
+                if req.finished():  # It is aborted by AbortReq
+                    num_ready_reqs += 1
+                    continue
                 req.grammar = req.grammar.result(timeout=0.03)
-                if req.grammar is INVALID_GRAMMAR_OBJ:
-                    req.set_finish_with_abort("Invalid grammar object.")
+                self.grammar_backend.set_cache(req.grammar_key, req.grammar.copy())
+                if req.grammar is INVALID_GRAMMAR_OBJ:
+                    req.set_finish_with_abort(
+                        f"Invalid grammar request: {req.grammar_key=}"
+                    )
                 num_ready_reqs += 1
             except futures._base.TimeoutError:
                 req.grammar_wait_ct += 1
+                # NOTE(lianmin): this timeout is the waiting time of the above line. It is
+                # not the waiting time from it enters the grammar queue.
                 if req.grammar_wait_ct > GRAMMAR_TIMEOUT / 0.03:
-                    num_abort_reqs = 1
+                    num_timeout_reqs = 1
                     break

         if self.server_args.enable_dp_attention:
@@ -1748,28 +1811,33 @@ class Scheduler(

         if tp_size > 1:
             # Sync across TP ranks to make sure they have the same number of ready requests
-            tensor = torch.tensor([num_ready_reqs, num_abort_reqs], dtype=torch.int32)
+            tensor = torch.tensor([num_ready_reqs, num_timeout_reqs], dtype=torch.int32)
             torch.distributed.all_reduce(
                 tensor, op=torch.distributed.ReduceOp.MAX, group=tp_group
             )
-            num_ready_reqs_max, num_abort_reqs_max = tensor.tolist()
+            num_ready_reqs_max, num_timeout_reqs_max = tensor.tolist()

             for i in range(num_ready_reqs, num_ready_reqs_max):
                 req = self.grammar_queue[i]
+                if req.finished():  # It is aborted by AbortReq
+                    continue
                 req.grammar = req.grammar.result()
-                if req.grammar is INVALID_GRAMMAR_OBJ:
-                    req.set_finish_with_abort("Invalid grammar object.")
+                self.grammar_backend.set_cache(req.grammar_key, req.grammar.copy())
+                if req.grammar is INVALID_GRAMMAR_OBJ:
+                    req.set_finish_with_abort(
+                        f"Invalid grammar request: {req.grammar_key=}"
+                    )
+        else:
+            num_ready_reqs_max = num_ready_reqs
+            num_timeout_reqs_max = num_timeout_reqs

-            for i in range(
-                num_ready_reqs_max, num_ready_reqs_max + num_abort_reqs_max
-            ):
-                req = self.grammar_queue[i]
-                req.grammar.cancel()
-                error_msg = f"Grammar preprocessing timed out for {req.grammar_key=}"
-                req.finished_reason = FINISH_ABORT(
-                    error_msg, HTTPStatus.BAD_REQUEST, "BadRequestError"
-                )
-            num_ready_reqs = num_ready_reqs_max + num_abort_reqs_max
+        for i in range(num_ready_reqs, num_ready_reqs + num_timeout_reqs_max):
+            req = self.grammar_queue[i]
+            req.grammar.cancel()
+            error_msg = f"Grammar preprocessing timed out for {req.grammar_key=}"
+            req.set_finish_with_abort(error_msg)
+            self.grammar_backend.set_cache(req.grammar_key, INVALID_GRAMMAR_OBJ)
+        num_ready_reqs = num_ready_reqs_max + num_timeout_reqs_max

         self._extend_requests_to_queue(self.grammar_queue[:num_ready_reqs])
         self.grammar_queue = self.grammar_queue[num_ready_reqs:]
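Timed-out grammar requests are now counted separately and synchronized across TP ranks with a MAX all-reduce, so every rank retires the same set of requests. A hedged sketch of that sync, falling back to the local values when no process group exists:

```python
import torch
import torch.distributed as dist

# Each rank may resolve a different number of grammar futures within its
# 30 ms polls; taking the element-wise MAX keeps the TP group in lockstep.
def sync_ready_counts(num_ready: int, num_timeout: int, group=None) -> tuple:
    tensor = torch.tensor([num_ready, num_timeout], dtype=torch.int32)
    if dist.is_available() and dist.is_initialized():
        dist.all_reduce(tensor, op=dist.ReduceOp.MAX, group=group)
    return tuple(tensor.tolist())

print(sync_ready_counts(2, 0))  # (2, 0) without a process group
```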
@@ -1784,10 +1852,10 @@ class Scheduler(
     def watchdog_thread(self):
         """A watch dog thread that will try to kill the server itself if one forward batch takes too long."""
         self.watchdog_last_forward_ct = 0
-        self.watchdog_last_time = time.time()
+        self.watchdog_last_time = time.perf_counter()

         while True:
-            current = time.time()
+            current = time.perf_counter()
             if self.cur_batch is not None:
                 if self.watchdog_last_forward_ct == self.forward_ct:
                     if current > self.watchdog_last_time + self.watchdog_timeout:
@@ -1856,6 +1924,27 @@ class Scheduler(
                 if_success = False
         return if_success

+    def get_load(self):
+        # TODO(lsyin): use dynamically maintained num_waiting_tokens
+        load = (
+            self.max_total_num_tokens
+            - self.token_to_kv_pool_allocator.available_size()
+            - self.tree_cache.evictable_size()
+        )
+        load += sum(len(req.origin_input_ids) for req in self.waiting_queue)
+        if self.disaggregation_mode == DisaggregationMode.PREFILL:
+            load += sum(
+                len(req.origin_input_ids)
+                for req in self.disagg_prefill_bootstrap_queue.queue
+            )
+        elif self.disaggregation_mode == DisaggregationMode.DECODE:
+            load += sum(
+                len(req.req.origin_input_ids)
+                for req in self.disagg_decode_prealloc_queue.queue
+            )
+
+        return load
+
     def get_internal_state(self, recv_req: GetInternalStateReq):
         ret = dict(global_server_args_dict)
         ret["last_gen_throughput"] = self.last_gen_throughput
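`get_load` approximates scheduler load as KV-pool tokens in use plus queued prompt tokens. A back-of-envelope version with invented numbers:

```python
# Tokens pinned in the KV pool (total minus free minus evictable cache)
# plus tokens still waiting to be prefilled. All numbers are made up.
max_total_num_tokens = 100_000
available_size = 80_000
evictable_size = 5_000
waiting_prompt_lens = [512, 1024]

load = max_total_num_tokens - available_size - evictable_size
load += sum(waiting_prompt_lens)
print(load)  # 16536
```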
@@ -1865,9 +1954,10 @@ class Scheduler(
         )
         if RECORD_STEP_TIME:
             ret["step_time_dict"] = self.step_time_dict
-        return GetInternalStateReqOutput(
-            internal_state=ret,
-        )
+
+        ret["load"] = self.get_load()
+
+        return GetInternalStateReqOutput(internal_state=ret)

     def set_internal_state(self, recv_req: SetInternalStateReq):
         server_args_dict = recv_req.server_args
@@ -1901,7 +1991,7 @@ class Scheduler(
             self.cum_spec_accept_length = self.cum_spec_accept_count = 0
         for k, v in server_args_dict.items():
             global_server_args_dict[k] = v
-        logger.info(f"Global server args updated!")
+        logger.info(f"Global server args updated! {global_server_args_dict=}")
         return SetInternalStateReqOutput(
             updated=True,
             server_args=global_server_args_dict,
@@ -1943,8 +2033,6 @@ class Scheduler(
         )

     def abort_request(self, recv_req: AbortReq):
-        # TODO(lmzheng): abort the requests in the grammar queue.
-
         # Delete requests in the waiting queue
         to_del = []
         for i, req in enumerate(self.waiting_queue):
@@ -1953,10 +2041,23 @@ class Scheduler(

         # Sort in reverse order to avoid index issues when deleting
         for i in reversed(to_del):
+            # Abort method 1: directly pop from the queue
+            # This only works for requests that have not started anything.
+            # We still need to send something back to TokenizerManager to clean up the state.
             req = self.waiting_queue.pop(i)
             self.send_to_tokenizer.send_pyobj(AbortReq(req.rid))
             logger.debug(f"Abort queued request. {req.rid=}")

+        # Delete the requests in the grammar queue
+        for req in self.grammar_queue:
+            # Abort method 2: call `set_finish_with_abort`
+            # The request will still run one prefill forward pass.
+            # In this case, we change the input_ids to be only one token to make this prefill cheap.
+            if req.rid.startswith(recv_req.rid):
+                logger.debug(f"Abort grammar queue request. {req.rid=}")
+                req.grammar.cancel()
+                req.set_finish_with_abort("Aborted by AbortReq.")
+
         # Delete requests in the running batch
         if self.cur_batch is self.running_batch or self.cur_batch is None:
             reqs = self.running_batch.reqs
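The three abort comments above describe distinct mechanisms depending on how far a request has progressed. A hypothetical stand-in class showing the flags each path touches (this `Req` is not sglang's class):

```python
class Req:
    def __init__(self, rid: str):
        self.rid = rid
        self.to_abort = False          # method 3: running batch
        self.finished_reason = None

    def set_finish_with_abort(self, msg: str):  # method 2: grammar queue
        self.finished_reason = ("abort", msg)

req = Req("abc123")
req.set_finish_with_abort("Aborted by AbortReq.")  # cheap one-token prefill
req.to_abort = True                                # cleaned up after a decode pass
# Method 1 (waiting queue) needs no flag: the request is popped before it starts.
print(req.finished_reason, req.to_abort)
```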
@@ -1965,6 +2066,9 @@ class Scheduler(

         for req in reqs:
             if req.rid.startswith(recv_req.rid) and not req.finished():
+                # Abort method 3: set `to_abort=True`
+                # The request will still run one decode forward pass.
+                # Then we reuse all existing code to clean up the KV cache allocation.
                 logger.debug(f"Abort running request. {req.rid=}")
                 req.to_abort = True

@@ -2044,46 +2148,86 @@ class Scheduler(

     def profile(self, recv_req: ProfileReq):
         if recv_req.type == ProfileReqType.START_PROFILE:
-            return self.start_profile(
-                recv_req.output_dir,
-                recv_req.num_steps,
-                recv_req.activities,
-                recv_req.with_stack,
-                recv_req.record_shapes,
-                recv_req.profile_id,
-            )
+            if recv_req.profile_by_stage:
+                return self.init_profile(
+                    recv_req.output_dir,
+                    recv_req.num_steps,
+                    recv_req.activities,
+                    recv_req.with_stack,
+                    recv_req.record_shapes,
+                    recv_req.profile_by_stage,
+                    recv_req.profile_id,
+                )
+            else:
+                self.init_profile(
+                    recv_req.output_dir,
+                    recv_req.num_steps,
+                    recv_req.activities,
+                    recv_req.with_stack,
+                    recv_req.record_shapes,
+                    recv_req.profile_by_stage,
+                    recv_req.profile_id,
+                )
+                return self.start_profile(True)
         else:
             return self.stop_profile()

-    def start_profile(
+    def init_profile(
         self,
         output_dir: Optional[str],
         num_steps: Optional[int],
         activities: Optional[List[str]],
         with_stack: Optional[bool],
         record_shapes: Optional[bool],
-        profile_id: str,
-    ) -> ProfileReqOutput:
-        if self.profiler_activities:
+        profile_by_stage: bool,
+        profile_id: str,
+    ) -> ProfileReqOutput:
+        if self.profile_in_progress:
             return ProfileReqOutput(
                 success=False,
                 message="Profiling is already in progress. Call /stop_profile first.",
             )

+        self.profile_by_stage = profile_by_stage
+
         if output_dir is None:
             output_dir = os.getenv("SGLANG_TORCH_PROFILER_DIR", "/tmp")
         if activities is None:
             activities = ["CPU", "GPU"]

         self.torch_profiler_output_dir = output_dir
+        self.torch_profiler_with_stack = with_stack
+        self.torch_profiler_record_shapes = record_shapes
         self.profiler_activities = activities
-        self.profiler_id = profile_id
+        self.profile_id = profile_id
+
+        if num_steps:
+            self.profile_steps = num_steps
+            if self.profile_by_stage:
+                self.profiler_target_prefill_ct = num_steps
+                self.profiler_target_decode_ct = num_steps
+                self.profiler_prefill_ct = 0
+                self.profiler_decode_ct = 0
+            else:
+                self.profiler_target_forward_ct = self.forward_ct + num_steps
+                # The caller will be notified when reaching profiler_target_forward_ct
+        else:
+            self.profiler_target_forward_ct = None
+
+        return ProfileReqOutput(success=True, message="Succeeded")
+
+    def start_profile(
+        self, stage: Optional[ForwardMode] = None
+    ) -> ProfileReqOutput | None:
+        stage_str = f" for {stage.__str__()}" if stage else ""
         logger.info(
-            "Profiling starts. Traces will be saved to: %s (with id %s)",
-            self.torch_profiler_output_dir,
-            self.profiler_id,
+            f"Profiling starts{stage_str}. Traces will be saved to: {self.torch_profiler_output_dir} (with profile id: {self.profile_id})",
         )

+        activities = self.profiler_activities
+        with_stack = self.torch_profiler_with_stack
+        record_shapes = self.torch_profiler_record_shapes
+
         activity_map = {
             "CPU": torch.profiler.ProfilerActivity.CPU,
             "GPU": torch.profiler.ProfilerActivity.CUDA,
@@ -2092,45 +2236,100 @@ class Scheduler(
             activity_map[a] for a in activities if a in activity_map
         ]

-        if torchprof_activities:
+        if "RPD" in activities:
+            from rpdTracerControl import rpdTracerControl
+
+            rpdTracerControl.skipCreate()
+
+            self.rpd_profile_path = os.path.join(
+                self.torch_profiler_output_dir,
+                "rpd-" + str(time.time()) + f"-TP-{self.tp_rank}" + ".trace.json.gz",
+            )
+
+            if self.tp_rank == 0:
+                import sqlite3
+
+                from rocpd.schema import RocpdSchema
+
+                if os.path.exists("trace.rpd"):
+                    os.unlink("trace.rpd")
+                schema = RocpdSchema()
+                connection = sqlite3.connect("trace.rpd")
+                schema.writeSchema(connection)
+                connection.commit()
+                del connection
+            torch.distributed.barrier(self.tp_cpu_group)
+
+            self.rpd_profiler = rpdTracerControl()
+            self.rpd_profiler.setPythonTrace(True)
+            self.rpd_profiler.start()
+            self.rpd_profiler.rangePush("", "rpd profile range", "")
+            self.profile_in_progress = True
+        elif torchprof_activities:
             self.torch_profiler = torch.profiler.profile(
                 activities=torchprof_activities,
                 with_stack=with_stack if with_stack is not None else True,
                 record_shapes=record_shapes if record_shapes is not None else False,
             )
             self.torch_profiler.start()
+            self.profile_in_progress = True

         if "MEM" in activities:
             torch.cuda.memory._record_memory_history(max_entries=100000)
+            self.profile_in_progress = True

         if "CUDA_PROFILER" in activities:
             torch.cuda.cudart().cudaProfilerStart()

-        if num_steps:
-            self.profiler_target_forward_ct = self.forward_ct + num_steps
-            # The caller will be notified when reaching profiler_target_forward_ct
-        else:
-            self.profiler_target_forward_ct = None
-            return ProfileReqOutput(success=True, message="Succeeded")
+        return ProfileReqOutput(success=True, message="Succeeded")

-    def stop_profile(self) -> None:
-        if self.profiler_activities is None:
-            return
+    def stop_profile(
+        self, stage: Optional[ForwardMode] = None
+    ) -> ProfileReqOutput | None:
+        if not self.profile_in_progress:
+            return ProfileReqOutput(
+                success=False,
+                message="Profiling is not in progress. Call /start_profile first.",
+            )

-        logger.info("Stop profiling...")
+        if not Path(self.torch_profiler_output_dir).exists():
+            Path(self.torch_profiler_output_dir).mkdir(parents=True, exist_ok=True)
+
+        stage_suffix = f"-{stage.__str__()}" if stage else ""
+        logger.info("Stop profiling" + stage_suffix + "...")
         if self.torch_profiler is not None:
             self.torch_profiler.stop()
             self.torch_profiler.export_chrome_trace(
                 os.path.join(
                     self.torch_profiler_output_dir,
-                    self.profiler_id + f"-TP-{self.tp_rank}" + ".trace.json.gz",
+                    self.profile_id
+                    + f"-TP-{self.tp_rank}"
+                    + stage_suffix
+                    + ".trace.json.gz",
                 )
             )
+            torch.distributed.barrier(self.tp_cpu_group)
+
+        if self.rpd_profiler is not None:
+            self.rpd_profiler.rangePop()
+            self.rpd_profiler.stop()
+            self.rpd_profiler.flush()

-        if "MEM" in self.profiler_activities:
+            torch.distributed.barrier(self.tp_cpu_group)
+            if self.tp_rank == 0:
+                from sglang.srt.utils import rpd_to_chrome_trace
+
+                rpd_to_chrome_trace("trace.rpd", self.rpd_profile_path)
+            self.rpd_profiler = None
+            self.rpd_profiler_path = None
+
+        if self.profiler_activities is not None and "MEM" in self.profiler_activities:
             memory_profile_path = os.path.join(
                 self.torch_profiler_output_dir,
-                str(time.time()) + f"-TP-{self.tp_rank}-memory" + ".pickle",
+                str(time.time())
+                + f"-TP-{self.tp_rank}-memory"
+                + stage_suffix
+                + ".pickle",
             )
             torch.cuda.memory._dump_snapshot(memory_profile_path)
             torch.cuda.memory._record_memory_history(enabled=None)
@@ -2143,21 +2342,46 @@ class Scheduler(
                 self.torch_profiler_output_dir,
             )
         self.torch_profiler = None
-        self.profiler_activities = None
-
-        if self.profiler_target_forward_ct:
-            self.profiler_target_forward_ct = None
-            self.send_to_tokenizer.send_pyobj(
-                ProfileReqOutput(success=True, message="Succeeded.")
-            )
+        self.profile_in_progress = False
+
+        return ProfileReqOutput(success=True, message="Succeeded.")
+
+    def _profile_batch_predicate(self, batch):
+        if self.profile_by_stage:
+            if batch.forward_mode.is_prefill():
+                if self.profiler_prefill_ct == 0:
+                    self.start_profile(batch.forward_mode)
+                self.profiler_prefill_ct += 1
+                if self.profiler_prefill_ct > self.profiler_target_prefill_ct:
+                    if self.profile_in_progress:
+                        self.stop_profile(stage=ForwardMode.EXTEND)
+            elif batch.forward_mode.is_decode():
+                if self.profiler_decode_ct == 0:
+                    if self.profile_in_progress:
+                        # force trace flush
+                        self.stop_profile(ForwardMode.EXTEND)
+                    self.start_profile(batch.forward_mode)
+                self.profiler_decode_ct += 1
+                if self.profiler_decode_ct > self.profiler_target_decode_ct:
+                    if self.profile_in_progress:
+                        self.stop_profile(stage=ForwardMode.DECODE)
+            else:
+                raise RuntimeError("unsupported profile stage")
+        else:
+            # Check profiler
+            if (
+                self.profiler_target_forward_ct
+                and self.profiler_target_forward_ct <= self.forward_ct
+            ):
+                self.stop_profile()

     def expert_distribution_handle(self, recv_req: ExpertDistributionReq):
         if recv_req == ExpertDistributionReq.START_RECORD:
-            self.expert_distribution_recorder.start_record()
+            get_global_expert_distribution_recorder().start_record()
         elif recv_req == ExpertDistributionReq.STOP_RECORD:
-            self.expert_distribution_recorder.stop_record()
+            get_global_expert_distribution_recorder().stop_record()
        elif recv_req == ExpertDistributionReq.DUMP_RECORD:
-            self.expert_distribution_recorder.dump_record()
+            get_global_expert_distribution_recorder().dump_record()
         else:
             raise ValueError("Unrecognized ExpertDistributionReq value")
         return ExpertDistributionReqOutput()
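`_profile_batch_predicate` drives the new stage-scoped profiling: start on a stage's first batch, then stop once `num_steps` batches have been captured. A condensed model of that window, with the real start/stop calls replaced by a flag:

```python
class StageProfiler:
    def __init__(self, num_steps: int):
        self.target = num_steps
        self.seen = 0
        self.active = False

    def on_batch(self) -> None:
        if self.seen == 0:
            self.active = True       # start_profile(stage)
        self.seen += 1
        if self.seen > self.target and self.active:
            self.active = False      # stop_profile(stage)

p = StageProfiler(num_steps=2)
for _ in range(4):
    p.on_batch()
print(p.seen, p.active)  # 4 False
```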
@@ -2195,6 +2419,13 @@ class Scheduler(
             prefix += f" PP{self.pp_rank}"
         return prefix

+    def _publish_kv_events(self):
+        if self.enable_kv_cache_events:
+            events = self.tree_cache.take_events()
+            if events:
+                batch = KVEventBatch(ts=time.time(), events=events)
+                self.kv_event_publisher.publish(batch)
+

 def is_health_check_generate_req(recv_req):
     return getattr(recv_req, "rid", "").startswith("HEALTH_CHECK")
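`_publish_kv_events` drains radix-cache events and ships them with each metrics flush. A toy stand-in for the flow; `KVEventBatch` and the publisher are sglang internals, so the shapes here are assumed:

```python
import time
from dataclasses import dataclass, field
from typing import Any, List

# The cache accumulates block-stored/removed events; the scheduler
# drains and publishes them in one timestamped batch.
@dataclass
class KVEventBatch:
    ts: float
    events: List[Any] = field(default_factory=list)

pending = ["block_stored", "block_removed"]
if pending:
    print(KVEventBatch(ts=time.time(), events=pending))
```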
@@ -2250,6 +2481,10 @@ def run_scheduler_process(
     if get_bool_env_var("SGLANG_SET_CPU_AFFINITY"):
         set_gpu_proc_affinity(server_args.tp_size, server_args.nnodes, gpu_id)

+    embedding_cache_size = 100
+    if "SGLANG_VLM_CACHE_SIZE_MB" in os.environ:
+        embedding_cache_size = int(os.environ["SGLANG_VLM_CACHE_SIZE_MB"])
+    init_embedding_cache(embedding_cache_size * 1024 * 1024)
     # Create a scheduler and run the event loop
     try:
         scheduler = Scheduler(server_args, port_args, gpu_id, tp_rank, pp_rank, dp_rank)
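The scheduler process now sizes the multimodal embedding cache from an environment variable, defaulting to 100 MB. A usage sketch:

```python
import os

# Resize the cache to 256 MB before the scheduler process starts.
os.environ["SGLANG_VLM_CACHE_SIZE_MB"] = "256"

embedding_cache_size = 100
if "SGLANG_VLM_CACHE_SIZE_MB" in os.environ:
    embedding_cache_size = int(os.environ["SGLANG_VLM_CACHE_SIZE_MB"])
print(embedding_cache_size * 1024 * 1024)  # 268435456 bytes
```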