sglang 0.4.6.post5__py3-none-any.whl → 0.4.7.post1__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- sglang/__init__.py +2 -0
- sglang/api.py +7 -0
- sglang/bench_offline_throughput.py +10 -4
- sglang/bench_one_batch_server.py +67 -11
- sglang/bench_serving.py +86 -75
- sglang/lang/backend/runtime_endpoint.py +24 -1
- sglang/lang/interpreter.py +40 -1
- sglang/lang/ir.py +27 -0
- sglang/math_utils.py +8 -0
- sglang/profiler.py +167 -0
- sglang/srt/_custom_ops.py +34 -0
- sglang/srt/configs/internvl.py +8 -12
- sglang/srt/configs/model_config.py +33 -1
- sglang/srt/constrained/base_grammar_backend.py +5 -2
- sglang/srt/constrained/llguidance_backend.py +9 -8
- sglang/srt/constrained/outlines_backend.py +5 -4
- sglang/srt/constrained/xgrammar_backend.py +18 -18
- sglang/srt/conversation.py +52 -8
- sglang/srt/custom_op.py +38 -3
- sglang/srt/debug_utils.py +74 -0
- sglang/srt/disaggregation/base/__init__.py +1 -1
- sglang/srt/disaggregation/base/conn.py +25 -11
- sglang/srt/disaggregation/common/__init__.py +5 -0
- sglang/srt/disaggregation/common/conn.py +407 -0
- sglang/srt/disaggregation/common/utils.py +42 -0
- sglang/srt/disaggregation/decode.py +261 -52
- sglang/srt/disaggregation/fake/__init__.py +1 -1
- sglang/srt/disaggregation/fake/conn.py +16 -9
- sglang/srt/disaggregation/kv_events.py +60 -5
- sglang/srt/disaggregation/launch_lb.py +140 -0
- sglang/srt/disaggregation/mini_lb.py +29 -48
- sglang/srt/disaggregation/mooncake/__init__.py +1 -1
- sglang/srt/disaggregation/mooncake/conn.py +446 -149
- sglang/srt/disaggregation/mooncake/transfer_engine.py +32 -16
- sglang/srt/disaggregation/nixl/__init__.py +6 -1
- sglang/srt/disaggregation/nixl/conn.py +134 -437
- sglang/srt/disaggregation/prefill.py +130 -43
- sglang/srt/disaggregation/utils.py +127 -86
- sglang/srt/distributed/device_communicators/pymscclpp.py +315 -0
- sglang/srt/distributed/parallel_state.py +52 -5
- sglang/srt/entrypoints/EngineBase.py +6 -0
- sglang/srt/entrypoints/engine.py +116 -5
- sglang/srt/entrypoints/http_server.py +28 -4
- sglang/srt/eplb_simulator/__init__.py +1 -0
- sglang/srt/eplb_simulator/reader.py +51 -0
- sglang/srt/function_call/base_format_detector.py +138 -86
- sglang/srt/function_call/deepseekv3_detector.py +54 -6
- sglang/srt/function_call/ebnf_composer.py +33 -19
- sglang/srt/function_call/function_call_parser.py +27 -0
- sglang/srt/function_call/llama32_detector.py +33 -14
- sglang/srt/function_call/mistral_detector.py +73 -26
- sglang/srt/function_call/pythonic_detector.py +86 -20
- sglang/srt/function_call/qwen25_detector.py +64 -10
- sglang/srt/function_call/utils.py +17 -0
- sglang/srt/hf_transformers_utils.py +4 -0
- sglang/srt/layers/activation.py +19 -0
- sglang/srt/layers/attention/aiter_backend.py +503 -125
- sglang/srt/layers/attention/base_attn_backend.py +4 -0
- sglang/srt/layers/attention/cutlass_mla_backend.py +40 -34
- sglang/srt/layers/attention/flashattention_backend.py +137 -63
- sglang/srt/layers/attention/flashinfer_backend.py +46 -3
- sglang/srt/layers/attention/flashinfer_mla_backend.py +59 -25
- sglang/srt/layers/attention/flashmla_backend.py +2 -10
- sglang/srt/layers/attention/intel_amx_backend.py +128 -0
- sglang/srt/layers/attention/tbo_backend.py +232 -0
- sglang/srt/layers/attention/torch_native_backend.py +3 -0
- sglang/srt/layers/attention/triton_backend.py +304 -65
- sglang/srt/layers/attention/triton_ops/decode_attention.py +2 -7
- sglang/srt/layers/attention/triton_ops/extend_attention.py +12 -4
- sglang/srt/layers/attention/vision.py +51 -24
- sglang/srt/layers/communicator.py +281 -197
- sglang/srt/layers/dp_attention.py +6 -5
- sglang/srt/layers/layernorm.py +30 -19
- sglang/srt/layers/linear.py +0 -4
- sglang/srt/layers/logits_processor.py +0 -12
- sglang/srt/layers/moe/cutlass_moe.py +170 -7
- sglang/srt/layers/moe/cutlass_moe_params.py +169 -0
- sglang/srt/layers/moe/ep_moe/kernels.py +33 -11
- sglang/srt/layers/moe/ep_moe/layer.py +136 -72
- sglang/srt/layers/moe/ep_moe/token_dispatcher.py +24 -45
- sglang/srt/layers/moe/fused_moe_native.py +4 -0
- sglang/srt/layers/moe/fused_moe_triton/configs/triton_3_2_0/E=257,N=128,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8,block_shape=[128, 128].json +146 -0
- sglang/srt/layers/moe/fused_moe_triton/configs/triton_3_2_0/E=257,N=128,device_name=NVIDIA_H20,dtype=fp8_w8a8,block_shape=[128, 128].json +146 -0
- sglang/srt/layers/moe/fused_moe_triton/configs/triton_3_2_0/E=257,N=256,device_name=NVIDIA_B200,dtype=fp8_w8a8,block_shape=[128, 128].json +146 -0
- sglang/srt/layers/moe/fused_moe_triton/configs/triton_3_2_0/E=257,N=256,device_name=NVIDIA_H20,dtype=fp8_w8a8,block_shape=[128, 128].json +146 -0
- sglang/srt/layers/moe/fused_moe_triton/configs/triton_3_2_0/E=257,N=256,device_name=NVIDIA_H200,dtype=fp8_w8a8,block_shape=[128, 128].json +146 -0
- sglang/srt/layers/moe/fused_moe_triton/configs/triton_3_2_0/E=8,N=7168,device_name=NVIDIA_H100_80GB_HBM3.json +146 -0
- sglang/srt/layers/moe/fused_moe_triton/configs/triton_3_3_1/E=257,N=256,device_name=NVIDIA_B200,dtype=fp8_w8a8,block_shape=[128, 128].json +146 -0
- sglang/srt/layers/moe/fused_moe_triton/configs/triton_3_3_1/E=8,N=7168,device_name=NVIDIA_H100_80GB_HBM3.json +146 -0
- sglang/srt/layers/moe/fused_moe_triton/fused_moe.py +221 -29
- sglang/srt/layers/moe/fused_moe_triton/layer.py +34 -4
- sglang/srt/layers/moe/topk.py +60 -26
- sglang/srt/layers/multimodal.py +3 -3
- sglang/srt/layers/pooler.py +56 -0
- sglang/srt/layers/quantization/__init__.py +3 -2
- sglang/srt/layers/quantization/blockwise_int8.py +3 -0
- sglang/srt/layers/quantization/compressed_tensors/compressed_tensors_moe.py +5 -0
- sglang/srt/layers/quantization/deep_gemm_wrapper/__init__.py +1 -0
- sglang/srt/layers/quantization/{deep_gemm.py → deep_gemm_wrapper/compile_utils.py} +69 -127
- sglang/srt/layers/quantization/deep_gemm_wrapper/configurer.py +32 -0
- sglang/srt/layers/quantization/deep_gemm_wrapper/entrypoint.py +110 -0
- sglang/srt/layers/quantization/fp8.py +28 -23
- sglang/srt/layers/quantization/fp8_kernel.py +156 -75
- sglang/srt/layers/quantization/fp8_utils.py +250 -69
- sglang/srt/layers/quantization/modelopt_quant.py +334 -7
- sglang/srt/layers/quantization/moe_wna16.py +3 -0
- sglang/srt/layers/quantization/w8a8_fp8.py +3 -0
- sglang/srt/layers/quantization/w8a8_int8.py +3 -0
- sglang/srt/layers/radix_attention.py +2 -3
- sglang/srt/layers/rotary_embedding.py +6 -12
- sglang/srt/layers/sampler.py +80 -79
- sglang/srt/layers/utils.py +6 -0
- sglang/srt/lora/layers.py +12 -15
- sglang/srt/lora/lora.py +49 -5
- sglang/srt/lora/lora_manager.py +98 -39
- sglang/srt/lora/mem_pool.py +28 -21
- sglang/srt/lora/utils.py +17 -13
- sglang/srt/managers/cache_controller.py +2 -1
- sglang/srt/managers/data_parallel_controller.py +13 -5
- sglang/srt/managers/eplb_algorithms/__init__.py +63 -0
- sglang/srt/managers/eplb_algorithms/deepseek.py +223 -0
- sglang/srt/managers/{deepseek_eplb.py → eplb_algorithms/deepseek_vec.py} +5 -7
- sglang/srt/managers/eplb_manager.py +55 -14
- sglang/srt/managers/expert_distribution.py +220 -46
- sglang/srt/managers/expert_location.py +110 -56
- sglang/srt/managers/expert_location_dispatch.py +23 -6
- sglang/srt/managers/io_struct.py +43 -8
- sglang/srt/managers/mm_utils.py +88 -38
- sglang/srt/managers/multimodal_processors/base_processor.py +190 -18
- sglang/srt/managers/multimodal_processors/gemma3.py +4 -31
- sglang/srt/managers/multimodal_processors/internvl.py +4 -0
- sglang/srt/managers/multimodal_processors/kimi_vl.py +15 -34
- sglang/srt/managers/multimodal_processors/minicpm.py +2 -1
- sglang/srt/managers/multimodal_processors/phi4mm.py +87 -0
- sglang/srt/managers/multimodal_processors/qwen_vl.py +22 -64
- sglang/srt/managers/multimodal_processors/vila.py +85 -0
- sglang/srt/managers/schedule_batch.py +173 -38
- sglang/srt/managers/scheduler.py +376 -127
- sglang/srt/managers/tokenizer_manager.py +163 -19
- sglang/srt/managers/utils.py +0 -4
- sglang/srt/mem_cache/chunk_cache.py +1 -0
- sglang/srt/mem_cache/hiradix_cache.py +4 -2
- sglang/srt/mem_cache/memory_pool.py +111 -407
- sglang/srt/mem_cache/memory_pool_host.py +380 -0
- sglang/srt/mem_cache/radix_cache.py +36 -12
- sglang/srt/metrics/collector.py +9 -0
- sglang/srt/model_executor/cuda_graph_runner.py +191 -113
- sglang/srt/model_executor/expert_location_updater.py +157 -22
- sglang/srt/model_executor/forward_batch_info.py +52 -22
- sglang/srt/model_executor/model_runner.py +102 -62
- sglang/srt/model_loader/loader.py +8 -1
- sglang/srt/model_loader/utils.py +67 -1
- sglang/srt/models/bert.py +113 -13
- sglang/srt/models/deepseek_nextn.py +1 -1
- sglang/srt/models/deepseek_v2.py +623 -290
- sglang/srt/models/gemma3_causal.py +7 -0
- sglang/srt/models/gemma3_mm.py +19 -14
- sglang/srt/models/idefics2.py +342 -0
- sglang/srt/models/internvl.py +46 -102
- sglang/srt/models/kimi_vl.py +4 -4
- sglang/srt/models/llama.py +1 -1
- sglang/srt/models/minicpmo.py +2 -5
- sglang/srt/models/minicpmv.py +3 -295
- sglang/srt/models/phi4mm.py +512 -0
- sglang/srt/models/qwen2.py +38 -9
- sglang/srt/models/qwen2_5_vl.py +3 -9
- sglang/srt/models/qwen2_eagle.py +4 -1
- sglang/srt/models/qwen2_moe.py +58 -191
- sglang/srt/models/qwen2_vl.py +3 -9
- sglang/srt/models/qwen3.py +41 -10
- sglang/srt/models/qwen3_moe.py +230 -191
- sglang/srt/models/registry.py +9 -1
- sglang/srt/models/roberta.py +117 -9
- sglang/srt/models/transformers.py +291 -0
- sglang/srt/models/vila.py +305 -0
- sglang/srt/openai_api/adapter.py +248 -28
- sglang/srt/openai_api/protocol.py +68 -3
- sglang/srt/openai_api/utils.py +172 -0
- sglang/srt/operations.py +37 -2
- sglang/srt/operations_strategy.py +200 -24
- sglang/srt/sampling/sampling_batch_info.py +37 -1
- sglang/srt/sampling/sampling_params.py +4 -1
- sglang/srt/server_args.py +381 -209
- sglang/srt/speculative/build_eagle_tree.py +9 -9
- sglang/srt/speculative/eagle_draft_cuda_graph_runner.py +12 -14
- sglang/srt/speculative/eagle_draft_extend_cuda_graph_runner.py +256 -0
- sglang/srt/speculative/eagle_utils.py +440 -200
- sglang/srt/speculative/eagle_worker.py +234 -63
- sglang/srt/two_batch_overlap.py +637 -0
- sglang/srt/utils.py +187 -7
- sglang/test/attention/test_prefix_chunk_info.py +2 -0
- sglang/test/runners.py +54 -10
- sglang/test/send_one.py +4 -0
- sglang/test/test_block_fp8.py +1 -0
- sglang/test/test_block_fp8_deep_gemm_blackwell.py +252 -0
- sglang/test/test_block_fp8_ep.py +1 -0
- sglang/test/test_cutlass_moe.py +3 -3
- sglang/test/test_fp4_moe.py +248 -0
- sglang/test/test_utils.py +82 -7
- sglang/utils.py +9 -0
- sglang/version.py +1 -1
- {sglang-0.4.6.post5.dist-info → sglang-0.4.7.post1.dist-info}/METADATA +17 -14
- {sglang-0.4.6.post5.dist-info → sglang-0.4.7.post1.dist-info}/RECORD +359 -321
- {sglang-0.4.6.post5.dist-info → sglang-0.4.7.post1.dist-info}/WHEEL +1 -1
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=1,N=14336,device_name=NVIDIA_A100-SXM4-80GB,dtype=int8_w8a16.json → triton_3_1_0/E=1,N=14336,device_name=NVIDIA_A100-SXM4-80GB,dtype=int8_w8a16.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=1,N=14336,device_name=NVIDIA_A100-SXM4-80GB.json → triton_3_1_0/E=1,N=14336,device_name=NVIDIA_A100-SXM4-80GB.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=1,N=1792,device_name=NVIDIA_A100-SXM4-80GB,dtype=int8_w8a16.json → triton_3_1_0/E=1,N=1792,device_name=NVIDIA_A100-SXM4-80GB,dtype=int8_w8a16.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=1,N=1792,device_name=NVIDIA_A100-SXM4-80GB.json → triton_3_1_0/E=1,N=1792,device_name=NVIDIA_A100-SXM4-80GB.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=1,N=3072,device_name=NVIDIA_A100-SXM4-80GB,dtype=int8_w8a16.json → triton_3_1_0/E=1,N=3072,device_name=NVIDIA_A100-SXM4-80GB,dtype=int8_w8a16.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=1,N=3072,device_name=NVIDIA_H100_80GB_HBM3,dtype=int8_w8a16.json → triton_3_1_0/E=1,N=3072,device_name=NVIDIA_H100_80GB_HBM3,dtype=int8_w8a16.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=1,N=3072,device_name=NVIDIA_H100_80GB_HBM3.json → triton_3_1_0/E=1,N=3072,device_name=NVIDIA_H100_80GB_HBM3.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=1,N=3584,device_name=NVIDIA_A100-SXM4-80GB,dtype=int8_w8a16.json → triton_3_1_0/E=1,N=3584,device_name=NVIDIA_A100-SXM4-80GB,dtype=int8_w8a16.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=1,N=3584,device_name=NVIDIA_A100-SXM4-80GB.json → triton_3_1_0/E=1,N=3584,device_name=NVIDIA_A100-SXM4-80GB.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=1,N=7168,device_name=NVIDIA_A100-SXM4-80GB,dtype=int8_w8a16.json → triton_3_1_0/E=1,N=7168,device_name=NVIDIA_A100-SXM4-80GB,dtype=int8_w8a16.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=1,N=7168,device_name=NVIDIA_A100-SXM4-80GB.json → triton_3_1_0/E=1,N=7168,device_name=NVIDIA_A100-SXM4-80GB.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=144,N=512,device_name=NVIDIA_H100_80GB_HBM3.json → triton_3_1_0/E=144,N=512,device_name=NVIDIA_H100_80GB_HBM3.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=16,N=1024,device_name=NVIDIA_H100_80GB_HBM3.json → triton_3_1_0/E=16,N=1024,device_name=NVIDIA_H100_80GB_HBM3.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=16,N=1024,device_name=NVIDIA_H200.json → triton_3_1_0/E=16,N=1024,device_name=NVIDIA_H200.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=16,N=1344,device_name=NVIDIA_A100-SXM4-40GB.json → triton_3_1_0/E=16,N=1344,device_name=NVIDIA_A100-SXM4-40GB.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=16,N=1344,device_name=NVIDIA_A100-SXM4-80GB.json → triton_3_1_0/E=16,N=1344,device_name=NVIDIA_A100-SXM4-80GB.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=16,N=1344,device_name=NVIDIA_H100_80GB_HBM3.json → triton_3_1_0/E=16,N=1344,device_name=NVIDIA_H100_80GB_HBM3.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=16,N=14336,device_name=NVIDIA_A100-SXM4-80GB,dtype=int8_w8a16.json → triton_3_1_0/E=16,N=14336,device_name=NVIDIA_A100-SXM4-80GB,dtype=int8_w8a16.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=16,N=14336,device_name=NVIDIA_A100-SXM4-80GB.json → triton_3_1_0/E=16,N=14336,device_name=NVIDIA_A100-SXM4-80GB.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=16,N=1792,device_name=NVIDIA_A100-SXM4-80GB,dtype=int8_w8a16.json → triton_3_1_0/E=16,N=1792,device_name=NVIDIA_A100-SXM4-80GB,dtype=int8_w8a16.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=16,N=1792,device_name=NVIDIA_A100-SXM4-80GB.json → triton_3_1_0/E=16,N=1792,device_name=NVIDIA_A100-SXM4-80GB.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=16,N=2048,device_name=NVIDIA_H100_80GB_HBM3.json → triton_3_1_0/E=16,N=2048,device_name=NVIDIA_H100_80GB_HBM3.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=16,N=2688,device_name=NVIDIA_A100-SXM4-80GB.json → triton_3_1_0/E=16,N=2688,device_name=NVIDIA_A100-SXM4-80GB.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=16,N=2688,device_name=NVIDIA_H100_80GB_HBM3.json → triton_3_1_0/E=16,N=2688,device_name=NVIDIA_H100_80GB_HBM3.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=16,N=3072,device_name=NVIDIA_A100-SXM4-80GB,dtype=int8_w8a16.json → triton_3_1_0/E=16,N=3072,device_name=NVIDIA_A100-SXM4-80GB,dtype=int8_w8a16.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=16,N=3072,device_name=NVIDIA_H100_80GB_HBM3,dtype=int8_w8a16.json → triton_3_1_0/E=16,N=3072,device_name=NVIDIA_H100_80GB_HBM3,dtype=int8_w8a16.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=16,N=3200,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8.json → triton_3_1_0/E=16,N=3200,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=16,N=3584,device_name=NVIDIA_A100-SXM4-80GB,dtype=int8_w8a16.json → triton_3_1_0/E=16,N=3584,device_name=NVIDIA_A100-SXM4-80GB,dtype=int8_w8a16.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=16,N=3584,device_name=NVIDIA_A100-SXM4-80GB.json → triton_3_1_0/E=16,N=3584,device_name=NVIDIA_A100-SXM4-80GB.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=16,N=6400,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8.json → triton_3_1_0/E=16,N=6400,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=16,N=7168,device_name=NVIDIA_A100-SXM4-80GB,dtype=int8_w8a16.json → triton_3_1_0/E=16,N=7168,device_name=NVIDIA_A100-SXM4-80GB,dtype=int8_w8a16.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=16,N=7168,device_name=NVIDIA_A100-SXM4-80GB.json → triton_3_1_0/E=16,N=7168,device_name=NVIDIA_A100-SXM4-80GB.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=16,N=7168,device_name=NVIDIA_H100_80GB_HBM3,dtype=int8_w8a16.json → triton_3_1_0/E=16,N=7168,device_name=NVIDIA_H100_80GB_HBM3,dtype=int8_w8a16.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=16,N=800,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8.json → triton_3_1_0/E=16,N=800,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=160,N=192,device_name=NVIDIA_A800-SXM4-80GB.json → triton_3_1_0/E=160,N=192,device_name=NVIDIA_A800-SXM4-80GB.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=20,N=2048,device_name=NVIDIA_H100_80GB_HBM3.json → triton_3_1_0/E=20,N=2048,device_name=NVIDIA_H100_80GB_HBM3.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=24,N=1024,device_name=NVIDIA_H100_80GB_HBM3.json → triton_3_1_0/E=24,N=1024,device_name=NVIDIA_H100_80GB_HBM3.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=256,N=128,device_name=NVIDIA_A100-SXM4-80GB,dtype=int8_w8a8,block_shape=[128, 128].json → triton_3_1_0/E=256,N=128,device_name=NVIDIA_A100-SXM4-80GB,dtype=int8_w8a8,block_shape=[128, 128].json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=256,N=128,device_name=NVIDIA_A100-SXM4-80GB,dtype=int8_w8a8.json → triton_3_1_0/E=256,N=128,device_name=NVIDIA_A100-SXM4-80GB,dtype=int8_w8a8.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=256,N=128,device_name=NVIDIA_A800-SXM4-80GB,dtype=int8_w8a8,block_shape=[128, 128].json → triton_3_1_0/E=256,N=128,device_name=NVIDIA_A800-SXM4-80GB,dtype=int8_w8a8,block_shape=[128, 128].json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=256,N=128,device_name=NVIDIA_A800-SXM4-80GB,dtype=int8_w8a8.json → triton_3_1_0/E=256,N=128,device_name=NVIDIA_A800-SXM4-80GB,dtype=int8_w8a8.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=256,N=128,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8,block_shape=[128, 128].json → triton_3_1_0/E=256,N=128,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8,block_shape=[128, 128].json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=256,N=128,device_name=NVIDIA_H20,block_shape=[128, 128].json → triton_3_1_0/E=256,N=128,device_name=NVIDIA_H20,block_shape=[128, 128].json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=256,N=128,device_name=NVIDIA_H20,dtype=fp8_w8a8,block_shape=[128, 128].json → triton_3_1_0/E=256,N=128,device_name=NVIDIA_H20,dtype=fp8_w8a8,block_shape=[128, 128].json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=256,N=128,device_name=NVIDIA_L20Y,dtype=fp8_w8a8,block_shape=[128, 128].json → triton_3_1_0/E=256,N=128,device_name=NVIDIA_L20Y,dtype=fp8_w8a8,block_shape=[128, 128].json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=256,N=256,device_name=AMD_Instinct_MI300X,dtype=fp8_w8a8,block_shape=[128, 128].json → triton_3_1_0/E=256,N=256,device_name=AMD_Instinct_MI300X,dtype=fp8_w8a8,block_shape=[128, 128].json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=256,N=256,device_name=AMD_Instinct_MI325X,dtype=fp8_w8a8,block_shape=[128, 128].json → triton_3_1_0/E=256,N=256,device_name=AMD_Instinct_MI325X,dtype=fp8_w8a8,block_shape=[128, 128].json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=256,N=256,device_name=AMD_Radeon_Graphics,dtype=fp8_w8a8,block_shape=[128, 128].json → triton_3_1_0/E=256,N=256,device_name=AMD_Radeon_Graphics,dtype=fp8_w8a8,block_shape=[128, 128].json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=256,N=256,device_name=NVIDIA_B200,dtype=fp8_w8a8,block_shape=[128, 128].json → triton_3_1_0/E=256,N=256,device_name=NVIDIA_B200,dtype=fp8_w8a8,block_shape=[128, 128].json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=256,N=256,device_name=NVIDIA_H20,dtype=int8_w8a8,block_shape=[128, 128].json → triton_3_1_0/E=256,N=256,device_name=NVIDIA_H20,dtype=int8_w8a8,block_shape=[128, 128].json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=256,N=256,device_name=NVIDIA_H200,dtype=fp8_w8a8,block_shape=[128, 128].json → triton_3_1_0/E=256,N=256,device_name=NVIDIA_H200,dtype=fp8_w8a8,block_shape=[128, 128].json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=256,N=64,device_name=NVIDIA_A800-SXM4-80GB.json → triton_3_1_0/E=256,N=64,device_name=NVIDIA_A800-SXM4-80GB.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=256,N=64,device_name=NVIDIA_L20,dtype=int8_w8a8.json → triton_3_1_0/E=256,N=64,device_name=NVIDIA_L20,dtype=int8_w8a8.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=256,N=64,device_name=NVIDIA_L40S,dtype=int8_w8a8.json → triton_3_1_0/E=256,N=64,device_name=NVIDIA_L40S,dtype=int8_w8a8.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=64,N=1024,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8,block_shape=[128, 128].json → triton_3_1_0/E=64,N=1024,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8,block_shape=[128, 128].json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=64,N=1280,device_name=NVIDIA_A100-SXM4-80GB.json → triton_3_1_0/E=64,N=1280,device_name=NVIDIA_A100-SXM4-80GB.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=64,N=1280,device_name=NVIDIA_A800-SXM4-80GB.json → triton_3_1_0/E=64,N=1280,device_name=NVIDIA_A800-SXM4-80GB.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=64,N=1280,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8.json → triton_3_1_0/E=64,N=1280,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=64,N=1280,device_name=NVIDIA_H100_80GB_HBM3.json → triton_3_1_0/E=64,N=1280,device_name=NVIDIA_H100_80GB_HBM3.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=64,N=1280,device_name=NVIDIA_H200,dtype=fp8_w8a8.json → triton_3_1_0/E=64,N=1280,device_name=NVIDIA_H200,dtype=fp8_w8a8.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=64,N=1280,device_name=NVIDIA_H200.json → triton_3_1_0/E=64,N=1280,device_name=NVIDIA_H200.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=64,N=2560,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8.json → triton_3_1_0/E=64,N=2560,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=64,N=2560,device_name=NVIDIA_H200,dtype=fp8_w8a8.json → triton_3_1_0/E=64,N=2560,device_name=NVIDIA_H200,dtype=fp8_w8a8.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=64,N=2560,device_name=NVIDIA_H200.json → triton_3_1_0/E=64,N=2560,device_name=NVIDIA_H200.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=64,N=320,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8.json → triton_3_1_0/E=64,N=320,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=64,N=320,device_name=NVIDIA_H100_80GB_HBM3.json → triton_3_1_0/E=64,N=320,device_name=NVIDIA_H100_80GB_HBM3.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=64,N=320,device_name=NVIDIA_H200,dtype=fp8_w8a8.json → triton_3_1_0/E=64,N=320,device_name=NVIDIA_H200,dtype=fp8_w8a8.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=64,N=320,device_name=NVIDIA_H200.json → triton_3_1_0/E=64,N=320,device_name=NVIDIA_H200.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=64,N=512,device_name=NVIDIA_H200,dtype=fp8_w8a8,block_shape=[128, 128].json → triton_3_1_0/E=64,N=512,device_name=NVIDIA_H200,dtype=fp8_w8a8,block_shape=[128, 128].json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=64,N=640,device_name=NVIDIA_A100-SXM4-80GB.json → triton_3_1_0/E=64,N=640,device_name=NVIDIA_A100-SXM4-80GB.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=64,N=640,device_name=NVIDIA_A800-SXM4-80GB.json → triton_3_1_0/E=64,N=640,device_name=NVIDIA_A800-SXM4-80GB.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=64,N=640,device_name=NVIDIA_GeForce_RTX_4090,dtype=fp8_w8a8.json → triton_3_1_0/E=64,N=640,device_name=NVIDIA_GeForce_RTX_4090,dtype=fp8_w8a8.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=64,N=640,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8.json → triton_3_1_0/E=64,N=640,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=64,N=640,device_name=NVIDIA_H100_80GB_HBM3.json → triton_3_1_0/E=64,N=640,device_name=NVIDIA_H100_80GB_HBM3.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=64,N=640,device_name=NVIDIA_H200,dtype=fp8_w8a8.json → triton_3_1_0/E=64,N=640,device_name=NVIDIA_H200,dtype=fp8_w8a8.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=64,N=640,device_name=NVIDIA_H200.json → triton_3_1_0/E=64,N=640,device_name=NVIDIA_H200.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=8,N=14336,device_name=AMD_Instinct_MI300X.json → triton_3_1_0/E=8,N=14336,device_name=AMD_Instinct_MI300X.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=8,N=14336,device_name=AMD_Instinct_MI325X.json → triton_3_1_0/E=8,N=14336,device_name=AMD_Instinct_MI325X.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=8,N=14336,device_name=AMD_Radeon_Graphics.json → triton_3_1_0/E=8,N=14336,device_name=AMD_Radeon_Graphics.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=8,N=14336,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8.json → triton_3_1_0/E=8,N=14336,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=8,N=14336,device_name=NVIDIA_H200,dtype=fp8_w8a8.json → triton_3_1_0/E=8,N=14336,device_name=NVIDIA_H200,dtype=fp8_w8a8.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=8,N=14336,device_name=NVIDIA_H200.json → triton_3_1_0/E=8,N=14336,device_name=NVIDIA_H200.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=8,N=1792,device_name=AMD_Instinct_MI300X.json → triton_3_1_0/E=8,N=1792,device_name=AMD_Instinct_MI300X.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=8,N=1792,device_name=AMD_Instinct_MI325X.json → triton_3_1_0/E=8,N=1792,device_name=AMD_Instinct_MI325X.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=8,N=1792,device_name=AMD_Radeon_Graphics.json → triton_3_1_0/E=8,N=1792,device_name=AMD_Radeon_Graphics.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=8,N=1792,device_name=NVIDIA_A100-SXM4-40GB.json → triton_3_1_0/E=8,N=1792,device_name=NVIDIA_A100-SXM4-40GB.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=8,N=1792,device_name=NVIDIA_A100-SXM4-80GB.json → triton_3_1_0/E=8,N=1792,device_name=NVIDIA_A100-SXM4-80GB.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=8,N=1792,device_name=NVIDIA_H100_80GB_HBM3.json → triton_3_1_0/E=8,N=1792,device_name=NVIDIA_H100_80GB_HBM3.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=8,N=1792,device_name=NVIDIA_H200,dtype=fp8_w8a8.json → triton_3_1_0/E=8,N=1792,device_name=NVIDIA_H200,dtype=fp8_w8a8.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=8,N=1792,device_name=NVIDIA_H200.json → triton_3_1_0/E=8,N=1792,device_name=NVIDIA_H200.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=8,N=2048,device_name=NVIDIA_A100-SXM4-80GB.json → triton_3_1_0/E=8,N=2048,device_name=NVIDIA_A100-SXM4-80GB.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=8,N=2048,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8.json → triton_3_1_0/E=8,N=2048,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=8,N=2048,device_name=NVIDIA_H100_80GB_HBM3.json → triton_3_1_0/E=8,N=2048,device_name=NVIDIA_H100_80GB_HBM3.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=8,N=2048,device_name=NVIDIA_H200,dtype=fp8_w8a8.json → triton_3_1_0/E=8,N=2048,device_name=NVIDIA_H200,dtype=fp8_w8a8.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=8,N=2048,device_name=NVIDIA_H200.json → triton_3_1_0/E=8,N=2048,device_name=NVIDIA_H200.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=8,N=3584,device_name=AMD_Instinct_MI300X.json → triton_3_1_0/E=8,N=3584,device_name=AMD_Instinct_MI300X.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=8,N=3584,device_name=AMD_Instinct_MI325X.json → triton_3_1_0/E=8,N=3584,device_name=AMD_Instinct_MI325X.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=8,N=3584,device_name=AMD_Radeon_Graphics.json → triton_3_1_0/E=8,N=3584,device_name=AMD_Radeon_Graphics.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=8,N=3584,device_name=NVIDIA_A100-SXM4-40GB.json → triton_3_1_0/E=8,N=3584,device_name=NVIDIA_A100-SXM4-40GB.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=8,N=3584,device_name=NVIDIA_A100-SXM4-80GB.json → triton_3_1_0/E=8,N=3584,device_name=NVIDIA_A100-SXM4-80GB.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=8,N=3584,device_name=NVIDIA_GeForce_RTX_4090,dtype=fp8_w8a8.json → triton_3_1_0/E=8,N=3584,device_name=NVIDIA_GeForce_RTX_4090,dtype=fp8_w8a8.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=8,N=3584,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8.json → triton_3_1_0/E=8,N=3584,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=8,N=3584,device_name=NVIDIA_H100_80GB_HBM3.json → triton_3_1_0/E=8,N=3584,device_name=NVIDIA_H100_80GB_HBM3.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=8,N=3584,device_name=NVIDIA_H200,dtype=fp8_w8a8.json → triton_3_1_0/E=8,N=3584,device_name=NVIDIA_H200,dtype=fp8_w8a8.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=8,N=3584,device_name=NVIDIA_H200.json → triton_3_1_0/E=8,N=3584,device_name=NVIDIA_H200.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=8,N=3584,device_name=NVIDIA_L40S.json → triton_3_1_0/E=8,N=3584,device_name=NVIDIA_L40S.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=8,N=4096,device_name=AMD_Instinct_MI300X,dtype=fp8_w8a8.json → triton_3_1_0/E=8,N=4096,device_name=AMD_Instinct_MI300X,dtype=fp8_w8a8.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=8,N=4096,device_name=AMD_Instinct_MI325X,dtype=fp8_w8a8.json → triton_3_1_0/E=8,N=4096,device_name=AMD_Instinct_MI325X,dtype=fp8_w8a8.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=8,N=4096,device_name=AMD_Radeon_Graphics,dtype=fp8_w8a8.json → triton_3_1_0/E=8,N=4096,device_name=AMD_Radeon_Graphics,dtype=fp8_w8a8.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=8,N=4096,device_name=NVIDIA_A100-SXM4-80GB.json → triton_3_1_0/E=8,N=4096,device_name=NVIDIA_A100-SXM4-80GB.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=8,N=4096,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8.json → triton_3_1_0/E=8,N=4096,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=8,N=4096,device_name=NVIDIA_H100_80GB_HBM3.json → triton_3_1_0/E=8,N=4096,device_name=NVIDIA_H100_80GB_HBM3.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=8,N=4096,device_name=NVIDIA_H200,dtype=fp8_w8a8.json → triton_3_1_0/E=8,N=4096,device_name=NVIDIA_H200,dtype=fp8_w8a8.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=8,N=4096,device_name=NVIDIA_H200.json → triton_3_1_0/E=8,N=4096,device_name=NVIDIA_H200.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=8,N=7168,device_name=AMD_Instinct_MI300X.json → triton_3_1_0/E=8,N=7168,device_name=AMD_Instinct_MI300X.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=8,N=7168,device_name=AMD_Instinct_MI325X.json → triton_3_1_0/E=8,N=7168,device_name=AMD_Instinct_MI325X.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=8,N=7168,device_name=AMD_Radeon_Graphics.json → triton_3_1_0/E=8,N=7168,device_name=AMD_Radeon_Graphics.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=8,N=7168,device_name=NVIDIA_A100-SXM4-80GB.json → triton_3_1_0/E=8,N=7168,device_name=NVIDIA_A100-SXM4-80GB.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=8,N=7168,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8.json → triton_3_1_0/E=8,N=7168,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=8,N=7168,device_name=NVIDIA_H100_80GB_HBM3.json → triton_3_1_0/E=8,N=7168,device_name=NVIDIA_H100_80GB_HBM3.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=8,N=7168,device_name=NVIDIA_H200,dtype=fp8_w8a8.json → triton_3_1_0/E=8,N=7168,device_name=NVIDIA_H200,dtype=fp8_w8a8.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=8,N=7168,device_name=NVIDIA_H200.json → triton_3_1_0/E=8,N=7168,device_name=NVIDIA_H200.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=8,N=8192,device_name=AMD_Instinct_MI300X,dtype=fp8_w8a8.json → triton_3_1_0/E=8,N=8192,device_name=AMD_Instinct_MI300X,dtype=fp8_w8a8.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=8,N=8192,device_name=AMD_Instinct_MI325X,dtype=fp8_w8a8.json → triton_3_1_0/E=8,N=8192,device_name=AMD_Instinct_MI325X,dtype=fp8_w8a8.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=8,N=8192,device_name=AMD_Radeon_Graphics,dtype=fp8_w8a8.json → triton_3_1_0/E=8,N=8192,device_name=AMD_Radeon_Graphics,dtype=fp8_w8a8.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=8,N=8192,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8.json → triton_3_1_0/E=8,N=8192,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=8,N=8192,device_name=NVIDIA_H200,dtype=fp8_w8a8.json → triton_3_1_0/E=8,N=8192,device_name=NVIDIA_H200,dtype=fp8_w8a8.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=128,N=192,device_name=NVIDIA_A800-SXM4-80GB.json → triton_3_2_0/E=128,N=192,device_name=NVIDIA_A800-SXM4-80GB.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=128,N=192,device_name=NVIDIA_H100_80GB_HBM3.json → triton_3_2_0/E=128,N=192,device_name=NVIDIA_H100_80GB_HBM3.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=128,N=192,device_name=NVIDIA_H20.json → triton_3_2_0/E=128,N=192,device_name=NVIDIA_H20.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=128,N=192,device_name=NVIDIA_H200.json → triton_3_2_0/E=128,N=192,device_name=NVIDIA_H200.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=128,N=384,device_name=NVIDIA_H100_80GB_HBM3.json → triton_3_2_0/E=128,N=384,device_name=NVIDIA_H100_80GB_HBM3.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=128,N=384,device_name=NVIDIA_H20,dtype=fp8_w8a8,block_shape=[128, 128].json → triton_3_2_0/E=128,N=384,device_name=NVIDIA_H20,dtype=fp8_w8a8,block_shape=[128, 128].json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=128,N=384,device_name=NVIDIA_H20.json → triton_3_2_0/E=128,N=384,device_name=NVIDIA_H20.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=128,N=384,device_name=NVIDIA_H200,dtype=fp8_w8a8,block_shape=[128, 128].json → triton_3_2_0/E=128,N=384,device_name=NVIDIA_H200,dtype=fp8_w8a8,block_shape=[128, 128].json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=128,N=384,device_name=NVIDIA_H200.json → triton_3_2_0/E=128,N=384,device_name=NVIDIA_H200.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=128,N=512,device_name=NVIDIA_H100_80GB_HBM3.json → triton_3_2_0/E=128,N=512,device_name=NVIDIA_H100_80GB_HBM3.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=128,N=768,device_name=NVIDIA_A800-SXM4-80GB.json → triton_3_2_0/E=128,N=768,device_name=NVIDIA_A800-SXM4-80GB.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=128,N=768,device_name=NVIDIA_H100_80GB_HBM3.json → triton_3_2_0/E=128,N=768,device_name=NVIDIA_H100_80GB_HBM3.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=128,N=768,device_name=NVIDIA_H20,dtype=fp8_w8a8,block_shape=[128, 128].json → triton_3_2_0/E=128,N=768,device_name=NVIDIA_H20,dtype=fp8_w8a8,block_shape=[128, 128].json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=128,N=768,device_name=NVIDIA_H20.json → triton_3_2_0/E=128,N=768,device_name=NVIDIA_H20.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=128,N=768,device_name=NVIDIA_H200,dtype=fp8_w8a8,block_shape=[128, 128].json → triton_3_2_0/E=128,N=768,device_name=NVIDIA_H200,dtype=fp8_w8a8,block_shape=[128, 128].json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=128,N=768,device_name=NVIDIA_H200.json → triton_3_2_0/E=128,N=768,device_name=NVIDIA_H200.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=128,N=96,device_name=NVIDIA_H20.json → triton_3_2_0/E=128,N=96,device_name=NVIDIA_H20.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=264,N=128,device_name=NVIDIA_A800-SXM4-80GB,dtype=int8_w8a8.json → triton_3_2_0/E=264,N=128,device_name=NVIDIA_A800-SXM4-80GB,dtype=int8_w8a8.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=264,N=256,device_name=NVIDIA_B200,dtype=fp8_w8a8,block_shape=[128, 128].json → triton_3_2_0/E=264,N=256,device_name=NVIDIA_B200,dtype=fp8_w8a8,block_shape=[128, 128].json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=264,N=256,device_name=NVIDIA_H20,dtype=fp8_w8a8,block_shape=[128, 128].json → triton_3_2_0/E=264,N=256,device_name=NVIDIA_H20,dtype=fp8_w8a8,block_shape=[128, 128].json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=264,N=256,device_name=NVIDIA_H200,dtype=fp8_w8a8,block_shape=[128, 128].json → triton_3_2_0/E=264,N=256,device_name=NVIDIA_H200,dtype=fp8_w8a8,block_shape=[128, 128].json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=272,N=128,device_name=NVIDIA_A800-SXM4-80GB,dtype=int8_w8a8.json → triton_3_2_0/E=272,N=128,device_name=NVIDIA_A800-SXM4-80GB,dtype=int8_w8a8.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=272,N=128,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8,block_shape=[128, 128].json → triton_3_2_0/E=272,N=128,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8,block_shape=[128, 128].json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=272,N=128,device_name=NVIDIA_H20,dtype=fp8_w8a8,block_shape=[128, 128].json → triton_3_2_0/E=272,N=128,device_name=NVIDIA_H20,dtype=fp8_w8a8,block_shape=[128, 128].json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=272,N=64,device_name=NVIDIA_A800-SXM4-80GB.json → triton_3_2_0/E=272,N=64,device_name=NVIDIA_A800-SXM4-80GB.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=288,N=64,device_name=NVIDIA_A800-SXM4-80GB.json → triton_3_2_0/E=288,N=64,device_name=NVIDIA_A800-SXM4-80GB.json} +0 -0
- {sglang-0.4.6.post5.dist-info → sglang-0.4.7.post1.dist-info}/licenses/LICENSE +0 -0
- {sglang-0.4.6.post5.dist-info → sglang-0.4.7.post1.dist-info}/top_level.txt +0 -0
sglang/srt/managers/scheduler.py
CHANGED
```diff
@@ -24,6 +24,7 @@ from collections import defaultdict, deque
 from concurrent import futures
 from dataclasses import dataclass
 from http import HTTPStatus
+from pathlib import Path
 from types import SimpleNamespace
 from typing import Dict, List, Optional, Tuple, Union
 
@@ -35,7 +36,10 @@ from torch.distributed import barrier
 
 from sglang.global_config import global_config
 from sglang.srt.configs.model_config import ModelConfig
-from sglang.srt.constrained.base_grammar_backend import create_grammar_backend
+from sglang.srt.constrained.base_grammar_backend import (
+    INVALID_GRAMMAR_OBJ,
+    create_grammar_backend,
+)
 from sglang.srt.disaggregation.decode import (
     DecodePreallocQueue,
     DecodeTransferQueue,
@@ -62,7 +66,6 @@ from sglang.srt.hf_transformers_utils import (
 from sglang.srt.layers.dp_attention import compute_dp_attention_world_info
 from sglang.srt.layers.logits_processor import LogitsProcessorOutput
 from sglang.srt.managers.expert_distribution import (
-    ExpertDistributionRecorder,
     get_global_expert_distribution_recorder,
 )
 from sglang.srt.managers.io_struct import (
@@ -132,11 +135,14 @@ from sglang.srt.reasoning_parser import ReasoningParser
 from sglang.srt.server_args import PortArgs, ServerArgs
 from sglang.srt.speculative.spec_info import SpeculativeAlgorithm
 from sglang.srt.torch_memory_saver_adapter import TorchMemorySaverAdapter
+from sglang.srt.two_batch_overlap import TboDPAttentionPreparer
 from sglang.srt.utils import (
+    DeepEPMode,
     DynamicGradMode,
     broadcast_pyobj,
     configure_logger,
     disable_request_logging,
+    get_available_gpu_memory,
     get_bool_env_var,
     get_zmq_socket,
     kill_itself_when_parent_died,
@@ -173,6 +179,27 @@ class EmbeddingBatchResult:
     bid: int
 
 
+class IdleSleeper:
+    """
+    In setups which have long inactivity periods it is desirable to reduce
+    system power consumption when sglang does nothing. This would lead not only
+    to power savings, but also to more CPU thermal headroom when a request
+    eventually comes. This is important in cases when multiple GPUs are connected
+    as each GPU would otherwise pin one thread at 100% CPU usage.
+
+    The simplest solution is to use zmq.Poller on all sockets that may receive
+    data that needs handling immediately.
+    """
+
+    def __init__(self, sockets):
+        self.poller = zmq.Poller()
+        for s in sockets:
+            self.poller.register(s, zmq.POLLIN)
+
+    def maybe_sleep(self):
+        self.poller.poll(1000)
+
+
 class Scheduler(
     SchedulerOutputProcessorMixin,
     SchedulerDisaggregationDecodeMixin,
```
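The `IdleSleeper` added here is the heart of the new `sleep_on_idle` behavior: instead of spinning on non-blocking receives, the scheduler blocks on `zmq.Poller.poll(1000)` and wakes as soon as any registered socket has traffic. A self-contained sketch of the same pattern follows; the PUSH/PULL sockets and port handling are illustrative, not sglang's actual IPC wiring:

```python
import threading
import time

import zmq

ctx = zmq.Context()
receiver = ctx.socket(zmq.PULL)
port = receiver.bind_to_random_port("tcp://127.0.0.1")

# Register every socket whose traffic must be handled promptly.
poller = zmq.Poller()
poller.register(receiver, zmq.POLLIN)

def send_later():
    # Simulate a request arriving after an idle period.
    time.sleep(2.5)
    sender = ctx.socket(zmq.PUSH)
    sender.connect(f"tcp://127.0.0.1:{port}")
    sender.send(b"work item")

threading.Thread(target=send_later, daemon=True).start()

while True:
    # Sleep up to 1 s per iteration while idle; wake early on traffic.
    events = dict(poller.poll(1000))
    if receiver in events:
        print("received:", receiver.recv())
        break
    print("idle, polling again")
```

The 1 s timeout bounds how long a wakeup can be delayed by work that does not arrive through a socket, which is consistent with the scheduler calling `maybe_sleep` only on its idle paths (see the `maybe_sleep_on_idle` hunks below).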
```diff
@@ -210,7 +237,6 @@ class Scheduler(
         self.gpu_id = gpu_id
         self.enable_hierarchical_cache = server_args.enable_hierarchical_cache
         self.page_size = server_args.page_size
-        # Distributed rank info
         self.dp_size = server_args.dp_size
         self.attn_tp_rank, self.attn_tp_size, self.attn_dp_rank = (
             compute_dp_attention_world_info(
@@ -223,6 +249,8 @@ class Scheduler(
 
         # Init inter-process communication
         context = zmq.Context(2)
+        self.idle_sleeper = None
+
         if self.pp_rank == 0 and self.attn_tp_rank == 0:
             self.recv_from_tokenizer = get_zmq_socket(
                 context, zmq.PULL, port_args.scheduler_input_ipc_name, False
@@ -245,6 +273,13 @@ class Scheduler(
             self.recv_from_rpc = get_zmq_socket(
                 context, zmq.DEALER, port_args.rpc_ipc_name, False
             )
+            if self.server_args.sleep_on_idle:
+                self.idle_sleeper = IdleSleeper(
+                    [
+                        self.recv_from_tokenizer,
+                        self.recv_from_rpc,
+                    ]
+                )
         else:
             self.recv_from_tokenizer = None
             self.recv_from_rpc = None
@@ -330,12 +365,16 @@ class Scheduler(
 
         # Print debug info
         if tp_rank == 0:
+            avail_mem = get_available_gpu_memory(
+                self.device, self.gpu_id, empty_cache=False
+            )
             logger.info(
                 f"max_total_num_tokens={self.max_total_num_tokens}, "
                 f"chunked_prefill_size={server_args.chunked_prefill_size}, "
                 f"max_prefill_tokens={self.max_prefill_tokens}, "
                 f"max_running_requests={self.max_running_requests}, "
-                f"context_len={self.model_config.context_len}"
+                f"context_len={self.model_config.context_len}, "
+                f"available_gpu_mem={avail_mem:.2f} GB"
             )
 
         # Init memory pool and cache
@@ -352,13 +391,14 @@ class Scheduler(
         self.forward_ct = 0
         self.forward_ct_decode = 0
         self.num_generated_tokens = 0
-        self.
+        self.last_prefill_tokens = 0
         self.last_decode_stats_tic = time.perf_counter()
         self.last_prefill_stats_tic = time.perf_counter()
         self.return_health_check_ct = 0
         self.current_stream = torch.get_device_module(self.device).current_stream()
         if self.device == "cpu":
             self.current_stream.synchronize = lambda: None  # No-op for CPU
+        self.forward_sleep_time = None
 
         # Init session info
         self.sessions: Dict[str, Session] = {}
@@ -420,10 +460,16 @@ class Scheduler(
         self.torch_profiler = None
         self.torch_profiler_output_dir: Optional[str] = None
         self.profiler_activities: Optional[List[str]] = None
-        self.
+        self.profile_id: Optional[str] = None
         self.profiler_target_forward_ct: Optional[int] = None
-
-        self.
+        self.profiler_target_prefill_ct: Optional[int] = None
+        self.profiler_target_decode_ct: Optional[int] = None
+        self.profiler_prefill_ct: Optional[int] = None
+        self.profiler_decode_ct: Optional[int] = None
+        self.profile_by_stage: bool = False
+        self.profile_steps: Optional[int] = None
+        self.profile_in_progress: bool = False
+        self.rpd_profiler = None
 
         # Init metrics stats
         self.init_metrics()
@@ -462,6 +508,10 @@ class Scheduler(
         )
         self.init_disaggregation()
 
+    def maybe_sleep_on_idle(self):
+        if self.idle_sleeper is not None:
+            self.idle_sleeper.maybe_sleep()
+
     def init_tokenizer(self):
         server_args = self.server_args
 
@@ -556,7 +606,9 @@ class Scheduler(
 
     def init_kv_events(self, kv_events_config: Optional[str]):
        if self.enable_kv_cache_events:
-            self.kv_event_publisher = EventPublisherFactory.create(
+            self.kv_event_publisher = EventPublisherFactory.create(
+                kv_events_config, self.attn_dp_rank
+            )
 
     def init_disaggregation(self):
         self.transfer_backend = TransferBackend(
@@ -567,7 +619,7 @@ class Scheduler(
             self.disaggregation_mode == DisaggregationMode.DECODE
         ):  # *2 for the headroom.
             buffer_size = (self.req_to_token_pool.size) * 2
-            req_to_metadata_buffer_idx_allocator = ReqToMetadataIdxAllocator(
+            self.req_to_metadata_buffer_idx_allocator = ReqToMetadataIdxAllocator(
                 buffer_size
             )
             self.disagg_metadata_buffers = MetadataBuffers(buffer_size)
@@ -575,7 +627,8 @@ class Scheduler(
             # The decode requests polling kv cache
             self.disagg_decode_transfer_queue = DecodeTransferQueue(
                 gloo_group=self.attn_tp_cpu_group,
-                req_to_metadata_buffer_idx_allocator=req_to_metadata_buffer_idx_allocator,
+                req_to_metadata_buffer_idx_allocator=self.req_to_metadata_buffer_idx_allocator,
+                tp_rank=self.tp_rank,
                 metadata_buffers=self.disagg_metadata_buffers,
                 scheduler=self,
                 tree_cache=self.tree_cache,
@@ -590,7 +643,7 @@ class Scheduler(
                     if self.draft_worker is None
                     else self.draft_worker.model_runner.token_to_kv_pool
                 ),
-                req_to_metadata_buffer_idx_allocator=req_to_metadata_buffer_idx_allocator,
+                req_to_metadata_buffer_idx_allocator=self.req_to_metadata_buffer_idx_allocator,
                 metadata_buffers=self.disagg_metadata_buffers,
                 scheduler=self,
                 transfer_queue=self.disagg_decode_transfer_queue,
@@ -598,7 +651,12 @@ class Scheduler(
                 gloo_group=self.attn_tp_cpu_group,
                 tp_rank=self.tp_rank,
                 tp_size=self.tp_size,
+                dp_size=self.server_args.dp_size,
+                gpu_id=self.gpu_id,
                 bootstrap_port=self.server_args.disaggregation_bootstrap_port,
+                max_total_num_tokens=self.max_total_num_tokens,
+                prefill_pp_size=self.server_args.disaggregation_prefill_pp,
+                num_reserved_decode_tokens=self.server_args.num_reserved_decode_tokens,
                 transfer_backend=self.transfer_backend,
             )
 
@@ -608,7 +666,7 @@ class Scheduler(
         elif self.disaggregation_mode == DisaggregationMode.PREFILL:
             # *2 for the headroom.
             buffer_size = self.max_running_requests * 2
-            req_to_metadata_buffer_idx_allocator = ReqToMetadataIdxAllocator(
+            self.req_to_metadata_buffer_idx_allocator = ReqToMetadataIdxAllocator(
                 buffer_size
             )
             self.disagg_metadata_buffers = MetadataBuffers(buffer_size)
@@ -620,14 +678,20 @@ class Scheduler(
                     if self.draft_worker is None
                     else self.draft_worker.model_runner.token_to_kv_pool
                 ),
-                req_to_metadata_buffer_idx_allocator=req_to_metadata_buffer_idx_allocator,
+                req_to_metadata_buffer_idx_allocator=self.req_to_metadata_buffer_idx_allocator,
                 metadata_buffers=self.disagg_metadata_buffers,
                 tp_rank=self.tp_rank,
                 tp_size=self.tp_size,
+                gpu_id=self.gpu_id,
                 bootstrap_port=self.server_args.disaggregation_bootstrap_port,
                 gloo_group=self.attn_tp_cpu_group,
-
+                max_total_num_tokens=self.max_total_num_tokens,
+                decode_tp_size=self.server_args.disaggregation_decode_tp,
+                decode_dp_size=self.server_args.disaggregation_decode_dp,
                 scheduler=self,
+                pp_rank=self.pp_rank,
+                pp_size=self.pp_size,
+                transfer_backend=self.transfer_backend,
             )
             # The prefill requests that are in the middle of kv sending
             self.disagg_prefill_inflight_queue: List[Req] = []
@@ -649,6 +713,7 @@ class Scheduler(
             # When the server is idle, do self-check and re-init some states
             self.check_memory()
             self.new_token_ratio = self.init_new_token_ratio
+            self.maybe_sleep_on_idle()
 
         self.last_batch = batch
 
@@ -693,6 +758,7 @@ class Scheduler(
             # When the server is idle, do self-check and re-init some states
             self.check_memory()
             self.new_token_ratio = self.init_new_token_ratio
+            self.maybe_sleep_on_idle()
 
         self.last_batch = batch
 
@@ -798,6 +864,7 @@ class Scheduler(
         if server_is_idle:
             self.check_memory()
             self.new_token_ratio = self.init_new_token_ratio
+            self.maybe_sleep_on_idle()
 
     def recv_requests(self) -> List[Req]:
         """Receive results at tp_rank = 0 and broadcast it to all other TP ranks."""
@@ -931,18 +998,19 @@ class Scheduler(
             bootstrap_host=recv_req.bootstrap_host,
             bootstrap_port=recv_req.bootstrap_port,
             bootstrap_room=recv_req.bootstrap_room,
+            data_parallel_rank=recv_req.data_parallel_rank,
         )
         req.tokenizer = self.tokenizer
 
         if self.disaggregation_mode != DisaggregationMode.NULL:
             # Invalid request for disaggregated mode
             if recv_req.bootstrap_room is None:
-
+                error_msg = (
                     f"Invalid request: Disaggregated request received without "
                     f"boostrap room id. {req.rid=}"
                 )
-                logger.error(
-                prepare_abort(req,
+                logger.error(error_msg)
+                prepare_abort(req, error_msg)
                 self.stream_output([req], req.return_logprob)
                 return
 
@@ -973,29 +1041,23 @@ class Scheduler(
             req.extend_image_inputs(image_inputs)
 
             if len(req.origin_input_ids) >= self.max_req_input_len:
-
-
-
-
-
-                req.origin_input_ids = [0]
-                req.multimodal_inputs = None
-                req.sampling_params.max_new_tokens = 0
-                req.finished_reason = FINISH_ABORT(
-                    error_msg, HTTPStatus.BAD_REQUEST, "BadRequestError"
+                req.set_finish_with_abort(
+                    error_msg=(
+                        "Multimodal prompt is too long after expanding multimodal tokens. "
+                        f"After expanding {len(req.origin_input_ids_unpadded)=} => {len(req.origin_input_ids)} >= {self.max_req_input_len}."
+                    )
                 )
                 self._add_request_to_queue(req)
                 return
 
-            # Validate
+            # Validate prompt length
             error_msg = validate_input_length(
                 req,
                 self.max_req_input_len,
                 self.server_args.allow_auto_truncate,
             )
             if error_msg:
-                req.
-                req.sampling_params.max_new_tokens = 0
+                req.set_finish_with_abort(error_msg)
                 self._add_request_to_queue(req)
                 return
 
@@ -1007,12 +1069,9 @@ class Scheduler(
         req.logprob_start_len = recv_req.logprob_start_len
 
         if req.logprob_start_len >= len(req.origin_input_ids):
-            req.
-                f"logprob_start_len, ({req.logprob_start_len}) is higher than the number of input tokens ({len(req.origin_input_ids)}). Request with a lower logprob_start_len.",
-                HTTPStatus.BAD_REQUEST,
-                "BadRequestError",
-            )
+            error_msg = f"{req.logprob_start_len=} is higher than the number of input tokens {len(req.origin_input_ids)=}. Please use a smaller logprob_start_len."
             req.logprob_start_len = len(req.origin_input_ids) - 1
+            req.set_finish_with_abort(error_msg)
             self._add_request_to_queue(req)
             return
 
@@ -1049,6 +1108,10 @@ class Scheduler(
         if not cache_hit:
             req.grammar_key = key
             add_to_grammar_queue = True
+        else:
+            if value is INVALID_GRAMMAR_OBJ:  # We hit a cached invalid grammar.
+                error_msg = f"Invalid grammar request with cache hit: {key=}"
+                req.set_finish_with_abort(error_msg)
 
         if add_to_grammar_queue:
             req.queue_time_start = time.perf_counter()
```
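The `INVALID_GRAMMAR_OBJ` branch above is negative caching: a grammar that failed to compile is cached under a sentinel, so later requests with the same key abort immediately instead of recompiling. A minimal sketch of the pattern, with hypothetical names (`compile_grammar`, `GrammarError`) standing in for the real backend API:

```python
# Negative caching with a sentinel: failures are cached too, so repeated
# requests for a known-bad key fail fast instead of recompiling.
INVALID = object()  # sentinel marking "known-bad" cache entries
_cache: dict[str, object] = {}

class GrammarError(ValueError):
    """Hypothetical stand-in for a real grammar-compilation failure."""

def compile_grammar(key: str) -> object:
    # Hypothetical compiler: rejects anything without an EBNF-ish rule.
    if "::=" not in key:
        raise GrammarError(f"cannot compile {key!r}")
    return ("compiled", key)

def get_grammar(key: str) -> object:
    value = _cache.get(key)
    if value is INVALID:
        raise GrammarError(f"invalid grammar (cache hit): {key!r}")
    if value is None:
        try:
            value = compile_grammar(key)
        except GrammarError:
            _cache[key] = INVALID  # remember the failure
            raise
        _cache[key] = value
    return value

for attempt in range(2):
    try:
        get_grammar("not a grammar")
    except GrammarError as e:
        print(e)  # second attempt reports the cache hit
```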
@@ -1059,18 +1122,22 @@ class Scheduler(
     def _add_request_to_queue(self, req: Req):
         req.queue_time_start = time.perf_counter()
         if self.disaggregation_mode == DisaggregationMode.PREFILL:
-            self.disagg_prefill_bootstrap_queue.add(req)
+            self.disagg_prefill_bootstrap_queue.add(
+                req, self.model_config.num_key_value_heads
+            )
         elif self.disaggregation_mode == DisaggregationMode.DECODE:
             self.disagg_decode_prealloc_queue.add(req)
         else:
             self.waiting_queue.append(req)
 
-    def _extend_requests_to_queue(self, reqs: List[Req]):
+    def _extend_requests_to_queue(self, reqs: List[Req], is_retracted: bool = False):
         if self.disaggregation_mode == DisaggregationMode.PREFILL:
-            self.disagg_prefill_bootstrap_queue.extend(reqs)
+            self.disagg_prefill_bootstrap_queue.extend(
+                reqs, self.model_config.num_key_value_heads
+            )
         elif self.disaggregation_mode == DisaggregationMode.DECODE:
             # If this is a decode server, we put the request to the decode pending prealloc queue
-            self.disagg_decode_prealloc_queue.extend(reqs)
+            self.disagg_decode_prealloc_queue.extend(reqs, is_retracted)
         else:
             self.waiting_queue.extend(reqs)
 
@@ -1083,6 +1150,7 @@ class Scheduler(
             recv_req.input_text,
             recv_req.input_ids,
             recv_req.sampling_params,
+            token_type_ids=recv_req.token_type_ids,
         )
         req.tokenizer = self.tokenizer
 
@@ -1096,19 +1164,13 @@ class Scheduler(
         req.extend_image_inputs(image_inputs)
 
         if len(req.origin_input_ids) >= self.max_req_input_len:
-
-
-
-
-
-            req.origin_input_ids = [0]
-            req.multimodal_inputs = None
-            req.sampling_params.max_new_tokens = 0
-            req.finished_reason = FINISH_ABORT(
-                error_msg, HTTPStatus.BAD_REQUEST, "BadRequestError"
+            req.set_finish_with_abort(
+                error_msg=(
+                    "Multimodal prompt is too long after expanding multimodal tokens. "
+                    f"After expanding {len(req.origin_input_ids_unpadded)=} => {len(req.origin_input_ids)} >= {self.max_req_input_len}."
+                )
             )
-
-            self.waiting_queue.append(req)
+            self._add_request_to_queue(req)
             return
 
         # Validate prompts length
@@ -1133,8 +1195,8 @@ class Scheduler(
         ):
             gap_latency = time.perf_counter() - self.last_prefill_stats_tic
             self.last_prefill_stats_tic = time.perf_counter()
-            self.last_input_throughput = self.
-            self.
+            self.last_input_throughput = self.last_prefill_tokens / gap_latency
+            self.last_prefill_tokens = adder.log_input_tokens
 
             num_used = self.max_total_num_tokens - (
                 self.token_to_kv_pool_allocator.available_size()
@@ -1148,14 +1210,15 @@ class Scheduler(
             f"#new-token: {adder.log_input_tokens}, "
             f"#cached-token: {adder.log_hit_tokens}, "
             f"token usage: {num_used / self.max_total_num_tokens:.2f}, "
-            f"#running-req: {running_bs}, "
         )
 
         if self.disaggregation_mode == DisaggregationMode.PREFILL:
             f += f"#unbootstrapped-req: {len(self.disagg_prefill_bootstrap_queue.queue)}, "
             f += f"#queue-req: {len(self.waiting_queue)}, "
-            f += f"#transferring-req: {len(self.disagg_prefill_inflight_queue)} "
+            f += f"#transferring-req: {len(self.disagg_prefill_inflight_queue)}, "
+            f += f"input throughput (token/s): {self.last_input_throughput:.2f} "
         else:
+            f += f"#running-req: {running_bs}, "
             f += f"#queue-req: {len(self.waiting_queue)}"
 
         logger.info(f)
@@ -1218,6 +1281,7 @@ class Scheduler(
 
         if self.disaggregation_mode == DisaggregationMode.DECODE:
             msg += f"pre-allocated usage: {self.num_tokens_pre_allocated / self.max_total_num_tokens:.2f}, "
+            msg += f"#retracted-req: {len(self.disagg_decode_prealloc_queue.retracted_queue)}, "
 
         msg += (
             f"cuda graph: {can_run_cuda_graph}, "
@@ -1515,11 +1579,11 @@ class Scheduler(
             self.new_token_ratio = new_token_ratio
 
             logger.info(
-                "
+                "KV cache pool is full. Retract requests. "
                 f"#retracted_reqs: {len(retracted_reqs)}, "
                 f"#new_token_ratio: {old_ratio:.4f} -> {self.new_token_ratio:.4f}"
             )
-            self._extend_requests_to_queue(retracted_reqs)
+            self._extend_requests_to_queue(retracted_reqs, is_retracted=True)
         else:
             self.new_token_ratio = max(
                 self.new_token_ratio - self.new_token_ratio_decay,
@@ -1539,13 +1603,8 @@ class Scheduler(
         """Run a batch."""
         self.forward_ct += 1
 
-        # Check profiler
-        if (
-            self.profiler_target_forward_ct
-            and self.profiler_target_forward_ct <= self.forward_ct
-        ):
-            self.send_to_tokenizer.send_pyobj(self.stop_profile())
-
+        # Whether to run the profiler
+        self._profile_batch_predicate(batch)
         if self.forward_sleep_time is not None:
             logger.info(f"Scheduler.run_batch sleep {self.forward_sleep_time}s")
             time.sleep(self.forward_sleep_time)
@@ -1571,10 +1630,9 @@ class Scheduler(
                 num_accepted_tokens,
                 can_run_cuda_graph,
             ) = self.draft_worker.forward_batch_speculative_generation(batch)
-
-
-
-            self.spec_num_total_forward_ct += batch.batch_size()
+            bs = batch.batch_size()
+            self.spec_num_total_accepted_tokens += num_accepted_tokens + bs
+            self.spec_num_total_forward_ct += bs
             self.num_generated_tokens += num_accepted_tokens
 
         if self.pp_group.is_last_rank:
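The two counters updated in this hunk feed the average speculative accept length: every forward pass contributes one verified token per sequence plus the accepted draft tokens, and the ratio of the two totals gives tokens emitted per sequence per step. A hedged worked example with invented numbers:

```python
# Worked example (all values invented) of the bookkeeping in the hunk above:
# accept length = spec_num_total_accepted_tokens / spec_num_total_forward_ct.
num_accepted_tokens, bs = 6, 2  # one step: 6 draft tokens kept, batch size 2
spec_num_total_accepted_tokens = num_accepted_tokens + bs  # 8 (verified + draft)
spec_num_total_forward_ct = bs                             # 2 per-sequence passes
print(spec_num_total_accepted_tokens / spec_num_total_forward_ct)  # 4.0 tokens/seq/step
```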
@@ -1648,6 +1706,9 @@ class Scheduler(
             disable_cuda_graph=self.server_args.disable_cuda_graph,
             spec_algorithm=self.spec_algorithm,
             speculative_num_draft_tokens=self.server_args.speculative_num_draft_tokens,
+            enable_two_batch_overlap=self.server_args.enable_two_batch_overlap,
+            enable_deepep_moe=self.server_args.enable_deepep_moe,
+            deepep_mode=DeepEPMode[self.server_args.deepep_mode],
         )
 
     @staticmethod
@@ -1661,6 +1722,9 @@ class Scheduler(
         disable_cuda_graph: bool,
         spec_algorithm,
         speculative_num_draft_tokens,
+        enable_two_batch_overlap: bool,
+        enable_deepep_moe: bool,
+        deepep_mode: DeepEPMode,
     ):
         # Check if other DP workers have running batches
         if local_batch is None:
@@ -1696,17 +1760,26 @@ class Scheduler(
         is_extend_in_batch = (
             local_batch.forward_mode.is_extend() if local_batch else False
         )
+
+        tbo_preparer = TboDPAttentionPreparer()
+
         local_info = torch.tensor(
             [
                 num_tokens,
                 can_cuda_graph,
                 num_tokens_for_logprob,
                 is_extend_in_batch,
+                *tbo_preparer.prepare_all_gather(
+                    local_batch,
+                    deepep_mode,
+                    enable_deepep_moe,
+                    enable_two_batch_overlap,
+                ),
             ],
             dtype=torch.int64,
         )
         global_info = torch.empty(
-            (dp_size, attn_tp_size, 4),
+            (dp_size, attn_tp_size, 6),
             dtype=torch.int64,
         )
         torch.distributed.all_gather_into_tensor(
@@ -1719,6 +1792,10 @@ class Scheduler(
         global_num_tokens_for_logprob = global_info[:, 0, 2].tolist()
         is_extend_in_batch = global_info[:, 0, 3].tolist()
 
+        tbo_split_seq_index, global_forward_mode = tbo_preparer.compute_output(
+            global_info[:, :, 4:6]
+        )
+
         if local_batch is None and max(global_num_tokens) > 0:
             local_batch = get_idle_batch()
 
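For readers following the all-gather change above: the per-rank vector grows from 4 to 6 int64 fields so the two TboDPAttentionPreparer values ride along in the same collective, and columns 4:6 are sliced back out afterwards. A standalone sketch of that packing trick (not sglang code; shapes and values are invented):

```python
import torch

dp_size, attn_tp_size = 2, 1
# Per-rank payload: 4 scheduler fields plus 2 TBO fields, packed as one vector
# so a single all_gather_into_tensor call moves everything at once.
local_info = torch.tensor([7, 1, 7, 1, 3, 0], dtype=torch.int64)
# What the all-gather would assemble across dp_size * attn_tp_size ranks:
global_info = local_info.repeat(dp_size * attn_tp_size, 1).view(
    dp_size, attn_tp_size, 6
)
tbo_fields = global_info[:, :, 4:6]  # the slice handed to compute_output above
print(tbo_fields.shape)  # torch.Size([2, 1, 2])
```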
@@ -1732,6 +1809,8 @@ class Scheduler(
             local_batch.global_num_tokens_for_logprob = (
                 global_num_tokens_for_logprob
             )
+            local_batch.tbo_split_seq_index = tbo_split_seq_index
+            local_batch.global_forward_mode = global_forward_mode
 
         # Check forward mode for cuda graph
         if not disable_cuda_graph:
@@ -1757,17 +1836,25 @@ class Scheduler(
         """Move requests whose grammar objects are ready from grammar_queue to waiting_queue."""
 
         num_ready_reqs = 0
-
+        num_timeout_reqs = 0
         for req in self.grammar_queue:
             try:
+                if req.finished():  # It is aborted by AbortReq
+                    num_ready_reqs += 1
+                    continue
                 req.grammar = req.grammar.result(timeout=0.03)
-
-
+                self.grammar_backend.set_cache(req.grammar_key, req.grammar.copy())
+                if req.grammar is INVALID_GRAMMAR_OBJ:
+                    req.set_finish_with_abort(
+                        f"Invalid grammar request: {req.grammar_key=}"
+                    )
                 num_ready_reqs += 1
             except futures._base.TimeoutError:
                 req.grammar_wait_ct += 1
+                # NOTE(lianmin): this timeout is the waiting time of the above line. It is
+                # not the waiting time from it enters the grammar queue.
                 if req.grammar_wait_ct > GRAMMAR_TIMEOUT / 0.03:
-
+                    num_timeout_reqs = 1
                 break
 
         if self.server_args.enable_dp_attention:
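The NOTE added above is easy to miss: the 30 ms timeout applies per poll, so `grammar_wait_ct` approximates cumulative waiting and the comparison against `GRAMMAR_TIMEOUT / 0.03` caps total wait. A standalone sketch of that polling pattern (not sglang code; the timeout value is invented, and the scheduler's repeated visits across ticks are compressed into one loop here):

```python
import concurrent.futures as futures
import time

GRAMMAR_TIMEOUT = 0.3  # seconds; illustrative only

def slow_compile():
    time.sleep(1.0)  # stands in for grammar compilation
    return "grammar"

with futures.ThreadPoolExecutor() as pool:
    fut = pool.submit(slow_compile)
    wait_ct, result = 0, None
    while True:
        try:
            result = fut.result(timeout=0.03)  # each pass waits at most 30 ms
            break
        except futures.TimeoutError:
            wait_ct += 1
            if wait_ct > GRAMMAR_TIMEOUT / 0.03:  # ~GRAMMAR_TIMEOUT s in total
                fut.cancel()  # mirrors req.grammar.cancel() in the hunk above
                break
    print(result, wait_ct)
```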
@@ -1779,28 +1866,33 @@ class Scheduler(
 
         if tp_size > 1:
             # Sync across TP ranks to make sure they have the same number of ready requests
-            tensor = torch.tensor([num_ready_reqs, num_abort_reqs], dtype=torch.int32)
+            tensor = torch.tensor([num_ready_reqs, num_timeout_reqs], dtype=torch.int32)
             torch.distributed.all_reduce(
                 tensor, op=torch.distributed.ReduceOp.MAX, group=tp_group
             )
-            num_ready_reqs_max, num_abort_reqs_max = tensor.tolist()
+            num_ready_reqs_max, num_timeout_reqs_max = tensor.tolist()
 
             for i in range(num_ready_reqs, num_ready_reqs_max):
                 req = self.grammar_queue[i]
+                if req.finished():  # It is aborted by AbortReq
+                    continue
                 req.grammar = req.grammar.result()
-
-
+                self.grammar_backend.set_cache(req.grammar_key, req.grammar.copy())
+                if req.grammar is INVALID_GRAMMAR_OBJ:
+                    req.set_finish_with_abort(
+                        f"Invalid grammar request: {req.grammar_key=}"
+                    )
+        else:
+            num_ready_reqs_max = num_ready_reqs
+            num_timeout_reqs_max = num_timeout_reqs
 
-
-
-
-
-
-
-
-                    error_msg, HTTPStatus.BAD_REQUEST, "BadRequestError"
-                )
-            num_ready_reqs = num_ready_reqs_max + num_abort_reqs_max
+        for i in range(num_ready_reqs, num_ready_reqs + num_timeout_reqs_max):
+            req = self.grammar_queue[i]
+            req.grammar.cancel()
+            error_msg = f"Grammar preprocessing timed out for {req.grammar_key=}"
+            req.set_finish_with_abort(error_msg)
+            self.grammar_backend.set_cache(req.grammar_key, INVALID_GRAMMAR_OBJ)
+        num_ready_reqs = num_ready_reqs_max + num_timeout_reqs_max
 
         self._extend_requests_to_queue(self.grammar_queue[:num_ready_reqs])
         self.grammar_queue = self.grammar_queue[num_ready_reqs:]
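The sync in this hunk packs both counters into one tensor so a single MAX all-reduce keeps every TP rank popping the same number of grammar requests. A minimal standalone sketch of that pattern (not sglang code), runnable as a one-process gloo group:

```python
import os
import torch
import torch.distributed as dist

def sync_counts(num_ready: int, num_timeout: int, group=None):
    # One tensor, one collective: every rank ends up with the element-wise max.
    t = torch.tensor([num_ready, num_timeout], dtype=torch.int32)
    dist.all_reduce(t, op=dist.ReduceOp.MAX, group=group)
    return t.tolist()

if __name__ == "__main__":
    os.environ.setdefault("MASTER_ADDR", "127.0.0.1")
    os.environ.setdefault("MASTER_PORT", "29500")
    dist.init_process_group("gloo", rank=0, world_size=1)
    print(sync_counts(3, 1))  # -> [3, 1]
    dist.destroy_process_group()
```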
@@ -1887,6 +1979,27 @@ class Scheduler(
             if_success = False
         return if_success
 
+    def get_load(self):
+        # TODO(lsyin): use dynamically maintained num_waiting_tokens
+        load = (
+            self.max_total_num_tokens
+            - self.token_to_kv_pool_allocator.available_size()
+            - self.tree_cache.evictable_size()
+        )
+        load += sum(len(req.origin_input_ids) for req in self.waiting_queue)
+        if self.disaggregation_mode == DisaggregationMode.PREFILL:
+            load += sum(
+                len(req.origin_input_ids)
+                for req in self.disagg_prefill_bootstrap_queue.queue
+            )
+        elif self.disaggregation_mode == DisaggregationMode.DECODE:
+            load += sum(
+                len(req.req.origin_input_ids)
+                for req in self.disagg_decode_prealloc_queue.queue
+            )
+
+        return load
+
     def get_internal_state(self, recv_req: GetInternalStateReq):
         ret = dict(global_server_args_dict)
         ret["last_gen_throughput"] = self.last_gen_throughput
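In words, the new `get_load` is: tokens currently pinned in the KV pool (total minus free minus evictable) plus the input tokens still waiting in whichever queues apply to the serving mode. A hedged worked example with invented sizes:

```python
# Invented numbers, mirroring the formula in get_load above.
max_total_num_tokens = 10_000
available = 6_000               # token_to_kv_pool_allocator.available_size()
evictable = 1_500               # tree_cache.evictable_size()
queued_input_lens = [300, 200]  # len(req.origin_input_ids) per waiting request

load = max_total_num_tokens - available - evictable + sum(queued_input_lens)
print(load)  # 3000 tokens of effective load
```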
@@ -1896,9 +2009,10 @@ class Scheduler(
         )
         if RECORD_STEP_TIME:
             ret["step_time_dict"] = self.step_time_dict
-
-
-
+
+        ret["load"] = self.get_load()
+
+        return GetInternalStateReqOutput(internal_state=ret)
 
     def set_internal_state(self, recv_req: SetInternalStateReq):
         server_args_dict = recv_req.server_args
@@ -1932,7 +2046,7 @@ class Scheduler(
         self.cum_spec_accept_length = self.cum_spec_accept_count = 0
         for k, v in server_args_dict.items():
             global_server_args_dict[k] = v
-        logger.info(f"Global server args updated!
+        logger.info(f"Global server args updated! {global_server_args_dict=}")
         return SetInternalStateReqOutput(
             updated=True,
             server_args=global_server_args_dict,
@@ -1974,8 +2088,6 @@ class Scheduler(
         )
 
     def abort_request(self, recv_req: AbortReq):
-        # TODO(lmzheng): abort the requests in the grammar queue.
-
         # Delete requests in the waiting queue
         to_del = []
         for i, req in enumerate(self.waiting_queue):
@@ -1984,10 +2096,24 @@ class Scheduler(
 
         # Sort in reverse order to avoid index issues when deleting
         for i in reversed(to_del):
+            # Abort method 1: directly pop from the queue
+            # This only works for requests that have not started anything.
+            # We still need to send something back to TokenizerManager to clean up the state.
             req = self.waiting_queue.pop(i)
             self.send_to_tokenizer.send_pyobj(AbortReq(req.rid))
             logger.debug(f"Abort queued request. {req.rid=}")
 
+        # Delete the requests in the grammar queue
+        for req in self.grammar_queue:
+            # Abort method 2: call `set_finish_with_abort`
+            # The request will still run one prefill forward pass.
+            # In this case, we change the input_ids to be only one token to make this prefill cheap.
+            if req.rid.startswith(recv_req.rid):
+                logger.debug(f"Abort grammar queue request. {req.rid=}")
+                if req.grammar:
+                    req.grammar.cancel()
+                req.set_finish_with_abort("Aborted by AbortReq.")
+
         # Delete requests in the running batch
         if self.cur_batch is self.running_batch or self.cur_batch is None:
             reqs = self.running_batch.reqs
@@ -1996,6 +2122,9 @@ class Scheduler(
 
         for req in reqs:
             if req.rid.startswith(recv_req.rid) and not req.finished():
+                # Abort method 3: set `to_abort=True`
+                # The request will still run one decode forward pass.
+                # Then we reuse all existing code to clean up the KV cache allocation.
                 logger.debug(f"Abort running request. {req.rid=}")
                 req.to_abort = True
 
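All three abort paths above match requests by rid prefix, so one AbortReq can fan out to every sub-request that shares the parent rid. A tiny sketch of that matching rule (rids invented for illustration):

```python
# Prefix matching as used by `req.rid.startswith(recv_req.rid)` above.
queued = ["abc-0", "abc-1", "xyz-0"]
abort_rid = "abc"
to_abort = [rid for rid in queued if rid.startswith(abort_rid)]
print(to_abort)  # ['abc-0', 'abc-1']
```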
@@ -2075,46 +2204,86 @@ class Scheduler(
 
     def profile(self, recv_req: ProfileReq):
         if recv_req.type == ProfileReqType.START_PROFILE:
-
-
-
-
-
-
-
-
+            if recv_req.profile_by_stage:
+                return self.init_profile(
+                    recv_req.output_dir,
+                    recv_req.num_steps,
+                    recv_req.activities,
+                    recv_req.with_stack,
+                    recv_req.record_shapes,
+                    recv_req.profile_by_stage,
+                    recv_req.profile_id,
+                )
+            else:
+                self.init_profile(
+                    recv_req.output_dir,
+                    recv_req.num_steps,
+                    recv_req.activities,
+                    recv_req.with_stack,
+                    recv_req.record_shapes,
+                    recv_req.profile_by_stage,
+                    recv_req.profile_id,
+                )
+                return self.start_profile(True)
         else:
             return self.stop_profile()
 
-    def start_profile(
+    def init_profile(
         self,
         output_dir: Optional[str],
         num_steps: Optional[int],
         activities: Optional[List[str]],
         with_stack: Optional[bool],
         record_shapes: Optional[bool],
-
-
-
+        profile_by_stage: bool,
+        profile_id: str,
+    ) -> ProfileReqOutput:
+        if self.profile_in_progress:
             return ProfileReqOutput(
                 success=False,
                 message="Profiling is already in progress. Call /stop_profile first.",
             )
 
+        self.profile_by_stage = profile_by_stage
+
         if output_dir is None:
             output_dir = os.getenv("SGLANG_TORCH_PROFILER_DIR", "/tmp")
         if activities is None:
             activities = ["CPU", "GPU"]
 
         self.torch_profiler_output_dir = output_dir
+        self.torch_profiler_with_stack = with_stack
+        self.torch_profiler_record_shapes = record_shapes
         self.profiler_activities = activities
-        self.
+        self.profile_id = profile_id
+
+        if num_steps:
+            self.profile_steps = num_steps
+            if self.profile_by_stage:
+                self.profiler_target_prefill_ct = num_steps
+                self.profiler_target_decode_ct = num_steps
+                self.profiler_prefill_ct = 0
+                self.profiler_decode_ct = 0
+            else:
+                self.profiler_target_forward_ct = self.forward_ct + num_steps
+                # The caller will be notified when reaching profiler_target_forward_ct
+        else:
+            self.profiler_target_forward_ct = None
+
+        return ProfileReqOutput(success=True, message="Succeeded")
+
+    def start_profile(
+        self, stage: Optional[ForwardMode] = None
+    ) -> ProfileReqOutput | None:
+        stage_str = f" for {stage.__str__()}" if stage else ""
         logger.info(
-            "Profiling starts. Traces will be saved to:
-            self.torch_profiler_output_dir,
-            self.profiler_id,
+            f"Profiling starts{stage_str}. Traces will be saved to: {self.torch_profiler_output_dir} (with profile id: {self.profile_id})",
         )
 
+        activities = self.profiler_activities
+        with_stack = self.torch_profiler_with_stack
+        record_shapes = self.torch_profiler_record_shapes
+
         activity_map = {
             "CPU": torch.profiler.ProfilerActivity.CPU,
             "GPU": torch.profiler.ProfilerActivity.CUDA,
@@ -2123,48 +2292,100 @@ class Scheduler(
             activity_map[a] for a in activities if a in activity_map
         ]
 
-        if torchprof_activities:
+        if "RPD" in activities:
+            from rpdTracerControl import rpdTracerControl
+
+            rpdTracerControl.skipCreate()
+
+            self.rpd_profile_path = os.path.join(
+                self.torch_profiler_output_dir,
+                "rpd-" + str(time.time()) + f"-TP-{self.tp_rank}" + ".trace.json.gz",
+            )
+
+            if self.tp_rank == 0:
+                import sqlite3
+
+                from rocpd.schema import RocpdSchema
+
+                if os.path.exists("trace.rpd"):
+                    os.unlink("trace.rpd")
+                schema = RocpdSchema()
+                connection = sqlite3.connect("trace.rpd")
+                schema.writeSchema(connection)
+                connection.commit()
+                del connection
+            torch.distributed.barrier(self.tp_cpu_group)
+
+            self.rpd_profiler = rpdTracerControl()
+            self.rpd_profiler.setPythonTrace(True)
+            self.rpd_profiler.start()
+            self.rpd_profiler.rangePush("", "rpd profile range", "")
+            self.profile_in_progress = True
+        elif torchprof_activities:
             self.torch_profiler = torch.profiler.profile(
                 activities=torchprof_activities,
                 with_stack=with_stack if with_stack is not None else True,
                 record_shapes=record_shapes if record_shapes is not None else False,
             )
             self.torch_profiler.start()
+            self.profile_in_progress = True
 
         if "MEM" in activities:
             torch.cuda.memory._record_memory_history(max_entries=100000)
+            self.profile_in_progress = True
 
         if "CUDA_PROFILER" in activities:
             torch.cuda.cudart().cudaProfilerStart()
 
-
-            self.profiler_target_forward_ct = self.forward_ct + num_steps
-            # The caller will be notified when reaching profiler_target_forward_ct
-        else:
-            self.profiler_target_forward_ct = None
-        return ProfileReqOutput(success=True, message="Succeeded")
+        return ProfileReqOutput(success=True, message="Succeeded")
 
-    def stop_profile(
-
+    def stop_profile(
+        self, stage: Optional[ForwardMode] = None
+    ) -> ProfileReqOutput | None:
+        if not self.profile_in_progress:
             return ProfileReqOutput(
                 success=False,
                 message="Profiling is not in progress. Call /start_profile first.",
             )
 
-
+        if not Path(self.torch_profiler_output_dir).exists():
+            Path(self.torch_profiler_output_dir).mkdir(parents=True, exist_ok=True)
+
+        stage_suffix = f"-{stage.__str__()}" if stage else ""
+        logger.info("Stop profiling" + stage_suffix + "...")
         if self.torch_profiler is not None:
             self.torch_profiler.stop()
             self.torch_profiler.export_chrome_trace(
                 os.path.join(
                     self.torch_profiler_output_dir,
-                    self.
+                    self.profile_id
+                    + f"-TP-{self.tp_rank}"
+                    + stage_suffix
+                    + ".trace.json.gz",
                 )
             )
+            torch.distributed.barrier(self.tp_cpu_group)
+
+        if self.rpd_profiler is not None:
+            self.rpd_profiler.rangePop()
+            self.rpd_profiler.stop()
+            self.rpd_profiler.flush()
+
+            torch.distributed.barrier(self.tp_cpu_group)
+            if self.tp_rank == 0:
+                from sglang.srt.utils import rpd_to_chrome_trace
+
+                rpd_to_chrome_trace("trace.rpd", self.rpd_profile_path)
+            self.rpd_profiler = None
+            self.rpd_profiler_path = None
 
-        if "MEM" in self.profiler_activities:
+        if self.profiler_activities is not None and "MEM" in self.profiler_activities:
             memory_profile_path = os.path.join(
                 self.torch_profiler_output_dir,
-
+                str(time.time())
+                + f"-TP-{self.tp_rank}-memory"
+                + stage_suffix
+                + ".pickle",
             )
             torch.cuda.memory._dump_snapshot(memory_profile_path)
             torch.cuda.memory._record_memory_history(enabled=None)
@@ -2177,10 +2398,38 @@ class Scheduler(
                 self.torch_profiler_output_dir,
             )
         self.torch_profiler = None
-        self.
-
-
-
+        self.profile_in_progress = False
+
+        return ProfileReqOutput(success=True, message="Succeeded.")
+
+    def _profile_batch_predicate(self, batch):
+        if self.profile_by_stage:
+            if batch.forward_mode.is_prefill():
+                if self.profiler_prefill_ct == 0:
+                    self.start_profile(batch.forward_mode)
+                self.profiler_prefill_ct += 1
+                if self.profiler_prefill_ct > self.profiler_target_prefill_ct:
+                    if self.profile_in_progress:
+                        self.stop_profile(stage=ForwardMode.EXTEND)
+            elif batch.forward_mode.is_decode():
+                if self.profiler_decode_ct == 0:
+                    if self.profile_in_progress:
+                        # force trace flush
+                        self.stop_profile(ForwardMode.EXTEND)
+                    self.start_profile(batch.forward_mode)
+                self.profiler_decode_ct += 1
+                if self.profiler_decode_ct > self.profiler_target_decode_ct:
+                    if self.profile_in_progress:
+                        self.stop_profile(stage=ForwardMode.DECODE)
+            else:
+                raise RuntimeError("unsupported profile stage")
+        else:
+            # Check profiler
+            if (
+                self.profiler_target_forward_ct
+                and self.profiler_target_forward_ct <= self.forward_ct
+            ):
+                self.stop_profile()
 
     def expert_distribution_handle(self, recv_req: ExpertDistributionReq):
         if recv_req == ExpertDistributionReq.START_RECORD:
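Taken together, `init_profile` / `start_profile` / `_profile_batch_predicate` let a client request per-stage traces: with `profile_by_stage` set, profiling arms lazily and the predicate starts and stops the profiler around the first `num_steps` prefill and decode batches separately. A hedged usage sketch follows; the JSON keys mirror the ProfileReq fields read above, but the `/start_profile` endpoint path and the payload shape are assumptions (the path is inferred from the "Call /start_profile first." message in this diff):

```python
import requests  # assumes a running sglang server on this host/port

resp = requests.post(
    "http://127.0.0.1:30000/start_profile",
    json={
        "output_dir": "/tmp/sglang-traces",
        "num_steps": 5,                # per-stage budget when profiling by stage
        "activities": ["CPU", "GPU"],
        "profile_by_stage": True,      # emit separate prefill/decode traces
    },
)
print(resp.status_code, resp.text)
```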
|