sglang 0.4.6.post5__py3-none-any.whl → 0.4.7.post1__py3-none-any.whl
This diff compares publicly released package versions as they appear in their respective public registries. It is provided for informational purposes only.
- sglang/__init__.py +2 -0
- sglang/api.py +7 -0
- sglang/bench_offline_throughput.py +10 -4
- sglang/bench_one_batch_server.py +67 -11
- sglang/bench_serving.py +86 -75
- sglang/lang/backend/runtime_endpoint.py +24 -1
- sglang/lang/interpreter.py +40 -1
- sglang/lang/ir.py +27 -0
- sglang/math_utils.py +8 -0
- sglang/profiler.py +167 -0
- sglang/srt/_custom_ops.py +34 -0
- sglang/srt/configs/internvl.py +8 -12
- sglang/srt/configs/model_config.py +33 -1
- sglang/srt/constrained/base_grammar_backend.py +5 -2
- sglang/srt/constrained/llguidance_backend.py +9 -8
- sglang/srt/constrained/outlines_backend.py +5 -4
- sglang/srt/constrained/xgrammar_backend.py +18 -18
- sglang/srt/conversation.py +52 -8
- sglang/srt/custom_op.py +38 -3
- sglang/srt/debug_utils.py +74 -0
- sglang/srt/disaggregation/base/__init__.py +1 -1
- sglang/srt/disaggregation/base/conn.py +25 -11
- sglang/srt/disaggregation/common/__init__.py +5 -0
- sglang/srt/disaggregation/common/conn.py +407 -0
- sglang/srt/disaggregation/common/utils.py +42 -0
- sglang/srt/disaggregation/decode.py +261 -52
- sglang/srt/disaggregation/fake/__init__.py +1 -1
- sglang/srt/disaggregation/fake/conn.py +16 -9
- sglang/srt/disaggregation/kv_events.py +60 -5
- sglang/srt/disaggregation/launch_lb.py +140 -0
- sglang/srt/disaggregation/mini_lb.py +29 -48
- sglang/srt/disaggregation/mooncake/__init__.py +1 -1
- sglang/srt/disaggregation/mooncake/conn.py +446 -149
- sglang/srt/disaggregation/mooncake/transfer_engine.py +32 -16
- sglang/srt/disaggregation/nixl/__init__.py +6 -1
- sglang/srt/disaggregation/nixl/conn.py +134 -437
- sglang/srt/disaggregation/prefill.py +130 -43
- sglang/srt/disaggregation/utils.py +127 -86
- sglang/srt/distributed/device_communicators/pymscclpp.py +315 -0
- sglang/srt/distributed/parallel_state.py +52 -5
- sglang/srt/entrypoints/EngineBase.py +6 -0
- sglang/srt/entrypoints/engine.py +116 -5
- sglang/srt/entrypoints/http_server.py +28 -4
- sglang/srt/eplb_simulator/__init__.py +1 -0
- sglang/srt/eplb_simulator/reader.py +51 -0
- sglang/srt/function_call/base_format_detector.py +138 -86
- sglang/srt/function_call/deepseekv3_detector.py +54 -6
- sglang/srt/function_call/ebnf_composer.py +33 -19
- sglang/srt/function_call/function_call_parser.py +27 -0
- sglang/srt/function_call/llama32_detector.py +33 -14
- sglang/srt/function_call/mistral_detector.py +73 -26
- sglang/srt/function_call/pythonic_detector.py +86 -20
- sglang/srt/function_call/qwen25_detector.py +64 -10
- sglang/srt/function_call/utils.py +17 -0
- sglang/srt/hf_transformers_utils.py +4 -0
- sglang/srt/layers/activation.py +19 -0
- sglang/srt/layers/attention/aiter_backend.py +503 -125
- sglang/srt/layers/attention/base_attn_backend.py +4 -0
- sglang/srt/layers/attention/cutlass_mla_backend.py +40 -34
- sglang/srt/layers/attention/flashattention_backend.py +137 -63
- sglang/srt/layers/attention/flashinfer_backend.py +46 -3
- sglang/srt/layers/attention/flashinfer_mla_backend.py +59 -25
- sglang/srt/layers/attention/flashmla_backend.py +2 -10
- sglang/srt/layers/attention/intel_amx_backend.py +128 -0
- sglang/srt/layers/attention/tbo_backend.py +232 -0
- sglang/srt/layers/attention/torch_native_backend.py +3 -0
- sglang/srt/layers/attention/triton_backend.py +304 -65
- sglang/srt/layers/attention/triton_ops/decode_attention.py +2 -7
- sglang/srt/layers/attention/triton_ops/extend_attention.py +12 -4
- sglang/srt/layers/attention/vision.py +51 -24
- sglang/srt/layers/communicator.py +281 -197
- sglang/srt/layers/dp_attention.py +6 -5
- sglang/srt/layers/layernorm.py +30 -19
- sglang/srt/layers/linear.py +0 -4
- sglang/srt/layers/logits_processor.py +0 -12
- sglang/srt/layers/moe/cutlass_moe.py +170 -7
- sglang/srt/layers/moe/cutlass_moe_params.py +169 -0
- sglang/srt/layers/moe/ep_moe/kernels.py +33 -11
- sglang/srt/layers/moe/ep_moe/layer.py +136 -72
- sglang/srt/layers/moe/ep_moe/token_dispatcher.py +24 -45
- sglang/srt/layers/moe/fused_moe_native.py +4 -0
- sglang/srt/layers/moe/fused_moe_triton/configs/triton_3_2_0/E=257,N=128,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8,block_shape=[128, 128].json +146 -0
- sglang/srt/layers/moe/fused_moe_triton/configs/triton_3_2_0/E=257,N=128,device_name=NVIDIA_H20,dtype=fp8_w8a8,block_shape=[128, 128].json +146 -0
- sglang/srt/layers/moe/fused_moe_triton/configs/triton_3_2_0/E=257,N=256,device_name=NVIDIA_B200,dtype=fp8_w8a8,block_shape=[128, 128].json +146 -0
- sglang/srt/layers/moe/fused_moe_triton/configs/triton_3_2_0/E=257,N=256,device_name=NVIDIA_H20,dtype=fp8_w8a8,block_shape=[128, 128].json +146 -0
- sglang/srt/layers/moe/fused_moe_triton/configs/triton_3_2_0/E=257,N=256,device_name=NVIDIA_H200,dtype=fp8_w8a8,block_shape=[128, 128].json +146 -0
- sglang/srt/layers/moe/fused_moe_triton/configs/triton_3_2_0/E=8,N=7168,device_name=NVIDIA_H100_80GB_HBM3.json +146 -0
- sglang/srt/layers/moe/fused_moe_triton/configs/triton_3_3_1/E=257,N=256,device_name=NVIDIA_B200,dtype=fp8_w8a8,block_shape=[128, 128].json +146 -0
- sglang/srt/layers/moe/fused_moe_triton/configs/triton_3_3_1/E=8,N=7168,device_name=NVIDIA_H100_80GB_HBM3.json +146 -0
- sglang/srt/layers/moe/fused_moe_triton/fused_moe.py +221 -29
- sglang/srt/layers/moe/fused_moe_triton/layer.py +34 -4
- sglang/srt/layers/moe/topk.py +60 -26
- sglang/srt/layers/multimodal.py +3 -3
- sglang/srt/layers/pooler.py +56 -0
- sglang/srt/layers/quantization/__init__.py +3 -2
- sglang/srt/layers/quantization/blockwise_int8.py +3 -0
- sglang/srt/layers/quantization/compressed_tensors/compressed_tensors_moe.py +5 -0
- sglang/srt/layers/quantization/deep_gemm_wrapper/__init__.py +1 -0
- sglang/srt/layers/quantization/{deep_gemm.py → deep_gemm_wrapper/compile_utils.py} +69 -127
- sglang/srt/layers/quantization/deep_gemm_wrapper/configurer.py +32 -0
- sglang/srt/layers/quantization/deep_gemm_wrapper/entrypoint.py +110 -0
- sglang/srt/layers/quantization/fp8.py +28 -23
- sglang/srt/layers/quantization/fp8_kernel.py +156 -75
- sglang/srt/layers/quantization/fp8_utils.py +250 -69
- sglang/srt/layers/quantization/modelopt_quant.py +334 -7
- sglang/srt/layers/quantization/moe_wna16.py +3 -0
- sglang/srt/layers/quantization/w8a8_fp8.py +3 -0
- sglang/srt/layers/quantization/w8a8_int8.py +3 -0
- sglang/srt/layers/radix_attention.py +2 -3
- sglang/srt/layers/rotary_embedding.py +6 -12
- sglang/srt/layers/sampler.py +80 -79
- sglang/srt/layers/utils.py +6 -0
- sglang/srt/lora/layers.py +12 -15
- sglang/srt/lora/lora.py +49 -5
- sglang/srt/lora/lora_manager.py +98 -39
- sglang/srt/lora/mem_pool.py +28 -21
- sglang/srt/lora/utils.py +17 -13
- sglang/srt/managers/cache_controller.py +2 -1
- sglang/srt/managers/data_parallel_controller.py +13 -5
- sglang/srt/managers/eplb_algorithms/__init__.py +63 -0
- sglang/srt/managers/eplb_algorithms/deepseek.py +223 -0
- sglang/srt/managers/{deepseek_eplb.py → eplb_algorithms/deepseek_vec.py} +5 -7
- sglang/srt/managers/eplb_manager.py +55 -14
- sglang/srt/managers/expert_distribution.py +220 -46
- sglang/srt/managers/expert_location.py +110 -56
- sglang/srt/managers/expert_location_dispatch.py +23 -6
- sglang/srt/managers/io_struct.py +43 -8
- sglang/srt/managers/mm_utils.py +88 -38
- sglang/srt/managers/multimodal_processors/base_processor.py +190 -18
- sglang/srt/managers/multimodal_processors/gemma3.py +4 -31
- sglang/srt/managers/multimodal_processors/internvl.py +4 -0
- sglang/srt/managers/multimodal_processors/kimi_vl.py +15 -34
- sglang/srt/managers/multimodal_processors/minicpm.py +2 -1
- sglang/srt/managers/multimodal_processors/phi4mm.py +87 -0
- sglang/srt/managers/multimodal_processors/qwen_vl.py +22 -64
- sglang/srt/managers/multimodal_processors/vila.py +85 -0
- sglang/srt/managers/schedule_batch.py +173 -38
- sglang/srt/managers/scheduler.py +376 -127
- sglang/srt/managers/tokenizer_manager.py +163 -19
- sglang/srt/managers/utils.py +0 -4
- sglang/srt/mem_cache/chunk_cache.py +1 -0
- sglang/srt/mem_cache/hiradix_cache.py +4 -2
- sglang/srt/mem_cache/memory_pool.py +111 -407
- sglang/srt/mem_cache/memory_pool_host.py +380 -0
- sglang/srt/mem_cache/radix_cache.py +36 -12
- sglang/srt/metrics/collector.py +9 -0
- sglang/srt/model_executor/cuda_graph_runner.py +191 -113
- sglang/srt/model_executor/expert_location_updater.py +157 -22
- sglang/srt/model_executor/forward_batch_info.py +52 -22
- sglang/srt/model_executor/model_runner.py +102 -62
- sglang/srt/model_loader/loader.py +8 -1
- sglang/srt/model_loader/utils.py +67 -1
- sglang/srt/models/bert.py +113 -13
- sglang/srt/models/deepseek_nextn.py +1 -1
- sglang/srt/models/deepseek_v2.py +623 -290
- sglang/srt/models/gemma3_causal.py +7 -0
- sglang/srt/models/gemma3_mm.py +19 -14
- sglang/srt/models/idefics2.py +342 -0
- sglang/srt/models/internvl.py +46 -102
- sglang/srt/models/kimi_vl.py +4 -4
- sglang/srt/models/llama.py +1 -1
- sglang/srt/models/minicpmo.py +2 -5
- sglang/srt/models/minicpmv.py +3 -295
- sglang/srt/models/phi4mm.py +512 -0
- sglang/srt/models/qwen2.py +38 -9
- sglang/srt/models/qwen2_5_vl.py +3 -9
- sglang/srt/models/qwen2_eagle.py +4 -1
- sglang/srt/models/qwen2_moe.py +58 -191
- sglang/srt/models/qwen2_vl.py +3 -9
- sglang/srt/models/qwen3.py +41 -10
- sglang/srt/models/qwen3_moe.py +230 -191
- sglang/srt/models/registry.py +9 -1
- sglang/srt/models/roberta.py +117 -9
- sglang/srt/models/transformers.py +291 -0
- sglang/srt/models/vila.py +305 -0
- sglang/srt/openai_api/adapter.py +248 -28
- sglang/srt/openai_api/protocol.py +68 -3
- sglang/srt/openai_api/utils.py +172 -0
- sglang/srt/operations.py +37 -2
- sglang/srt/operations_strategy.py +200 -24
- sglang/srt/sampling/sampling_batch_info.py +37 -1
- sglang/srt/sampling/sampling_params.py +4 -1
- sglang/srt/server_args.py +381 -209
- sglang/srt/speculative/build_eagle_tree.py +9 -9
- sglang/srt/speculative/eagle_draft_cuda_graph_runner.py +12 -14
- sglang/srt/speculative/eagle_draft_extend_cuda_graph_runner.py +256 -0
- sglang/srt/speculative/eagle_utils.py +440 -200
- sglang/srt/speculative/eagle_worker.py +234 -63
- sglang/srt/two_batch_overlap.py +637 -0
- sglang/srt/utils.py +187 -7
- sglang/test/attention/test_prefix_chunk_info.py +2 -0
- sglang/test/runners.py +54 -10
- sglang/test/send_one.py +4 -0
- sglang/test/test_block_fp8.py +1 -0
- sglang/test/test_block_fp8_deep_gemm_blackwell.py +252 -0
- sglang/test/test_block_fp8_ep.py +1 -0
- sglang/test/test_cutlass_moe.py +3 -3
- sglang/test/test_fp4_moe.py +248 -0
- sglang/test/test_utils.py +82 -7
- sglang/utils.py +9 -0
- sglang/version.py +1 -1
- {sglang-0.4.6.post5.dist-info → sglang-0.4.7.post1.dist-info}/METADATA +17 -14
- {sglang-0.4.6.post5.dist-info → sglang-0.4.7.post1.dist-info}/RECORD +359 -321
- {sglang-0.4.6.post5.dist-info → sglang-0.4.7.post1.dist-info}/WHEEL +1 -1
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=1,N=14336,device_name=NVIDIA_A100-SXM4-80GB,dtype=int8_w8a16.json → triton_3_1_0/E=1,N=14336,device_name=NVIDIA_A100-SXM4-80GB,dtype=int8_w8a16.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=1,N=14336,device_name=NVIDIA_A100-SXM4-80GB.json → triton_3_1_0/E=1,N=14336,device_name=NVIDIA_A100-SXM4-80GB.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=1,N=1792,device_name=NVIDIA_A100-SXM4-80GB,dtype=int8_w8a16.json → triton_3_1_0/E=1,N=1792,device_name=NVIDIA_A100-SXM4-80GB,dtype=int8_w8a16.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=1,N=1792,device_name=NVIDIA_A100-SXM4-80GB.json → triton_3_1_0/E=1,N=1792,device_name=NVIDIA_A100-SXM4-80GB.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=1,N=3072,device_name=NVIDIA_A100-SXM4-80GB,dtype=int8_w8a16.json → triton_3_1_0/E=1,N=3072,device_name=NVIDIA_A100-SXM4-80GB,dtype=int8_w8a16.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=1,N=3072,device_name=NVIDIA_H100_80GB_HBM3,dtype=int8_w8a16.json → triton_3_1_0/E=1,N=3072,device_name=NVIDIA_H100_80GB_HBM3,dtype=int8_w8a16.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=1,N=3072,device_name=NVIDIA_H100_80GB_HBM3.json → triton_3_1_0/E=1,N=3072,device_name=NVIDIA_H100_80GB_HBM3.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=1,N=3584,device_name=NVIDIA_A100-SXM4-80GB,dtype=int8_w8a16.json → triton_3_1_0/E=1,N=3584,device_name=NVIDIA_A100-SXM4-80GB,dtype=int8_w8a16.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=1,N=3584,device_name=NVIDIA_A100-SXM4-80GB.json → triton_3_1_0/E=1,N=3584,device_name=NVIDIA_A100-SXM4-80GB.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=1,N=7168,device_name=NVIDIA_A100-SXM4-80GB,dtype=int8_w8a16.json → triton_3_1_0/E=1,N=7168,device_name=NVIDIA_A100-SXM4-80GB,dtype=int8_w8a16.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=1,N=7168,device_name=NVIDIA_A100-SXM4-80GB.json → triton_3_1_0/E=1,N=7168,device_name=NVIDIA_A100-SXM4-80GB.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=144,N=512,device_name=NVIDIA_H100_80GB_HBM3.json → triton_3_1_0/E=144,N=512,device_name=NVIDIA_H100_80GB_HBM3.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=16,N=1024,device_name=NVIDIA_H100_80GB_HBM3.json → triton_3_1_0/E=16,N=1024,device_name=NVIDIA_H100_80GB_HBM3.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=16,N=1024,device_name=NVIDIA_H200.json → triton_3_1_0/E=16,N=1024,device_name=NVIDIA_H200.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=16,N=1344,device_name=NVIDIA_A100-SXM4-40GB.json → triton_3_1_0/E=16,N=1344,device_name=NVIDIA_A100-SXM4-40GB.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=16,N=1344,device_name=NVIDIA_A100-SXM4-80GB.json → triton_3_1_0/E=16,N=1344,device_name=NVIDIA_A100-SXM4-80GB.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=16,N=1344,device_name=NVIDIA_H100_80GB_HBM3.json → triton_3_1_0/E=16,N=1344,device_name=NVIDIA_H100_80GB_HBM3.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=16,N=14336,device_name=NVIDIA_A100-SXM4-80GB,dtype=int8_w8a16.json → triton_3_1_0/E=16,N=14336,device_name=NVIDIA_A100-SXM4-80GB,dtype=int8_w8a16.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=16,N=14336,device_name=NVIDIA_A100-SXM4-80GB.json → triton_3_1_0/E=16,N=14336,device_name=NVIDIA_A100-SXM4-80GB.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=16,N=1792,device_name=NVIDIA_A100-SXM4-80GB,dtype=int8_w8a16.json → triton_3_1_0/E=16,N=1792,device_name=NVIDIA_A100-SXM4-80GB,dtype=int8_w8a16.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=16,N=1792,device_name=NVIDIA_A100-SXM4-80GB.json → triton_3_1_0/E=16,N=1792,device_name=NVIDIA_A100-SXM4-80GB.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=16,N=2048,device_name=NVIDIA_H100_80GB_HBM3.json → triton_3_1_0/E=16,N=2048,device_name=NVIDIA_H100_80GB_HBM3.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=16,N=2688,device_name=NVIDIA_A100-SXM4-80GB.json → triton_3_1_0/E=16,N=2688,device_name=NVIDIA_A100-SXM4-80GB.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=16,N=2688,device_name=NVIDIA_H100_80GB_HBM3.json → triton_3_1_0/E=16,N=2688,device_name=NVIDIA_H100_80GB_HBM3.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=16,N=3072,device_name=NVIDIA_A100-SXM4-80GB,dtype=int8_w8a16.json → triton_3_1_0/E=16,N=3072,device_name=NVIDIA_A100-SXM4-80GB,dtype=int8_w8a16.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=16,N=3072,device_name=NVIDIA_H100_80GB_HBM3,dtype=int8_w8a16.json → triton_3_1_0/E=16,N=3072,device_name=NVIDIA_H100_80GB_HBM3,dtype=int8_w8a16.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=16,N=3200,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8.json → triton_3_1_0/E=16,N=3200,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=16,N=3584,device_name=NVIDIA_A100-SXM4-80GB,dtype=int8_w8a16.json → triton_3_1_0/E=16,N=3584,device_name=NVIDIA_A100-SXM4-80GB,dtype=int8_w8a16.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=16,N=3584,device_name=NVIDIA_A100-SXM4-80GB.json → triton_3_1_0/E=16,N=3584,device_name=NVIDIA_A100-SXM4-80GB.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=16,N=6400,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8.json → triton_3_1_0/E=16,N=6400,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=16,N=7168,device_name=NVIDIA_A100-SXM4-80GB,dtype=int8_w8a16.json → triton_3_1_0/E=16,N=7168,device_name=NVIDIA_A100-SXM4-80GB,dtype=int8_w8a16.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=16,N=7168,device_name=NVIDIA_A100-SXM4-80GB.json → triton_3_1_0/E=16,N=7168,device_name=NVIDIA_A100-SXM4-80GB.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=16,N=7168,device_name=NVIDIA_H100_80GB_HBM3,dtype=int8_w8a16.json → triton_3_1_0/E=16,N=7168,device_name=NVIDIA_H100_80GB_HBM3,dtype=int8_w8a16.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=16,N=800,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8.json → triton_3_1_0/E=16,N=800,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=160,N=192,device_name=NVIDIA_A800-SXM4-80GB.json → triton_3_1_0/E=160,N=192,device_name=NVIDIA_A800-SXM4-80GB.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=20,N=2048,device_name=NVIDIA_H100_80GB_HBM3.json → triton_3_1_0/E=20,N=2048,device_name=NVIDIA_H100_80GB_HBM3.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=24,N=1024,device_name=NVIDIA_H100_80GB_HBM3.json → triton_3_1_0/E=24,N=1024,device_name=NVIDIA_H100_80GB_HBM3.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=256,N=128,device_name=NVIDIA_A100-SXM4-80GB,dtype=int8_w8a8,block_shape=[128, 128].json → triton_3_1_0/E=256,N=128,device_name=NVIDIA_A100-SXM4-80GB,dtype=int8_w8a8,block_shape=[128, 128].json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=256,N=128,device_name=NVIDIA_A100-SXM4-80GB,dtype=int8_w8a8.json → triton_3_1_0/E=256,N=128,device_name=NVIDIA_A100-SXM4-80GB,dtype=int8_w8a8.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=256,N=128,device_name=NVIDIA_A800-SXM4-80GB,dtype=int8_w8a8,block_shape=[128, 128].json → triton_3_1_0/E=256,N=128,device_name=NVIDIA_A800-SXM4-80GB,dtype=int8_w8a8,block_shape=[128, 128].json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=256,N=128,device_name=NVIDIA_A800-SXM4-80GB,dtype=int8_w8a8.json → triton_3_1_0/E=256,N=128,device_name=NVIDIA_A800-SXM4-80GB,dtype=int8_w8a8.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=256,N=128,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8,block_shape=[128, 128].json → triton_3_1_0/E=256,N=128,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8,block_shape=[128, 128].json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=256,N=128,device_name=NVIDIA_H20,block_shape=[128, 128].json → triton_3_1_0/E=256,N=128,device_name=NVIDIA_H20,block_shape=[128, 128].json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=256,N=128,device_name=NVIDIA_H20,dtype=fp8_w8a8,block_shape=[128, 128].json → triton_3_1_0/E=256,N=128,device_name=NVIDIA_H20,dtype=fp8_w8a8,block_shape=[128, 128].json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=256,N=128,device_name=NVIDIA_L20Y,dtype=fp8_w8a8,block_shape=[128, 128].json → triton_3_1_0/E=256,N=128,device_name=NVIDIA_L20Y,dtype=fp8_w8a8,block_shape=[128, 128].json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=256,N=256,device_name=AMD_Instinct_MI300X,dtype=fp8_w8a8,block_shape=[128, 128].json → triton_3_1_0/E=256,N=256,device_name=AMD_Instinct_MI300X,dtype=fp8_w8a8,block_shape=[128, 128].json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=256,N=256,device_name=AMD_Instinct_MI325X,dtype=fp8_w8a8,block_shape=[128, 128].json → triton_3_1_0/E=256,N=256,device_name=AMD_Instinct_MI325X,dtype=fp8_w8a8,block_shape=[128, 128].json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=256,N=256,device_name=AMD_Radeon_Graphics,dtype=fp8_w8a8,block_shape=[128, 128].json → triton_3_1_0/E=256,N=256,device_name=AMD_Radeon_Graphics,dtype=fp8_w8a8,block_shape=[128, 128].json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=256,N=256,device_name=NVIDIA_B200,dtype=fp8_w8a8,block_shape=[128, 128].json → triton_3_1_0/E=256,N=256,device_name=NVIDIA_B200,dtype=fp8_w8a8,block_shape=[128, 128].json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=256,N=256,device_name=NVIDIA_H20,dtype=int8_w8a8,block_shape=[128, 128].json → triton_3_1_0/E=256,N=256,device_name=NVIDIA_H20,dtype=int8_w8a8,block_shape=[128, 128].json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=256,N=256,device_name=NVIDIA_H200,dtype=fp8_w8a8,block_shape=[128, 128].json → triton_3_1_0/E=256,N=256,device_name=NVIDIA_H200,dtype=fp8_w8a8,block_shape=[128, 128].json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=256,N=64,device_name=NVIDIA_A800-SXM4-80GB.json → triton_3_1_0/E=256,N=64,device_name=NVIDIA_A800-SXM4-80GB.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=256,N=64,device_name=NVIDIA_L20,dtype=int8_w8a8.json → triton_3_1_0/E=256,N=64,device_name=NVIDIA_L20,dtype=int8_w8a8.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=256,N=64,device_name=NVIDIA_L40S,dtype=int8_w8a8.json → triton_3_1_0/E=256,N=64,device_name=NVIDIA_L40S,dtype=int8_w8a8.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=64,N=1024,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8,block_shape=[128, 128].json → triton_3_1_0/E=64,N=1024,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8,block_shape=[128, 128].json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=64,N=1280,device_name=NVIDIA_A100-SXM4-80GB.json → triton_3_1_0/E=64,N=1280,device_name=NVIDIA_A100-SXM4-80GB.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=64,N=1280,device_name=NVIDIA_A800-SXM4-80GB.json → triton_3_1_0/E=64,N=1280,device_name=NVIDIA_A800-SXM4-80GB.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=64,N=1280,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8.json → triton_3_1_0/E=64,N=1280,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=64,N=1280,device_name=NVIDIA_H100_80GB_HBM3.json → triton_3_1_0/E=64,N=1280,device_name=NVIDIA_H100_80GB_HBM3.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=64,N=1280,device_name=NVIDIA_H200,dtype=fp8_w8a8.json → triton_3_1_0/E=64,N=1280,device_name=NVIDIA_H200,dtype=fp8_w8a8.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=64,N=1280,device_name=NVIDIA_H200.json → triton_3_1_0/E=64,N=1280,device_name=NVIDIA_H200.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=64,N=2560,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8.json → triton_3_1_0/E=64,N=2560,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=64,N=2560,device_name=NVIDIA_H200,dtype=fp8_w8a8.json → triton_3_1_0/E=64,N=2560,device_name=NVIDIA_H200,dtype=fp8_w8a8.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=64,N=2560,device_name=NVIDIA_H200.json → triton_3_1_0/E=64,N=2560,device_name=NVIDIA_H200.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=64,N=320,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8.json → triton_3_1_0/E=64,N=320,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=64,N=320,device_name=NVIDIA_H100_80GB_HBM3.json → triton_3_1_0/E=64,N=320,device_name=NVIDIA_H100_80GB_HBM3.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=64,N=320,device_name=NVIDIA_H200,dtype=fp8_w8a8.json → triton_3_1_0/E=64,N=320,device_name=NVIDIA_H200,dtype=fp8_w8a8.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=64,N=320,device_name=NVIDIA_H200.json → triton_3_1_0/E=64,N=320,device_name=NVIDIA_H200.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=64,N=512,device_name=NVIDIA_H200,dtype=fp8_w8a8,block_shape=[128, 128].json → triton_3_1_0/E=64,N=512,device_name=NVIDIA_H200,dtype=fp8_w8a8,block_shape=[128, 128].json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=64,N=640,device_name=NVIDIA_A100-SXM4-80GB.json → triton_3_1_0/E=64,N=640,device_name=NVIDIA_A100-SXM4-80GB.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=64,N=640,device_name=NVIDIA_A800-SXM4-80GB.json → triton_3_1_0/E=64,N=640,device_name=NVIDIA_A800-SXM4-80GB.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=64,N=640,device_name=NVIDIA_GeForce_RTX_4090,dtype=fp8_w8a8.json → triton_3_1_0/E=64,N=640,device_name=NVIDIA_GeForce_RTX_4090,dtype=fp8_w8a8.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=64,N=640,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8.json → triton_3_1_0/E=64,N=640,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=64,N=640,device_name=NVIDIA_H100_80GB_HBM3.json → triton_3_1_0/E=64,N=640,device_name=NVIDIA_H100_80GB_HBM3.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=64,N=640,device_name=NVIDIA_H200,dtype=fp8_w8a8.json → triton_3_1_0/E=64,N=640,device_name=NVIDIA_H200,dtype=fp8_w8a8.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=64,N=640,device_name=NVIDIA_H200.json → triton_3_1_0/E=64,N=640,device_name=NVIDIA_H200.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=8,N=14336,device_name=AMD_Instinct_MI300X.json → triton_3_1_0/E=8,N=14336,device_name=AMD_Instinct_MI300X.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=8,N=14336,device_name=AMD_Instinct_MI325X.json → triton_3_1_0/E=8,N=14336,device_name=AMD_Instinct_MI325X.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=8,N=14336,device_name=AMD_Radeon_Graphics.json → triton_3_1_0/E=8,N=14336,device_name=AMD_Radeon_Graphics.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=8,N=14336,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8.json → triton_3_1_0/E=8,N=14336,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=8,N=14336,device_name=NVIDIA_H200,dtype=fp8_w8a8.json → triton_3_1_0/E=8,N=14336,device_name=NVIDIA_H200,dtype=fp8_w8a8.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=8,N=14336,device_name=NVIDIA_H200.json → triton_3_1_0/E=8,N=14336,device_name=NVIDIA_H200.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=8,N=1792,device_name=AMD_Instinct_MI300X.json → triton_3_1_0/E=8,N=1792,device_name=AMD_Instinct_MI300X.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=8,N=1792,device_name=AMD_Instinct_MI325X.json → triton_3_1_0/E=8,N=1792,device_name=AMD_Instinct_MI325X.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=8,N=1792,device_name=AMD_Radeon_Graphics.json → triton_3_1_0/E=8,N=1792,device_name=AMD_Radeon_Graphics.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=8,N=1792,device_name=NVIDIA_A100-SXM4-40GB.json → triton_3_1_0/E=8,N=1792,device_name=NVIDIA_A100-SXM4-40GB.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=8,N=1792,device_name=NVIDIA_A100-SXM4-80GB.json → triton_3_1_0/E=8,N=1792,device_name=NVIDIA_A100-SXM4-80GB.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=8,N=1792,device_name=NVIDIA_H100_80GB_HBM3.json → triton_3_1_0/E=8,N=1792,device_name=NVIDIA_H100_80GB_HBM3.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=8,N=1792,device_name=NVIDIA_H200,dtype=fp8_w8a8.json → triton_3_1_0/E=8,N=1792,device_name=NVIDIA_H200,dtype=fp8_w8a8.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=8,N=1792,device_name=NVIDIA_H200.json → triton_3_1_0/E=8,N=1792,device_name=NVIDIA_H200.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=8,N=2048,device_name=NVIDIA_A100-SXM4-80GB.json → triton_3_1_0/E=8,N=2048,device_name=NVIDIA_A100-SXM4-80GB.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=8,N=2048,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8.json → triton_3_1_0/E=8,N=2048,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=8,N=2048,device_name=NVIDIA_H100_80GB_HBM3.json → triton_3_1_0/E=8,N=2048,device_name=NVIDIA_H100_80GB_HBM3.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=8,N=2048,device_name=NVIDIA_H200,dtype=fp8_w8a8.json → triton_3_1_0/E=8,N=2048,device_name=NVIDIA_H200,dtype=fp8_w8a8.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=8,N=2048,device_name=NVIDIA_H200.json → triton_3_1_0/E=8,N=2048,device_name=NVIDIA_H200.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=8,N=3584,device_name=AMD_Instinct_MI300X.json → triton_3_1_0/E=8,N=3584,device_name=AMD_Instinct_MI300X.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=8,N=3584,device_name=AMD_Instinct_MI325X.json → triton_3_1_0/E=8,N=3584,device_name=AMD_Instinct_MI325X.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=8,N=3584,device_name=AMD_Radeon_Graphics.json → triton_3_1_0/E=8,N=3584,device_name=AMD_Radeon_Graphics.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=8,N=3584,device_name=NVIDIA_A100-SXM4-40GB.json → triton_3_1_0/E=8,N=3584,device_name=NVIDIA_A100-SXM4-40GB.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=8,N=3584,device_name=NVIDIA_A100-SXM4-80GB.json → triton_3_1_0/E=8,N=3584,device_name=NVIDIA_A100-SXM4-80GB.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=8,N=3584,device_name=NVIDIA_GeForce_RTX_4090,dtype=fp8_w8a8.json → triton_3_1_0/E=8,N=3584,device_name=NVIDIA_GeForce_RTX_4090,dtype=fp8_w8a8.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=8,N=3584,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8.json → triton_3_1_0/E=8,N=3584,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=8,N=3584,device_name=NVIDIA_H100_80GB_HBM3.json → triton_3_1_0/E=8,N=3584,device_name=NVIDIA_H100_80GB_HBM3.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=8,N=3584,device_name=NVIDIA_H200,dtype=fp8_w8a8.json → triton_3_1_0/E=8,N=3584,device_name=NVIDIA_H200,dtype=fp8_w8a8.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=8,N=3584,device_name=NVIDIA_H200.json → triton_3_1_0/E=8,N=3584,device_name=NVIDIA_H200.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=8,N=3584,device_name=NVIDIA_L40S.json → triton_3_1_0/E=8,N=3584,device_name=NVIDIA_L40S.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=8,N=4096,device_name=AMD_Instinct_MI300X,dtype=fp8_w8a8.json → triton_3_1_0/E=8,N=4096,device_name=AMD_Instinct_MI300X,dtype=fp8_w8a8.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=8,N=4096,device_name=AMD_Instinct_MI325X,dtype=fp8_w8a8.json → triton_3_1_0/E=8,N=4096,device_name=AMD_Instinct_MI325X,dtype=fp8_w8a8.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=8,N=4096,device_name=AMD_Radeon_Graphics,dtype=fp8_w8a8.json → triton_3_1_0/E=8,N=4096,device_name=AMD_Radeon_Graphics,dtype=fp8_w8a8.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=8,N=4096,device_name=NVIDIA_A100-SXM4-80GB.json → triton_3_1_0/E=8,N=4096,device_name=NVIDIA_A100-SXM4-80GB.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=8,N=4096,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8.json → triton_3_1_0/E=8,N=4096,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=8,N=4096,device_name=NVIDIA_H100_80GB_HBM3.json → triton_3_1_0/E=8,N=4096,device_name=NVIDIA_H100_80GB_HBM3.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=8,N=4096,device_name=NVIDIA_H200,dtype=fp8_w8a8.json → triton_3_1_0/E=8,N=4096,device_name=NVIDIA_H200,dtype=fp8_w8a8.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=8,N=4096,device_name=NVIDIA_H200.json → triton_3_1_0/E=8,N=4096,device_name=NVIDIA_H200.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=8,N=7168,device_name=AMD_Instinct_MI300X.json → triton_3_1_0/E=8,N=7168,device_name=AMD_Instinct_MI300X.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=8,N=7168,device_name=AMD_Instinct_MI325X.json → triton_3_1_0/E=8,N=7168,device_name=AMD_Instinct_MI325X.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=8,N=7168,device_name=AMD_Radeon_Graphics.json → triton_3_1_0/E=8,N=7168,device_name=AMD_Radeon_Graphics.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=8,N=7168,device_name=NVIDIA_A100-SXM4-80GB.json → triton_3_1_0/E=8,N=7168,device_name=NVIDIA_A100-SXM4-80GB.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=8,N=7168,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8.json → triton_3_1_0/E=8,N=7168,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=8,N=7168,device_name=NVIDIA_H100_80GB_HBM3.json → triton_3_1_0/E=8,N=7168,device_name=NVIDIA_H100_80GB_HBM3.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=8,N=7168,device_name=NVIDIA_H200,dtype=fp8_w8a8.json → triton_3_1_0/E=8,N=7168,device_name=NVIDIA_H200,dtype=fp8_w8a8.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=8,N=7168,device_name=NVIDIA_H200.json → triton_3_1_0/E=8,N=7168,device_name=NVIDIA_H200.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=8,N=8192,device_name=AMD_Instinct_MI300X,dtype=fp8_w8a8.json → triton_3_1_0/E=8,N=8192,device_name=AMD_Instinct_MI300X,dtype=fp8_w8a8.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=8,N=8192,device_name=AMD_Instinct_MI325X,dtype=fp8_w8a8.json → triton_3_1_0/E=8,N=8192,device_name=AMD_Instinct_MI325X,dtype=fp8_w8a8.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=8,N=8192,device_name=AMD_Radeon_Graphics,dtype=fp8_w8a8.json → triton_3_1_0/E=8,N=8192,device_name=AMD_Radeon_Graphics,dtype=fp8_w8a8.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=8,N=8192,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8.json → triton_3_1_0/E=8,N=8192,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=8,N=8192,device_name=NVIDIA_H200,dtype=fp8_w8a8.json → triton_3_1_0/E=8,N=8192,device_name=NVIDIA_H200,dtype=fp8_w8a8.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=128,N=192,device_name=NVIDIA_A800-SXM4-80GB.json → triton_3_2_0/E=128,N=192,device_name=NVIDIA_A800-SXM4-80GB.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=128,N=192,device_name=NVIDIA_H100_80GB_HBM3.json → triton_3_2_0/E=128,N=192,device_name=NVIDIA_H100_80GB_HBM3.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=128,N=192,device_name=NVIDIA_H20.json → triton_3_2_0/E=128,N=192,device_name=NVIDIA_H20.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=128,N=192,device_name=NVIDIA_H200.json → triton_3_2_0/E=128,N=192,device_name=NVIDIA_H200.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=128,N=384,device_name=NVIDIA_H100_80GB_HBM3.json → triton_3_2_0/E=128,N=384,device_name=NVIDIA_H100_80GB_HBM3.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=128,N=384,device_name=NVIDIA_H20,dtype=fp8_w8a8,block_shape=[128, 128].json → triton_3_2_0/E=128,N=384,device_name=NVIDIA_H20,dtype=fp8_w8a8,block_shape=[128, 128].json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=128,N=384,device_name=NVIDIA_H20.json → triton_3_2_0/E=128,N=384,device_name=NVIDIA_H20.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=128,N=384,device_name=NVIDIA_H200,dtype=fp8_w8a8,block_shape=[128, 128].json → triton_3_2_0/E=128,N=384,device_name=NVIDIA_H200,dtype=fp8_w8a8,block_shape=[128, 128].json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=128,N=384,device_name=NVIDIA_H200.json → triton_3_2_0/E=128,N=384,device_name=NVIDIA_H200.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=128,N=512,device_name=NVIDIA_H100_80GB_HBM3.json → triton_3_2_0/E=128,N=512,device_name=NVIDIA_H100_80GB_HBM3.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=128,N=768,device_name=NVIDIA_A800-SXM4-80GB.json → triton_3_2_0/E=128,N=768,device_name=NVIDIA_A800-SXM4-80GB.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=128,N=768,device_name=NVIDIA_H100_80GB_HBM3.json → triton_3_2_0/E=128,N=768,device_name=NVIDIA_H100_80GB_HBM3.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=128,N=768,device_name=NVIDIA_H20,dtype=fp8_w8a8,block_shape=[128, 128].json → triton_3_2_0/E=128,N=768,device_name=NVIDIA_H20,dtype=fp8_w8a8,block_shape=[128, 128].json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=128,N=768,device_name=NVIDIA_H20.json → triton_3_2_0/E=128,N=768,device_name=NVIDIA_H20.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=128,N=768,device_name=NVIDIA_H200,dtype=fp8_w8a8,block_shape=[128, 128].json → triton_3_2_0/E=128,N=768,device_name=NVIDIA_H200,dtype=fp8_w8a8,block_shape=[128, 128].json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=128,N=768,device_name=NVIDIA_H200.json → triton_3_2_0/E=128,N=768,device_name=NVIDIA_H200.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=128,N=96,device_name=NVIDIA_H20.json → triton_3_2_0/E=128,N=96,device_name=NVIDIA_H20.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=264,N=128,device_name=NVIDIA_A800-SXM4-80GB,dtype=int8_w8a8.json → triton_3_2_0/E=264,N=128,device_name=NVIDIA_A800-SXM4-80GB,dtype=int8_w8a8.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=264,N=256,device_name=NVIDIA_B200,dtype=fp8_w8a8,block_shape=[128, 128].json → triton_3_2_0/E=264,N=256,device_name=NVIDIA_B200,dtype=fp8_w8a8,block_shape=[128, 128].json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=264,N=256,device_name=NVIDIA_H20,dtype=fp8_w8a8,block_shape=[128, 128].json → triton_3_2_0/E=264,N=256,device_name=NVIDIA_H20,dtype=fp8_w8a8,block_shape=[128, 128].json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=264,N=256,device_name=NVIDIA_H200,dtype=fp8_w8a8,block_shape=[128, 128].json → triton_3_2_0/E=264,N=256,device_name=NVIDIA_H200,dtype=fp8_w8a8,block_shape=[128, 128].json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=272,N=128,device_name=NVIDIA_A800-SXM4-80GB,dtype=int8_w8a8.json → triton_3_2_0/E=272,N=128,device_name=NVIDIA_A800-SXM4-80GB,dtype=int8_w8a8.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=272,N=128,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8,block_shape=[128, 128].json → triton_3_2_0/E=272,N=128,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8,block_shape=[128, 128].json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=272,N=128,device_name=NVIDIA_H20,dtype=fp8_w8a8,block_shape=[128, 128].json → triton_3_2_0/E=272,N=128,device_name=NVIDIA_H20,dtype=fp8_w8a8,block_shape=[128, 128].json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=272,N=64,device_name=NVIDIA_A800-SXM4-80GB.json → triton_3_2_0/E=272,N=64,device_name=NVIDIA_A800-SXM4-80GB.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=288,N=64,device_name=NVIDIA_A800-SXM4-80GB.json → triton_3_2_0/E=288,N=64,device_name=NVIDIA_A800-SXM4-80GB.json} +0 -0
- {sglang-0.4.6.post5.dist-info → sglang-0.4.7.post1.dist-info}/licenses/LICENSE +0 -0
- {sglang-0.4.6.post5.dist-info → sglang-0.4.7.post1.dist-info}/top_level.txt +0 -0
@@ -25,16 +25,16 @@ import os
|
|
25
25
|
from collections import deque
|
26
26
|
from dataclasses import dataclass
|
27
27
|
from http import HTTPStatus
|
28
|
-
from typing import TYPE_CHECKING, List, Optional, Tuple
|
28
|
+
from typing import TYPE_CHECKING, List, Optional, Tuple, Union
|
29
29
|
|
30
30
|
import numpy as np
|
31
31
|
import torch
|
32
32
|
from torch.distributed import ProcessGroup
|
33
33
|
|
34
|
-
from sglang.srt.disaggregation.base import BaseKVManager, BaseKVReceiver,
|
34
|
+
from sglang.srt.disaggregation.base import BaseKVManager, BaseKVReceiver, KVPoll
|
35
35
|
from sglang.srt.disaggregation.utils import (
|
36
|
+
FAKE_BOOTSTRAP_HOST,
|
36
37
|
DisaggregationMode,
|
37
|
-
FakeBootstrapHost,
|
38
38
|
KVClassType,
|
39
39
|
MetadataBuffers,
|
40
40
|
ReqToMetadataIdxAllocator,
|
@@ -47,8 +47,13 @@ from sglang.srt.disaggregation.utils import (
|
|
47
47
|
)
|
48
48
|
from sglang.srt.managers.schedule_batch import FINISH_ABORT, ScheduleBatch
|
49
49
|
from sglang.srt.mem_cache.base_prefix_cache import BasePrefixCache
|
50
|
-
from sglang.srt.mem_cache.memory_pool import
|
50
|
+
from sglang.srt.mem_cache.memory_pool import (
|
51
|
+
KVCache,
|
52
|
+
ReqToTokenPool,
|
53
|
+
TokenToKVPoolAllocator,
|
54
|
+
)
|
51
55
|
from sglang.srt.model_executor.forward_batch_info import ForwardMode
|
56
|
+
from sglang.srt.torch_memory_saver_adapter import TorchMemorySaverAdapter
|
52
57
|
|
53
58
|
logger = logging.getLogger(__name__)
|
54
59
|
|
@@ -57,6 +62,67 @@ if TYPE_CHECKING:
|
|
57
62
|
from sglang.srt.managers.scheduler import Scheduler
|
58
63
|
|
59
64
|
|
65
|
+
class DecodeReqToTokenPool:
|
66
|
+
"""
|
67
|
+
The difference of DecodeReqToTokenPool and ReqToTokenPool is that
|
68
|
+
DecodeReqToTokenPool subscribes memory for pre-allocated requests.
|
69
|
+
|
70
|
+
In ReqToTokenPool, if `--max-running-requests` is 8,
|
71
|
+
#pre-allocated + #transfer + #running <= 8, but there are in fact more memory can carry pre-allocated requests.
|
72
|
+
|
73
|
+
In DecodeReqToTokenPool, if `--max-running-requests` is 8,
|
74
|
+
#running <= 8, #pre-allocated + #transfer <= pre_alloc_size, so we can use the free memory to pre-allocate requests to unblock prefill.
|
75
|
+
"""
|
76
|
+
|
77
|
+
def __init__(
|
78
|
+
self,
|
79
|
+
size: int,
|
80
|
+
max_context_len: int,
|
81
|
+
device: str,
|
82
|
+
enable_memory_saver: bool,
|
83
|
+
pre_alloc_size: int,
|
84
|
+
):
|
85
|
+
memory_saver_adapter = TorchMemorySaverAdapter.create(
|
86
|
+
enable=enable_memory_saver
|
87
|
+
)
|
88
|
+
|
89
|
+
self.size = size
|
90
|
+
self.max_context_len = max_context_len
|
91
|
+
self.device = device
|
92
|
+
self.pre_alloc_size = pre_alloc_size
|
93
|
+
with memory_saver_adapter.region():
|
94
|
+
self.req_to_token = torch.zeros(
|
95
|
+
(size + pre_alloc_size, max_context_len),
|
96
|
+
dtype=torch.int32,
|
97
|
+
device=device,
|
98
|
+
)
|
99
|
+
|
100
|
+
self.free_slots = list(range(size + pre_alloc_size))
|
101
|
+
|
102
|
+
def write(self, indices, values):
|
103
|
+
self.req_to_token[indices] = values
|
104
|
+
|
105
|
+
def available_size(self):
|
106
|
+
return len(self.free_slots)
|
107
|
+
|
108
|
+
def alloc(self, need_size: int) -> List[int]:
|
109
|
+
if need_size > len(self.free_slots):
|
110
|
+
return None
|
111
|
+
|
112
|
+
select_index = self.free_slots[:need_size]
|
113
|
+
self.free_slots = self.free_slots[need_size:]
|
114
|
+
return select_index
|
115
|
+
|
116
|
+
def free(self, free_index: Union[int, List[int]]):
|
117
|
+
if isinstance(free_index, (int,)):
|
118
|
+
self.free_slots.append(free_index)
|
119
|
+
else:
|
120
|
+
self.free_slots.extend(free_index)
|
121
|
+
|
122
|
+
def clear(self):
|
123
|
+
self.free_slots = list(range(self.size + self.pre_alloc_size))
|
124
|
+
|
125
|
+
|
60
126
|
@dataclass
|
61
127
|
class DecodeRequest:
|
62
128
|
req: Req
|
@@ -83,7 +149,12 @@ class DecodePreallocQueue:
|
|
83
149
|
gloo_group: ProcessGroup,
|
84
150
|
tp_rank: int,
|
85
151
|
tp_size: int,
|
152
|
+
dp_size: int,
|
153
|
+
gpu_id: int,
|
86
154
|
bootstrap_port: int,
|
155
|
+
max_total_num_tokens: int,
|
156
|
+
prefill_pp_size: int,
|
157
|
+
num_reserved_decode_tokens: int,
|
87
158
|
transfer_backend: TransferBackend,
|
88
159
|
):
|
89
160
|
self.req_to_token_pool = req_to_token_pool
|
@@ -99,25 +170,33 @@ class DecodePreallocQueue:
|
|
99
170
|
self.gloo_group = gloo_group
|
100
171
|
self.tp_rank = tp_rank
|
101
172
|
self.tp_size = tp_size
|
173
|
+
self.dp_size = dp_size
|
174
|
+
self.gpu_id = gpu_id
|
102
175
|
self.bootstrap_port = bootstrap_port
|
103
|
-
|
104
|
-
self.
|
105
|
-
|
106
|
-
|
107
|
-
|
176
|
+
self.max_total_num_tokens = max_total_num_tokens
|
177
|
+
self.prefill_pp_size = prefill_pp_size
|
178
|
+
self.num_reserved_decode_tokens = num_reserved_decode_tokens
|
179
|
+
self.transfer_backend = transfer_backend
|
108
180
|
# Queue for requests pending pre-allocation
|
109
181
|
self.queue: List[DecodeRequest] = []
|
110
|
-
self.
|
182
|
+
self.retracted_queue: List[Req] = []
|
183
|
+
self.prefill_pp_size = prefill_pp_size
|
111
184
|
self.kv_manager = self._init_kv_manager()
|
112
185
|
|
113
186
|
def _init_kv_manager(self) -> BaseKVManager:
|
114
|
-
|
115
|
-
kv_args
|
187
|
+
kv_args_class = get_kv_class(self.transfer_backend, KVClassType.KVARGS)
|
188
|
+
kv_args = kv_args_class()
|
189
|
+
|
190
|
+
attn_tp_size = self.tp_size // self.dp_size
|
191
|
+
kv_args.engine_rank = self.tp_rank % (attn_tp_size)
|
192
|
+
kv_args.decode_tp_size = attn_tp_size
|
193
|
+
kv_args.prefill_pp_size = self.prefill_pp_size
|
116
194
|
kv_data_ptrs, kv_data_lens, kv_item_lens = (
|
117
195
|
self.token_to_kv_pool.get_contiguous_buf_infos()
|
118
196
|
)
|
119
|
-
|
120
197
|
if self.draft_token_to_kv_pool is not None:
|
198
|
+
# We should also transfer draft model kv cache. The indices are
|
199
|
+
# always shared with a target model.
|
121
200
|
draft_kv_data_ptrs, draft_kv_data_lens, draft_kv_item_lens = (
|
122
201
|
self.draft_token_to_kv_pool.get_contiguous_buf_infos()
|
123
202
|
)
|
@@ -132,6 +211,7 @@ class DecodePreallocQueue:
|
|
132
211
|
kv_args.aux_data_ptrs, kv_args.aux_data_lens, kv_args.aux_item_lens = (
|
133
212
|
self.metadata_buffers.get_buf_infos()
|
134
213
|
)
|
214
|
+
|
135
215
|
kv_args.ib_device = self.scheduler.server_args.disaggregation_ib_device
|
136
216
|
kv_args.gpu_id = self.scheduler.gpu_id
|
137
217
|
kv_manager_class = get_kv_class(self.transfer_backend, KVClassType.MANAGER)
|
@@ -143,26 +223,84 @@ class DecodePreallocQueue:
|
|
143
223
|
)
|
144
224
|
return kv_manager
|
145
225
|
|
146
|
-
def add(self, req: Req) -> None:
|
226
|
+
def add(self, req: Req, is_retracted: bool = False) -> None:
|
147
227
|
"""Add a request to the pending queue."""
|
148
|
-
if req
|
149
|
-
|
150
|
-
|
228
|
+
if self._check_if_req_exceed_kv_capacity(req):
|
229
|
+
return
|
230
|
+
|
231
|
+
if is_retracted:
|
232
|
+
self.retracted_queue.append(req)
|
151
233
|
else:
|
152
|
-
|
153
|
-
|
234
|
+
if req.bootstrap_host == FAKE_BOOTSTRAP_HOST:
|
235
|
+
kv_receiver_class = get_kv_class(
|
236
|
+
TransferBackend.FAKE, KVClassType.RECEIVER
|
237
|
+
)
|
238
|
+
else:
|
239
|
+
kv_receiver_class = get_kv_class(
|
240
|
+
self.transfer_backend, KVClassType.RECEIVER
|
241
|
+
)
|
242
|
+
|
243
|
+
kv_receiver = kv_receiver_class(
|
244
|
+
mgr=self.kv_manager,
|
245
|
+
bootstrap_addr=f"{req.bootstrap_host}:{req.bootstrap_port}",
|
246
|
+
bootstrap_room=req.bootstrap_room,
|
247
|
+
data_parallel_rank=req.data_parallel_rank,
|
154
248
|
)
|
155
|
-
kv_receiver = kv_receiver_class(
|
156
|
-
mgr=self.kv_manager,
|
157
|
-
bootstrap_addr=f"{req.bootstrap_host}:{req.bootstrap_port}",
|
158
|
-
bootstrap_room=req.bootstrap_room,
|
159
|
-
)
|
160
|
-
self.queue.append(DecodeRequest(req=req, kv_receiver=kv_receiver))
|
161
249
|
|
162
|
-
|
250
|
+
self.queue.append(
|
251
|
+
DecodeRequest(req=req, kv_receiver=kv_receiver, waiting_for_input=False)
|
252
|
+
)
|
253
|
+
|
254
|
+
def _check_if_req_exceed_kv_capacity(self, req: Req) -> bool:
|
255
|
+
if len(req.origin_input_ids) > self.max_total_num_tokens:
|
256
|
+
message = f"Request {req.rid} exceeds the maximum number of tokens: {len(req.origin_input_ids)} > {self.max_total_num_tokens}"
|
257
|
+
logger.error(message)
|
258
|
+
prepare_abort(req, message)
|
259
|
+
self.scheduler.stream_output([req], req.return_logprob)
|
260
|
+
return True
|
261
|
+
return False
|
262
|
+
|
263
|
+
def extend(self, reqs: List[Req], is_retracted: bool = False) -> None:
|
163
264
|
"""Add a request to the pending queue."""
|
164
265
|
for req in reqs:
|
165
|
-
self.add(req)
|
266
|
+
self.add(req, is_retracted=is_retracted)
|
267
|
+
|
268
|
+
def resume_retracted_reqs(self) -> List[Req]:
|
269
|
+
# TODO refactor the scheduling part, reuse with the unified engine logic as much as possible
|
270
|
+
|
271
|
+
# allocate memory
|
272
|
+
resumed_reqs = []
|
273
|
+
indices_to_remove = set()
|
274
|
+
allocatable_tokens = self._allocatable_tokens(count_retracted=False)
|
275
|
+
|
276
|
+
for i, req in enumerate(self.retracted_queue):
|
277
|
+
if self.req_to_token_pool.available_size() <= 0:
|
278
|
+
break
|
279
|
+
|
280
|
+
required_tokens_for_request = (
|
281
|
+
len(req.origin_input_ids)
|
282
|
+
+ len(req.output_ids)
|
283
|
+
+ self.num_reserved_decode_tokens
|
284
|
+
)
|
285
|
+
if required_tokens_for_request > allocatable_tokens:
|
286
|
+
break
|
287
|
+
|
288
|
+
resumed_reqs.append(req)
|
289
|
+
indices_to_remove.add(i)
|
290
|
+
req.is_retracted = False
|
291
|
+
self._pre_alloc(req)
|
292
|
+
allocatable_tokens -= required_tokens_for_request
|
293
|
+
|
294
|
+
# load from cpu, release the cpu copy
|
295
|
+
req.load_kv_cache(self.req_to_token_pool, self.token_to_kv_pool_allocator)
|
296
|
+
|
297
|
+
self.retracted_queue = [
|
298
|
+
entry
|
299
|
+
for i, entry in enumerate(self.retracted_queue)
|
300
|
+
if i not in indices_to_remove
|
301
|
+
]
|
302
|
+
|
303
|
+
return resumed_reqs
|
166
304
|
|
167
305
|
def _update_handshake_waiters(self) -> None:
|
168
306
|
if not self.queue:
|
@@ -192,6 +330,8 @@ class DecodePreallocQueue:
|
|
192
330
|
error_message,
|
193
331
|
status_code=HTTPStatus.INTERNAL_SERVER_ERROR,
|
194
332
|
)
|
333
|
+
else:
|
334
|
+
raise ValueError(f"Unexpected poll case: {poll}")
|
195
335
|
|
196
336
|
def pop_preallocated(self) -> List[DecodeRequest]:
|
197
337
|
"""Pop the preallocated requests from the pending queue (FIFO)."""
|
@@ -199,8 +339,16 @@ class DecodePreallocQueue:
|
|
199
339
|
|
200
340
|
preallocated_reqs = []
|
201
341
|
indices_to_remove = set()
|
202
|
-
allocatable_tokens = self._allocatable_tokens()
|
203
342
|
|
343
|
+
# We need to make sure that the sum of inflight tokens and allocatable tokens is greater than maximum input+output length of each inflight request
|
344
|
+
# Otherwise it is possible for one request running decode out of memory, while all other requests are in the transfer queue that cannot be retracted.
|
345
|
+
retractable_tokens = sum(
|
346
|
+
len(r.origin_input_ids) + len(r.output_ids)
|
347
|
+
for r in self.scheduler.running_batch.reqs
|
348
|
+
)
|
349
|
+
allocatable_tokens = self._allocatable_tokens(
|
350
|
+
retractable_tokens=retractable_tokens, count_retracted=True
|
351
|
+
)
|
204
352
|
# First, remove all failed requests from the queue
|
205
353
|
for i, decode_req in enumerate(self.queue):
|
206
354
|
if isinstance(decode_req.req.finished_reason, FINISH_ABORT):
|
@@ -209,6 +357,7 @@ class DecodePreallocQueue:
|
|
209
357
|
)
|
210
358
|
indices_to_remove.add(i)
|
211
359
|
|
360
|
+
# Then, preallocate the remaining requests if possible
|
212
361
|
for i, decode_req in enumerate(self.queue):
|
213
362
|
if i in indices_to_remove:
|
214
363
|
continue
|
@@ -222,10 +371,23 @@ class DecodePreallocQueue:
|
|
222
371
|
if self.req_to_metadata_buffer_idx_allocator.available_size() <= 0:
|
223
372
|
break
|
224
373
|
|
374
|
+
# Memory estimation: don't add if the projected memory cannot be met
|
375
|
+
# TODO: add new_token ratio
|
376
|
+
origin_input_len = len(decode_req.req.origin_input_ids)
|
225
377
|
required_tokens_for_request = (
|
226
|
-
|
378
|
+
origin_input_len + self.num_reserved_decode_tokens
|
227
379
|
)
|
228
380
|
|
381
|
+
if (
|
382
|
+
max(
|
383
|
+
required_tokens_for_request,
|
384
|
+
origin_input_len
|
385
|
+
+ decode_req.req.sampling_params.max_new_tokens
|
386
|
+
- retractable_tokens,
|
387
|
+
)
|
388
|
+
> allocatable_tokens
|
389
|
+
):
|
390
|
+
break
|
229
391
|
if required_tokens_for_request > allocatable_tokens:
|
230
392
|
break
|
231
393
|
|
@@ -238,7 +400,6 @@ class DecodePreallocQueue:
|
|
238
400
|
]
|
239
401
|
.cpu()
|
240
402
|
.numpy()
|
241
|
-
.astype(np.int64)
|
242
403
|
)
|
243
404
|
|
244
405
|
decode_req.metadata_buffer_index = (
|
@@ -258,15 +419,35 @@ class DecodePreallocQueue:
|
|
258
419
|
|
259
420
|
return preallocated_reqs
|
260
421
|
|
261
|
-
def _allocatable_tokens(
|
262
|
-
|
263
|
-
|
264
|
-
|
422
|
+
def _allocatable_tokens(
|
423
|
+
self, retractable_tokens: Optional[int] = None, count_retracted: bool = True
|
424
|
+
) -> int:
|
425
|
+
need_space_for_single_req = (
|
426
|
+
max(
|
427
|
+
[
|
428
|
+
x.sampling_params.max_new_tokens
|
429
|
+
+ len(x.origin_input_ids)
|
430
|
+
- retractable_tokens
|
431
|
+
for x in self.scheduler.running_batch.reqs
|
432
|
+
]
|
433
|
+
)
|
434
|
+
if retractable_tokens is not None
|
435
|
+
and len(self.scheduler.running_batch.reqs) > 0
|
436
|
+
else 0
|
437
|
+
)
|
438
|
+
|
439
|
+
available_size = self.token_to_kv_pool_allocator.available_size()
|
440
|
+
|
441
|
+
allocatable_tokens = available_size - max(
|
442
|
+
# preserve some space for future decode
|
443
|
+
self.num_reserved_decode_tokens
|
265
444
|
* (
|
266
445
|
len(self.scheduler.running_batch.reqs)
|
267
446
|
+ len(self.transfer_queue.queue)
|
268
447
|
+ len(self.scheduler.waiting_queue)
|
269
|
-
)
|
448
|
+
),
|
449
|
+
# make sure each request can finish if reach max_tokens with all other requests retracted
|
450
|
+
need_space_for_single_req,
|
270
451
|
)
|
271
452
|
|
272
453
|
# Note: if the last fake extend just finishes, and we enter `pop_preallocated` immediately in the next iteration
|
@@ -279,15 +460,27 @@ class DecodePreallocQueue:
|
|
279
460
|
self.scheduler.last_batch.reqs
|
280
461
|
)
|
281
462
|
|
463
|
+
if count_retracted:
|
464
|
+
allocatable_tokens -= sum(
|
465
|
+
[
|
466
|
+
len(req.origin_input_ids)
|
467
|
+
+ len(req.output_ids)
|
468
|
+
+ self.num_reserved_decode_tokens
|
469
|
+
for req in self.retracted_queue
|
470
|
+
]
|
471
|
+
)
|
282
472
|
return allocatable_tokens
|
283
473
|
|
284
474
|
def _pre_alloc(self, req: Req) -> torch.Tensor:
|
285
475
|
"""Pre-allocate the memory for req_to_token and token_kv_pool"""
|
286
476
|
req_pool_indices = self.req_to_token_pool.alloc(1)
|
287
477
|
|
288
|
-
assert
|
478
|
+
assert (
|
479
|
+
req_pool_indices is not None
|
480
|
+
), "req_pool_indices is full! There is a bug in memory estimation."
|
289
481
|
|
290
482
|
req.req_pool_idx = req_pool_indices[0]
|
483
|
+
|
291
484
|
if self.token_to_kv_pool_allocator.page_size == 1:
|
292
485
|
kv_loc = self.token_to_kv_pool_allocator.alloc(
|
293
486
|
len(req.origin_input_ids) + max(len(req.output_ids) - 1, 0)
|
@@ -312,7 +505,10 @@ class DecodePreallocQueue:
             ),
             extend_num_tokens=num_tokens,
         )
-
+
+        assert (
+            kv_loc is not None
+        ), "KV cache is full! There is a bug in memory estimation."

         self.req_to_token_pool.write((req.req_pool_idx, slice(0, len(kv_loc))), kv_loc)

@@ -332,6 +528,7 @@ class DecodeTransferQueue:
         self,
         gloo_group: ProcessGroup,
         req_to_metadata_buffer_idx_allocator: ReqToMetadataIdxAllocator,
+        tp_rank: int,
         metadata_buffers: MetadataBuffers,
         scheduler: Scheduler,
         tree_cache: BasePrefixCache,
@@ -339,6 +536,7 @@ class DecodeTransferQueue:
         self.queue: List[DecodeRequest] = []
         self.gloo_group = gloo_group
         self.req_to_metadata_buffer_idx_allocator = req_to_metadata_buffer_idx_allocator
+        self.tp_rank = tp_rank
         self.metadata_buffers = metadata_buffers
         self.scheduler = scheduler
         self.tree_cache = tree_cache
@@ -349,10 +547,9 @@ class DecodeTransferQueue:
     def extend(self, decode_reqs: List[DecodeRequest]) -> None:
         self.queue.extend(decode_reqs)

-    def pop_transferred(self) -> List[
+    def pop_transferred(self) -> List[Req]:
         if not self.queue:
             return []
-
         polls = poll_and_all_reduce(
             [decode_req.kv_receiver for decode_req in self.queue], self.gloo_group
         )
@@ -361,7 +558,7 @@ class DecodeTransferQueue:
         indices_to_remove = set()
         for i, (decode_req, poll) in enumerate(zip(self.queue, polls)):
             if poll == KVPoll.Failed:
-                error_message = f"Decode transfer failed for request {decode_req.req.rid=} {decode_req.req.bootstrap_room=}"
+                error_message = f"Decode transfer failed for request rank={self.tp_rank} {decode_req.req.rid=} {decode_req.req.bootstrap_room=}"
                 try:
                     decode_req.kv_receiver.failure_exception()
                 except Exception as e:
@@ -409,7 +606,8 @@ class DecodeTransferQueue:
                         : decode_req.req.top_logprobs_num
                     ].tolist()
                 )
-
+                if hasattr(decode_req.kv_receiver, "clear"):
+                    decode_req.kv_receiver.clear()
                 transferred_reqs.append(decode_req.req)
                 indices_to_remove.add(i)
             elif poll in [
@@ -435,15 +633,6 @@ class DecodeTransferQueue:

 class SchedulerDisaggregationDecodeMixin:

-    def _prepare_idle_batch_and_run(self, batch, delay_process=False):
-        batch, _ = self.prepare_dp_attn_batch(batch)
-        result = None
-        if batch:
-            result = self.run_batch(batch)
-            if not delay_process:
-                self.process_batch_result(batch, result)
-        return batch, result
-
     @torch.no_grad()
     def event_loop_normal_disagg_decode(self: Scheduler):
         """A normal scheduler loop for decode worker in disaggregation mode."""
@@ -479,13 +668,15 @@ class SchedulerDisaggregationDecodeMixin:
             batch, _ = self._prepare_idle_batch_and_run(None)

             if batch is None and (
-                len(self.
+                len(self.waiting_queue)
+                + len(self.disagg_decode_transfer_queue.queue)
                 + len(self.disagg_decode_prealloc_queue.queue)
                 == 0
             ):
                 # When the server is idle, do self-check and re-init some states
                 self.check_memory()
                 self.new_token_ratio = self.init_new_token_ratio
+                self.maybe_sleep_on_idle()

             self.last_batch = batch

@@ -557,17 +748,28 @@ class SchedulerDisaggregationDecodeMixin:
                 self.process_batch_result(tmp_batch, tmp_result)

             if batch is None and (
-                len(self.
+                len(self.waiting_queue)
+                + len(self.disagg_decode_transfer_queue.queue)
                 + len(self.disagg_decode_prealloc_queue.queue)
                 == 0
             ):
                 # When the server is idle, do self-check and re-init some states
                 self.check_memory()
                 self.new_token_ratio = self.init_new_token_ratio
+                self.maybe_sleep_on_idle()

             self.last_batch = batch
             self.last_batch_in_queue = last_batch_in_queue

+    def _prepare_idle_batch_and_run(self, batch, delay_process=False):
+        batch, _ = self.prepare_dp_attn_batch(batch)
+        result = None
+        if batch:
+            result = self.run_batch(batch)
+            if not delay_process:
+                self.process_batch_result(batch, result)
+        return batch, result
+
     def get_next_disagg_decode_batch_to_run(
         self: Scheduler,
     ) -> Optional[Tuple[ScheduleBatch, bool]]:
@@ -650,6 +852,13 @@ class SchedulerDisaggregationDecodeMixin:
         return new_batch

     def process_decode_queue(self: Scheduler):
+        # try to resume retracted requests if there are enough space for another `num_reserved_decode_tokens` decode steps
+        resumed_reqs = self.disagg_decode_prealloc_queue.resume_retracted_reqs()
+        self.waiting_queue.extend(resumed_reqs)
+        if len(self.disagg_decode_prealloc_queue.retracted_queue) > 0:
+            # if there are still retracted requests, we do not allocate new requests
+            return
+
         req_conns = self.disagg_decode_prealloc_queue.pop_preallocated()
         self.disagg_decode_transfer_queue.extend(req_conns)
         alloc_reqs = (
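The process_decode_queue hunk above gives resumed (previously retracted) requests priority over admitting new prefill transfers. Below is a minimal, self-contained sketch of that ordering for illustration only; the stub queue and scheduler classes are hypothetical stand-ins, not sglang types.

# Illustrative sketch only (not part of the diff): resume retracted requests
# first, and only preallocate new transfers once the retracted queue drains.
from dataclasses import dataclass, field
from typing import List


@dataclass
class StubPreallocQueue:
    retracted_queue: List[str] = field(default_factory=list)

    def resume_retracted_reqs(self) -> List[str]:
        # pretend every retracted request currently fits back into the KV cache
        resumed, self.retracted_queue = self.retracted_queue, []
        return resumed

    def pop_preallocated(self) -> List[str]:
        return ["new-req"]


@dataclass
class StubScheduler:
    prealloc_queue: StubPreallocQueue
    transfer_queue: List[str] = field(default_factory=list)
    waiting_queue: List[str] = field(default_factory=list)


def process_decode_queue(scheduler: StubScheduler) -> None:
    # 1) bring retracted requests back while KV space allows it
    scheduler.waiting_queue.extend(scheduler.prealloc_queue.resume_retracted_reqs())
    # 2) while any request is still retracted, admit no new work, so new
    #    requests do not compete for the KV cache the retracted ones need
    if scheduler.prealloc_queue.retracted_queue:
        return
    # 3) only then preallocate KV space for incoming prefill transfers
    scheduler.transfer_queue.extend(scheduler.prealloc_queue.pop_preallocated())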
sglang/srt/disaggregation/fake/__init__.py
@@ -1 +1 @@
-from .conn import FakeKVReceiver, FakeKVSender
+from sglang.srt.disaggregation.fake.conn import FakeKVReceiver, FakeKVSender
sglang/srt/disaggregation/fake/conn.py
@@ -1,5 +1,5 @@
 import logging
-from typing import
+from typing import List, Optional

 import numpy as np
 import numpy.typing as npt
@@ -8,7 +8,6 @@ from sglang.srt.disaggregation.base.conn import (
     BaseKVManager,
     BaseKVReceiver,
     BaseKVSender,
-    KVArgs,
     KVPoll,
 )

@@ -17,7 +16,14 @@ logger = logging.getLogger(__name__)

 # For warmup reqs, we don't kv transfer, we use the fake sender and receiver
 class FakeKVSender(BaseKVSender):
-    def __init__(
+    def __init__(
+        self,
+        mgr: BaseKVManager,
+        bootstrap_addr: str,
+        bootstrap_room: int,
+        dest_tp_ranks: List[int],
+        pp_rank: int,
+    ):
         self.has_sent = False

     def poll(self) -> KVPoll:
@@ -26,7 +32,7 @@ class FakeKVSender(BaseKVSender):
             return KVPoll.WaitingForInput
         else:
             # Assume transfer completed instantly
-            logger.
+            logger.debug("FakeKVSender poll success")
             return KVPoll.Success

     def init(
@@ -34,17 +40,17 @@ class FakeKVSender(BaseKVSender):
         kv_indices: list[int],
         aux_index: Optional[int] = None,
     ):
-        logger.
+        logger.debug(
             f"FakeKVSender init with kv_indices: {kv_indices}, aux_index: {aux_index}"
         )
         pass

     def send(
         self,
-        kv_indices: npt.NDArray[np.
+        kv_indices: npt.NDArray[np.int32],
     ):
         self.has_sent = True
-        logger.
+        logger.debug(f"FakeKVSender send with kv_indices: {kv_indices}")

     def failure_exception(self):
         raise Exception("Fake KVSender Exception")
@@ -56,6 +62,7 @@ class FakeKVReceiver(BaseKVReceiver):
         mgr: BaseKVManager,
         bootstrap_addr: str,
         bootstrap_room: Optional[int] = None,
+        data_parallel_rank: Optional[int] = None,
     ):
         self.has_init = False

@@ -65,12 +72,12 @@ class FakeKVReceiver(BaseKVReceiver):
             return KVPoll.WaitingForInput
         else:
             # Assume transfer completed instantly
-            logger.
+            logger.debug("FakeKVReceiver poll success")
             return KVPoll.Success

     def init(self, kv_indices: list[int], aux_index: Optional[int] = None):
         self.has_init = True
-        logger.
+        logger.debug(
             f"FakeKVReceiver init with kv_indices: {kv_indices}, aux_index: {aux_index}"
         )

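The fake connectors now mirror the real connector signatures (dest_tp_ranks, pp_rank, data_parallel_rank), so warmup requests can flow through the normal disaggregation code path without moving any KV data. A small usage sketch under that assumption follows; DummyManager is a hypothetical stand-in for a real BaseKVManager, and the addresses and indices are placeholder values.

# Illustrative sketch only (not part of the diff): driving the fake
# sender/receiver pair the way a warmup request would.
import numpy as np

from sglang.srt.disaggregation.base.conn import KVPoll
from sglang.srt.disaggregation.fake.conn import FakeKVReceiver, FakeKVSender


class DummyManager:
    # hypothetical stand-in; the fake classes never touch the manager
    pass


def warmup_roundtrip() -> None:
    sender = FakeKVSender(
        DummyManager(), bootstrap_addr="127.0.0.1:8998", bootstrap_room=0,
        dest_tp_ranks=[0], pp_rank=0,
    )
    receiver = FakeKVReceiver(
        DummyManager(), bootstrap_addr="127.0.0.1:8998", bootstrap_room=0,
        data_parallel_rank=None,
    )

    # "Send" then "receive": both flip to Success immediately, no transfer.
    sender.send(np.array([0, 1, 2], dtype=np.int32))
    receiver.init(kv_indices=[0, 1, 2], aux_index=0)
    assert sender.poll() == KVPoll.Success
    assert receiver.poll() == KVPoll.Success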