sglang 0.4.6.post4__py3-none-any.whl → 0.4.7__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- sglang/bench_offline_throughput.py +16 -10
- sglang/bench_one_batch.py +5 -4
- sglang/bench_one_batch_server.py +86 -22
- sglang/bench_serving.py +197 -110
- sglang/compile_deep_gemm.py +4 -4
- sglang/lang/backend/runtime_endpoint.py +24 -1
- sglang/profiler.py +167 -0
- sglang/srt/_custom_ops.py +34 -0
- sglang/srt/configs/internvl.py +8 -12
- sglang/srt/configs/model_config.py +66 -29
- sglang/srt/constrained/base_grammar_backend.py +5 -2
- sglang/srt/constrained/llguidance_backend.py +9 -8
- sglang/srt/constrained/outlines_backend.py +5 -4
- sglang/srt/constrained/xgrammar_backend.py +18 -18
- sglang/srt/conversation.py +47 -9
- sglang/srt/custom_op.py +38 -3
- sglang/srt/debug_utils.py +74 -0
- sglang/srt/disaggregation/common/__init__.py +1 -0
- sglang/srt/disaggregation/common/conn.py +407 -0
- sglang/srt/disaggregation/decode.py +187 -134
- sglang/srt/disaggregation/decode_schedule_batch_mixin.py +142 -0
- sglang/srt/disaggregation/fake/conn.py +4 -13
- sglang/srt/disaggregation/kv_events.py +412 -0
- sglang/srt/disaggregation/launch_lb.py +140 -0
- sglang/srt/disaggregation/mini_lb.py +84 -70
- sglang/srt/disaggregation/mooncake/conn.py +441 -140
- sglang/srt/disaggregation/mooncake/transfer_engine.py +31 -14
- sglang/srt/disaggregation/nixl/conn.py +124 -442
- sglang/srt/disaggregation/prefill.py +128 -44
- sglang/srt/disaggregation/utils.py +154 -6
- sglang/srt/distributed/device_communicators/pymscclpp.py +315 -0
- sglang/srt/distributed/parallel_state.py +52 -5
- sglang/srt/distributed/utils.py +3 -3
- sglang/srt/entrypoints/EngineBase.py +11 -0
- sglang/srt/entrypoints/engine.py +129 -12
- sglang/srt/entrypoints/http_server.py +21 -6
- sglang/srt/entrypoints/http_server_engine.py +5 -2
- sglang/srt/function_call/base_format_detector.py +302 -0
- sglang/srt/function_call/core_types.py +34 -0
- sglang/srt/function_call/deepseekv3_detector.py +205 -0
- sglang/srt/function_call/ebnf_composer.py +248 -0
- sglang/srt/function_call/function_call_parser.py +202 -0
- sglang/srt/function_call/llama32_detector.py +93 -0
- sglang/srt/function_call/mistral_detector.py +131 -0
- sglang/srt/function_call/pythonic_detector.py +229 -0
- sglang/srt/function_call/qwen25_detector.py +121 -0
- sglang/srt/function_call/utils.py +52 -0
- sglang/srt/hf_transformers_utils.py +50 -7
- sglang/srt/layers/attention/aiter_backend.py +878 -0
- sglang/srt/layers/attention/base_attn_backend.py +4 -0
- sglang/srt/layers/attention/cutlass_mla_backend.py +2 -19
- sglang/srt/layers/attention/flashattention_backend.py +166 -35
- sglang/srt/layers/attention/flashinfer_backend.py +45 -1
- sglang/srt/layers/attention/flashinfer_mla_backend.py +45 -5
- sglang/srt/layers/attention/flashmla_backend.py +340 -78
- sglang/srt/layers/attention/intel_amx_backend.py +128 -0
- sglang/srt/layers/attention/tbo_backend.py +232 -0
- sglang/srt/layers/attention/torch_native_backend.py +3 -0
- sglang/srt/layers/attention/triton_backend.py +247 -5
- sglang/srt/layers/attention/triton_ops/extend_attention.py +12 -4
- sglang/srt/layers/attention/utils.py +2 -2
- sglang/srt/layers/attention/vision.py +1 -1
- sglang/srt/layers/communicator.py +517 -0
- sglang/srt/layers/dp_attention.py +6 -15
- sglang/srt/layers/layernorm.py +30 -19
- sglang/srt/layers/moe/cutlass_moe.py +370 -0
- sglang/srt/layers/moe/cutlass_moe_params.py +169 -0
- sglang/srt/layers/moe/ep_moe/kernels.py +60 -17
- sglang/srt/layers/moe/ep_moe/layer.py +195 -87
- sglang/srt/layers/moe/ep_moe/token_dispatcher.py +88 -8
- sglang/srt/layers/moe/fused_moe_native.py +4 -0
- sglang/srt/layers/moe/fused_moe_triton/configs/triton_3_2_0/E=257,N=128,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8,block_shape=[128, 128].json +146 -0
- sglang/srt/layers/moe/fused_moe_triton/configs/triton_3_2_0/E=257,N=128,device_name=NVIDIA_H20,dtype=fp8_w8a8,block_shape=[128, 128].json +146 -0
- sglang/srt/layers/moe/fused_moe_triton/configs/triton_3_2_0/E=257,N=256,device_name=NVIDIA_B200,dtype=fp8_w8a8,block_shape=[128, 128].json +146 -0
- sglang/srt/layers/moe/fused_moe_triton/configs/triton_3_2_0/E=257,N=256,device_name=NVIDIA_H20,dtype=fp8_w8a8,block_shape=[128, 128].json +146 -0
- sglang/srt/layers/moe/fused_moe_triton/configs/triton_3_2_0/E=257,N=256,device_name=NVIDIA_H200,dtype=fp8_w8a8,block_shape=[128, 128].json +146 -0
- sglang/srt/layers/moe/fused_moe_triton/configs/triton_3_2_0/E=8,N=7168,device_name=NVIDIA_H100_80GB_HBM3.json +146 -0
- sglang/srt/layers/moe/fused_moe_triton/configs/triton_3_3_1/E=257,N=256,device_name=NVIDIA_B200,dtype=fp8_w8a8,block_shape=[128, 128].json +146 -0
- sglang/srt/layers/moe/fused_moe_triton/configs/triton_3_3_1/E=8,N=7168,device_name=NVIDIA_H100_80GB_HBM3.json +146 -0
- sglang/srt/layers/moe/fused_moe_triton/fused_moe.py +220 -25
- sglang/srt/layers/moe/fused_moe_triton/layer.py +48 -4
- sglang/srt/layers/moe/topk.py +107 -24
- sglang/srt/layers/multimodal.py +70 -0
- sglang/srt/layers/quantization/__init__.py +10 -4
- sglang/srt/layers/quantization/blockwise_int8.py +3 -0
- sglang/srt/layers/quantization/compressed_tensors/compressed_tensors_moe.py +5 -0
- sglang/srt/layers/quantization/deep_gemm.py +60 -59
- sglang/srt/layers/quantization/fp8.py +113 -18
- sglang/srt/layers/quantization/fp8_kernel.py +118 -66
- sglang/srt/layers/quantization/fp8_utils.py +165 -43
- sglang/srt/layers/quantization/gptq.py +298 -6
- sglang/srt/layers/quantization/int8_kernel.py +18 -5
- sglang/srt/layers/quantization/modelopt_quant.py +334 -7
- sglang/srt/layers/quantization/moe_wna16.py +3 -0
- sglang/srt/layers/quantization/qoq.py +244 -0
- sglang/srt/layers/quantization/w8a8_fp8.py +3 -0
- sglang/srt/layers/quantization/w8a8_int8.py +3 -0
- sglang/srt/layers/rotary_embedding.py +6 -12
- sglang/srt/layers/sampler.py +80 -79
- sglang/srt/layers/utils.py +6 -0
- sglang/srt/lora/layers.py +12 -15
- sglang/srt/lora/lora.py +49 -5
- sglang/srt/lora/lora_manager.py +20 -8
- sglang/srt/lora/mem_pool.py +24 -16
- sglang/srt/lora/utils.py +17 -13
- sglang/srt/managers/data_parallel_controller.py +13 -5
- sglang/srt/managers/eplb_algorithms/__init__.py +63 -0
- sglang/srt/managers/eplb_algorithms/deepseek.py +223 -0
- sglang/srt/managers/eplb_algorithms/deepseek_vec.py +276 -0
- sglang/srt/managers/eplb_manager.py +96 -0
- sglang/srt/managers/expert_distribution.py +878 -56
- sglang/srt/managers/expert_location.py +448 -0
- sglang/srt/managers/expert_location_dispatch.py +108 -0
- sglang/srt/managers/io_struct.py +29 -5
- sglang/srt/managers/mm_utils.py +355 -151
- sglang/srt/managers/multimodal_processors/base_processor.py +299 -42
- sglang/srt/managers/multimodal_processors/deepseek_vl_v2.py +6 -1
- sglang/srt/managers/multimodal_processors/gemma3.py +15 -17
- sglang/srt/managers/multimodal_processors/internvl.py +18 -5
- sglang/srt/managers/multimodal_processors/janus_pro.py +7 -1
- sglang/srt/managers/multimodal_processors/kimi_vl.py +14 -32
- sglang/srt/managers/multimodal_processors/llava.py +3 -3
- sglang/srt/managers/multimodal_processors/minicpm.py +27 -32
- sglang/srt/managers/multimodal_processors/mllama4.py +6 -0
- sglang/srt/managers/multimodal_processors/phi4mm.py +87 -0
- sglang/srt/managers/multimodal_processors/pixtral.py +9 -9
- sglang/srt/managers/multimodal_processors/qwen_vl.py +35 -35
- sglang/srt/managers/schedule_batch.py +185 -55
- sglang/srt/managers/schedule_policy.py +4 -5
- sglang/srt/managers/scheduler.py +389 -154
- sglang/srt/managers/session_controller.py +1 -1
- sglang/srt/managers/tokenizer_manager.py +231 -39
- sglang/srt/managers/utils.py +0 -4
- sglang/srt/mem_cache/base_prefix_cache.py +3 -0
- sglang/srt/mem_cache/chunk_cache.py +3 -1
- sglang/srt/mem_cache/hiradix_cache.py +4 -4
- sglang/srt/mem_cache/memory_pool.py +74 -52
- sglang/srt/mem_cache/multimodal_cache.py +45 -0
- sglang/srt/mem_cache/radix_cache.py +58 -5
- sglang/srt/metrics/collector.py +11 -2
- sglang/srt/mm_utils.py +10 -0
- sglang/srt/model_executor/cuda_graph_runner.py +87 -65
- sglang/srt/model_executor/expert_location_updater.py +557 -0
- sglang/srt/model_executor/forward_batch_info.py +39 -14
- sglang/srt/model_executor/model_runner.py +231 -101
- sglang/srt/model_loader/loader.py +10 -6
- sglang/srt/model_loader/utils.py +67 -1
- sglang/srt/models/clip.py +5 -1
- sglang/srt/models/deepseek_nextn.py +1 -1
- sglang/srt/models/deepseek_v2.py +732 -403
- sglang/srt/models/exaone.py +8 -3
- sglang/srt/models/gemma3_causal.py +7 -0
- sglang/srt/models/gemma3_mm.py +75 -33
- sglang/srt/models/idefics2.py +342 -0
- sglang/srt/models/kimi_vl.py +4 -4
- sglang/srt/models/llama.py +1 -1
- sglang/srt/models/llama4.py +10 -2
- sglang/srt/models/llava.py +26 -18
- sglang/srt/models/mimo_mtp.py +220 -0
- sglang/srt/models/minicpmo.py +7 -17
- sglang/srt/models/minicpmv.py +3 -295
- sglang/srt/models/mistral.py +71 -1
- sglang/srt/models/mllama.py +3 -3
- sglang/srt/models/phi4mm.py +512 -0
- sglang/srt/models/qwen2.py +133 -35
- sglang/srt/models/qwen2_5_vl.py +5 -3
- sglang/srt/models/qwen2_eagle.py +4 -1
- sglang/srt/models/qwen2_moe.py +206 -69
- sglang/srt/models/qwen2_vl.py +3 -3
- sglang/srt/models/qwen3.py +92 -19
- sglang/srt/models/qwen3_moe.py +457 -55
- sglang/srt/models/registry.py +9 -1
- sglang/srt/models/siglip.py +294 -0
- sglang/srt/models/transformers.py +291 -0
- sglang/srt/openai_api/adapter.py +114 -40
- sglang/srt/openai_api/protocol.py +37 -2
- sglang/srt/openai_api/utils.py +172 -0
- sglang/srt/operations.py +189 -0
- sglang/srt/operations_strategy.py +207 -0
- sglang/srt/sampling/sampling_batch_info.py +13 -1
- sglang/srt/sampling/sampling_params.py +2 -1
- sglang/srt/server_args.py +235 -38
- sglang/srt/speculative/build_eagle_tree.py +8 -8
- sglang/srt/speculative/eagle_draft_cuda_graph_runner.py +8 -11
- sglang/srt/speculative/eagle_draft_extend_cuda_graph_runner.py +253 -0
- sglang/srt/speculative/eagle_utils.py +181 -90
- sglang/srt/speculative/eagle_worker.py +146 -21
- sglang/srt/two_batch_overlap.py +635 -0
- sglang/srt/utils.py +197 -19
- sglang/test/runners.py +16 -7
- sglang/test/send_one.py +4 -0
- sglang/test/test_cutlass_moe.py +278 -0
- sglang/test/test_fp4_moe.py +248 -0
- sglang/test/test_utils.py +81 -42
- sglang/utils.py +2 -2
- sglang/version.py +1 -1
- {sglang-0.4.6.post4.dist-info → sglang-0.4.7.dist-info}/METADATA +31 -19
- sglang-0.4.7.dist-info/RECORD +699 -0
- {sglang-0.4.6.post4.dist-info → sglang-0.4.7.dist-info}/WHEEL +1 -1
- sglang/srt/function_call_parser.py +0 -858
- sglang/srt/platforms/interface.py +0 -371
- sglang-0.4.6.post4.dist-info/RECORD +0 -646
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=1,N=14336,device_name=NVIDIA_A100-SXM4-80GB,dtype=int8_w8a16.json → triton_3_1_0/E=1,N=14336,device_name=NVIDIA_A100-SXM4-80GB,dtype=int8_w8a16.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=1,N=14336,device_name=NVIDIA_A100-SXM4-80GB.json → triton_3_1_0/E=1,N=14336,device_name=NVIDIA_A100-SXM4-80GB.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=1,N=1792,device_name=NVIDIA_A100-SXM4-80GB,dtype=int8_w8a16.json → triton_3_1_0/E=1,N=1792,device_name=NVIDIA_A100-SXM4-80GB,dtype=int8_w8a16.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=1,N=1792,device_name=NVIDIA_A100-SXM4-80GB.json → triton_3_1_0/E=1,N=1792,device_name=NVIDIA_A100-SXM4-80GB.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=1,N=3072,device_name=NVIDIA_A100-SXM4-80GB,dtype=int8_w8a16.json → triton_3_1_0/E=1,N=3072,device_name=NVIDIA_A100-SXM4-80GB,dtype=int8_w8a16.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=1,N=3072,device_name=NVIDIA_H100_80GB_HBM3,dtype=int8_w8a16.json → triton_3_1_0/E=1,N=3072,device_name=NVIDIA_H100_80GB_HBM3,dtype=int8_w8a16.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=1,N=3072,device_name=NVIDIA_H100_80GB_HBM3.json → triton_3_1_0/E=1,N=3072,device_name=NVIDIA_H100_80GB_HBM3.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=1,N=3584,device_name=NVIDIA_A100-SXM4-80GB,dtype=int8_w8a16.json → triton_3_1_0/E=1,N=3584,device_name=NVIDIA_A100-SXM4-80GB,dtype=int8_w8a16.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=1,N=3584,device_name=NVIDIA_A100-SXM4-80GB.json → triton_3_1_0/E=1,N=3584,device_name=NVIDIA_A100-SXM4-80GB.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=1,N=7168,device_name=NVIDIA_A100-SXM4-80GB,dtype=int8_w8a16.json → triton_3_1_0/E=1,N=7168,device_name=NVIDIA_A100-SXM4-80GB,dtype=int8_w8a16.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=1,N=7168,device_name=NVIDIA_A100-SXM4-80GB.json → triton_3_1_0/E=1,N=7168,device_name=NVIDIA_A100-SXM4-80GB.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=144,N=512,device_name=NVIDIA_H100_80GB_HBM3.json → triton_3_1_0/E=144,N=512,device_name=NVIDIA_H100_80GB_HBM3.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=16,N=1024,device_name=NVIDIA_H100_80GB_HBM3.json → triton_3_1_0/E=16,N=1024,device_name=NVIDIA_H100_80GB_HBM3.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=16,N=1024,device_name=NVIDIA_H200.json → triton_3_1_0/E=16,N=1024,device_name=NVIDIA_H200.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=16,N=1344,device_name=NVIDIA_A100-SXM4-40GB.json → triton_3_1_0/E=16,N=1344,device_name=NVIDIA_A100-SXM4-40GB.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=16,N=1344,device_name=NVIDIA_A100-SXM4-80GB.json → triton_3_1_0/E=16,N=1344,device_name=NVIDIA_A100-SXM4-80GB.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=16,N=1344,device_name=NVIDIA_H100_80GB_HBM3.json → triton_3_1_0/E=16,N=1344,device_name=NVIDIA_H100_80GB_HBM3.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=16,N=14336,device_name=NVIDIA_A100-SXM4-80GB,dtype=int8_w8a16.json → triton_3_1_0/E=16,N=14336,device_name=NVIDIA_A100-SXM4-80GB,dtype=int8_w8a16.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=16,N=14336,device_name=NVIDIA_A100-SXM4-80GB.json → triton_3_1_0/E=16,N=14336,device_name=NVIDIA_A100-SXM4-80GB.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=16,N=1792,device_name=NVIDIA_A100-SXM4-80GB,dtype=int8_w8a16.json → triton_3_1_0/E=16,N=1792,device_name=NVIDIA_A100-SXM4-80GB,dtype=int8_w8a16.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=16,N=1792,device_name=NVIDIA_A100-SXM4-80GB.json → triton_3_1_0/E=16,N=1792,device_name=NVIDIA_A100-SXM4-80GB.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=16,N=2048,device_name=NVIDIA_H100_80GB_HBM3.json → triton_3_1_0/E=16,N=2048,device_name=NVIDIA_H100_80GB_HBM3.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=16,N=2688,device_name=NVIDIA_A100-SXM4-80GB.json → triton_3_1_0/E=16,N=2688,device_name=NVIDIA_A100-SXM4-80GB.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=16,N=2688,device_name=NVIDIA_H100_80GB_HBM3.json → triton_3_1_0/E=16,N=2688,device_name=NVIDIA_H100_80GB_HBM3.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=16,N=3072,device_name=NVIDIA_A100-SXM4-80GB,dtype=int8_w8a16.json → triton_3_1_0/E=16,N=3072,device_name=NVIDIA_A100-SXM4-80GB,dtype=int8_w8a16.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=16,N=3072,device_name=NVIDIA_H100_80GB_HBM3,dtype=int8_w8a16.json → triton_3_1_0/E=16,N=3072,device_name=NVIDIA_H100_80GB_HBM3,dtype=int8_w8a16.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=16,N=3200,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8.json → triton_3_1_0/E=16,N=3200,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=16,N=3584,device_name=NVIDIA_A100-SXM4-80GB,dtype=int8_w8a16.json → triton_3_1_0/E=16,N=3584,device_name=NVIDIA_A100-SXM4-80GB,dtype=int8_w8a16.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=16,N=3584,device_name=NVIDIA_A100-SXM4-80GB.json → triton_3_1_0/E=16,N=3584,device_name=NVIDIA_A100-SXM4-80GB.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=16,N=6400,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8.json → triton_3_1_0/E=16,N=6400,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=16,N=7168,device_name=NVIDIA_A100-SXM4-80GB,dtype=int8_w8a16.json → triton_3_1_0/E=16,N=7168,device_name=NVIDIA_A100-SXM4-80GB,dtype=int8_w8a16.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=16,N=7168,device_name=NVIDIA_A100-SXM4-80GB.json → triton_3_1_0/E=16,N=7168,device_name=NVIDIA_A100-SXM4-80GB.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=16,N=7168,device_name=NVIDIA_H100_80GB_HBM3,dtype=int8_w8a16.json → triton_3_1_0/E=16,N=7168,device_name=NVIDIA_H100_80GB_HBM3,dtype=int8_w8a16.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=16,N=800,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8.json → triton_3_1_0/E=16,N=800,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=160,N=192,device_name=NVIDIA_A800-SXM4-80GB.json → triton_3_1_0/E=160,N=192,device_name=NVIDIA_A800-SXM4-80GB.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=20,N=2048,device_name=NVIDIA_H100_80GB_HBM3.json → triton_3_1_0/E=20,N=2048,device_name=NVIDIA_H100_80GB_HBM3.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=24,N=1024,device_name=NVIDIA_H100_80GB_HBM3.json → triton_3_1_0/E=24,N=1024,device_name=NVIDIA_H100_80GB_HBM3.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=256,N=128,device_name=NVIDIA_A100-SXM4-80GB,dtype=int8_w8a8,block_shape=[128, 128].json → triton_3_1_0/E=256,N=128,device_name=NVIDIA_A100-SXM4-80GB,dtype=int8_w8a8,block_shape=[128, 128].json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=256,N=128,device_name=NVIDIA_A100-SXM4-80GB,dtype=int8_w8a8.json → triton_3_1_0/E=256,N=128,device_name=NVIDIA_A100-SXM4-80GB,dtype=int8_w8a8.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=256,N=128,device_name=NVIDIA_A800-SXM4-80GB,dtype=int8_w8a8,block_shape=[128, 128].json → triton_3_1_0/E=256,N=128,device_name=NVIDIA_A800-SXM4-80GB,dtype=int8_w8a8,block_shape=[128, 128].json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=256,N=128,device_name=NVIDIA_A800-SXM4-80GB,dtype=int8_w8a8.json → triton_3_1_0/E=256,N=128,device_name=NVIDIA_A800-SXM4-80GB,dtype=int8_w8a8.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=256,N=128,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8,block_shape=[128, 128].json → triton_3_1_0/E=256,N=128,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8,block_shape=[128, 128].json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=256,N=128,device_name=NVIDIA_H20,block_shape=[128, 128].json → triton_3_1_0/E=256,N=128,device_name=NVIDIA_H20,block_shape=[128, 128].json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=256,N=128,device_name=NVIDIA_H20,dtype=fp8_w8a8,block_shape=[128, 128].json → triton_3_1_0/E=256,N=128,device_name=NVIDIA_H20,dtype=fp8_w8a8,block_shape=[128, 128].json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=256,N=128,device_name=NVIDIA_L20Y,dtype=fp8_w8a8,block_shape=[128, 128].json → triton_3_1_0/E=256,N=128,device_name=NVIDIA_L20Y,dtype=fp8_w8a8,block_shape=[128, 128].json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=256,N=256,device_name=AMD_Instinct_MI300X,dtype=fp8_w8a8,block_shape=[128, 128].json → triton_3_1_0/E=256,N=256,device_name=AMD_Instinct_MI300X,dtype=fp8_w8a8,block_shape=[128, 128].json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=256,N=256,device_name=AMD_Instinct_MI325X,dtype=fp8_w8a8,block_shape=[128, 128].json → triton_3_1_0/E=256,N=256,device_name=AMD_Instinct_MI325X,dtype=fp8_w8a8,block_shape=[128, 128].json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=256,N=256,device_name=AMD_Radeon_Graphics,dtype=fp8_w8a8,block_shape=[128, 128].json → triton_3_1_0/E=256,N=256,device_name=AMD_Radeon_Graphics,dtype=fp8_w8a8,block_shape=[128, 128].json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=256,N=256,device_name=NVIDIA_B200,dtype=fp8_w8a8,block_shape=[128, 128].json → triton_3_1_0/E=256,N=256,device_name=NVIDIA_B200,dtype=fp8_w8a8,block_shape=[128, 128].json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=256,N=256,device_name=NVIDIA_H20,dtype=int8_w8a8,block_shape=[128, 128].json → triton_3_1_0/E=256,N=256,device_name=NVIDIA_H20,dtype=int8_w8a8,block_shape=[128, 128].json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=256,N=256,device_name=NVIDIA_H200,dtype=fp8_w8a8,block_shape=[128, 128].json → triton_3_1_0/E=256,N=256,device_name=NVIDIA_H200,dtype=fp8_w8a8,block_shape=[128, 128].json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=256,N=64,device_name=NVIDIA_A800-SXM4-80GB.json → triton_3_1_0/E=256,N=64,device_name=NVIDIA_A800-SXM4-80GB.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=256,N=64,device_name=NVIDIA_L20,dtype=int8_w8a8.json → triton_3_1_0/E=256,N=64,device_name=NVIDIA_L20,dtype=int8_w8a8.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=256,N=64,device_name=NVIDIA_L40S,dtype=int8_w8a8.json → triton_3_1_0/E=256,N=64,device_name=NVIDIA_L40S,dtype=int8_w8a8.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=64,N=1024,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8,block_shape=[128, 128].json → triton_3_1_0/E=64,N=1024,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8,block_shape=[128, 128].json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=64,N=1280,device_name=NVIDIA_A100-SXM4-80GB.json → triton_3_1_0/E=64,N=1280,device_name=NVIDIA_A100-SXM4-80GB.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=64,N=1280,device_name=NVIDIA_A800-SXM4-80GB.json → triton_3_1_0/E=64,N=1280,device_name=NVIDIA_A800-SXM4-80GB.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=64,N=1280,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8.json → triton_3_1_0/E=64,N=1280,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=64,N=1280,device_name=NVIDIA_H100_80GB_HBM3.json → triton_3_1_0/E=64,N=1280,device_name=NVIDIA_H100_80GB_HBM3.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=64,N=1280,device_name=NVIDIA_H200,dtype=fp8_w8a8.json → triton_3_1_0/E=64,N=1280,device_name=NVIDIA_H200,dtype=fp8_w8a8.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=64,N=1280,device_name=NVIDIA_H200.json → triton_3_1_0/E=64,N=1280,device_name=NVIDIA_H200.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=64,N=2560,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8.json → triton_3_1_0/E=64,N=2560,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=64,N=2560,device_name=NVIDIA_H200,dtype=fp8_w8a8.json → triton_3_1_0/E=64,N=2560,device_name=NVIDIA_H200,dtype=fp8_w8a8.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=64,N=2560,device_name=NVIDIA_H200.json → triton_3_1_0/E=64,N=2560,device_name=NVIDIA_H200.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=64,N=320,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8.json → triton_3_1_0/E=64,N=320,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=64,N=320,device_name=NVIDIA_H100_80GB_HBM3.json → triton_3_1_0/E=64,N=320,device_name=NVIDIA_H100_80GB_HBM3.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=64,N=320,device_name=NVIDIA_H200,dtype=fp8_w8a8.json → triton_3_1_0/E=64,N=320,device_name=NVIDIA_H200,dtype=fp8_w8a8.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=64,N=320,device_name=NVIDIA_H200.json → triton_3_1_0/E=64,N=320,device_name=NVIDIA_H200.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=64,N=512,device_name=NVIDIA_H200,dtype=fp8_w8a8,block_shape=[128, 128].json → triton_3_1_0/E=64,N=512,device_name=NVIDIA_H200,dtype=fp8_w8a8,block_shape=[128, 128].json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=64,N=640,device_name=NVIDIA_A100-SXM4-80GB.json → triton_3_1_0/E=64,N=640,device_name=NVIDIA_A100-SXM4-80GB.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=64,N=640,device_name=NVIDIA_A800-SXM4-80GB.json → triton_3_1_0/E=64,N=640,device_name=NVIDIA_A800-SXM4-80GB.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=64,N=640,device_name=NVIDIA_GeForce_RTX_4090,dtype=fp8_w8a8.json → triton_3_1_0/E=64,N=640,device_name=NVIDIA_GeForce_RTX_4090,dtype=fp8_w8a8.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=64,N=640,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8.json → triton_3_1_0/E=64,N=640,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=64,N=640,device_name=NVIDIA_H100_80GB_HBM3.json → triton_3_1_0/E=64,N=640,device_name=NVIDIA_H100_80GB_HBM3.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=64,N=640,device_name=NVIDIA_H200,dtype=fp8_w8a8.json → triton_3_1_0/E=64,N=640,device_name=NVIDIA_H200,dtype=fp8_w8a8.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=64,N=640,device_name=NVIDIA_H200.json → triton_3_1_0/E=64,N=640,device_name=NVIDIA_H200.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=8,N=14336,device_name=AMD_Instinct_MI300X.json → triton_3_1_0/E=8,N=14336,device_name=AMD_Instinct_MI300X.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=8,N=14336,device_name=AMD_Instinct_MI325X.json → triton_3_1_0/E=8,N=14336,device_name=AMD_Instinct_MI325X.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=8,N=14336,device_name=AMD_Radeon_Graphics.json → triton_3_1_0/E=8,N=14336,device_name=AMD_Radeon_Graphics.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=8,N=14336,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8.json → triton_3_1_0/E=8,N=14336,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=8,N=14336,device_name=NVIDIA_H200,dtype=fp8_w8a8.json → triton_3_1_0/E=8,N=14336,device_name=NVIDIA_H200,dtype=fp8_w8a8.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=8,N=14336,device_name=NVIDIA_H200.json → triton_3_1_0/E=8,N=14336,device_name=NVIDIA_H200.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=8,N=1792,device_name=AMD_Instinct_MI300X.json → triton_3_1_0/E=8,N=1792,device_name=AMD_Instinct_MI300X.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=8,N=1792,device_name=AMD_Instinct_MI325X.json → triton_3_1_0/E=8,N=1792,device_name=AMD_Instinct_MI325X.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=8,N=1792,device_name=AMD_Radeon_Graphics.json → triton_3_1_0/E=8,N=1792,device_name=AMD_Radeon_Graphics.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=8,N=1792,device_name=NVIDIA_A100-SXM4-40GB.json → triton_3_1_0/E=8,N=1792,device_name=NVIDIA_A100-SXM4-40GB.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=8,N=1792,device_name=NVIDIA_A100-SXM4-80GB.json → triton_3_1_0/E=8,N=1792,device_name=NVIDIA_A100-SXM4-80GB.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=8,N=1792,device_name=NVIDIA_H100_80GB_HBM3.json → triton_3_1_0/E=8,N=1792,device_name=NVIDIA_H100_80GB_HBM3.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=8,N=1792,device_name=NVIDIA_H200,dtype=fp8_w8a8.json → triton_3_1_0/E=8,N=1792,device_name=NVIDIA_H200,dtype=fp8_w8a8.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=8,N=1792,device_name=NVIDIA_H200.json → triton_3_1_0/E=8,N=1792,device_name=NVIDIA_H200.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=8,N=2048,device_name=NVIDIA_A100-SXM4-80GB.json → triton_3_1_0/E=8,N=2048,device_name=NVIDIA_A100-SXM4-80GB.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=8,N=2048,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8.json → triton_3_1_0/E=8,N=2048,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=8,N=2048,device_name=NVIDIA_H100_80GB_HBM3.json → triton_3_1_0/E=8,N=2048,device_name=NVIDIA_H100_80GB_HBM3.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=8,N=2048,device_name=NVIDIA_H200,dtype=fp8_w8a8.json → triton_3_1_0/E=8,N=2048,device_name=NVIDIA_H200,dtype=fp8_w8a8.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=8,N=2048,device_name=NVIDIA_H200.json → triton_3_1_0/E=8,N=2048,device_name=NVIDIA_H200.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=8,N=3584,device_name=AMD_Instinct_MI300X.json → triton_3_1_0/E=8,N=3584,device_name=AMD_Instinct_MI300X.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=8,N=3584,device_name=AMD_Instinct_MI325X.json → triton_3_1_0/E=8,N=3584,device_name=AMD_Instinct_MI325X.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=8,N=3584,device_name=AMD_Radeon_Graphics.json → triton_3_1_0/E=8,N=3584,device_name=AMD_Radeon_Graphics.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=8,N=3584,device_name=NVIDIA_A100-SXM4-40GB.json → triton_3_1_0/E=8,N=3584,device_name=NVIDIA_A100-SXM4-40GB.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=8,N=3584,device_name=NVIDIA_A100-SXM4-80GB.json → triton_3_1_0/E=8,N=3584,device_name=NVIDIA_A100-SXM4-80GB.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=8,N=3584,device_name=NVIDIA_GeForce_RTX_4090,dtype=fp8_w8a8.json → triton_3_1_0/E=8,N=3584,device_name=NVIDIA_GeForce_RTX_4090,dtype=fp8_w8a8.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=8,N=3584,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8.json → triton_3_1_0/E=8,N=3584,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=8,N=3584,device_name=NVIDIA_H100_80GB_HBM3.json → triton_3_1_0/E=8,N=3584,device_name=NVIDIA_H100_80GB_HBM3.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=8,N=3584,device_name=NVIDIA_H200,dtype=fp8_w8a8.json → triton_3_1_0/E=8,N=3584,device_name=NVIDIA_H200,dtype=fp8_w8a8.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=8,N=3584,device_name=NVIDIA_H200.json → triton_3_1_0/E=8,N=3584,device_name=NVIDIA_H200.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=8,N=3584,device_name=NVIDIA_L40S.json → triton_3_1_0/E=8,N=3584,device_name=NVIDIA_L40S.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=8,N=4096,device_name=AMD_Instinct_MI300X,dtype=fp8_w8a8.json → triton_3_1_0/E=8,N=4096,device_name=AMD_Instinct_MI300X,dtype=fp8_w8a8.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=8,N=4096,device_name=AMD_Instinct_MI325X,dtype=fp8_w8a8.json → triton_3_1_0/E=8,N=4096,device_name=AMD_Instinct_MI325X,dtype=fp8_w8a8.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=8,N=4096,device_name=AMD_Radeon_Graphics,dtype=fp8_w8a8.json → triton_3_1_0/E=8,N=4096,device_name=AMD_Radeon_Graphics,dtype=fp8_w8a8.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=8,N=4096,device_name=NVIDIA_A100-SXM4-80GB.json → triton_3_1_0/E=8,N=4096,device_name=NVIDIA_A100-SXM4-80GB.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=8,N=4096,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8.json → triton_3_1_0/E=8,N=4096,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=8,N=4096,device_name=NVIDIA_H100_80GB_HBM3.json → triton_3_1_0/E=8,N=4096,device_name=NVIDIA_H100_80GB_HBM3.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=8,N=4096,device_name=NVIDIA_H200,dtype=fp8_w8a8.json → triton_3_1_0/E=8,N=4096,device_name=NVIDIA_H200,dtype=fp8_w8a8.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=8,N=4096,device_name=NVIDIA_H200.json → triton_3_1_0/E=8,N=4096,device_name=NVIDIA_H200.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=8,N=7168,device_name=AMD_Instinct_MI300X.json → triton_3_1_0/E=8,N=7168,device_name=AMD_Instinct_MI300X.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=8,N=7168,device_name=AMD_Instinct_MI325X.json → triton_3_1_0/E=8,N=7168,device_name=AMD_Instinct_MI325X.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=8,N=7168,device_name=AMD_Radeon_Graphics.json → triton_3_1_0/E=8,N=7168,device_name=AMD_Radeon_Graphics.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=8,N=7168,device_name=NVIDIA_A100-SXM4-80GB.json → triton_3_1_0/E=8,N=7168,device_name=NVIDIA_A100-SXM4-80GB.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=8,N=7168,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8.json → triton_3_1_0/E=8,N=7168,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=8,N=7168,device_name=NVIDIA_H100_80GB_HBM3.json → triton_3_1_0/E=8,N=7168,device_name=NVIDIA_H100_80GB_HBM3.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=8,N=7168,device_name=NVIDIA_H200,dtype=fp8_w8a8.json → triton_3_1_0/E=8,N=7168,device_name=NVIDIA_H200,dtype=fp8_w8a8.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=8,N=7168,device_name=NVIDIA_H200.json → triton_3_1_0/E=8,N=7168,device_name=NVIDIA_H200.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=8,N=8192,device_name=AMD_Instinct_MI300X,dtype=fp8_w8a8.json → triton_3_1_0/E=8,N=8192,device_name=AMD_Instinct_MI300X,dtype=fp8_w8a8.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=8,N=8192,device_name=AMD_Instinct_MI325X,dtype=fp8_w8a8.json → triton_3_1_0/E=8,N=8192,device_name=AMD_Instinct_MI325X,dtype=fp8_w8a8.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=8,N=8192,device_name=AMD_Radeon_Graphics,dtype=fp8_w8a8.json → triton_3_1_0/E=8,N=8192,device_name=AMD_Radeon_Graphics,dtype=fp8_w8a8.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=8,N=8192,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8.json → triton_3_1_0/E=8,N=8192,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=8,N=8192,device_name=NVIDIA_H200,dtype=fp8_w8a8.json → triton_3_1_0/E=8,N=8192,device_name=NVIDIA_H200,dtype=fp8_w8a8.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=128,N=192,device_name=NVIDIA_A800-SXM4-80GB.json → triton_3_2_0/E=128,N=192,device_name=NVIDIA_A800-SXM4-80GB.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=128,N=192,device_name=NVIDIA_H100_80GB_HBM3.json → triton_3_2_0/E=128,N=192,device_name=NVIDIA_H100_80GB_HBM3.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=128,N=192,device_name=NVIDIA_H20.json → triton_3_2_0/E=128,N=192,device_name=NVIDIA_H20.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=128,N=192,device_name=NVIDIA_H200.json → triton_3_2_0/E=128,N=192,device_name=NVIDIA_H200.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=128,N=384,device_name=NVIDIA_H100_80GB_HBM3.json → triton_3_2_0/E=128,N=384,device_name=NVIDIA_H100_80GB_HBM3.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=128,N=384,device_name=NVIDIA_H20,dtype=fp8_w8a8,block_shape=[128, 128].json → triton_3_2_0/E=128,N=384,device_name=NVIDIA_H20,dtype=fp8_w8a8,block_shape=[128, 128].json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=128,N=384,device_name=NVIDIA_H20.json → triton_3_2_0/E=128,N=384,device_name=NVIDIA_H20.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=128,N=384,device_name=NVIDIA_H200,dtype=fp8_w8a8,block_shape=[128, 128].json → triton_3_2_0/E=128,N=384,device_name=NVIDIA_H200,dtype=fp8_w8a8,block_shape=[128, 128].json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=128,N=384,device_name=NVIDIA_H200.json → triton_3_2_0/E=128,N=384,device_name=NVIDIA_H200.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=128,N=512,device_name=NVIDIA_H100_80GB_HBM3.json → triton_3_2_0/E=128,N=512,device_name=NVIDIA_H100_80GB_HBM3.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=128,N=768,device_name=NVIDIA_A800-SXM4-80GB.json → triton_3_2_0/E=128,N=768,device_name=NVIDIA_A800-SXM4-80GB.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=128,N=768,device_name=NVIDIA_H100_80GB_HBM3.json → triton_3_2_0/E=128,N=768,device_name=NVIDIA_H100_80GB_HBM3.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=128,N=768,device_name=NVIDIA_H20,dtype=fp8_w8a8,block_shape=[128, 128].json → triton_3_2_0/E=128,N=768,device_name=NVIDIA_H20,dtype=fp8_w8a8,block_shape=[128, 128].json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=128,N=768,device_name=NVIDIA_H20.json → triton_3_2_0/E=128,N=768,device_name=NVIDIA_H20.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=128,N=768,device_name=NVIDIA_H200,dtype=fp8_w8a8,block_shape=[128, 128].json → triton_3_2_0/E=128,N=768,device_name=NVIDIA_H200,dtype=fp8_w8a8,block_shape=[128, 128].json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=128,N=768,device_name=NVIDIA_H200.json → triton_3_2_0/E=128,N=768,device_name=NVIDIA_H200.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=128,N=96,device_name=NVIDIA_H20.json → triton_3_2_0/E=128,N=96,device_name=NVIDIA_H20.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=264,N=128,device_name=NVIDIA_A800-SXM4-80GB,dtype=int8_w8a8.json → triton_3_2_0/E=264,N=128,device_name=NVIDIA_A800-SXM4-80GB,dtype=int8_w8a8.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=264,N=256,device_name=NVIDIA_B200,dtype=fp8_w8a8,block_shape=[128, 128].json → triton_3_2_0/E=264,N=256,device_name=NVIDIA_B200,dtype=fp8_w8a8,block_shape=[128, 128].json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=264,N=256,device_name=NVIDIA_H20,dtype=fp8_w8a8,block_shape=[128, 128].json → triton_3_2_0/E=264,N=256,device_name=NVIDIA_H20,dtype=fp8_w8a8,block_shape=[128, 128].json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=264,N=256,device_name=NVIDIA_H200,dtype=fp8_w8a8,block_shape=[128, 128].json → triton_3_2_0/E=264,N=256,device_name=NVIDIA_H200,dtype=fp8_w8a8,block_shape=[128, 128].json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=272,N=128,device_name=NVIDIA_A800-SXM4-80GB,dtype=int8_w8a8.json → triton_3_2_0/E=272,N=128,device_name=NVIDIA_A800-SXM4-80GB,dtype=int8_w8a8.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=272,N=128,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8,block_shape=[128, 128].json → triton_3_2_0/E=272,N=128,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8,block_shape=[128, 128].json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=272,N=128,device_name=NVIDIA_H20,dtype=fp8_w8a8,block_shape=[128, 128].json → triton_3_2_0/E=272,N=128,device_name=NVIDIA_H20,dtype=fp8_w8a8,block_shape=[128, 128].json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=272,N=64,device_name=NVIDIA_A800-SXM4-80GB.json → triton_3_2_0/E=272,N=64,device_name=NVIDIA_A800-SXM4-80GB.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=288,N=64,device_name=NVIDIA_A800-SXM4-80GB.json → triton_3_2_0/E=288,N=64,device_name=NVIDIA_A800-SXM4-80GB.json} +0 -0
- /sglang/srt/models/{xiaomi_mimo.py → mimo.py} +0 -0
- {sglang-0.4.6.post4.dist-info → sglang-0.4.7.dist-info}/licenses/LICENSE +0 -0
- {sglang-0.4.6.post4.dist-info → sglang-0.4.7.dist-info}/top_level.txt +0 -0
sglang/srt/managers/schedule_batch.py

@@ -37,6 +37,7 @@ import hashlib
 import logging
 import threading
 from enum import Enum, auto
+from http import HTTPStatus
 from typing import TYPE_CHECKING, List, Optional, Set, Tuple, Union
 
 import numpy as np
@@ -48,7 +49,11 @@ from sglang.global_config import global_config
 from sglang.srt.configs.model_config import ModelConfig
 from sglang.srt.constrained.base_grammar_backend import BaseGrammarObject
 from sglang.srt.disaggregation.base import BaseKVSender
-from sglang.srt.disaggregation.
+from sglang.srt.disaggregation.decode_schedule_batch_mixin import (
+    ScheduleBatchDisaggregationDecodeMixin,
+)
+from sglang.srt.distributed.parallel_state import get_tensor_model_parallel_rank
+from sglang.srt.layers.multimodal import gpu_tensor_hash
 from sglang.srt.mem_cache.base_prefix_cache import BasePrefixCache
 from sglang.srt.mem_cache.chunk_cache import ChunkCache
 from sglang.srt.mem_cache.memory_pool import ReqToTokenPool, TokenToKVPoolAllocator
@@ -57,7 +62,7 @@ from sglang.srt.model_executor.forward_batch_info import CaptureHiddenMode, Forw
 from sglang.srt.sampling.sampling_batch_info import SamplingBatchInfo
 from sglang.srt.sampling.sampling_params import SamplingParams
 from sglang.srt.server_args import ServerArgs
-from sglang.srt.utils import flatten_nested_list,
+from sglang.srt.utils import flatten_nested_list, support_triton
 
 if TYPE_CHECKING:
     from sglang.srt.speculative.eagle_utils import EagleDraftInput, EagleVerifyInput
@@ -65,29 +70,38 @@ if TYPE_CHECKING:
 
 INIT_INCREMENTAL_DETOKENIZATION_OFFSET = 5
 
+GLOBAL_SERVER_ARGS_KEYS = [
+    "attention_backend",
+    "debug_tensor_dump_inject",
+    "debug_tensor_dump_output_folder",
+    "chunked_prefill_size",
+    "deepep_mode",
+    "device",
+    "disable_chunked_prefix_cache",
+    "disable_radix_cache",
+    "enable_deepep_moe",
+    "enable_dp_attention",
+    "enable_two_batch_overlap",
+    "enable_dp_lm_head",
+    "enable_ep_moe",
+    "deepep_config",
+    "enable_nan_detection",
+    "flashinfer_mla_disable_ragged",
+    "max_micro_batch_size",
+    "moe_dense_tp_size",
+    "ep_dispatch_algorithm",
+    "disable_shared_experts_fusion",
+    "sampling_backend",
+    "speculative_accept_threshold_acc",
+    "speculative_accept_threshold_single",
+    "torchao_config",
+    "triton_attention_reduce_in_fp32",
+    "ep_num_redundant_experts",
+    "mm_attention_backend",
+]
+
 # Put some global args for easy access
-global_server_args_dict = {
-    "attention_backend": ServerArgs.attention_backend,
-    "chunked_prefill_size": ServerArgs.chunked_prefill_size,
-    "deepep_mode": ServerArgs.deepep_mode,
-    "device": ServerArgs.device,
-    "disable_chunked_prefix_cache": ServerArgs.disable_chunked_prefix_cache,
-    "disable_radix_cache": ServerArgs.disable_radix_cache,
-    "enable_deepep_moe": ServerArgs.enable_deepep_moe,
-    "enable_dp_attention": ServerArgs.enable_dp_attention,
-    "enable_dp_lm_head": ServerArgs.enable_dp_lm_head,
-    "enable_ep_moe": ServerArgs.enable_ep_moe,
-    "enable_nan_detection": ServerArgs.enable_nan_detection,
-    "flashinfer_mla_disable_ragged": ServerArgs.flashinfer_mla_disable_ragged,
-    "max_micro_batch_size": ServerArgs.max_micro_batch_size,
-    "moe_dense_tp_size": ServerArgs.moe_dense_tp_size,
-    "n_share_experts_fusion": ServerArgs.n_share_experts_fusion,
-    "sampling_backend": ServerArgs.sampling_backend,
-    "speculative_accept_threshold_acc": ServerArgs.speculative_accept_threshold_acc,
-    "speculative_accept_threshold_single": ServerArgs.speculative_accept_threshold_single,
-    "torchao_config": ServerArgs.torchao_config,
-    "triton_attention_reduce_in_fp32": ServerArgs.triton_attention_reduce_in_fp32,
-}
+global_server_args_dict = {k: getattr(ServerArgs, k) for k in GLOBAL_SERVER_ARGS_KEYS}
 
 logger = logging.getLogger(__name__)
 
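The hunk above replaces the hand-maintained global_server_args_dict with a key list plus a comprehension over ServerArgs class attributes, so a new server flag only needs to be appended to GLOBAL_SERVER_ARGS_KEYS. A minimal sketch of the same pattern, using a stand-in dataclass rather than the real ServerArgs:

    from dataclasses import dataclass

    @dataclass
    class ServerArgsSketch:  # stand-in for sglang.srt.server_args.ServerArgs
        attention_backend: str = "flashinfer"
        device: str = "cuda"
        enable_ep_moe: bool = False

    KEYS = ["attention_backend", "device", "enable_ep_moe"]

    # getattr on the class (not an instance) picks up the dataclass defaults,
    # mirroring {k: getattr(ServerArgs, k) for k in GLOBAL_SERVER_ARGS_KEYS}.
    defaults = {k: getattr(ServerArgsSketch, k) for k in KEYS}
    assert defaults["device"] == "cuda"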
@@ -177,10 +191,10 @@ class MultimodalDataItem:
     image_offsets: Optional[list] = None
 
     # the real data, pixel_values or audio_features
-    # data: Union[List[torch.Tensor], List[np.
-    pixel_values: Union[torch.Tensor, np.
-
-    video_grid_thws: Union[torch.Tensor, np.
+    # data: Union[List[torch.Tensor], List[np.ndarray]]
+    pixel_values: Union[torch.Tensor, np.ndarray] = None
+    image_grid_thw: Union[torch.Tensor, np.ndarray] = None
+    video_grid_thws: Union[torch.Tensor, np.ndarray] = None
 
     image_emb_mask: Optional[torch.Tensor] = None
     image_spatial_crop: Optional[torch.Tensor] = None
@@ -189,8 +203,14 @@ class MultimodalDataItem:
     # [num_images, (n, w, h)]
     tgt_size: Tuple[int, int] = None
 
-
+    # kimi-vl related
+    image_grid_hws: Optional[List[torch.Tensor]] = None
+
+    audio_features: Union[torch.Tensor, np.ndarray] = None
     audio_feature_lens: Optional[List[torch.Tensor]] = None
+    audio_offsets: Optional[List[Tuple[int, int]]] = None
+
+    precomputed_features: Optional[Union[torch.Tensor, np.ndarray]] = None
 
     @staticmethod
     def is_empty_list(l):
@@ -219,7 +239,8 @@ class MultimodalDataItem:
                     for x in tensor_list
                 ]
                 tensor = torch.concat(tensor_list)
-
+            if tensor.is_cuda:
+                return gpu_tensor_hash(tensor)
             tensor = tensor.detach().contiguous()
 
             if tensor.dtype == torch.bfloat16:
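The added branch routes feature hashing through gpu_tensor_hash (from the new sglang/srt/layers/multimodal.py) whenever the concatenated tensor is already on the GPU, avoiding a device-to-host copy. A rough sketch of that dispatch; the CPU branch below is an illustrative stand-in, not the upstream implementation:

    import hashlib

    import torch

    def tensor_hash_sketch(tensor: torch.Tensor) -> int:
        if tensor.is_cuda:
            # Hash on the device, as the 0.4.7 code does via gpu_tensor_hash().
            from sglang.srt.layers.multimodal import gpu_tensor_hash
            return gpu_tensor_hash(tensor)
        # Illustrative CPU fallback: hash the raw bytes of a contiguous copy.
        data = tensor.detach().contiguous()
        if data.dtype == torch.bfloat16:
            data = data.float()  # bfloat16 has no NumPy equivalent
        return int.from_bytes(hashlib.sha256(data.numpy().tobytes()).digest()[:8], "big")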
@@ -249,7 +270,9 @@ class MultimodalDataItem:
                 return tensor_hash([f])
             return data_hash(f)
 
-        if self.
+        if self.precomputed_features is not None:
+            self.hash = hash_feature(self.precomputed_features)
+        elif self.is_audio():
             self.hash = hash_feature(self.audio_features)
         else:
             self.hash = hash_feature(self.pixel_values)
@@ -258,19 +281,24 @@ class MultimodalDataItem:
         self.pad_value = self.hash % (1 << 30)
 
     def is_audio(self):
-        return (
-            self.
-
+        return (self.modality == Modality.AUDIO) and (
+            self.precomputed_features is not None
+            or not MultimodalDataItem.is_empty_list(self.audio_features)
+        )
 
     def is_image(self):
         return (
             self.modality == Modality.IMAGE or self.modality == Modality.MULTI_IMAGES
-        ) and
+        ) and (
+            self.precomputed_features is not None
+            or not MultimodalDataItem.is_empty_list(self.pixel_values)
+        )
 
     def is_video(self):
-        return (
-            self.
-
+        return (self.modality == Modality.VIDEO) and (
+            self.precomputed_features is not None
+            or not MultimodalDataItem.is_empty_list(self.pixel_values)
+        )
 
     def is_valid(self) -> bool:
         return self.is_image() or self.is_video() or self.is_audio()
@@ -279,6 +307,16 @@ class MultimodalDataItem:
         ...
         # TODO
 
+    @staticmethod
+    def from_dict(obj: dict):
+        kwargs = dict(obj)
+        modality = kwargs.pop("modality")
+        if isinstance(modality, str):
+            modality = Modality[modality]
+        ret = MultimodalDataItem(modality=modality, **kwargs)
+        ret.validate()
+        return ret
+
 
 @dataclasses.dataclass
 class MultimodalInputs:
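A usage sketch for the new MultimodalDataItem.from_dict helper. It assumes Modality is exported from the same module (as the hunk's references suggest); the pixel tensor is an invented example:

    import torch

    from sglang.srt.managers.schedule_batch import Modality, MultimodalDataItem

    # Modality can arrive as a string (e.g. after IPC serialization) and is
    # mapped back onto the enum before the dataclass is constructed.
    item = MultimodalDataItem.from_dict(
        {
            "modality": "IMAGE",                      # becomes Modality.IMAGE
            "pixel_values": torch.rand(3, 224, 224),  # example data only
        }
    )
    assert item.is_image() and item.is_valid()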
@@ -304,8 +342,9 @@ class MultimodalInputs:
     video_token_id: Optional[int] = None
 
     # audio
-
-
+    audio_token_id: Optional[int] = None
+    audio_start_id: Optional[int] = None
+    audio_end_id: Optional[int] = None
 
     @staticmethod
     def from_dict(obj: dict):
@@ -329,6 +368,7 @@ class MultimodalInputs:
             "slice_end_id",
             "audio_start_id",
             "audio_end_id",
+            "audio_token_id",
         ]
         for arg in optional_args:
             if arg in obj:
@@ -411,6 +451,7 @@ class Req:
         bootstrap_host: Optional[str] = None,
         bootstrap_port: Optional[int] = None,
         bootstrap_room: Optional[int] = None,
+        data_parallel_rank: Optional[int] = None,
     ):
         # Input and output info
         self.rid = rid
@@ -565,6 +606,9 @@ class Req:
         self.bootstrap_room: Optional[int] = bootstrap_room
         self.disagg_kv_sender: Optional[BaseKVSender] = None
 
+        # For data parallel rank routing
+        self.data_parallel_rank: Optional[int] = data_parallel_rank
+
         # the start index of the sent kv cache
         # We want to send it chunk by chunk for chunked prefill.
         # After every chunk forward, we do the following:
@@ -578,9 +622,6 @@ class Req:
         self.tmp_end_idx: int = -1
         self.metadata_buffer_index: int = -1
 
-        # The first output_id transferred from prefill instance.
-        self.transferred_output_id: Optional[int] = None
-
     @property
     def seqlen(self):
         return len(self.origin_input_ids) + len(self.output_ids)
@@ -744,6 +785,16 @@ class Req:
         logger.info(f"{prefix}: {self.time_stats}")
         self.has_log_time_stats = True
 
+    def set_finish_with_abort(self, error_msg: str):
+        if get_tensor_model_parallel_rank() == 0:
+            logger.error(f"{error_msg}, {self.rid=}")
+        self.multimodal_inputs = None
+        self.grammar = None
+        self.origin_input_ids = [0]  # set it to one token to skip the long prefill
+        self.finished_reason = FINISH_ABORT(
+            error_msg, HTTPStatus.BAD_REQUEST, "BadRequestError"
+        )
+
     def __repr__(self):
         return (
             f"Req(rid={self.rid}, "
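A hypothetical caller-side sketch of the new abort helper; the wrapper name and message are invented, and the real call sites live in the scheduler:

    def reject_request(req, reason: str) -> None:
        # Marks the request as finished with a BadRequestError (HTTP 400),
        # drops its multimodal inputs and grammar, and shrinks the prompt to a
        # single token so the now-pointless prefill is effectively skipped.
        req.set_finish_with_abort(f"Invalid request: {reason}")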
@@ -805,6 +856,8 @@ class ScheduleBatch(ScheduleBatchDisaggregationDecodeMixin):
     global_num_tokens: Optional[List[int]] = None
     global_num_tokens_for_logprob: Optional[List[int]] = None
     can_run_dp_cuda_graph: bool = False
+    tbo_split_seq_index: Optional[int] = None
+    global_forward_mode: Optional[ForwardMode] = None
 
     # For processing logprobs
     return_logprob: bool = False
@@ -1069,7 +1122,9 @@
         else:
            self.encoder_out_cache_loc = torch.cat(encoder_out_cache_loc)
 
-        assert
+        assert (
+            len(self.out_cache_loc) == self.extend_num_tokens
+        ), f"Expected {len(self.out_cache_loc)}, got {self.extend_num_tokens}"
 
     def prepare_for_extend(self):
         self.forward_mode = ForwardMode.EXTEND
@@ -1226,7 +1281,7 @@
         self.extend_input_logprob_token_ids = extend_input_logprob_token_ids
 
         # Write to req_to_token_pool
-        if global_server_args_dict
+        if support_triton(global_server_args_dict.get("attention_backend")):
            # TODO: some tensors can be reused for ForwardBatchInfo (e.g., extend_lens, cumsum_start)
 
            write_req_to_token_pool_triton[(bs,)](
@@ -1290,7 +1345,9 @@
         page_size = self.token_to_kv_pool_allocator.page_size
         if page_size == 1:
             return len(self.reqs)
-
+        # In the decoding phase, the length of a request's KV cache should be
+        # the total length of the request minus 1
+        return sum(1 for req in self.reqs if (req.seqlen - 1) % page_size == 0)
 
     def check_decode_mem(self, buf_multiplier=1):
         tokens_required = (
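The added return value estimates how many requests will need a fresh KV-cache page on the next decode step: during decoding a request's cache holds seqlen - 1 tokens, so a new page is required exactly when that length is a multiple of page_size. A worked example with invented numbers:

    # page_size = 4, three requests with seqlen 5, 6 and 9
    page_size = 4
    seqlens = [5, 6, 9]

    # (seqlen - 1) % page_size == 0 means the existing pages are exactly full,
    # so the token written by the next decode step starts a new page.
    new_pages = sum(1 for s in seqlens if (s - 1) % page_size == 0)
    assert new_pages == 2  # seqlen 5 (cache holds 4 tokens) and seqlen 9 (holds 8)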
@@ -1579,7 +1636,9 @@
         if self.spec_info:
             self.spec_info.merge_batch(other.spec_info)
 
-    def get_model_worker_batch(
+    def get_model_worker_batch(
+        self, seq_lens_cpu_cache: Optional[torch.Tensor] = None
+    ) -> ModelWorkerBatch:
         if self.forward_mode.is_decode_or_idle():
             extend_seq_lens = extend_prefix_lens = extend_logprob_start_lens = None
         else:
@@ -1589,15 +1648,20 @@
 
         # Create seq_lens_cpu when needed
         if (
-
+            global_server_args_dict["attention_backend"] == "fa3"
+            or (
                 global_server_args_dict["use_mla_backend"]
                 and global_server_args_dict["attention_backend"] == "flashinfer"
             )
             or global_server_args_dict["attention_backend"] == "flashmla"
-            or global_server_args_dict["attention_backend"] == "fa3"
             or global_server_args_dict["attention_backend"] == "cutlass_mla"
+            or global_server_args_dict["enable_two_batch_overlap"]
         ):
-            seq_lens_cpu =
+            seq_lens_cpu = (
+                seq_lens_cpu_cache
+                if seq_lens_cpu_cache is not None
+                else self.seq_lens.cpu()
+            )
         else:
             seq_lens_cpu = None
 
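get_model_worker_batch now accepts an optional seq_lens_cpu_cache, so a caller that already holds a host copy of seq_lens can pass it in instead of paying for another device-to-host transfer. A small sketch of the reuse pattern (function and variable names are illustrative):

    from typing import Optional

    import torch

    def pick_seq_lens_cpu(
        seq_lens: torch.Tensor, seq_lens_cpu_cache: Optional[torch.Tensor]
    ) -> torch.Tensor:
        # Same idea as the hunk above: reuse the cached host copy when present,
        # otherwise trigger exactly one .cpu() transfer.
        return seq_lens_cpu_cache if seq_lens_cpu_cache is not None else seq_lens.cpu()

    seq_lens = torch.tensor([12, 7, 33])         # normally a CUDA tensor
    first = pick_seq_lens_cpu(seq_lens, None)    # performs the copy
    second = pick_seq_lens_cpu(seq_lens, first)  # reuses it
    assert second is first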
@@ -1616,6 +1680,7 @@
             req_pool_indices=self.req_pool_indices,
             seq_lens=self.seq_lens,
             out_cache_loc=self.out_cache_loc,
+            seq_lens_cpu=seq_lens_cpu,
             seq_lens_sum=self.seq_lens_sum,
             return_logprob=self.return_logprob,
             top_logprobs_nums=self.top_logprobs_nums,
@@ -1623,7 +1688,8 @@
             global_num_tokens=self.global_num_tokens,
             global_num_tokens_for_logprob=self.global_num_tokens_for_logprob,
             can_run_dp_cuda_graph=self.can_run_dp_cuda_graph,
-
+            tbo_split_seq_index=self.tbo_split_seq_index,
+            global_forward_mode=self.global_forward_mode,
             extend_num_tokens=self.extend_num_tokens,
             extend_seq_lens=extend_seq_lens,
             extend_prefix_lens=extend_prefix_lens,
@@ -1685,11 +1751,11 @@ class ModelWorkerBatch:
     req_pool_indices: torch.Tensor
     # The sequence length
     seq_lens: torch.Tensor
-    seq_lens_cpu: Optional[torch.Tensor]
     # The indices of output tokens in the token_to_kv_pool_allocator
     out_cache_loc: torch.Tensor
 
-    # The
+    # The sequence length tensor on CPU
+    seq_lens_cpu: Optional[torch.Tensor]
     seq_lens_sum: int
 
     # For logprob
@@ -1701,6 +1767,8 @@ class ModelWorkerBatch:
     global_num_tokens: Optional[List[int]]
     global_num_tokens_for_logprob: Optional[List[int]]
     can_run_dp_cuda_graph: bool
+    tbo_split_seq_index: Optional[int]
+    global_forward_mode: Optional[ForwardMode]
 
     # For extend
     extend_num_tokens: Optional[int]
@@ -1774,10 +1842,72 @@ def write_req_to_token_pool_triton(
     )
 
 
-
-
+def get_last_loc(
+    req_to_token: torch.Tensor,
+    req_pool_indices_tensor: torch.Tensor,
+    prefix_lens_tensor: torch.Tensor,
+) -> torch.Tensor:
+    if global_server_args_dict["attention_backend"] != "torch_native":
+        impl = get_last_loc_triton
+    else:
+        impl = get_last_loc_torch
+
+    return impl(req_to_token, req_pool_indices_tensor, prefix_lens_tensor)
+
+
+def get_last_loc_torch(
+    req_to_token: torch.Tensor,
+    req_pool_indices_tensor: torch.Tensor,
+    prefix_lens_tensor: torch.Tensor,
+) -> torch.Tensor:
     return torch.where(
         prefix_lens_tensor > 0,
         req_to_token[req_pool_indices_tensor, prefix_lens_tensor - 1],
         torch.full_like(prefix_lens_tensor, -1),
     )
+
+
+@triton.jit
+def get_last_loc_kernel(
+    req_to_token,
+    req_pool_indices_tensor,
+    prefix_lens_tensor,
+    result,
+    num_tokens,
+    req_to_token_stride,
+    BLOCK_SIZE: tl.constexpr,
+):
+    pid = tl.program_id(0)
+    offset = tl.arange(0, BLOCK_SIZE) + pid * BLOCK_SIZE
+    mask = offset < num_tokens
+
+    prefix_lens = tl.load(prefix_lens_tensor + offset, mask=mask, other=0)
+    req_pool_indices = tl.load(req_pool_indices_tensor + offset, mask=mask, other=0)
+
+    token_mask = prefix_lens > 0
+    token_index = req_pool_indices * req_to_token_stride + (prefix_lens - 1)
+    tokens = tl.load(req_to_token + token_index, mask=token_mask, other=-1)
+
+    tl.store(result + offset, tokens, mask=mask)
+
+
+def get_last_loc_triton(
+    req_to_token: torch.Tensor,
+    req_pool_indices_tensor: torch.Tensor,
+    prefix_lens_tensor: torch.Tensor,
+) -> torch.Tensor:
+    BLOCK_SIZE = 256
+    num_tokens = prefix_lens_tensor.shape[0]
+    result = torch.empty_like(prefix_lens_tensor)
+    grid = (triton.cdiv(num_tokens, BLOCK_SIZE),)
+
+    get_last_loc_kernel[grid](
+        req_to_token,
+        req_pool_indices_tensor,
+        prefix_lens_tensor,
+        result,
+        num_tokens,
+        req_to_token.stride(0),
+        BLOCK_SIZE,
+    )
+    return result
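get_last_loc returns, for every request in the batch, the KV-pool slot of its last prefix token (or -1 when there is no prefix), using the Triton kernel unless the torch_native attention backend is selected. A CPU-only sketch of the semantics via the torch fallback expression; the tensor values are invented:

    import torch

    # req_to_token[pool_index, position] -> slot in the token-to-KV allocator
    req_to_token = torch.tensor([[10, 11, 12, 13],
                                 [20, 21, 22, 23]])
    req_pool_indices = torch.tensor([0, 1, 1])
    prefix_lens = torch.tensor([3, 0, 2])

    # Same expression as get_last_loc_torch in the hunk above.
    last_loc = torch.where(
        prefix_lens > 0,
        req_to_token[req_pool_indices, prefix_lens - 1],
        torch.full_like(prefix_lens, -1),
    )
    assert last_loc.tolist() == [12, -1, 21]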
sglang/srt/managers/schedule_policy.py

@@ -22,11 +22,7 @@ from typing import Dict, List, Optional, Set, Union
 
 import torch
 
-from sglang.srt.managers.schedule_batch import
-    Req,
-    ScheduleBatch,
-    global_server_args_dict,
-)
+from sglang.srt.managers.schedule_batch import Req, ScheduleBatch
 from sglang.srt.mem_cache.base_prefix_cache import BasePrefixCache
 from sglang.srt.mem_cache.memory_pool import TokenToKVPoolAllocator
 from sglang.srt.mem_cache.radix_cache import RadixCache, TreeNode
@@ -468,6 +464,9 @@ class PrefillAdder:
            return AddReqResult.OTHER
 
        with self._lock_node(req.last_node):
+           if total_tokens > self.rem_total_tokens:
+               return AddReqResult.NO_TOKEN
+
            if (
                enable_hierarchical_cache
                and req.last_node_global is not None