sglang 0.4.6.post4__py3-none-any.whl → 0.4.7__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- sglang/bench_offline_throughput.py +16 -10
- sglang/bench_one_batch.py +5 -4
- sglang/bench_one_batch_server.py +86 -22
- sglang/bench_serving.py +197 -110
- sglang/compile_deep_gemm.py +4 -4
- sglang/lang/backend/runtime_endpoint.py +24 -1
- sglang/profiler.py +167 -0
- sglang/srt/_custom_ops.py +34 -0
- sglang/srt/configs/internvl.py +8 -12
- sglang/srt/configs/model_config.py +66 -29
- sglang/srt/constrained/base_grammar_backend.py +5 -2
- sglang/srt/constrained/llguidance_backend.py +9 -8
- sglang/srt/constrained/outlines_backend.py +5 -4
- sglang/srt/constrained/xgrammar_backend.py +18 -18
- sglang/srt/conversation.py +47 -9
- sglang/srt/custom_op.py +38 -3
- sglang/srt/debug_utils.py +74 -0
- sglang/srt/disaggregation/common/__init__.py +1 -0
- sglang/srt/disaggregation/common/conn.py +407 -0
- sglang/srt/disaggregation/decode.py +187 -134
- sglang/srt/disaggregation/decode_schedule_batch_mixin.py +142 -0
- sglang/srt/disaggregation/fake/conn.py +4 -13
- sglang/srt/disaggregation/kv_events.py +412 -0
- sglang/srt/disaggregation/launch_lb.py +140 -0
- sglang/srt/disaggregation/mini_lb.py +84 -70
- sglang/srt/disaggregation/mooncake/conn.py +441 -140
- sglang/srt/disaggregation/mooncake/transfer_engine.py +31 -14
- sglang/srt/disaggregation/nixl/conn.py +124 -442
- sglang/srt/disaggregation/prefill.py +128 -44
- sglang/srt/disaggregation/utils.py +154 -6
- sglang/srt/distributed/device_communicators/pymscclpp.py +315 -0
- sglang/srt/distributed/parallel_state.py +52 -5
- sglang/srt/distributed/utils.py +3 -3
- sglang/srt/entrypoints/EngineBase.py +11 -0
- sglang/srt/entrypoints/engine.py +129 -12
- sglang/srt/entrypoints/http_server.py +21 -6
- sglang/srt/entrypoints/http_server_engine.py +5 -2
- sglang/srt/function_call/base_format_detector.py +302 -0
- sglang/srt/function_call/core_types.py +34 -0
- sglang/srt/function_call/deepseekv3_detector.py +205 -0
- sglang/srt/function_call/ebnf_composer.py +248 -0
- sglang/srt/function_call/function_call_parser.py +202 -0
- sglang/srt/function_call/llama32_detector.py +93 -0
- sglang/srt/function_call/mistral_detector.py +131 -0
- sglang/srt/function_call/pythonic_detector.py +229 -0
- sglang/srt/function_call/qwen25_detector.py +121 -0
- sglang/srt/function_call/utils.py +52 -0
- sglang/srt/hf_transformers_utils.py +50 -7
- sglang/srt/layers/attention/aiter_backend.py +878 -0
- sglang/srt/layers/attention/base_attn_backend.py +4 -0
- sglang/srt/layers/attention/cutlass_mla_backend.py +2 -19
- sglang/srt/layers/attention/flashattention_backend.py +166 -35
- sglang/srt/layers/attention/flashinfer_backend.py +45 -1
- sglang/srt/layers/attention/flashinfer_mla_backend.py +45 -5
- sglang/srt/layers/attention/flashmla_backend.py +340 -78
- sglang/srt/layers/attention/intel_amx_backend.py +128 -0
- sglang/srt/layers/attention/tbo_backend.py +232 -0
- sglang/srt/layers/attention/torch_native_backend.py +3 -0
- sglang/srt/layers/attention/triton_backend.py +247 -5
- sglang/srt/layers/attention/triton_ops/extend_attention.py +12 -4
- sglang/srt/layers/attention/utils.py +2 -2
- sglang/srt/layers/attention/vision.py +1 -1
- sglang/srt/layers/communicator.py +517 -0
- sglang/srt/layers/dp_attention.py +6 -15
- sglang/srt/layers/layernorm.py +30 -19
- sglang/srt/layers/moe/cutlass_moe.py +370 -0
- sglang/srt/layers/moe/cutlass_moe_params.py +169 -0
- sglang/srt/layers/moe/ep_moe/kernels.py +60 -17
- sglang/srt/layers/moe/ep_moe/layer.py +195 -87
- sglang/srt/layers/moe/ep_moe/token_dispatcher.py +88 -8
- sglang/srt/layers/moe/fused_moe_native.py +4 -0
- sglang/srt/layers/moe/fused_moe_triton/configs/triton_3_2_0/E=257,N=128,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8,block_shape=[128, 128].json +146 -0
- sglang/srt/layers/moe/fused_moe_triton/configs/triton_3_2_0/E=257,N=128,device_name=NVIDIA_H20,dtype=fp8_w8a8,block_shape=[128, 128].json +146 -0
- sglang/srt/layers/moe/fused_moe_triton/configs/triton_3_2_0/E=257,N=256,device_name=NVIDIA_B200,dtype=fp8_w8a8,block_shape=[128, 128].json +146 -0
- sglang/srt/layers/moe/fused_moe_triton/configs/triton_3_2_0/E=257,N=256,device_name=NVIDIA_H20,dtype=fp8_w8a8,block_shape=[128, 128].json +146 -0
- sglang/srt/layers/moe/fused_moe_triton/configs/triton_3_2_0/E=257,N=256,device_name=NVIDIA_H200,dtype=fp8_w8a8,block_shape=[128, 128].json +146 -0
- sglang/srt/layers/moe/fused_moe_triton/configs/triton_3_2_0/E=8,N=7168,device_name=NVIDIA_H100_80GB_HBM3.json +146 -0
- sglang/srt/layers/moe/fused_moe_triton/configs/triton_3_3_1/E=257,N=256,device_name=NVIDIA_B200,dtype=fp8_w8a8,block_shape=[128, 128].json +146 -0
- sglang/srt/layers/moe/fused_moe_triton/configs/triton_3_3_1/E=8,N=7168,device_name=NVIDIA_H100_80GB_HBM3.json +146 -0
- sglang/srt/layers/moe/fused_moe_triton/fused_moe.py +220 -25
- sglang/srt/layers/moe/fused_moe_triton/layer.py +48 -4
- sglang/srt/layers/moe/topk.py +107 -24
- sglang/srt/layers/multimodal.py +70 -0
- sglang/srt/layers/quantization/__init__.py +10 -4
- sglang/srt/layers/quantization/blockwise_int8.py +3 -0
- sglang/srt/layers/quantization/compressed_tensors/compressed_tensors_moe.py +5 -0
- sglang/srt/layers/quantization/deep_gemm.py +60 -59
- sglang/srt/layers/quantization/fp8.py +113 -18
- sglang/srt/layers/quantization/fp8_kernel.py +118 -66
- sglang/srt/layers/quantization/fp8_utils.py +165 -43
- sglang/srt/layers/quantization/gptq.py +298 -6
- sglang/srt/layers/quantization/int8_kernel.py +18 -5
- sglang/srt/layers/quantization/modelopt_quant.py +334 -7
- sglang/srt/layers/quantization/moe_wna16.py +3 -0
- sglang/srt/layers/quantization/qoq.py +244 -0
- sglang/srt/layers/quantization/w8a8_fp8.py +3 -0
- sglang/srt/layers/quantization/w8a8_int8.py +3 -0
- sglang/srt/layers/rotary_embedding.py +6 -12
- sglang/srt/layers/sampler.py +80 -79
- sglang/srt/layers/utils.py +6 -0
- sglang/srt/lora/layers.py +12 -15
- sglang/srt/lora/lora.py +49 -5
- sglang/srt/lora/lora_manager.py +20 -8
- sglang/srt/lora/mem_pool.py +24 -16
- sglang/srt/lora/utils.py +17 -13
- sglang/srt/managers/data_parallel_controller.py +13 -5
- sglang/srt/managers/eplb_algorithms/__init__.py +63 -0
- sglang/srt/managers/eplb_algorithms/deepseek.py +223 -0
- sglang/srt/managers/eplb_algorithms/deepseek_vec.py +276 -0
- sglang/srt/managers/eplb_manager.py +96 -0
- sglang/srt/managers/expert_distribution.py +878 -56
- sglang/srt/managers/expert_location.py +448 -0
- sglang/srt/managers/expert_location_dispatch.py +108 -0
- sglang/srt/managers/io_struct.py +29 -5
- sglang/srt/managers/mm_utils.py +355 -151
- sglang/srt/managers/multimodal_processors/base_processor.py +299 -42
- sglang/srt/managers/multimodal_processors/deepseek_vl_v2.py +6 -1
- sglang/srt/managers/multimodal_processors/gemma3.py +15 -17
- sglang/srt/managers/multimodal_processors/internvl.py +18 -5
- sglang/srt/managers/multimodal_processors/janus_pro.py +7 -1
- sglang/srt/managers/multimodal_processors/kimi_vl.py +14 -32
- sglang/srt/managers/multimodal_processors/llava.py +3 -3
- sglang/srt/managers/multimodal_processors/minicpm.py +27 -32
- sglang/srt/managers/multimodal_processors/mllama4.py +6 -0
- sglang/srt/managers/multimodal_processors/phi4mm.py +87 -0
- sglang/srt/managers/multimodal_processors/pixtral.py +9 -9
- sglang/srt/managers/multimodal_processors/qwen_vl.py +35 -35
- sglang/srt/managers/schedule_batch.py +185 -55
- sglang/srt/managers/schedule_policy.py +4 -5
- sglang/srt/managers/scheduler.py +389 -154
- sglang/srt/managers/session_controller.py +1 -1
- sglang/srt/managers/tokenizer_manager.py +231 -39
- sglang/srt/managers/utils.py +0 -4
- sglang/srt/mem_cache/base_prefix_cache.py +3 -0
- sglang/srt/mem_cache/chunk_cache.py +3 -1
- sglang/srt/mem_cache/hiradix_cache.py +4 -4
- sglang/srt/mem_cache/memory_pool.py +74 -52
- sglang/srt/mem_cache/multimodal_cache.py +45 -0
- sglang/srt/mem_cache/radix_cache.py +58 -5
- sglang/srt/metrics/collector.py +11 -2
- sglang/srt/mm_utils.py +10 -0
- sglang/srt/model_executor/cuda_graph_runner.py +87 -65
- sglang/srt/model_executor/expert_location_updater.py +557 -0
- sglang/srt/model_executor/forward_batch_info.py +39 -14
- sglang/srt/model_executor/model_runner.py +231 -101
- sglang/srt/model_loader/loader.py +10 -6
- sglang/srt/model_loader/utils.py +67 -1
- sglang/srt/models/clip.py +5 -1
- sglang/srt/models/deepseek_nextn.py +1 -1
- sglang/srt/models/deepseek_v2.py +732 -403
- sglang/srt/models/exaone.py +8 -3
- sglang/srt/models/gemma3_causal.py +7 -0
- sglang/srt/models/gemma3_mm.py +75 -33
- sglang/srt/models/idefics2.py +342 -0
- sglang/srt/models/kimi_vl.py +4 -4
- sglang/srt/models/llama.py +1 -1
- sglang/srt/models/llama4.py +10 -2
- sglang/srt/models/llava.py +26 -18
- sglang/srt/models/mimo_mtp.py +220 -0
- sglang/srt/models/minicpmo.py +7 -17
- sglang/srt/models/minicpmv.py +3 -295
- sglang/srt/models/mistral.py +71 -1
- sglang/srt/models/mllama.py +3 -3
- sglang/srt/models/phi4mm.py +512 -0
- sglang/srt/models/qwen2.py +133 -35
- sglang/srt/models/qwen2_5_vl.py +5 -3
- sglang/srt/models/qwen2_eagle.py +4 -1
- sglang/srt/models/qwen2_moe.py +206 -69
- sglang/srt/models/qwen2_vl.py +3 -3
- sglang/srt/models/qwen3.py +92 -19
- sglang/srt/models/qwen3_moe.py +457 -55
- sglang/srt/models/registry.py +9 -1
- sglang/srt/models/siglip.py +294 -0
- sglang/srt/models/transformers.py +291 -0
- sglang/srt/openai_api/adapter.py +114 -40
- sglang/srt/openai_api/protocol.py +37 -2
- sglang/srt/openai_api/utils.py +172 -0
- sglang/srt/operations.py +189 -0
- sglang/srt/operations_strategy.py +207 -0
- sglang/srt/sampling/sampling_batch_info.py +13 -1
- sglang/srt/sampling/sampling_params.py +2 -1
- sglang/srt/server_args.py +235 -38
- sglang/srt/speculative/build_eagle_tree.py +8 -8
- sglang/srt/speculative/eagle_draft_cuda_graph_runner.py +8 -11
- sglang/srt/speculative/eagle_draft_extend_cuda_graph_runner.py +253 -0
- sglang/srt/speculative/eagle_utils.py +181 -90
- sglang/srt/speculative/eagle_worker.py +146 -21
- sglang/srt/two_batch_overlap.py +635 -0
- sglang/srt/utils.py +197 -19
- sglang/test/runners.py +16 -7
- sglang/test/send_one.py +4 -0
- sglang/test/test_cutlass_moe.py +278 -0
- sglang/test/test_fp4_moe.py +248 -0
- sglang/test/test_utils.py +81 -42
- sglang/utils.py +2 -2
- sglang/version.py +1 -1
- {sglang-0.4.6.post4.dist-info → sglang-0.4.7.dist-info}/METADATA +31 -19
- sglang-0.4.7.dist-info/RECORD +699 -0
- {sglang-0.4.6.post4.dist-info → sglang-0.4.7.dist-info}/WHEEL +1 -1
- sglang/srt/function_call_parser.py +0 -858
- sglang/srt/platforms/interface.py +0 -371
- sglang-0.4.6.post4.dist-info/RECORD +0 -646
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=1,N=14336,device_name=NVIDIA_A100-SXM4-80GB,dtype=int8_w8a16.json → triton_3_1_0/E=1,N=14336,device_name=NVIDIA_A100-SXM4-80GB,dtype=int8_w8a16.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=1,N=14336,device_name=NVIDIA_A100-SXM4-80GB.json → triton_3_1_0/E=1,N=14336,device_name=NVIDIA_A100-SXM4-80GB.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=1,N=1792,device_name=NVIDIA_A100-SXM4-80GB,dtype=int8_w8a16.json → triton_3_1_0/E=1,N=1792,device_name=NVIDIA_A100-SXM4-80GB,dtype=int8_w8a16.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=1,N=1792,device_name=NVIDIA_A100-SXM4-80GB.json → triton_3_1_0/E=1,N=1792,device_name=NVIDIA_A100-SXM4-80GB.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=1,N=3072,device_name=NVIDIA_A100-SXM4-80GB,dtype=int8_w8a16.json → triton_3_1_0/E=1,N=3072,device_name=NVIDIA_A100-SXM4-80GB,dtype=int8_w8a16.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=1,N=3072,device_name=NVIDIA_H100_80GB_HBM3,dtype=int8_w8a16.json → triton_3_1_0/E=1,N=3072,device_name=NVIDIA_H100_80GB_HBM3,dtype=int8_w8a16.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=1,N=3072,device_name=NVIDIA_H100_80GB_HBM3.json → triton_3_1_0/E=1,N=3072,device_name=NVIDIA_H100_80GB_HBM3.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=1,N=3584,device_name=NVIDIA_A100-SXM4-80GB,dtype=int8_w8a16.json → triton_3_1_0/E=1,N=3584,device_name=NVIDIA_A100-SXM4-80GB,dtype=int8_w8a16.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=1,N=3584,device_name=NVIDIA_A100-SXM4-80GB.json → triton_3_1_0/E=1,N=3584,device_name=NVIDIA_A100-SXM4-80GB.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=1,N=7168,device_name=NVIDIA_A100-SXM4-80GB,dtype=int8_w8a16.json → triton_3_1_0/E=1,N=7168,device_name=NVIDIA_A100-SXM4-80GB,dtype=int8_w8a16.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=1,N=7168,device_name=NVIDIA_A100-SXM4-80GB.json → triton_3_1_0/E=1,N=7168,device_name=NVIDIA_A100-SXM4-80GB.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=144,N=512,device_name=NVIDIA_H100_80GB_HBM3.json → triton_3_1_0/E=144,N=512,device_name=NVIDIA_H100_80GB_HBM3.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=16,N=1024,device_name=NVIDIA_H100_80GB_HBM3.json → triton_3_1_0/E=16,N=1024,device_name=NVIDIA_H100_80GB_HBM3.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=16,N=1024,device_name=NVIDIA_H200.json → triton_3_1_0/E=16,N=1024,device_name=NVIDIA_H200.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=16,N=1344,device_name=NVIDIA_A100-SXM4-40GB.json → triton_3_1_0/E=16,N=1344,device_name=NVIDIA_A100-SXM4-40GB.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=16,N=1344,device_name=NVIDIA_A100-SXM4-80GB.json → triton_3_1_0/E=16,N=1344,device_name=NVIDIA_A100-SXM4-80GB.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=16,N=1344,device_name=NVIDIA_H100_80GB_HBM3.json → triton_3_1_0/E=16,N=1344,device_name=NVIDIA_H100_80GB_HBM3.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=16,N=14336,device_name=NVIDIA_A100-SXM4-80GB,dtype=int8_w8a16.json → triton_3_1_0/E=16,N=14336,device_name=NVIDIA_A100-SXM4-80GB,dtype=int8_w8a16.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=16,N=14336,device_name=NVIDIA_A100-SXM4-80GB.json → triton_3_1_0/E=16,N=14336,device_name=NVIDIA_A100-SXM4-80GB.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=16,N=1792,device_name=NVIDIA_A100-SXM4-80GB,dtype=int8_w8a16.json → triton_3_1_0/E=16,N=1792,device_name=NVIDIA_A100-SXM4-80GB,dtype=int8_w8a16.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=16,N=1792,device_name=NVIDIA_A100-SXM4-80GB.json → triton_3_1_0/E=16,N=1792,device_name=NVIDIA_A100-SXM4-80GB.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=16,N=2048,device_name=NVIDIA_H100_80GB_HBM3.json → triton_3_1_0/E=16,N=2048,device_name=NVIDIA_H100_80GB_HBM3.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=16,N=2688,device_name=NVIDIA_A100-SXM4-80GB.json → triton_3_1_0/E=16,N=2688,device_name=NVIDIA_A100-SXM4-80GB.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=16,N=2688,device_name=NVIDIA_H100_80GB_HBM3.json → triton_3_1_0/E=16,N=2688,device_name=NVIDIA_H100_80GB_HBM3.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=16,N=3072,device_name=NVIDIA_A100-SXM4-80GB,dtype=int8_w8a16.json → triton_3_1_0/E=16,N=3072,device_name=NVIDIA_A100-SXM4-80GB,dtype=int8_w8a16.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=16,N=3072,device_name=NVIDIA_H100_80GB_HBM3,dtype=int8_w8a16.json → triton_3_1_0/E=16,N=3072,device_name=NVIDIA_H100_80GB_HBM3,dtype=int8_w8a16.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=16,N=3200,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8.json → triton_3_1_0/E=16,N=3200,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=16,N=3584,device_name=NVIDIA_A100-SXM4-80GB,dtype=int8_w8a16.json → triton_3_1_0/E=16,N=3584,device_name=NVIDIA_A100-SXM4-80GB,dtype=int8_w8a16.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=16,N=3584,device_name=NVIDIA_A100-SXM4-80GB.json → triton_3_1_0/E=16,N=3584,device_name=NVIDIA_A100-SXM4-80GB.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=16,N=6400,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8.json → triton_3_1_0/E=16,N=6400,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=16,N=7168,device_name=NVIDIA_A100-SXM4-80GB,dtype=int8_w8a16.json → triton_3_1_0/E=16,N=7168,device_name=NVIDIA_A100-SXM4-80GB,dtype=int8_w8a16.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=16,N=7168,device_name=NVIDIA_A100-SXM4-80GB.json → triton_3_1_0/E=16,N=7168,device_name=NVIDIA_A100-SXM4-80GB.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=16,N=7168,device_name=NVIDIA_H100_80GB_HBM3,dtype=int8_w8a16.json → triton_3_1_0/E=16,N=7168,device_name=NVIDIA_H100_80GB_HBM3,dtype=int8_w8a16.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=16,N=800,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8.json → triton_3_1_0/E=16,N=800,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=160,N=192,device_name=NVIDIA_A800-SXM4-80GB.json → triton_3_1_0/E=160,N=192,device_name=NVIDIA_A800-SXM4-80GB.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=20,N=2048,device_name=NVIDIA_H100_80GB_HBM3.json → triton_3_1_0/E=20,N=2048,device_name=NVIDIA_H100_80GB_HBM3.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=24,N=1024,device_name=NVIDIA_H100_80GB_HBM3.json → triton_3_1_0/E=24,N=1024,device_name=NVIDIA_H100_80GB_HBM3.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=256,N=128,device_name=NVIDIA_A100-SXM4-80GB,dtype=int8_w8a8,block_shape=[128, 128].json → triton_3_1_0/E=256,N=128,device_name=NVIDIA_A100-SXM4-80GB,dtype=int8_w8a8,block_shape=[128, 128].json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=256,N=128,device_name=NVIDIA_A100-SXM4-80GB,dtype=int8_w8a8.json → triton_3_1_0/E=256,N=128,device_name=NVIDIA_A100-SXM4-80GB,dtype=int8_w8a8.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=256,N=128,device_name=NVIDIA_A800-SXM4-80GB,dtype=int8_w8a8,block_shape=[128, 128].json → triton_3_1_0/E=256,N=128,device_name=NVIDIA_A800-SXM4-80GB,dtype=int8_w8a8,block_shape=[128, 128].json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=256,N=128,device_name=NVIDIA_A800-SXM4-80GB,dtype=int8_w8a8.json → triton_3_1_0/E=256,N=128,device_name=NVIDIA_A800-SXM4-80GB,dtype=int8_w8a8.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=256,N=128,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8,block_shape=[128, 128].json → triton_3_1_0/E=256,N=128,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8,block_shape=[128, 128].json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=256,N=128,device_name=NVIDIA_H20,block_shape=[128, 128].json → triton_3_1_0/E=256,N=128,device_name=NVIDIA_H20,block_shape=[128, 128].json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=256,N=128,device_name=NVIDIA_H20,dtype=fp8_w8a8,block_shape=[128, 128].json → triton_3_1_0/E=256,N=128,device_name=NVIDIA_H20,dtype=fp8_w8a8,block_shape=[128, 128].json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=256,N=128,device_name=NVIDIA_L20Y,dtype=fp8_w8a8,block_shape=[128, 128].json → triton_3_1_0/E=256,N=128,device_name=NVIDIA_L20Y,dtype=fp8_w8a8,block_shape=[128, 128].json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=256,N=256,device_name=AMD_Instinct_MI300X,dtype=fp8_w8a8,block_shape=[128, 128].json → triton_3_1_0/E=256,N=256,device_name=AMD_Instinct_MI300X,dtype=fp8_w8a8,block_shape=[128, 128].json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=256,N=256,device_name=AMD_Instinct_MI325X,dtype=fp8_w8a8,block_shape=[128, 128].json → triton_3_1_0/E=256,N=256,device_name=AMD_Instinct_MI325X,dtype=fp8_w8a8,block_shape=[128, 128].json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=256,N=256,device_name=AMD_Radeon_Graphics,dtype=fp8_w8a8,block_shape=[128, 128].json → triton_3_1_0/E=256,N=256,device_name=AMD_Radeon_Graphics,dtype=fp8_w8a8,block_shape=[128, 128].json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=256,N=256,device_name=NVIDIA_B200,dtype=fp8_w8a8,block_shape=[128, 128].json → triton_3_1_0/E=256,N=256,device_name=NVIDIA_B200,dtype=fp8_w8a8,block_shape=[128, 128].json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=256,N=256,device_name=NVIDIA_H20,dtype=int8_w8a8,block_shape=[128, 128].json → triton_3_1_0/E=256,N=256,device_name=NVIDIA_H20,dtype=int8_w8a8,block_shape=[128, 128].json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=256,N=256,device_name=NVIDIA_H200,dtype=fp8_w8a8,block_shape=[128, 128].json → triton_3_1_0/E=256,N=256,device_name=NVIDIA_H200,dtype=fp8_w8a8,block_shape=[128, 128].json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=256,N=64,device_name=NVIDIA_A800-SXM4-80GB.json → triton_3_1_0/E=256,N=64,device_name=NVIDIA_A800-SXM4-80GB.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=256,N=64,device_name=NVIDIA_L20,dtype=int8_w8a8.json → triton_3_1_0/E=256,N=64,device_name=NVIDIA_L20,dtype=int8_w8a8.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=256,N=64,device_name=NVIDIA_L40S,dtype=int8_w8a8.json → triton_3_1_0/E=256,N=64,device_name=NVIDIA_L40S,dtype=int8_w8a8.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=64,N=1024,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8,block_shape=[128, 128].json → triton_3_1_0/E=64,N=1024,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8,block_shape=[128, 128].json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=64,N=1280,device_name=NVIDIA_A100-SXM4-80GB.json → triton_3_1_0/E=64,N=1280,device_name=NVIDIA_A100-SXM4-80GB.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=64,N=1280,device_name=NVIDIA_A800-SXM4-80GB.json → triton_3_1_0/E=64,N=1280,device_name=NVIDIA_A800-SXM4-80GB.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=64,N=1280,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8.json → triton_3_1_0/E=64,N=1280,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=64,N=1280,device_name=NVIDIA_H100_80GB_HBM3.json → triton_3_1_0/E=64,N=1280,device_name=NVIDIA_H100_80GB_HBM3.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=64,N=1280,device_name=NVIDIA_H200,dtype=fp8_w8a8.json → triton_3_1_0/E=64,N=1280,device_name=NVIDIA_H200,dtype=fp8_w8a8.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=64,N=1280,device_name=NVIDIA_H200.json → triton_3_1_0/E=64,N=1280,device_name=NVIDIA_H200.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=64,N=2560,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8.json → triton_3_1_0/E=64,N=2560,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=64,N=2560,device_name=NVIDIA_H200,dtype=fp8_w8a8.json → triton_3_1_0/E=64,N=2560,device_name=NVIDIA_H200,dtype=fp8_w8a8.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=64,N=2560,device_name=NVIDIA_H200.json → triton_3_1_0/E=64,N=2560,device_name=NVIDIA_H200.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=64,N=320,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8.json → triton_3_1_0/E=64,N=320,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=64,N=320,device_name=NVIDIA_H100_80GB_HBM3.json → triton_3_1_0/E=64,N=320,device_name=NVIDIA_H100_80GB_HBM3.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=64,N=320,device_name=NVIDIA_H200,dtype=fp8_w8a8.json → triton_3_1_0/E=64,N=320,device_name=NVIDIA_H200,dtype=fp8_w8a8.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=64,N=320,device_name=NVIDIA_H200.json → triton_3_1_0/E=64,N=320,device_name=NVIDIA_H200.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=64,N=512,device_name=NVIDIA_H200,dtype=fp8_w8a8,block_shape=[128, 128].json → triton_3_1_0/E=64,N=512,device_name=NVIDIA_H200,dtype=fp8_w8a8,block_shape=[128, 128].json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=64,N=640,device_name=NVIDIA_A100-SXM4-80GB.json → triton_3_1_0/E=64,N=640,device_name=NVIDIA_A100-SXM4-80GB.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=64,N=640,device_name=NVIDIA_A800-SXM4-80GB.json → triton_3_1_0/E=64,N=640,device_name=NVIDIA_A800-SXM4-80GB.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=64,N=640,device_name=NVIDIA_GeForce_RTX_4090,dtype=fp8_w8a8.json → triton_3_1_0/E=64,N=640,device_name=NVIDIA_GeForce_RTX_4090,dtype=fp8_w8a8.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=64,N=640,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8.json → triton_3_1_0/E=64,N=640,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=64,N=640,device_name=NVIDIA_H100_80GB_HBM3.json → triton_3_1_0/E=64,N=640,device_name=NVIDIA_H100_80GB_HBM3.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=64,N=640,device_name=NVIDIA_H200,dtype=fp8_w8a8.json → triton_3_1_0/E=64,N=640,device_name=NVIDIA_H200,dtype=fp8_w8a8.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=64,N=640,device_name=NVIDIA_H200.json → triton_3_1_0/E=64,N=640,device_name=NVIDIA_H200.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=8,N=14336,device_name=AMD_Instinct_MI300X.json → triton_3_1_0/E=8,N=14336,device_name=AMD_Instinct_MI300X.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=8,N=14336,device_name=AMD_Instinct_MI325X.json → triton_3_1_0/E=8,N=14336,device_name=AMD_Instinct_MI325X.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=8,N=14336,device_name=AMD_Radeon_Graphics.json → triton_3_1_0/E=8,N=14336,device_name=AMD_Radeon_Graphics.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=8,N=14336,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8.json → triton_3_1_0/E=8,N=14336,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=8,N=14336,device_name=NVIDIA_H200,dtype=fp8_w8a8.json → triton_3_1_0/E=8,N=14336,device_name=NVIDIA_H200,dtype=fp8_w8a8.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=8,N=14336,device_name=NVIDIA_H200.json → triton_3_1_0/E=8,N=14336,device_name=NVIDIA_H200.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=8,N=1792,device_name=AMD_Instinct_MI300X.json → triton_3_1_0/E=8,N=1792,device_name=AMD_Instinct_MI300X.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=8,N=1792,device_name=AMD_Instinct_MI325X.json → triton_3_1_0/E=8,N=1792,device_name=AMD_Instinct_MI325X.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=8,N=1792,device_name=AMD_Radeon_Graphics.json → triton_3_1_0/E=8,N=1792,device_name=AMD_Radeon_Graphics.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=8,N=1792,device_name=NVIDIA_A100-SXM4-40GB.json → triton_3_1_0/E=8,N=1792,device_name=NVIDIA_A100-SXM4-40GB.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=8,N=1792,device_name=NVIDIA_A100-SXM4-80GB.json → triton_3_1_0/E=8,N=1792,device_name=NVIDIA_A100-SXM4-80GB.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=8,N=1792,device_name=NVIDIA_H100_80GB_HBM3.json → triton_3_1_0/E=8,N=1792,device_name=NVIDIA_H100_80GB_HBM3.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=8,N=1792,device_name=NVIDIA_H200,dtype=fp8_w8a8.json → triton_3_1_0/E=8,N=1792,device_name=NVIDIA_H200,dtype=fp8_w8a8.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=8,N=1792,device_name=NVIDIA_H200.json → triton_3_1_0/E=8,N=1792,device_name=NVIDIA_H200.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=8,N=2048,device_name=NVIDIA_A100-SXM4-80GB.json → triton_3_1_0/E=8,N=2048,device_name=NVIDIA_A100-SXM4-80GB.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=8,N=2048,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8.json → triton_3_1_0/E=8,N=2048,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=8,N=2048,device_name=NVIDIA_H100_80GB_HBM3.json → triton_3_1_0/E=8,N=2048,device_name=NVIDIA_H100_80GB_HBM3.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=8,N=2048,device_name=NVIDIA_H200,dtype=fp8_w8a8.json → triton_3_1_0/E=8,N=2048,device_name=NVIDIA_H200,dtype=fp8_w8a8.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=8,N=2048,device_name=NVIDIA_H200.json → triton_3_1_0/E=8,N=2048,device_name=NVIDIA_H200.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=8,N=3584,device_name=AMD_Instinct_MI300X.json → triton_3_1_0/E=8,N=3584,device_name=AMD_Instinct_MI300X.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=8,N=3584,device_name=AMD_Instinct_MI325X.json → triton_3_1_0/E=8,N=3584,device_name=AMD_Instinct_MI325X.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=8,N=3584,device_name=AMD_Radeon_Graphics.json → triton_3_1_0/E=8,N=3584,device_name=AMD_Radeon_Graphics.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=8,N=3584,device_name=NVIDIA_A100-SXM4-40GB.json → triton_3_1_0/E=8,N=3584,device_name=NVIDIA_A100-SXM4-40GB.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=8,N=3584,device_name=NVIDIA_A100-SXM4-80GB.json → triton_3_1_0/E=8,N=3584,device_name=NVIDIA_A100-SXM4-80GB.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=8,N=3584,device_name=NVIDIA_GeForce_RTX_4090,dtype=fp8_w8a8.json → triton_3_1_0/E=8,N=3584,device_name=NVIDIA_GeForce_RTX_4090,dtype=fp8_w8a8.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=8,N=3584,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8.json → triton_3_1_0/E=8,N=3584,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=8,N=3584,device_name=NVIDIA_H100_80GB_HBM3.json → triton_3_1_0/E=8,N=3584,device_name=NVIDIA_H100_80GB_HBM3.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=8,N=3584,device_name=NVIDIA_H200,dtype=fp8_w8a8.json → triton_3_1_0/E=8,N=3584,device_name=NVIDIA_H200,dtype=fp8_w8a8.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=8,N=3584,device_name=NVIDIA_H200.json → triton_3_1_0/E=8,N=3584,device_name=NVIDIA_H200.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=8,N=3584,device_name=NVIDIA_L40S.json → triton_3_1_0/E=8,N=3584,device_name=NVIDIA_L40S.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=8,N=4096,device_name=AMD_Instinct_MI300X,dtype=fp8_w8a8.json → triton_3_1_0/E=8,N=4096,device_name=AMD_Instinct_MI300X,dtype=fp8_w8a8.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=8,N=4096,device_name=AMD_Instinct_MI325X,dtype=fp8_w8a8.json → triton_3_1_0/E=8,N=4096,device_name=AMD_Instinct_MI325X,dtype=fp8_w8a8.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=8,N=4096,device_name=AMD_Radeon_Graphics,dtype=fp8_w8a8.json → triton_3_1_0/E=8,N=4096,device_name=AMD_Radeon_Graphics,dtype=fp8_w8a8.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=8,N=4096,device_name=NVIDIA_A100-SXM4-80GB.json → triton_3_1_0/E=8,N=4096,device_name=NVIDIA_A100-SXM4-80GB.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=8,N=4096,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8.json → triton_3_1_0/E=8,N=4096,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=8,N=4096,device_name=NVIDIA_H100_80GB_HBM3.json → triton_3_1_0/E=8,N=4096,device_name=NVIDIA_H100_80GB_HBM3.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=8,N=4096,device_name=NVIDIA_H200,dtype=fp8_w8a8.json → triton_3_1_0/E=8,N=4096,device_name=NVIDIA_H200,dtype=fp8_w8a8.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=8,N=4096,device_name=NVIDIA_H200.json → triton_3_1_0/E=8,N=4096,device_name=NVIDIA_H200.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=8,N=7168,device_name=AMD_Instinct_MI300X.json → triton_3_1_0/E=8,N=7168,device_name=AMD_Instinct_MI300X.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=8,N=7168,device_name=AMD_Instinct_MI325X.json → triton_3_1_0/E=8,N=7168,device_name=AMD_Instinct_MI325X.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=8,N=7168,device_name=AMD_Radeon_Graphics.json → triton_3_1_0/E=8,N=7168,device_name=AMD_Radeon_Graphics.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=8,N=7168,device_name=NVIDIA_A100-SXM4-80GB.json → triton_3_1_0/E=8,N=7168,device_name=NVIDIA_A100-SXM4-80GB.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=8,N=7168,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8.json → triton_3_1_0/E=8,N=7168,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=8,N=7168,device_name=NVIDIA_H100_80GB_HBM3.json → triton_3_1_0/E=8,N=7168,device_name=NVIDIA_H100_80GB_HBM3.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=8,N=7168,device_name=NVIDIA_H200,dtype=fp8_w8a8.json → triton_3_1_0/E=8,N=7168,device_name=NVIDIA_H200,dtype=fp8_w8a8.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=8,N=7168,device_name=NVIDIA_H200.json → triton_3_1_0/E=8,N=7168,device_name=NVIDIA_H200.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=8,N=8192,device_name=AMD_Instinct_MI300X,dtype=fp8_w8a8.json → triton_3_1_0/E=8,N=8192,device_name=AMD_Instinct_MI300X,dtype=fp8_w8a8.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=8,N=8192,device_name=AMD_Instinct_MI325X,dtype=fp8_w8a8.json → triton_3_1_0/E=8,N=8192,device_name=AMD_Instinct_MI325X,dtype=fp8_w8a8.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=8,N=8192,device_name=AMD_Radeon_Graphics,dtype=fp8_w8a8.json → triton_3_1_0/E=8,N=8192,device_name=AMD_Radeon_Graphics,dtype=fp8_w8a8.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=8,N=8192,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8.json → triton_3_1_0/E=8,N=8192,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=8,N=8192,device_name=NVIDIA_H200,dtype=fp8_w8a8.json → triton_3_1_0/E=8,N=8192,device_name=NVIDIA_H200,dtype=fp8_w8a8.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=128,N=192,device_name=NVIDIA_A800-SXM4-80GB.json → triton_3_2_0/E=128,N=192,device_name=NVIDIA_A800-SXM4-80GB.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=128,N=192,device_name=NVIDIA_H100_80GB_HBM3.json → triton_3_2_0/E=128,N=192,device_name=NVIDIA_H100_80GB_HBM3.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=128,N=192,device_name=NVIDIA_H20.json → triton_3_2_0/E=128,N=192,device_name=NVIDIA_H20.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=128,N=192,device_name=NVIDIA_H200.json → triton_3_2_0/E=128,N=192,device_name=NVIDIA_H200.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=128,N=384,device_name=NVIDIA_H100_80GB_HBM3.json → triton_3_2_0/E=128,N=384,device_name=NVIDIA_H100_80GB_HBM3.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=128,N=384,device_name=NVIDIA_H20,dtype=fp8_w8a8,block_shape=[128, 128].json → triton_3_2_0/E=128,N=384,device_name=NVIDIA_H20,dtype=fp8_w8a8,block_shape=[128, 128].json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=128,N=384,device_name=NVIDIA_H20.json → triton_3_2_0/E=128,N=384,device_name=NVIDIA_H20.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=128,N=384,device_name=NVIDIA_H200,dtype=fp8_w8a8,block_shape=[128, 128].json → triton_3_2_0/E=128,N=384,device_name=NVIDIA_H200,dtype=fp8_w8a8,block_shape=[128, 128].json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=128,N=384,device_name=NVIDIA_H200.json → triton_3_2_0/E=128,N=384,device_name=NVIDIA_H200.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=128,N=512,device_name=NVIDIA_H100_80GB_HBM3.json → triton_3_2_0/E=128,N=512,device_name=NVIDIA_H100_80GB_HBM3.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=128,N=768,device_name=NVIDIA_A800-SXM4-80GB.json → triton_3_2_0/E=128,N=768,device_name=NVIDIA_A800-SXM4-80GB.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=128,N=768,device_name=NVIDIA_H100_80GB_HBM3.json → triton_3_2_0/E=128,N=768,device_name=NVIDIA_H100_80GB_HBM3.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=128,N=768,device_name=NVIDIA_H20,dtype=fp8_w8a8,block_shape=[128, 128].json → triton_3_2_0/E=128,N=768,device_name=NVIDIA_H20,dtype=fp8_w8a8,block_shape=[128, 128].json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=128,N=768,device_name=NVIDIA_H20.json → triton_3_2_0/E=128,N=768,device_name=NVIDIA_H20.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=128,N=768,device_name=NVIDIA_H200,dtype=fp8_w8a8,block_shape=[128, 128].json → triton_3_2_0/E=128,N=768,device_name=NVIDIA_H200,dtype=fp8_w8a8,block_shape=[128, 128].json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=128,N=768,device_name=NVIDIA_H200.json → triton_3_2_0/E=128,N=768,device_name=NVIDIA_H200.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=128,N=96,device_name=NVIDIA_H20.json → triton_3_2_0/E=128,N=96,device_name=NVIDIA_H20.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=264,N=128,device_name=NVIDIA_A800-SXM4-80GB,dtype=int8_w8a8.json → triton_3_2_0/E=264,N=128,device_name=NVIDIA_A800-SXM4-80GB,dtype=int8_w8a8.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=264,N=256,device_name=NVIDIA_B200,dtype=fp8_w8a8,block_shape=[128, 128].json → triton_3_2_0/E=264,N=256,device_name=NVIDIA_B200,dtype=fp8_w8a8,block_shape=[128, 128].json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=264,N=256,device_name=NVIDIA_H20,dtype=fp8_w8a8,block_shape=[128, 128].json → triton_3_2_0/E=264,N=256,device_name=NVIDIA_H20,dtype=fp8_w8a8,block_shape=[128, 128].json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=264,N=256,device_name=NVIDIA_H200,dtype=fp8_w8a8,block_shape=[128, 128].json → triton_3_2_0/E=264,N=256,device_name=NVIDIA_H200,dtype=fp8_w8a8,block_shape=[128, 128].json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=272,N=128,device_name=NVIDIA_A800-SXM4-80GB,dtype=int8_w8a8.json → triton_3_2_0/E=272,N=128,device_name=NVIDIA_A800-SXM4-80GB,dtype=int8_w8a8.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=272,N=128,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8,block_shape=[128, 128].json → triton_3_2_0/E=272,N=128,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8,block_shape=[128, 128].json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=272,N=128,device_name=NVIDIA_H20,dtype=fp8_w8a8,block_shape=[128, 128].json → triton_3_2_0/E=272,N=128,device_name=NVIDIA_H20,dtype=fp8_w8a8,block_shape=[128, 128].json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=272,N=64,device_name=NVIDIA_A800-SXM4-80GB.json → triton_3_2_0/E=272,N=64,device_name=NVIDIA_A800-SXM4-80GB.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=288,N=64,device_name=NVIDIA_A800-SXM4-80GB.json → triton_3_2_0/E=288,N=64,device_name=NVIDIA_A800-SXM4-80GB.json} +0 -0
- /sglang/srt/models/{xiaomi_mimo.py → mimo.py} +0 -0
- {sglang-0.4.6.post4.dist-info → sglang-0.4.7.dist-info}/licenses/LICENSE +0 -0
- {sglang-0.4.6.post4.dist-info → sglang-0.4.7.dist-info}/top_level.txt +0 -0
sglang/bench_serving.py
CHANGED
@@ -24,6 +24,7 @@ import warnings
|
|
24
24
|
from argparse import ArgumentParser
|
25
25
|
from dataclasses import dataclass, field
|
26
26
|
from datetime import datetime
|
27
|
+
from json import JSONDecodeError
|
27
28
|
from pathlib import Path
|
28
29
|
from typing import Any, AsyncGenerator, Dict, List, Optional, Tuple, Union
|
29
30
|
|
@@ -38,7 +39,6 @@ from transformers import (
|
|
38
39
|
PreTrainedTokenizerFast,
|
39
40
|
)
|
40
41
|
|
41
|
-
AIOHTTP_TIMEOUT = aiohttp.ClientTimeout(total=6 * 60 * 60)
|
42
42
|
ASSISTANT_SUFFIX = "Assistant:"
|
43
43
|
|
44
44
|
global args
|
@@ -50,6 +50,19 @@ def _get_bool_env_var(name: str, default: str = "false") -> bool:
|
|
50
50
|
return value.lower() in ("true", "1")
|
51
51
|
|
52
52
|
|
53
|
+
def _create_bench_client_session():
|
54
|
+
# When the pressure is big, the read buffer could be full before aio thread read
|
55
|
+
# the content. We increase the read_bufsize from 64K to 10M.
|
56
|
+
# Define constants for timeout and buffer size for clarity and maintainability
|
57
|
+
BENCH_AIOHTTP_TIMEOUT_SECONDS = 6 * 60 * 60 # 6 hours
|
58
|
+
BENCH_AIOHTTP_READ_BUFSIZE_BYTES = 10 * 1024**2 # 10 MB
|
59
|
+
|
60
|
+
aiohttp_timeout = aiohttp.ClientTimeout(total=BENCH_AIOHTTP_TIMEOUT_SECONDS)
|
61
|
+
return aiohttp.ClientSession(
|
62
|
+
timeout=aiohttp_timeout, read_bufsize=BENCH_AIOHTTP_READ_BUFSIZE_BYTES
|
63
|
+
)
|
64
|
+
|
65
|
+
|
53
66
|
@dataclass
|
54
67
|
class RequestFuncInput:
|
55
68
|
prompt: str
|
@@ -73,6 +86,12 @@ class RequestFuncOutput:
|
|
73
86
|
error: str = ""
|
74
87
|
output_len: int = 0
|
75
88
|
|
89
|
+
@staticmethod
|
90
|
+
def init_new(request_func_input: RequestFuncInput):
|
91
|
+
output = RequestFuncOutput()
|
92
|
+
output.prompt_len = request_func_input.prompt_len
|
93
|
+
return output
|
94
|
+
|
76
95
|
|
77
96
|
def remove_prefix(text: str, prefix: str) -> str:
|
78
97
|
return text[len(prefix) :] if text.startswith(prefix) else text
|
@@ -99,7 +118,7 @@ async def async_request_trt_llm(
|
|
99
118
|
api_url = request_func_input.api_url
|
100
119
|
assert api_url.endswith("generate_stream")
|
101
120
|
|
102
|
-
async with aiohttp.ClientSession(timeout=AIOHTTP_TIMEOUT) as session:
|
121
|
+
async with _create_bench_client_session() as session:
|
103
122
|
payload = {
|
104
123
|
"accumulate_tokens": True,
|
105
124
|
"text_input": request_func_input.prompt,
|
@@ -114,8 +133,7 @@ async def async_request_trt_llm(
|
|
114
133
|
if args.disable_ignore_eos:
|
115
134
|
del payload["min_length"]
|
116
135
|
del payload["end_id"]
|
117
|
-
output = RequestFuncOutput()
|
118
|
-
output.prompt_len = request_func_input.prompt_len
|
136
|
+
output = RequestFuncOutput.init_new(request_func_input)
|
119
137
|
|
120
138
|
ttft = 0.0
|
121
139
|
st = time.perf_counter()
|
@@ -173,7 +191,7 @@ async def async_request_openai_completions(
|
|
173
191
|
|
174
192
|
prompt = request_func_input.prompt
|
175
193
|
|
176
|
-
async with aiohttp.ClientSession(timeout=AIOHTTP_TIMEOUT) as session:
|
194
|
+
async with _create_bench_client_session() as session:
|
177
195
|
payload = {
|
178
196
|
"model": request_func_input.model,
|
179
197
|
"prompt": prompt,
|
@@ -186,8 +204,7 @@ async def async_request_openai_completions(
|
|
186
204
|
}
|
187
205
|
headers = get_auth_headers()
|
188
206
|
|
189
|
-
output = RequestFuncOutput()
|
190
|
-
output.prompt_len = request_func_input.prompt_len
|
207
|
+
output = RequestFuncOutput.init_new(request_func_input)
|
191
208
|
|
192
209
|
generated_text = ""
|
193
210
|
output_len = request_func_input.output_len
|
@@ -256,7 +273,7 @@ async def async_request_truss(
|
|
256
273
|
|
257
274
|
prompt = request_func_input.prompt
|
258
275
|
|
259
|
-
async with aiohttp.ClientSession(timeout=AIOHTTP_TIMEOUT) as session:
|
276
|
+
async with _create_bench_client_session() as session:
|
260
277
|
payload = {
|
261
278
|
"model": request_func_input.model,
|
262
279
|
"prompt": prompt,
|
@@ -269,8 +286,7 @@ async def async_request_truss(
|
|
269
286
|
}
|
270
287
|
headers = get_auth_headers()
|
271
288
|
|
272
|
-
output = RequestFuncOutput()
|
273
|
-
output.prompt_len = request_func_input.prompt_len
|
289
|
+
output = RequestFuncOutput.init_new(request_func_input)
|
274
290
|
|
275
291
|
generated_text = ""
|
276
292
|
ttft = 0.0
|
@@ -334,9 +350,9 @@ async def async_request_sglang_generate(
|
|
334
350
|
api_url = request_func_input.api_url
|
335
351
|
prompt = request_func_input.prompt
|
336
352
|
|
337
|
-
async with aiohttp.ClientSession(timeout=AIOHTTP_TIMEOUT) as session:
|
353
|
+
async with _create_bench_client_session() as session:
|
338
354
|
payload = {
|
339
|
-
"text": prompt,
|
355
|
+
("text" if isinstance(prompt, str) else "input_ids"): prompt,
|
340
356
|
"sampling_params": {
|
341
357
|
"temperature": 0.0,
|
342
358
|
"max_new_tokens": request_func_input.output_len,
|
@@ -355,8 +371,7 @@ async def async_request_sglang_generate(
|
|
355
371
|
|
356
372
|
headers = get_auth_headers()
|
357
373
|
|
358
|
-
output = RequestFuncOutput()
|
359
|
-
output.prompt_len = request_func_input.prompt_len
|
374
|
+
output = RequestFuncOutput.init_new(request_func_input)
|
360
375
|
|
361
376
|
generated_text = ""
|
362
377
|
output_len = request_func_input.output_len
|
@@ -373,7 +388,6 @@ async def async_request_sglang_generate(
|
|
373
388
|
chunk_bytes = chunk_bytes.strip()
|
374
389
|
if not chunk_bytes:
|
375
390
|
continue
|
376
|
-
# print(chunk_bytes)
|
377
391
|
|
378
392
|
chunk = remove_prefix(chunk_bytes.decode("utf-8"), "data: ")
|
379
393
|
latency = time.perf_counter() - st
|
@@ -434,7 +448,7 @@ async def async_request_gserver(
|
|
434
448
|
|
435
449
|
|
436
450
|
async def async_request_profile(api_url: str) -> RequestFuncOutput:
|
437
|
-
async with aiohttp.ClientSession(timeout=AIOHTTP_TIMEOUT) as session:
|
451
|
+
async with _create_bench_client_session() as session:
|
438
452
|
output = RequestFuncOutput()
|
439
453
|
try:
|
440
454
|
async with session.post(url=api_url) as response:
|
@@ -469,6 +483,10 @@ def get_model(pretrained_model_name_or_path: str) -> str:
|
|
469
483
|
def get_tokenizer(
|
470
484
|
pretrained_model_name_or_path: str,
|
471
485
|
) -> Union[PreTrainedTokenizer, PreTrainedTokenizerFast]:
|
486
|
+
assert (
|
487
|
+
pretrained_model_name_or_path is not None
|
488
|
+
and pretrained_model_name_or_path != ""
|
489
|
+
)
|
472
490
|
if pretrained_model_name_or_path.endswith(
|
473
491
|
".json"
|
474
492
|
) or pretrained_model_name_or_path.endswith(".model"):
|
@@ -486,7 +504,9 @@ def get_tokenizer(
|
|
486
504
|
|
487
505
|
|
488
506
|
def get_dataset(args, tokenizer):
|
507
|
+
tokenize_prompt = getattr(args, "tokenize_prompt", False)
|
489
508
|
if args.dataset_name == "sharegpt":
|
509
|
+
assert not tokenize_prompt
|
490
510
|
input_requests = sample_sharegpt_requests(
|
491
511
|
dataset_path=args.dataset_path,
|
492
512
|
num_requests=args.num_prompts,
|
@@ -505,8 +525,10 @@ def get_dataset(args, tokenizer):
|
|
505
525
|
tokenizer=tokenizer,
|
506
526
|
dataset_path=args.dataset_path,
|
507
527
|
random_sample=args.dataset_name == "random",
|
528
|
+
return_text=not tokenize_prompt,
|
508
529
|
)
|
509
530
|
elif args.dataset_name == "generated-shared-prefix":
|
531
|
+
assert not tokenize_prompt
|
510
532
|
input_requests = sample_generated_shared_prefix_requests(
|
511
533
|
num_groups=args.gsp_num_groups,
|
512
534
|
prompts_per_group=args.gsp_prompts_per_group,
|
@@ -517,6 +539,7 @@ def get_dataset(args, tokenizer):
|
|
517
539
|
args=args,
|
518
540
|
)
|
519
541
|
elif args.dataset_name == "mmmu":
|
542
|
+
assert not tokenize_prompt
|
520
543
|
input_requests = sample_mmmu_requests(
|
521
544
|
num_requests=args.num_prompts,
|
522
545
|
tokenizer=tokenizer,
|
@@ -582,7 +605,7 @@ def download_and_cache_file(url: str, filename: Optional[str] = None):
|
|
582
605
|
filename = os.path.join("/tmp", url.split("/")[-1])
|
583
606
|
|
584
607
|
# Check if the cache file already exists
|
585
|
-
if
|
608
|
+
if is_file_valid_json(filename):
|
586
609
|
return filename
|
587
610
|
|
588
611
|
print(f"Downloading from {url} to {filename}")
|
@@ -610,12 +633,36 @@ def download_and_cache_file(url: str, filename: Optional[str] = None):
|
|
610
633
|
return filename
|
611
634
|
|
612
635
|
|
636
|
+
def is_file_valid_json(path):
|
637
|
+
if not os.path.isfile(path):
|
638
|
+
return False
|
639
|
+
|
640
|
+
# TODO can fuse into the real file open later
|
641
|
+
try:
|
642
|
+
with open(path) as f:
|
643
|
+
json.load(f)
|
644
|
+
return True
|
645
|
+
except JSONDecodeError as e:
|
646
|
+
print(
|
647
|
+
f"{path} exists but json loading fails ({e=}), thus treat as invalid file"
|
648
|
+
)
|
649
|
+
return False
|
650
|
+
|
651
|
+
|
652
|
+
@dataclass
|
653
|
+
class DatasetRow:
|
654
|
+
prompt: str
|
655
|
+
prompt_len: int
|
656
|
+
output_len: int
|
657
|
+
image_data: Optional[str] = None
|
658
|
+
|
659
|
+
|
613
660
|
def sample_mmmu_requests(
|
614
661
|
num_requests: int,
|
615
662
|
tokenizer: PreTrainedTokenizerBase,
|
616
663
|
fixed_output_len: Optional[int] = None,
|
617
664
|
random_sample: bool = True,
|
618
|
-
) -> List[Tuple[str, int, int]]:
|
665
|
+
) -> List[DatasetRow]:
|
619
666
|
"""
|
620
667
|
Sample requests from the MMMU dataset using HuggingFace datasets.
|
621
668
|
|
@@ -683,40 +730,52 @@ def sample_mmmu_requests(
|
|
683
730
|
buffered = io.BytesIO()
|
684
731
|
image.save(buffered, format="JPEG")
|
685
732
|
img_str = base64.b64encode(buffered.getvalue()).decode("utf-8")
|
686
|
-
|
733
|
+
image_data = f"data:image/jpeg;base64,{img_str}"
|
687
734
|
else:
|
688
735
|
continue
|
689
736
|
|
690
737
|
# Extract the question
|
691
738
|
question = example.get("question")
|
692
739
|
|
693
|
-
#
|
740
|
+
# Construct the prompt
|
694
741
|
prompt = f"Question: {question}\n\nAnswer: "
|
695
|
-
prompt = tokenizer.apply_chat_template(
|
696
|
-
[
|
697
|
-
{
|
698
|
-
"role": "user",
|
699
|
-
"content": [
|
700
|
-
{"type": "image_url", "image_url": {"url": image_path}},
|
701
|
-
{"type": "text", "text": prompt},
|
702
|
-
],
|
703
|
-
}
|
704
|
-
],
|
705
|
-
add_generation_prompt=True,
|
706
|
-
tokenize=False,
|
707
|
-
)
|
708
|
-
prompt = f"<image>{image_path}</image>{prompt}"
|
709
742
|
|
710
|
-
|
711
|
-
|
743
|
+
try:
|
744
|
+
prompt = tokenizer.apply_chat_template(
|
745
|
+
[
|
746
|
+
{
|
747
|
+
"role": "user",
|
748
|
+
"content": [
|
749
|
+
{
|
750
|
+
"type": "image_url",
|
751
|
+
"image_url": {"url": image_data},
|
752
|
+
},
|
753
|
+
{"type": "text", "text": prompt},
|
754
|
+
],
|
755
|
+
}
|
756
|
+
],
|
757
|
+
add_generation_prompt=True,
|
758
|
+
tokenize=False,
|
759
|
+
)
|
760
|
+
except Exception as e:
|
761
|
+
# Note (Xinyuan): This is a workaround for an issue where some tokenizers do not support content as a list. (e.g. InternVL)
|
762
|
+
print(f"Error applying chat template: {e}, fallback to <image> tag")
|
763
|
+
prompt = f"<image>{prompt}"
|
764
|
+
|
765
|
+
# Calculate token lengths for text only (without image data)
|
712
766
|
prompt_token_ids = tokenizer.encode(prompt)
|
713
|
-
prompt_len = (
|
714
|
-
len(prompt_token_ids) + 512
|
715
|
-
) # Add estimate for image tokens
|
767
|
+
prompt_len = len(prompt_token_ids)
|
716
768
|
|
717
769
|
output_len = fixed_output_len if fixed_output_len is not None else 256
|
718
770
|
|
719
|
-
filtered_dataset.append((prompt, prompt_len, output_len))
|
771
|
+
filtered_dataset.append(
|
772
|
+
DatasetRow(
|
773
|
+
prompt=prompt,
|
774
|
+
prompt_len=prompt_len,
|
775
|
+
output_len=output_len,
|
776
|
+
image_data=image_data,
|
777
|
+
)
|
778
|
+
)
|
720
779
|
|
721
780
|
except Exception as e:
|
722
781
|
print(f"Error processing example {i}: {e}")
|
@@ -733,12 +792,12 @@ def sample_sharegpt_requests(
|
|
733
792
|
context_len: Optional[int] = None,
|
734
793
|
prompt_suffix: Optional[str] = "",
|
735
794
|
apply_chat_template=False,
|
736
|
-
) -> List[Tuple[str, int, int]]:
|
795
|
+
) -> List[DatasetRow]:
|
737
796
|
if fixed_output_len is not None and fixed_output_len < 4:
|
738
797
|
raise ValueError("output_len too small")
|
739
798
|
|
740
799
|
# Download sharegpt if necessary
|
741
|
-
if not
|
800
|
+
if not is_file_valid_json(dataset_path) and dataset_path == "":
|
742
801
|
dataset_path = download_and_cache_file(SHAREGPT_URL)
|
743
802
|
|
744
803
|
# Load the dataset.
|
@@ -764,7 +823,7 @@ def sample_sharegpt_requests(
|
|
764
823
|
random.shuffle(dataset)
|
765
824
|
|
766
825
|
# Filter out sequences that are too long or too short
|
767
|
-
filtered_dataset: List[Tuple[str, int, int]] = []
|
826
|
+
filtered_dataset: List[DatasetRow] = []
|
768
827
|
for i in range(len(dataset)):
|
769
828
|
if len(filtered_dataset) == num_requests:
|
770
829
|
break
|
@@ -802,10 +861,12 @@ def sample_sharegpt_requests(
|
|
802
861
|
# Prune too long sequences.
|
803
862
|
continue
|
804
863
|
|
805
|
-
filtered_dataset.append((prompt, prompt_len, output_len))
|
864
|
+
filtered_dataset.append(
|
865
|
+
DatasetRow(prompt=prompt, prompt_len=prompt_len, output_len=output_len)
|
866
|
+
)
|
806
867
|
|
807
|
-
print(f"#Input tokens: {np.sum([x[1] for x in filtered_dataset])}")
|
808
|
-
print(f"#Output tokens: {np.sum([x[2] for x in filtered_dataset])}")
|
868
|
+
print(f"#Input tokens: {np.sum([x.prompt_len for x in filtered_dataset])}")
|
869
|
+
print(f"#Output tokens: {np.sum([x.output_len for x in filtered_dataset])}")
|
809
870
|
return filtered_dataset
|
810
871
|
|
811
872
|
|
@@ -817,7 +878,8 @@ def sample_random_requests(
|
|
817
878
|
tokenizer: PreTrainedTokenizerBase,
|
818
879
|
dataset_path: str,
|
819
880
|
random_sample: bool = True,
|
820
|
-
|
881
|
+
return_text: bool = True,
|
882
|
+
) -> List[DatasetRow]:
|
821
883
|
input_lens = np.random.randint(
|
822
884
|
max(int(input_len * range_ratio), 1),
|
823
885
|
input_len + 1,
|
@@ -833,7 +895,7 @@ def sample_random_requests(
|
|
833
895
|
# Sample token ids from ShareGPT and repeat/truncate them to satisfy the input_lens
|
834
896
|
|
835
897
|
# Download sharegpt if necessary
|
836
|
-
if not
|
898
|
+
if not is_file_valid_json(dataset_path):
|
837
899
|
dataset_path = download_and_cache_file(SHAREGPT_URL)
|
838
900
|
|
839
901
|
# Load the dataset.
|
@@ -857,7 +919,7 @@ def sample_random_requests(
|
|
857
919
|
random.shuffle(dataset)
|
858
920
|
|
859
921
|
# Filter out sequences that are too long or too short
|
860
|
-
input_requests: List[Tuple[str, int, int]] = []
|
922
|
+
input_requests: List[DatasetRow] = []
|
861
923
|
for data in dataset:
|
862
924
|
i = len(input_requests)
|
863
925
|
if i == num_prompts:
|
@@ -877,20 +939,34 @@ def sample_random_requests(
|
|
877
939
|
else:
|
878
940
|
ratio = (input_lens[i] + prompt_len - 1) // prompt_len
|
879
941
|
input_ids = (prompt_token_ids * ratio)[: input_lens[i]]
|
880
|
-
|
881
|
-
|
942
|
+
input_content = input_ids
|
943
|
+
if return_text:
|
944
|
+
input_content = tokenizer.decode(input_content)
|
945
|
+
input_requests.append(
|
946
|
+
DatasetRow(
|
947
|
+
prompt=input_content,
|
948
|
+
prompt_len=int(input_lens[i]),
|
949
|
+
output_len=int(output_lens[i]),
|
950
|
+
)
|
951
|
+
)
|
882
952
|
else:
|
883
953
|
# Sample token ids from random integers. This can cause some NaN issues.
|
884
954
|
offsets = np.random.randint(0, tokenizer.vocab_size, size=num_prompts)
|
885
955
|
input_requests = []
|
886
956
|
for i in range(num_prompts):
|
887
|
-
|
888
|
-
[
|
889
|
-
|
890
|
-
|
891
|
-
|
957
|
+
input_content = [
|
958
|
+
(offsets[i] + i + j) % tokenizer.vocab_size
|
959
|
+
for j in range(input_lens[i])
|
960
|
+
]
|
961
|
+
if return_text:
|
962
|
+
input_content = tokenizer.decode(input_content)
|
963
|
+
input_requests.append(
|
964
|
+
DatasetRow(
|
965
|
+
prompt=input_content,
|
966
|
+
prompt_len=int(input_lens[i]),
|
967
|
+
output_len=int(output_lens[i]),
|
968
|
+
)
|
892
969
|
)
|
893
|
-
input_requests.append((prompt, int(input_lens[i]), int(output_lens[i])))
|
894
970
|
|
895
971
|
print(f"#Input tokens: {np.sum(input_lens)}")
|
896
972
|
print(f"#Output tokens: {np.sum(output_lens)}")
|
@@ -925,7 +1001,7 @@ def sample_generated_shared_prefix_requests(
|
|
925
1001
|
output_len: int,
|
926
1002
|
tokenizer: PreTrainedTokenizerBase,
|
927
1003
|
args: argparse.Namespace,
|
928
|
-
) -> List[Tuple[str, int, int]]:
|
1004
|
+
) -> List[DatasetRow]:
|
929
1005
|
"""Generate benchmark requests with shared system prompts using random tokens and caching."""
|
930
1006
|
cache_path = get_gen_prefix_cache_path(args, tokenizer)
|
931
1007
|
|
@@ -963,7 +1039,11 @@ def sample_generated_shared_prefix_requests(
|
|
963
1039
|
full_prompt = f"{system_prompt}\n\n{question}"
|
964
1040
|
prompt_len = len(tokenizer.encode(full_prompt))
|
965
1041
|
|
966
|
-
input_requests.append((full_prompt, prompt_len, output_len))
|
1042
|
+
input_requests.append(
|
1043
|
+
DatasetRow(
|
1044
|
+
prompt=full_prompt, prompt_len=prompt_len, output_len=output_len
|
1045
|
+
)
|
1046
|
+
)
|
967
1047
|
total_input_tokens += prompt_len
|
968
1048
|
total_output_tokens += output_len
|
969
1049
|
|
@@ -994,9 +1074,9 @@ def sample_generated_shared_prefix_requests(
|
|
994
1074
|
|
995
1075
|
|
996
1076
|
async def get_request(
|
997
|
-
input_requests: List[Tuple[str, int, int]],
|
1077
|
+
input_requests: List[DatasetRow],
|
998
1078
|
request_rate: float,
|
999
|
-
) -> AsyncGenerator[Tuple[str, int, int], None]:
|
1079
|
+
) -> AsyncGenerator[DatasetRow, None]:
|
1000
1080
|
input_requests = iter(input_requests)
|
1001
1081
|
for request in input_requests:
|
1002
1082
|
yield request
|
@@ -1012,7 +1092,7 @@ async def get_request(
|
|
1012
1092
|
|
1013
1093
|
|
1014
1094
|
def calculate_metrics(
|
1015
|
-
input_requests: List[Tuple[str, int, int]],
|
1095
|
+
input_requests: List[DatasetRow],
|
1016
1096
|
outputs: List[RequestFuncOutput],
|
1017
1097
|
dur_s: float,
|
1018
1098
|
tokenizer: PreTrainedTokenizerBase,
|
@@ -1034,7 +1114,7 @@ def calculate_metrics(
|
|
1034
1114
|
tokenizer.encode(outputs[i].generated_text, add_special_tokens=False)
|
1035
1115
|
)
|
1036
1116
|
retokenized_output_lens.append(retokenized_output_len)
|
1037
|
-
total_input += input_requests[i][1]
|
1117
|
+
total_input += input_requests[i].prompt_len
|
1038
1118
|
if output_len > 1:
|
1039
1119
|
tpots.append((outputs[i].latency - outputs[i].ttft) / (output_len - 1))
|
1040
1120
|
itls += outputs[i].itl
|
@@ -1096,7 +1176,7 @@ async def benchmark(
|
|
1096
1176
|
base_url: str,
|
1097
1177
|
model_id: str,
|
1098
1178
|
tokenizer: PreTrainedTokenizerBase,
|
1099
|
-
input_requests: List[Tuple[str, int, int]],
|
1179
|
+
input_requests: List[DatasetRow],
|
1100
1180
|
request_rate: float,
|
1101
1181
|
max_concurrency: Optional[int],
|
1102
1182
|
disable_tqdm: bool,
|
@@ -1126,30 +1206,22 @@ async def benchmark(
|
|
1126
1206
|
print(f"Starting warmup with {warmup_requests} sequences...")
|
1127
1207
|
|
1128
1208
|
# Use the first request for all warmup iterations
|
1129
|
-
|
1209
|
+
test_request = input_requests[0]
|
1210
|
+
|
1130
1211
|
if lora_names is not None and len(lora_names) != 0:
|
1131
1212
|
lora_name = lora_names[0]
|
1132
1213
|
else:
|
1133
1214
|
lora_name = None
|
1134
1215
|
|
1135
|
-
if "<image>" in test_prompt:
|
1136
|
-
import re
|
1137
|
-
|
1138
|
-
image_match = re.search(r"<image>(.*?)</image>(.*)", test_prompt)
|
1139
|
-
image_data = image_match.group(1) if image_match else None
|
1140
|
-
test_prompt = image_match.group(2) if image_match else test_prompt
|
1141
|
-
else:
|
1142
|
-
image_data = None
|
1143
|
-
|
1144
1216
|
# Create the test input once
|
1145
1217
|
test_input = RequestFuncInput(
|
1146
1218
|
model=model_id,
|
1147
|
-
prompt=
|
1219
|
+
prompt=test_request.prompt,
|
1148
1220
|
api_url=api_url,
|
1149
|
-
prompt_len=
|
1150
|
-
output_len=min(
|
1221
|
+
prompt_len=test_request.prompt_len,
|
1222
|
+
output_len=min(test_request.output_len, 32),
|
1151
1223
|
lora_name=lora_name,
|
1152
|
-
image_data=image_data,
|
1224
|
+
image_data=test_request.image_data,
|
1153
1225
|
extra_request_body=extra_request_body,
|
1154
1226
|
)
|
1155
1227
|
|
@@ -1194,32 +1266,23 @@ async def benchmark(
|
|
1194
1266
|
benchmark_start_time = time.perf_counter()
|
1195
1267
|
tasks: List[asyncio.Task] = []
|
1196
1268
|
async for request in get_request(input_requests, request_rate):
|
1197
|
-
prompt, prompt_len, output_len = request
|
1198
1269
|
if lora_names is not None and len(lora_names) != 0:
|
1199
1270
|
idx = random.randint(0, len(lora_names) - 1)
|
1200
1271
|
lora_name = lora_names[idx]
|
1201
1272
|
else:
|
1202
1273
|
lora_name = None
|
1203
1274
|
|
1204
|
-
if "<image>" in prompt:
|
1205
|
-
import re
|
1206
|
-
|
1207
|
-
image_match = re.search(r"<image>(.*?)</image>(.*)", prompt)
|
1208
|
-
image_data = image_match.group(1) if image_match else None
|
1209
|
-
prompt = image_match.group(2) if image_match else prompt
|
1210
|
-
else:
|
1211
|
-
image_data = None
|
1212
|
-
|
1213
1275
|
request_func_input = RequestFuncInput(
|
1214
1276
|
model=model_id,
|
1215
|
-
prompt=prompt,
|
1277
|
+
prompt=request.prompt,
|
1216
1278
|
api_url=api_url,
|
1217
|
-
prompt_len=prompt_len,
|
1218
|
-
output_len=output_len,
|
1279
|
+
prompt_len=request.prompt_len,
|
1280
|
+
output_len=request.output_len,
|
1219
1281
|
lora_name=lora_name,
|
1220
|
-
image_data=image_data,
|
1282
|
+
image_data=request.image_data,
|
1221
1283
|
extra_request_body=extra_request_body,
|
1222
1284
|
)
|
1285
|
+
|
1223
1286
|
tasks.append(
|
1224
1287
|
asyncio.create_task(
|
1225
1288
|
limited_request_func(request_func_input=request_func_input, pbar=pbar)
|
@@ -1239,14 +1302,15 @@ async def benchmark(
|
|
1239
1302
|
|
1240
1303
|
if "sglang" in backend:
|
1241
1304
|
server_info = requests.get(base_url + "/get_server_info")
|
1242
|
-
if
|
1243
|
-
|
1305
|
+
if server_info.status_code == 200:
|
1306
|
+
server_info_json = server_info.json()
|
1307
|
+
if "decode" in server_info_json:
|
1308
|
+
server_info_json = server_info_json["decode"][0]
|
1309
|
+
accept_length = server_info_json["internal_states"][0].get(
|
1244
1310
|
"avg_spec_accept_length", None
|
1245
1311
|
)
|
1246
1312
|
else:
|
1247
|
-
accept_length =
|
1248
|
-
"avg_spec_accept_length", None
|
1249
|
-
)
|
1313
|
+
accept_length = None
|
1250
1314
|
else:
|
1251
1315
|
accept_length = None
|
1252
1316
|
|
@@ -1380,21 +1444,24 @@ async def benchmark(
|
|
1380
1444
|
else:
|
1381
1445
|
output_file_name = f"{args.backend}_{now}_{args.num_prompts}_sharegpt.jsonl"
|
1382
1446
|
|
1447
|
+
result_details = {
|
1448
|
+
"input_lens": [output.prompt_len for output in outputs],
|
1449
|
+
"output_lens": output_lens,
|
1450
|
+
"ttfts": [output.ttft for output in outputs],
|
1451
|
+
"itls": [output.itl for output in outputs],
|
1452
|
+
"generated_texts": [output.generated_text for output in outputs],
|
1453
|
+
"errors": [output.error for output in outputs],
|
1454
|
+
}
|
1455
|
+
|
1383
1456
|
# Append results to a JSONL file
|
1384
1457
|
with open(output_file_name, "a") as file:
|
1385
|
-
|
1386
|
-
|
1387
|
-
|
1388
|
-
|
1389
|
-
|
1390
|
-
|
1391
|
-
|
1392
|
-
"itls": [output.itl for output in outputs],
|
1393
|
-
"generated_texts": [output.generated_text for output in outputs],
|
1394
|
-
"errors": [output.error for output in outputs],
|
1395
|
-
}
|
1396
|
-
)
|
1397
|
-
return result
|
1458
|
+
if args.output_details:
|
1459
|
+
result_for_dump = result | result_details
|
1460
|
+
else:
|
1461
|
+
result_for_dump = result
|
1462
|
+
file.write(json.dumps(result_for_dump) + "\n")
|
1463
|
+
|
1464
|
+
return result | result_details
|
1398
1465
|
|
1399
1466
|
|
1400
1467
|
def check_chat_template(model_path):
|
@@ -1424,6 +1491,12 @@ def run_benchmark(args_: argparse.Namespace):
|
|
1424
1491
|
if not hasattr(args, "warmup_requests"):
|
1425
1492
|
args.warmup_requests = 1
|
1426
1493
|
|
1494
|
+
if not hasattr(args, "output_details"):
|
1495
|
+
args.output_details = False
|
1496
|
+
|
1497
|
+
if not hasattr(args, "tokenize_prompt"):
|
1498
|
+
args.tokenize_prompt = False
|
1499
|
+
|
1427
1500
|
print(f"benchmark_args={args}")
|
1428
1501
|
|
1429
1502
|
# Set global environments
|
@@ -1435,6 +1508,11 @@ def run_benchmark(args_: argparse.Namespace):
|
|
1435
1508
|
if args.extra_request_body:
|
1436
1509
|
extra_request_body = json.loads(args.extra_request_body)
|
1437
1510
|
|
1511
|
+
if args.tokenize_prompt:
|
1512
|
+
assert (
|
1513
|
+
args.backend == "sglang"
|
1514
|
+
), "`--tokenize-prompt` only compatible with `--backend sglang` currently"
|
1515
|
+
|
1438
1516
|
# Set url
|
1439
1517
|
if args.port is None:
|
1440
1518
|
args.port = {
|
@@ -1545,6 +1623,7 @@ def run_benchmark(args_: argparse.Namespace):
|
|
1545
1623
|
profile=args.profile,
|
1546
1624
|
pd_separated=args.pd_separated,
|
1547
1625
|
flush_cache=args.flush_cache,
|
1626
|
+
warmup_requests=args.warmup_requests,
|
1548
1627
|
)
|
1549
1628
|
)
|
1550
1629
|
|
@@ -1668,6 +1747,9 @@ if __name__ == "__main__":
|
|
1668
1747
|
"if the server is not processing requests fast enough to keep up.",
|
1669
1748
|
)
|
1670
1749
|
parser.add_argument("--output-file", type=str, help="Output JSONL file name.")
|
1750
|
+
parser.add_argument(
|
1751
|
+
"--output-details", action="store_true", help="Output details of benchmarking."
|
1752
|
+
)
|
1671
1753
|
parser.add_argument(
|
1672
1754
|
"--disable-tqdm",
|
1673
1755
|
action="store_true",
|
@@ -1737,6 +1819,11 @@ if __name__ == "__main__":
|
|
1737
1819
|
default=1,
|
1738
1820
|
help="Number of warmup requests to run before the benchmark",
|
1739
1821
|
)
|
1822
|
+
parser.add_argument(
|
1823
|
+
"--tokenize-prompt",
|
1824
|
+
action="store_true",
|
1825
|
+
help="Use integer ids instead of string for inputs. Useful to control prompt lengths accurately",
|
1826
|
+
)
|
1740
1827
|
|
1741
1828
|
group = parser.add_argument_group("generated-shared-prefix dataset arguments")
|
1742
1829
|
group.add_argument(
|
sglang/compile_deep_gemm.py
CHANGED
@@ -82,8 +82,8 @@ def launch_server_process_and_send_one_request(
|
|
82
82
|
base_url = f"http://{server_args.host}:{server_args.port}"
|
83
83
|
timeout = compile_args.timeout
|
84
84
|
|
85
|
-
start_time = time.time()
|
86
|
-
while time.time() - start_time < timeout:
|
85
|
+
start_time = time.perf_counter()
|
86
|
+
while time.perf_counter() - start_time < timeout:
|
87
87
|
try:
|
88
88
|
headers = {
|
89
89
|
"Content-Type": "application/json; charset=utf-8",
|
@@ -112,9 +112,9 @@ def launch_server_process_and_send_one_request(
|
|
112
112
|
raise RuntimeError(f"Sync request failed: {error}")
|
113
113
|
# Other nodes should wait for the exit signal from Rank-0 node.
|
114
114
|
else:
|
115
|
-
start_time_waiting = time.time()
|
115
|
+
start_time_waiting = time.perf_counter()
|
116
116
|
while proc.is_alive():
|
117
|
-
if time.time() - start_time_waiting < timeout:
|
117
|
+
if time.perf_counter() - start_time_waiting < timeout:
|
118
118
|
time.sleep(10)
|
119
119
|
else:
|
120
120
|
raise TimeoutError("Waiting for main node timeout!")
|