sglang 0.4.6.post5__py3-none-any.whl → 0.4.7__py3-none-any.whl
This diff compares the contents of two publicly released versions of the package as published to a supported registry. It is provided for informational purposes only and reflects the packages exactly as they appear in that registry.
- sglang/bench_offline_throughput.py +10 -4
- sglang/bench_one_batch_server.py +67 -11
- sglang/bench_serving.py +85 -74
- sglang/lang/backend/runtime_endpoint.py +24 -1
- sglang/profiler.py +167 -0
- sglang/srt/_custom_ops.py +34 -0
- sglang/srt/configs/internvl.py +8 -12
- sglang/srt/configs/model_config.py +27 -1
- sglang/srt/constrained/base_grammar_backend.py +5 -2
- sglang/srt/constrained/llguidance_backend.py +9 -8
- sglang/srt/constrained/outlines_backend.py +5 -4
- sglang/srt/constrained/xgrammar_backend.py +18 -18
- sglang/srt/conversation.py +46 -8
- sglang/srt/custom_op.py +38 -3
- sglang/srt/debug_utils.py +74 -0
- sglang/srt/disaggregation/common/__init__.py +1 -0
- sglang/srt/disaggregation/common/conn.py +407 -0
- sglang/srt/disaggregation/decode.py +67 -3
- sglang/srt/disaggregation/fake/conn.py +1 -0
- sglang/srt/disaggregation/kv_events.py +60 -5
- sglang/srt/disaggregation/launch_lb.py +140 -0
- sglang/srt/disaggregation/mini_lb.py +29 -48
- sglang/srt/disaggregation/mooncake/conn.py +432 -140
- sglang/srt/disaggregation/mooncake/transfer_engine.py +32 -16
- sglang/srt/disaggregation/nixl/conn.py +124 -432
- sglang/srt/disaggregation/prefill.py +2 -0
- sglang/srt/disaggregation/utils.py +38 -1
- sglang/srt/distributed/device_communicators/pymscclpp.py +315 -0
- sglang/srt/distributed/parallel_state.py +52 -5
- sglang/srt/entrypoints/EngineBase.py +6 -0
- sglang/srt/entrypoints/engine.py +102 -5
- sglang/srt/entrypoints/http_server.py +15 -2
- sglang/srt/function_call/base_format_detector.py +138 -86
- sglang/srt/function_call/deepseekv3_detector.py +54 -6
- sglang/srt/function_call/ebnf_composer.py +33 -19
- sglang/srt/function_call/function_call_parser.py +27 -0
- sglang/srt/function_call/llama32_detector.py +33 -14
- sglang/srt/function_call/mistral_detector.py +73 -26
- sglang/srt/function_call/pythonic_detector.py +86 -20
- sglang/srt/function_call/qwen25_detector.py +64 -10
- sglang/srt/function_call/utils.py +17 -0
- sglang/srt/hf_transformers_utils.py +4 -0
- sglang/srt/layers/attention/aiter_backend.py +488 -123
- sglang/srt/layers/attention/base_attn_backend.py +4 -0
- sglang/srt/layers/attention/cutlass_mla_backend.py +2 -19
- sglang/srt/layers/attention/flashattention_backend.py +103 -18
- sglang/srt/layers/attention/flashinfer_backend.py +45 -1
- sglang/srt/layers/attention/flashinfer_mla_backend.py +37 -1
- sglang/srt/layers/attention/intel_amx_backend.py +128 -0
- sglang/srt/layers/attention/tbo_backend.py +232 -0
- sglang/srt/layers/attention/torch_native_backend.py +3 -0
- sglang/srt/layers/attention/triton_backend.py +244 -5
- sglang/srt/layers/attention/triton_ops/extend_attention.py +12 -4
- sglang/srt/layers/communicator.py +260 -194
- sglang/srt/layers/dp_attention.py +6 -5
- sglang/srt/layers/layernorm.py +30 -19
- sglang/srt/layers/moe/cutlass_moe.py +170 -7
- sglang/srt/layers/moe/cutlass_moe_params.py +169 -0
- sglang/srt/layers/moe/ep_moe/kernels.py +27 -6
- sglang/srt/layers/moe/ep_moe/layer.py +94 -40
- sglang/srt/layers/moe/ep_moe/token_dispatcher.py +13 -8
- sglang/srt/layers/moe/fused_moe_native.py +4 -0
- sglang/srt/layers/moe/fused_moe_triton/configs/triton_3_2_0/E=257,N=128,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8,block_shape=[128, 128].json +146 -0
- sglang/srt/layers/moe/fused_moe_triton/configs/triton_3_2_0/E=257,N=128,device_name=NVIDIA_H20,dtype=fp8_w8a8,block_shape=[128, 128].json +146 -0
- sglang/srt/layers/moe/fused_moe_triton/configs/triton_3_2_0/E=257,N=256,device_name=NVIDIA_B200,dtype=fp8_w8a8,block_shape=[128, 128].json +146 -0
- sglang/srt/layers/moe/fused_moe_triton/configs/triton_3_2_0/E=257,N=256,device_name=NVIDIA_H20,dtype=fp8_w8a8,block_shape=[128, 128].json +146 -0
- sglang/srt/layers/moe/fused_moe_triton/configs/triton_3_2_0/E=257,N=256,device_name=NVIDIA_H200,dtype=fp8_w8a8,block_shape=[128, 128].json +146 -0
- sglang/srt/layers/moe/fused_moe_triton/configs/triton_3_2_0/E=8,N=7168,device_name=NVIDIA_H100_80GB_HBM3.json +146 -0
- sglang/srt/layers/moe/fused_moe_triton/configs/triton_3_3_1/E=257,N=256,device_name=NVIDIA_B200,dtype=fp8_w8a8,block_shape=[128, 128].json +146 -0
- sglang/srt/layers/moe/fused_moe_triton/configs/triton_3_3_1/E=8,N=7168,device_name=NVIDIA_H100_80GB_HBM3.json +146 -0
- sglang/srt/layers/moe/fused_moe_triton/fused_moe.py +220 -25
- sglang/srt/layers/moe/fused_moe_triton/layer.py +34 -4
- sglang/srt/layers/moe/topk.py +44 -18
- sglang/srt/layers/multimodal.py +3 -3
- sglang/srt/layers/quantization/__init__.py +3 -2
- sglang/srt/layers/quantization/blockwise_int8.py +3 -0
- sglang/srt/layers/quantization/compressed_tensors/compressed_tensors_moe.py +5 -0
- sglang/srt/layers/quantization/deep_gemm.py +55 -56
- sglang/srt/layers/quantization/fp8.py +28 -23
- sglang/srt/layers/quantization/fp8_kernel.py +118 -66
- sglang/srt/layers/quantization/fp8_utils.py +165 -49
- sglang/srt/layers/quantization/modelopt_quant.py +334 -7
- sglang/srt/layers/quantization/moe_wna16.py +3 -0
- sglang/srt/layers/quantization/w8a8_fp8.py +3 -0
- sglang/srt/layers/quantization/w8a8_int8.py +3 -0
- sglang/srt/layers/rotary_embedding.py +6 -12
- sglang/srt/layers/sampler.py +80 -79
- sglang/srt/layers/utils.py +6 -0
- sglang/srt/lora/layers.py +12 -15
- sglang/srt/lora/lora.py +49 -5
- sglang/srt/lora/lora_manager.py +19 -5
- sglang/srt/lora/mem_pool.py +24 -16
- sglang/srt/lora/utils.py +17 -13
- sglang/srt/managers/data_parallel_controller.py +13 -5
- sglang/srt/managers/eplb_algorithms/__init__.py +63 -0
- sglang/srt/managers/eplb_algorithms/deepseek.py +223 -0
- sglang/srt/managers/{deepseek_eplb.py → eplb_algorithms/deepseek_vec.py} +5 -7
- sglang/srt/managers/eplb_manager.py +55 -14
- sglang/srt/managers/expert_distribution.py +220 -46
- sglang/srt/managers/expert_location.py +110 -56
- sglang/srt/managers/expert_location_dispatch.py +23 -6
- sglang/srt/managers/io_struct.py +15 -4
- sglang/srt/managers/mm_utils.py +88 -38
- sglang/srt/managers/multimodal_processors/base_processor.py +188 -16
- sglang/srt/managers/multimodal_processors/gemma3.py +4 -31
- sglang/srt/managers/multimodal_processors/internvl.py +4 -0
- sglang/srt/managers/multimodal_processors/kimi_vl.py +15 -34
- sglang/srt/managers/multimodal_processors/minicpm.py +2 -1
- sglang/srt/managers/multimodal_processors/phi4mm.py +87 -0
- sglang/srt/managers/multimodal_processors/qwen_vl.py +22 -64
- sglang/srt/managers/schedule_batch.py +140 -38
- sglang/srt/managers/scheduler.py +305 -112
- sglang/srt/managers/tokenizer_manager.py +134 -17
- sglang/srt/managers/utils.py +0 -4
- sglang/srt/metrics/collector.py +9 -0
- sglang/srt/model_executor/cuda_graph_runner.py +72 -61
- sglang/srt/model_executor/expert_location_updater.py +157 -22
- sglang/srt/model_executor/forward_batch_info.py +38 -17
- sglang/srt/model_executor/model_runner.py +96 -56
- sglang/srt/model_loader/utils.py +67 -1
- sglang/srt/models/deepseek_nextn.py +1 -1
- sglang/srt/models/deepseek_v2.py +609 -234
- sglang/srt/models/gemma3_causal.py +7 -0
- sglang/srt/models/gemma3_mm.py +19 -14
- sglang/srt/models/idefics2.py +342 -0
- sglang/srt/models/kimi_vl.py +4 -4
- sglang/srt/models/llama.py +1 -1
- sglang/srt/models/minicpmo.py +2 -5
- sglang/srt/models/minicpmv.py +3 -295
- sglang/srt/models/phi4mm.py +512 -0
- sglang/srt/models/qwen2.py +38 -9
- sglang/srt/models/qwen2_5_vl.py +3 -9
- sglang/srt/models/qwen2_eagle.py +4 -1
- sglang/srt/models/qwen2_moe.py +58 -191
- sglang/srt/models/qwen2_vl.py +3 -9
- sglang/srt/models/qwen3.py +41 -10
- sglang/srt/models/qwen3_moe.py +230 -191
- sglang/srt/models/registry.py +9 -1
- sglang/srt/models/transformers.py +291 -0
- sglang/srt/openai_api/adapter.py +86 -24
- sglang/srt/openai_api/protocol.py +31 -2
- sglang/srt/openai_api/utils.py +172 -0
- sglang/srt/operations.py +37 -2
- sglang/srt/operations_strategy.py +200 -24
- sglang/srt/sampling/sampling_batch_info.py +13 -1
- sglang/srt/sampling/sampling_params.py +2 -1
- sglang/srt/server_args.py +114 -27
- sglang/srt/speculative/build_eagle_tree.py +8 -8
- sglang/srt/speculative/eagle_draft_cuda_graph_runner.py +8 -11
- sglang/srt/speculative/eagle_draft_extend_cuda_graph_runner.py +253 -0
- sglang/srt/speculative/eagle_utils.py +51 -91
- sglang/srt/speculative/eagle_worker.py +101 -21
- sglang/srt/two_batch_overlap.py +635 -0
- sglang/srt/utils.py +129 -7
- sglang/test/runners.py +16 -7
- sglang/test/send_one.py +4 -0
- sglang/test/test_cutlass_moe.py +3 -3
- sglang/test/test_fp4_moe.py +248 -0
- sglang/test/test_utils.py +79 -6
- sglang/version.py +1 -1
- {sglang-0.4.6.post5.dist-info → sglang-0.4.7.dist-info}/METADATA +14 -11
- {sglang-0.4.6.post5.dist-info → sglang-0.4.7.dist-info}/RECORD +318 -291
- {sglang-0.4.6.post5.dist-info → sglang-0.4.7.dist-info}/WHEEL +1 -1
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=1,N=14336,device_name=NVIDIA_A100-SXM4-80GB,dtype=int8_w8a16.json → triton_3_1_0/E=1,N=14336,device_name=NVIDIA_A100-SXM4-80GB,dtype=int8_w8a16.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=1,N=14336,device_name=NVIDIA_A100-SXM4-80GB.json → triton_3_1_0/E=1,N=14336,device_name=NVIDIA_A100-SXM4-80GB.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=1,N=1792,device_name=NVIDIA_A100-SXM4-80GB,dtype=int8_w8a16.json → triton_3_1_0/E=1,N=1792,device_name=NVIDIA_A100-SXM4-80GB,dtype=int8_w8a16.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=1,N=1792,device_name=NVIDIA_A100-SXM4-80GB.json → triton_3_1_0/E=1,N=1792,device_name=NVIDIA_A100-SXM4-80GB.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=1,N=3072,device_name=NVIDIA_A100-SXM4-80GB,dtype=int8_w8a16.json → triton_3_1_0/E=1,N=3072,device_name=NVIDIA_A100-SXM4-80GB,dtype=int8_w8a16.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=1,N=3072,device_name=NVIDIA_H100_80GB_HBM3,dtype=int8_w8a16.json → triton_3_1_0/E=1,N=3072,device_name=NVIDIA_H100_80GB_HBM3,dtype=int8_w8a16.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=1,N=3072,device_name=NVIDIA_H100_80GB_HBM3.json → triton_3_1_0/E=1,N=3072,device_name=NVIDIA_H100_80GB_HBM3.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=1,N=3584,device_name=NVIDIA_A100-SXM4-80GB,dtype=int8_w8a16.json → triton_3_1_0/E=1,N=3584,device_name=NVIDIA_A100-SXM4-80GB,dtype=int8_w8a16.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=1,N=3584,device_name=NVIDIA_A100-SXM4-80GB.json → triton_3_1_0/E=1,N=3584,device_name=NVIDIA_A100-SXM4-80GB.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=1,N=7168,device_name=NVIDIA_A100-SXM4-80GB,dtype=int8_w8a16.json → triton_3_1_0/E=1,N=7168,device_name=NVIDIA_A100-SXM4-80GB,dtype=int8_w8a16.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=1,N=7168,device_name=NVIDIA_A100-SXM4-80GB.json → triton_3_1_0/E=1,N=7168,device_name=NVIDIA_A100-SXM4-80GB.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=144,N=512,device_name=NVIDIA_H100_80GB_HBM3.json → triton_3_1_0/E=144,N=512,device_name=NVIDIA_H100_80GB_HBM3.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=16,N=1024,device_name=NVIDIA_H100_80GB_HBM3.json → triton_3_1_0/E=16,N=1024,device_name=NVIDIA_H100_80GB_HBM3.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=16,N=1024,device_name=NVIDIA_H200.json → triton_3_1_0/E=16,N=1024,device_name=NVIDIA_H200.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=16,N=1344,device_name=NVIDIA_A100-SXM4-40GB.json → triton_3_1_0/E=16,N=1344,device_name=NVIDIA_A100-SXM4-40GB.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=16,N=1344,device_name=NVIDIA_A100-SXM4-80GB.json → triton_3_1_0/E=16,N=1344,device_name=NVIDIA_A100-SXM4-80GB.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=16,N=1344,device_name=NVIDIA_H100_80GB_HBM3.json → triton_3_1_0/E=16,N=1344,device_name=NVIDIA_H100_80GB_HBM3.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=16,N=14336,device_name=NVIDIA_A100-SXM4-80GB,dtype=int8_w8a16.json → triton_3_1_0/E=16,N=14336,device_name=NVIDIA_A100-SXM4-80GB,dtype=int8_w8a16.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=16,N=14336,device_name=NVIDIA_A100-SXM4-80GB.json → triton_3_1_0/E=16,N=14336,device_name=NVIDIA_A100-SXM4-80GB.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=16,N=1792,device_name=NVIDIA_A100-SXM4-80GB,dtype=int8_w8a16.json → triton_3_1_0/E=16,N=1792,device_name=NVIDIA_A100-SXM4-80GB,dtype=int8_w8a16.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=16,N=1792,device_name=NVIDIA_A100-SXM4-80GB.json → triton_3_1_0/E=16,N=1792,device_name=NVIDIA_A100-SXM4-80GB.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=16,N=2048,device_name=NVIDIA_H100_80GB_HBM3.json → triton_3_1_0/E=16,N=2048,device_name=NVIDIA_H100_80GB_HBM3.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=16,N=2688,device_name=NVIDIA_A100-SXM4-80GB.json → triton_3_1_0/E=16,N=2688,device_name=NVIDIA_A100-SXM4-80GB.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=16,N=2688,device_name=NVIDIA_H100_80GB_HBM3.json → triton_3_1_0/E=16,N=2688,device_name=NVIDIA_H100_80GB_HBM3.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=16,N=3072,device_name=NVIDIA_A100-SXM4-80GB,dtype=int8_w8a16.json → triton_3_1_0/E=16,N=3072,device_name=NVIDIA_A100-SXM4-80GB,dtype=int8_w8a16.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=16,N=3072,device_name=NVIDIA_H100_80GB_HBM3,dtype=int8_w8a16.json → triton_3_1_0/E=16,N=3072,device_name=NVIDIA_H100_80GB_HBM3,dtype=int8_w8a16.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=16,N=3200,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8.json → triton_3_1_0/E=16,N=3200,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=16,N=3584,device_name=NVIDIA_A100-SXM4-80GB,dtype=int8_w8a16.json → triton_3_1_0/E=16,N=3584,device_name=NVIDIA_A100-SXM4-80GB,dtype=int8_w8a16.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=16,N=3584,device_name=NVIDIA_A100-SXM4-80GB.json → triton_3_1_0/E=16,N=3584,device_name=NVIDIA_A100-SXM4-80GB.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=16,N=6400,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8.json → triton_3_1_0/E=16,N=6400,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=16,N=7168,device_name=NVIDIA_A100-SXM4-80GB,dtype=int8_w8a16.json → triton_3_1_0/E=16,N=7168,device_name=NVIDIA_A100-SXM4-80GB,dtype=int8_w8a16.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=16,N=7168,device_name=NVIDIA_A100-SXM4-80GB.json → triton_3_1_0/E=16,N=7168,device_name=NVIDIA_A100-SXM4-80GB.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=16,N=7168,device_name=NVIDIA_H100_80GB_HBM3,dtype=int8_w8a16.json → triton_3_1_0/E=16,N=7168,device_name=NVIDIA_H100_80GB_HBM3,dtype=int8_w8a16.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=16,N=800,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8.json → triton_3_1_0/E=16,N=800,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=160,N=192,device_name=NVIDIA_A800-SXM4-80GB.json → triton_3_1_0/E=160,N=192,device_name=NVIDIA_A800-SXM4-80GB.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=20,N=2048,device_name=NVIDIA_H100_80GB_HBM3.json → triton_3_1_0/E=20,N=2048,device_name=NVIDIA_H100_80GB_HBM3.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=24,N=1024,device_name=NVIDIA_H100_80GB_HBM3.json → triton_3_1_0/E=24,N=1024,device_name=NVIDIA_H100_80GB_HBM3.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=256,N=128,device_name=NVIDIA_A100-SXM4-80GB,dtype=int8_w8a8,block_shape=[128, 128].json → triton_3_1_0/E=256,N=128,device_name=NVIDIA_A100-SXM4-80GB,dtype=int8_w8a8,block_shape=[128, 128].json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=256,N=128,device_name=NVIDIA_A100-SXM4-80GB,dtype=int8_w8a8.json → triton_3_1_0/E=256,N=128,device_name=NVIDIA_A100-SXM4-80GB,dtype=int8_w8a8.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=256,N=128,device_name=NVIDIA_A800-SXM4-80GB,dtype=int8_w8a8,block_shape=[128, 128].json → triton_3_1_0/E=256,N=128,device_name=NVIDIA_A800-SXM4-80GB,dtype=int8_w8a8,block_shape=[128, 128].json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=256,N=128,device_name=NVIDIA_A800-SXM4-80GB,dtype=int8_w8a8.json → triton_3_1_0/E=256,N=128,device_name=NVIDIA_A800-SXM4-80GB,dtype=int8_w8a8.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=256,N=128,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8,block_shape=[128, 128].json → triton_3_1_0/E=256,N=128,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8,block_shape=[128, 128].json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=256,N=128,device_name=NVIDIA_H20,block_shape=[128, 128].json → triton_3_1_0/E=256,N=128,device_name=NVIDIA_H20,block_shape=[128, 128].json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=256,N=128,device_name=NVIDIA_H20,dtype=fp8_w8a8,block_shape=[128, 128].json → triton_3_1_0/E=256,N=128,device_name=NVIDIA_H20,dtype=fp8_w8a8,block_shape=[128, 128].json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=256,N=128,device_name=NVIDIA_L20Y,dtype=fp8_w8a8,block_shape=[128, 128].json → triton_3_1_0/E=256,N=128,device_name=NVIDIA_L20Y,dtype=fp8_w8a8,block_shape=[128, 128].json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=256,N=256,device_name=AMD_Instinct_MI300X,dtype=fp8_w8a8,block_shape=[128, 128].json → triton_3_1_0/E=256,N=256,device_name=AMD_Instinct_MI300X,dtype=fp8_w8a8,block_shape=[128, 128].json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=256,N=256,device_name=AMD_Instinct_MI325X,dtype=fp8_w8a8,block_shape=[128, 128].json → triton_3_1_0/E=256,N=256,device_name=AMD_Instinct_MI325X,dtype=fp8_w8a8,block_shape=[128, 128].json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=256,N=256,device_name=AMD_Radeon_Graphics,dtype=fp8_w8a8,block_shape=[128, 128].json → triton_3_1_0/E=256,N=256,device_name=AMD_Radeon_Graphics,dtype=fp8_w8a8,block_shape=[128, 128].json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=256,N=256,device_name=NVIDIA_B200,dtype=fp8_w8a8,block_shape=[128, 128].json → triton_3_1_0/E=256,N=256,device_name=NVIDIA_B200,dtype=fp8_w8a8,block_shape=[128, 128].json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=256,N=256,device_name=NVIDIA_H20,dtype=int8_w8a8,block_shape=[128, 128].json → triton_3_1_0/E=256,N=256,device_name=NVIDIA_H20,dtype=int8_w8a8,block_shape=[128, 128].json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=256,N=256,device_name=NVIDIA_H200,dtype=fp8_w8a8,block_shape=[128, 128].json → triton_3_1_0/E=256,N=256,device_name=NVIDIA_H200,dtype=fp8_w8a8,block_shape=[128, 128].json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=256,N=64,device_name=NVIDIA_A800-SXM4-80GB.json → triton_3_1_0/E=256,N=64,device_name=NVIDIA_A800-SXM4-80GB.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=256,N=64,device_name=NVIDIA_L20,dtype=int8_w8a8.json → triton_3_1_0/E=256,N=64,device_name=NVIDIA_L20,dtype=int8_w8a8.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=256,N=64,device_name=NVIDIA_L40S,dtype=int8_w8a8.json → triton_3_1_0/E=256,N=64,device_name=NVIDIA_L40S,dtype=int8_w8a8.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=64,N=1024,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8,block_shape=[128, 128].json → triton_3_1_0/E=64,N=1024,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8,block_shape=[128, 128].json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=64,N=1280,device_name=NVIDIA_A100-SXM4-80GB.json → triton_3_1_0/E=64,N=1280,device_name=NVIDIA_A100-SXM4-80GB.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=64,N=1280,device_name=NVIDIA_A800-SXM4-80GB.json → triton_3_1_0/E=64,N=1280,device_name=NVIDIA_A800-SXM4-80GB.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=64,N=1280,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8.json → triton_3_1_0/E=64,N=1280,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=64,N=1280,device_name=NVIDIA_H100_80GB_HBM3.json → triton_3_1_0/E=64,N=1280,device_name=NVIDIA_H100_80GB_HBM3.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=64,N=1280,device_name=NVIDIA_H200,dtype=fp8_w8a8.json → triton_3_1_0/E=64,N=1280,device_name=NVIDIA_H200,dtype=fp8_w8a8.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=64,N=1280,device_name=NVIDIA_H200.json → triton_3_1_0/E=64,N=1280,device_name=NVIDIA_H200.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=64,N=2560,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8.json → triton_3_1_0/E=64,N=2560,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=64,N=2560,device_name=NVIDIA_H200,dtype=fp8_w8a8.json → triton_3_1_0/E=64,N=2560,device_name=NVIDIA_H200,dtype=fp8_w8a8.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=64,N=2560,device_name=NVIDIA_H200.json → triton_3_1_0/E=64,N=2560,device_name=NVIDIA_H200.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=64,N=320,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8.json → triton_3_1_0/E=64,N=320,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=64,N=320,device_name=NVIDIA_H100_80GB_HBM3.json → triton_3_1_0/E=64,N=320,device_name=NVIDIA_H100_80GB_HBM3.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=64,N=320,device_name=NVIDIA_H200,dtype=fp8_w8a8.json → triton_3_1_0/E=64,N=320,device_name=NVIDIA_H200,dtype=fp8_w8a8.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=64,N=320,device_name=NVIDIA_H200.json → triton_3_1_0/E=64,N=320,device_name=NVIDIA_H200.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=64,N=512,device_name=NVIDIA_H200,dtype=fp8_w8a8,block_shape=[128, 128].json → triton_3_1_0/E=64,N=512,device_name=NVIDIA_H200,dtype=fp8_w8a8,block_shape=[128, 128].json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=64,N=640,device_name=NVIDIA_A100-SXM4-80GB.json → triton_3_1_0/E=64,N=640,device_name=NVIDIA_A100-SXM4-80GB.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=64,N=640,device_name=NVIDIA_A800-SXM4-80GB.json → triton_3_1_0/E=64,N=640,device_name=NVIDIA_A800-SXM4-80GB.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=64,N=640,device_name=NVIDIA_GeForce_RTX_4090,dtype=fp8_w8a8.json → triton_3_1_0/E=64,N=640,device_name=NVIDIA_GeForce_RTX_4090,dtype=fp8_w8a8.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=64,N=640,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8.json → triton_3_1_0/E=64,N=640,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=64,N=640,device_name=NVIDIA_H100_80GB_HBM3.json → triton_3_1_0/E=64,N=640,device_name=NVIDIA_H100_80GB_HBM3.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=64,N=640,device_name=NVIDIA_H200,dtype=fp8_w8a8.json → triton_3_1_0/E=64,N=640,device_name=NVIDIA_H200,dtype=fp8_w8a8.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=64,N=640,device_name=NVIDIA_H200.json → triton_3_1_0/E=64,N=640,device_name=NVIDIA_H200.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=8,N=14336,device_name=AMD_Instinct_MI300X.json → triton_3_1_0/E=8,N=14336,device_name=AMD_Instinct_MI300X.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=8,N=14336,device_name=AMD_Instinct_MI325X.json → triton_3_1_0/E=8,N=14336,device_name=AMD_Instinct_MI325X.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=8,N=14336,device_name=AMD_Radeon_Graphics.json → triton_3_1_0/E=8,N=14336,device_name=AMD_Radeon_Graphics.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=8,N=14336,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8.json → triton_3_1_0/E=8,N=14336,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=8,N=14336,device_name=NVIDIA_H200,dtype=fp8_w8a8.json → triton_3_1_0/E=8,N=14336,device_name=NVIDIA_H200,dtype=fp8_w8a8.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=8,N=14336,device_name=NVIDIA_H200.json → triton_3_1_0/E=8,N=14336,device_name=NVIDIA_H200.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=8,N=1792,device_name=AMD_Instinct_MI300X.json → triton_3_1_0/E=8,N=1792,device_name=AMD_Instinct_MI300X.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=8,N=1792,device_name=AMD_Instinct_MI325X.json → triton_3_1_0/E=8,N=1792,device_name=AMD_Instinct_MI325X.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=8,N=1792,device_name=AMD_Radeon_Graphics.json → triton_3_1_0/E=8,N=1792,device_name=AMD_Radeon_Graphics.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=8,N=1792,device_name=NVIDIA_A100-SXM4-40GB.json → triton_3_1_0/E=8,N=1792,device_name=NVIDIA_A100-SXM4-40GB.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=8,N=1792,device_name=NVIDIA_A100-SXM4-80GB.json → triton_3_1_0/E=8,N=1792,device_name=NVIDIA_A100-SXM4-80GB.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=8,N=1792,device_name=NVIDIA_H100_80GB_HBM3.json → triton_3_1_0/E=8,N=1792,device_name=NVIDIA_H100_80GB_HBM3.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=8,N=1792,device_name=NVIDIA_H200,dtype=fp8_w8a8.json → triton_3_1_0/E=8,N=1792,device_name=NVIDIA_H200,dtype=fp8_w8a8.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=8,N=1792,device_name=NVIDIA_H200.json → triton_3_1_0/E=8,N=1792,device_name=NVIDIA_H200.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=8,N=2048,device_name=NVIDIA_A100-SXM4-80GB.json → triton_3_1_0/E=8,N=2048,device_name=NVIDIA_A100-SXM4-80GB.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=8,N=2048,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8.json → triton_3_1_0/E=8,N=2048,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=8,N=2048,device_name=NVIDIA_H100_80GB_HBM3.json → triton_3_1_0/E=8,N=2048,device_name=NVIDIA_H100_80GB_HBM3.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=8,N=2048,device_name=NVIDIA_H200,dtype=fp8_w8a8.json → triton_3_1_0/E=8,N=2048,device_name=NVIDIA_H200,dtype=fp8_w8a8.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=8,N=2048,device_name=NVIDIA_H200.json → triton_3_1_0/E=8,N=2048,device_name=NVIDIA_H200.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=8,N=3584,device_name=AMD_Instinct_MI300X.json → triton_3_1_0/E=8,N=3584,device_name=AMD_Instinct_MI300X.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=8,N=3584,device_name=AMD_Instinct_MI325X.json → triton_3_1_0/E=8,N=3584,device_name=AMD_Instinct_MI325X.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=8,N=3584,device_name=AMD_Radeon_Graphics.json → triton_3_1_0/E=8,N=3584,device_name=AMD_Radeon_Graphics.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=8,N=3584,device_name=NVIDIA_A100-SXM4-40GB.json → triton_3_1_0/E=8,N=3584,device_name=NVIDIA_A100-SXM4-40GB.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=8,N=3584,device_name=NVIDIA_A100-SXM4-80GB.json → triton_3_1_0/E=8,N=3584,device_name=NVIDIA_A100-SXM4-80GB.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=8,N=3584,device_name=NVIDIA_GeForce_RTX_4090,dtype=fp8_w8a8.json → triton_3_1_0/E=8,N=3584,device_name=NVIDIA_GeForce_RTX_4090,dtype=fp8_w8a8.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=8,N=3584,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8.json → triton_3_1_0/E=8,N=3584,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=8,N=3584,device_name=NVIDIA_H100_80GB_HBM3.json → triton_3_1_0/E=8,N=3584,device_name=NVIDIA_H100_80GB_HBM3.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=8,N=3584,device_name=NVIDIA_H200,dtype=fp8_w8a8.json → triton_3_1_0/E=8,N=3584,device_name=NVIDIA_H200,dtype=fp8_w8a8.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=8,N=3584,device_name=NVIDIA_H200.json → triton_3_1_0/E=8,N=3584,device_name=NVIDIA_H200.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=8,N=3584,device_name=NVIDIA_L40S.json → triton_3_1_0/E=8,N=3584,device_name=NVIDIA_L40S.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=8,N=4096,device_name=AMD_Instinct_MI300X,dtype=fp8_w8a8.json → triton_3_1_0/E=8,N=4096,device_name=AMD_Instinct_MI300X,dtype=fp8_w8a8.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=8,N=4096,device_name=AMD_Instinct_MI325X,dtype=fp8_w8a8.json → triton_3_1_0/E=8,N=4096,device_name=AMD_Instinct_MI325X,dtype=fp8_w8a8.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=8,N=4096,device_name=AMD_Radeon_Graphics,dtype=fp8_w8a8.json → triton_3_1_0/E=8,N=4096,device_name=AMD_Radeon_Graphics,dtype=fp8_w8a8.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=8,N=4096,device_name=NVIDIA_A100-SXM4-80GB.json → triton_3_1_0/E=8,N=4096,device_name=NVIDIA_A100-SXM4-80GB.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=8,N=4096,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8.json → triton_3_1_0/E=8,N=4096,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=8,N=4096,device_name=NVIDIA_H100_80GB_HBM3.json → triton_3_1_0/E=8,N=4096,device_name=NVIDIA_H100_80GB_HBM3.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=8,N=4096,device_name=NVIDIA_H200,dtype=fp8_w8a8.json → triton_3_1_0/E=8,N=4096,device_name=NVIDIA_H200,dtype=fp8_w8a8.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=8,N=4096,device_name=NVIDIA_H200.json → triton_3_1_0/E=8,N=4096,device_name=NVIDIA_H200.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=8,N=7168,device_name=AMD_Instinct_MI300X.json → triton_3_1_0/E=8,N=7168,device_name=AMD_Instinct_MI300X.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=8,N=7168,device_name=AMD_Instinct_MI325X.json → triton_3_1_0/E=8,N=7168,device_name=AMD_Instinct_MI325X.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=8,N=7168,device_name=AMD_Radeon_Graphics.json → triton_3_1_0/E=8,N=7168,device_name=AMD_Radeon_Graphics.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=8,N=7168,device_name=NVIDIA_A100-SXM4-80GB.json → triton_3_1_0/E=8,N=7168,device_name=NVIDIA_A100-SXM4-80GB.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=8,N=7168,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8.json → triton_3_1_0/E=8,N=7168,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=8,N=7168,device_name=NVIDIA_H100_80GB_HBM3.json → triton_3_1_0/E=8,N=7168,device_name=NVIDIA_H100_80GB_HBM3.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=8,N=7168,device_name=NVIDIA_H200,dtype=fp8_w8a8.json → triton_3_1_0/E=8,N=7168,device_name=NVIDIA_H200,dtype=fp8_w8a8.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=8,N=7168,device_name=NVIDIA_H200.json → triton_3_1_0/E=8,N=7168,device_name=NVIDIA_H200.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=8,N=8192,device_name=AMD_Instinct_MI300X,dtype=fp8_w8a8.json → triton_3_1_0/E=8,N=8192,device_name=AMD_Instinct_MI300X,dtype=fp8_w8a8.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=8,N=8192,device_name=AMD_Instinct_MI325X,dtype=fp8_w8a8.json → triton_3_1_0/E=8,N=8192,device_name=AMD_Instinct_MI325X,dtype=fp8_w8a8.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=8,N=8192,device_name=AMD_Radeon_Graphics,dtype=fp8_w8a8.json → triton_3_1_0/E=8,N=8192,device_name=AMD_Radeon_Graphics,dtype=fp8_w8a8.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=8,N=8192,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8.json → triton_3_1_0/E=8,N=8192,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=8,N=8192,device_name=NVIDIA_H200,dtype=fp8_w8a8.json → triton_3_1_0/E=8,N=8192,device_name=NVIDIA_H200,dtype=fp8_w8a8.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=128,N=192,device_name=NVIDIA_A800-SXM4-80GB.json → triton_3_2_0/E=128,N=192,device_name=NVIDIA_A800-SXM4-80GB.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=128,N=192,device_name=NVIDIA_H100_80GB_HBM3.json → triton_3_2_0/E=128,N=192,device_name=NVIDIA_H100_80GB_HBM3.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=128,N=192,device_name=NVIDIA_H20.json → triton_3_2_0/E=128,N=192,device_name=NVIDIA_H20.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=128,N=192,device_name=NVIDIA_H200.json → triton_3_2_0/E=128,N=192,device_name=NVIDIA_H200.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=128,N=384,device_name=NVIDIA_H100_80GB_HBM3.json → triton_3_2_0/E=128,N=384,device_name=NVIDIA_H100_80GB_HBM3.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=128,N=384,device_name=NVIDIA_H20,dtype=fp8_w8a8,block_shape=[128, 128].json → triton_3_2_0/E=128,N=384,device_name=NVIDIA_H20,dtype=fp8_w8a8,block_shape=[128, 128].json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=128,N=384,device_name=NVIDIA_H20.json → triton_3_2_0/E=128,N=384,device_name=NVIDIA_H20.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=128,N=384,device_name=NVIDIA_H200,dtype=fp8_w8a8,block_shape=[128, 128].json → triton_3_2_0/E=128,N=384,device_name=NVIDIA_H200,dtype=fp8_w8a8,block_shape=[128, 128].json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=128,N=384,device_name=NVIDIA_H200.json → triton_3_2_0/E=128,N=384,device_name=NVIDIA_H200.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=128,N=512,device_name=NVIDIA_H100_80GB_HBM3.json → triton_3_2_0/E=128,N=512,device_name=NVIDIA_H100_80GB_HBM3.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=128,N=768,device_name=NVIDIA_A800-SXM4-80GB.json → triton_3_2_0/E=128,N=768,device_name=NVIDIA_A800-SXM4-80GB.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=128,N=768,device_name=NVIDIA_H100_80GB_HBM3.json → triton_3_2_0/E=128,N=768,device_name=NVIDIA_H100_80GB_HBM3.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=128,N=768,device_name=NVIDIA_H20,dtype=fp8_w8a8,block_shape=[128, 128].json → triton_3_2_0/E=128,N=768,device_name=NVIDIA_H20,dtype=fp8_w8a8,block_shape=[128, 128].json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=128,N=768,device_name=NVIDIA_H20.json → triton_3_2_0/E=128,N=768,device_name=NVIDIA_H20.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=128,N=768,device_name=NVIDIA_H200,dtype=fp8_w8a8,block_shape=[128, 128].json → triton_3_2_0/E=128,N=768,device_name=NVIDIA_H200,dtype=fp8_w8a8,block_shape=[128, 128].json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=128,N=768,device_name=NVIDIA_H200.json → triton_3_2_0/E=128,N=768,device_name=NVIDIA_H200.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=128,N=96,device_name=NVIDIA_H20.json → triton_3_2_0/E=128,N=96,device_name=NVIDIA_H20.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=264,N=128,device_name=NVIDIA_A800-SXM4-80GB,dtype=int8_w8a8.json → triton_3_2_0/E=264,N=128,device_name=NVIDIA_A800-SXM4-80GB,dtype=int8_w8a8.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=264,N=256,device_name=NVIDIA_B200,dtype=fp8_w8a8,block_shape=[128, 128].json → triton_3_2_0/E=264,N=256,device_name=NVIDIA_B200,dtype=fp8_w8a8,block_shape=[128, 128].json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=264,N=256,device_name=NVIDIA_H20,dtype=fp8_w8a8,block_shape=[128, 128].json → triton_3_2_0/E=264,N=256,device_name=NVIDIA_H20,dtype=fp8_w8a8,block_shape=[128, 128].json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=264,N=256,device_name=NVIDIA_H200,dtype=fp8_w8a8,block_shape=[128, 128].json → triton_3_2_0/E=264,N=256,device_name=NVIDIA_H200,dtype=fp8_w8a8,block_shape=[128, 128].json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=272,N=128,device_name=NVIDIA_A800-SXM4-80GB,dtype=int8_w8a8.json → triton_3_2_0/E=272,N=128,device_name=NVIDIA_A800-SXM4-80GB,dtype=int8_w8a8.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=272,N=128,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8,block_shape=[128, 128].json → triton_3_2_0/E=272,N=128,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8,block_shape=[128, 128].json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=272,N=128,device_name=NVIDIA_H20,dtype=fp8_w8a8,block_shape=[128, 128].json → triton_3_2_0/E=272,N=128,device_name=NVIDIA_H20,dtype=fp8_w8a8,block_shape=[128, 128].json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=272,N=64,device_name=NVIDIA_A800-SXM4-80GB.json → triton_3_2_0/E=272,N=64,device_name=NVIDIA_A800-SXM4-80GB.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=288,N=64,device_name=NVIDIA_A800-SXM4-80GB.json → triton_3_2_0/E=288,N=64,device_name=NVIDIA_A800-SXM4-80GB.json} +0 -0
- {sglang-0.4.6.post5.dist-info → sglang-0.4.7.dist-info}/licenses/LICENSE +0 -0
- {sglang-0.4.6.post5.dist-info → sglang-0.4.7.dist-info}/top_level.txt +0 -0
sglang/srt/models/transformers.py
@@ -0,0 +1,291 @@
+# Copyright 2025 SGLang Team
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+# ==============================================================================
+
+# Adapted from
+# https://github.com/vllm-project/vllm/blob/a1a2aaadb9122f05667140e39cf67e5736c8b6d6/vllm/model_executor/models/transformers.py
+"""Wrapper around `transformers` models"""
+import logging
+import re
+from typing import Iterable, Literal, Optional, Tuple, Union
+
+import torch
+from torch import nn
+from transformers import AutoModel, PretrainedConfig, PreTrainedModel
+from transformers.modeling_utils import ALL_ATTENTION_FUNCTIONS
+
+from sglang.srt.distributed import divide, get_tensor_model_parallel_world_size
+from sglang.srt.layers.linear import (
+    ColumnParallelLinear,
+    ReplicatedLinear,
+    RowParallelLinear,
+)
+from sglang.srt.layers.logits_processor import LogitsProcessor, LogitsProcessorOutput
+from sglang.srt.layers.quantization.base_config import QuantizationConfig
+from sglang.srt.layers.radix_attention import RadixAttention
+from sglang.srt.layers.vocab_parallel_embedding import (
+    ParallelLMHead,
+    VocabParallelEmbedding,
+)
+from sglang.srt.model_executor.forward_batch_info import ForwardBatch
+from sglang.srt.model_loader.weight_utils import default_weight_loader
+
+logger = logging.getLogger(__name__)
+
+
+def maybe_prefix(prefix: str, name: str) -> str:
+    """Add a prefix to a name if the prefix is non-empty.
+
+    Args:
+        prefix: The prefix to add. If empty, no prefix will be added.
+        name: The name to potentially prefix.
+
+    Returns:
+        The string "prefix.name" if prefix was non-empty, otherwise just "name".
+    """
+    return name if not prefix else f"{prefix}.{name}"
+
+
+def sglang_flash_attention_forward(
+    # Transformers args
+    module: torch.nn.Module,
+    query: torch.Tensor,
+    key: torch.Tensor,
+    value: torch.Tensor,
+    attention_mask: torch.Tensor,
+    # sglang kwargs
+    forward_batch: ForwardBatch,
+    # Transformers kwargs
+    scaling: float = None,
+    attention_instances: list[RadixAttention] = None,
+    **kwargs,
+):
+    self_attn: RadixAttention = attention_instances[module.layer_idx]
+    if scaling is not None:
+        self_attn.scaling = float(scaling)
+    hidden = query.shape[-2]
+    query, key, value = (x.transpose(1, 2) for x in (query, key, value))
+    query, key, value = (x.reshape(hidden, -1) for x in (query, key, value))
+    return self_attn.forward(query, key, value, forward_batch=forward_batch), None
+
+
+ALL_ATTENTION_FUNCTIONS["sglang"] = sglang_flash_attention_forward
+
+
+class HFColumnParallelLinear(ColumnParallelLinear):
+
+    def forward(self, input: torch.Tensor) -> torch.Tensor:
+        return super().forward(input)[0]
+
+
+class HFRowParallelLinear(RowParallelLinear):
+
+    def forward(self, input: torch.Tensor) -> torch.Tensor:
+        return super().forward(input)[0]
+
+
+def replace_linear_class(
+    linear: nn.Linear,
+    style: Literal["colwise", "rowwise"],
+    quant_config: QuantizationConfig,
+) -> Union[ColumnParallelLinear, RowParallelLinear]:
+    """
+    Replace nn.Linear with one of vLLM's tensor parallel linear classes.
+
+    Args:
+        linear (nn.Linear): `nn.Linear` to be replaced.
+        style (str): Tensor parallel style of the new linear, e.g. "colwise".
+        quant_config (QuantConfig): Quantization config for the new linear.
+    Returns:
+        Union[ColumnParallelLinear, RowParallelLinear]: The new linear.
+    """
+
+    if not isinstance(style, str):
+        raise ValueError(f"Unsupported parallel style type {type(style)}, expected str")
+
+    sglang_linear_cls = {
+        "colwise": ColumnParallelLinear,
+        "rowwise": RowParallelLinear,
+    }.get(style, ReplicatedLinear)
+
+    class HFCompatibleLinear(sglang_linear_cls):
+        """
+        Wrapper class that removes `output_bias` from returned output.
+        """
+
+        @property
+        def parent_cls(self) -> type:
+            return sglang_linear_cls
+
+        def forward(self, input: torch.Tensor) -> torch.Tensor:
+            return super().forward(input)[0]
+
+    return HFCompatibleLinear(
+        input_size=linear.in_features,
+        output_size=linear.out_features,
+        bias=linear.bias is not None,
+        quant_config=quant_config,
+    )
+
+
+class TransformersForCausalLM(nn.Module):
+
+    def __init__(
+        self,
+        config: PretrainedConfig,
+        quant_config: Optional[QuantizationConfig] = None,
+        prefix: str = "",
+    ) -> None:
+        super().__init__()
+        logger.info("Using Transformers backend.")
+
+        self.quant_config = quant_config
+        self.config = config
+        self.vocab_size = config.vocab_size
+        self.unpadded_vocab_size = config.vocab_size
+
+        # model is loaded under set_default_torch_dtype(model_config.dtype)
+        self.model: PreTrainedModel = AutoModel.from_config(
+            self.config,
+            torch_dtype=torch.get_default_dtype(),
+            attn_implementation="sglang",
+            trust_remote_code=True,
+        )
+
+        # Attention modifications (assumes 1 attention op per hidden layer)
+        tp_size = get_tensor_model_parallel_world_size()
+
+        # MLP modifications
+        self.tensor_parallel(tp_size)
+
+        head_dim = (
+            (config.hidden_size // config.num_attention_heads)
+            if not hasattr(config, "head_dim")
+            else config.head_dim
+        )
+        self.attention_instances = [
+            RadixAttention(
+                num_heads=divide(config.num_attention_heads, tp_size),
+                head_dim=head_dim,
+                # NOTE: We use Llama scale as default, if it's set by
+                # Transformers, it's updated in sglang_flash_attention_forward
+                scaling=head_dim**-0.5,
+                num_kv_heads=divide(config.num_key_value_heads, tp_size),
+                layer_id=i,
+                quant_config=self.quant_config,
+                prefix=f"{i}.attn",
+            )
+            for i in range(config.num_hidden_layers)
+        ]
+
+        # Model modifications
+        self.replace_vocab_embed_class(self.model)
+
+        # ForCausalLM modifications
+        self.lm_head = ParallelLMHead(
+            config.vocab_size,
+            config.hidden_size,
+            quant_config=self.quant_config,
+            prefix=maybe_prefix(prefix, "lm_head"),
+        )
+        if config.tie_word_embeddings:
+            self.lm_head.weight = self.model.get_input_embeddings().weight
+
+        self.logits_processor = LogitsProcessor(config)
+
+    def log_replacement(self, name: str, old_module: nn.Module, new_module: nn.Module):
+        logger.debug("%s: %s -> %s", name, old_module, new_module)
+
+    def tensor_parallel(self, tp_size: int):
+        """
+        Apply the model's tensor parallelization plan.
+        Currently only supports linear layers.
+        """
+        if not self.model.supports_tp_plan:
+            if tp_size <= 1:
+                return
+
+            raise ValueError(
+                f"{type(self.model)} does not support tensor parallel yet!"
+            )
+
+        tp_plan = self.model._tp_plan
+
+        def _tensor_parallel(module: nn.Module, prefix: str = ""):
+            for child_name, child_module in module.named_children():
+                qual_name = maybe_prefix(prefix, child_name)
+                for pattern, style in tp_plan.items():
+                    if re.match(pattern, qual_name) and isinstance(
+                        child_module, nn.Linear
+                    ):
+                        new_module = replace_linear_class(
+                            child_module, style, self.quant_config
+                        )
+                        setattr(module, child_name, new_module)
+                        self.log_replacement(qual_name, child_module, new_module)
+                else:
+                    _tensor_parallel(child_module, prefix=qual_name)
+
+        _tensor_parallel(self.model)
+
+    def replace_vocab_embed_class(self, module: nn.Module):
+        # Use native set input embeddings
+        new_module = VocabParallelEmbedding(
+            self.vocab_size,
+            self.config.hidden_size,
+            org_num_embeddings=self.config.vocab_size,
+            quant_config=None,
+        )
+        self.log_replacement(
+            "input embedding", self.model.get_input_embeddings(), new_module
+        )
+        self.model.set_input_embeddings(new_module)
+
+    @torch.no_grad()
+    def forward(
+        self,
+        input_ids: torch.Tensor,
+        positions: torch.Tensor,
+        forward_batch: ForwardBatch,
+        input_embeds: torch.Tensor = None,
+        get_embedding: bool = False,
+    ) -> LogitsProcessorOutput:
+        assert get_embedding is False, "embedding is not supported yet"
+        aux_hidden_states = None
+        hidden_states = self.model(
+            input_ids[None, ...],
+            use_cache=False,
+            position_ids=positions[None, ...],
+            forward_batch=forward_batch,
+            attention_instances=self.attention_instances,
+            return_dict=False,
+        )[0][
+            0, ...
+        ]  # we remove batch dimension for now
+
+        return self.logits_processor(
+            input_ids, hidden_states, self.lm_head, forward_batch, aux_hidden_states
+        )
+
+    def load_weights(self, weights: Iterable[Tuple[str, torch.Tensor]]):
+        params_dict = dict(self.named_parameters())
+        for name, loaded_weight in weights:
+            if name not in params_dict:
+                name = f"{self.model.base_model_prefix}.{name}"
+            if name in params_dict:
+                param = params_dict[name]
+                weight_loader = getattr(param, "weight_loader", default_weight_loader)
+                weight_loader(param, loaded_weight)
+
+
+EntryClass = [TransformersForCausalLM]
sglang/srt/openai_api/adapter.py
CHANGED
@@ -69,10 +69,16 @@ from sglang.srt.openai_api.protocol import (
     FunctionResponse,
     LogProbs,
     MultimodalEmbeddingInput,
+    ScoringRequest,
+    ScoringResponse,
     ToolCall,
     TopLogprob,
     UsageInfo,
 )
+from sglang.srt.openai_api.utils import (
+    detect_template_content_format,
+    process_content_for_template_format,
+)
 from sglang.srt.reasoning_parser import ReasoningParser
 from sglang.utils import convert_json_schema_to_str, get_exception_traceback
 
@@ -80,6 +86,11 @@ logger = logging.getLogger(__name__)
 
 chat_template_name = None
 
+# Global cache for template content format detection (one model/template per instance)
+# NOTE: A better approach would be to initialize the chat template format when the endpoint is created
+_cached_chat_template = None
+_cached_template_format = None
+
 
 class FileMetadata:
     def __init__(self, filename: str, purpose: str):
@@ -604,6 +615,9 @@ def v1_generate_request(
         stream=all_requests[0].stream,
         rid=request_ids,
         lora_path=lora_paths,
+        bootstrap_host=all_requests[0].bootstrap_host,
+        bootstrap_port=all_requests[0].bootstrap_port,
+        bootstrap_room=all_requests[0].bootstrap_room,
     )
 
     return adapted_request, all_requests if len(all_requests) > 1 else all_requests[0]
@@ -995,23 +1009,42 @@ def v1_chat_generate_request(
 
         if chat_template_name is None:
             openai_compatible_messages = []
+            image_data = []
+            audio_data = []
+            modalities = []
+
+            # Detect template content format by analyzing the jinja template (cached globally)
+            global _cached_chat_template, _cached_template_format
+            current_template = tokenizer_manager.tokenizer.chat_template
+
+            if current_template != _cached_chat_template:
+                # Template changed or first time - analyze it
+                _cached_chat_template = current_template
+                _cached_template_format = detect_template_content_format(
+                    current_template
+                )
+                logger.info(
+                    f"Detected chat template content format: {_cached_template_format}"
+                )
+
+            template_content_format = _cached_template_format
 
             for message in request.messages:
                 if message.content is None:
                     message.content = ""
-                msg_dict = message.
-                [old lines 1003-1014: removed code not preserved in this diff rendering]
+                msg_dict = message.model_dump()
+
+                # Process content based on detected template format
+                processed_msg = process_content_for_template_format(
+                    msg_dict,
+                    template_content_format,
+                    image_data,
+                    audio_data,
+                    modalities,
+                )
+                openai_compatible_messages.append(processed_msg)
+
+            # Handle assistant prefix for continue_final_message
             if (
                 openai_compatible_messages
                 and openai_compatible_messages[-1]["role"] == "assistant"
@@ -1065,9 +1098,9 @@ def v1_chat_generate_request(
         if is_multimodal:
             prompt = tokenizer_manager.tokenizer.decode(prompt_ids)
             stop = request.stop
-            image_data = None
-            audio_data = None
-            modalities = []
+            image_data = image_data if image_data else None
+            audio_data = audio_data if audio_data else None
+            modalities = modalities if modalities else []
         else:
             conv = generate_chat_conv(request, chat_template_name)
             # If we should continue the final assistant message, adjust the conversation.
@@ -1327,7 +1360,6 @@ def v1_chat_generate_response(
             tool_calls = [
                 ToolCall(
                     id=f"call_{base64.urlsafe_b64encode(uuid.uuid4().bytes).rstrip(b'=').decode()}",
-                    index=call_info.tool_index,
                     function=FunctionResponse(
                         name=call_info.name, arguments=call_info.parameters
                     ),
@@ -1391,7 +1423,9 @@ def v1_chat_generate_response(
             "id": ret[i]["meta_info"]["id"],
             "object": "chat.completion",
             "created": created,
-            "model":
+            "model": (
+                request[i].model if isinstance(request, list) else request.model
+            ),
             "choices": choice,
             "usage": {
                 "prompt_tokens": ret[i]["meta_info"]["prompt_tokens"],
@@ -1618,14 +1652,14 @@ async def v1_chat_completions(
                             latest_delta_len = len(call_item.parameters)
 
                         expected_call = json.dumps(
-                            parser.
-
-
+                            parser.detector.prev_tool_call_arr[index].get(
+                                "arguments", {}
+                            ),
                             ensure_ascii=False,
                         )
-                        actual_call = parser.
-
-                        ]
+                        actual_call = parser.detector.streamed_args_for_tool[
+                            index
+                        ]
                         if latest_delta_len > 0:
                             actual_call = actual_call[:-latest_delta_len]
                         remaining_call = expected_call.replace(
@@ -1926,3 +1960,31 @@ def to_openai_style_logprobs(
         append_top_logprobs(output_top_logprobs)
 
     return ret_logprobs
+
+
+async def v1_score(tokenizer_manager, raw_request):
+    try:
+        # Parse request
+        request_data = await raw_request.json()
+        request = ScoringRequest(**request_data)
+
+        # Use tokenizer_manager's score_request method directly
+        scores = await tokenizer_manager.score_request(
+            query=request.query,
+            items=request.items,
+            label_token_ids=request.label_token_ids,
+            apply_softmax=request.apply_softmax,
+            item_first=request.item_first,
+            request=request,
+        )
+
+        # Create response with just the scores, without usage info
+        response = ScoringResponse(
+            scores=scores,
+            model=request.model,
+        )
+        return response
+
+    except Exception as e:
+        logger.error(f"Error in v1_score: {str(e)}")
+        return create_error_response(str(e))
@@ -183,12 +183,17 @@ class CompletionRequest(BaseModel):
     lora_path: Optional[Union[List[Optional[str]], Optional[str]]] = None
     session_params: Optional[Dict] = None
 
+    # For PD disaggregation
+    bootstrap_host: Optional[str] = None
+    bootstrap_port: Optional[int] = None
+    bootstrap_room: Optional[int] = None
+
 
 class CompletionResponseChoice(BaseModel):
     index: int
     text: str
     logprobs: Optional[LogProbs] = None
-    finish_reason: Literal["stop", "length", "content_filter"]
+    finish_reason: Literal["stop", "length", "content_filter", "abort"]
     matched_stop: Union[None, int, str] = None
 
 
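The new `bootstrap_host`/`bootstrap_port`/`bootstrap_room` fields let a completion request carry prefill/decode (PD) disaggregation coordinates alongside the usual sampling parameters. An example payload; the bootstrap values below are placeholders:

```python
# Example /v1/completions payload using the new PD-disaggregation fields.
# Host, port, and room values are placeholders for illustration.
payload = {
    "model": "meta-llama/Llama-3.1-8B-Instruct",
    "prompt": "Hello",
    "max_tokens": 16,
    "bootstrap_host": "10.0.0.12",
    "bootstrap_port": 8998,
    "bootstrap_room": 42,
}
```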
@@ -413,7 +418,7 @@ class ChatCompletionResponseChoice(BaseModel):
     message: ChatMessage
     logprobs: Optional[Union[LogProbs, ChoiceLogprobs]] = None
     finish_reason: Literal[
-        "stop", "length", "tool_calls", "content_filter", "function_call"
+        "stop", "length", "tool_calls", "content_filter", "function_call", "abort"
     ]
     matched_stop: Union[None, int, str] = None
 
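With `"abort"` added to both finish-reason literals, clients that switch on `finish_reason` should treat it as an incomplete generation. A minimal client-side sketch:

```python
# Minimal sketch: "abort" means the generation was cut short (e.g. the request
# was aborted) rather than reaching a stop condition.
def finished_normally(choice: dict) -> bool:
    return choice.get("finish_reason") not in ("abort", None)
```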
@@ -484,3 +489,27 @@ class EmbeddingResponse(BaseModel):
     model: str
     object: str = "list"
     usage: Optional[UsageInfo] = None
+
+
+class ScoringRequest(BaseModel):
+    query: Optional[Union[str, List[int]]] = (
+        None  # Query text or pre-tokenized token IDs
+    )
+    items: Optional[Union[str, List[str], List[List[int]]]] = (
+        None  # Item text(s) or pre-tokenized token IDs
+    )
+    label_token_ids: Optional[List[int]] = (
+        None  # Token IDs to compute probabilities for
+    )
+    apply_softmax: bool = False
+    item_first: bool = False
+    model: str
+
+
+class ScoringResponse(BaseModel):
+    scores: List[
+        List[float]
+    ]  # List of lists of probabilities, each in the order of label_token_ids
+    model: str
+    usage: Optional[UsageInfo] = None
+    object: str = "scoring"
@@ -0,0 +1,172 @@
+"""
+Utility functions for OpenAI API adapter.
+"""
+
+import logging
+from typing import Dict, List
+
+import jinja2.nodes
+import transformers.utils.chat_template_utils as hf_chat_utils
+
+logger = logging.getLogger(__name__)
+
+# ============================================================================
+# JINJA TEMPLATE CONTENT FORMAT DETECTION
+# ============================================================================
+#
+# This adapts vLLM's approach for detecting chat template content format:
+# https://github.com/vllm-project/vllm/blob/02f0c7b220422792f5e53de2a7d51d2d3ff2df28/vllm/entrypoints/chat_utils.py#L296-L313
+# - Analyzes Jinja template AST to detect content iteration patterns
+# - 'openai' format: templates with {%- for content in message['content'] -%} loops
+# - 'string' format: templates that expect simple string content
+# - Processes content accordingly to match template expectations
+
+
+def _is_var_access(node: jinja2.nodes.Node, varname: str) -> bool:
+    """Check if node is a variable access like {{ varname }}"""
+    if isinstance(node, jinja2.nodes.Name):
+        return node.ctx == "load" and node.name == varname
+    return False
+
+
+def _is_attr_access(node: jinja2.nodes.Node, varname: str, key: str) -> bool:
+    """Check if node is an attribute access like {{ varname['key'] }} or {{ varname.key }}"""
+    if isinstance(node, jinja2.nodes.Getitem):
+        return (
+            _is_var_access(node.node, varname)
+            and isinstance(node.arg, jinja2.nodes.Const)
+            and node.arg.value == key
+        )
+
+    if isinstance(node, jinja2.nodes.Getattr):
+        return _is_var_access(node.node, varname) and node.attr == key
+
+    return False
+
+
+def _is_var_or_elems_access(
+    node: jinja2.nodes.Node,
+    varname: str,
+    key: str = None,
+) -> bool:
+    """Check if node accesses varname or varname[key] with filters/tests"""
+    if isinstance(node, jinja2.nodes.Filter):
+        return node.node is not None and _is_var_or_elems_access(
+            node.node, varname, key
+        )
+    if isinstance(node, jinja2.nodes.Test):
+        return _is_var_or_elems_access(node.node, varname, key)
+
+    if isinstance(node, jinja2.nodes.Getitem) and isinstance(
+        node.arg, jinja2.nodes.Slice
+    ):
+        return _is_var_or_elems_access(node.node, varname, key)
+
+    return _is_attr_access(node, varname, key) if key else _is_var_access(node, varname)
+
+
+def _try_extract_ast(chat_template: str):
+    """Try to parse the Jinja template into an AST"""
+    try:
+        jinja_compiled = hf_chat_utils._compile_jinja_template(chat_template)
+        return jinja_compiled.environment.parse(chat_template)
+    except Exception as e:
+        logger.debug(f"Error when compiling Jinja template: {e}")
+        return None
+
+
+def detect_template_content_format(chat_template: str) -> str:
+    """
+    Detect whether a chat template expects 'string' or 'openai' content format.
+
+    - 'string': content is a simple string (like DeepSeek templates)
+    - 'openai': content is a list of structured dicts (like Llama4 templates)
+
+    Detection logic:
+    - If template has loops like {%- for content in message['content'] -%} → 'openai'
+    - Otherwise → 'string'
+    """
+    jinja_ast = _try_extract_ast(chat_template)
+    if jinja_ast is None:
+        return "string"
+
+    try:
+        # Look for patterns like: {%- for content in message['content'] -%}
+        for loop_ast in jinja_ast.find_all(jinja2.nodes.For):
+            loop_iter = loop_ast.iter
+
+            # Check if iterating over message['content'] or similar
+            if _is_var_or_elems_access(loop_iter, "message", "content"):
+                return "openai"  # Found content iteration → openai format
+
+        return "string"  # No content loops found → string format
+    except Exception as e:
+        logger.debug(f"Error when parsing AST of Jinja template: {e}")
+        return "string"
+
+
+def process_content_for_template_format(
+    msg_dict: dict,
+    content_format: str,
+    image_data: list,
+    audio_data: list,
+    modalities: list,
+) -> dict:
+    """
+    Process message content based on detected template format.
+
+    Args:
+        msg_dict: Message dictionary with content
+        content_format: 'string' or 'openai' (detected via AST analysis)
+        image_data: List to append extracted image URLs
+        audio_data: List to append extracted audio URLs
+        modalities: List to append modalities
+
+    Returns:
+        Processed message dictionary
+    """
+    if not isinstance(msg_dict.get("content"), list):
+        # Already a string or None, no processing needed
+        return {k: v for k, v in msg_dict.items() if v is not None}
+
+    if content_format == "openai":
+        # OpenAI format: preserve structured content list, normalize types
+        processed_content_parts = []
+        for chunk in msg_dict["content"]:
+            if isinstance(chunk, dict):
+                chunk_type = chunk.get("type")
+
+                if chunk_type == "image_url":
+                    image_data.append(chunk["image_url"]["url"])
+                    if chunk.get("modalities"):
+                        modalities.append(chunk.get("modalities"))
+                    # Normalize to simple 'image' type for template compatibility
+                    processed_content_parts.append({"type": "image"})
+                elif chunk_type == "audio_url":
+                    audio_data.append(chunk["audio_url"]["url"])
+                    # Normalize to simple 'audio' type
+                    processed_content_parts.append({"type": "audio"})
+                else:
+                    # Keep other content as-is (text, etc.)
+                    processed_content_parts.append(chunk)
+
+        new_msg = {
+            k: v for k, v in msg_dict.items() if v is not None and k != "content"
+        }
+        new_msg["content"] = processed_content_parts
+        return new_msg
+
+    else:  # content_format == "string"
+        # String format: flatten to text only (for templates like DeepSeek)
+        text_parts = []
+        for chunk in msg_dict["content"]:
+            if isinstance(chunk, dict) and chunk.get("type") == "text":
+                text_parts.append(chunk["text"])
+        # Note: For string format, we ignore images/audio since the template
+        # doesn't expect structured content - multimodal placeholders would
+        # need to be inserted differently
+
+        new_msg = msg_dict.copy()
+        new_msg["content"] = " ".join(text_parts) if text_parts else ""
+        new_msg = {k: v for k, v in new_msg.items() if v is not None}
+        return new_msg