sglang 0.4.6.post5__py3-none-any.whl → 0.4.7.post1__py3-none-any.whl
This diff shows the changes between two publicly released versions of the package, as published to a supported public registry. It is provided for informational purposes only and reflects the package contents as they appear in that registry.
- sglang/__init__.py +2 -0
- sglang/api.py +7 -0
- sglang/bench_offline_throughput.py +10 -4
- sglang/bench_one_batch_server.py +67 -11
- sglang/bench_serving.py +86 -75
- sglang/lang/backend/runtime_endpoint.py +24 -1
- sglang/lang/interpreter.py +40 -1
- sglang/lang/ir.py +27 -0
- sglang/math_utils.py +8 -0
- sglang/profiler.py +167 -0
- sglang/srt/_custom_ops.py +34 -0
- sglang/srt/configs/internvl.py +8 -12
- sglang/srt/configs/model_config.py +33 -1
- sglang/srt/constrained/base_grammar_backend.py +5 -2
- sglang/srt/constrained/llguidance_backend.py +9 -8
- sglang/srt/constrained/outlines_backend.py +5 -4
- sglang/srt/constrained/xgrammar_backend.py +18 -18
- sglang/srt/conversation.py +52 -8
- sglang/srt/custom_op.py +38 -3
- sglang/srt/debug_utils.py +74 -0
- sglang/srt/disaggregation/base/__init__.py +1 -1
- sglang/srt/disaggregation/base/conn.py +25 -11
- sglang/srt/disaggregation/common/__init__.py +5 -0
- sglang/srt/disaggregation/common/conn.py +407 -0
- sglang/srt/disaggregation/common/utils.py +42 -0
- sglang/srt/disaggregation/decode.py +261 -52
- sglang/srt/disaggregation/fake/__init__.py +1 -1
- sglang/srt/disaggregation/fake/conn.py +16 -9
- sglang/srt/disaggregation/kv_events.py +60 -5
- sglang/srt/disaggregation/launch_lb.py +140 -0
- sglang/srt/disaggregation/mini_lb.py +29 -48
- sglang/srt/disaggregation/mooncake/__init__.py +1 -1
- sglang/srt/disaggregation/mooncake/conn.py +446 -149
- sglang/srt/disaggregation/mooncake/transfer_engine.py +32 -16
- sglang/srt/disaggregation/nixl/__init__.py +6 -1
- sglang/srt/disaggregation/nixl/conn.py +134 -437
- sglang/srt/disaggregation/prefill.py +130 -43
- sglang/srt/disaggregation/utils.py +127 -86
- sglang/srt/distributed/device_communicators/pymscclpp.py +315 -0
- sglang/srt/distributed/parallel_state.py +52 -5
- sglang/srt/entrypoints/EngineBase.py +6 -0
- sglang/srt/entrypoints/engine.py +116 -5
- sglang/srt/entrypoints/http_server.py +28 -4
- sglang/srt/eplb_simulator/__init__.py +1 -0
- sglang/srt/eplb_simulator/reader.py +51 -0
- sglang/srt/function_call/base_format_detector.py +138 -86
- sglang/srt/function_call/deepseekv3_detector.py +54 -6
- sglang/srt/function_call/ebnf_composer.py +33 -19
- sglang/srt/function_call/function_call_parser.py +27 -0
- sglang/srt/function_call/llama32_detector.py +33 -14
- sglang/srt/function_call/mistral_detector.py +73 -26
- sglang/srt/function_call/pythonic_detector.py +86 -20
- sglang/srt/function_call/qwen25_detector.py +64 -10
- sglang/srt/function_call/utils.py +17 -0
- sglang/srt/hf_transformers_utils.py +4 -0
- sglang/srt/layers/activation.py +19 -0
- sglang/srt/layers/attention/aiter_backend.py +503 -125
- sglang/srt/layers/attention/base_attn_backend.py +4 -0
- sglang/srt/layers/attention/cutlass_mla_backend.py +40 -34
- sglang/srt/layers/attention/flashattention_backend.py +137 -63
- sglang/srt/layers/attention/flashinfer_backend.py +46 -3
- sglang/srt/layers/attention/flashinfer_mla_backend.py +59 -25
- sglang/srt/layers/attention/flashmla_backend.py +2 -10
- sglang/srt/layers/attention/intel_amx_backend.py +128 -0
- sglang/srt/layers/attention/tbo_backend.py +232 -0
- sglang/srt/layers/attention/torch_native_backend.py +3 -0
- sglang/srt/layers/attention/triton_backend.py +304 -65
- sglang/srt/layers/attention/triton_ops/decode_attention.py +2 -7
- sglang/srt/layers/attention/triton_ops/extend_attention.py +12 -4
- sglang/srt/layers/attention/vision.py +51 -24
- sglang/srt/layers/communicator.py +281 -197
- sglang/srt/layers/dp_attention.py +6 -5
- sglang/srt/layers/layernorm.py +30 -19
- sglang/srt/layers/linear.py +0 -4
- sglang/srt/layers/logits_processor.py +0 -12
- sglang/srt/layers/moe/cutlass_moe.py +170 -7
- sglang/srt/layers/moe/cutlass_moe_params.py +169 -0
- sglang/srt/layers/moe/ep_moe/kernels.py +33 -11
- sglang/srt/layers/moe/ep_moe/layer.py +136 -72
- sglang/srt/layers/moe/ep_moe/token_dispatcher.py +24 -45
- sglang/srt/layers/moe/fused_moe_native.py +4 -0
- sglang/srt/layers/moe/fused_moe_triton/configs/triton_3_2_0/E=257,N=128,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8,block_shape=[128, 128].json +146 -0
- sglang/srt/layers/moe/fused_moe_triton/configs/triton_3_2_0/E=257,N=128,device_name=NVIDIA_H20,dtype=fp8_w8a8,block_shape=[128, 128].json +146 -0
- sglang/srt/layers/moe/fused_moe_triton/configs/triton_3_2_0/E=257,N=256,device_name=NVIDIA_B200,dtype=fp8_w8a8,block_shape=[128, 128].json +146 -0
- sglang/srt/layers/moe/fused_moe_triton/configs/triton_3_2_0/E=257,N=256,device_name=NVIDIA_H20,dtype=fp8_w8a8,block_shape=[128, 128].json +146 -0
- sglang/srt/layers/moe/fused_moe_triton/configs/triton_3_2_0/E=257,N=256,device_name=NVIDIA_H200,dtype=fp8_w8a8,block_shape=[128, 128].json +146 -0
- sglang/srt/layers/moe/fused_moe_triton/configs/triton_3_2_0/E=8,N=7168,device_name=NVIDIA_H100_80GB_HBM3.json +146 -0
- sglang/srt/layers/moe/fused_moe_triton/configs/triton_3_3_1/E=257,N=256,device_name=NVIDIA_B200,dtype=fp8_w8a8,block_shape=[128, 128].json +146 -0
- sglang/srt/layers/moe/fused_moe_triton/configs/triton_3_3_1/E=8,N=7168,device_name=NVIDIA_H100_80GB_HBM3.json +146 -0
- sglang/srt/layers/moe/fused_moe_triton/fused_moe.py +221 -29
- sglang/srt/layers/moe/fused_moe_triton/layer.py +34 -4
- sglang/srt/layers/moe/topk.py +60 -26
- sglang/srt/layers/multimodal.py +3 -3
- sglang/srt/layers/pooler.py +56 -0
- sglang/srt/layers/quantization/__init__.py +3 -2
- sglang/srt/layers/quantization/blockwise_int8.py +3 -0
- sglang/srt/layers/quantization/compressed_tensors/compressed_tensors_moe.py +5 -0
- sglang/srt/layers/quantization/deep_gemm_wrapper/__init__.py +1 -0
- sglang/srt/layers/quantization/{deep_gemm.py → deep_gemm_wrapper/compile_utils.py} +69 -127
- sglang/srt/layers/quantization/deep_gemm_wrapper/configurer.py +32 -0
- sglang/srt/layers/quantization/deep_gemm_wrapper/entrypoint.py +110 -0
- sglang/srt/layers/quantization/fp8.py +28 -23
- sglang/srt/layers/quantization/fp8_kernel.py +156 -75
- sglang/srt/layers/quantization/fp8_utils.py +250 -69
- sglang/srt/layers/quantization/modelopt_quant.py +334 -7
- sglang/srt/layers/quantization/moe_wna16.py +3 -0
- sglang/srt/layers/quantization/w8a8_fp8.py +3 -0
- sglang/srt/layers/quantization/w8a8_int8.py +3 -0
- sglang/srt/layers/radix_attention.py +2 -3
- sglang/srt/layers/rotary_embedding.py +6 -12
- sglang/srt/layers/sampler.py +80 -79
- sglang/srt/layers/utils.py +6 -0
- sglang/srt/lora/layers.py +12 -15
- sglang/srt/lora/lora.py +49 -5
- sglang/srt/lora/lora_manager.py +98 -39
- sglang/srt/lora/mem_pool.py +28 -21
- sglang/srt/lora/utils.py +17 -13
- sglang/srt/managers/cache_controller.py +2 -1
- sglang/srt/managers/data_parallel_controller.py +13 -5
- sglang/srt/managers/eplb_algorithms/__init__.py +63 -0
- sglang/srt/managers/eplb_algorithms/deepseek.py +223 -0
- sglang/srt/managers/{deepseek_eplb.py → eplb_algorithms/deepseek_vec.py} +5 -7
- sglang/srt/managers/eplb_manager.py +55 -14
- sglang/srt/managers/expert_distribution.py +220 -46
- sglang/srt/managers/expert_location.py +110 -56
- sglang/srt/managers/expert_location_dispatch.py +23 -6
- sglang/srt/managers/io_struct.py +43 -8
- sglang/srt/managers/mm_utils.py +88 -38
- sglang/srt/managers/multimodal_processors/base_processor.py +190 -18
- sglang/srt/managers/multimodal_processors/gemma3.py +4 -31
- sglang/srt/managers/multimodal_processors/internvl.py +4 -0
- sglang/srt/managers/multimodal_processors/kimi_vl.py +15 -34
- sglang/srt/managers/multimodal_processors/minicpm.py +2 -1
- sglang/srt/managers/multimodal_processors/phi4mm.py +87 -0
- sglang/srt/managers/multimodal_processors/qwen_vl.py +22 -64
- sglang/srt/managers/multimodal_processors/vila.py +85 -0
- sglang/srt/managers/schedule_batch.py +173 -38
- sglang/srt/managers/scheduler.py +376 -127
- sglang/srt/managers/tokenizer_manager.py +163 -19
- sglang/srt/managers/utils.py +0 -4
- sglang/srt/mem_cache/chunk_cache.py +1 -0
- sglang/srt/mem_cache/hiradix_cache.py +4 -2
- sglang/srt/mem_cache/memory_pool.py +111 -407
- sglang/srt/mem_cache/memory_pool_host.py +380 -0
- sglang/srt/mem_cache/radix_cache.py +36 -12
- sglang/srt/metrics/collector.py +9 -0
- sglang/srt/model_executor/cuda_graph_runner.py +191 -113
- sglang/srt/model_executor/expert_location_updater.py +157 -22
- sglang/srt/model_executor/forward_batch_info.py +52 -22
- sglang/srt/model_executor/model_runner.py +102 -62
- sglang/srt/model_loader/loader.py +8 -1
- sglang/srt/model_loader/utils.py +67 -1
- sglang/srt/models/bert.py +113 -13
- sglang/srt/models/deepseek_nextn.py +1 -1
- sglang/srt/models/deepseek_v2.py +623 -290
- sglang/srt/models/gemma3_causal.py +7 -0
- sglang/srt/models/gemma3_mm.py +19 -14
- sglang/srt/models/idefics2.py +342 -0
- sglang/srt/models/internvl.py +46 -102
- sglang/srt/models/kimi_vl.py +4 -4
- sglang/srt/models/llama.py +1 -1
- sglang/srt/models/minicpmo.py +2 -5
- sglang/srt/models/minicpmv.py +3 -295
- sglang/srt/models/phi4mm.py +512 -0
- sglang/srt/models/qwen2.py +38 -9
- sglang/srt/models/qwen2_5_vl.py +3 -9
- sglang/srt/models/qwen2_eagle.py +4 -1
- sglang/srt/models/qwen2_moe.py +58 -191
- sglang/srt/models/qwen2_vl.py +3 -9
- sglang/srt/models/qwen3.py +41 -10
- sglang/srt/models/qwen3_moe.py +230 -191
- sglang/srt/models/registry.py +9 -1
- sglang/srt/models/roberta.py +117 -9
- sglang/srt/models/transformers.py +291 -0
- sglang/srt/models/vila.py +305 -0
- sglang/srt/openai_api/adapter.py +248 -28
- sglang/srt/openai_api/protocol.py +68 -3
- sglang/srt/openai_api/utils.py +172 -0
- sglang/srt/operations.py +37 -2
- sglang/srt/operations_strategy.py +200 -24
- sglang/srt/sampling/sampling_batch_info.py +37 -1
- sglang/srt/sampling/sampling_params.py +4 -1
- sglang/srt/server_args.py +381 -209
- sglang/srt/speculative/build_eagle_tree.py +9 -9
- sglang/srt/speculative/eagle_draft_cuda_graph_runner.py +12 -14
- sglang/srt/speculative/eagle_draft_extend_cuda_graph_runner.py +256 -0
- sglang/srt/speculative/eagle_utils.py +440 -200
- sglang/srt/speculative/eagle_worker.py +234 -63
- sglang/srt/two_batch_overlap.py +637 -0
- sglang/srt/utils.py +187 -7
- sglang/test/attention/test_prefix_chunk_info.py +2 -0
- sglang/test/runners.py +54 -10
- sglang/test/send_one.py +4 -0
- sglang/test/test_block_fp8.py +1 -0
- sglang/test/test_block_fp8_deep_gemm_blackwell.py +252 -0
- sglang/test/test_block_fp8_ep.py +1 -0
- sglang/test/test_cutlass_moe.py +3 -3
- sglang/test/test_fp4_moe.py +248 -0
- sglang/test/test_utils.py +82 -7
- sglang/utils.py +9 -0
- sglang/version.py +1 -1
- {sglang-0.4.6.post5.dist-info → sglang-0.4.7.post1.dist-info}/METADATA +17 -14
- {sglang-0.4.6.post5.dist-info → sglang-0.4.7.post1.dist-info}/RECORD +359 -321
- {sglang-0.4.6.post5.dist-info → sglang-0.4.7.post1.dist-info}/WHEEL +1 -1
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=1,N=14336,device_name=NVIDIA_A100-SXM4-80GB,dtype=int8_w8a16.json → triton_3_1_0/E=1,N=14336,device_name=NVIDIA_A100-SXM4-80GB,dtype=int8_w8a16.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=1,N=14336,device_name=NVIDIA_A100-SXM4-80GB.json → triton_3_1_0/E=1,N=14336,device_name=NVIDIA_A100-SXM4-80GB.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=1,N=1792,device_name=NVIDIA_A100-SXM4-80GB,dtype=int8_w8a16.json → triton_3_1_0/E=1,N=1792,device_name=NVIDIA_A100-SXM4-80GB,dtype=int8_w8a16.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=1,N=1792,device_name=NVIDIA_A100-SXM4-80GB.json → triton_3_1_0/E=1,N=1792,device_name=NVIDIA_A100-SXM4-80GB.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=1,N=3072,device_name=NVIDIA_A100-SXM4-80GB,dtype=int8_w8a16.json → triton_3_1_0/E=1,N=3072,device_name=NVIDIA_A100-SXM4-80GB,dtype=int8_w8a16.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=1,N=3072,device_name=NVIDIA_H100_80GB_HBM3,dtype=int8_w8a16.json → triton_3_1_0/E=1,N=3072,device_name=NVIDIA_H100_80GB_HBM3,dtype=int8_w8a16.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=1,N=3072,device_name=NVIDIA_H100_80GB_HBM3.json → triton_3_1_0/E=1,N=3072,device_name=NVIDIA_H100_80GB_HBM3.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=1,N=3584,device_name=NVIDIA_A100-SXM4-80GB,dtype=int8_w8a16.json → triton_3_1_0/E=1,N=3584,device_name=NVIDIA_A100-SXM4-80GB,dtype=int8_w8a16.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=1,N=3584,device_name=NVIDIA_A100-SXM4-80GB.json → triton_3_1_0/E=1,N=3584,device_name=NVIDIA_A100-SXM4-80GB.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=1,N=7168,device_name=NVIDIA_A100-SXM4-80GB,dtype=int8_w8a16.json → triton_3_1_0/E=1,N=7168,device_name=NVIDIA_A100-SXM4-80GB,dtype=int8_w8a16.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=1,N=7168,device_name=NVIDIA_A100-SXM4-80GB.json → triton_3_1_0/E=1,N=7168,device_name=NVIDIA_A100-SXM4-80GB.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=144,N=512,device_name=NVIDIA_H100_80GB_HBM3.json → triton_3_1_0/E=144,N=512,device_name=NVIDIA_H100_80GB_HBM3.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=16,N=1024,device_name=NVIDIA_H100_80GB_HBM3.json → triton_3_1_0/E=16,N=1024,device_name=NVIDIA_H100_80GB_HBM3.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=16,N=1024,device_name=NVIDIA_H200.json → triton_3_1_0/E=16,N=1024,device_name=NVIDIA_H200.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=16,N=1344,device_name=NVIDIA_A100-SXM4-40GB.json → triton_3_1_0/E=16,N=1344,device_name=NVIDIA_A100-SXM4-40GB.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=16,N=1344,device_name=NVIDIA_A100-SXM4-80GB.json → triton_3_1_0/E=16,N=1344,device_name=NVIDIA_A100-SXM4-80GB.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=16,N=1344,device_name=NVIDIA_H100_80GB_HBM3.json → triton_3_1_0/E=16,N=1344,device_name=NVIDIA_H100_80GB_HBM3.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=16,N=14336,device_name=NVIDIA_A100-SXM4-80GB,dtype=int8_w8a16.json → triton_3_1_0/E=16,N=14336,device_name=NVIDIA_A100-SXM4-80GB,dtype=int8_w8a16.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=16,N=14336,device_name=NVIDIA_A100-SXM4-80GB.json → triton_3_1_0/E=16,N=14336,device_name=NVIDIA_A100-SXM4-80GB.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=16,N=1792,device_name=NVIDIA_A100-SXM4-80GB,dtype=int8_w8a16.json → triton_3_1_0/E=16,N=1792,device_name=NVIDIA_A100-SXM4-80GB,dtype=int8_w8a16.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=16,N=1792,device_name=NVIDIA_A100-SXM4-80GB.json → triton_3_1_0/E=16,N=1792,device_name=NVIDIA_A100-SXM4-80GB.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=16,N=2048,device_name=NVIDIA_H100_80GB_HBM3.json → triton_3_1_0/E=16,N=2048,device_name=NVIDIA_H100_80GB_HBM3.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=16,N=2688,device_name=NVIDIA_A100-SXM4-80GB.json → triton_3_1_0/E=16,N=2688,device_name=NVIDIA_A100-SXM4-80GB.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=16,N=2688,device_name=NVIDIA_H100_80GB_HBM3.json → triton_3_1_0/E=16,N=2688,device_name=NVIDIA_H100_80GB_HBM3.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=16,N=3072,device_name=NVIDIA_A100-SXM4-80GB,dtype=int8_w8a16.json → triton_3_1_0/E=16,N=3072,device_name=NVIDIA_A100-SXM4-80GB,dtype=int8_w8a16.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=16,N=3072,device_name=NVIDIA_H100_80GB_HBM3,dtype=int8_w8a16.json → triton_3_1_0/E=16,N=3072,device_name=NVIDIA_H100_80GB_HBM3,dtype=int8_w8a16.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=16,N=3200,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8.json → triton_3_1_0/E=16,N=3200,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=16,N=3584,device_name=NVIDIA_A100-SXM4-80GB,dtype=int8_w8a16.json → triton_3_1_0/E=16,N=3584,device_name=NVIDIA_A100-SXM4-80GB,dtype=int8_w8a16.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=16,N=3584,device_name=NVIDIA_A100-SXM4-80GB.json → triton_3_1_0/E=16,N=3584,device_name=NVIDIA_A100-SXM4-80GB.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=16,N=6400,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8.json → triton_3_1_0/E=16,N=6400,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=16,N=7168,device_name=NVIDIA_A100-SXM4-80GB,dtype=int8_w8a16.json → triton_3_1_0/E=16,N=7168,device_name=NVIDIA_A100-SXM4-80GB,dtype=int8_w8a16.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=16,N=7168,device_name=NVIDIA_A100-SXM4-80GB.json → triton_3_1_0/E=16,N=7168,device_name=NVIDIA_A100-SXM4-80GB.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=16,N=7168,device_name=NVIDIA_H100_80GB_HBM3,dtype=int8_w8a16.json → triton_3_1_0/E=16,N=7168,device_name=NVIDIA_H100_80GB_HBM3,dtype=int8_w8a16.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=16,N=800,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8.json → triton_3_1_0/E=16,N=800,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=160,N=192,device_name=NVIDIA_A800-SXM4-80GB.json → triton_3_1_0/E=160,N=192,device_name=NVIDIA_A800-SXM4-80GB.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=20,N=2048,device_name=NVIDIA_H100_80GB_HBM3.json → triton_3_1_0/E=20,N=2048,device_name=NVIDIA_H100_80GB_HBM3.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=24,N=1024,device_name=NVIDIA_H100_80GB_HBM3.json → triton_3_1_0/E=24,N=1024,device_name=NVIDIA_H100_80GB_HBM3.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=256,N=128,device_name=NVIDIA_A100-SXM4-80GB,dtype=int8_w8a8,block_shape=[128, 128].json → triton_3_1_0/E=256,N=128,device_name=NVIDIA_A100-SXM4-80GB,dtype=int8_w8a8,block_shape=[128, 128].json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=256,N=128,device_name=NVIDIA_A100-SXM4-80GB,dtype=int8_w8a8.json → triton_3_1_0/E=256,N=128,device_name=NVIDIA_A100-SXM4-80GB,dtype=int8_w8a8.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=256,N=128,device_name=NVIDIA_A800-SXM4-80GB,dtype=int8_w8a8,block_shape=[128, 128].json → triton_3_1_0/E=256,N=128,device_name=NVIDIA_A800-SXM4-80GB,dtype=int8_w8a8,block_shape=[128, 128].json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=256,N=128,device_name=NVIDIA_A800-SXM4-80GB,dtype=int8_w8a8.json → triton_3_1_0/E=256,N=128,device_name=NVIDIA_A800-SXM4-80GB,dtype=int8_w8a8.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=256,N=128,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8,block_shape=[128, 128].json → triton_3_1_0/E=256,N=128,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8,block_shape=[128, 128].json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=256,N=128,device_name=NVIDIA_H20,block_shape=[128, 128].json → triton_3_1_0/E=256,N=128,device_name=NVIDIA_H20,block_shape=[128, 128].json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=256,N=128,device_name=NVIDIA_H20,dtype=fp8_w8a8,block_shape=[128, 128].json → triton_3_1_0/E=256,N=128,device_name=NVIDIA_H20,dtype=fp8_w8a8,block_shape=[128, 128].json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=256,N=128,device_name=NVIDIA_L20Y,dtype=fp8_w8a8,block_shape=[128, 128].json → triton_3_1_0/E=256,N=128,device_name=NVIDIA_L20Y,dtype=fp8_w8a8,block_shape=[128, 128].json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=256,N=256,device_name=AMD_Instinct_MI300X,dtype=fp8_w8a8,block_shape=[128, 128].json → triton_3_1_0/E=256,N=256,device_name=AMD_Instinct_MI300X,dtype=fp8_w8a8,block_shape=[128, 128].json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=256,N=256,device_name=AMD_Instinct_MI325X,dtype=fp8_w8a8,block_shape=[128, 128].json → triton_3_1_0/E=256,N=256,device_name=AMD_Instinct_MI325X,dtype=fp8_w8a8,block_shape=[128, 128].json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=256,N=256,device_name=AMD_Radeon_Graphics,dtype=fp8_w8a8,block_shape=[128, 128].json → triton_3_1_0/E=256,N=256,device_name=AMD_Radeon_Graphics,dtype=fp8_w8a8,block_shape=[128, 128].json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=256,N=256,device_name=NVIDIA_B200,dtype=fp8_w8a8,block_shape=[128, 128].json → triton_3_1_0/E=256,N=256,device_name=NVIDIA_B200,dtype=fp8_w8a8,block_shape=[128, 128].json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=256,N=256,device_name=NVIDIA_H20,dtype=int8_w8a8,block_shape=[128, 128].json → triton_3_1_0/E=256,N=256,device_name=NVIDIA_H20,dtype=int8_w8a8,block_shape=[128, 128].json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=256,N=256,device_name=NVIDIA_H200,dtype=fp8_w8a8,block_shape=[128, 128].json → triton_3_1_0/E=256,N=256,device_name=NVIDIA_H200,dtype=fp8_w8a8,block_shape=[128, 128].json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=256,N=64,device_name=NVIDIA_A800-SXM4-80GB.json → triton_3_1_0/E=256,N=64,device_name=NVIDIA_A800-SXM4-80GB.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=256,N=64,device_name=NVIDIA_L20,dtype=int8_w8a8.json → triton_3_1_0/E=256,N=64,device_name=NVIDIA_L20,dtype=int8_w8a8.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=256,N=64,device_name=NVIDIA_L40S,dtype=int8_w8a8.json → triton_3_1_0/E=256,N=64,device_name=NVIDIA_L40S,dtype=int8_w8a8.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=64,N=1024,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8,block_shape=[128, 128].json → triton_3_1_0/E=64,N=1024,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8,block_shape=[128, 128].json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=64,N=1280,device_name=NVIDIA_A100-SXM4-80GB.json → triton_3_1_0/E=64,N=1280,device_name=NVIDIA_A100-SXM4-80GB.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=64,N=1280,device_name=NVIDIA_A800-SXM4-80GB.json → triton_3_1_0/E=64,N=1280,device_name=NVIDIA_A800-SXM4-80GB.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=64,N=1280,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8.json → triton_3_1_0/E=64,N=1280,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=64,N=1280,device_name=NVIDIA_H100_80GB_HBM3.json → triton_3_1_0/E=64,N=1280,device_name=NVIDIA_H100_80GB_HBM3.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=64,N=1280,device_name=NVIDIA_H200,dtype=fp8_w8a8.json → triton_3_1_0/E=64,N=1280,device_name=NVIDIA_H200,dtype=fp8_w8a8.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=64,N=1280,device_name=NVIDIA_H200.json → triton_3_1_0/E=64,N=1280,device_name=NVIDIA_H200.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=64,N=2560,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8.json → triton_3_1_0/E=64,N=2560,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=64,N=2560,device_name=NVIDIA_H200,dtype=fp8_w8a8.json → triton_3_1_0/E=64,N=2560,device_name=NVIDIA_H200,dtype=fp8_w8a8.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=64,N=2560,device_name=NVIDIA_H200.json → triton_3_1_0/E=64,N=2560,device_name=NVIDIA_H200.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=64,N=320,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8.json → triton_3_1_0/E=64,N=320,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=64,N=320,device_name=NVIDIA_H100_80GB_HBM3.json → triton_3_1_0/E=64,N=320,device_name=NVIDIA_H100_80GB_HBM3.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=64,N=320,device_name=NVIDIA_H200,dtype=fp8_w8a8.json → triton_3_1_0/E=64,N=320,device_name=NVIDIA_H200,dtype=fp8_w8a8.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=64,N=320,device_name=NVIDIA_H200.json → triton_3_1_0/E=64,N=320,device_name=NVIDIA_H200.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=64,N=512,device_name=NVIDIA_H200,dtype=fp8_w8a8,block_shape=[128, 128].json → triton_3_1_0/E=64,N=512,device_name=NVIDIA_H200,dtype=fp8_w8a8,block_shape=[128, 128].json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=64,N=640,device_name=NVIDIA_A100-SXM4-80GB.json → triton_3_1_0/E=64,N=640,device_name=NVIDIA_A100-SXM4-80GB.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=64,N=640,device_name=NVIDIA_A800-SXM4-80GB.json → triton_3_1_0/E=64,N=640,device_name=NVIDIA_A800-SXM4-80GB.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=64,N=640,device_name=NVIDIA_GeForce_RTX_4090,dtype=fp8_w8a8.json → triton_3_1_0/E=64,N=640,device_name=NVIDIA_GeForce_RTX_4090,dtype=fp8_w8a8.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=64,N=640,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8.json → triton_3_1_0/E=64,N=640,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=64,N=640,device_name=NVIDIA_H100_80GB_HBM3.json → triton_3_1_0/E=64,N=640,device_name=NVIDIA_H100_80GB_HBM3.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=64,N=640,device_name=NVIDIA_H200,dtype=fp8_w8a8.json → triton_3_1_0/E=64,N=640,device_name=NVIDIA_H200,dtype=fp8_w8a8.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=64,N=640,device_name=NVIDIA_H200.json → triton_3_1_0/E=64,N=640,device_name=NVIDIA_H200.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=8,N=14336,device_name=AMD_Instinct_MI300X.json → triton_3_1_0/E=8,N=14336,device_name=AMD_Instinct_MI300X.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=8,N=14336,device_name=AMD_Instinct_MI325X.json → triton_3_1_0/E=8,N=14336,device_name=AMD_Instinct_MI325X.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=8,N=14336,device_name=AMD_Radeon_Graphics.json → triton_3_1_0/E=8,N=14336,device_name=AMD_Radeon_Graphics.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=8,N=14336,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8.json → triton_3_1_0/E=8,N=14336,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=8,N=14336,device_name=NVIDIA_H200,dtype=fp8_w8a8.json → triton_3_1_0/E=8,N=14336,device_name=NVIDIA_H200,dtype=fp8_w8a8.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=8,N=14336,device_name=NVIDIA_H200.json → triton_3_1_0/E=8,N=14336,device_name=NVIDIA_H200.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=8,N=1792,device_name=AMD_Instinct_MI300X.json → triton_3_1_0/E=8,N=1792,device_name=AMD_Instinct_MI300X.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=8,N=1792,device_name=AMD_Instinct_MI325X.json → triton_3_1_0/E=8,N=1792,device_name=AMD_Instinct_MI325X.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=8,N=1792,device_name=AMD_Radeon_Graphics.json → triton_3_1_0/E=8,N=1792,device_name=AMD_Radeon_Graphics.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=8,N=1792,device_name=NVIDIA_A100-SXM4-40GB.json → triton_3_1_0/E=8,N=1792,device_name=NVIDIA_A100-SXM4-40GB.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=8,N=1792,device_name=NVIDIA_A100-SXM4-80GB.json → triton_3_1_0/E=8,N=1792,device_name=NVIDIA_A100-SXM4-80GB.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=8,N=1792,device_name=NVIDIA_H100_80GB_HBM3.json → triton_3_1_0/E=8,N=1792,device_name=NVIDIA_H100_80GB_HBM3.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=8,N=1792,device_name=NVIDIA_H200,dtype=fp8_w8a8.json → triton_3_1_0/E=8,N=1792,device_name=NVIDIA_H200,dtype=fp8_w8a8.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=8,N=1792,device_name=NVIDIA_H200.json → triton_3_1_0/E=8,N=1792,device_name=NVIDIA_H200.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=8,N=2048,device_name=NVIDIA_A100-SXM4-80GB.json → triton_3_1_0/E=8,N=2048,device_name=NVIDIA_A100-SXM4-80GB.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=8,N=2048,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8.json → triton_3_1_0/E=8,N=2048,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=8,N=2048,device_name=NVIDIA_H100_80GB_HBM3.json → triton_3_1_0/E=8,N=2048,device_name=NVIDIA_H100_80GB_HBM3.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=8,N=2048,device_name=NVIDIA_H200,dtype=fp8_w8a8.json → triton_3_1_0/E=8,N=2048,device_name=NVIDIA_H200,dtype=fp8_w8a8.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=8,N=2048,device_name=NVIDIA_H200.json → triton_3_1_0/E=8,N=2048,device_name=NVIDIA_H200.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=8,N=3584,device_name=AMD_Instinct_MI300X.json → triton_3_1_0/E=8,N=3584,device_name=AMD_Instinct_MI300X.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=8,N=3584,device_name=AMD_Instinct_MI325X.json → triton_3_1_0/E=8,N=3584,device_name=AMD_Instinct_MI325X.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=8,N=3584,device_name=AMD_Radeon_Graphics.json → triton_3_1_0/E=8,N=3584,device_name=AMD_Radeon_Graphics.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=8,N=3584,device_name=NVIDIA_A100-SXM4-40GB.json → triton_3_1_0/E=8,N=3584,device_name=NVIDIA_A100-SXM4-40GB.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=8,N=3584,device_name=NVIDIA_A100-SXM4-80GB.json → triton_3_1_0/E=8,N=3584,device_name=NVIDIA_A100-SXM4-80GB.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=8,N=3584,device_name=NVIDIA_GeForce_RTX_4090,dtype=fp8_w8a8.json → triton_3_1_0/E=8,N=3584,device_name=NVIDIA_GeForce_RTX_4090,dtype=fp8_w8a8.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=8,N=3584,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8.json → triton_3_1_0/E=8,N=3584,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=8,N=3584,device_name=NVIDIA_H100_80GB_HBM3.json → triton_3_1_0/E=8,N=3584,device_name=NVIDIA_H100_80GB_HBM3.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=8,N=3584,device_name=NVIDIA_H200,dtype=fp8_w8a8.json → triton_3_1_0/E=8,N=3584,device_name=NVIDIA_H200,dtype=fp8_w8a8.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=8,N=3584,device_name=NVIDIA_H200.json → triton_3_1_0/E=8,N=3584,device_name=NVIDIA_H200.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=8,N=3584,device_name=NVIDIA_L40S.json → triton_3_1_0/E=8,N=3584,device_name=NVIDIA_L40S.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=8,N=4096,device_name=AMD_Instinct_MI300X,dtype=fp8_w8a8.json → triton_3_1_0/E=8,N=4096,device_name=AMD_Instinct_MI300X,dtype=fp8_w8a8.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=8,N=4096,device_name=AMD_Instinct_MI325X,dtype=fp8_w8a8.json → triton_3_1_0/E=8,N=4096,device_name=AMD_Instinct_MI325X,dtype=fp8_w8a8.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=8,N=4096,device_name=AMD_Radeon_Graphics,dtype=fp8_w8a8.json → triton_3_1_0/E=8,N=4096,device_name=AMD_Radeon_Graphics,dtype=fp8_w8a8.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=8,N=4096,device_name=NVIDIA_A100-SXM4-80GB.json → triton_3_1_0/E=8,N=4096,device_name=NVIDIA_A100-SXM4-80GB.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=8,N=4096,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8.json → triton_3_1_0/E=8,N=4096,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=8,N=4096,device_name=NVIDIA_H100_80GB_HBM3.json → triton_3_1_0/E=8,N=4096,device_name=NVIDIA_H100_80GB_HBM3.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=8,N=4096,device_name=NVIDIA_H200,dtype=fp8_w8a8.json → triton_3_1_0/E=8,N=4096,device_name=NVIDIA_H200,dtype=fp8_w8a8.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=8,N=4096,device_name=NVIDIA_H200.json → triton_3_1_0/E=8,N=4096,device_name=NVIDIA_H200.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=8,N=7168,device_name=AMD_Instinct_MI300X.json → triton_3_1_0/E=8,N=7168,device_name=AMD_Instinct_MI300X.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=8,N=7168,device_name=AMD_Instinct_MI325X.json → triton_3_1_0/E=8,N=7168,device_name=AMD_Instinct_MI325X.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=8,N=7168,device_name=AMD_Radeon_Graphics.json → triton_3_1_0/E=8,N=7168,device_name=AMD_Radeon_Graphics.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=8,N=7168,device_name=NVIDIA_A100-SXM4-80GB.json → triton_3_1_0/E=8,N=7168,device_name=NVIDIA_A100-SXM4-80GB.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=8,N=7168,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8.json → triton_3_1_0/E=8,N=7168,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=8,N=7168,device_name=NVIDIA_H100_80GB_HBM3.json → triton_3_1_0/E=8,N=7168,device_name=NVIDIA_H100_80GB_HBM3.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=8,N=7168,device_name=NVIDIA_H200,dtype=fp8_w8a8.json → triton_3_1_0/E=8,N=7168,device_name=NVIDIA_H200,dtype=fp8_w8a8.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=8,N=7168,device_name=NVIDIA_H200.json → triton_3_1_0/E=8,N=7168,device_name=NVIDIA_H200.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=8,N=8192,device_name=AMD_Instinct_MI300X,dtype=fp8_w8a8.json → triton_3_1_0/E=8,N=8192,device_name=AMD_Instinct_MI300X,dtype=fp8_w8a8.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=8,N=8192,device_name=AMD_Instinct_MI325X,dtype=fp8_w8a8.json → triton_3_1_0/E=8,N=8192,device_name=AMD_Instinct_MI325X,dtype=fp8_w8a8.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=8,N=8192,device_name=AMD_Radeon_Graphics,dtype=fp8_w8a8.json → triton_3_1_0/E=8,N=8192,device_name=AMD_Radeon_Graphics,dtype=fp8_w8a8.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=8,N=8192,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8.json → triton_3_1_0/E=8,N=8192,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=8,N=8192,device_name=NVIDIA_H200,dtype=fp8_w8a8.json → triton_3_1_0/E=8,N=8192,device_name=NVIDIA_H200,dtype=fp8_w8a8.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=128,N=192,device_name=NVIDIA_A800-SXM4-80GB.json → triton_3_2_0/E=128,N=192,device_name=NVIDIA_A800-SXM4-80GB.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=128,N=192,device_name=NVIDIA_H100_80GB_HBM3.json → triton_3_2_0/E=128,N=192,device_name=NVIDIA_H100_80GB_HBM3.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=128,N=192,device_name=NVIDIA_H20.json → triton_3_2_0/E=128,N=192,device_name=NVIDIA_H20.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=128,N=192,device_name=NVIDIA_H200.json → triton_3_2_0/E=128,N=192,device_name=NVIDIA_H200.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=128,N=384,device_name=NVIDIA_H100_80GB_HBM3.json → triton_3_2_0/E=128,N=384,device_name=NVIDIA_H100_80GB_HBM3.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=128,N=384,device_name=NVIDIA_H20,dtype=fp8_w8a8,block_shape=[128, 128].json → triton_3_2_0/E=128,N=384,device_name=NVIDIA_H20,dtype=fp8_w8a8,block_shape=[128, 128].json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=128,N=384,device_name=NVIDIA_H20.json → triton_3_2_0/E=128,N=384,device_name=NVIDIA_H20.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=128,N=384,device_name=NVIDIA_H200,dtype=fp8_w8a8,block_shape=[128, 128].json → triton_3_2_0/E=128,N=384,device_name=NVIDIA_H200,dtype=fp8_w8a8,block_shape=[128, 128].json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=128,N=384,device_name=NVIDIA_H200.json → triton_3_2_0/E=128,N=384,device_name=NVIDIA_H200.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=128,N=512,device_name=NVIDIA_H100_80GB_HBM3.json → triton_3_2_0/E=128,N=512,device_name=NVIDIA_H100_80GB_HBM3.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=128,N=768,device_name=NVIDIA_A800-SXM4-80GB.json → triton_3_2_0/E=128,N=768,device_name=NVIDIA_A800-SXM4-80GB.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=128,N=768,device_name=NVIDIA_H100_80GB_HBM3.json → triton_3_2_0/E=128,N=768,device_name=NVIDIA_H100_80GB_HBM3.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=128,N=768,device_name=NVIDIA_H20,dtype=fp8_w8a8,block_shape=[128, 128].json → triton_3_2_0/E=128,N=768,device_name=NVIDIA_H20,dtype=fp8_w8a8,block_shape=[128, 128].json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=128,N=768,device_name=NVIDIA_H20.json → triton_3_2_0/E=128,N=768,device_name=NVIDIA_H20.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=128,N=768,device_name=NVIDIA_H200,dtype=fp8_w8a8,block_shape=[128, 128].json → triton_3_2_0/E=128,N=768,device_name=NVIDIA_H200,dtype=fp8_w8a8,block_shape=[128, 128].json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=128,N=768,device_name=NVIDIA_H200.json → triton_3_2_0/E=128,N=768,device_name=NVIDIA_H200.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=128,N=96,device_name=NVIDIA_H20.json → triton_3_2_0/E=128,N=96,device_name=NVIDIA_H20.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=264,N=128,device_name=NVIDIA_A800-SXM4-80GB,dtype=int8_w8a8.json → triton_3_2_0/E=264,N=128,device_name=NVIDIA_A800-SXM4-80GB,dtype=int8_w8a8.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=264,N=256,device_name=NVIDIA_B200,dtype=fp8_w8a8,block_shape=[128, 128].json → triton_3_2_0/E=264,N=256,device_name=NVIDIA_B200,dtype=fp8_w8a8,block_shape=[128, 128].json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=264,N=256,device_name=NVIDIA_H20,dtype=fp8_w8a8,block_shape=[128, 128].json → triton_3_2_0/E=264,N=256,device_name=NVIDIA_H20,dtype=fp8_w8a8,block_shape=[128, 128].json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=264,N=256,device_name=NVIDIA_H200,dtype=fp8_w8a8,block_shape=[128, 128].json → triton_3_2_0/E=264,N=256,device_name=NVIDIA_H200,dtype=fp8_w8a8,block_shape=[128, 128].json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=272,N=128,device_name=NVIDIA_A800-SXM4-80GB,dtype=int8_w8a8.json → triton_3_2_0/E=272,N=128,device_name=NVIDIA_A800-SXM4-80GB,dtype=int8_w8a8.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=272,N=128,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8,block_shape=[128, 128].json → triton_3_2_0/E=272,N=128,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8,block_shape=[128, 128].json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=272,N=128,device_name=NVIDIA_H20,dtype=fp8_w8a8,block_shape=[128, 128].json → triton_3_2_0/E=272,N=128,device_name=NVIDIA_H20,dtype=fp8_w8a8,block_shape=[128, 128].json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=272,N=64,device_name=NVIDIA_A800-SXM4-80GB.json → triton_3_2_0/E=272,N=64,device_name=NVIDIA_A800-SXM4-80GB.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=288,N=64,device_name=NVIDIA_A800-SXM4-80GB.json → triton_3_2_0/E=288,N=64,device_name=NVIDIA_A800-SXM4-80GB.json} +0 -0
- {sglang-0.4.6.post5.dist-info → sglang-0.4.7.post1.dist-info}/licenses/LICENSE +0 -0
- {sglang-0.4.6.post5.dist-info → sglang-0.4.7.post1.dist-info}/top_level.txt +0 -0
sglang/srt/openai_api/adapter.py
CHANGED
@@ -41,7 +41,11 @@ from sglang.srt.conversation import (
     register_conv_template,
 )
 from sglang.srt.function_call.function_call_parser import FunctionCallParser
-from sglang.srt.managers.io_struct import
+from sglang.srt.managers.io_struct import (
+    EmbeddingReqInput,
+    GenerateReqInput,
+    V1RerankReqInput,
+)
 from sglang.srt.openai_api.protocol import (
     BatchRequest,
     BatchResponse,
@@ -69,10 +73,17 @@ from sglang.srt.openai_api.protocol import (
     FunctionResponse,
     LogProbs,
     MultimodalEmbeddingInput,
+    RerankResponse,
+    ScoringRequest,
+    ScoringResponse,
     ToolCall,
     TopLogprob,
     UsageInfo,
 )
+from sglang.srt.openai_api.utils import (
+    detect_template_content_format,
+    process_content_for_template_format,
+)
 from sglang.srt.reasoning_parser import ReasoningParser
 from sglang.utils import convert_json_schema_to_str, get_exception_traceback
 
@@ -80,6 +91,11 @@ logger = logging.getLogger(__name__)
 
 chat_template_name = None
 
+# Global cache for template content format detection (one model/template per instance)
+# NOTE: A better approach would be to initialize the chat template format when the endpoint is created
+_cached_chat_template = None
+_cached_template_format = None
+
 
 class FileMetadata:
     def __init__(self, filename: str, purpose: str):
@@ -531,6 +547,7 @@ def v1_generate_request(
     logprob_start_lens = []
     top_logprobs_nums = []
     lora_paths = []
+    return_hidden_states = []
 
     for request in all_requests:
         # NOTE: with openai API, the prompt's logprobs are always not computed
@@ -570,6 +587,7 @@
                 "no_stop_trim": request.no_stop_trim,
                 "ignore_eos": request.ignore_eos,
                 "skip_special_tokens": request.skip_special_tokens,
+                "logit_bias": request.logit_bias,
             }
         )
         return_logprobs.append(request.logprobs is not None)
@@ -577,6 +595,7 @@
         top_logprobs_nums.append(
             request.logprobs if request.logprobs is not None else 0
         )
+        return_hidden_states.append(request.return_hidden_states)
 
     if len(all_requests) == 1:
         if isinstance(prompts[0], str) or isinstance(prompts[0][0], str):
@@ -588,6 +607,7 @@
         logprob_start_lens = logprob_start_lens[0]
         top_logprobs_nums = top_logprobs_nums[0]
         lora_paths = lora_paths[0]
+        return_hidden_states = return_hidden_states[0]
     else:
         if isinstance(prompts[0], str) or isinstance(prompts[0][0], str):
             prompt_kwargs = {"text": prompts}
@@ -604,6 +624,10 @@
         stream=all_requests[0].stream,
         rid=request_ids,
         lora_path=lora_paths,
+        return_hidden_states=return_hidden_states,
+        bootstrap_host=all_requests[0].bootstrap_host,
+        bootstrap_port=all_requests[0].bootstrap_port,
+        bootstrap_room=all_requests[0].bootstrap_room,
     )
 
     return adapted_request, all_requests if len(all_requests) > 1 else all_requests[0]
@@ -669,6 +693,16 @@ def v1_generate_response(
         else:
             logprobs = None
 
+        hidden_states = None
+        if isinstance(request, list) and request[idx].return_hidden_states:
+            hidden_states = ret_item["meta_info"].get("hidden_states", None)
+        elif (not isinstance(request, list)) and request.return_hidden_states:
+            hidden_states = ret_item["meta_info"].get("hidden_states", None)
+        if hidden_states is not None:
+            hidden_states = (
+                hidden_states[-1] if hidden_states and len(hidden_states) > 1 else []
+            )
+
         finish_reason = ret_item["meta_info"]["finish_reason"]
 
         if to_file:
@@ -684,6 +718,8 @@
                     else None
                 ),
             }
+            if hidden_states is not None:
+                choice_data["hidden_states"] = hidden_states
         else:
             choice_data = CompletionResponseChoice(
                 index=idx,
@@ -695,6 +731,7 @@
                     if finish_reason and "matched" in finish_reason
                     else None
                 ),
+                hidden_states=hidden_states,
             )
 
         choices.append(choice_data)
@@ -763,6 +800,7 @@ async def v1_completions(tokenizer_manager, raw_request: Request):
         prompt_tokens = {}
         completion_tokens = {}
         cached_tokens = {}
+        hidden_states = {}
 
         try:
             async for content in tokenizer_manager.generate_request(
@@ -777,6 +815,9 @@
                 prompt_tokens[index] = content["meta_info"]["prompt_tokens"]
                 completion_tokens[index] = content["meta_info"]["completion_tokens"]
                 cached_tokens[index] = content["meta_info"].get("cached_tokens", 0)
+                hidden_states[index] = content["meta_info"].get(
+                    "hidden_states", None
+                ) or hidden_states.get(index)
 
                 if not stream_buffer: # The first chunk
                     if request.echo:
@@ -859,6 +900,27 @@
                 n_prev_tokens[index] = n_prev_token
 
                 yield f"data: {chunk.model_dump_json()}\n\n"
+            if request.return_hidden_states and hidden_states:
+                for index, choice_hidden_states in hidden_states.items():
+                    last_token_hidden_states = (
+                        choice_hidden_states[-1]
+                        if choice_hidden_states and len(choice_hidden_states) > 1
+                        else []
+                    )
+                    hidden_states_chunk = CompletionStreamResponse(
+                        id=content["meta_info"]["id"],
+                        created=created,
+                        choices=[
+                            CompletionResponseStreamChoice(
+                                text="",
+                                index=index,
+                                hidden_states=last_token_hidden_states,
+                                finish_reason=None,
+                            )
+                        ],
+                        model=request.model,
+                    )
+                    yield f"data: {hidden_states_chunk.model_dump_json()}\n\n"
             if request.stream_options and request.stream_options.include_usage:
                 total_prompt_tokens = sum(
                     tokens
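The hunks above wire a new return_hidden_states option through /v1/completions: the flag is collected per request, forwarded on GenerateReqInput, and, in streaming mode, answered with one extra SSE chunk per choice whose hidden_states field carries the final token's hidden state. A minimal client sketch of that contract (the URL, port, and model name below are illustrative assumptions, not part of this diff):

    import json
    import requests

    resp = requests.post(
        "http://localhost:30000/v1/completions",  # assumed local sglang server
        json={
            "model": "default",
            "prompt": "The capital of France is",
            "max_tokens": 8,
            "stream": True,
            "return_hidden_states": True,  # request field added by this diff
        },
        stream=True,
    )

    for line in resp.iter_lines():
        if not line or not line.startswith(b"data: ") or line == b"data: [DONE]":
            continue
        chunk = json.loads(line[len(b"data: "):])
        for choice in chunk["choices"]:
            # The trailing chunk emitted by the new code has empty text and a
            # populated hidden_states list for the last generated token.
            if choice.get("hidden_states"):
                print("last-token hidden state size:", len(choice["hidden_states"]))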
@@ -959,6 +1021,7 @@ def v1_chat_generate_request(
     top_logprobs_nums = []
     modalities_list = []
     lora_paths = []
+    return_hidden_states = []
 
     # NOTE: with openai API, the prompt's logprobs are always not computed
 
@@ -995,23 +1058,42 @@
 
         if chat_template_name is None:
             openai_compatible_messages = []
+            image_data = []
+            audio_data = []
+            modalities = []
+
+            # Detect template content format by analyzing the jinja template (cached globally)
+            global _cached_chat_template, _cached_template_format
+            current_template = tokenizer_manager.tokenizer.chat_template
+
+            if current_template != _cached_chat_template:
+                # Template changed or first time - analyze it
+                _cached_chat_template = current_template
+                _cached_template_format = detect_template_content_format(
+                    current_template
+                )
+                logger.info(
+                    f"Detected chat template content format: {_cached_template_format}"
+                )
+
+            template_content_format = _cached_template_format
 
             for message in request.messages:
                 if message.content is None:
                     message.content = ""
-                msg_dict = message.
+                msg_dict = message.model_dump()
+
+                # Process content based on detected template format
+                processed_msg = process_content_for_template_format(
+                    msg_dict,
+                    template_content_format,
+                    image_data,
+                    audio_data,
+                    modalities,
+                )
+                openai_compatible_messages.append(processed_msg)
+
+            # Handle assistant prefix for continue_final_message
             if (
                 openai_compatible_messages
                 and openai_compatible_messages[-1]["role"] == "assistant"
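For context on the hunk above: detect_template_content_format (new in sglang/srt/openai_api/utils.py, implementation not shown in this diff) inspects the tokenizer's Jinja chat template to decide whether it expects message.content as a plain string or as a list of typed parts, and process_content_for_template_format normalizes each message accordingly while collecting image/audio items. The two shapes being distinguished look roughly like this (an illustrative sketch; the labels are assumptions, not taken from the diff):

    # Plain-string content: templates that render {{ message.content }} directly.
    string_style = {"role": "user", "content": "Describe this image."}

    # Structured content: templates that iterate over typed content parts.
    parts_style = {
        "role": "user",
        "content": [
            {"type": "text", "text": "Describe this image."},
            {"type": "image_url", "image_url": {"url": "https://example.com/cat.png"}},
        ],
    }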
@@ -1065,9 +1147,9 @@
         if is_multimodal:
             prompt = tokenizer_manager.tokenizer.decode(prompt_ids)
             stop = request.stop
-            image_data = None
-            audio_data = None
-            modalities = []
+            image_data = image_data if image_data else None
+            audio_data = audio_data if audio_data else None
+            modalities = modalities if modalities else []
         else:
             conv = generate_chat_conv(request, chat_template_name)
             # If we should continue the final assistant message, adjust the conversation.
@@ -1143,6 +1225,7 @@
             "no_stop_trim": request.no_stop_trim,
             "ignore_eos": request.ignore_eos,
             "skip_special_tokens": request.skip_special_tokens,
+            "logit_bias": request.logit_bias,
         }
 
         if request.response_format and request.response_format.type == "json_schema":
@@ -1182,6 +1265,7 @@
         image_data_list.append(image_data)
         audio_data_list.append(audio_data)
         modalities_list.append(modalities)
+        return_hidden_states.append(request.return_hidden_states)
     if len(all_requests) == 1:
         if is_multimodal:
             # processor will need text input
@@ -1200,6 +1284,7 @@
         modalities_list = modalities_list[0]
         lora_paths = lora_paths[0]
         request_ids = request_ids[0]
+        return_hidden_states = return_hidden_states[0]
     else:
         if tokenizer_manager.model_config.is_multimodal:
             # processor will need text input
@@ -1226,6 +1311,7 @@
         bootstrap_host=all_requests[0].bootstrap_host,
         bootstrap_port=all_requests[0].bootstrap_port,
         bootstrap_room=all_requests[0].bootstrap_room,
+        return_hidden_states=return_hidden_states,
     )
 
     return adapted_request, all_requests if len(all_requests) > 1 else all_requests[0]
@@ -1286,6 +1372,20 @@ def v1_chat_generate_response(
         else:
             choice_logprobs = None
 
+        if isinstance(request, list) and request[idx].return_hidden_states:
+            include_hidden_states = True
+        elif not isinstance(request, list) and request.return_hidden_states:
+            include_hidden_states = True
+        else:
+            include_hidden_states = False
+        if include_hidden_states and ret_item["meta_info"].get("hidden_states", None):
+            hidden_states = ret_item["meta_info"]["hidden_states"]
+            hidden_states = (
+                hidden_states[-1] if hidden_states and len(hidden_states) > 1 else []
+            )
+        else:
+            hidden_states = None
+
         finish_reason = ret_item["meta_info"]["finish_reason"]
 
         tool_calls = None
@@ -1327,7 +1427,6 @@
                 tool_calls = [
                     ToolCall(
                         id=f"call_{base64.urlsafe_b64encode(uuid.uuid4().bytes).rstrip(b'=').decode()}",
-                        index=call_info.tool_index,
                         function=FunctionResponse(
                             name=call_info.name, arguments=call_info.parameters
                         ),
@@ -1359,6 +1458,8 @@
                     else None
                 ),
             }
+            if hidden_states is not None:
+                choice_data["hidden_states"] = hidden_states
         else:
             choice_data = ChatCompletionResponseChoice(
                 index=idx,
@@ -1375,6 +1476,7 @@
                     if finish_reason and "matched" in finish_reason
                     else None
                 ),
+                hidden_states=hidden_states,
             )
 
         choices.append(choice_data)
@@ -1391,7 +1493,9 @@
                 "id": ret[i]["meta_info"]["id"],
                 "object": "chat.completion",
                 "created": created,
-                "model":
+                "model": (
+                    request[i].model if isinstance(request, list) else request.model
+                ),
                 "choices": choice,
                 "usage": {
                     "prompt_tokens": ret[i]["meta_info"]["prompt_tokens"],
@@ -1445,19 +1549,23 @@ async def v1_chat_completions(
|
|
1445
1549
|
reasoning_parser_dict = {}
|
1446
1550
|
|
1447
1551
|
async def generate_stream_resp():
|
1448
|
-
|
1552
|
+
tool_index_previous = -1
|
1449
1553
|
is_firsts = {}
|
1450
1554
|
stream_buffers = {}
|
1451
1555
|
n_prev_tokens = {}
|
1452
1556
|
prompt_tokens = {}
|
1453
1557
|
completion_tokens = {}
|
1454
1558
|
cached_tokens = {}
|
1559
|
+
hidden_states = {}
|
1455
1560
|
try:
|
1456
1561
|
async for content in tokenizer_manager.generate_request(
|
1457
1562
|
adapted_request, raw_request
|
1458
1563
|
):
|
1459
1564
|
index = content.get("index", 0)
|
1460
1565
|
text = content["text"]
|
1566
|
+
hidden_states[index] = content["meta_info"].get(
|
1567
|
+
"hidden_states", None
|
1568
|
+
) or hidden_states.get(index)
|
1461
1569
|
|
1462
1570
|
is_first = is_firsts.get(index, True)
|
1463
1571
|
stream_buffer = stream_buffers.get(index, "")
|
@@ -1579,6 +1687,7 @@ async def v1_chat_completions(
|
|
1579
1687
|
if (delta and len(delta) == 0) or not delta:
|
1580
1688
|
stream_buffers[index] = new_stream_buffer
|
1581
1689
|
is_firsts[index] = is_first
|
1690
|
+
n_prev_tokens[index] = n_prev_token
|
1582
1691
|
continue
|
1583
1692
|
|
1584
1693
|
if request.tool_choice != "none" and request.tools:
|
@@ -1611,6 +1720,7 @@ async def v1_chat_completions(
|
|
1611
1720
|
|
1612
1721
|
# 2) if we found calls, we output them as separate chunk(s)
|
1613
1722
|
for call_item in calls:
|
1723
|
+
tool_index_current = call_item.tool_index
|
1614
1724
|
# transform call_item -> FunctionResponse + ToolCall
|
1615
1725
|
if finish_reason_type == "stop":
|
1616
1726
|
latest_delta_len = 0
|
@@ -1618,14 +1728,14 @@ async def v1_chat_completions(
|
|
1618
1728
|
latest_delta_len = len(call_item.parameters)
|
1619
1729
|
|
1620
1730
|
expected_call = json.dumps(
|
1621
|
-
parser.
|
1622
|
-
|
1623
|
-
|
1731
|
+
parser.detector.prev_tool_call_arr[index].get(
|
1732
|
+
"arguments", {}
|
1733
|
+
),
|
1624
1734
|
ensure_ascii=False,
|
1625
1735
|
)
|
1626
|
-
actual_call = parser.
|
1627
|
-
|
1628
|
-
]
|
1736
|
+
actual_call = parser.detector.streamed_args_for_tool[
|
1737
|
+
index
|
1738
|
+
]
|
1629
1739
|
if latest_delta_len > 0:
|
1630
1740
|
actual_call = actual_call[:-latest_delta_len]
|
1631
1741
|
remaining_call = expected_call.replace(
|
@@ -1637,7 +1747,7 @@ async def v1_chat_completions(
|
|
1637
1747
|
tool_call = ToolCall(
|
1638
1748
|
id=(
|
1639
1749
|
f"call_{base64.urlsafe_b64encode(uuid.uuid4().bytes).rstrip(b'=').decode()}"
|
1640
|
-
if
|
1750
|
+
if tool_index_previous != tool_index_current
|
1641
1751
|
else None
|
1642
1752
|
),
|
1643
1753
|
index=call_item.tool_index,
|
@@ -1646,7 +1756,7 @@ async def v1_chat_completions(
|
|
1646
1756
|
arguments=call_item.parameters,
|
1647
1757
|
),
|
1648
1758
|
)
|
1649
|
-
|
1759
|
+
tool_index_previous = tool_index_current
|
1650
1760
|
choice_data = ChatCompletionResponseStreamChoice(
|
1651
1761
|
index=index,
|
1652
1762
|
delta=DeltaMessage(tool_calls=[tool_call]),
|
@@ -1667,6 +1777,7 @@ async def v1_chat_completions(

                     stream_buffers[index] = new_stream_buffer
                     is_firsts[index] = is_first
+                    n_prev_tokens[index] = n_prev_token

                 else:
                     # No tool calls => just treat this as normal text
@@ -1699,6 +1810,7 @@ async def v1_chat_completions(
                    yield f"data: {chunk.model_dump_json()}\n\n"
                    stream_buffers[index] = new_stream_buffer
                    is_firsts[index] = is_first
+                   n_prev_tokens[index] = n_prev_token
                if finish_reason_type == "stop" and request.tool_choice != "none":
                    parser = FunctionCallParser(
                        tools=request.tools,
@@ -1734,6 +1846,28 @@ async def v1_chat_completions(

         else:
             usage = None
+        if request.return_hidden_states and hidden_states:
+            for index, choice_hidden_states in hidden_states.items():
+                last_token_hidden_states = (
+                    choice_hidden_states[-1]
+                    if choice_hidden_states and len(choice_hidden_states) > 1
+                    else []
+                )
+                hidden_states_chunk = ChatCompletionStreamResponse(
+                    id=content["meta_info"]["id"],
+                    created=created,
+                    choices=[
+                        ChatCompletionResponseStreamChoice(
+                            index=index,
+                            delta=DeltaMessage(
+                                hidden_states=last_token_hidden_states
+                            ),
+                            finish_reason=finish_reason_type,
+                        )
+                    ],
+                    model=request.model,
+                )
+                yield f"data: {hidden_states_chunk.model_dump_json()}\n\n"
         final_usage_chunk = ChatCompletionStreamResponse(
             id=content["meta_info"]["id"],
             created=created,
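With `return_hidden_states` set, an extra chunk per choice carries the last token's hidden state in `delta.hidden_states` near the end of the stream. A hypothetical client against a locally running server (the URL, port, and model name are placeholders, and the server must be launched with hidden-state return enabled):

    import json
    import requests  # third-party; pip install requests

    resp = requests.post(
        "http://localhost:30000/v1/chat/completions",  # placeholder URL
        json={
            "model": "placeholder-model",
            "messages": [{"role": "user", "content": "Hi"}],
            "stream": True,
            "return_hidden_states": True,
        },
        stream=True,
    )
    for line in resp.iter_lines():
        if not line or not line.startswith(b"data: ") or line == b"data: [DONE]":
            continue
        chunk = json.loads(line[len(b"data: "):])
        delta = chunk["choices"][0]["delta"]
        if "hidden_states" in delta:  # present only on the hidden-states chunk
            print(len(delta["hidden_states"]), "hidden dimensions")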
@@ -1891,6 +2025,64 @@ async def v1_embeddings(tokenizer_manager, raw_request: Request):
     return response


+def v1_rerank_request(obj: V1RerankReqInput):
+    if obj.query is None:
+        raise ValueError("query is required")
+    if obj.documents is None or len(obj.documents) == 0:
+        raise ValueError("documents is required")
+
+    pairs = []
+    for doc in obj.documents:
+        pairs.append([obj.query, doc])
+
+    adapted_request = EmbeddingReqInput(
+        text=pairs,
+        is_cross_encoder_request=True,
+    )
+
+    return adapted_request
+
+
+def v1_rerank_response(ret, obj: V1RerankReqInput):
+
+    response = []
+    for idx, ret_item in enumerate(ret):
+        response.append(
+            RerankResponse(
+                score=ret[idx]["embedding"],
+                document=obj.documents[idx],
+                index=idx,
+                meta_info=ret[idx]["meta_info"],
+            )
+        )
+
+    response.sort(key=lambda x: x.score, reverse=True)
+
+    return response
+
+
+async def v1_rerank(tokenizer_manager, obj: V1RerankReqInput, raw_request: Request):
+    adapted_request = v1_rerank_request(obj)
+
+    try:
+        ret = await tokenizer_manager.generate_request(
+            adapted_request, raw_request
+        ).__anext__()
+
+    except ValueError as e:
+        return create_error_response(str(e))
+
+    if not isinstance(ret, list):
+        ret = [ret]
+
+    response = v1_rerank_response(
+        ret,
+        obj,
+    )
+
+    return response
+
+
 def to_openai_style_logprobs(
     input_token_logprobs=None,
     output_token_logprobs=None,
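The new v1_rerank path scores each (query, document) pair through the cross-encoder embedding request and returns results sorted by descending score, with `index` pointing back into the original documents list. A hypothetical call to the matching HTTP route (the `/v1/rerank` path and port are assumptions):

    import requests  # pip install requests

    resp = requests.post(
        "http://localhost:30000/v1/rerank",  # assumed route for v1_rerank
        json={
            "query": "what is the capital of France?",
            "documents": [
                "Paris is the capital of France.",
                "Berlin is the capital of Germany.",
            ],
        },
    )
    for item in resp.json():
        # Items arrive sorted by score, highest first.
        print(round(item["score"], 3), item["index"], item["document"])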
@@ -1926,3 +2118,31 @@ def to_openai_style_logprobs(
         append_top_logprobs(output_top_logprobs)

     return ret_logprobs
+
+
+async def v1_score(tokenizer_manager, raw_request):
+    try:
+        # Parse request
+        request_data = await raw_request.json()
+        request = ScoringRequest(**request_data)
+
+        # Use tokenizer_manager's score_request method directly
+        scores = await tokenizer_manager.score_request(
+            query=request.query,
+            items=request.items,
+            label_token_ids=request.label_token_ids,
+            apply_softmax=request.apply_softmax,
+            item_first=request.item_first,
+            request=request,
+        )
+
+        # Create response with just the scores, without usage info
+        response = ScoringResponse(
+            scores=scores,
+            model=request.model,
+        )
+        return response
+
+    except Exception as e:
+        logger.error(f"Error in v1_score: {str(e)}")
+        return create_error_response(str(e))
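v1_score parses a ScoringRequest and forwards it to tokenizer_manager.score_request, returning one list of probabilities per item, ordered like `label_token_ids`. A hypothetical request against the matching route (the `/v1/score` path, model name, and token ids are assumptions):

    import requests  # pip install requests

    resp = requests.post(
        "http://localhost:30000/v1/score",  # assumed route for v1_score
        json={
            "model": "placeholder-model",
            "query": "Is this review positive?\nReview: great food!\nAnswer:",
            "items": [" Yes", " No"],
            "label_token_ids": [9454, 2753],  # hypothetical ids for "Yes"/"No"
            "apply_softmax": True,
        },
    )
    body = resp.json()
    for item_scores in body["scores"]:  # one list per item
        print(item_scores)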
@@ -16,7 +16,7 @@
 import time
 from typing import Dict, List, Optional, Union

-from pydantic import BaseModel, Field, root_validator
+from pydantic import BaseModel, Field, model_serializer, root_validator
 from typing_extensions import Literal

@@ -182,14 +182,25 @@ class CompletionRequest(BaseModel):
     skip_special_tokens: bool = True
     lora_path: Optional[Union[List[Optional[str]], Optional[str]]] = None
     session_params: Optional[Dict] = None
+    return_hidden_states: Optional[bool] = False
+
+    # For PD disaggregation
+    bootstrap_host: Optional[str] = None
+    bootstrap_port: Optional[int] = None
+    bootstrap_room: Optional[int] = None


 class CompletionResponseChoice(BaseModel):
     index: int
     text: str
     logprobs: Optional[LogProbs] = None
-    finish_reason: Literal["stop", "length", "content_filter"]
+    finish_reason: Literal["stop", "length", "content_filter", "abort"]
     matched_stop: Union[None, int, str] = None
+    hidden_states: Optional[object] = None
+
+    @model_serializer
+    def _serialize(self):
+        return exclude_if_none(self, ["hidden_states"])


 class CompletionResponse(BaseModel):
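The new bootstrap_* fields let a completion request carry prefill/decode disaggregation rendezvous info alongside the hidden-states flag. A sketch of such a payload (all values are placeholders):

    # Payload for a disaggregated prefill/decode deployment; the bootstrap
    # fields identify where the two sides rendezvous for KV transfer.
    payload = {
        "model": "placeholder-model",
        "prompt": "Hello",
        "return_hidden_states": True,  # also new in this version
        "bootstrap_host": "10.0.0.1",  # placeholder prefill host
        "bootstrap_port": 8998,        # placeholder bootstrap port
        "bootstrap_room": 42,          # placeholder room id
    }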
@@ -207,6 +218,11 @@ class CompletionResponseStreamChoice(BaseModel):
     logprobs: Optional[LogProbs] = None
     finish_reason: Optional[Literal["stop", "length", "content_filter"]] = None
     matched_stop: Union[None, int, str] = None
+    hidden_states: Optional[object] = None
+
+    @model_serializer
+    def _serialize(self):
+        return exclude_if_none(self, ["hidden_states"])


 class CompletionStreamResponse(BaseModel):
@@ -400,6 +416,9 @@ class ChatCompletionRequest(BaseModel):
     bootstrap_port: Optional[int] = None
     bootstrap_room: Optional[int] = None

+    # Hidden States
+    return_hidden_states: Optional[bool] = False
+

 class ChatMessage(BaseModel):
     role: Optional[str] = None
@@ -413,9 +432,14 @@ class ChatCompletionResponseChoice(BaseModel):
     message: ChatMessage
     logprobs: Optional[Union[LogProbs, ChoiceLogprobs]] = None
     finish_reason: Literal[
-        "stop", "length", "tool_calls", "content_filter", "function_call"
+        "stop", "length", "tool_calls", "content_filter", "function_call", "abort"
     ]
     matched_stop: Union[None, int, str] = None
+    hidden_states: Optional[object] = None
+
+    @model_serializer
+    def _serialize(self):
+        return exclude_if_none(self, ["hidden_states"])


 class ChatCompletionResponse(BaseModel):
@@ -432,6 +456,11 @@ class DeltaMessage(BaseModel):
     content: Optional[str] = None
     reasoning_content: Optional[str] = None
     tool_calls: Optional[List[ToolCall]] = Field(default=None, examples=[None])
+    hidden_states: Optional[object] = None
+
+    @model_serializer
+    def _serialize(self):
+        return exclude_if_none(self, ["hidden_states"])


 class ChatCompletionResponseStreamChoice(BaseModel):
@@ -484,3 +513,39 @@ class EmbeddingResponse(BaseModel):
     model: str
     object: str = "list"
     usage: Optional[UsageInfo] = None
+
+
+class ScoringRequest(BaseModel):
+    query: Optional[Union[str, List[int]]] = (
+        None  # Query text or pre-tokenized token IDs
+    )
+    items: Optional[Union[str, List[str], List[List[int]]]] = (
+        None  # Item text(s) or pre-tokenized token IDs
+    )
+    label_token_ids: Optional[List[int]] = (
+        None  # Token IDs to compute probabilities for
+    )
+    apply_softmax: bool = False
+    item_first: bool = False
+    model: str
+
+
+class ScoringResponse(BaseModel):
+    scores: List[
+        List[float]
+    ]  # List of lists of probabilities, each in the order of label_token_ids
+    model: str
+    usage: Optional[UsageInfo] = None
+    object: str = "scoring"
+
+
+class RerankResponse(BaseModel):
+    score: float
+    document: str
+    index: int
+    meta_info: Optional[dict] = None
+
+
+def exclude_if_none(obj, field_names: List[str]):
+    omit_if_none_fields = {k for k, v in obj.model_fields.items() if k in field_names}
+    return {k: v for k, v in obj if k not in omit_if_none_fields or v is not None}
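The exclude_if_none helper, combined with Pydantic v2's @model_serializer, drops the listed fields from serialized output whenever they are None, so responses only include hidden_states when it was actually populated. A minimal self-contained sketch of the same pattern, using an illustrative `Choice` model:

    from typing import List, Optional

    from pydantic import BaseModel, model_serializer

    def exclude_if_none(obj, field_names: List[str]):
        omit_if_none_fields = {k for k, v in obj.model_fields.items() if k in field_names}
        return {k: v for k, v in obj if k not in omit_if_none_fields or v is not None}

    class Choice(BaseModel):  # illustrative model, not the real response class
        index: int
        hidden_states: Optional[object] = None

        @model_serializer
        def _serialize(self):
            return exclude_if_none(self, ["hidden_states"])

    print(Choice(index=0).model_dump())                       # {'index': 0}
    print(Choice(index=0, hidden_states=[0.1]).model_dump())  # includes hidden_states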