sglang 0.4.6.post5__py3-none-any.whl → 0.4.7.post1__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (359)
  1. sglang/__init__.py +2 -0
  2. sglang/api.py +7 -0
  3. sglang/bench_offline_throughput.py +10 -4
  4. sglang/bench_one_batch_server.py +67 -11
  5. sglang/bench_serving.py +86 -75
  6. sglang/lang/backend/runtime_endpoint.py +24 -1
  7. sglang/lang/interpreter.py +40 -1
  8. sglang/lang/ir.py +27 -0
  9. sglang/math_utils.py +8 -0
  10. sglang/profiler.py +167 -0
  11. sglang/srt/_custom_ops.py +34 -0
  12. sglang/srt/configs/internvl.py +8 -12
  13. sglang/srt/configs/model_config.py +33 -1
  14. sglang/srt/constrained/base_grammar_backend.py +5 -2
  15. sglang/srt/constrained/llguidance_backend.py +9 -8
  16. sglang/srt/constrained/outlines_backend.py +5 -4
  17. sglang/srt/constrained/xgrammar_backend.py +18 -18
  18. sglang/srt/conversation.py +52 -8
  19. sglang/srt/custom_op.py +38 -3
  20. sglang/srt/debug_utils.py +74 -0
  21. sglang/srt/disaggregation/base/__init__.py +1 -1
  22. sglang/srt/disaggregation/base/conn.py +25 -11
  23. sglang/srt/disaggregation/common/__init__.py +5 -0
  24. sglang/srt/disaggregation/common/conn.py +407 -0
  25. sglang/srt/disaggregation/common/utils.py +42 -0
  26. sglang/srt/disaggregation/decode.py +261 -52
  27. sglang/srt/disaggregation/fake/__init__.py +1 -1
  28. sglang/srt/disaggregation/fake/conn.py +16 -9
  29. sglang/srt/disaggregation/kv_events.py +60 -5
  30. sglang/srt/disaggregation/launch_lb.py +140 -0
  31. sglang/srt/disaggregation/mini_lb.py +29 -48
  32. sglang/srt/disaggregation/mooncake/__init__.py +1 -1
  33. sglang/srt/disaggregation/mooncake/conn.py +446 -149
  34. sglang/srt/disaggregation/mooncake/transfer_engine.py +32 -16
  35. sglang/srt/disaggregation/nixl/__init__.py +6 -1
  36. sglang/srt/disaggregation/nixl/conn.py +134 -437
  37. sglang/srt/disaggregation/prefill.py +130 -43
  38. sglang/srt/disaggregation/utils.py +127 -86
  39. sglang/srt/distributed/device_communicators/pymscclpp.py +315 -0
  40. sglang/srt/distributed/parallel_state.py +52 -5
  41. sglang/srt/entrypoints/EngineBase.py +6 -0
  42. sglang/srt/entrypoints/engine.py +116 -5
  43. sglang/srt/entrypoints/http_server.py +28 -4
  44. sglang/srt/eplb_simulator/__init__.py +1 -0
  45. sglang/srt/eplb_simulator/reader.py +51 -0
  46. sglang/srt/function_call/base_format_detector.py +138 -86
  47. sglang/srt/function_call/deepseekv3_detector.py +54 -6
  48. sglang/srt/function_call/ebnf_composer.py +33 -19
  49. sglang/srt/function_call/function_call_parser.py +27 -0
  50. sglang/srt/function_call/llama32_detector.py +33 -14
  51. sglang/srt/function_call/mistral_detector.py +73 -26
  52. sglang/srt/function_call/pythonic_detector.py +86 -20
  53. sglang/srt/function_call/qwen25_detector.py +64 -10
  54. sglang/srt/function_call/utils.py +17 -0
  55. sglang/srt/hf_transformers_utils.py +4 -0
  56. sglang/srt/layers/activation.py +19 -0
  57. sglang/srt/layers/attention/aiter_backend.py +503 -125
  58. sglang/srt/layers/attention/base_attn_backend.py +4 -0
  59. sglang/srt/layers/attention/cutlass_mla_backend.py +40 -34
  60. sglang/srt/layers/attention/flashattention_backend.py +137 -63
  61. sglang/srt/layers/attention/flashinfer_backend.py +46 -3
  62. sglang/srt/layers/attention/flashinfer_mla_backend.py +59 -25
  63. sglang/srt/layers/attention/flashmla_backend.py +2 -10
  64. sglang/srt/layers/attention/intel_amx_backend.py +128 -0
  65. sglang/srt/layers/attention/tbo_backend.py +232 -0
  66. sglang/srt/layers/attention/torch_native_backend.py +3 -0
  67. sglang/srt/layers/attention/triton_backend.py +304 -65
  68. sglang/srt/layers/attention/triton_ops/decode_attention.py +2 -7
  69. sglang/srt/layers/attention/triton_ops/extend_attention.py +12 -4
  70. sglang/srt/layers/attention/vision.py +51 -24
  71. sglang/srt/layers/communicator.py +281 -197
  72. sglang/srt/layers/dp_attention.py +6 -5
  73. sglang/srt/layers/layernorm.py +30 -19
  74. sglang/srt/layers/linear.py +0 -4
  75. sglang/srt/layers/logits_processor.py +0 -12
  76. sglang/srt/layers/moe/cutlass_moe.py +170 -7
  77. sglang/srt/layers/moe/cutlass_moe_params.py +169 -0
  78. sglang/srt/layers/moe/ep_moe/kernels.py +33 -11
  79. sglang/srt/layers/moe/ep_moe/layer.py +136 -72
  80. sglang/srt/layers/moe/ep_moe/token_dispatcher.py +24 -45
  81. sglang/srt/layers/moe/fused_moe_native.py +4 -0
  82. sglang/srt/layers/moe/fused_moe_triton/configs/triton_3_2_0/E=257,N=128,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8,block_shape=[128, 128].json +146 -0
  83. sglang/srt/layers/moe/fused_moe_triton/configs/triton_3_2_0/E=257,N=128,device_name=NVIDIA_H20,dtype=fp8_w8a8,block_shape=[128, 128].json +146 -0
  84. sglang/srt/layers/moe/fused_moe_triton/configs/triton_3_2_0/E=257,N=256,device_name=NVIDIA_B200,dtype=fp8_w8a8,block_shape=[128, 128].json +146 -0
  85. sglang/srt/layers/moe/fused_moe_triton/configs/triton_3_2_0/E=257,N=256,device_name=NVIDIA_H20,dtype=fp8_w8a8,block_shape=[128, 128].json +146 -0
  86. sglang/srt/layers/moe/fused_moe_triton/configs/triton_3_2_0/E=257,N=256,device_name=NVIDIA_H200,dtype=fp8_w8a8,block_shape=[128, 128].json +146 -0
  87. sglang/srt/layers/moe/fused_moe_triton/configs/triton_3_2_0/E=8,N=7168,device_name=NVIDIA_H100_80GB_HBM3.json +146 -0
  88. sglang/srt/layers/moe/fused_moe_triton/configs/triton_3_3_1/E=257,N=256,device_name=NVIDIA_B200,dtype=fp8_w8a8,block_shape=[128, 128].json +146 -0
  89. sglang/srt/layers/moe/fused_moe_triton/configs/triton_3_3_1/E=8,N=7168,device_name=NVIDIA_H100_80GB_HBM3.json +146 -0
  90. sglang/srt/layers/moe/fused_moe_triton/fused_moe.py +221 -29
  91. sglang/srt/layers/moe/fused_moe_triton/layer.py +34 -4
  92. sglang/srt/layers/moe/topk.py +60 -26
  93. sglang/srt/layers/multimodal.py +3 -3
  94. sglang/srt/layers/pooler.py +56 -0
  95. sglang/srt/layers/quantization/__init__.py +3 -2
  96. sglang/srt/layers/quantization/blockwise_int8.py +3 -0
  97. sglang/srt/layers/quantization/compressed_tensors/compressed_tensors_moe.py +5 -0
  98. sglang/srt/layers/quantization/deep_gemm_wrapper/__init__.py +1 -0
  99. sglang/srt/layers/quantization/{deep_gemm.py → deep_gemm_wrapper/compile_utils.py} +69 -127
  100. sglang/srt/layers/quantization/deep_gemm_wrapper/configurer.py +32 -0
  101. sglang/srt/layers/quantization/deep_gemm_wrapper/entrypoint.py +110 -0
  102. sglang/srt/layers/quantization/fp8.py +28 -23
  103. sglang/srt/layers/quantization/fp8_kernel.py +156 -75
  104. sglang/srt/layers/quantization/fp8_utils.py +250 -69
  105. sglang/srt/layers/quantization/modelopt_quant.py +334 -7
  106. sglang/srt/layers/quantization/moe_wna16.py +3 -0
  107. sglang/srt/layers/quantization/w8a8_fp8.py +3 -0
  108. sglang/srt/layers/quantization/w8a8_int8.py +3 -0
  109. sglang/srt/layers/radix_attention.py +2 -3
  110. sglang/srt/layers/rotary_embedding.py +6 -12
  111. sglang/srt/layers/sampler.py +80 -79
  112. sglang/srt/layers/utils.py +6 -0
  113. sglang/srt/lora/layers.py +12 -15
  114. sglang/srt/lora/lora.py +49 -5
  115. sglang/srt/lora/lora_manager.py +98 -39
  116. sglang/srt/lora/mem_pool.py +28 -21
  117. sglang/srt/lora/utils.py +17 -13
  118. sglang/srt/managers/cache_controller.py +2 -1
  119. sglang/srt/managers/data_parallel_controller.py +13 -5
  120. sglang/srt/managers/eplb_algorithms/__init__.py +63 -0
  121. sglang/srt/managers/eplb_algorithms/deepseek.py +223 -0
  122. sglang/srt/managers/{deepseek_eplb.py → eplb_algorithms/deepseek_vec.py} +5 -7
  123. sglang/srt/managers/eplb_manager.py +55 -14
  124. sglang/srt/managers/expert_distribution.py +220 -46
  125. sglang/srt/managers/expert_location.py +110 -56
  126. sglang/srt/managers/expert_location_dispatch.py +23 -6
  127. sglang/srt/managers/io_struct.py +43 -8
  128. sglang/srt/managers/mm_utils.py +88 -38
  129. sglang/srt/managers/multimodal_processors/base_processor.py +190 -18
  130. sglang/srt/managers/multimodal_processors/gemma3.py +4 -31
  131. sglang/srt/managers/multimodal_processors/internvl.py +4 -0
  132. sglang/srt/managers/multimodal_processors/kimi_vl.py +15 -34
  133. sglang/srt/managers/multimodal_processors/minicpm.py +2 -1
  134. sglang/srt/managers/multimodal_processors/phi4mm.py +87 -0
  135. sglang/srt/managers/multimodal_processors/qwen_vl.py +22 -64
  136. sglang/srt/managers/multimodal_processors/vila.py +85 -0
  137. sglang/srt/managers/schedule_batch.py +173 -38
  138. sglang/srt/managers/scheduler.py +376 -127
  139. sglang/srt/managers/tokenizer_manager.py +163 -19
  140. sglang/srt/managers/utils.py +0 -4
  141. sglang/srt/mem_cache/chunk_cache.py +1 -0
  142. sglang/srt/mem_cache/hiradix_cache.py +4 -2
  143. sglang/srt/mem_cache/memory_pool.py +111 -407
  144. sglang/srt/mem_cache/memory_pool_host.py +380 -0
  145. sglang/srt/mem_cache/radix_cache.py +36 -12
  146. sglang/srt/metrics/collector.py +9 -0
  147. sglang/srt/model_executor/cuda_graph_runner.py +191 -113
  148. sglang/srt/model_executor/expert_location_updater.py +157 -22
  149. sglang/srt/model_executor/forward_batch_info.py +52 -22
  150. sglang/srt/model_executor/model_runner.py +102 -62
  151. sglang/srt/model_loader/loader.py +8 -1
  152. sglang/srt/model_loader/utils.py +67 -1
  153. sglang/srt/models/bert.py +113 -13
  154. sglang/srt/models/deepseek_nextn.py +1 -1
  155. sglang/srt/models/deepseek_v2.py +623 -290
  156. sglang/srt/models/gemma3_causal.py +7 -0
  157. sglang/srt/models/gemma3_mm.py +19 -14
  158. sglang/srt/models/idefics2.py +342 -0
  159. sglang/srt/models/internvl.py +46 -102
  160. sglang/srt/models/kimi_vl.py +4 -4
  161. sglang/srt/models/llama.py +1 -1
  162. sglang/srt/models/minicpmo.py +2 -5
  163. sglang/srt/models/minicpmv.py +3 -295
  164. sglang/srt/models/phi4mm.py +512 -0
  165. sglang/srt/models/qwen2.py +38 -9
  166. sglang/srt/models/qwen2_5_vl.py +3 -9
  167. sglang/srt/models/qwen2_eagle.py +4 -1
  168. sglang/srt/models/qwen2_moe.py +58 -191
  169. sglang/srt/models/qwen2_vl.py +3 -9
  170. sglang/srt/models/qwen3.py +41 -10
  171. sglang/srt/models/qwen3_moe.py +230 -191
  172. sglang/srt/models/registry.py +9 -1
  173. sglang/srt/models/roberta.py +117 -9
  174. sglang/srt/models/transformers.py +291 -0
  175. sglang/srt/models/vila.py +305 -0
  176. sglang/srt/openai_api/adapter.py +248 -28
  177. sglang/srt/openai_api/protocol.py +68 -3
  178. sglang/srt/openai_api/utils.py +172 -0
  179. sglang/srt/operations.py +37 -2
  180. sglang/srt/operations_strategy.py +200 -24
  181. sglang/srt/sampling/sampling_batch_info.py +37 -1
  182. sglang/srt/sampling/sampling_params.py +4 -1
  183. sglang/srt/server_args.py +381 -209
  184. sglang/srt/speculative/build_eagle_tree.py +9 -9
  185. sglang/srt/speculative/eagle_draft_cuda_graph_runner.py +12 -14
  186. sglang/srt/speculative/eagle_draft_extend_cuda_graph_runner.py +256 -0
  187. sglang/srt/speculative/eagle_utils.py +440 -200
  188. sglang/srt/speculative/eagle_worker.py +234 -63
  189. sglang/srt/two_batch_overlap.py +637 -0
  190. sglang/srt/utils.py +187 -7
  191. sglang/test/attention/test_prefix_chunk_info.py +2 -0
  192. sglang/test/runners.py +54 -10
  193. sglang/test/send_one.py +4 -0
  194. sglang/test/test_block_fp8.py +1 -0
  195. sglang/test/test_block_fp8_deep_gemm_blackwell.py +252 -0
  196. sglang/test/test_block_fp8_ep.py +1 -0
  197. sglang/test/test_cutlass_moe.py +3 -3
  198. sglang/test/test_fp4_moe.py +248 -0
  199. sglang/test/test_utils.py +82 -7
  200. sglang/utils.py +9 -0
  201. sglang/version.py +1 -1
  202. {sglang-0.4.6.post5.dist-info → sglang-0.4.7.post1.dist-info}/METADATA +17 -14
  203. {sglang-0.4.6.post5.dist-info → sglang-0.4.7.post1.dist-info}/RECORD +359 -321
  204. {sglang-0.4.6.post5.dist-info → sglang-0.4.7.post1.dist-info}/WHEEL +1 -1
  205. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=1,N=14336,device_name=NVIDIA_A100-SXM4-80GB,dtype=int8_w8a16.json → triton_3_1_0/E=1,N=14336,device_name=NVIDIA_A100-SXM4-80GB,dtype=int8_w8a16.json} +0 -0
  206. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=1,N=14336,device_name=NVIDIA_A100-SXM4-80GB.json → triton_3_1_0/E=1,N=14336,device_name=NVIDIA_A100-SXM4-80GB.json} +0 -0
  207. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=1,N=1792,device_name=NVIDIA_A100-SXM4-80GB,dtype=int8_w8a16.json → triton_3_1_0/E=1,N=1792,device_name=NVIDIA_A100-SXM4-80GB,dtype=int8_w8a16.json} +0 -0
  208. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=1,N=1792,device_name=NVIDIA_A100-SXM4-80GB.json → triton_3_1_0/E=1,N=1792,device_name=NVIDIA_A100-SXM4-80GB.json} +0 -0
  209. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=1,N=3072,device_name=NVIDIA_A100-SXM4-80GB,dtype=int8_w8a16.json → triton_3_1_0/E=1,N=3072,device_name=NVIDIA_A100-SXM4-80GB,dtype=int8_w8a16.json} +0 -0
  210. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=1,N=3072,device_name=NVIDIA_H100_80GB_HBM3,dtype=int8_w8a16.json → triton_3_1_0/E=1,N=3072,device_name=NVIDIA_H100_80GB_HBM3,dtype=int8_w8a16.json} +0 -0
  211. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=1,N=3072,device_name=NVIDIA_H100_80GB_HBM3.json → triton_3_1_0/E=1,N=3072,device_name=NVIDIA_H100_80GB_HBM3.json} +0 -0
  212. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=1,N=3584,device_name=NVIDIA_A100-SXM4-80GB,dtype=int8_w8a16.json → triton_3_1_0/E=1,N=3584,device_name=NVIDIA_A100-SXM4-80GB,dtype=int8_w8a16.json} +0 -0
  213. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=1,N=3584,device_name=NVIDIA_A100-SXM4-80GB.json → triton_3_1_0/E=1,N=3584,device_name=NVIDIA_A100-SXM4-80GB.json} +0 -0
  214. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=1,N=7168,device_name=NVIDIA_A100-SXM4-80GB,dtype=int8_w8a16.json → triton_3_1_0/E=1,N=7168,device_name=NVIDIA_A100-SXM4-80GB,dtype=int8_w8a16.json} +0 -0
  215. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=1,N=7168,device_name=NVIDIA_A100-SXM4-80GB.json → triton_3_1_0/E=1,N=7168,device_name=NVIDIA_A100-SXM4-80GB.json} +0 -0
  216. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=144,N=512,device_name=NVIDIA_H100_80GB_HBM3.json → triton_3_1_0/E=144,N=512,device_name=NVIDIA_H100_80GB_HBM3.json} +0 -0
  217. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=16,N=1024,device_name=NVIDIA_H100_80GB_HBM3.json → triton_3_1_0/E=16,N=1024,device_name=NVIDIA_H100_80GB_HBM3.json} +0 -0
  218. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=16,N=1024,device_name=NVIDIA_H200.json → triton_3_1_0/E=16,N=1024,device_name=NVIDIA_H200.json} +0 -0
  219. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=16,N=1344,device_name=NVIDIA_A100-SXM4-40GB.json → triton_3_1_0/E=16,N=1344,device_name=NVIDIA_A100-SXM4-40GB.json} +0 -0
  220. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=16,N=1344,device_name=NVIDIA_A100-SXM4-80GB.json → triton_3_1_0/E=16,N=1344,device_name=NVIDIA_A100-SXM4-80GB.json} +0 -0
  221. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=16,N=1344,device_name=NVIDIA_H100_80GB_HBM3.json → triton_3_1_0/E=16,N=1344,device_name=NVIDIA_H100_80GB_HBM3.json} +0 -0
  222. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=16,N=14336,device_name=NVIDIA_A100-SXM4-80GB,dtype=int8_w8a16.json → triton_3_1_0/E=16,N=14336,device_name=NVIDIA_A100-SXM4-80GB,dtype=int8_w8a16.json} +0 -0
  223. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=16,N=14336,device_name=NVIDIA_A100-SXM4-80GB.json → triton_3_1_0/E=16,N=14336,device_name=NVIDIA_A100-SXM4-80GB.json} +0 -0
  224. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=16,N=1792,device_name=NVIDIA_A100-SXM4-80GB,dtype=int8_w8a16.json → triton_3_1_0/E=16,N=1792,device_name=NVIDIA_A100-SXM4-80GB,dtype=int8_w8a16.json} +0 -0
  225. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=16,N=1792,device_name=NVIDIA_A100-SXM4-80GB.json → triton_3_1_0/E=16,N=1792,device_name=NVIDIA_A100-SXM4-80GB.json} +0 -0
  226. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=16,N=2048,device_name=NVIDIA_H100_80GB_HBM3.json → triton_3_1_0/E=16,N=2048,device_name=NVIDIA_H100_80GB_HBM3.json} +0 -0
  227. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=16,N=2688,device_name=NVIDIA_A100-SXM4-80GB.json → triton_3_1_0/E=16,N=2688,device_name=NVIDIA_A100-SXM4-80GB.json} +0 -0
  228. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=16,N=2688,device_name=NVIDIA_H100_80GB_HBM3.json → triton_3_1_0/E=16,N=2688,device_name=NVIDIA_H100_80GB_HBM3.json} +0 -0
  229. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=16,N=3072,device_name=NVIDIA_A100-SXM4-80GB,dtype=int8_w8a16.json → triton_3_1_0/E=16,N=3072,device_name=NVIDIA_A100-SXM4-80GB,dtype=int8_w8a16.json} +0 -0
  230. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=16,N=3072,device_name=NVIDIA_H100_80GB_HBM3,dtype=int8_w8a16.json → triton_3_1_0/E=16,N=3072,device_name=NVIDIA_H100_80GB_HBM3,dtype=int8_w8a16.json} +0 -0
  231. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=16,N=3200,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8.json → triton_3_1_0/E=16,N=3200,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8.json} +0 -0
  232. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=16,N=3584,device_name=NVIDIA_A100-SXM4-80GB,dtype=int8_w8a16.json → triton_3_1_0/E=16,N=3584,device_name=NVIDIA_A100-SXM4-80GB,dtype=int8_w8a16.json} +0 -0
  233. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=16,N=3584,device_name=NVIDIA_A100-SXM4-80GB.json → triton_3_1_0/E=16,N=3584,device_name=NVIDIA_A100-SXM4-80GB.json} +0 -0
  234. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=16,N=6400,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8.json → triton_3_1_0/E=16,N=6400,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8.json} +0 -0
  235. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=16,N=7168,device_name=NVIDIA_A100-SXM4-80GB,dtype=int8_w8a16.json → triton_3_1_0/E=16,N=7168,device_name=NVIDIA_A100-SXM4-80GB,dtype=int8_w8a16.json} +0 -0
  236. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=16,N=7168,device_name=NVIDIA_A100-SXM4-80GB.json → triton_3_1_0/E=16,N=7168,device_name=NVIDIA_A100-SXM4-80GB.json} +0 -0
  237. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=16,N=7168,device_name=NVIDIA_H100_80GB_HBM3,dtype=int8_w8a16.json → triton_3_1_0/E=16,N=7168,device_name=NVIDIA_H100_80GB_HBM3,dtype=int8_w8a16.json} +0 -0
  238. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=16,N=800,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8.json → triton_3_1_0/E=16,N=800,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8.json} +0 -0
  239. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=160,N=192,device_name=NVIDIA_A800-SXM4-80GB.json → triton_3_1_0/E=160,N=192,device_name=NVIDIA_A800-SXM4-80GB.json} +0 -0
  240. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=20,N=2048,device_name=NVIDIA_H100_80GB_HBM3.json → triton_3_1_0/E=20,N=2048,device_name=NVIDIA_H100_80GB_HBM3.json} +0 -0
  241. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=24,N=1024,device_name=NVIDIA_H100_80GB_HBM3.json → triton_3_1_0/E=24,N=1024,device_name=NVIDIA_H100_80GB_HBM3.json} +0 -0
  242. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=256,N=128,device_name=NVIDIA_A100-SXM4-80GB,dtype=int8_w8a8,block_shape=[128, 128].json → triton_3_1_0/E=256,N=128,device_name=NVIDIA_A100-SXM4-80GB,dtype=int8_w8a8,block_shape=[128, 128].json} +0 -0
  243. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=256,N=128,device_name=NVIDIA_A100-SXM4-80GB,dtype=int8_w8a8.json → triton_3_1_0/E=256,N=128,device_name=NVIDIA_A100-SXM4-80GB,dtype=int8_w8a8.json} +0 -0
  244. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=256,N=128,device_name=NVIDIA_A800-SXM4-80GB,dtype=int8_w8a8,block_shape=[128, 128].json → triton_3_1_0/E=256,N=128,device_name=NVIDIA_A800-SXM4-80GB,dtype=int8_w8a8,block_shape=[128, 128].json} +0 -0
  245. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=256,N=128,device_name=NVIDIA_A800-SXM4-80GB,dtype=int8_w8a8.json → triton_3_1_0/E=256,N=128,device_name=NVIDIA_A800-SXM4-80GB,dtype=int8_w8a8.json} +0 -0
  246. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=256,N=128,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8,block_shape=[128, 128].json → triton_3_1_0/E=256,N=128,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8,block_shape=[128, 128].json} +0 -0
  247. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=256,N=128,device_name=NVIDIA_H20,block_shape=[128, 128].json → triton_3_1_0/E=256,N=128,device_name=NVIDIA_H20,block_shape=[128, 128].json} +0 -0
  248. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=256,N=128,device_name=NVIDIA_H20,dtype=fp8_w8a8,block_shape=[128, 128].json → triton_3_1_0/E=256,N=128,device_name=NVIDIA_H20,dtype=fp8_w8a8,block_shape=[128, 128].json} +0 -0
  249. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=256,N=128,device_name=NVIDIA_L20Y,dtype=fp8_w8a8,block_shape=[128, 128].json → triton_3_1_0/E=256,N=128,device_name=NVIDIA_L20Y,dtype=fp8_w8a8,block_shape=[128, 128].json} +0 -0
  250. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=256,N=256,device_name=AMD_Instinct_MI300X,dtype=fp8_w8a8,block_shape=[128, 128].json → triton_3_1_0/E=256,N=256,device_name=AMD_Instinct_MI300X,dtype=fp8_w8a8,block_shape=[128, 128].json} +0 -0
  251. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=256,N=256,device_name=AMD_Instinct_MI325X,dtype=fp8_w8a8,block_shape=[128, 128].json → triton_3_1_0/E=256,N=256,device_name=AMD_Instinct_MI325X,dtype=fp8_w8a8,block_shape=[128, 128].json} +0 -0
  252. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=256,N=256,device_name=AMD_Radeon_Graphics,dtype=fp8_w8a8,block_shape=[128, 128].json → triton_3_1_0/E=256,N=256,device_name=AMD_Radeon_Graphics,dtype=fp8_w8a8,block_shape=[128, 128].json} +0 -0
  253. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=256,N=256,device_name=NVIDIA_B200,dtype=fp8_w8a8,block_shape=[128, 128].json → triton_3_1_0/E=256,N=256,device_name=NVIDIA_B200,dtype=fp8_w8a8,block_shape=[128, 128].json} +0 -0
  254. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=256,N=256,device_name=NVIDIA_H20,dtype=int8_w8a8,block_shape=[128, 128].json → triton_3_1_0/E=256,N=256,device_name=NVIDIA_H20,dtype=int8_w8a8,block_shape=[128, 128].json} +0 -0
  255. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=256,N=256,device_name=NVIDIA_H200,dtype=fp8_w8a8,block_shape=[128, 128].json → triton_3_1_0/E=256,N=256,device_name=NVIDIA_H200,dtype=fp8_w8a8,block_shape=[128, 128].json} +0 -0
  256. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=256,N=64,device_name=NVIDIA_A800-SXM4-80GB.json → triton_3_1_0/E=256,N=64,device_name=NVIDIA_A800-SXM4-80GB.json} +0 -0
  257. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=256,N=64,device_name=NVIDIA_L20,dtype=int8_w8a8.json → triton_3_1_0/E=256,N=64,device_name=NVIDIA_L20,dtype=int8_w8a8.json} +0 -0
  258. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=256,N=64,device_name=NVIDIA_L40S,dtype=int8_w8a8.json → triton_3_1_0/E=256,N=64,device_name=NVIDIA_L40S,dtype=int8_w8a8.json} +0 -0
  259. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=64,N=1024,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8,block_shape=[128, 128].json → triton_3_1_0/E=64,N=1024,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8,block_shape=[128, 128].json} +0 -0
  260. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=64,N=1280,device_name=NVIDIA_A100-SXM4-80GB.json → triton_3_1_0/E=64,N=1280,device_name=NVIDIA_A100-SXM4-80GB.json} +0 -0
  261. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=64,N=1280,device_name=NVIDIA_A800-SXM4-80GB.json → triton_3_1_0/E=64,N=1280,device_name=NVIDIA_A800-SXM4-80GB.json} +0 -0
  262. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=64,N=1280,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8.json → triton_3_1_0/E=64,N=1280,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8.json} +0 -0
  263. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=64,N=1280,device_name=NVIDIA_H100_80GB_HBM3.json → triton_3_1_0/E=64,N=1280,device_name=NVIDIA_H100_80GB_HBM3.json} +0 -0
  264. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=64,N=1280,device_name=NVIDIA_H200,dtype=fp8_w8a8.json → triton_3_1_0/E=64,N=1280,device_name=NVIDIA_H200,dtype=fp8_w8a8.json} +0 -0
  265. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=64,N=1280,device_name=NVIDIA_H200.json → triton_3_1_0/E=64,N=1280,device_name=NVIDIA_H200.json} +0 -0
  266. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=64,N=2560,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8.json → triton_3_1_0/E=64,N=2560,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8.json} +0 -0
  267. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=64,N=2560,device_name=NVIDIA_H200,dtype=fp8_w8a8.json → triton_3_1_0/E=64,N=2560,device_name=NVIDIA_H200,dtype=fp8_w8a8.json} +0 -0
  268. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=64,N=2560,device_name=NVIDIA_H200.json → triton_3_1_0/E=64,N=2560,device_name=NVIDIA_H200.json} +0 -0
  269. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=64,N=320,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8.json → triton_3_1_0/E=64,N=320,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8.json} +0 -0
  270. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=64,N=320,device_name=NVIDIA_H100_80GB_HBM3.json → triton_3_1_0/E=64,N=320,device_name=NVIDIA_H100_80GB_HBM3.json} +0 -0
  271. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=64,N=320,device_name=NVIDIA_H200,dtype=fp8_w8a8.json → triton_3_1_0/E=64,N=320,device_name=NVIDIA_H200,dtype=fp8_w8a8.json} +0 -0
  272. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=64,N=320,device_name=NVIDIA_H200.json → triton_3_1_0/E=64,N=320,device_name=NVIDIA_H200.json} +0 -0
  273. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=64,N=512,device_name=NVIDIA_H200,dtype=fp8_w8a8,block_shape=[128, 128].json → triton_3_1_0/E=64,N=512,device_name=NVIDIA_H200,dtype=fp8_w8a8,block_shape=[128, 128].json} +0 -0
  274. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=64,N=640,device_name=NVIDIA_A100-SXM4-80GB.json → triton_3_1_0/E=64,N=640,device_name=NVIDIA_A100-SXM4-80GB.json} +0 -0
  275. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=64,N=640,device_name=NVIDIA_A800-SXM4-80GB.json → triton_3_1_0/E=64,N=640,device_name=NVIDIA_A800-SXM4-80GB.json} +0 -0
  276. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=64,N=640,device_name=NVIDIA_GeForce_RTX_4090,dtype=fp8_w8a8.json → triton_3_1_0/E=64,N=640,device_name=NVIDIA_GeForce_RTX_4090,dtype=fp8_w8a8.json} +0 -0
  277. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=64,N=640,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8.json → triton_3_1_0/E=64,N=640,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8.json} +0 -0
  278. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=64,N=640,device_name=NVIDIA_H100_80GB_HBM3.json → triton_3_1_0/E=64,N=640,device_name=NVIDIA_H100_80GB_HBM3.json} +0 -0
  279. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=64,N=640,device_name=NVIDIA_H200,dtype=fp8_w8a8.json → triton_3_1_0/E=64,N=640,device_name=NVIDIA_H200,dtype=fp8_w8a8.json} +0 -0
  280. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=64,N=640,device_name=NVIDIA_H200.json → triton_3_1_0/E=64,N=640,device_name=NVIDIA_H200.json} +0 -0
  281. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=8,N=14336,device_name=AMD_Instinct_MI300X.json → triton_3_1_0/E=8,N=14336,device_name=AMD_Instinct_MI300X.json} +0 -0
  282. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=8,N=14336,device_name=AMD_Instinct_MI325X.json → triton_3_1_0/E=8,N=14336,device_name=AMD_Instinct_MI325X.json} +0 -0
  283. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=8,N=14336,device_name=AMD_Radeon_Graphics.json → triton_3_1_0/E=8,N=14336,device_name=AMD_Radeon_Graphics.json} +0 -0
  284. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=8,N=14336,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8.json → triton_3_1_0/E=8,N=14336,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8.json} +0 -0
  285. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=8,N=14336,device_name=NVIDIA_H200,dtype=fp8_w8a8.json → triton_3_1_0/E=8,N=14336,device_name=NVIDIA_H200,dtype=fp8_w8a8.json} +0 -0
  286. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=8,N=14336,device_name=NVIDIA_H200.json → triton_3_1_0/E=8,N=14336,device_name=NVIDIA_H200.json} +0 -0
  287. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=8,N=1792,device_name=AMD_Instinct_MI300X.json → triton_3_1_0/E=8,N=1792,device_name=AMD_Instinct_MI300X.json} +0 -0
  288. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=8,N=1792,device_name=AMD_Instinct_MI325X.json → triton_3_1_0/E=8,N=1792,device_name=AMD_Instinct_MI325X.json} +0 -0
  289. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=8,N=1792,device_name=AMD_Radeon_Graphics.json → triton_3_1_0/E=8,N=1792,device_name=AMD_Radeon_Graphics.json} +0 -0
  290. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=8,N=1792,device_name=NVIDIA_A100-SXM4-40GB.json → triton_3_1_0/E=8,N=1792,device_name=NVIDIA_A100-SXM4-40GB.json} +0 -0
  291. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=8,N=1792,device_name=NVIDIA_A100-SXM4-80GB.json → triton_3_1_0/E=8,N=1792,device_name=NVIDIA_A100-SXM4-80GB.json} +0 -0
  292. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=8,N=1792,device_name=NVIDIA_H100_80GB_HBM3.json → triton_3_1_0/E=8,N=1792,device_name=NVIDIA_H100_80GB_HBM3.json} +0 -0
  293. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=8,N=1792,device_name=NVIDIA_H200,dtype=fp8_w8a8.json → triton_3_1_0/E=8,N=1792,device_name=NVIDIA_H200,dtype=fp8_w8a8.json} +0 -0
  294. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=8,N=1792,device_name=NVIDIA_H200.json → triton_3_1_0/E=8,N=1792,device_name=NVIDIA_H200.json} +0 -0
  295. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=8,N=2048,device_name=NVIDIA_A100-SXM4-80GB.json → triton_3_1_0/E=8,N=2048,device_name=NVIDIA_A100-SXM4-80GB.json} +0 -0
  296. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=8,N=2048,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8.json → triton_3_1_0/E=8,N=2048,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8.json} +0 -0
  297. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=8,N=2048,device_name=NVIDIA_H100_80GB_HBM3.json → triton_3_1_0/E=8,N=2048,device_name=NVIDIA_H100_80GB_HBM3.json} +0 -0
  298. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=8,N=2048,device_name=NVIDIA_H200,dtype=fp8_w8a8.json → triton_3_1_0/E=8,N=2048,device_name=NVIDIA_H200,dtype=fp8_w8a8.json} +0 -0
  299. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=8,N=2048,device_name=NVIDIA_H200.json → triton_3_1_0/E=8,N=2048,device_name=NVIDIA_H200.json} +0 -0
  300. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=8,N=3584,device_name=AMD_Instinct_MI300X.json → triton_3_1_0/E=8,N=3584,device_name=AMD_Instinct_MI300X.json} +0 -0
  301. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=8,N=3584,device_name=AMD_Instinct_MI325X.json → triton_3_1_0/E=8,N=3584,device_name=AMD_Instinct_MI325X.json} +0 -0
  302. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=8,N=3584,device_name=AMD_Radeon_Graphics.json → triton_3_1_0/E=8,N=3584,device_name=AMD_Radeon_Graphics.json} +0 -0
  303. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=8,N=3584,device_name=NVIDIA_A100-SXM4-40GB.json → triton_3_1_0/E=8,N=3584,device_name=NVIDIA_A100-SXM4-40GB.json} +0 -0
  304. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=8,N=3584,device_name=NVIDIA_A100-SXM4-80GB.json → triton_3_1_0/E=8,N=3584,device_name=NVIDIA_A100-SXM4-80GB.json} +0 -0
  305. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=8,N=3584,device_name=NVIDIA_GeForce_RTX_4090,dtype=fp8_w8a8.json → triton_3_1_0/E=8,N=3584,device_name=NVIDIA_GeForce_RTX_4090,dtype=fp8_w8a8.json} +0 -0
  306. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=8,N=3584,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8.json → triton_3_1_0/E=8,N=3584,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8.json} +0 -0
  307. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=8,N=3584,device_name=NVIDIA_H100_80GB_HBM3.json → triton_3_1_0/E=8,N=3584,device_name=NVIDIA_H100_80GB_HBM3.json} +0 -0
  308. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=8,N=3584,device_name=NVIDIA_H200,dtype=fp8_w8a8.json → triton_3_1_0/E=8,N=3584,device_name=NVIDIA_H200,dtype=fp8_w8a8.json} +0 -0
  309. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=8,N=3584,device_name=NVIDIA_H200.json → triton_3_1_0/E=8,N=3584,device_name=NVIDIA_H200.json} +0 -0
  310. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=8,N=3584,device_name=NVIDIA_L40S.json → triton_3_1_0/E=8,N=3584,device_name=NVIDIA_L40S.json} +0 -0
  311. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=8,N=4096,device_name=AMD_Instinct_MI300X,dtype=fp8_w8a8.json → triton_3_1_0/E=8,N=4096,device_name=AMD_Instinct_MI300X,dtype=fp8_w8a8.json} +0 -0
  312. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=8,N=4096,device_name=AMD_Instinct_MI325X,dtype=fp8_w8a8.json → triton_3_1_0/E=8,N=4096,device_name=AMD_Instinct_MI325X,dtype=fp8_w8a8.json} +0 -0
  313. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=8,N=4096,device_name=AMD_Radeon_Graphics,dtype=fp8_w8a8.json → triton_3_1_0/E=8,N=4096,device_name=AMD_Radeon_Graphics,dtype=fp8_w8a8.json} +0 -0
  314. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=8,N=4096,device_name=NVIDIA_A100-SXM4-80GB.json → triton_3_1_0/E=8,N=4096,device_name=NVIDIA_A100-SXM4-80GB.json} +0 -0
  315. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=8,N=4096,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8.json → triton_3_1_0/E=8,N=4096,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8.json} +0 -0
  316. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=8,N=4096,device_name=NVIDIA_H100_80GB_HBM3.json → triton_3_1_0/E=8,N=4096,device_name=NVIDIA_H100_80GB_HBM3.json} +0 -0
  317. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=8,N=4096,device_name=NVIDIA_H200,dtype=fp8_w8a8.json → triton_3_1_0/E=8,N=4096,device_name=NVIDIA_H200,dtype=fp8_w8a8.json} +0 -0
  318. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=8,N=4096,device_name=NVIDIA_H200.json → triton_3_1_0/E=8,N=4096,device_name=NVIDIA_H200.json} +0 -0
  319. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=8,N=7168,device_name=AMD_Instinct_MI300X.json → triton_3_1_0/E=8,N=7168,device_name=AMD_Instinct_MI300X.json} +0 -0
  320. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=8,N=7168,device_name=AMD_Instinct_MI325X.json → triton_3_1_0/E=8,N=7168,device_name=AMD_Instinct_MI325X.json} +0 -0
  321. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=8,N=7168,device_name=AMD_Radeon_Graphics.json → triton_3_1_0/E=8,N=7168,device_name=AMD_Radeon_Graphics.json} +0 -0
  322. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=8,N=7168,device_name=NVIDIA_A100-SXM4-80GB.json → triton_3_1_0/E=8,N=7168,device_name=NVIDIA_A100-SXM4-80GB.json} +0 -0
  323. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=8,N=7168,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8.json → triton_3_1_0/E=8,N=7168,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8.json} +0 -0
  324. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=8,N=7168,device_name=NVIDIA_H100_80GB_HBM3.json → triton_3_1_0/E=8,N=7168,device_name=NVIDIA_H100_80GB_HBM3.json} +0 -0
  325. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=8,N=7168,device_name=NVIDIA_H200,dtype=fp8_w8a8.json → triton_3_1_0/E=8,N=7168,device_name=NVIDIA_H200,dtype=fp8_w8a8.json} +0 -0
  326. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=8,N=7168,device_name=NVIDIA_H200.json → triton_3_1_0/E=8,N=7168,device_name=NVIDIA_H200.json} +0 -0
  327. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=8,N=8192,device_name=AMD_Instinct_MI300X,dtype=fp8_w8a8.json → triton_3_1_0/E=8,N=8192,device_name=AMD_Instinct_MI300X,dtype=fp8_w8a8.json} +0 -0
  328. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=8,N=8192,device_name=AMD_Instinct_MI325X,dtype=fp8_w8a8.json → triton_3_1_0/E=8,N=8192,device_name=AMD_Instinct_MI325X,dtype=fp8_w8a8.json} +0 -0
  329. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=8,N=8192,device_name=AMD_Radeon_Graphics,dtype=fp8_w8a8.json → triton_3_1_0/E=8,N=8192,device_name=AMD_Radeon_Graphics,dtype=fp8_w8a8.json} +0 -0
  330. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=8,N=8192,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8.json → triton_3_1_0/E=8,N=8192,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8.json} +0 -0
  331. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=8,N=8192,device_name=NVIDIA_H200,dtype=fp8_w8a8.json → triton_3_1_0/E=8,N=8192,device_name=NVIDIA_H200,dtype=fp8_w8a8.json} +0 -0
  332. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=128,N=192,device_name=NVIDIA_A800-SXM4-80GB.json → triton_3_2_0/E=128,N=192,device_name=NVIDIA_A800-SXM4-80GB.json} +0 -0
  333. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=128,N=192,device_name=NVIDIA_H100_80GB_HBM3.json → triton_3_2_0/E=128,N=192,device_name=NVIDIA_H100_80GB_HBM3.json} +0 -0
  334. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=128,N=192,device_name=NVIDIA_H20.json → triton_3_2_0/E=128,N=192,device_name=NVIDIA_H20.json} +0 -0
  335. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=128,N=192,device_name=NVIDIA_H200.json → triton_3_2_0/E=128,N=192,device_name=NVIDIA_H200.json} +0 -0
  336. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=128,N=384,device_name=NVIDIA_H100_80GB_HBM3.json → triton_3_2_0/E=128,N=384,device_name=NVIDIA_H100_80GB_HBM3.json} +0 -0
  337. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=128,N=384,device_name=NVIDIA_H20,dtype=fp8_w8a8,block_shape=[128, 128].json → triton_3_2_0/E=128,N=384,device_name=NVIDIA_H20,dtype=fp8_w8a8,block_shape=[128, 128].json} +0 -0
  338. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=128,N=384,device_name=NVIDIA_H20.json → triton_3_2_0/E=128,N=384,device_name=NVIDIA_H20.json} +0 -0
  339. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=128,N=384,device_name=NVIDIA_H200,dtype=fp8_w8a8,block_shape=[128, 128].json → triton_3_2_0/E=128,N=384,device_name=NVIDIA_H200,dtype=fp8_w8a8,block_shape=[128, 128].json} +0 -0
  340. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=128,N=384,device_name=NVIDIA_H200.json → triton_3_2_0/E=128,N=384,device_name=NVIDIA_H200.json} +0 -0
  341. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=128,N=512,device_name=NVIDIA_H100_80GB_HBM3.json → triton_3_2_0/E=128,N=512,device_name=NVIDIA_H100_80GB_HBM3.json} +0 -0
  342. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=128,N=768,device_name=NVIDIA_A800-SXM4-80GB.json → triton_3_2_0/E=128,N=768,device_name=NVIDIA_A800-SXM4-80GB.json} +0 -0
  343. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=128,N=768,device_name=NVIDIA_H100_80GB_HBM3.json → triton_3_2_0/E=128,N=768,device_name=NVIDIA_H100_80GB_HBM3.json} +0 -0
  344. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=128,N=768,device_name=NVIDIA_H20,dtype=fp8_w8a8,block_shape=[128, 128].json → triton_3_2_0/E=128,N=768,device_name=NVIDIA_H20,dtype=fp8_w8a8,block_shape=[128, 128].json} +0 -0
  345. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=128,N=768,device_name=NVIDIA_H20.json → triton_3_2_0/E=128,N=768,device_name=NVIDIA_H20.json} +0 -0
  346. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=128,N=768,device_name=NVIDIA_H200,dtype=fp8_w8a8,block_shape=[128, 128].json → triton_3_2_0/E=128,N=768,device_name=NVIDIA_H200,dtype=fp8_w8a8,block_shape=[128, 128].json} +0 -0
  347. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=128,N=768,device_name=NVIDIA_H200.json → triton_3_2_0/E=128,N=768,device_name=NVIDIA_H200.json} +0 -0
  348. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=128,N=96,device_name=NVIDIA_H20.json → triton_3_2_0/E=128,N=96,device_name=NVIDIA_H20.json} +0 -0
  349. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=264,N=128,device_name=NVIDIA_A800-SXM4-80GB,dtype=int8_w8a8.json → triton_3_2_0/E=264,N=128,device_name=NVIDIA_A800-SXM4-80GB,dtype=int8_w8a8.json} +0 -0
  350. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=264,N=256,device_name=NVIDIA_B200,dtype=fp8_w8a8,block_shape=[128, 128].json → triton_3_2_0/E=264,N=256,device_name=NVIDIA_B200,dtype=fp8_w8a8,block_shape=[128, 128].json} +0 -0
  351. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=264,N=256,device_name=NVIDIA_H20,dtype=fp8_w8a8,block_shape=[128, 128].json → triton_3_2_0/E=264,N=256,device_name=NVIDIA_H20,dtype=fp8_w8a8,block_shape=[128, 128].json} +0 -0
  352. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=264,N=256,device_name=NVIDIA_H200,dtype=fp8_w8a8,block_shape=[128, 128].json → triton_3_2_0/E=264,N=256,device_name=NVIDIA_H200,dtype=fp8_w8a8,block_shape=[128, 128].json} +0 -0
  353. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=272,N=128,device_name=NVIDIA_A800-SXM4-80GB,dtype=int8_w8a8.json → triton_3_2_0/E=272,N=128,device_name=NVIDIA_A800-SXM4-80GB,dtype=int8_w8a8.json} +0 -0
  354. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=272,N=128,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8,block_shape=[128, 128].json → triton_3_2_0/E=272,N=128,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8,block_shape=[128, 128].json} +0 -0
  355. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=272,N=128,device_name=NVIDIA_H20,dtype=fp8_w8a8,block_shape=[128, 128].json → triton_3_2_0/E=272,N=128,device_name=NVIDIA_H20,dtype=fp8_w8a8,block_shape=[128, 128].json} +0 -0
  356. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=272,N=64,device_name=NVIDIA_A800-SXM4-80GB.json → triton_3_2_0/E=272,N=64,device_name=NVIDIA_A800-SXM4-80GB.json} +0 -0
  357. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=288,N=64,device_name=NVIDIA_A800-SXM4-80GB.json → triton_3_2_0/E=288,N=64,device_name=NVIDIA_A800-SXM4-80GB.json} +0 -0
  358. {sglang-0.4.6.post5.dist-info → sglang-0.4.7.post1.dist-info}/licenses/LICENSE +0 -0
  359. {sglang-0.4.6.post5.dist-info → sglang-0.4.7.post1.dist-info}/top_level.txt +0 -0
@@ -41,7 +41,11 @@ from sglang.srt.conversation import (
41
41
  register_conv_template,
42
42
  )
43
43
  from sglang.srt.function_call.function_call_parser import FunctionCallParser
44
- from sglang.srt.managers.io_struct import EmbeddingReqInput, GenerateReqInput
44
+ from sglang.srt.managers.io_struct import (
45
+ EmbeddingReqInput,
46
+ GenerateReqInput,
47
+ V1RerankReqInput,
48
+ )
45
49
  from sglang.srt.openai_api.protocol import (
46
50
  BatchRequest,
47
51
  BatchResponse,
@@ -69,10 +73,17 @@ from sglang.srt.openai_api.protocol import (
69
73
  FunctionResponse,
70
74
  LogProbs,
71
75
  MultimodalEmbeddingInput,
76
+ RerankResponse,
77
+ ScoringRequest,
78
+ ScoringResponse,
72
79
  ToolCall,
73
80
  TopLogprob,
74
81
  UsageInfo,
75
82
  )
83
+ from sglang.srt.openai_api.utils import (
84
+ detect_template_content_format,
85
+ process_content_for_template_format,
86
+ )
76
87
  from sglang.srt.reasoning_parser import ReasoningParser
77
88
  from sglang.utils import convert_json_schema_to_str, get_exception_traceback
78
89
 
@@ -80,6 +91,11 @@ logger = logging.getLogger(__name__)
80
91
 
81
92
  chat_template_name = None
82
93
 
94
+ # Global cache for template content format detection (one model/template per instance)
95
+ # NOTE: A better approach would be to initialize the chat template format when the endpoint is created
96
+ _cached_chat_template = None
97
+ _cached_template_format = None
98
+
83
99
 
84
100
  class FileMetadata:
85
101
  def __init__(self, filename: str, purpose: str):
@@ -531,6 +547,7 @@ def v1_generate_request(
531
547
  logprob_start_lens = []
532
548
  top_logprobs_nums = []
533
549
  lora_paths = []
550
+ return_hidden_states = []
534
551
 
535
552
  for request in all_requests:
536
553
  # NOTE: with openai API, the prompt's logprobs are always not computed
@@ -570,6 +587,7 @@ def v1_generate_request(
570
587
  "no_stop_trim": request.no_stop_trim,
571
588
  "ignore_eos": request.ignore_eos,
572
589
  "skip_special_tokens": request.skip_special_tokens,
590
+ "logit_bias": request.logit_bias,
573
591
  }
574
592
  )
575
593
  return_logprobs.append(request.logprobs is not None)
@@ -577,6 +595,7 @@ def v1_generate_request(
577
595
  top_logprobs_nums.append(
578
596
  request.logprobs if request.logprobs is not None else 0
579
597
  )
598
+ return_hidden_states.append(request.return_hidden_states)
580
599
 
581
600
  if len(all_requests) == 1:
582
601
  if isinstance(prompts[0], str) or isinstance(prompts[0][0], str):
@@ -588,6 +607,7 @@ def v1_generate_request(
588
607
  logprob_start_lens = logprob_start_lens[0]
589
608
  top_logprobs_nums = top_logprobs_nums[0]
590
609
  lora_paths = lora_paths[0]
610
+ return_hidden_states = return_hidden_states[0]
591
611
  else:
592
612
  if isinstance(prompts[0], str) or isinstance(prompts[0][0], str):
593
613
  prompt_kwargs = {"text": prompts}
@@ -604,6 +624,10 @@ def v1_generate_request(
604
624
  stream=all_requests[0].stream,
605
625
  rid=request_ids,
606
626
  lora_path=lora_paths,
627
+ return_hidden_states=return_hidden_states,
628
+ bootstrap_host=all_requests[0].bootstrap_host,
629
+ bootstrap_port=all_requests[0].bootstrap_port,
630
+ bootstrap_room=all_requests[0].bootstrap_room,
607
631
  )
608
632
 
609
633
  return adapted_request, all_requests if len(all_requests) > 1 else all_requests[0]
@@ -669,6 +693,16 @@ def v1_generate_response(
669
693
  else:
670
694
  logprobs = None
671
695
 
696
+ hidden_states = None
697
+ if isinstance(request, list) and request[idx].return_hidden_states:
698
+ hidden_states = ret_item["meta_info"].get("hidden_states", None)
699
+ elif (not isinstance(request, list)) and request.return_hidden_states:
700
+ hidden_states = ret_item["meta_info"].get("hidden_states", None)
701
+ if hidden_states is not None:
702
+ hidden_states = (
703
+ hidden_states[-1] if hidden_states and len(hidden_states) > 1 else []
704
+ )
705
+
672
706
  finish_reason = ret_item["meta_info"]["finish_reason"]
673
707
 
674
708
  if to_file:
@@ -684,6 +718,8 @@ def v1_generate_response(
684
718
  else None
685
719
  ),
686
720
  }
721
+ if hidden_states is not None:
722
+ choice_data["hidden_states"] = hidden_states
687
723
  else:
688
724
  choice_data = CompletionResponseChoice(
689
725
  index=idx,
@@ -695,6 +731,7 @@ def v1_generate_response(
695
731
  if finish_reason and "matched" in finish_reason
696
732
  else None
697
733
  ),
734
+ hidden_states=hidden_states,
698
735
  )
699
736
 
700
737
  choices.append(choice_data)
@@ -763,6 +800,7 @@ async def v1_completions(tokenizer_manager, raw_request: Request):
763
800
  prompt_tokens = {}
764
801
  completion_tokens = {}
765
802
  cached_tokens = {}
803
+ hidden_states = {}
766
804
 
767
805
  try:
768
806
  async for content in tokenizer_manager.generate_request(
@@ -777,6 +815,9 @@ async def v1_completions(tokenizer_manager, raw_request: Request):
777
815
  prompt_tokens[index] = content["meta_info"]["prompt_tokens"]
778
816
  completion_tokens[index] = content["meta_info"]["completion_tokens"]
779
817
  cached_tokens[index] = content["meta_info"].get("cached_tokens", 0)
818
+ hidden_states[index] = content["meta_info"].get(
819
+ "hidden_states", None
820
+ ) or hidden_states.get(index)
780
821
 
781
822
  if not stream_buffer: # The first chunk
782
823
  if request.echo:
@@ -859,6 +900,27 @@ async def v1_completions(tokenizer_manager, raw_request: Request):
859
900
  n_prev_tokens[index] = n_prev_token
860
901
 
861
902
  yield f"data: {chunk.model_dump_json()}\n\n"
903
+ if request.return_hidden_states and hidden_states:
904
+ for index, choice_hidden_states in hidden_states.items():
905
+ last_token_hidden_states = (
906
+ choice_hidden_states[-1]
907
+ if choice_hidden_states and len(choice_hidden_states) > 1
908
+ else []
909
+ )
910
+ hidden_states_chunk = CompletionStreamResponse(
911
+ id=content["meta_info"]["id"],
912
+ created=created,
913
+ choices=[
914
+ CompletionResponseStreamChoice(
915
+ text="",
916
+ index=index,
917
+ hidden_states=last_token_hidden_states,
918
+ finish_reason=None,
919
+ )
920
+ ],
921
+ model=request.model,
922
+ )
923
+ yield f"data: {hidden_states_chunk.model_dump_json()}\n\n"
862
924
  if request.stream_options and request.stream_options.include_usage:
863
925
  total_prompt_tokens = sum(
864
926
  tokens
@@ -959,6 +1021,7 @@ def v1_chat_generate_request(
959
1021
  top_logprobs_nums = []
960
1022
  modalities_list = []
961
1023
  lora_paths = []
1024
+ return_hidden_states = []
962
1025
 
963
1026
  # NOTE: with openai API, the prompt's logprobs are always not computed
964
1027
 
@@ -995,23 +1058,42 @@ def v1_chat_generate_request(
995
1058
 
996
1059
  if chat_template_name is None:
997
1060
  openai_compatible_messages = []
1061
+ image_data = []
1062
+ audio_data = []
1063
+ modalities = []
1064
+
1065
+ # Detect template content format by analyzing the jinja template (cached globally)
1066
+ global _cached_chat_template, _cached_template_format
1067
+ current_template = tokenizer_manager.tokenizer.chat_template
1068
+
1069
+ if current_template != _cached_chat_template:
1070
+ # Template changed or first time - analyze it
1071
+ _cached_chat_template = current_template
1072
+ _cached_template_format = detect_template_content_format(
1073
+ current_template
1074
+ )
1075
+ logger.info(
1076
+ f"Detected chat template content format: {_cached_template_format}"
1077
+ )
1078
+
1079
+ template_content_format = _cached_template_format
998
1080
 
999
1081
  for message in request.messages:
1000
1082
  if message.content is None:
1001
1083
  message.content = ""
1002
- msg_dict = message.dict()
1003
- if isinstance(msg_dict.get("content"), list):
1004
- for chunk in msg_dict["content"]:
1005
- if isinstance(chunk, dict) and chunk.get("type") == "text":
1006
- new_msg = msg_dict.copy()
1007
- new_msg["content"] = chunk["text"]
1008
- new_msg = {
1009
- k: v for k, v in new_msg.items() if v is not None
1010
- }
1011
- openai_compatible_messages.append(new_msg)
1012
- else:
1013
- msg_dict = {k: v for k, v in msg_dict.items() if v is not None}
1014
- openai_compatible_messages.append(msg_dict)
1084
+ msg_dict = message.model_dump()
1085
+
1086
+ # Process content based on detected template format
1087
+ processed_msg = process_content_for_template_format(
1088
+ msg_dict,
1089
+ template_content_format,
1090
+ image_data,
1091
+ audio_data,
1092
+ modalities,
1093
+ )
1094
+ openai_compatible_messages.append(processed_msg)
1095
+
1096
+ # Handle assistant prefix for continue_final_message
1015
1097
  if (
1016
1098
  openai_compatible_messages
1017
1099
  and openai_compatible_messages[-1]["role"] == "assistant"
@@ -1065,9 +1147,9 @@ def v1_chat_generate_request(
1065
1147
  if is_multimodal:
1066
1148
  prompt = tokenizer_manager.tokenizer.decode(prompt_ids)
1067
1149
  stop = request.stop
1068
- image_data = None
1069
- audio_data = None
1070
- modalities = []
1150
+ image_data = image_data if image_data else None
1151
+ audio_data = audio_data if audio_data else None
1152
+ modalities = modalities if modalities else []
1071
1153
  else:
1072
1154
  conv = generate_chat_conv(request, chat_template_name)
1073
1155
  # If we should continue the final assistant message, adjust the conversation.
@@ -1143,6 +1225,7 @@ def v1_chat_generate_request(
1143
1225
  "no_stop_trim": request.no_stop_trim,
1144
1226
  "ignore_eos": request.ignore_eos,
1145
1227
  "skip_special_tokens": request.skip_special_tokens,
1228
+ "logit_bias": request.logit_bias,
1146
1229
  }
1147
1230
 
1148
1231
  if request.response_format and request.response_format.type == "json_schema":
@@ -1182,6 +1265,7 @@ def v1_chat_generate_request(
1182
1265
  image_data_list.append(image_data)
1183
1266
  audio_data_list.append(audio_data)
1184
1267
  modalities_list.append(modalities)
1268
+ return_hidden_states.append(request.return_hidden_states)
1185
1269
  if len(all_requests) == 1:
1186
1270
  if is_multimodal:
1187
1271
  # processor will need text input
@@ -1200,6 +1284,7 @@ def v1_chat_generate_request(
1200
1284
  modalities_list = modalities_list[0]
1201
1285
  lora_paths = lora_paths[0]
1202
1286
  request_ids = request_ids[0]
1287
+ return_hidden_states = return_hidden_states[0]
1203
1288
  else:
1204
1289
  if tokenizer_manager.model_config.is_multimodal:
1205
1290
  # processor will need text input
@@ -1226,6 +1311,7 @@ def v1_chat_generate_request(
1226
1311
  bootstrap_host=all_requests[0].bootstrap_host,
1227
1312
  bootstrap_port=all_requests[0].bootstrap_port,
1228
1313
  bootstrap_room=all_requests[0].bootstrap_room,
1314
+ return_hidden_states=return_hidden_states,
1229
1315
  )
1230
1316
 
1231
1317
  return adapted_request, all_requests if len(all_requests) > 1 else all_requests[0]
@@ -1286,6 +1372,20 @@ def v1_chat_generate_response(
1286
1372
  else:
1287
1373
  choice_logprobs = None
1288
1374
 
1375
+ if isinstance(request, list) and request[idx].return_hidden_states:
1376
+ include_hidden_states = True
1377
+ elif not isinstance(request, list) and request.return_hidden_states:
1378
+ include_hidden_states = True
1379
+ else:
1380
+ include_hidden_states = False
1381
+ if include_hidden_states and ret_item["meta_info"].get("hidden_states", None):
1382
+ hidden_states = ret_item["meta_info"]["hidden_states"]
1383
+ hidden_states = (
1384
+ hidden_states[-1] if hidden_states and len(hidden_states) > 1 else []
1385
+ )
1386
+ else:
1387
+ hidden_states = None
1388
+
1289
1389
  finish_reason = ret_item["meta_info"]["finish_reason"]
1290
1390
 
1291
1391
  tool_calls = None
@@ -1327,7 +1427,6 @@ def v1_chat_generate_response(
1327
1427
  tool_calls = [
1328
1428
  ToolCall(
1329
1429
  id=f"call_{base64.urlsafe_b64encode(uuid.uuid4().bytes).rstrip(b'=').decode()}",
1330
- index=call_info.tool_index,
1331
1430
  function=FunctionResponse(
1332
1431
  name=call_info.name, arguments=call_info.parameters
1333
1432
  ),
@@ -1359,6 +1458,8 @@ def v1_chat_generate_response(
1359
1458
  else None
1360
1459
  ),
1361
1460
  }
1461
+ if hidden_states is not None:
1462
+ choice_data["hidden_states"] = hidden_states
1362
1463
  else:
1363
1464
  choice_data = ChatCompletionResponseChoice(
1364
1465
  index=idx,
@@ -1375,6 +1476,7 @@ def v1_chat_generate_response(
1375
1476
  if finish_reason and "matched" in finish_reason
1376
1477
  else None
1377
1478
  ),
1479
+ hidden_states=hidden_states,
1378
1480
  )
1379
1481
 
1380
1482
  choices.append(choice_data)
@@ -1391,7 +1493,9 @@ def v1_chat_generate_response(
1391
1493
  "id": ret[i]["meta_info"]["id"],
1392
1494
  "object": "chat.completion",
1393
1495
  "created": created,
1394
- "model": request[i].model,
1496
+ "model": (
1497
+ request[i].model if isinstance(request, list) else request.model
1498
+ ),
1395
1499
  "choices": choice,
1396
1500
  "usage": {
1397
1501
  "prompt_tokens": ret[i]["meta_info"]["prompt_tokens"],
@@ -1445,19 +1549,23 @@ async def v1_chat_completions(
1445
1549
  reasoning_parser_dict = {}
1446
1550
 
1447
1551
  async def generate_stream_resp():
1448
- tool_call_first = True
1552
+ tool_index_previous = -1
1449
1553
  is_firsts = {}
1450
1554
  stream_buffers = {}
1451
1555
  n_prev_tokens = {}
1452
1556
  prompt_tokens = {}
1453
1557
  completion_tokens = {}
1454
1558
  cached_tokens = {}
1559
+ hidden_states = {}
1455
1560
  try:
1456
1561
  async for content in tokenizer_manager.generate_request(
1457
1562
  adapted_request, raw_request
1458
1563
  ):
1459
1564
  index = content.get("index", 0)
1460
1565
  text = content["text"]
1566
+ hidden_states[index] = content["meta_info"].get(
1567
+ "hidden_states", None
1568
+ ) or hidden_states.get(index)
1461
1569
 
1462
1570
  is_first = is_firsts.get(index, True)
1463
1571
  stream_buffer = stream_buffers.get(index, "")
@@ -1579,6 +1687,7 @@ async def v1_chat_completions(
1579
1687
  if (delta and len(delta) == 0) or not delta:
1580
1688
  stream_buffers[index] = new_stream_buffer
1581
1689
  is_firsts[index] = is_first
1690
+ n_prev_tokens[index] = n_prev_token
1582
1691
  continue
1583
1692
 
1584
1693
  if request.tool_choice != "none" and request.tools:
@@ -1611,6 +1720,7 @@ async def v1_chat_completions(
1611
1720
 
1612
1721
  # 2) if we found calls, we output them as separate chunk(s)
1613
1722
  for call_item in calls:
1723
+ tool_index_current = call_item.tool_index
1614
1724
  # transform call_item -> FunctionResponse + ToolCall
1615
1725
  if finish_reason_type == "stop":
1616
1726
  latest_delta_len = 0
@@ -1618,14 +1728,14 @@ async def v1_chat_completions(
1618
1728
  latest_delta_len = len(call_item.parameters)
1619
1729
 
1620
1730
  expected_call = json.dumps(
1621
- parser.multi_format_parser.detectors[0]
1622
- .prev_tool_call_arr[index]
1623
- .get("arguments", {}),
1731
+ parser.detector.prev_tool_call_arr[index].get(
1732
+ "arguments", {}
1733
+ ),
1624
1734
  ensure_ascii=False,
1625
1735
  )
1626
- actual_call = parser.multi_format_parser.detectors[
1627
- 0
1628
- ].streamed_args_for_tool[index]
1736
+ actual_call = parser.detector.streamed_args_for_tool[
1737
+ index
1738
+ ]
1629
1739
  if latest_delta_len > 0:
1630
1740
  actual_call = actual_call[:-latest_delta_len]
1631
1741
  remaining_call = expected_call.replace(
@@ -1637,7 +1747,7 @@ async def v1_chat_completions(
1637
1747
  tool_call = ToolCall(
1638
1748
  id=(
1639
1749
  f"call_{base64.urlsafe_b64encode(uuid.uuid4().bytes).rstrip(b'=').decode()}"
1640
- if tool_call_first
1750
+ if tool_index_previous != tool_index_current
1641
1751
  else None
1642
1752
  ),
1643
1753
  index=call_item.tool_index,
@@ -1646,7 +1756,7 @@ async def v1_chat_completions(
1646
1756
  arguments=call_item.parameters,
1647
1757
  ),
1648
1758
  )
1649
- tool_call_first = False
1759
+ tool_index_previous = tool_index_current
1650
1760
  choice_data = ChatCompletionResponseStreamChoice(
1651
1761
  index=index,
1652
1762
  delta=DeltaMessage(tool_calls=[tool_call]),
@@ -1667,6 +1777,7 @@ async def v1_chat_completions(
1667
1777
 
1668
1778
  stream_buffers[index] = new_stream_buffer
1669
1779
  is_firsts[index] = is_first
1780
+ n_prev_tokens[index] = n_prev_token
1670
1781
 
1671
1782
  else:
1672
1783
  # No tool calls => just treat this as normal text
@@ -1699,6 +1810,7 @@ async def v1_chat_completions(
1699
1810
  yield f"data: {chunk.model_dump_json()}\n\n"
1700
1811
  stream_buffers[index] = new_stream_buffer
1701
1812
  is_firsts[index] = is_first
1813
+ n_prev_tokens[index] = n_prev_token
1702
1814
  if finish_reason_type == "stop" and request.tool_choice != "none":
1703
1815
  parser = FunctionCallParser(
1704
1816
  tools=request.tools,
@@ -1734,6 +1846,28 @@ async def v1_chat_completions(
1734
1846
 
1735
1847
  else:
1736
1848
  usage = None
1849
+ if request.return_hidden_states and hidden_states:
1850
+ for index, choice_hidden_states in hidden_states.items():
1851
+ last_token_hidden_states = (
1852
+ choice_hidden_states[-1]
1853
+ if choice_hidden_states and len(choice_hidden_states) > 1
1854
+ else []
1855
+ )
1856
+ hidden_states_chunk = ChatCompletionStreamResponse(
1857
+ id=content["meta_info"]["id"],
1858
+ created=created,
1859
+ choices=[
1860
+ ChatCompletionResponseStreamChoice(
1861
+ index=index,
1862
+ delta=DeltaMessage(
1863
+ hidden_states=last_token_hidden_states
1864
+ ),
1865
+ finish_reason=finish_reason_type,
1866
+ )
1867
+ ],
1868
+ model=request.model,
1869
+ )
1870
+ yield f"data: {hidden_states_chunk.model_dump_json()}\n\n"
1737
1871
  final_usage_chunk = ChatCompletionStreamResponse(
1738
1872
  id=content["meta_info"]["id"],
1739
1873
  created=created,
@@ -1891,6 +2025,64 @@ async def v1_embeddings(tokenizer_manager, raw_request: Request):
1891
2025
  return response
1892
2026
 
1893
2027
 
2028
+ def v1_rerank_request(obj: V1RerankReqInput):
2029
+ if obj.query is None:
2030
+ raise ValueError("query is required")
2031
+ if obj.documents is None or len(obj.documents) == 0:
2032
+ raise ValueError("documents is required")
2033
+
2034
+ pairs = []
2035
+ for doc in obj.documents:
2036
+ pairs.append([obj.query, doc])
2037
+
2038
+ adapted_request = EmbeddingReqInput(
2039
+ text=pairs,
2040
+ is_cross_encoder_request=True,
2041
+ )
2042
+
2043
+ return adapted_request
2044
+
2045
+
2046
+ def v1_rerank_response(ret, obj: V1RerankReqInput):
2047
+
2048
+ response = []
2049
+ for idx, ret_item in enumerate(ret):
2050
+ response.append(
2051
+ RerankResponse(
2052
+ score=ret[idx]["embedding"],
2053
+ document=obj.documents[idx],
2054
+ index=idx,
2055
+ meta_info=ret[idx]["meta_info"],
2056
+ )
2057
+ )
2058
+
2059
+ response.sort(key=lambda x: x.score, reverse=True)
2060
+
2061
+ return response
2062
+
2063
+
2064
+ async def v1_rerank(tokenizer_manager, obj: V1RerankReqInput, raw_request: Request):
2065
+ adapted_request = v1_rerank_request(obj)
2066
+
2067
+ try:
2068
+ ret = await tokenizer_manager.generate_request(
2069
+ adapted_request, raw_request
2070
+ ).__anext__()
2071
+
2072
+ except ValueError as e:
2073
+ return create_error_response(str(e))
2074
+
2075
+ if not isinstance(ret, list):
2076
+ ret = [ret]
2077
+
2078
+ response = v1_rerank_response(
2079
+ ret,
2080
+ obj,
2081
+ )
2082
+
2083
+ return response
2084
+
2085
+
1894
2086
  def to_openai_style_logprobs(
1895
2087
  input_token_logprobs=None,
1896
2088
  output_token_logprobs=None,
@@ -1926,3 +2118,31 @@ def to_openai_style_logprobs(
1926
2118
  append_top_logprobs(output_top_logprobs)
1927
2119
 
1928
2120
  return ret_logprobs
2121
+
2122
+
2123
+ async def v1_score(tokenizer_manager, raw_request):
2124
+ try:
2125
+ # Parse request
2126
+ request_data = await raw_request.json()
2127
+ request = ScoringRequest(**request_data)
2128
+
2129
+ # Use tokenizer_manager's score_request method directly
2130
+ scores = await tokenizer_manager.score_request(
2131
+ query=request.query,
2132
+ items=request.items,
2133
+ label_token_ids=request.label_token_ids,
2134
+ apply_softmax=request.apply_softmax,
2135
+ item_first=request.item_first,
2136
+ request=request,
2137
+ )
2138
+
2139
+ # Create response with just the scores, without usage info
2140
+ response = ScoringResponse(
2141
+ scores=scores,
2142
+ model=request.model,
2143
+ )
2144
+ return response
2145
+
2146
+ except Exception as e:
2147
+ logger.error(f"Error in v1_score: {str(e)}")
2148
+ return create_error_response(str(e))
@@ -16,7 +16,7 @@
16
16
  import time
17
17
  from typing import Dict, List, Optional, Union
18
18
 
19
- from pydantic import BaseModel, Field, root_validator
19
+ from pydantic import BaseModel, Field, model_serializer, root_validator
20
20
  from typing_extensions import Literal
21
21
 
22
22
 
@@ -182,14 +182,25 @@ class CompletionRequest(BaseModel):
182
182
  skip_special_tokens: bool = True
183
183
  lora_path: Optional[Union[List[Optional[str]], Optional[str]]] = None
184
184
  session_params: Optional[Dict] = None
185
+ return_hidden_states: Optional[bool] = False
186
+
187
+ # For PD disaggregation
188
+ bootstrap_host: Optional[str] = None
189
+ bootstrap_port: Optional[int] = None
190
+ bootstrap_room: Optional[int] = None
185
191
 
186
192
 
187
193
  class CompletionResponseChoice(BaseModel):
188
194
  index: int
189
195
  text: str
190
196
  logprobs: Optional[LogProbs] = None
191
- finish_reason: Literal["stop", "length", "content_filter"]
197
+ finish_reason: Literal["stop", "length", "content_filter", "abort"]
192
198
  matched_stop: Union[None, int, str] = None
199
+ hidden_states: Optional[object] = None
200
+
201
+ @model_serializer
202
+ def _serialize(self):
203
+ return exclude_if_none(self, ["hidden_states"])
193
204
 
194
205
 
195
206
  class CompletionResponse(BaseModel):
@@ -207,6 +218,11 @@ class CompletionResponseStreamChoice(BaseModel):
207
218
  logprobs: Optional[LogProbs] = None
208
219
  finish_reason: Optional[Literal["stop", "length", "content_filter"]] = None
209
220
  matched_stop: Union[None, int, str] = None
221
+ hidden_states: Optional[object] = None
222
+
223
+ @model_serializer
224
+ def _serialize(self):
225
+ return exclude_if_none(self, ["hidden_states"])
210
226
 
211
227
 
212
228
  class CompletionStreamResponse(BaseModel):
@@ -400,6 +416,9 @@ class ChatCompletionRequest(BaseModel):
400
416
  bootstrap_port: Optional[int] = None
401
417
  bootstrap_room: Optional[int] = None
402
418
 
419
+ # Hidden States
420
+ return_hidden_states: Optional[bool] = False
421
+
403
422
 
404
423
  class ChatMessage(BaseModel):
405
424
  role: Optional[str] = None
@@ -413,9 +432,14 @@ class ChatCompletionResponseChoice(BaseModel):
413
432
  message: ChatMessage
414
433
  logprobs: Optional[Union[LogProbs, ChoiceLogprobs]] = None
415
434
  finish_reason: Literal[
416
- "stop", "length", "tool_calls", "content_filter", "function_call"
435
+ "stop", "length", "tool_calls", "content_filter", "function_call", "abort"
417
436
  ]
418
437
  matched_stop: Union[None, int, str] = None
438
+ hidden_states: Optional[object] = None
439
+
440
+ @model_serializer
441
+ def _serialize(self):
442
+ return exclude_if_none(self, ["hidden_states"])
419
443
 
420
444
 
421
445
  class ChatCompletionResponse(BaseModel):
@@ -432,6 +456,11 @@ class DeltaMessage(BaseModel):
432
456
  content: Optional[str] = None
433
457
  reasoning_content: Optional[str] = None
434
458
  tool_calls: Optional[List[ToolCall]] = Field(default=None, examples=[None])
459
+ hidden_states: Optional[object] = None
460
+
461
+ @model_serializer
462
+ def _serialize(self):
463
+ return exclude_if_none(self, ["hidden_states"])
435
464
 
436
465
 
437
466
  class ChatCompletionResponseStreamChoice(BaseModel):
@@ -484,3 +513,39 @@ class EmbeddingResponse(BaseModel):
484
513
  model: str
485
514
  object: str = "list"
486
515
  usage: Optional[UsageInfo] = None
516
+
517
+
518
+ class ScoringRequest(BaseModel):
519
+ query: Optional[Union[str, List[int]]] = (
520
+ None # Query text or pre-tokenized token IDs
521
+ )
522
+ items: Optional[Union[str, List[str], List[List[int]]]] = (
523
+ None # Item text(s) or pre-tokenized token IDs
524
+ )
525
+ label_token_ids: Optional[List[int]] = (
526
+ None # Token IDs to compute probabilities for
527
+ )
528
+ apply_softmax: bool = False
529
+ item_first: bool = False
530
+ model: str
531
+
532
+
533
+ class ScoringResponse(BaseModel):
534
+ scores: List[
535
+ List[float]
536
+ ] # List of lists of probabilities, each in the order of label_token_ids
537
+ model: str
538
+ usage: Optional[UsageInfo] = None
539
+ object: str = "scoring"
540
+
541
+
542
+ class RerankResponse(BaseModel):
543
+ score: float
544
+ document: str
545
+ index: int
546
+ meta_info: Optional[dict] = None
547
+
548
+
549
+ def exclude_if_none(obj, field_names: List[str]):
550
+ omit_if_none_fields = {k for k, v in obj.model_fields.items() if k in field_names}
551
+ return {k: v for k, v in obj if k not in omit_if_none_fields or v is not None}