sglang 0.4.6.post4__py3-none-any.whl → 0.4.7__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (358)
  1. sglang/bench_offline_throughput.py +16 -10
  2. sglang/bench_one_batch.py +5 -4
  3. sglang/bench_one_batch_server.py +86 -22
  4. sglang/bench_serving.py +197 -110
  5. sglang/compile_deep_gemm.py +4 -4
  6. sglang/lang/backend/runtime_endpoint.py +24 -1
  7. sglang/profiler.py +167 -0
  8. sglang/srt/_custom_ops.py +34 -0
  9. sglang/srt/configs/internvl.py +8 -12
  10. sglang/srt/configs/model_config.py +66 -29
  11. sglang/srt/constrained/base_grammar_backend.py +5 -2
  12. sglang/srt/constrained/llguidance_backend.py +9 -8
  13. sglang/srt/constrained/outlines_backend.py +5 -4
  14. sglang/srt/constrained/xgrammar_backend.py +18 -18
  15. sglang/srt/conversation.py +47 -9
  16. sglang/srt/custom_op.py +38 -3
  17. sglang/srt/debug_utils.py +74 -0
  18. sglang/srt/disaggregation/common/__init__.py +1 -0
  19. sglang/srt/disaggregation/common/conn.py +407 -0
  20. sglang/srt/disaggregation/decode.py +187 -134
  21. sglang/srt/disaggregation/decode_schedule_batch_mixin.py +142 -0
  22. sglang/srt/disaggregation/fake/conn.py +4 -13
  23. sglang/srt/disaggregation/kv_events.py +412 -0
  24. sglang/srt/disaggregation/launch_lb.py +140 -0
  25. sglang/srt/disaggregation/mini_lb.py +84 -70
  26. sglang/srt/disaggregation/mooncake/conn.py +441 -140
  27. sglang/srt/disaggregation/mooncake/transfer_engine.py +31 -14
  28. sglang/srt/disaggregation/nixl/conn.py +124 -442
  29. sglang/srt/disaggregation/prefill.py +128 -44
  30. sglang/srt/disaggregation/utils.py +154 -6
  31. sglang/srt/distributed/device_communicators/pymscclpp.py +315 -0
  32. sglang/srt/distributed/parallel_state.py +52 -5
  33. sglang/srt/distributed/utils.py +3 -3
  34. sglang/srt/entrypoints/EngineBase.py +11 -0
  35. sglang/srt/entrypoints/engine.py +129 -12
  36. sglang/srt/entrypoints/http_server.py +21 -6
  37. sglang/srt/entrypoints/http_server_engine.py +5 -2
  38. sglang/srt/function_call/base_format_detector.py +302 -0
  39. sglang/srt/function_call/core_types.py +34 -0
  40. sglang/srt/function_call/deepseekv3_detector.py +205 -0
  41. sglang/srt/function_call/ebnf_composer.py +248 -0
  42. sglang/srt/function_call/function_call_parser.py +202 -0
  43. sglang/srt/function_call/llama32_detector.py +93 -0
  44. sglang/srt/function_call/mistral_detector.py +131 -0
  45. sglang/srt/function_call/pythonic_detector.py +229 -0
  46. sglang/srt/function_call/qwen25_detector.py +121 -0
  47. sglang/srt/function_call/utils.py +52 -0
  48. sglang/srt/hf_transformers_utils.py +50 -7
  49. sglang/srt/layers/attention/aiter_backend.py +878 -0
  50. sglang/srt/layers/attention/base_attn_backend.py +4 -0
  51. sglang/srt/layers/attention/cutlass_mla_backend.py +2 -19
  52. sglang/srt/layers/attention/flashattention_backend.py +166 -35
  53. sglang/srt/layers/attention/flashinfer_backend.py +45 -1
  54. sglang/srt/layers/attention/flashinfer_mla_backend.py +45 -5
  55. sglang/srt/layers/attention/flashmla_backend.py +340 -78
  56. sglang/srt/layers/attention/intel_amx_backend.py +128 -0
  57. sglang/srt/layers/attention/tbo_backend.py +232 -0
  58. sglang/srt/layers/attention/torch_native_backend.py +3 -0
  59. sglang/srt/layers/attention/triton_backend.py +247 -5
  60. sglang/srt/layers/attention/triton_ops/extend_attention.py +12 -4
  61. sglang/srt/layers/attention/utils.py +2 -2
  62. sglang/srt/layers/attention/vision.py +1 -1
  63. sglang/srt/layers/communicator.py +517 -0
  64. sglang/srt/layers/dp_attention.py +6 -15
  65. sglang/srt/layers/layernorm.py +30 -19
  66. sglang/srt/layers/moe/cutlass_moe.py +370 -0
  67. sglang/srt/layers/moe/cutlass_moe_params.py +169 -0
  68. sglang/srt/layers/moe/ep_moe/kernels.py +60 -17
  69. sglang/srt/layers/moe/ep_moe/layer.py +195 -87
  70. sglang/srt/layers/moe/ep_moe/token_dispatcher.py +88 -8
  71. sglang/srt/layers/moe/fused_moe_native.py +4 -0
  72. sglang/srt/layers/moe/fused_moe_triton/configs/triton_3_2_0/E=257,N=128,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8,block_shape=[128, 128].json +146 -0
  73. sglang/srt/layers/moe/fused_moe_triton/configs/triton_3_2_0/E=257,N=128,device_name=NVIDIA_H20,dtype=fp8_w8a8,block_shape=[128, 128].json +146 -0
  74. sglang/srt/layers/moe/fused_moe_triton/configs/triton_3_2_0/E=257,N=256,device_name=NVIDIA_B200,dtype=fp8_w8a8,block_shape=[128, 128].json +146 -0
  75. sglang/srt/layers/moe/fused_moe_triton/configs/triton_3_2_0/E=257,N=256,device_name=NVIDIA_H20,dtype=fp8_w8a8,block_shape=[128, 128].json +146 -0
  76. sglang/srt/layers/moe/fused_moe_triton/configs/triton_3_2_0/E=257,N=256,device_name=NVIDIA_H200,dtype=fp8_w8a8,block_shape=[128, 128].json +146 -0
  77. sglang/srt/layers/moe/fused_moe_triton/configs/triton_3_2_0/E=8,N=7168,device_name=NVIDIA_H100_80GB_HBM3.json +146 -0
  78. sglang/srt/layers/moe/fused_moe_triton/configs/triton_3_3_1/E=257,N=256,device_name=NVIDIA_B200,dtype=fp8_w8a8,block_shape=[128, 128].json +146 -0
  79. sglang/srt/layers/moe/fused_moe_triton/configs/triton_3_3_1/E=8,N=7168,device_name=NVIDIA_H100_80GB_HBM3.json +146 -0
  80. sglang/srt/layers/moe/fused_moe_triton/fused_moe.py +220 -25
  81. sglang/srt/layers/moe/fused_moe_triton/layer.py +48 -4
  82. sglang/srt/layers/moe/topk.py +107 -24
  83. sglang/srt/layers/multimodal.py +70 -0
  84. sglang/srt/layers/quantization/__init__.py +10 -4
  85. sglang/srt/layers/quantization/blockwise_int8.py +3 -0
  86. sglang/srt/layers/quantization/compressed_tensors/compressed_tensors_moe.py +5 -0
  87. sglang/srt/layers/quantization/deep_gemm.py +60 -59
  88. sglang/srt/layers/quantization/fp8.py +113 -18
  89. sglang/srt/layers/quantization/fp8_kernel.py +118 -66
  90. sglang/srt/layers/quantization/fp8_utils.py +165 -43
  91. sglang/srt/layers/quantization/gptq.py +298 -6
  92. sglang/srt/layers/quantization/int8_kernel.py +18 -5
  93. sglang/srt/layers/quantization/modelopt_quant.py +334 -7
  94. sglang/srt/layers/quantization/moe_wna16.py +3 -0
  95. sglang/srt/layers/quantization/qoq.py +244 -0
  96. sglang/srt/layers/quantization/w8a8_fp8.py +3 -0
  97. sglang/srt/layers/quantization/w8a8_int8.py +3 -0
  98. sglang/srt/layers/rotary_embedding.py +6 -12
  99. sglang/srt/layers/sampler.py +80 -79
  100. sglang/srt/layers/utils.py +6 -0
  101. sglang/srt/lora/layers.py +12 -15
  102. sglang/srt/lora/lora.py +49 -5
  103. sglang/srt/lora/lora_manager.py +20 -8
  104. sglang/srt/lora/mem_pool.py +24 -16
  105. sglang/srt/lora/utils.py +17 -13
  106. sglang/srt/managers/data_parallel_controller.py +13 -5
  107. sglang/srt/managers/eplb_algorithms/__init__.py +63 -0
  108. sglang/srt/managers/eplb_algorithms/deepseek.py +223 -0
  109. sglang/srt/managers/eplb_algorithms/deepseek_vec.py +276 -0
  110. sglang/srt/managers/eplb_manager.py +96 -0
  111. sglang/srt/managers/expert_distribution.py +878 -56
  112. sglang/srt/managers/expert_location.py +448 -0
  113. sglang/srt/managers/expert_location_dispatch.py +108 -0
  114. sglang/srt/managers/io_struct.py +29 -5
  115. sglang/srt/managers/mm_utils.py +355 -151
  116. sglang/srt/managers/multimodal_processors/base_processor.py +299 -42
  117. sglang/srt/managers/multimodal_processors/deepseek_vl_v2.py +6 -1
  118. sglang/srt/managers/multimodal_processors/gemma3.py +15 -17
  119. sglang/srt/managers/multimodal_processors/internvl.py +18 -5
  120. sglang/srt/managers/multimodal_processors/janus_pro.py +7 -1
  121. sglang/srt/managers/multimodal_processors/kimi_vl.py +14 -32
  122. sglang/srt/managers/multimodal_processors/llava.py +3 -3
  123. sglang/srt/managers/multimodal_processors/minicpm.py +27 -32
  124. sglang/srt/managers/multimodal_processors/mllama4.py +6 -0
  125. sglang/srt/managers/multimodal_processors/phi4mm.py +87 -0
  126. sglang/srt/managers/multimodal_processors/pixtral.py +9 -9
  127. sglang/srt/managers/multimodal_processors/qwen_vl.py +35 -35
  128. sglang/srt/managers/schedule_batch.py +185 -55
  129. sglang/srt/managers/schedule_policy.py +4 -5
  130. sglang/srt/managers/scheduler.py +389 -154
  131. sglang/srt/managers/session_controller.py +1 -1
  132. sglang/srt/managers/tokenizer_manager.py +231 -39
  133. sglang/srt/managers/utils.py +0 -4
  134. sglang/srt/mem_cache/base_prefix_cache.py +3 -0
  135. sglang/srt/mem_cache/chunk_cache.py +3 -1
  136. sglang/srt/mem_cache/hiradix_cache.py +4 -4
  137. sglang/srt/mem_cache/memory_pool.py +74 -52
  138. sglang/srt/mem_cache/multimodal_cache.py +45 -0
  139. sglang/srt/mem_cache/radix_cache.py +58 -5
  140. sglang/srt/metrics/collector.py +11 -2
  141. sglang/srt/mm_utils.py +10 -0
  142. sglang/srt/model_executor/cuda_graph_runner.py +87 -65
  143. sglang/srt/model_executor/expert_location_updater.py +557 -0
  144. sglang/srt/model_executor/forward_batch_info.py +39 -14
  145. sglang/srt/model_executor/model_runner.py +231 -101
  146. sglang/srt/model_loader/loader.py +10 -6
  147. sglang/srt/model_loader/utils.py +67 -1
  148. sglang/srt/models/clip.py +5 -1
  149. sglang/srt/models/deepseek_nextn.py +1 -1
  150. sglang/srt/models/deepseek_v2.py +732 -403
  151. sglang/srt/models/exaone.py +8 -3
  152. sglang/srt/models/gemma3_causal.py +7 -0
  153. sglang/srt/models/gemma3_mm.py +75 -33
  154. sglang/srt/models/idefics2.py +342 -0
  155. sglang/srt/models/kimi_vl.py +4 -4
  156. sglang/srt/models/llama.py +1 -1
  157. sglang/srt/models/llama4.py +10 -2
  158. sglang/srt/models/llava.py +26 -18
  159. sglang/srt/models/mimo_mtp.py +220 -0
  160. sglang/srt/models/minicpmo.py +7 -17
  161. sglang/srt/models/minicpmv.py +3 -295
  162. sglang/srt/models/mistral.py +71 -1
  163. sglang/srt/models/mllama.py +3 -3
  164. sglang/srt/models/phi4mm.py +512 -0
  165. sglang/srt/models/qwen2.py +133 -35
  166. sglang/srt/models/qwen2_5_vl.py +5 -3
  167. sglang/srt/models/qwen2_eagle.py +4 -1
  168. sglang/srt/models/qwen2_moe.py +206 -69
  169. sglang/srt/models/qwen2_vl.py +3 -3
  170. sglang/srt/models/qwen3.py +92 -19
  171. sglang/srt/models/qwen3_moe.py +457 -55
  172. sglang/srt/models/registry.py +9 -1
  173. sglang/srt/models/siglip.py +294 -0
  174. sglang/srt/models/transformers.py +291 -0
  175. sglang/srt/openai_api/adapter.py +114 -40
  176. sglang/srt/openai_api/protocol.py +37 -2
  177. sglang/srt/openai_api/utils.py +172 -0
  178. sglang/srt/operations.py +189 -0
  179. sglang/srt/operations_strategy.py +207 -0
  180. sglang/srt/sampling/sampling_batch_info.py +13 -1
  181. sglang/srt/sampling/sampling_params.py +2 -1
  182. sglang/srt/server_args.py +235 -38
  183. sglang/srt/speculative/build_eagle_tree.py +8 -8
  184. sglang/srt/speculative/eagle_draft_cuda_graph_runner.py +8 -11
  185. sglang/srt/speculative/eagle_draft_extend_cuda_graph_runner.py +253 -0
  186. sglang/srt/speculative/eagle_utils.py +181 -90
  187. sglang/srt/speculative/eagle_worker.py +146 -21
  188. sglang/srt/two_batch_overlap.py +635 -0
  189. sglang/srt/utils.py +197 -19
  190. sglang/test/runners.py +16 -7
  191. sglang/test/send_one.py +4 -0
  192. sglang/test/test_cutlass_moe.py +278 -0
  193. sglang/test/test_fp4_moe.py +248 -0
  194. sglang/test/test_utils.py +81 -42
  195. sglang/utils.py +2 -2
  196. sglang/version.py +1 -1
  197. {sglang-0.4.6.post4.dist-info → sglang-0.4.7.dist-info}/METADATA +31 -19
  198. sglang-0.4.7.dist-info/RECORD +699 -0
  199. {sglang-0.4.6.post4.dist-info → sglang-0.4.7.dist-info}/WHEEL +1 -1
  200. sglang/srt/function_call_parser.py +0 -858
  201. sglang/srt/platforms/interface.py +0 -371
  202. sglang-0.4.6.post4.dist-info/RECORD +0 -646
  203. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=1,N=14336,device_name=NVIDIA_A100-SXM4-80GB,dtype=int8_w8a16.json → triton_3_1_0/E=1,N=14336,device_name=NVIDIA_A100-SXM4-80GB,dtype=int8_w8a16.json} +0 -0
  204. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=1,N=14336,device_name=NVIDIA_A100-SXM4-80GB.json → triton_3_1_0/E=1,N=14336,device_name=NVIDIA_A100-SXM4-80GB.json} +0 -0
  205. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=1,N=1792,device_name=NVIDIA_A100-SXM4-80GB,dtype=int8_w8a16.json → triton_3_1_0/E=1,N=1792,device_name=NVIDIA_A100-SXM4-80GB,dtype=int8_w8a16.json} +0 -0
  206. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=1,N=1792,device_name=NVIDIA_A100-SXM4-80GB.json → triton_3_1_0/E=1,N=1792,device_name=NVIDIA_A100-SXM4-80GB.json} +0 -0
  207. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=1,N=3072,device_name=NVIDIA_A100-SXM4-80GB,dtype=int8_w8a16.json → triton_3_1_0/E=1,N=3072,device_name=NVIDIA_A100-SXM4-80GB,dtype=int8_w8a16.json} +0 -0
  208. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=1,N=3072,device_name=NVIDIA_H100_80GB_HBM3,dtype=int8_w8a16.json → triton_3_1_0/E=1,N=3072,device_name=NVIDIA_H100_80GB_HBM3,dtype=int8_w8a16.json} +0 -0
  209. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=1,N=3072,device_name=NVIDIA_H100_80GB_HBM3.json → triton_3_1_0/E=1,N=3072,device_name=NVIDIA_H100_80GB_HBM3.json} +0 -0
  210. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=1,N=3584,device_name=NVIDIA_A100-SXM4-80GB,dtype=int8_w8a16.json → triton_3_1_0/E=1,N=3584,device_name=NVIDIA_A100-SXM4-80GB,dtype=int8_w8a16.json} +0 -0
  211. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=1,N=3584,device_name=NVIDIA_A100-SXM4-80GB.json → triton_3_1_0/E=1,N=3584,device_name=NVIDIA_A100-SXM4-80GB.json} +0 -0
  212. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=1,N=7168,device_name=NVIDIA_A100-SXM4-80GB,dtype=int8_w8a16.json → triton_3_1_0/E=1,N=7168,device_name=NVIDIA_A100-SXM4-80GB,dtype=int8_w8a16.json} +0 -0
  213. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=1,N=7168,device_name=NVIDIA_A100-SXM4-80GB.json → triton_3_1_0/E=1,N=7168,device_name=NVIDIA_A100-SXM4-80GB.json} +0 -0
  214. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=144,N=512,device_name=NVIDIA_H100_80GB_HBM3.json → triton_3_1_0/E=144,N=512,device_name=NVIDIA_H100_80GB_HBM3.json} +0 -0
  215. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=16,N=1024,device_name=NVIDIA_H100_80GB_HBM3.json → triton_3_1_0/E=16,N=1024,device_name=NVIDIA_H100_80GB_HBM3.json} +0 -0
  216. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=16,N=1024,device_name=NVIDIA_H200.json → triton_3_1_0/E=16,N=1024,device_name=NVIDIA_H200.json} +0 -0
  217. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=16,N=1344,device_name=NVIDIA_A100-SXM4-40GB.json → triton_3_1_0/E=16,N=1344,device_name=NVIDIA_A100-SXM4-40GB.json} +0 -0
  218. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=16,N=1344,device_name=NVIDIA_A100-SXM4-80GB.json → triton_3_1_0/E=16,N=1344,device_name=NVIDIA_A100-SXM4-80GB.json} +0 -0
  219. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=16,N=1344,device_name=NVIDIA_H100_80GB_HBM3.json → triton_3_1_0/E=16,N=1344,device_name=NVIDIA_H100_80GB_HBM3.json} +0 -0
  220. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=16,N=14336,device_name=NVIDIA_A100-SXM4-80GB,dtype=int8_w8a16.json → triton_3_1_0/E=16,N=14336,device_name=NVIDIA_A100-SXM4-80GB,dtype=int8_w8a16.json} +0 -0
  221. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=16,N=14336,device_name=NVIDIA_A100-SXM4-80GB.json → triton_3_1_0/E=16,N=14336,device_name=NVIDIA_A100-SXM4-80GB.json} +0 -0
  222. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=16,N=1792,device_name=NVIDIA_A100-SXM4-80GB,dtype=int8_w8a16.json → triton_3_1_0/E=16,N=1792,device_name=NVIDIA_A100-SXM4-80GB,dtype=int8_w8a16.json} +0 -0
  223. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=16,N=1792,device_name=NVIDIA_A100-SXM4-80GB.json → triton_3_1_0/E=16,N=1792,device_name=NVIDIA_A100-SXM4-80GB.json} +0 -0
  224. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=16,N=2048,device_name=NVIDIA_H100_80GB_HBM3.json → triton_3_1_0/E=16,N=2048,device_name=NVIDIA_H100_80GB_HBM3.json} +0 -0
  225. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=16,N=2688,device_name=NVIDIA_A100-SXM4-80GB.json → triton_3_1_0/E=16,N=2688,device_name=NVIDIA_A100-SXM4-80GB.json} +0 -0
  226. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=16,N=2688,device_name=NVIDIA_H100_80GB_HBM3.json → triton_3_1_0/E=16,N=2688,device_name=NVIDIA_H100_80GB_HBM3.json} +0 -0
  227. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=16,N=3072,device_name=NVIDIA_A100-SXM4-80GB,dtype=int8_w8a16.json → triton_3_1_0/E=16,N=3072,device_name=NVIDIA_A100-SXM4-80GB,dtype=int8_w8a16.json} +0 -0
  228. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=16,N=3072,device_name=NVIDIA_H100_80GB_HBM3,dtype=int8_w8a16.json → triton_3_1_0/E=16,N=3072,device_name=NVIDIA_H100_80GB_HBM3,dtype=int8_w8a16.json} +0 -0
  229. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=16,N=3200,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8.json → triton_3_1_0/E=16,N=3200,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8.json} +0 -0
  230. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=16,N=3584,device_name=NVIDIA_A100-SXM4-80GB,dtype=int8_w8a16.json → triton_3_1_0/E=16,N=3584,device_name=NVIDIA_A100-SXM4-80GB,dtype=int8_w8a16.json} +0 -0
  231. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=16,N=3584,device_name=NVIDIA_A100-SXM4-80GB.json → triton_3_1_0/E=16,N=3584,device_name=NVIDIA_A100-SXM4-80GB.json} +0 -0
  232. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=16,N=6400,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8.json → triton_3_1_0/E=16,N=6400,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8.json} +0 -0
  233. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=16,N=7168,device_name=NVIDIA_A100-SXM4-80GB,dtype=int8_w8a16.json → triton_3_1_0/E=16,N=7168,device_name=NVIDIA_A100-SXM4-80GB,dtype=int8_w8a16.json} +0 -0
  234. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=16,N=7168,device_name=NVIDIA_A100-SXM4-80GB.json → triton_3_1_0/E=16,N=7168,device_name=NVIDIA_A100-SXM4-80GB.json} +0 -0
  235. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=16,N=7168,device_name=NVIDIA_H100_80GB_HBM3,dtype=int8_w8a16.json → triton_3_1_0/E=16,N=7168,device_name=NVIDIA_H100_80GB_HBM3,dtype=int8_w8a16.json} +0 -0
  236. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=16,N=800,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8.json → triton_3_1_0/E=16,N=800,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8.json} +0 -0
  237. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=160,N=192,device_name=NVIDIA_A800-SXM4-80GB.json → triton_3_1_0/E=160,N=192,device_name=NVIDIA_A800-SXM4-80GB.json} +0 -0
  238. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=20,N=2048,device_name=NVIDIA_H100_80GB_HBM3.json → triton_3_1_0/E=20,N=2048,device_name=NVIDIA_H100_80GB_HBM3.json} +0 -0
  239. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=24,N=1024,device_name=NVIDIA_H100_80GB_HBM3.json → triton_3_1_0/E=24,N=1024,device_name=NVIDIA_H100_80GB_HBM3.json} +0 -0
  240. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=256,N=128,device_name=NVIDIA_A100-SXM4-80GB,dtype=int8_w8a8,block_shape=[128, 128].json → triton_3_1_0/E=256,N=128,device_name=NVIDIA_A100-SXM4-80GB,dtype=int8_w8a8,block_shape=[128, 128].json} +0 -0
  241. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=256,N=128,device_name=NVIDIA_A100-SXM4-80GB,dtype=int8_w8a8.json → triton_3_1_0/E=256,N=128,device_name=NVIDIA_A100-SXM4-80GB,dtype=int8_w8a8.json} +0 -0
  242. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=256,N=128,device_name=NVIDIA_A800-SXM4-80GB,dtype=int8_w8a8,block_shape=[128, 128].json → triton_3_1_0/E=256,N=128,device_name=NVIDIA_A800-SXM4-80GB,dtype=int8_w8a8,block_shape=[128, 128].json} +0 -0
  243. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=256,N=128,device_name=NVIDIA_A800-SXM4-80GB,dtype=int8_w8a8.json → triton_3_1_0/E=256,N=128,device_name=NVIDIA_A800-SXM4-80GB,dtype=int8_w8a8.json} +0 -0
  244. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=256,N=128,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8,block_shape=[128, 128].json → triton_3_1_0/E=256,N=128,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8,block_shape=[128, 128].json} +0 -0
  245. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=256,N=128,device_name=NVIDIA_H20,block_shape=[128, 128].json → triton_3_1_0/E=256,N=128,device_name=NVIDIA_H20,block_shape=[128, 128].json} +0 -0
  246. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=256,N=128,device_name=NVIDIA_H20,dtype=fp8_w8a8,block_shape=[128, 128].json → triton_3_1_0/E=256,N=128,device_name=NVIDIA_H20,dtype=fp8_w8a8,block_shape=[128, 128].json} +0 -0
  247. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=256,N=128,device_name=NVIDIA_L20Y,dtype=fp8_w8a8,block_shape=[128, 128].json → triton_3_1_0/E=256,N=128,device_name=NVIDIA_L20Y,dtype=fp8_w8a8,block_shape=[128, 128].json} +0 -0
  248. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=256,N=256,device_name=AMD_Instinct_MI300X,dtype=fp8_w8a8,block_shape=[128, 128].json → triton_3_1_0/E=256,N=256,device_name=AMD_Instinct_MI300X,dtype=fp8_w8a8,block_shape=[128, 128].json} +0 -0
  249. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=256,N=256,device_name=AMD_Instinct_MI325X,dtype=fp8_w8a8,block_shape=[128, 128].json → triton_3_1_0/E=256,N=256,device_name=AMD_Instinct_MI325X,dtype=fp8_w8a8,block_shape=[128, 128].json} +0 -0
  250. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=256,N=256,device_name=AMD_Radeon_Graphics,dtype=fp8_w8a8,block_shape=[128, 128].json → triton_3_1_0/E=256,N=256,device_name=AMD_Radeon_Graphics,dtype=fp8_w8a8,block_shape=[128, 128].json} +0 -0
  251. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=256,N=256,device_name=NVIDIA_B200,dtype=fp8_w8a8,block_shape=[128, 128].json → triton_3_1_0/E=256,N=256,device_name=NVIDIA_B200,dtype=fp8_w8a8,block_shape=[128, 128].json} +0 -0
  252. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=256,N=256,device_name=NVIDIA_H20,dtype=int8_w8a8,block_shape=[128, 128].json → triton_3_1_0/E=256,N=256,device_name=NVIDIA_H20,dtype=int8_w8a8,block_shape=[128, 128].json} +0 -0
  253. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=256,N=256,device_name=NVIDIA_H200,dtype=fp8_w8a8,block_shape=[128, 128].json → triton_3_1_0/E=256,N=256,device_name=NVIDIA_H200,dtype=fp8_w8a8,block_shape=[128, 128].json} +0 -0
  254. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=256,N=64,device_name=NVIDIA_A800-SXM4-80GB.json → triton_3_1_0/E=256,N=64,device_name=NVIDIA_A800-SXM4-80GB.json} +0 -0
  255. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=256,N=64,device_name=NVIDIA_L20,dtype=int8_w8a8.json → triton_3_1_0/E=256,N=64,device_name=NVIDIA_L20,dtype=int8_w8a8.json} +0 -0
  256. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=256,N=64,device_name=NVIDIA_L40S,dtype=int8_w8a8.json → triton_3_1_0/E=256,N=64,device_name=NVIDIA_L40S,dtype=int8_w8a8.json} +0 -0
  257. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=64,N=1024,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8,block_shape=[128, 128].json → triton_3_1_0/E=64,N=1024,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8,block_shape=[128, 128].json} +0 -0
  258. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=64,N=1280,device_name=NVIDIA_A100-SXM4-80GB.json → triton_3_1_0/E=64,N=1280,device_name=NVIDIA_A100-SXM4-80GB.json} +0 -0
  259. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=64,N=1280,device_name=NVIDIA_A800-SXM4-80GB.json → triton_3_1_0/E=64,N=1280,device_name=NVIDIA_A800-SXM4-80GB.json} +0 -0
  260. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=64,N=1280,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8.json → triton_3_1_0/E=64,N=1280,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8.json} +0 -0
  261. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=64,N=1280,device_name=NVIDIA_H100_80GB_HBM3.json → triton_3_1_0/E=64,N=1280,device_name=NVIDIA_H100_80GB_HBM3.json} +0 -0
  262. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=64,N=1280,device_name=NVIDIA_H200,dtype=fp8_w8a8.json → triton_3_1_0/E=64,N=1280,device_name=NVIDIA_H200,dtype=fp8_w8a8.json} +0 -0
  263. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=64,N=1280,device_name=NVIDIA_H200.json → triton_3_1_0/E=64,N=1280,device_name=NVIDIA_H200.json} +0 -0
  264. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=64,N=2560,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8.json → triton_3_1_0/E=64,N=2560,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8.json} +0 -0
  265. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=64,N=2560,device_name=NVIDIA_H200,dtype=fp8_w8a8.json → triton_3_1_0/E=64,N=2560,device_name=NVIDIA_H200,dtype=fp8_w8a8.json} +0 -0
  266. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=64,N=2560,device_name=NVIDIA_H200.json → triton_3_1_0/E=64,N=2560,device_name=NVIDIA_H200.json} +0 -0
  267. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=64,N=320,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8.json → triton_3_1_0/E=64,N=320,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8.json} +0 -0
  268. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=64,N=320,device_name=NVIDIA_H100_80GB_HBM3.json → triton_3_1_0/E=64,N=320,device_name=NVIDIA_H100_80GB_HBM3.json} +0 -0
  269. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=64,N=320,device_name=NVIDIA_H200,dtype=fp8_w8a8.json → triton_3_1_0/E=64,N=320,device_name=NVIDIA_H200,dtype=fp8_w8a8.json} +0 -0
  270. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=64,N=320,device_name=NVIDIA_H200.json → triton_3_1_0/E=64,N=320,device_name=NVIDIA_H200.json} +0 -0
  271. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=64,N=512,device_name=NVIDIA_H200,dtype=fp8_w8a8,block_shape=[128, 128].json → triton_3_1_0/E=64,N=512,device_name=NVIDIA_H200,dtype=fp8_w8a8,block_shape=[128, 128].json} +0 -0
  272. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=64,N=640,device_name=NVIDIA_A100-SXM4-80GB.json → triton_3_1_0/E=64,N=640,device_name=NVIDIA_A100-SXM4-80GB.json} +0 -0
  273. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=64,N=640,device_name=NVIDIA_A800-SXM4-80GB.json → triton_3_1_0/E=64,N=640,device_name=NVIDIA_A800-SXM4-80GB.json} +0 -0
  274. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=64,N=640,device_name=NVIDIA_GeForce_RTX_4090,dtype=fp8_w8a8.json → triton_3_1_0/E=64,N=640,device_name=NVIDIA_GeForce_RTX_4090,dtype=fp8_w8a8.json} +0 -0
  275. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=64,N=640,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8.json → triton_3_1_0/E=64,N=640,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8.json} +0 -0
  276. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=64,N=640,device_name=NVIDIA_H100_80GB_HBM3.json → triton_3_1_0/E=64,N=640,device_name=NVIDIA_H100_80GB_HBM3.json} +0 -0
  277. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=64,N=640,device_name=NVIDIA_H200,dtype=fp8_w8a8.json → triton_3_1_0/E=64,N=640,device_name=NVIDIA_H200,dtype=fp8_w8a8.json} +0 -0
  278. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=64,N=640,device_name=NVIDIA_H200.json → triton_3_1_0/E=64,N=640,device_name=NVIDIA_H200.json} +0 -0
  279. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=8,N=14336,device_name=AMD_Instinct_MI300X.json → triton_3_1_0/E=8,N=14336,device_name=AMD_Instinct_MI300X.json} +0 -0
  280. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=8,N=14336,device_name=AMD_Instinct_MI325X.json → triton_3_1_0/E=8,N=14336,device_name=AMD_Instinct_MI325X.json} +0 -0
  281. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=8,N=14336,device_name=AMD_Radeon_Graphics.json → triton_3_1_0/E=8,N=14336,device_name=AMD_Radeon_Graphics.json} +0 -0
  282. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=8,N=14336,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8.json → triton_3_1_0/E=8,N=14336,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8.json} +0 -0
  283. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=8,N=14336,device_name=NVIDIA_H200,dtype=fp8_w8a8.json → triton_3_1_0/E=8,N=14336,device_name=NVIDIA_H200,dtype=fp8_w8a8.json} +0 -0
  284. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=8,N=14336,device_name=NVIDIA_H200.json → triton_3_1_0/E=8,N=14336,device_name=NVIDIA_H200.json} +0 -0
  285. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=8,N=1792,device_name=AMD_Instinct_MI300X.json → triton_3_1_0/E=8,N=1792,device_name=AMD_Instinct_MI300X.json} +0 -0
  286. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=8,N=1792,device_name=AMD_Instinct_MI325X.json → triton_3_1_0/E=8,N=1792,device_name=AMD_Instinct_MI325X.json} +0 -0
  287. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=8,N=1792,device_name=AMD_Radeon_Graphics.json → triton_3_1_0/E=8,N=1792,device_name=AMD_Radeon_Graphics.json} +0 -0
  288. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=8,N=1792,device_name=NVIDIA_A100-SXM4-40GB.json → triton_3_1_0/E=8,N=1792,device_name=NVIDIA_A100-SXM4-40GB.json} +0 -0
  289. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=8,N=1792,device_name=NVIDIA_A100-SXM4-80GB.json → triton_3_1_0/E=8,N=1792,device_name=NVIDIA_A100-SXM4-80GB.json} +0 -0
  290. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=8,N=1792,device_name=NVIDIA_H100_80GB_HBM3.json → triton_3_1_0/E=8,N=1792,device_name=NVIDIA_H100_80GB_HBM3.json} +0 -0
  291. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=8,N=1792,device_name=NVIDIA_H200,dtype=fp8_w8a8.json → triton_3_1_0/E=8,N=1792,device_name=NVIDIA_H200,dtype=fp8_w8a8.json} +0 -0
  292. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=8,N=1792,device_name=NVIDIA_H200.json → triton_3_1_0/E=8,N=1792,device_name=NVIDIA_H200.json} +0 -0
  293. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=8,N=2048,device_name=NVIDIA_A100-SXM4-80GB.json → triton_3_1_0/E=8,N=2048,device_name=NVIDIA_A100-SXM4-80GB.json} +0 -0
  294. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=8,N=2048,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8.json → triton_3_1_0/E=8,N=2048,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8.json} +0 -0
  295. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=8,N=2048,device_name=NVIDIA_H100_80GB_HBM3.json → triton_3_1_0/E=8,N=2048,device_name=NVIDIA_H100_80GB_HBM3.json} +0 -0
  296. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=8,N=2048,device_name=NVIDIA_H200,dtype=fp8_w8a8.json → triton_3_1_0/E=8,N=2048,device_name=NVIDIA_H200,dtype=fp8_w8a8.json} +0 -0
  297. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=8,N=2048,device_name=NVIDIA_H200.json → triton_3_1_0/E=8,N=2048,device_name=NVIDIA_H200.json} +0 -0
  298. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=8,N=3584,device_name=AMD_Instinct_MI300X.json → triton_3_1_0/E=8,N=3584,device_name=AMD_Instinct_MI300X.json} +0 -0
  299. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=8,N=3584,device_name=AMD_Instinct_MI325X.json → triton_3_1_0/E=8,N=3584,device_name=AMD_Instinct_MI325X.json} +0 -0
  300. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=8,N=3584,device_name=AMD_Radeon_Graphics.json → triton_3_1_0/E=8,N=3584,device_name=AMD_Radeon_Graphics.json} +0 -0
  301. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=8,N=3584,device_name=NVIDIA_A100-SXM4-40GB.json → triton_3_1_0/E=8,N=3584,device_name=NVIDIA_A100-SXM4-40GB.json} +0 -0
  302. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=8,N=3584,device_name=NVIDIA_A100-SXM4-80GB.json → triton_3_1_0/E=8,N=3584,device_name=NVIDIA_A100-SXM4-80GB.json} +0 -0
  303. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=8,N=3584,device_name=NVIDIA_GeForce_RTX_4090,dtype=fp8_w8a8.json → triton_3_1_0/E=8,N=3584,device_name=NVIDIA_GeForce_RTX_4090,dtype=fp8_w8a8.json} +0 -0
  304. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=8,N=3584,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8.json → triton_3_1_0/E=8,N=3584,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8.json} +0 -0
  305. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=8,N=3584,device_name=NVIDIA_H100_80GB_HBM3.json → triton_3_1_0/E=8,N=3584,device_name=NVIDIA_H100_80GB_HBM3.json} +0 -0
  306. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=8,N=3584,device_name=NVIDIA_H200,dtype=fp8_w8a8.json → triton_3_1_0/E=8,N=3584,device_name=NVIDIA_H200,dtype=fp8_w8a8.json} +0 -0
  307. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=8,N=3584,device_name=NVIDIA_H200.json → triton_3_1_0/E=8,N=3584,device_name=NVIDIA_H200.json} +0 -0
  308. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=8,N=3584,device_name=NVIDIA_L40S.json → triton_3_1_0/E=8,N=3584,device_name=NVIDIA_L40S.json} +0 -0
  309. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=8,N=4096,device_name=AMD_Instinct_MI300X,dtype=fp8_w8a8.json → triton_3_1_0/E=8,N=4096,device_name=AMD_Instinct_MI300X,dtype=fp8_w8a8.json} +0 -0
  310. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=8,N=4096,device_name=AMD_Instinct_MI325X,dtype=fp8_w8a8.json → triton_3_1_0/E=8,N=4096,device_name=AMD_Instinct_MI325X,dtype=fp8_w8a8.json} +0 -0
  311. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=8,N=4096,device_name=AMD_Radeon_Graphics,dtype=fp8_w8a8.json → triton_3_1_0/E=8,N=4096,device_name=AMD_Radeon_Graphics,dtype=fp8_w8a8.json} +0 -0
  312. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=8,N=4096,device_name=NVIDIA_A100-SXM4-80GB.json → triton_3_1_0/E=8,N=4096,device_name=NVIDIA_A100-SXM4-80GB.json} +0 -0
  313. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=8,N=4096,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8.json → triton_3_1_0/E=8,N=4096,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8.json} +0 -0
  314. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=8,N=4096,device_name=NVIDIA_H100_80GB_HBM3.json → triton_3_1_0/E=8,N=4096,device_name=NVIDIA_H100_80GB_HBM3.json} +0 -0
  315. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=8,N=4096,device_name=NVIDIA_H200,dtype=fp8_w8a8.json → triton_3_1_0/E=8,N=4096,device_name=NVIDIA_H200,dtype=fp8_w8a8.json} +0 -0
  316. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=8,N=4096,device_name=NVIDIA_H200.json → triton_3_1_0/E=8,N=4096,device_name=NVIDIA_H200.json} +0 -0
  317. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=8,N=7168,device_name=AMD_Instinct_MI300X.json → triton_3_1_0/E=8,N=7168,device_name=AMD_Instinct_MI300X.json} +0 -0
  318. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=8,N=7168,device_name=AMD_Instinct_MI325X.json → triton_3_1_0/E=8,N=7168,device_name=AMD_Instinct_MI325X.json} +0 -0
  319. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=8,N=7168,device_name=AMD_Radeon_Graphics.json → triton_3_1_0/E=8,N=7168,device_name=AMD_Radeon_Graphics.json} +0 -0
  320. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=8,N=7168,device_name=NVIDIA_A100-SXM4-80GB.json → triton_3_1_0/E=8,N=7168,device_name=NVIDIA_A100-SXM4-80GB.json} +0 -0
  321. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=8,N=7168,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8.json → triton_3_1_0/E=8,N=7168,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8.json} +0 -0
  322. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=8,N=7168,device_name=NVIDIA_H100_80GB_HBM3.json → triton_3_1_0/E=8,N=7168,device_name=NVIDIA_H100_80GB_HBM3.json} +0 -0
  323. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=8,N=7168,device_name=NVIDIA_H200,dtype=fp8_w8a8.json → triton_3_1_0/E=8,N=7168,device_name=NVIDIA_H200,dtype=fp8_w8a8.json} +0 -0
  324. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=8,N=7168,device_name=NVIDIA_H200.json → triton_3_1_0/E=8,N=7168,device_name=NVIDIA_H200.json} +0 -0
  325. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=8,N=8192,device_name=AMD_Instinct_MI300X,dtype=fp8_w8a8.json → triton_3_1_0/E=8,N=8192,device_name=AMD_Instinct_MI300X,dtype=fp8_w8a8.json} +0 -0
  326. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=8,N=8192,device_name=AMD_Instinct_MI325X,dtype=fp8_w8a8.json → triton_3_1_0/E=8,N=8192,device_name=AMD_Instinct_MI325X,dtype=fp8_w8a8.json} +0 -0
  327. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=8,N=8192,device_name=AMD_Radeon_Graphics,dtype=fp8_w8a8.json → triton_3_1_0/E=8,N=8192,device_name=AMD_Radeon_Graphics,dtype=fp8_w8a8.json} +0 -0
  328. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=8,N=8192,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8.json → triton_3_1_0/E=8,N=8192,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8.json} +0 -0
  329. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=8,N=8192,device_name=NVIDIA_H200,dtype=fp8_w8a8.json → triton_3_1_0/E=8,N=8192,device_name=NVIDIA_H200,dtype=fp8_w8a8.json} +0 -0
  330. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=128,N=192,device_name=NVIDIA_A800-SXM4-80GB.json → triton_3_2_0/E=128,N=192,device_name=NVIDIA_A800-SXM4-80GB.json} +0 -0
  331. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=128,N=192,device_name=NVIDIA_H100_80GB_HBM3.json → triton_3_2_0/E=128,N=192,device_name=NVIDIA_H100_80GB_HBM3.json} +0 -0
  332. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=128,N=192,device_name=NVIDIA_H20.json → triton_3_2_0/E=128,N=192,device_name=NVIDIA_H20.json} +0 -0
  333. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=128,N=192,device_name=NVIDIA_H200.json → triton_3_2_0/E=128,N=192,device_name=NVIDIA_H200.json} +0 -0
  334. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=128,N=384,device_name=NVIDIA_H100_80GB_HBM3.json → triton_3_2_0/E=128,N=384,device_name=NVIDIA_H100_80GB_HBM3.json} +0 -0
  335. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=128,N=384,device_name=NVIDIA_H20,dtype=fp8_w8a8,block_shape=[128, 128].json → triton_3_2_0/E=128,N=384,device_name=NVIDIA_H20,dtype=fp8_w8a8,block_shape=[128, 128].json} +0 -0
  336. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=128,N=384,device_name=NVIDIA_H20.json → triton_3_2_0/E=128,N=384,device_name=NVIDIA_H20.json} +0 -0
  337. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=128,N=384,device_name=NVIDIA_H200,dtype=fp8_w8a8,block_shape=[128, 128].json → triton_3_2_0/E=128,N=384,device_name=NVIDIA_H200,dtype=fp8_w8a8,block_shape=[128, 128].json} +0 -0
  338. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=128,N=384,device_name=NVIDIA_H200.json → triton_3_2_0/E=128,N=384,device_name=NVIDIA_H200.json} +0 -0
  339. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=128,N=512,device_name=NVIDIA_H100_80GB_HBM3.json → triton_3_2_0/E=128,N=512,device_name=NVIDIA_H100_80GB_HBM3.json} +0 -0
  340. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=128,N=768,device_name=NVIDIA_A800-SXM4-80GB.json → triton_3_2_0/E=128,N=768,device_name=NVIDIA_A800-SXM4-80GB.json} +0 -0
  341. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=128,N=768,device_name=NVIDIA_H100_80GB_HBM3.json → triton_3_2_0/E=128,N=768,device_name=NVIDIA_H100_80GB_HBM3.json} +0 -0
  342. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=128,N=768,device_name=NVIDIA_H20,dtype=fp8_w8a8,block_shape=[128, 128].json → triton_3_2_0/E=128,N=768,device_name=NVIDIA_H20,dtype=fp8_w8a8,block_shape=[128, 128].json} +0 -0
  343. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=128,N=768,device_name=NVIDIA_H20.json → triton_3_2_0/E=128,N=768,device_name=NVIDIA_H20.json} +0 -0
  344. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=128,N=768,device_name=NVIDIA_H200,dtype=fp8_w8a8,block_shape=[128, 128].json → triton_3_2_0/E=128,N=768,device_name=NVIDIA_H200,dtype=fp8_w8a8,block_shape=[128, 128].json} +0 -0
  345. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=128,N=768,device_name=NVIDIA_H200.json → triton_3_2_0/E=128,N=768,device_name=NVIDIA_H200.json} +0 -0
  346. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=128,N=96,device_name=NVIDIA_H20.json → triton_3_2_0/E=128,N=96,device_name=NVIDIA_H20.json} +0 -0
  347. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=264,N=128,device_name=NVIDIA_A800-SXM4-80GB,dtype=int8_w8a8.json → triton_3_2_0/E=264,N=128,device_name=NVIDIA_A800-SXM4-80GB,dtype=int8_w8a8.json} +0 -0
  348. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=264,N=256,device_name=NVIDIA_B200,dtype=fp8_w8a8,block_shape=[128, 128].json → triton_3_2_0/E=264,N=256,device_name=NVIDIA_B200,dtype=fp8_w8a8,block_shape=[128, 128].json} +0 -0
  349. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=264,N=256,device_name=NVIDIA_H20,dtype=fp8_w8a8,block_shape=[128, 128].json → triton_3_2_0/E=264,N=256,device_name=NVIDIA_H20,dtype=fp8_w8a8,block_shape=[128, 128].json} +0 -0
  350. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=264,N=256,device_name=NVIDIA_H200,dtype=fp8_w8a8,block_shape=[128, 128].json → triton_3_2_0/E=264,N=256,device_name=NVIDIA_H200,dtype=fp8_w8a8,block_shape=[128, 128].json} +0 -0
  351. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=272,N=128,device_name=NVIDIA_A800-SXM4-80GB,dtype=int8_w8a8.json → triton_3_2_0/E=272,N=128,device_name=NVIDIA_A800-SXM4-80GB,dtype=int8_w8a8.json} +0 -0
  352. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=272,N=128,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8,block_shape=[128, 128].json → triton_3_2_0/E=272,N=128,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8,block_shape=[128, 128].json} +0 -0
  353. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=272,N=128,device_name=NVIDIA_H20,dtype=fp8_w8a8,block_shape=[128, 128].json → triton_3_2_0/E=272,N=128,device_name=NVIDIA_H20,dtype=fp8_w8a8,block_shape=[128, 128].json} +0 -0
  354. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=272,N=64,device_name=NVIDIA_A800-SXM4-80GB.json → triton_3_2_0/E=272,N=64,device_name=NVIDIA_A800-SXM4-80GB.json} +0 -0
  355. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=288,N=64,device_name=NVIDIA_A800-SXM4-80GB.json → triton_3_2_0/E=288,N=64,device_name=NVIDIA_A800-SXM4-80GB.json} +0 -0
  356. /sglang/srt/models/{xiaomi_mimo.py → mimo.py} +0 -0
  357. {sglang-0.4.6.post4.dist-info → sglang-0.4.7.dist-info}/licenses/LICENSE +0 -0
  358. {sglang-0.4.6.post4.dist-info → sglang-0.4.7.dist-info}/top_level.txt +0 -0
@@ -47,6 +47,7 @@ from sglang.srt.managers.io_struct import (
47
47
  EmbeddingReqInput,
48
48
  GenerateReqInput,
49
49
  GetWeightsByNameReqInput,
50
+ ImageDataItem,
50
51
  InitWeightsUpdateGroupReqInput,
51
52
  ReleaseMemoryOccupationReqInput,
52
53
  ResumeMemoryOccupationReqInput,
@@ -150,9 +151,9 @@ class Engine(EngineBase):
150
151
  # See also python/sglang/srt/utils.py:load_image for more details.
151
152
  image_data: Optional[
152
153
  Union[
153
- List[List[Union[Image, str]]],
154
- List[Union[Image, str]],
155
- Union[Image, str],
154
+ List[List[ImageDataItem]],
155
+ List[ImageDataItem],
156
+ ImageDataItem,
156
157
  ]
157
158
  ] = None,
158
159
  return_logprob: Optional[Union[List[bool], bool]] = False,
@@ -166,11 +167,22 @@ class Engine(EngineBase):
166
167
  bootstrap_host: Optional[Union[List[str], str]] = None,
167
168
  bootstrap_port: Optional[Union[List[int], int]] = None,
168
169
  bootstrap_room: Optional[Union[List[int], int]] = None,
170
+ data_parallel_rank: Optional[int] = None,
169
171
  ) -> Union[Dict, Iterator[Dict]]:
170
172
  """
171
173
  The arguments of this function is the same as `sglang/srt/managers/io_struct.py::GenerateReqInput`.
172
174
  Please refer to `GenerateReqInput` for the documentation.
173
175
  """
176
+ if self.server_args.enable_dp_attention:
177
+ if data_parallel_rank is None:
178
+ logger.info("data_parallel_rank not provided, using default dispatch")
179
+ elif data_parallel_rank < 0:
180
+ raise ValueError("data_parallel_rank must be non-negative")
181
+ elif data_parallel_rank >= self.server_args.dp_size:
182
+ raise ValueError(
183
+ f"data_parallel_rank must be less than dp_size: {self.server_args.dp_size}"
184
+ )
185
+
174
186
  obj = GenerateReqInput(
175
187
  text=prompt,
176
188
  input_ids=input_ids,
@@ -187,6 +199,7 @@ class Engine(EngineBase):
187
199
  bootstrap_host=bootstrap_host,
188
200
  bootstrap_port=bootstrap_port,
189
201
  bootstrap_room=bootstrap_room,
202
+ data_parallel_rank=data_parallel_rank,
190
203
  )
191
204
  loop = asyncio.get_event_loop()
192
205
  generator = self.tokenizer_manager.generate_request(obj, None)
@@ -221,9 +234,9 @@ class Engine(EngineBase):
221
234
  # See also python/sglang/srt/utils.py:load_image for more details.
222
235
  image_data: Optional[
223
236
  Union[
224
- List[List[Union[Image, str]]],
225
- List[Union[Image, str]],
226
- Union[Image, str],
237
+ List[List[ImageDataItem]],
238
+ List[ImageDataItem],
239
+ ImageDataItem,
227
240
  ]
228
241
  ] = None,
229
242
  return_logprob: Optional[Union[List[bool], bool]] = False,
@@ -236,11 +249,24 @@ class Engine(EngineBase):
236
249
  bootstrap_host: Optional[Union[List[str], str]] = None,
237
250
  bootstrap_port: Optional[Union[List[int], int]] = None,
238
251
  bootstrap_room: Optional[Union[List[int], int]] = None,
252
+ data_parallel_rank: Optional[int] = None,
239
253
  ) -> Union[Dict, AsyncIterator[Dict]]:
240
254
  """
241
255
  The arguments of this function is the same as `sglang/srt/managers/io_struct.py::GenerateReqInput`.
242
256
  Please refer to `GenerateReqInput` for the documentation.
243
257
  """
258
+
259
+ if self.server_args.enable_dp_attention:
260
+ if data_parallel_rank is None:
261
+ logger.info("data_parallel_rank not provided, using default dispatch")
262
+ elif data_parallel_rank < 0:
263
+ raise ValueError("data_parallel_rank must be non-negative")
264
+ elif data_parallel_rank >= self.server_args.dp_size:
265
+ raise ValueError(
266
+ f"data_parallel_rank must be in range [0, {self.server_args.dp_size-1}]"
267
+ )
268
+
269
+ logger.info(f"data_parallel_rank: {data_parallel_rank}")
244
270
  obj = GenerateReqInput(
245
271
  text=prompt,
246
272
  input_ids=input_ids,
@@ -256,6 +282,7 @@ class Engine(EngineBase):
256
282
  bootstrap_host=bootstrap_host,
257
283
  bootstrap_port=bootstrap_port,
258
284
  bootstrap_room=bootstrap_room,
285
+ data_parallel_rank=data_parallel_rank,
259
286
  )
260
287
  generator = self.tokenizer_manager.generate_request(obj, None)
261
288
 
@@ -320,7 +347,26 @@ class Engine(EngineBase):
320
347
  loop.run_until_complete(self.tokenizer_manager.start_profile())
321
348
 
322
349
  def stop_profile(self):
323
- self.tokenizer_manager.stop_profile()
350
+ loop = asyncio.get_event_loop()
351
+ loop.run_until_complete(self.tokenizer_manager.stop_profile())
352
+
353
def start_expert_distribution_record(self):
    """Run the tokenizer manager's ``start_expert_distribution_record`` coroutine to completion.

    Blocks the caller on the current asyncio event loop. The coroutine's
    result is discarded, so this method returns ``None``.
    """
    event_loop = asyncio.get_event_loop()
    pending = self.tokenizer_manager.start_expert_distribution_record()
    event_loop.run_until_complete(pending)
358
+
359
def stop_expert_distribution_record(self):
    """Run the tokenizer manager's ``stop_expert_distribution_record`` coroutine to completion.

    Blocks the caller on the current asyncio event loop. The coroutine's
    result is discarded, so this method returns ``None``.
    """
    event_loop = asyncio.get_event_loop()
    pending = self.tokenizer_manager.stop_expert_distribution_record()
    event_loop.run_until_complete(pending)
364
+
365
def dump_expert_distribution_record(self):
    """Run the tokenizer manager's ``dump_expert_distribution_record`` coroutine to completion.

    Blocks the caller on the current asyncio event loop. The coroutine's
    result is discarded, so this method returns ``None``.
    """
    event_loop = asyncio.get_event_loop()
    pending = self.tokenizer_manager.dump_expert_distribution_record()
    event_loop.run_until_complete(pending)
324
370
 
325
371
  def get_server_info(self):
326
372
  loop = asyncio.get_event_loop()
@@ -452,6 +498,79 @@ class Engine(EngineBase):
452
498
  def save_sharded_model(self, **kwargs):
453
499
  self.collective_rpc("save_sharded_model", **kwargs)
454
500
 
501
def score(
    self,
    query: Optional[Union[str, List[int]]] = None,
    items: Optional[Union[str, List[str], List[List[int]]]] = None,
    label_token_ids: Optional[List[int]] = None,
    apply_softmax: bool = False,
    item_first: bool = False,
) -> List[List[float]]:
    """
    Score the probability of specified token IDs appearing after the given (query + item) pair. For example:
    query = "<|user|>Is the following city the capital of France? "
    items = ["Paris <|assistant|>", "London <|assistant|>", "Berlin <|assistant|>"]
    label_token_ids = [2332, 1223] # Token IDs for "Yes" and "No"
    item_first = False

    This would pass the following prompts to the model:
    "<|user|>Is the following city the capital of France? Paris <|assistant|>"
    "<|user|>Is the following city the capital of France? London <|assistant|>"
    "<|user|>Is the following city the capital of France? Berlin <|assistant|>"
    The api would then return the probabilities of the model producing "Yes" and "No" as the next token.
    The output would look like:
    [[0.9, 0.1], [0.2, 0.8], [0.1, 0.9]]

    This is the blocking variant: it drives `tokenizer_manager.score_request`
    to completion on the current asyncio event loop.

    Args:
        query: The query text or pre-tokenized query token IDs. Must be provided.
        items: The item text(s) or pre-tokenized item token IDs. Must be provided.
        label_token_ids: List of token IDs to compute probabilities for. If None, no token probabilities will be computed.
        apply_softmax: Whether to normalize probabilities using softmax.
        item_first: If True, prepend items to query. Otherwise append items to query.

    Returns:
        A list with one entry per item. Each entry is itself a list of floats
        holding the scores for the requested label token IDs, as in the
        example output above.

    Raises:
        ValueError: If query is not provided, or if items is not provided,
            or if token IDs are out of vocabulary, or if logprobs are not available for the specified tokens.
    """
    loop = asyncio.get_event_loop()
    return loop.run_until_complete(
        self.tokenizer_manager.score_request(
            query=query,
            items=items,
            label_token_ids=label_token_ids,
            apply_softmax=apply_softmax,
            item_first=item_first,
            # No HTTP request object exists when called through the engine.
            request=None,
        )
    )
551
+
552
async def async_score(
    self,
    query: Optional[Union[str, List[int]]] = None,
    items: Optional[Union[str, List[str], List[List[int]]]] = None,
    label_token_ids: Optional[List[int]] = None,
    apply_softmax: bool = False,
    item_first: bool = False,
) -> List[List[float]]:
    """
    Awaitable counterpart of score().

    Forwards every argument unchanged to `tokenizer_manager.score_request`
    (with `request=None`) and returns whatever that coroutine produces.
    See score() for the argument and return documentation.
    """
    forwarded = dict(
        query=query,
        items=items,
        label_token_ids=label_token_ids,
        apply_softmax=apply_softmax,
        item_first=item_first,
        request=None,
    )
    scores = await self.tokenizer_manager.score_request(**forwarded)
    return scores
573
+
455
574
 
456
575
  def _set_envs_and_config(server_args: ServerArgs):
457
576
  # Set global environments
@@ -478,7 +597,7 @@ def _set_envs_and_config(server_args: ServerArgs):
478
597
  if server_args.attention_backend == "flashinfer":
479
598
  assert_pkg_version(
480
599
  "flashinfer_python",
481
- "0.2.5",
600
+ "0.2.6.post1",
482
601
  "Please uninstall the old version and "
483
602
  "reinstall the latest version by following the instructions "
484
603
  "at https://docs.flashinfer.ai/installation.html.",
@@ -486,7 +605,7 @@ def _set_envs_and_config(server_args: ServerArgs):
486
605
  if _is_cuda:
487
606
  assert_pkg_version(
488
607
  "sgl-kernel",
489
- "0.1.2.post1",
608
+ "0.1.7",
490
609
  "Please reinstall the latest version with `pip install sgl-kernel --force-reinstall`",
491
610
  )
492
611
 
@@ -494,9 +613,7 @@ def _set_envs_and_config(server_args: ServerArgs):
494
613
  pid, exitcode = os.waitpid(0, os.WNOHANG)
495
614
  if exitcode != 0:
496
615
  logger.warning(
497
- "Child process unexpectedly failed with an exit code %d. pid=%d",
498
- exitcode,
499
- pid,
616
+ f"Child process unexpectedly failed with {exitcode=}. {pid=}"
500
617
  )
501
618
 
502
619
  signal.signal(signal.SIGCHLD, sigchld_handler)
@@ -47,7 +47,7 @@ from sglang.srt.disaggregation.utils import (
47
47
  register_disaggregation_server,
48
48
  )
49
49
  from sglang.srt.entrypoints.engine import _launch_subprocesses
50
- from sglang.srt.function_call_parser import FunctionCallParser
50
+ from sglang.srt.function_call.function_call_parser import FunctionCallParser
51
51
  from sglang.srt.managers.io_struct import (
52
52
  AbortReq,
53
53
  CloseSessionReqInput,
@@ -82,6 +82,7 @@ from sglang.srt.openai_api.adapter import (
82
82
  v1_retrieve_batch,
83
83
  v1_retrieve_file,
84
84
  v1_retrieve_file_content,
85
+ v1_score,
85
86
  )
86
87
  from sglang.srt.openai_api.protocol import ModelCard, ModelList
87
88
  from sglang.srt.reasoning_parser import ReasoningParser
@@ -182,13 +183,14 @@ async def health_generate(request: Request) -> Response:
182
183
  async for _ in _global_state.tokenizer_manager.generate_request(gri, request):
183
184
  break
184
185
 
185
- tic = time.time()
186
+ tic = time.perf_counter()
186
187
  task = asyncio.create_task(gen())
187
- while time.time() < tic + HEALTH_CHECK_TIMEOUT:
188
+ while time.perf_counter() < tic + HEALTH_CHECK_TIMEOUT:
188
189
  await asyncio.sleep(1)
189
190
  if _global_state.tokenizer_manager.last_receive_tstamp > tic:
190
191
  task.cancel()
191
192
  _global_state.tokenizer_manager.rid_to_state.pop(rid, None)
193
+ _global_state.tokenizer_manager.health_check_failed = False
192
194
  return Response(status_code=200)
193
195
 
194
196
  task.cancel()
@@ -202,6 +204,7 @@ async def health_generate(request: Request) -> Response:
202
204
  f"last_heartbeat time: {last_receive_time}"
203
205
  )
204
206
  _global_state.tokenizer_manager.rid_to_state.pop(rid, None)
207
+ _global_state.tokenizer_manager.health_check_failed = True
205
208
  return Response(status_code=503)
206
209
 
207
210
 
@@ -227,6 +230,11 @@ async def get_server_info():
227
230
  }
228
231
 
229
232
 
233
@app.get("/get_load")
async def get_load():
    # Relay the tokenizer manager's get_load() result as the response body.
    # A comment is used instead of a docstring so the route's generated API
    # description stays exactly as it was.
    load_report = await _global_state.tokenizer_manager.get_load()
    return load_report
236
+
237
+
230
238
  @app.api_route("/set_internal_state", methods=["POST", "PUT"])
231
239
  async def set_internal_state(obj: SetInternalStateReq, request: Request):
232
240
  res = await _global_state.tokenizer_manager.set_internal_state(obj)
@@ -249,7 +257,7 @@ async def generate_request(obj: GenerateReqInput, request: Request):
249
257
  ) + b"\n\n"
250
258
  except ValueError as e:
251
259
  out = {"error": {"message": str(e)}}
252
- logger.error(f"Error: {e}")
260
+ logger.error(f"[http_server] Error: {e}")
253
261
  yield b"data: " + orjson.dumps(
254
262
  out, option=orjson.OPT_NON_STR_KEYS
255
263
  ) + b"\n\n"
@@ -267,7 +275,7 @@ async def generate_request(obj: GenerateReqInput, request: Request):
267
275
  ).__anext__()
268
276
  return ret
269
277
  except ValueError as e:
270
- logger.error(f"Error: {e}")
278
+ logger.error(f"[http_server] Error: {e}")
271
279
  return _create_error_response(e)
272
280
 
273
281
 
@@ -343,6 +351,7 @@ async def start_profile_async(obj: Optional[ProfileReqInput] = None):
343
351
  activities=obj.activities,
344
352
  with_stack=obj.with_stack,
345
353
  record_shapes=obj.record_shapes,
354
+ profile_by_stage=obj.profile_by_stage,
346
355
  )
347
356
  return Response(
348
357
  content="Start profiling.\n",
@@ -353,7 +362,7 @@ async def start_profile_async(obj: Optional[ProfileReqInput] = None):
353
362
  @app.api_route("/stop_profile", methods=["GET", "POST"])
354
363
  async def stop_profile_async():
355
364
  """Stop profiling."""
356
- _global_state.tokenizer_manager.stop_profile()
365
+ await _global_state.tokenizer_manager.stop_profile()
357
366
  return Response(
358
367
  content="Stop profiling. This will take some time.\n",
359
368
  status_code=200,
@@ -712,6 +721,12 @@ async def vertex_generate(vertex_req: VertexGenerateReqInput, raw_request: Reque
712
721
  return ORJSONResponse({"predictions": ret})
713
722
 
714
723
 
724
@app.post("/v1/score")
async def v1_score_request(raw_request: Request):
    """Endpoint for the decoder-only scoring API. See Engine.score() for detailed documentation."""
    # All request parsing and response building is delegated to v1_score.
    score_response = await v1_score(_global_state.tokenizer_manager, raw_request)
    return score_response
728
+
729
+
715
730
  def _create_error_response(e):
716
731
  return ORJSONResponse(
717
732
  {"error": {"message": str(e)}}, status_code=HTTPStatus.BAD_REQUEST
@@ -24,10 +24,10 @@ def launch_server_process(server_args: ServerArgs) -> multiprocessing.Process:
24
24
 
25
25
  base_url = server_args.url()
26
26
  timeout = 300.0 # Increased timeout to 5 minutes for downloading large models
27
- start_time = time.time()
27
+ start_time = time.perf_counter()
28
28
 
29
29
  with requests.Session() as session:
30
- while time.time() - start_time < timeout:
30
+ while time.perf_counter() - start_time < timeout:
31
31
  try:
32
32
  headers = {
33
33
  "Content-Type": "application/json; charset=utf-8",
@@ -140,3 +140,6 @@ class HttpServerEngineAdapter(EngineBase):
140
140
 
141
141
  def resume_memory_occupation(self):
142
142
  return self._make_request("resume_memory_occupation")
143
+
144
def flush_cache(self):
    """Issue the ``flush_cache`` request through ``_make_request`` and return its result."""
    endpoint = "flush_cache"
    return self._make_request(endpoint)
@@ -0,0 +1,302 @@
1
+ import json
2
+ import logging
3
+ from abc import ABC, abstractmethod
4
+ from typing import Any, Dict, List
5
+
6
+ from partial_json_parser.core.exceptions import MalformedJSON
7
+ from partial_json_parser.core.options import Allow
8
+
9
+ from sglang.srt.function_call.core_types import (
10
+ StreamingParseResult,
11
+ ToolCallItem,
12
+ _GetInfoFunc,
13
+ )
14
+ from sglang.srt.function_call.utils import (
15
+ _find_common_prefix,
16
+ _is_complete_json,
17
+ _partial_json_loads,
18
+ )
19
+ from sglang.srt.openai_api.protocol import Tool
20
+
21
+ logger = logging.getLogger(__name__)
22
+
23
+
24
class BaseFormatDetector(ABC):
    """Base class providing two sets of interfaces: one-time and streaming incremental.

    One-time parsing goes through ``detect_and_parse``; streaming parsing goes
    through ``parse_streaming_increment``, which keeps its progress in the
    instance attributes initialised in ``__init__``.
    """

    def __init__(self):
        # State used when parsing tool calls in streaming mode.
        # Text received so far that has not yet been emitted or consumed.
        self._buffer = ""
        # Last parsed (possibly partial) tool call object, per tool position.
        self.prev_tool_call_arr: List[Dict] = []
        # Position of the tool call currently being streamed; -1 means none yet.
        self.current_tool_id: int = -1
        # Whether the name of the current tool call has already been emitted.
        self.current_tool_name_sent: bool = False
        # What has been streamed so far for each tool's arguments.
        self.streamed_args_for_tool: List[str] = []
        # Format markers; the defaults are empty and subclasses are expected
        # to assign their own begin/end-of-tool-call tokens.
        self.bot_token = ""
        self.eot_token = ""
        self.tool_call_separator = ", "

    def parse_base_json(self, action: Any, tools: List[Tool]) -> List[ToolCallItem]:
        """Convert decoded JSON tool call object(s) into ``ToolCallItem`` objects.

        Args:
            action: A single decoded object or a list of them. Each object is
                expected to carry a ``"name"`` and either ``"parameters"`` or
                ``"arguments"``.
            tools: The tools the model is allowed to call.

        Returns:
            One item per entry whose name matches a known tool. Every item has
            ``tool_index=-1`` (the caller assigns the real index) and its
            parameters re-serialised as a JSON string. Entries that are not
            objects, or that name an unknown tool, are skipped with a warning.
        """
        # Same guard as parse_streaming_increment: a tool without a function
        # (or without a function name) can never be matched.
        known_names = {
            tool.function.name
            for tool in tools
            if tool.function and tool.function.name
        }
        actions = action if isinstance(action, list) else [action]

        results = []
        for act in actions:
            if not isinstance(act, dict):
                # e.g. the model produced `[1, 2]`; there is no name to look up.
                logger.warning(
                    f"Skipping tool call entry that is not a JSON object: {act!r}"
                )
                continue
            name = act.get("name")
            if name and name in known_names:
                results.append(
                    ToolCallItem(
                        tool_index=-1,  # Caller should update this based on the actual tools array called
                        name=name,
                        parameters=json.dumps(
                            act.get("parameters") or act.get("arguments", {}),
                            ensure_ascii=False,
                        ),
                    )
                )
            else:
                logger.warning(f"Model attempted to call undefined function: {name}")

        return results

    @abstractmethod
    def detect_and_parse(self, text: str, tools: List[Tool]) -> StreamingParseResult:
        """
        Parses the text in one go.

        Subclasses must override this method. The base implementation, which
        they may reuse through ``super()``, decodes ``text`` as one JSON
        document and returns a result whose ``calls`` are the recognised tool
        calls (see ``parse_base_json``).
        """
        action = json.loads(text)
        return StreamingParseResult(calls=self.parse_base_json(action, tools))

    def _ends_with_partial_token(self, buffer: str, bot_token: str) -> int:
        """
        Check if buffer ends with a partial bot_token.
        Return the length of the longest proper prefix of bot_token that the
        buffer ends with, or 0 if there is none (including when bot_token is
        empty).

        For some format, the bot_token is not a token in model's vocabulary, such as
        `[TOOL_CALLS] [` in Mistral.
        """
        # A full-length match is a complete token rather than a partial one,
        # so candidate lengths stay strictly below len(bot_token).
        longest = min(len(buffer), len(bot_token) - 1)
        # Try the longest candidate first so the reported length is the real
        # size of the partial token, not just its last character.
        for size in range(longest, 0, -1):
            if bot_token.startswith(buffer[-size:]):
                return size
        return 0

    def parse_streaming_increment(
        self, new_text: str, tools: List[Tool]
    ) -> StreamingParseResult:
        """
        Streaming incremental parsing with tool validation.

        This base implementation works best with formats where:
        1. bot_token is followed immediately by JSON (e.g., bot_token + JSON_array)
        2. JSON can be parsed incrementally using partial_json_loads
        3. Multiple tool calls are separated by `self.tool_call_separator` (", " by default)

        Examples of incompatible formats (need custom implementation, may reuse some logic from this class):
        - Each tool call is wrapped in a separate block: See Qwen25Detector
        - Multiple separate blocks: [TOOL_CALLS] [...] \n [TOOL_CALLS] [...]
        - Tool call is Pythonic style

        For incompatible formats, detectors should override this method with custom logic.

        Args:
            new_text: The newly generated text chunk; it is appended to the internal buffer.
            tools: The tools the model is allowed to call.

        Returns:
            A StreamingParseResult carrying either plain text to pass through,
            a tool-name item, an arguments fragment, or nothing when more
            input is needed. Any unexpected exception is logged and yields an
            empty result.
        """
        # Append new text to buffer
        self._buffer += new_text
        current_text = self._buffer

        # The current_text has tool_call if it is the start of a new tool call sequence
        # or it is the start of a new tool call after a tool call separator, when there is a previous tool call
        if not (
            self.bot_token in current_text
            or current_text.startswith("{")
            or (
                self.current_tool_id > 0
                and current_text.startswith(self.tool_call_separator + "{")
            )
        ):
            # Only clear buffer if we're sure no tool call is starting
            if not self._ends_with_partial_token(self._buffer, self.bot_token):
                normal_text = self._buffer
                self._buffer = ""
                if self.eot_token in normal_text:
                    normal_text = normal_text.replace(self.eot_token, "")
                return StreamingParseResult(normal_text=normal_text)
            else:
                # Might be partial bot_token, keep buffering
                return StreamingParseResult()

        # Build tool indices if not already built
        # NOTE: built once per detector instance from the first `tools` seen.
        if not hasattr(self, "_tool_indices"):
            self._tool_indices = {
                tool.function.name: i
                for i, tool in enumerate(tools)
                if tool.function and tool.function.name
            }

        # Until the tool name has been sent, partial strings are not allowed,
        # so a half-generated name is never parsed.
        flags = Allow.ALL if self.current_tool_name_sent else Allow.ALL & ~Allow.STR

        try:
            try:
                # Skip the leading marker (bot_token or separator) before the JSON.
                if current_text.startswith(self.bot_token):
                    start_idx = len(self.bot_token)
                elif self.current_tool_id > 0 and current_text.startswith(
                    self.tool_call_separator
                ):
                    start_idx = len(self.tool_call_separator)
                else:
                    start_idx = 0

                if start_idx >= len(current_text):
                    return StreamingParseResult()

                (obj, end_idx) = _partial_json_loads(current_text[start_idx:], flags)

                is_current_complete = _is_complete_json(
                    current_text[start_idx : start_idx + end_idx]
                )

                # Validate tool name if present
                if "name" in obj and obj["name"] not in self._tool_indices:
                    # Invalid tool name - reset state
                    self._buffer = ""
                    self.current_tool_id = -1
                    self.current_tool_name_sent = False
                    if self.streamed_args_for_tool:
                        self.streamed_args_for_tool.pop()
                    return StreamingParseResult()

                # Handle parameters/arguments consistency
                # NOTE: we assume here that the obj is always partial of a single tool call
                if "parameters" in obj:
                    # An AssertionError raised here is caught by the outer
                    # `except Exception` below and results in an empty result.
                    assert (
                        "arguments" not in obj
                    ), "model generated both parameters and arguments"
                    obj["arguments"] = obj["parameters"]

                current_tool_call = obj

            except MalformedJSON:
                return StreamingParseResult()

            if not current_tool_call:
                return StreamingParseResult()

            # Case 1: Handle tool name streaming
            # This happens when we encounter a tool but haven't sent its name yet
            if not self.current_tool_name_sent:
                function_name = current_tool_call.get("name")

                if function_name and function_name in self._tool_indices:
                    # If this is a new tool (current_tool_id was -1), initialize it
                    if self.current_tool_id == -1:
                        self.current_tool_id = 0
                        self.streamed_args_for_tool.append("")
                    # If this is a subsequent tool, ensure streamed_args_for_tool is large enough
                    elif self.current_tool_id >= len(self.streamed_args_for_tool):
                        while len(self.streamed_args_for_tool) <= self.current_tool_id:
                            self.streamed_args_for_tool.append("")

                    # Send the tool name with empty parameters
                    res = StreamingParseResult(
                        calls=[
                            ToolCallItem(
                                tool_index=self.current_tool_id,
                                name=function_name,
                                parameters="",
                            )
                        ],
                    )
                    self.current_tool_name_sent = True
                else:
                    res = StreamingParseResult()

            # Case 2: Handle streaming arguments
            # This happens when we've already sent the tool name and now need to stream arguments incrementally
            else:
                cur_arguments = current_tool_call.get("arguments")
                res = StreamingParseResult()

                if cur_arguments:
                    # Calculate how much of the arguments we've already streamed
                    sent = len(self.streamed_args_for_tool[self.current_tool_id])
                    cur_args_json = json.dumps(cur_arguments)
                    prev_arguments = None
                    if self.current_tool_id < len(self.prev_tool_call_arr):
                        prev_arguments = self.prev_tool_call_arr[
                            self.current_tool_id
                        ].get("arguments")

                    argument_diff = None

                    # If the current tool's JSON is complete, send all remaining arguments
                    if is_current_complete:
                        argument_diff = cur_args_json[sent:]
                        completing_tool_id = (
                            self.current_tool_id
                        )  # Save the ID of the tool that's completing

                        # Only remove the processed portion, keep unprocessed content
                        self._buffer = current_text[start_idx + end_idx :]

                        if self.current_tool_id < len(self.prev_tool_call_arr):
                            self.prev_tool_call_arr[self.current_tool_id].clear()
                        self.current_tool_name_sent = False
                        self.streamed_args_for_tool[self.current_tool_id] = ""
                        self.current_tool_id += 1

                    # If the tool is still being parsed, send incremental changes
                    elif prev_arguments:
                        prev_args_json = json.dumps(prev_arguments)
                        if cur_args_json != prev_args_json:
                            # Only the part both serialisations agree on is
                            # emitted; the differing tail is left for later.
                            prefix = _find_common_prefix(prev_args_json, cur_args_json)
                            argument_diff = prefix[sent:]

                    # Send the argument diff if there's something new
                    if argument_diff is not None:
                        # Use the correct tool_index: completing_tool_id for completed tools, current_tool_id for ongoing
                        tool_index_to_use = (
                            completing_tool_id
                            if is_current_complete
                            else self.current_tool_id
                        )
                        res = StreamingParseResult(
                            calls=[
                                ToolCallItem(
                                    tool_index=tool_index_to_use,
                                    parameters=argument_diff,
                                )
                            ],
                        )
                        if not is_current_complete:
                            self.streamed_args_for_tool[
                                self.current_tool_id
                            ] += argument_diff

            # Update prev_tool_call_arr with current state
            # NOTE(review): after a completed call, current_tool_id has already
            # been advanced, so this stores the finished call in the next slot.
            # Confirm this is intended before changing it.
            if self.current_tool_id >= 0:
                # Ensure prev_tool_call_arr is large enough
                while len(self.prev_tool_call_arr) <= self.current_tool_id:
                    self.prev_tool_call_arr.append({})
                self.prev_tool_call_arr[self.current_tool_id] = current_tool_call

            return res

        except Exception as e:
            # Deliberate best effort: a parsing failure must not break the stream.
            logger.error(f"Error in parse_streaming_increment: {e}")
            return StreamingParseResult()

    @abstractmethod
    def has_tool_call(self, text: str) -> bool:
        """Subclass hook; presumably reports whether ``text`` contains a tool call in this format."""
        raise NotImplementedError()

    @abstractmethod
    def structure_info(self) -> _GetInfoFunc:
        """Subclass hook returning a callable that maps a name string to a ``StructureInfo``."""
        raise NotImplementedError()

    @abstractmethod
    def build_ebnf(self, tools: List[Tool]) -> str:
        """Subclass hook; presumably returns an EBNF grammar string for ``tools`` (confirm against subclasses)."""
        raise NotImplementedError()
@@ -0,0 +1,34 @@
1
+ from dataclasses import dataclass
2
+ from typing import Callable, List, Optional
3
+
4
+ from pydantic import BaseModel
5
+
6
+
7
class ToolCallItem(BaseModel):
    """Simple encapsulation of the parsed ToolCall result for easier usage in streaming contexts."""

    # Position of this tool call among the calls being produced.
    tool_index: int
    # Name of the called function; optional, defaults to None.
    name: Optional[str] = None
    # Serialized arguments of the call.
    parameters: str  # JSON string
13
+
14
+
15
class StreamingParseResult(BaseModel):
    """Result of streaming incremental parsing."""

    # Text that is not part of a tool call; empty by default.
    normal_text: str = ""
    # Tool call items produced by this parsing step; empty by default.
    calls: List[ToolCallItem] = []
20
+
21
+
22
# Plain record of three strings produced per name by a `_GetInfoFunc`.
# NOTE(review): presumably `begin`/`end` delimit a structured region and
# `trigger` is the text that activates it; confirm against the code that
# builds structural_tag objects.
@dataclass
class StructureInfo:
    begin: str
    end: str
    trigger: str
27
+
28
+
29
# Helper alias for a function type.
# Usually it is a function that takes a name string and returns a
# StructureInfo object, which can be used to construct a structural_tag object.
_GetInfoFunc = Callable[[str], StructureInfo]