sglang 0.4.6.post4__py3-none-any.whl → 0.4.7__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (358)
  1. sglang/bench_offline_throughput.py +16 -10
  2. sglang/bench_one_batch.py +5 -4
  3. sglang/bench_one_batch_server.py +86 -22
  4. sglang/bench_serving.py +197 -110
  5. sglang/compile_deep_gemm.py +4 -4
  6. sglang/lang/backend/runtime_endpoint.py +24 -1
  7. sglang/profiler.py +167 -0
  8. sglang/srt/_custom_ops.py +34 -0
  9. sglang/srt/configs/internvl.py +8 -12
  10. sglang/srt/configs/model_config.py +66 -29
  11. sglang/srt/constrained/base_grammar_backend.py +5 -2
  12. sglang/srt/constrained/llguidance_backend.py +9 -8
  13. sglang/srt/constrained/outlines_backend.py +5 -4
  14. sglang/srt/constrained/xgrammar_backend.py +18 -18
  15. sglang/srt/conversation.py +47 -9
  16. sglang/srt/custom_op.py +38 -3
  17. sglang/srt/debug_utils.py +74 -0
  18. sglang/srt/disaggregation/common/__init__.py +1 -0
  19. sglang/srt/disaggregation/common/conn.py +407 -0
  20. sglang/srt/disaggregation/decode.py +187 -134
  21. sglang/srt/disaggregation/decode_schedule_batch_mixin.py +142 -0
  22. sglang/srt/disaggregation/fake/conn.py +4 -13
  23. sglang/srt/disaggregation/kv_events.py +412 -0
  24. sglang/srt/disaggregation/launch_lb.py +140 -0
  25. sglang/srt/disaggregation/mini_lb.py +84 -70
  26. sglang/srt/disaggregation/mooncake/conn.py +441 -140
  27. sglang/srt/disaggregation/mooncake/transfer_engine.py +31 -14
  28. sglang/srt/disaggregation/nixl/conn.py +124 -442
  29. sglang/srt/disaggregation/prefill.py +128 -44
  30. sglang/srt/disaggregation/utils.py +154 -6
  31. sglang/srt/distributed/device_communicators/pymscclpp.py +315 -0
  32. sglang/srt/distributed/parallel_state.py +52 -5
  33. sglang/srt/distributed/utils.py +3 -3
  34. sglang/srt/entrypoints/EngineBase.py +11 -0
  35. sglang/srt/entrypoints/engine.py +129 -12
  36. sglang/srt/entrypoints/http_server.py +21 -6
  37. sglang/srt/entrypoints/http_server_engine.py +5 -2
  38. sglang/srt/function_call/base_format_detector.py +302 -0
  39. sglang/srt/function_call/core_types.py +34 -0
  40. sglang/srt/function_call/deepseekv3_detector.py +205 -0
  41. sglang/srt/function_call/ebnf_composer.py +248 -0
  42. sglang/srt/function_call/function_call_parser.py +202 -0
  43. sglang/srt/function_call/llama32_detector.py +93 -0
  44. sglang/srt/function_call/mistral_detector.py +131 -0
  45. sglang/srt/function_call/pythonic_detector.py +229 -0
  46. sglang/srt/function_call/qwen25_detector.py +121 -0
  47. sglang/srt/function_call/utils.py +52 -0
  48. sglang/srt/hf_transformers_utils.py +50 -7
  49. sglang/srt/layers/attention/aiter_backend.py +878 -0
  50. sglang/srt/layers/attention/base_attn_backend.py +4 -0
  51. sglang/srt/layers/attention/cutlass_mla_backend.py +2 -19
  52. sglang/srt/layers/attention/flashattention_backend.py +166 -35
  53. sglang/srt/layers/attention/flashinfer_backend.py +45 -1
  54. sglang/srt/layers/attention/flashinfer_mla_backend.py +45 -5
  55. sglang/srt/layers/attention/flashmla_backend.py +340 -78
  56. sglang/srt/layers/attention/intel_amx_backend.py +128 -0
  57. sglang/srt/layers/attention/tbo_backend.py +232 -0
  58. sglang/srt/layers/attention/torch_native_backend.py +3 -0
  59. sglang/srt/layers/attention/triton_backend.py +247 -5
  60. sglang/srt/layers/attention/triton_ops/extend_attention.py +12 -4
  61. sglang/srt/layers/attention/utils.py +2 -2
  62. sglang/srt/layers/attention/vision.py +1 -1
  63. sglang/srt/layers/communicator.py +517 -0
  64. sglang/srt/layers/dp_attention.py +6 -15
  65. sglang/srt/layers/layernorm.py +30 -19
  66. sglang/srt/layers/moe/cutlass_moe.py +370 -0
  67. sglang/srt/layers/moe/cutlass_moe_params.py +169 -0
  68. sglang/srt/layers/moe/ep_moe/kernels.py +60 -17
  69. sglang/srt/layers/moe/ep_moe/layer.py +195 -87
  70. sglang/srt/layers/moe/ep_moe/token_dispatcher.py +88 -8
  71. sglang/srt/layers/moe/fused_moe_native.py +4 -0
  72. sglang/srt/layers/moe/fused_moe_triton/configs/triton_3_2_0/E=257,N=128,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8,block_shape=[128, 128].json +146 -0
  73. sglang/srt/layers/moe/fused_moe_triton/configs/triton_3_2_0/E=257,N=128,device_name=NVIDIA_H20,dtype=fp8_w8a8,block_shape=[128, 128].json +146 -0
  74. sglang/srt/layers/moe/fused_moe_triton/configs/triton_3_2_0/E=257,N=256,device_name=NVIDIA_B200,dtype=fp8_w8a8,block_shape=[128, 128].json +146 -0
  75. sglang/srt/layers/moe/fused_moe_triton/configs/triton_3_2_0/E=257,N=256,device_name=NVIDIA_H20,dtype=fp8_w8a8,block_shape=[128, 128].json +146 -0
  76. sglang/srt/layers/moe/fused_moe_triton/configs/triton_3_2_0/E=257,N=256,device_name=NVIDIA_H200,dtype=fp8_w8a8,block_shape=[128, 128].json +146 -0
  77. sglang/srt/layers/moe/fused_moe_triton/configs/triton_3_2_0/E=8,N=7168,device_name=NVIDIA_H100_80GB_HBM3.json +146 -0
  78. sglang/srt/layers/moe/fused_moe_triton/configs/triton_3_3_1/E=257,N=256,device_name=NVIDIA_B200,dtype=fp8_w8a8,block_shape=[128, 128].json +146 -0
  79. sglang/srt/layers/moe/fused_moe_triton/configs/triton_3_3_1/E=8,N=7168,device_name=NVIDIA_H100_80GB_HBM3.json +146 -0
  80. sglang/srt/layers/moe/fused_moe_triton/fused_moe.py +220 -25
  81. sglang/srt/layers/moe/fused_moe_triton/layer.py +48 -4
  82. sglang/srt/layers/moe/topk.py +107 -24
  83. sglang/srt/layers/multimodal.py +70 -0
  84. sglang/srt/layers/quantization/__init__.py +10 -4
  85. sglang/srt/layers/quantization/blockwise_int8.py +3 -0
  86. sglang/srt/layers/quantization/compressed_tensors/compressed_tensors_moe.py +5 -0
  87. sglang/srt/layers/quantization/deep_gemm.py +60 -59
  88. sglang/srt/layers/quantization/fp8.py +113 -18
  89. sglang/srt/layers/quantization/fp8_kernel.py +118 -66
  90. sglang/srt/layers/quantization/fp8_utils.py +165 -43
  91. sglang/srt/layers/quantization/gptq.py +298 -6
  92. sglang/srt/layers/quantization/int8_kernel.py +18 -5
  93. sglang/srt/layers/quantization/modelopt_quant.py +334 -7
  94. sglang/srt/layers/quantization/moe_wna16.py +3 -0
  95. sglang/srt/layers/quantization/qoq.py +244 -0
  96. sglang/srt/layers/quantization/w8a8_fp8.py +3 -0
  97. sglang/srt/layers/quantization/w8a8_int8.py +3 -0
  98. sglang/srt/layers/rotary_embedding.py +6 -12
  99. sglang/srt/layers/sampler.py +80 -79
  100. sglang/srt/layers/utils.py +6 -0
  101. sglang/srt/lora/layers.py +12 -15
  102. sglang/srt/lora/lora.py +49 -5
  103. sglang/srt/lora/lora_manager.py +20 -8
  104. sglang/srt/lora/mem_pool.py +24 -16
  105. sglang/srt/lora/utils.py +17 -13
  106. sglang/srt/managers/data_parallel_controller.py +13 -5
  107. sglang/srt/managers/eplb_algorithms/__init__.py +63 -0
  108. sglang/srt/managers/eplb_algorithms/deepseek.py +223 -0
  109. sglang/srt/managers/eplb_algorithms/deepseek_vec.py +276 -0
  110. sglang/srt/managers/eplb_manager.py +96 -0
  111. sglang/srt/managers/expert_distribution.py +878 -56
  112. sglang/srt/managers/expert_location.py +448 -0
  113. sglang/srt/managers/expert_location_dispatch.py +108 -0
  114. sglang/srt/managers/io_struct.py +29 -5
  115. sglang/srt/managers/mm_utils.py +355 -151
  116. sglang/srt/managers/multimodal_processors/base_processor.py +299 -42
  117. sglang/srt/managers/multimodal_processors/deepseek_vl_v2.py +6 -1
  118. sglang/srt/managers/multimodal_processors/gemma3.py +15 -17
  119. sglang/srt/managers/multimodal_processors/internvl.py +18 -5
  120. sglang/srt/managers/multimodal_processors/janus_pro.py +7 -1
  121. sglang/srt/managers/multimodal_processors/kimi_vl.py +14 -32
  122. sglang/srt/managers/multimodal_processors/llava.py +3 -3
  123. sglang/srt/managers/multimodal_processors/minicpm.py +27 -32
  124. sglang/srt/managers/multimodal_processors/mllama4.py +6 -0
  125. sglang/srt/managers/multimodal_processors/phi4mm.py +87 -0
  126. sglang/srt/managers/multimodal_processors/pixtral.py +9 -9
  127. sglang/srt/managers/multimodal_processors/qwen_vl.py +35 -35
  128. sglang/srt/managers/schedule_batch.py +185 -55
  129. sglang/srt/managers/schedule_policy.py +4 -5
  130. sglang/srt/managers/scheduler.py +389 -154
  131. sglang/srt/managers/session_controller.py +1 -1
  132. sglang/srt/managers/tokenizer_manager.py +231 -39
  133. sglang/srt/managers/utils.py +0 -4
  134. sglang/srt/mem_cache/base_prefix_cache.py +3 -0
  135. sglang/srt/mem_cache/chunk_cache.py +3 -1
  136. sglang/srt/mem_cache/hiradix_cache.py +4 -4
  137. sglang/srt/mem_cache/memory_pool.py +74 -52
  138. sglang/srt/mem_cache/multimodal_cache.py +45 -0
  139. sglang/srt/mem_cache/radix_cache.py +58 -5
  140. sglang/srt/metrics/collector.py +11 -2
  141. sglang/srt/mm_utils.py +10 -0
  142. sglang/srt/model_executor/cuda_graph_runner.py +87 -65
  143. sglang/srt/model_executor/expert_location_updater.py +557 -0
  144. sglang/srt/model_executor/forward_batch_info.py +39 -14
  145. sglang/srt/model_executor/model_runner.py +231 -101
  146. sglang/srt/model_loader/loader.py +10 -6
  147. sglang/srt/model_loader/utils.py +67 -1
  148. sglang/srt/models/clip.py +5 -1
  149. sglang/srt/models/deepseek_nextn.py +1 -1
  150. sglang/srt/models/deepseek_v2.py +732 -403
  151. sglang/srt/models/exaone.py +8 -3
  152. sglang/srt/models/gemma3_causal.py +7 -0
  153. sglang/srt/models/gemma3_mm.py +75 -33
  154. sglang/srt/models/idefics2.py +342 -0
  155. sglang/srt/models/kimi_vl.py +4 -4
  156. sglang/srt/models/llama.py +1 -1
  157. sglang/srt/models/llama4.py +10 -2
  158. sglang/srt/models/llava.py +26 -18
  159. sglang/srt/models/mimo_mtp.py +220 -0
  160. sglang/srt/models/minicpmo.py +7 -17
  161. sglang/srt/models/minicpmv.py +3 -295
  162. sglang/srt/models/mistral.py +71 -1
  163. sglang/srt/models/mllama.py +3 -3
  164. sglang/srt/models/phi4mm.py +512 -0
  165. sglang/srt/models/qwen2.py +133 -35
  166. sglang/srt/models/qwen2_5_vl.py +5 -3
  167. sglang/srt/models/qwen2_eagle.py +4 -1
  168. sglang/srt/models/qwen2_moe.py +206 -69
  169. sglang/srt/models/qwen2_vl.py +3 -3
  170. sglang/srt/models/qwen3.py +92 -19
  171. sglang/srt/models/qwen3_moe.py +457 -55
  172. sglang/srt/models/registry.py +9 -1
  173. sglang/srt/models/siglip.py +294 -0
  174. sglang/srt/models/transformers.py +291 -0
  175. sglang/srt/openai_api/adapter.py +114 -40
  176. sglang/srt/openai_api/protocol.py +37 -2
  177. sglang/srt/openai_api/utils.py +172 -0
  178. sglang/srt/operations.py +189 -0
  179. sglang/srt/operations_strategy.py +207 -0
  180. sglang/srt/sampling/sampling_batch_info.py +13 -1
  181. sglang/srt/sampling/sampling_params.py +2 -1
  182. sglang/srt/server_args.py +235 -38
  183. sglang/srt/speculative/build_eagle_tree.py +8 -8
  184. sglang/srt/speculative/eagle_draft_cuda_graph_runner.py +8 -11
  185. sglang/srt/speculative/eagle_draft_extend_cuda_graph_runner.py +253 -0
  186. sglang/srt/speculative/eagle_utils.py +181 -90
  187. sglang/srt/speculative/eagle_worker.py +146 -21
  188. sglang/srt/two_batch_overlap.py +635 -0
  189. sglang/srt/utils.py +197 -19
  190. sglang/test/runners.py +16 -7
  191. sglang/test/send_one.py +4 -0
  192. sglang/test/test_cutlass_moe.py +278 -0
  193. sglang/test/test_fp4_moe.py +248 -0
  194. sglang/test/test_utils.py +81 -42
  195. sglang/utils.py +2 -2
  196. sglang/version.py +1 -1
  197. {sglang-0.4.6.post4.dist-info → sglang-0.4.7.dist-info}/METADATA +31 -19
  198. sglang-0.4.7.dist-info/RECORD +699 -0
  199. {sglang-0.4.6.post4.dist-info → sglang-0.4.7.dist-info}/WHEEL +1 -1
  200. sglang/srt/function_call_parser.py +0 -858
  201. sglang/srt/platforms/interface.py +0 -371
  202. sglang-0.4.6.post4.dist-info/RECORD +0 -646
  203. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=1,N=14336,device_name=NVIDIA_A100-SXM4-80GB,dtype=int8_w8a16.json → triton_3_1_0/E=1,N=14336,device_name=NVIDIA_A100-SXM4-80GB,dtype=int8_w8a16.json} +0 -0
  204. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=1,N=14336,device_name=NVIDIA_A100-SXM4-80GB.json → triton_3_1_0/E=1,N=14336,device_name=NVIDIA_A100-SXM4-80GB.json} +0 -0
  205. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=1,N=1792,device_name=NVIDIA_A100-SXM4-80GB,dtype=int8_w8a16.json → triton_3_1_0/E=1,N=1792,device_name=NVIDIA_A100-SXM4-80GB,dtype=int8_w8a16.json} +0 -0
  206. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=1,N=1792,device_name=NVIDIA_A100-SXM4-80GB.json → triton_3_1_0/E=1,N=1792,device_name=NVIDIA_A100-SXM4-80GB.json} +0 -0
  207. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=1,N=3072,device_name=NVIDIA_A100-SXM4-80GB,dtype=int8_w8a16.json → triton_3_1_0/E=1,N=3072,device_name=NVIDIA_A100-SXM4-80GB,dtype=int8_w8a16.json} +0 -0
  208. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=1,N=3072,device_name=NVIDIA_H100_80GB_HBM3,dtype=int8_w8a16.json → triton_3_1_0/E=1,N=3072,device_name=NVIDIA_H100_80GB_HBM3,dtype=int8_w8a16.json} +0 -0
  209. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=1,N=3072,device_name=NVIDIA_H100_80GB_HBM3.json → triton_3_1_0/E=1,N=3072,device_name=NVIDIA_H100_80GB_HBM3.json} +0 -0
  210. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=1,N=3584,device_name=NVIDIA_A100-SXM4-80GB,dtype=int8_w8a16.json → triton_3_1_0/E=1,N=3584,device_name=NVIDIA_A100-SXM4-80GB,dtype=int8_w8a16.json} +0 -0
  211. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=1,N=3584,device_name=NVIDIA_A100-SXM4-80GB.json → triton_3_1_0/E=1,N=3584,device_name=NVIDIA_A100-SXM4-80GB.json} +0 -0
  212. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=1,N=7168,device_name=NVIDIA_A100-SXM4-80GB,dtype=int8_w8a16.json → triton_3_1_0/E=1,N=7168,device_name=NVIDIA_A100-SXM4-80GB,dtype=int8_w8a16.json} +0 -0
  213. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=1,N=7168,device_name=NVIDIA_A100-SXM4-80GB.json → triton_3_1_0/E=1,N=7168,device_name=NVIDIA_A100-SXM4-80GB.json} +0 -0
  214. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=144,N=512,device_name=NVIDIA_H100_80GB_HBM3.json → triton_3_1_0/E=144,N=512,device_name=NVIDIA_H100_80GB_HBM3.json} +0 -0
  215. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=16,N=1024,device_name=NVIDIA_H100_80GB_HBM3.json → triton_3_1_0/E=16,N=1024,device_name=NVIDIA_H100_80GB_HBM3.json} +0 -0
  216. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=16,N=1024,device_name=NVIDIA_H200.json → triton_3_1_0/E=16,N=1024,device_name=NVIDIA_H200.json} +0 -0
  217. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=16,N=1344,device_name=NVIDIA_A100-SXM4-40GB.json → triton_3_1_0/E=16,N=1344,device_name=NVIDIA_A100-SXM4-40GB.json} +0 -0
  218. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=16,N=1344,device_name=NVIDIA_A100-SXM4-80GB.json → triton_3_1_0/E=16,N=1344,device_name=NVIDIA_A100-SXM4-80GB.json} +0 -0
  219. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=16,N=1344,device_name=NVIDIA_H100_80GB_HBM3.json → triton_3_1_0/E=16,N=1344,device_name=NVIDIA_H100_80GB_HBM3.json} +0 -0
  220. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=16,N=14336,device_name=NVIDIA_A100-SXM4-80GB,dtype=int8_w8a16.json → triton_3_1_0/E=16,N=14336,device_name=NVIDIA_A100-SXM4-80GB,dtype=int8_w8a16.json} +0 -0
  221. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=16,N=14336,device_name=NVIDIA_A100-SXM4-80GB.json → triton_3_1_0/E=16,N=14336,device_name=NVIDIA_A100-SXM4-80GB.json} +0 -0
  222. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=16,N=1792,device_name=NVIDIA_A100-SXM4-80GB,dtype=int8_w8a16.json → triton_3_1_0/E=16,N=1792,device_name=NVIDIA_A100-SXM4-80GB,dtype=int8_w8a16.json} +0 -0
  223. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=16,N=1792,device_name=NVIDIA_A100-SXM4-80GB.json → triton_3_1_0/E=16,N=1792,device_name=NVIDIA_A100-SXM4-80GB.json} +0 -0
  224. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=16,N=2048,device_name=NVIDIA_H100_80GB_HBM3.json → triton_3_1_0/E=16,N=2048,device_name=NVIDIA_H100_80GB_HBM3.json} +0 -0
  225. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=16,N=2688,device_name=NVIDIA_A100-SXM4-80GB.json → triton_3_1_0/E=16,N=2688,device_name=NVIDIA_A100-SXM4-80GB.json} +0 -0
  226. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=16,N=2688,device_name=NVIDIA_H100_80GB_HBM3.json → triton_3_1_0/E=16,N=2688,device_name=NVIDIA_H100_80GB_HBM3.json} +0 -0
  227. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=16,N=3072,device_name=NVIDIA_A100-SXM4-80GB,dtype=int8_w8a16.json → triton_3_1_0/E=16,N=3072,device_name=NVIDIA_A100-SXM4-80GB,dtype=int8_w8a16.json} +0 -0
  228. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=16,N=3072,device_name=NVIDIA_H100_80GB_HBM3,dtype=int8_w8a16.json → triton_3_1_0/E=16,N=3072,device_name=NVIDIA_H100_80GB_HBM3,dtype=int8_w8a16.json} +0 -0
  229. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=16,N=3200,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8.json → triton_3_1_0/E=16,N=3200,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8.json} +0 -0
  230. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=16,N=3584,device_name=NVIDIA_A100-SXM4-80GB,dtype=int8_w8a16.json → triton_3_1_0/E=16,N=3584,device_name=NVIDIA_A100-SXM4-80GB,dtype=int8_w8a16.json} +0 -0
  231. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=16,N=3584,device_name=NVIDIA_A100-SXM4-80GB.json → triton_3_1_0/E=16,N=3584,device_name=NVIDIA_A100-SXM4-80GB.json} +0 -0
  232. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=16,N=6400,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8.json → triton_3_1_0/E=16,N=6400,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8.json} +0 -0
  233. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=16,N=7168,device_name=NVIDIA_A100-SXM4-80GB,dtype=int8_w8a16.json → triton_3_1_0/E=16,N=7168,device_name=NVIDIA_A100-SXM4-80GB,dtype=int8_w8a16.json} +0 -0
  234. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=16,N=7168,device_name=NVIDIA_A100-SXM4-80GB.json → triton_3_1_0/E=16,N=7168,device_name=NVIDIA_A100-SXM4-80GB.json} +0 -0
  235. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=16,N=7168,device_name=NVIDIA_H100_80GB_HBM3,dtype=int8_w8a16.json → triton_3_1_0/E=16,N=7168,device_name=NVIDIA_H100_80GB_HBM3,dtype=int8_w8a16.json} +0 -0
  236. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=16,N=800,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8.json → triton_3_1_0/E=16,N=800,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8.json} +0 -0
  237. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=160,N=192,device_name=NVIDIA_A800-SXM4-80GB.json → triton_3_1_0/E=160,N=192,device_name=NVIDIA_A800-SXM4-80GB.json} +0 -0
  238. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=20,N=2048,device_name=NVIDIA_H100_80GB_HBM3.json → triton_3_1_0/E=20,N=2048,device_name=NVIDIA_H100_80GB_HBM3.json} +0 -0
  239. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=24,N=1024,device_name=NVIDIA_H100_80GB_HBM3.json → triton_3_1_0/E=24,N=1024,device_name=NVIDIA_H100_80GB_HBM3.json} +0 -0
  240. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=256,N=128,device_name=NVIDIA_A100-SXM4-80GB,dtype=int8_w8a8,block_shape=[128, 128].json → triton_3_1_0/E=256,N=128,device_name=NVIDIA_A100-SXM4-80GB,dtype=int8_w8a8,block_shape=[128, 128].json} +0 -0
  241. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=256,N=128,device_name=NVIDIA_A100-SXM4-80GB,dtype=int8_w8a8.json → triton_3_1_0/E=256,N=128,device_name=NVIDIA_A100-SXM4-80GB,dtype=int8_w8a8.json} +0 -0
  242. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=256,N=128,device_name=NVIDIA_A800-SXM4-80GB,dtype=int8_w8a8,block_shape=[128, 128].json → triton_3_1_0/E=256,N=128,device_name=NVIDIA_A800-SXM4-80GB,dtype=int8_w8a8,block_shape=[128, 128].json} +0 -0
  243. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=256,N=128,device_name=NVIDIA_A800-SXM4-80GB,dtype=int8_w8a8.json → triton_3_1_0/E=256,N=128,device_name=NVIDIA_A800-SXM4-80GB,dtype=int8_w8a8.json} +0 -0
  244. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=256,N=128,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8,block_shape=[128, 128].json → triton_3_1_0/E=256,N=128,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8,block_shape=[128, 128].json} +0 -0
  245. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=256,N=128,device_name=NVIDIA_H20,block_shape=[128, 128].json → triton_3_1_0/E=256,N=128,device_name=NVIDIA_H20,block_shape=[128, 128].json} +0 -0
  246. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=256,N=128,device_name=NVIDIA_H20,dtype=fp8_w8a8,block_shape=[128, 128].json → triton_3_1_0/E=256,N=128,device_name=NVIDIA_H20,dtype=fp8_w8a8,block_shape=[128, 128].json} +0 -0
  247. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=256,N=128,device_name=NVIDIA_L20Y,dtype=fp8_w8a8,block_shape=[128, 128].json → triton_3_1_0/E=256,N=128,device_name=NVIDIA_L20Y,dtype=fp8_w8a8,block_shape=[128, 128].json} +0 -0
  248. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=256,N=256,device_name=AMD_Instinct_MI300X,dtype=fp8_w8a8,block_shape=[128, 128].json → triton_3_1_0/E=256,N=256,device_name=AMD_Instinct_MI300X,dtype=fp8_w8a8,block_shape=[128, 128].json} +0 -0
  249. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=256,N=256,device_name=AMD_Instinct_MI325X,dtype=fp8_w8a8,block_shape=[128, 128].json → triton_3_1_0/E=256,N=256,device_name=AMD_Instinct_MI325X,dtype=fp8_w8a8,block_shape=[128, 128].json} +0 -0
  250. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=256,N=256,device_name=AMD_Radeon_Graphics,dtype=fp8_w8a8,block_shape=[128, 128].json → triton_3_1_0/E=256,N=256,device_name=AMD_Radeon_Graphics,dtype=fp8_w8a8,block_shape=[128, 128].json} +0 -0
  251. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=256,N=256,device_name=NVIDIA_B200,dtype=fp8_w8a8,block_shape=[128, 128].json → triton_3_1_0/E=256,N=256,device_name=NVIDIA_B200,dtype=fp8_w8a8,block_shape=[128, 128].json} +0 -0
  252. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=256,N=256,device_name=NVIDIA_H20,dtype=int8_w8a8,block_shape=[128, 128].json → triton_3_1_0/E=256,N=256,device_name=NVIDIA_H20,dtype=int8_w8a8,block_shape=[128, 128].json} +0 -0
  253. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=256,N=256,device_name=NVIDIA_H200,dtype=fp8_w8a8,block_shape=[128, 128].json → triton_3_1_0/E=256,N=256,device_name=NVIDIA_H200,dtype=fp8_w8a8,block_shape=[128, 128].json} +0 -0
  254. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=256,N=64,device_name=NVIDIA_A800-SXM4-80GB.json → triton_3_1_0/E=256,N=64,device_name=NVIDIA_A800-SXM4-80GB.json} +0 -0
  255. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=256,N=64,device_name=NVIDIA_L20,dtype=int8_w8a8.json → triton_3_1_0/E=256,N=64,device_name=NVIDIA_L20,dtype=int8_w8a8.json} +0 -0
  256. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=256,N=64,device_name=NVIDIA_L40S,dtype=int8_w8a8.json → triton_3_1_0/E=256,N=64,device_name=NVIDIA_L40S,dtype=int8_w8a8.json} +0 -0
  257. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=64,N=1024,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8,block_shape=[128, 128].json → triton_3_1_0/E=64,N=1024,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8,block_shape=[128, 128].json} +0 -0
  258. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=64,N=1280,device_name=NVIDIA_A100-SXM4-80GB.json → triton_3_1_0/E=64,N=1280,device_name=NVIDIA_A100-SXM4-80GB.json} +0 -0
  259. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=64,N=1280,device_name=NVIDIA_A800-SXM4-80GB.json → triton_3_1_0/E=64,N=1280,device_name=NVIDIA_A800-SXM4-80GB.json} +0 -0
  260. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=64,N=1280,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8.json → triton_3_1_0/E=64,N=1280,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8.json} +0 -0
  261. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=64,N=1280,device_name=NVIDIA_H100_80GB_HBM3.json → triton_3_1_0/E=64,N=1280,device_name=NVIDIA_H100_80GB_HBM3.json} +0 -0
  262. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=64,N=1280,device_name=NVIDIA_H200,dtype=fp8_w8a8.json → triton_3_1_0/E=64,N=1280,device_name=NVIDIA_H200,dtype=fp8_w8a8.json} +0 -0
  263. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=64,N=1280,device_name=NVIDIA_H200.json → triton_3_1_0/E=64,N=1280,device_name=NVIDIA_H200.json} +0 -0
  264. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=64,N=2560,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8.json → triton_3_1_0/E=64,N=2560,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8.json} +0 -0
  265. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=64,N=2560,device_name=NVIDIA_H200,dtype=fp8_w8a8.json → triton_3_1_0/E=64,N=2560,device_name=NVIDIA_H200,dtype=fp8_w8a8.json} +0 -0
  266. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=64,N=2560,device_name=NVIDIA_H200.json → triton_3_1_0/E=64,N=2560,device_name=NVIDIA_H200.json} +0 -0
  267. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=64,N=320,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8.json → triton_3_1_0/E=64,N=320,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8.json} +0 -0
  268. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=64,N=320,device_name=NVIDIA_H100_80GB_HBM3.json → triton_3_1_0/E=64,N=320,device_name=NVIDIA_H100_80GB_HBM3.json} +0 -0
  269. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=64,N=320,device_name=NVIDIA_H200,dtype=fp8_w8a8.json → triton_3_1_0/E=64,N=320,device_name=NVIDIA_H200,dtype=fp8_w8a8.json} +0 -0
  270. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=64,N=320,device_name=NVIDIA_H200.json → triton_3_1_0/E=64,N=320,device_name=NVIDIA_H200.json} +0 -0
  271. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=64,N=512,device_name=NVIDIA_H200,dtype=fp8_w8a8,block_shape=[128, 128].json → triton_3_1_0/E=64,N=512,device_name=NVIDIA_H200,dtype=fp8_w8a8,block_shape=[128, 128].json} +0 -0
  272. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=64,N=640,device_name=NVIDIA_A100-SXM4-80GB.json → triton_3_1_0/E=64,N=640,device_name=NVIDIA_A100-SXM4-80GB.json} +0 -0
  273. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=64,N=640,device_name=NVIDIA_A800-SXM4-80GB.json → triton_3_1_0/E=64,N=640,device_name=NVIDIA_A800-SXM4-80GB.json} +0 -0
  274. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=64,N=640,device_name=NVIDIA_GeForce_RTX_4090,dtype=fp8_w8a8.json → triton_3_1_0/E=64,N=640,device_name=NVIDIA_GeForce_RTX_4090,dtype=fp8_w8a8.json} +0 -0
  275. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=64,N=640,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8.json → triton_3_1_0/E=64,N=640,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8.json} +0 -0
  276. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=64,N=640,device_name=NVIDIA_H100_80GB_HBM3.json → triton_3_1_0/E=64,N=640,device_name=NVIDIA_H100_80GB_HBM3.json} +0 -0
  277. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=64,N=640,device_name=NVIDIA_H200,dtype=fp8_w8a8.json → triton_3_1_0/E=64,N=640,device_name=NVIDIA_H200,dtype=fp8_w8a8.json} +0 -0
  278. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=64,N=640,device_name=NVIDIA_H200.json → triton_3_1_0/E=64,N=640,device_name=NVIDIA_H200.json} +0 -0
  279. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=8,N=14336,device_name=AMD_Instinct_MI300X.json → triton_3_1_0/E=8,N=14336,device_name=AMD_Instinct_MI300X.json} +0 -0
  280. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=8,N=14336,device_name=AMD_Instinct_MI325X.json → triton_3_1_0/E=8,N=14336,device_name=AMD_Instinct_MI325X.json} +0 -0
  281. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=8,N=14336,device_name=AMD_Radeon_Graphics.json → triton_3_1_0/E=8,N=14336,device_name=AMD_Radeon_Graphics.json} +0 -0
  282. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=8,N=14336,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8.json → triton_3_1_0/E=8,N=14336,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8.json} +0 -0
  283. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=8,N=14336,device_name=NVIDIA_H200,dtype=fp8_w8a8.json → triton_3_1_0/E=8,N=14336,device_name=NVIDIA_H200,dtype=fp8_w8a8.json} +0 -0
  284. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=8,N=14336,device_name=NVIDIA_H200.json → triton_3_1_0/E=8,N=14336,device_name=NVIDIA_H200.json} +0 -0
  285. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=8,N=1792,device_name=AMD_Instinct_MI300X.json → triton_3_1_0/E=8,N=1792,device_name=AMD_Instinct_MI300X.json} +0 -0
  286. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=8,N=1792,device_name=AMD_Instinct_MI325X.json → triton_3_1_0/E=8,N=1792,device_name=AMD_Instinct_MI325X.json} +0 -0
  287. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=8,N=1792,device_name=AMD_Radeon_Graphics.json → triton_3_1_0/E=8,N=1792,device_name=AMD_Radeon_Graphics.json} +0 -0
  288. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=8,N=1792,device_name=NVIDIA_A100-SXM4-40GB.json → triton_3_1_0/E=8,N=1792,device_name=NVIDIA_A100-SXM4-40GB.json} +0 -0
  289. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=8,N=1792,device_name=NVIDIA_A100-SXM4-80GB.json → triton_3_1_0/E=8,N=1792,device_name=NVIDIA_A100-SXM4-80GB.json} +0 -0
  290. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=8,N=1792,device_name=NVIDIA_H100_80GB_HBM3.json → triton_3_1_0/E=8,N=1792,device_name=NVIDIA_H100_80GB_HBM3.json} +0 -0
  291. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=8,N=1792,device_name=NVIDIA_H200,dtype=fp8_w8a8.json → triton_3_1_0/E=8,N=1792,device_name=NVIDIA_H200,dtype=fp8_w8a8.json} +0 -0
  292. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=8,N=1792,device_name=NVIDIA_H200.json → triton_3_1_0/E=8,N=1792,device_name=NVIDIA_H200.json} +0 -0
  293. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=8,N=2048,device_name=NVIDIA_A100-SXM4-80GB.json → triton_3_1_0/E=8,N=2048,device_name=NVIDIA_A100-SXM4-80GB.json} +0 -0
  294. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=8,N=2048,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8.json → triton_3_1_0/E=8,N=2048,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8.json} +0 -0
  295. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=8,N=2048,device_name=NVIDIA_H100_80GB_HBM3.json → triton_3_1_0/E=8,N=2048,device_name=NVIDIA_H100_80GB_HBM3.json} +0 -0
  296. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=8,N=2048,device_name=NVIDIA_H200,dtype=fp8_w8a8.json → triton_3_1_0/E=8,N=2048,device_name=NVIDIA_H200,dtype=fp8_w8a8.json} +0 -0
  297. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=8,N=2048,device_name=NVIDIA_H200.json → triton_3_1_0/E=8,N=2048,device_name=NVIDIA_H200.json} +0 -0
  298. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=8,N=3584,device_name=AMD_Instinct_MI300X.json → triton_3_1_0/E=8,N=3584,device_name=AMD_Instinct_MI300X.json} +0 -0
  299. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=8,N=3584,device_name=AMD_Instinct_MI325X.json → triton_3_1_0/E=8,N=3584,device_name=AMD_Instinct_MI325X.json} +0 -0
  300. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=8,N=3584,device_name=AMD_Radeon_Graphics.json → triton_3_1_0/E=8,N=3584,device_name=AMD_Radeon_Graphics.json} +0 -0
  301. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=8,N=3584,device_name=NVIDIA_A100-SXM4-40GB.json → triton_3_1_0/E=8,N=3584,device_name=NVIDIA_A100-SXM4-40GB.json} +0 -0
  302. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=8,N=3584,device_name=NVIDIA_A100-SXM4-80GB.json → triton_3_1_0/E=8,N=3584,device_name=NVIDIA_A100-SXM4-80GB.json} +0 -0
  303. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=8,N=3584,device_name=NVIDIA_GeForce_RTX_4090,dtype=fp8_w8a8.json → triton_3_1_0/E=8,N=3584,device_name=NVIDIA_GeForce_RTX_4090,dtype=fp8_w8a8.json} +0 -0
  304. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=8,N=3584,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8.json → triton_3_1_0/E=8,N=3584,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8.json} +0 -0
  305. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=8,N=3584,device_name=NVIDIA_H100_80GB_HBM3.json → triton_3_1_0/E=8,N=3584,device_name=NVIDIA_H100_80GB_HBM3.json} +0 -0
  306. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=8,N=3584,device_name=NVIDIA_H200,dtype=fp8_w8a8.json → triton_3_1_0/E=8,N=3584,device_name=NVIDIA_H200,dtype=fp8_w8a8.json} +0 -0
  307. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=8,N=3584,device_name=NVIDIA_H200.json → triton_3_1_0/E=8,N=3584,device_name=NVIDIA_H200.json} +0 -0
  308. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=8,N=3584,device_name=NVIDIA_L40S.json → triton_3_1_0/E=8,N=3584,device_name=NVIDIA_L40S.json} +0 -0
  309. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=8,N=4096,device_name=AMD_Instinct_MI300X,dtype=fp8_w8a8.json → triton_3_1_0/E=8,N=4096,device_name=AMD_Instinct_MI300X,dtype=fp8_w8a8.json} +0 -0
  310. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=8,N=4096,device_name=AMD_Instinct_MI325X,dtype=fp8_w8a8.json → triton_3_1_0/E=8,N=4096,device_name=AMD_Instinct_MI325X,dtype=fp8_w8a8.json} +0 -0
  311. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=8,N=4096,device_name=AMD_Radeon_Graphics,dtype=fp8_w8a8.json → triton_3_1_0/E=8,N=4096,device_name=AMD_Radeon_Graphics,dtype=fp8_w8a8.json} +0 -0
  312. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=8,N=4096,device_name=NVIDIA_A100-SXM4-80GB.json → triton_3_1_0/E=8,N=4096,device_name=NVIDIA_A100-SXM4-80GB.json} +0 -0
  313. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=8,N=4096,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8.json → triton_3_1_0/E=8,N=4096,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8.json} +0 -0
  314. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=8,N=4096,device_name=NVIDIA_H100_80GB_HBM3.json → triton_3_1_0/E=8,N=4096,device_name=NVIDIA_H100_80GB_HBM3.json} +0 -0
  315. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=8,N=4096,device_name=NVIDIA_H200,dtype=fp8_w8a8.json → triton_3_1_0/E=8,N=4096,device_name=NVIDIA_H200,dtype=fp8_w8a8.json} +0 -0
  316. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=8,N=4096,device_name=NVIDIA_H200.json → triton_3_1_0/E=8,N=4096,device_name=NVIDIA_H200.json} +0 -0
  317. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=8,N=7168,device_name=AMD_Instinct_MI300X.json → triton_3_1_0/E=8,N=7168,device_name=AMD_Instinct_MI300X.json} +0 -0
  318. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=8,N=7168,device_name=AMD_Instinct_MI325X.json → triton_3_1_0/E=8,N=7168,device_name=AMD_Instinct_MI325X.json} +0 -0
  319. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=8,N=7168,device_name=AMD_Radeon_Graphics.json → triton_3_1_0/E=8,N=7168,device_name=AMD_Radeon_Graphics.json} +0 -0
  320. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=8,N=7168,device_name=NVIDIA_A100-SXM4-80GB.json → triton_3_1_0/E=8,N=7168,device_name=NVIDIA_A100-SXM4-80GB.json} +0 -0
  321. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=8,N=7168,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8.json → triton_3_1_0/E=8,N=7168,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8.json} +0 -0
  322. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=8,N=7168,device_name=NVIDIA_H100_80GB_HBM3.json → triton_3_1_0/E=8,N=7168,device_name=NVIDIA_H100_80GB_HBM3.json} +0 -0
  323. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=8,N=7168,device_name=NVIDIA_H200,dtype=fp8_w8a8.json → triton_3_1_0/E=8,N=7168,device_name=NVIDIA_H200,dtype=fp8_w8a8.json} +0 -0
  324. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=8,N=7168,device_name=NVIDIA_H200.json → triton_3_1_0/E=8,N=7168,device_name=NVIDIA_H200.json} +0 -0
  325. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=8,N=8192,device_name=AMD_Instinct_MI300X,dtype=fp8_w8a8.json → triton_3_1_0/E=8,N=8192,device_name=AMD_Instinct_MI300X,dtype=fp8_w8a8.json} +0 -0
  326. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=8,N=8192,device_name=AMD_Instinct_MI325X,dtype=fp8_w8a8.json → triton_3_1_0/E=8,N=8192,device_name=AMD_Instinct_MI325X,dtype=fp8_w8a8.json} +0 -0
  327. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=8,N=8192,device_name=AMD_Radeon_Graphics,dtype=fp8_w8a8.json → triton_3_1_0/E=8,N=8192,device_name=AMD_Radeon_Graphics,dtype=fp8_w8a8.json} +0 -0
  328. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=8,N=8192,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8.json → triton_3_1_0/E=8,N=8192,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8.json} +0 -0
  329. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=8,N=8192,device_name=NVIDIA_H200,dtype=fp8_w8a8.json → triton_3_1_0/E=8,N=8192,device_name=NVIDIA_H200,dtype=fp8_w8a8.json} +0 -0
  330. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=128,N=192,device_name=NVIDIA_A800-SXM4-80GB.json → triton_3_2_0/E=128,N=192,device_name=NVIDIA_A800-SXM4-80GB.json} +0 -0
  331. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=128,N=192,device_name=NVIDIA_H100_80GB_HBM3.json → triton_3_2_0/E=128,N=192,device_name=NVIDIA_H100_80GB_HBM3.json} +0 -0
  332. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=128,N=192,device_name=NVIDIA_H20.json → triton_3_2_0/E=128,N=192,device_name=NVIDIA_H20.json} +0 -0
  333. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=128,N=192,device_name=NVIDIA_H200.json → triton_3_2_0/E=128,N=192,device_name=NVIDIA_H200.json} +0 -0
  334. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=128,N=384,device_name=NVIDIA_H100_80GB_HBM3.json → triton_3_2_0/E=128,N=384,device_name=NVIDIA_H100_80GB_HBM3.json} +0 -0
  335. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=128,N=384,device_name=NVIDIA_H20,dtype=fp8_w8a8,block_shape=[128, 128].json → triton_3_2_0/E=128,N=384,device_name=NVIDIA_H20,dtype=fp8_w8a8,block_shape=[128, 128].json} +0 -0
  336. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=128,N=384,device_name=NVIDIA_H20.json → triton_3_2_0/E=128,N=384,device_name=NVIDIA_H20.json} +0 -0
  337. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=128,N=384,device_name=NVIDIA_H200,dtype=fp8_w8a8,block_shape=[128, 128].json → triton_3_2_0/E=128,N=384,device_name=NVIDIA_H200,dtype=fp8_w8a8,block_shape=[128, 128].json} +0 -0
  338. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=128,N=384,device_name=NVIDIA_H200.json → triton_3_2_0/E=128,N=384,device_name=NVIDIA_H200.json} +0 -0
  339. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=128,N=512,device_name=NVIDIA_H100_80GB_HBM3.json → triton_3_2_0/E=128,N=512,device_name=NVIDIA_H100_80GB_HBM3.json} +0 -0
  340. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=128,N=768,device_name=NVIDIA_A800-SXM4-80GB.json → triton_3_2_0/E=128,N=768,device_name=NVIDIA_A800-SXM4-80GB.json} +0 -0
  341. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=128,N=768,device_name=NVIDIA_H100_80GB_HBM3.json → triton_3_2_0/E=128,N=768,device_name=NVIDIA_H100_80GB_HBM3.json} +0 -0
  342. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=128,N=768,device_name=NVIDIA_H20,dtype=fp8_w8a8,block_shape=[128, 128].json → triton_3_2_0/E=128,N=768,device_name=NVIDIA_H20,dtype=fp8_w8a8,block_shape=[128, 128].json} +0 -0
  343. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=128,N=768,device_name=NVIDIA_H20.json → triton_3_2_0/E=128,N=768,device_name=NVIDIA_H20.json} +0 -0
  344. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=128,N=768,device_name=NVIDIA_H200,dtype=fp8_w8a8,block_shape=[128, 128].json → triton_3_2_0/E=128,N=768,device_name=NVIDIA_H200,dtype=fp8_w8a8,block_shape=[128, 128].json} +0 -0
  345. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=128,N=768,device_name=NVIDIA_H200.json → triton_3_2_0/E=128,N=768,device_name=NVIDIA_H200.json} +0 -0
  346. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=128,N=96,device_name=NVIDIA_H20.json → triton_3_2_0/E=128,N=96,device_name=NVIDIA_H20.json} +0 -0
  347. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=264,N=128,device_name=NVIDIA_A800-SXM4-80GB,dtype=int8_w8a8.json → triton_3_2_0/E=264,N=128,device_name=NVIDIA_A800-SXM4-80GB,dtype=int8_w8a8.json} +0 -0
  348. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=264,N=256,device_name=NVIDIA_B200,dtype=fp8_w8a8,block_shape=[128, 128].json → triton_3_2_0/E=264,N=256,device_name=NVIDIA_B200,dtype=fp8_w8a8,block_shape=[128, 128].json} +0 -0
  349. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=264,N=256,device_name=NVIDIA_H20,dtype=fp8_w8a8,block_shape=[128, 128].json → triton_3_2_0/E=264,N=256,device_name=NVIDIA_H20,dtype=fp8_w8a8,block_shape=[128, 128].json} +0 -0
  350. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=264,N=256,device_name=NVIDIA_H200,dtype=fp8_w8a8,block_shape=[128, 128].json → triton_3_2_0/E=264,N=256,device_name=NVIDIA_H200,dtype=fp8_w8a8,block_shape=[128, 128].json} +0 -0
  351. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=272,N=128,device_name=NVIDIA_A800-SXM4-80GB,dtype=int8_w8a8.json → triton_3_2_0/E=272,N=128,device_name=NVIDIA_A800-SXM4-80GB,dtype=int8_w8a8.json} +0 -0
  352. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=272,N=128,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8,block_shape=[128, 128].json → triton_3_2_0/E=272,N=128,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8,block_shape=[128, 128].json} +0 -0
  353. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=272,N=128,device_name=NVIDIA_H20,dtype=fp8_w8a8,block_shape=[128, 128].json → triton_3_2_0/E=272,N=128,device_name=NVIDIA_H20,dtype=fp8_w8a8,block_shape=[128, 128].json} +0 -0
  354. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=272,N=64,device_name=NVIDIA_A800-SXM4-80GB.json → triton_3_2_0/E=272,N=64,device_name=NVIDIA_A800-SXM4-80GB.json} +0 -0
  355. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=288,N=64,device_name=NVIDIA_A800-SXM4-80GB.json → triton_3_2_0/E=288,N=64,device_name=NVIDIA_A800-SXM4-80GB.json} +0 -0
  356. /sglang/srt/models/{xiaomi_mimo.py → mimo.py} +0 -0
  357. {sglang-0.4.6.post4.dist-info → sglang-0.4.7.dist-info}/licenses/LICENSE +0 -0
  358. {sglang-0.4.6.post4.dist-info → sglang-0.4.7.dist-info}/top_level.txt +0 -0
@@ -54,7 +54,7 @@ class SessionReqNode:
54
54
  prefix += " -- " + self.childs[0].req.rid
55
55
  ret = self.childs[0]._str_helper(prefix)
56
56
  for child in self.childs[1:]:
57
- prefix = " " * len(origin_prefix) + " \- " + child.req.rid
57
+ prefix = " " * len(origin_prefix) + r" \- " + child.req.rid
58
58
  ret += child._str_helper(prefix)
59
59
  return ret
60
60
 
@@ -16,7 +16,9 @@
16
16
  import asyncio
17
17
  import copy
18
18
  import dataclasses
19
+ import json
19
20
  import logging
21
+ import math
20
22
  import os
21
23
  import pickle
22
24
  import signal
@@ -41,6 +43,7 @@ from typing import (
41
43
  )
42
44
 
43
45
  import fastapi
46
+ import torch
44
47
  import uvloop
45
48
  import zmq
46
49
  import zmq.asyncio
@@ -90,6 +93,8 @@ from sglang.srt.managers.io_struct import (
90
93
  ResumeMemoryOccupationReqInput,
91
94
  ResumeMemoryOccupationReqOutput,
92
95
  SessionParams,
96
+ SetInternalStateReq,
97
+ SetInternalStateReqOutput,
93
98
  SlowDownReqInput,
94
99
  SlowDownReqOutput,
95
100
  TokenizedEmbeddingReqInput,
@@ -111,6 +116,7 @@ from sglang.srt.sampling.sampling_params import SamplingParams
111
116
  from sglang.srt.server_args import PortArgs, ServerArgs
112
117
  from sglang.srt.utils import (
113
118
  dataclass_to_string_truncated,
119
+ get_bool_env_var,
114
120
  get_zmq_socket,
115
121
  kill_process_tree,
116
122
  )
@@ -169,6 +175,11 @@ class TokenizerManager:
169
175
  self.enable_metrics = server_args.enable_metrics
170
176
  self.log_requests = server_args.log_requests
171
177
  self.log_requests_level = server_args.log_requests_level
178
+ self.preferred_sampling_params = (
179
+ json.loads(server_args.preferred_sampling_params)
180
+ if server_args.preferred_sampling_params
181
+ else None
182
+ )
172
183
 
173
184
  # Init inter-process communication
174
185
  context = zmq.asyncio.Context(2)
@@ -213,7 +224,7 @@ class TokenizerManager:
213
224
  self.tokenizer = get_tokenizer_from_processor(self.processor)
214
225
  os.environ["TOKENIZERS_PARALLELISM"] = "false"
215
226
  else:
216
- self.mm_processor = get_dummy_processor()
227
+ self.mm_processor = None
217
228
 
218
229
  if server_args.skip_tokenizer_init:
219
230
  self.tokenizer = self.processor = None
@@ -228,6 +239,7 @@ class TokenizerManager:
228
239
  # Store states
229
240
  self.no_create_loop = False
230
241
  self.rid_to_state: Dict[str, ReqState] = {}
242
+ self.health_check_failed = False
231
243
  self.gracefully_exit = False
232
244
  self.last_receive_tstamp = 0
233
245
  self.dump_requests_folder = "" # By default do not dump
@@ -255,6 +267,10 @@ class TokenizerManager:
255
267
  "model_name": self.server_args.served_model_name,
256
268
  # TODO: Add lora name/path in the future,
257
269
  },
270
+ bucket_time_to_first_token=self.server_args.bucket_time_to_first_token,
271
+ bucket_e2e_request_latency=self.server_args.bucket_e2e_request_latency,
272
+ bucket_inter_token_latency=self.server_args.bucket_inter_token_latency,
273
+ collect_tokens_histogram=self.server_args.collect_tokens_histogram,
258
274
  )
259
275
 
260
276
  # Communicators
@@ -282,12 +298,16 @@ class TokenizerManager:
282
298
  self.flush_cache_communicator = _Communicator(
283
299
  self.send_to_scheduler, server_args.dp_size
284
300
  )
285
- self.start_profile_communicator = _Communicator(
301
+ self.profile_communicator = _Communicator(
286
302
  self.send_to_scheduler, server_args.dp_size
287
303
  )
304
+ self.health_check_communitcator = _Communicator(self.send_to_scheduler, 1)
288
305
  self.get_internal_state_communicator = _Communicator(
289
306
  self.send_to_scheduler, server_args.dp_size
290
307
  )
308
+ self.set_internal_state_communicator = _Communicator(
309
+ self.send_to_scheduler, server_args.dp_size
310
+ )
291
311
  self.expert_distribution_communicator = _Communicator(
292
312
  self.send_to_scheduler, server_args.dp_size
293
313
  )
@@ -343,12 +363,16 @@ class TokenizerManager:
343
363
  ),
344
364
  (
345
365
  ProfileReqOutput,
346
- self.start_profile_communicator.handle_recv,
366
+ self.profile_communicator.handle_recv,
347
367
  ),
348
368
  (
349
369
  GetInternalStateReqOutput,
350
370
  self.get_internal_state_communicator.handle_recv,
351
371
  ),
372
+ (
373
+ SetInternalStateReqOutput,
374
+ self.set_internal_state_communicator.handle_recv,
375
+ ),
352
376
  (
353
377
  ExpertDistributionReqOutput,
354
378
  self.expert_distribution_communicator.handle_recv,
@@ -374,6 +398,9 @@ class TokenizerManager:
374
398
  self.server_args.disaggregation_bootstrap_port
375
399
  )
376
400
 
401
+ self.current_load = 0
402
+ self.current_load_lock = asyncio.Lock()
403
+
377
404
  async def generate_request(
378
405
  self,
379
406
  obj: Union[GenerateReqInput, EmbeddingReqInput],
@@ -401,8 +428,8 @@ class TokenizerManager:
401
428
  is_single = obj.is_single
402
429
  if is_single:
403
430
  tokenized_obj = await self._tokenize_one_request(obj)
404
- self._send_one_request(obj, tokenized_obj, created_time)
405
- async for response in self._wait_one_response(obj, request):
431
+ state = self._send_one_request(obj, tokenized_obj, created_time)
432
+ async for response in self._wait_one_response(obj, state, request):
406
433
  yield response
407
434
  else:
408
435
  async for response in self._handle_batch_request(
@@ -438,14 +465,17 @@ class TokenizerManager:
438
465
  )
439
466
  input_ids = self.tokenizer.encode(input_text)
440
467
 
441
- image_inputs: Dict = await self.mm_processor.process_mm_data_async(
442
- image_data=obj.image_data,
443
- input_text=input_text or input_ids,
444
- request_obj=obj,
445
- max_req_input_len=self.max_req_input_len,
446
- )
447
- if image_inputs and "input_ids" in image_inputs:
448
- input_ids = image_inputs["input_ids"]
468
+ if self.mm_processor and obj.contains_mm_input():
469
+ image_inputs = await self.mm_processor.process_mm_data_async(
470
+ image_data=obj.image_data,
471
+ input_text=input_text or input_ids,
472
+ request_obj=obj,
473
+ max_req_input_len=self.max_req_input_len,
474
+ )
475
+ if image_inputs and "input_ids" in image_inputs:
476
+ input_ids = image_inputs["input_ids"]
477
+ else:
478
+ image_inputs: Optional[Dict] = None
449
479
 
450
480
  self._validate_token_len(obj, input_ids)
451
481
  return self._create_tokenized_object(
@@ -508,7 +538,14 @@ class TokenizerManager:
508
538
  "Please set `--enable-custom-logits-processor` to enable this feature."
509
539
  )
510
540
 
511
- sampling_params = SamplingParams(**obj.sampling_params)
541
+ # Parse sampling parameters
542
+ # Note: if there are preferred sampling params, we use them if they are not
543
+ # explicitly passed in sampling_params
544
+ if self.preferred_sampling_params:
545
+ sampling_kwargs = {**self.preferred_sampling_params, **obj.sampling_params}
546
+ else:
547
+ sampling_kwargs = obj.sampling_params
548
+ sampling_params = SamplingParams(**sampling_kwargs)
512
549
  sampling_params.normalize(self.tokenizer)
513
550
  sampling_params.verify()
514
551
 
@@ -533,6 +570,7 @@ class TokenizerManager:
533
570
  session_params=session_params,
534
571
  custom_logit_processor=obj.custom_logit_processor,
535
572
  return_hidden_states=obj.return_hidden_states,
573
+ data_parallel_rank=obj.data_parallel_rank,
536
574
  )
537
575
  elif isinstance(obj, EmbeddingReqInput):
538
576
  tokenized_obj = TokenizedEmbeddingReqInput(
@@ -598,15 +636,15 @@ class TokenizerManager:
598
636
  self.send_to_scheduler.send_pyobj(tokenized_obj)
599
637
  state = ReqState([], False, asyncio.Event(), obj, created_time=created_time)
600
638
  self.rid_to_state[obj.rid] = state
639
+ return state
601
640
 
602
641
  async def _wait_one_response(
603
642
  self,
604
643
  obj: Union[GenerateReqInput, EmbeddingReqInput],
644
+ state: ReqState,
605
645
  request: Optional[fastapi.Request] = None,
606
646
  ):
607
647
  """Wait for the response of one request."""
608
- state = self.rid_to_state[obj.rid]
609
-
610
648
  while True:
611
649
  try:
612
650
  await asyncio.wait_for(state.event.wait(), timeout=4)
@@ -667,7 +705,6 @@ class TokenizerManager:
667
705
 
668
706
  generators = []
669
707
  rids = []
670
-
671
708
  if getattr(obj, "parallel_sample_num", 1) == 1:
672
709
  if self.server_args.enable_tokenizer_batch_encode:
673
710
  # Validate batch tokenization constraints
@@ -677,16 +714,16 @@ class TokenizerManager:
677
714
 
678
715
  for i, tokenized_obj in enumerate(tokenized_objs):
679
716
  tmp_obj = obj[i]
680
- self._send_one_request(tmp_obj, tokenized_obj, created_time)
681
- generators.append(self._wait_one_response(tmp_obj, request))
717
+ state = self._send_one_request(tmp_obj, tokenized_obj, created_time)
718
+ generators.append(self._wait_one_response(tmp_obj, state, request))
682
719
  rids.append(tmp_obj.rid)
683
720
  else:
684
721
  # Sequential tokenization and processing
685
722
  for i in range(batch_size):
686
723
  tmp_obj = obj[i]
687
724
  tokenized_obj = await self._tokenize_one_request(tmp_obj)
688
- self._send_one_request(tmp_obj, tokenized_obj, created_time)
689
- generators.append(self._wait_one_response(tmp_obj, request))
725
+ state = self._send_one_request(tmp_obj, tokenized_obj, created_time)
726
+ generators.append(self._wait_one_response(tmp_obj, state, request))
690
727
  rids.append(tmp_obj.rid)
691
728
  else:
692
729
  # FIXME: When using batch and parallel_sample_num together, the perf is not optimal.
@@ -711,8 +748,8 @@ class TokenizerManager:
711
748
  tokenized_obj.sampling_params = copy.copy(tokenized_obj.sampling_params)
712
749
  tokenized_obj.sampling_params.max_new_tokens = 0
713
750
  tokenized_obj.stream = False
714
- self._send_one_request(tmp_obj, tokenized_obj, created_time)
715
- await self._wait_one_response(tmp_obj, request).__anext__()
751
+ state = self._send_one_request(tmp_obj, tokenized_obj, created_time)
752
+ await self._wait_one_response(tmp_obj, state, request).__anext__()
716
753
 
717
754
  # Expand requests, assign new rids for them, and send them
718
755
  for i in range(batch_size):
@@ -720,8 +757,8 @@ class TokenizerManager:
720
757
  tmp_obj = copy.copy(objs[i])
721
758
  tokenized_obj = copy.copy(tokenized_objs[i])
722
759
  tokenized_obj.rid = tmp_obj.regenerate_rid()
723
- self._send_one_request(tmp_obj, tokenized_obj, created_time)
724
- generators.append(self._wait_one_response(tmp_obj, request))
760
+ state = self._send_one_request(tmp_obj, tokenized_obj, created_time)
761
+ generators.append(self._wait_one_response(tmp_obj, state, request))
725
762
  rids.append(tmp_obj.rid)
726
763
 
727
764
  # Wait for all requests
@@ -757,6 +794,9 @@ class TokenizerManager:
757
794
  req = AbortReq(rid)
758
795
  self.send_to_scheduler.send_pyobj(req)
759
796
 
797
+ if self.enable_metrics:
798
+ self.metrics_collector.observe_one_aborted_request()
799
+
760
800
  async def start_profile(
761
801
  self,
762
802
  output_dir: Optional[str] = None,
@@ -764,7 +804,11 @@ class TokenizerManager:
764
804
  activities: Optional[List[str]] = None,
765
805
  with_stack: Optional[bool] = None,
766
806
  record_shapes: Optional[bool] = None,
807
+ profile_by_stage: bool = False,
767
808
  ):
809
+ self.auto_create_handle_loop()
810
+ env_with_stack: bool = get_bool_env_var("SGLANG_PROFILE_WITH_STACK", "true")
811
+ with_stack = False if with_stack is False or env_with_stack is False else True
768
812
  req = ProfileReq(
769
813
  type=ProfileReqType.START_PROFILE,
770
814
  output_dir=output_dir,
@@ -772,24 +816,32 @@ class TokenizerManager:
772
816
  activities=activities,
773
817
  with_stack=with_stack,
774
818
  record_shapes=record_shapes,
819
+ profile_by_stage=profile_by_stage,
775
820
  profile_id=str(time.time()),
776
821
  )
777
- result = (await self.start_profile_communicator(req))[0]
822
+ return await self._execute_profile(req)
823
+
824
+ async def stop_profile(self):
825
+ self.auto_create_handle_loop()
826
+ req = ProfileReq(type=ProfileReqType.STOP_PROFILE)
827
+ return await self._execute_profile(req)
828
+
829
+ async def _execute_profile(self, req: ProfileReq):
830
+ result = (await self.profile_communicator(req))[0]
778
831
  if not result.success:
779
832
  raise RuntimeError(result.message)
780
833
  return result
781
834
 
782
- def stop_profile(self):
783
- req = ProfileReq(type=ProfileReqType.STOP_PROFILE)
784
- self.send_to_scheduler.send_pyobj(req)
785
-
786
835
  async def start_expert_distribution_record(self):
836
+ self.auto_create_handle_loop()
787
837
  await self.expert_distribution_communicator(ExpertDistributionReq.START_RECORD)
788
838
 
789
839
  async def stop_expert_distribution_record(self):
840
+ self.auto_create_handle_loop()
790
841
  await self.expert_distribution_communicator(ExpertDistributionReq.STOP_RECORD)
791
842
 
792
843
  async def dump_expert_distribution_record(self):
844
+ self.auto_create_handle_loop()
793
845
  await self.expert_distribution_communicator(ExpertDistributionReq.DUMP_RECORD)
794
846
 
795
847
  async def update_weights_from_disk(
@@ -804,7 +856,7 @@ class TokenizerManager:
804
856
  obj.load_format = self.server_args.load_format
805
857
  logger.info("Start update_weights. Load format=%s", obj.load_format)
806
858
 
807
- if True:
859
+ if True: # Keep this redundant check to simplify some internal code sync
808
860
  # Hold the lock if it is not async. This means that weight sync
809
861
  # cannot run while requests are in progress.
810
862
  async with self.model_update_lock.writer_lock:
@@ -856,8 +908,8 @@ class TokenizerManager:
856
908
  ) -> Tuple[bool, str]:
857
909
  self.auto_create_handle_loop()
858
910
  assert (
859
- self.server_args.dp_size == 1
860
- ), "dp_size must be for update weights from distributed"
911
+ self.server_args.dp_size == 1 or self.server_args.enable_dp_attention
912
+ ), "dp_size must be 1 or dp attention must be enabled for update weights from distributed"
861
913
 
862
914
  # This means that weight sync
863
915
  # cannot run while requests are in progress.
@@ -872,8 +924,8 @@ class TokenizerManager:
872
924
  ) -> Tuple[bool, str]:
873
925
  self.auto_create_handle_loop()
874
926
  assert (
875
- self.server_args.dp_size == 1
876
- ), "dp_size must be 1 for update weights from distributed"
927
+ self.server_args.dp_size == 1 or self.server_args.enable_dp_attention
928
+ ), "dp_size must be 1 or dp attention must be enabled for update weights from tensor"
877
929
 
878
930
  # This means that weight sync
879
931
  # cannot run while requests are in progress.
@@ -946,6 +998,22 @@ class TokenizerManager:
946
998
  # Many DP ranks
947
999
  return [res.internal_state for res in responses]
948
1000
 
1001
+ async def get_load(self) -> dict:
1002
+ # TODO(lsyin): fake load report server
1003
+ if not self.current_load_lock.locked():
1004
+ async with self.current_load_lock:
1005
+ internal_state = await self.get_internal_state()
1006
+ self.current_load = internal_state[0]["load"]
1007
+ return {"load": self.current_load}
1008
+
1009
+ async def set_internal_state(
1010
+ self, obj: SetInternalStateReq
1011
+ ) -> SetInternalStateReqOutput:
1012
+ responses: List[SetInternalStateReqOutput] = (
1013
+ await self.set_internal_state_communicator(obj)
1014
+ )
1015
+ return [res.internal_state for res in responses]
1016
+
949
1017
  def get_log_request_metadata(self):
950
1018
  max_length = None
951
1019
  skip_names = None
@@ -1015,11 +1083,17 @@ class TokenizerManager:
1015
1083
  loop.create_task(print_exception_wrapper(self.handle_loop))
1016
1084
  )
1017
1085
 
1086
+ self.event_loop = loop
1087
+
1018
1088
  # We cannot add signal handler when the tokenizer manager is not in
1019
1089
  # the main thread due to the CPython limitation.
1020
1090
  if threading.current_thread() is threading.main_thread():
1021
1091
  signal_handler = SignalHandler(self)
1022
- loop.add_signal_handler(signal.SIGTERM, signal_handler.signal_handler)
1092
+ loop.add_signal_handler(signal.SIGTERM, signal_handler.sigterm_handler)
1093
+ # Update the signal handler for the process. It overrides the sigquit handler in the launch phase.
1094
+ loop.add_signal_handler(
1095
+ signal.SIGQUIT, signal_handler.running_phase_sigquit_handler
1096
+ )
1023
1097
  else:
1024
1098
  logger.warning(
1025
1099
  "Signal handler is not added because the tokenizer manager is "
@@ -1037,6 +1111,15 @@ class TokenizerManager:
1037
1111
  # Drain requests
1038
1112
  while True:
1039
1113
  remain_num_req = len(self.rid_to_state)
1114
+
1115
+ if self.health_check_failed:
1116
+ # if health check failed, we should exit immediately
1117
+ logger.error(
1118
+ "Signal SIGTERM received while health check failed. Exiting... remaining number of requests: %d",
1119
+ remain_num_req,
1120
+ )
1121
+ break
1122
+
1040
1123
  logger.info(
1041
1124
  f"Gracefully exiting... remaining number of requests {remain_num_req}"
1042
1125
  )
@@ -1120,7 +1203,16 @@ class TokenizerManager:
1120
1203
  "meta_info": meta_info,
1121
1204
  }
1122
1205
  elif isinstance(recv_obj, BatchMultimodalOut):
1123
- raise NotImplementedError()
1206
+ if isinstance(recv_obj.outputs[i], str):
1207
+ out_dict = {
1208
+ "text": recv_obj.outputs[i],
1209
+ "meta_info": meta_info,
1210
+ }
1211
+ else:
1212
+ out_dict = {
1213
+ "outputs": json.dumps(recv_obj.outputs[i]),
1214
+ "meta_info": meta_info,
1215
+ }
1124
1216
  else:
1125
1217
  assert isinstance(recv_obj, BatchEmbeddingOut)
1126
1218
  out_dict = {
@@ -1331,7 +1423,7 @@ class TokenizerManager:
1331
1423
  asyncio.create_task(asyncio.to_thread(background_task))
1332
1424
 
1333
1425
  def _handle_abort_req(self, recv_obj):
1334
- self.rid_to_state.pop(recv_obj.rid)
1426
+ self.rid_to_state.pop(recv_obj.rid, None)
1335
1427
 
1336
1428
  def _handle_open_session_req_output(self, recv_obj):
1337
1429
  self.session_futures[recv_obj.session_id].set_result(
@@ -1347,6 +1439,100 @@ class TokenizerManager:
1347
1439
  if len(self.model_update_tmp) == self.server_args.dp_size:
1348
1440
  self.model_update_result.set_result(self.model_update_tmp)
1349
1441
 
1442
+ async def score_request(
1443
+ self,
1444
+ query: Optional[Union[str, List[int]]] = None,
1445
+ items: Optional[Union[str, List[str], List[List[int]]]] = None,
1446
+ label_token_ids: Optional[List[int]] = None,
1447
+ apply_softmax: bool = False,
1448
+ item_first: bool = False,
1449
+ request: Optional[Any] = None,
1450
+ ) -> List[List[float]]:
1451
+ """
1452
+ See Engine.score() for more details.
1453
+ """
1454
+ if label_token_ids is None:
1455
+ raise ValueError("label_token_ids must be provided")
1456
+
1457
+ if self.tokenizer is not None:
1458
+ vocab_size = self.tokenizer.vocab_size
1459
+ for token_id in label_token_ids:
1460
+ if token_id >= vocab_size:
1461
+ raise ValueError(
1462
+ f"Token ID {token_id} is out of vocabulary (vocab size: {vocab_size})"
1463
+ )
1464
+
1465
+ # Handle string or tokenized query/items
1466
+ if isinstance(query, str) and (
1467
+ isinstance(items, str)
1468
+ or (isinstance(items, list) and (not items or isinstance(items[0], str)))
1469
+ ):
1470
+ # Both query and items are text
1471
+ items_list = [items] if isinstance(items, str) else items
1472
+ if item_first:
1473
+ prompts = [f"{item}{query}" for item in items_list]
1474
+ else:
1475
+ prompts = [f"{query}{item}" for item in items_list]
1476
+ batch_request = GenerateReqInput(
1477
+ text=prompts,
1478
+ return_logprob=True,
1479
+ token_ids_logprob=label_token_ids,
1480
+ stream=False,
1481
+ sampling_params={"max_new_tokens": 1},
1482
+ )
1483
+ elif (
1484
+ isinstance(query, list)
1485
+ and isinstance(items, list)
1486
+ and items
1487
+ and isinstance(items[0], list)
1488
+ ):
1489
+ # Both query and items are token IDs
1490
+ if item_first:
1491
+ input_ids_list = [item + query for item in items]
1492
+ else:
1493
+ input_ids_list = [query + item for item in items]
1494
+ batch_request = GenerateReqInput(
1495
+ input_ids=input_ids_list,
1496
+ return_logprob=True,
1497
+ token_ids_logprob=label_token_ids,
1498
+ stream=False,
1499
+ sampling_params={"max_new_tokens": 1},
1500
+ )
1501
+ else:
1502
+ raise ValueError(
1503
+ "Invalid combination of query/items types for score_request."
1504
+ )
1505
+
1506
+ results = await self.generate_request(batch_request, request).__anext__()
1507
+ scores = []
1508
+
1509
+ for result in results:
1510
+ # Get logprobs for each token
1511
+ logprobs = {}
1512
+ for logprob, token_id, _ in result["meta_info"].get(
1513
+ "output_token_ids_logprobs", []
1514
+ )[0]:
1515
+ if token_id in label_token_ids:
1516
+ logprobs[token_id] = logprob
1517
+
1518
+ # Get scores in order of label_token_ids
1519
+ score_list = [
1520
+ logprobs.get(token_id, float("-inf")) for token_id in label_token_ids
1521
+ ]
1522
+
1523
+ # Apply softmax to logprobs if needed
1524
+ if apply_softmax:
1525
+ score_list = torch.softmax(torch.tensor(score_list), dim=0).tolist()
1526
+ else:
1527
+ # Convert logprobs to probabilities if not using softmax
1528
+ score_list = [
1529
+ math.exp(x) if x != float("-inf") else 0.0 for x in score_list
1530
+ ]
1531
+
1532
+ scores.append(score_list)
1533
+
1534
+ return scores
1535
+
1350
1536
 
1351
1537
  async def print_exception_wrapper(func):
1352
1538
  """
@@ -1366,12 +1552,18 @@ class SignalHandler:
1366
1552
  def __init__(self, tokenizer_manager: TokenizerManager):
1367
1553
  self.tokenizer_manager = tokenizer_manager
1368
1554
 
1369
- def signal_handler(self, signum=None, frame=None):
1555
+ def sigterm_handler(self, signum=None, frame=None):
1370
1556
  logger.warning(
1371
1557
  f"SIGTERM received. {signum=} {frame=}. Draining requests and shutting down..."
1372
1558
  )
1373
1559
  self.tokenizer_manager.gracefully_exit = True
1374
1560
 
1561
+ def running_phase_sigquit_handler(self, signum=None, frame=None):
1562
+ logger.error(
1563
+ "Received sigquit from a child process. It usually means the child failed."
1564
+ )
1565
+ kill_process_tree(os.getpid())
1566
+
1375
1567
 
1376
1568
  T = TypeVar("T")
1377
1569
 
@@ -35,10 +35,6 @@ def validate_input_length(
35
35
  f"the maximum allowed length ({max_req_input_len} tokens). "
36
36
  f"Use a shorter input or enable --allow-auto-truncate."
37
37
  )
38
- logger.error(error_msg)
39
- req.finished_reason = FINISH_ABORT(
40
- error_msg, HTTPStatus.BAD_REQUEST, "BadRequestError"
41
- )
42
38
  return error_msg
43
39
 
44
40
  return None
@@ -48,3 +48,6 @@ class BasePrefixCache(ABC):
48
48
 
49
49
  def pretty_print(self):
50
50
  raise NotImplementedError()
51
+
52
+ def take_events(self):
53
+ return []
@@ -38,7 +38,9 @@ class ChunkCache(BasePrefixCache):
38
38
 
39
39
  def cache_finished_req(self, req: Req):
40
40
  kv_indices = self.req_to_token_pool.req_to_token[
41
- req.req_pool_idx, : len(req.origin_input_ids) + len(req.output_ids) - 1
41
+ req.req_pool_idx,
42
+ # For decode server: if req.output_ids is empty, we want to free all req.origin_input_ids
43
+ : len(req.origin_input_ids) + max(len(req.output_ids) - 1, 0),
42
44
  ]
43
45
  self.req_to_token_pool.free(req.req_pool_idx)
44
46
  self.token_to_kv_pool_allocator.free(kv_indices)
@@ -335,13 +335,13 @@ class HiRadixCache(RadixCache):
335
335
  return value, last_node
336
336
 
337
337
  def _match_prefix_helper(self, node: TreeNode, key: List):
338
- node.last_access_time = time.time()
338
+ node.last_access_time = time.monotonic()
339
339
  child_key = self.get_child_key_fn(key)
340
340
  value = []
341
341
 
342
342
  while len(key) > 0 and child_key in node.children.keys():
343
343
  child = node.children[child_key]
344
- child.last_access_time = time.time()
344
+ child.last_access_time = time.monotonic()
345
345
  prefix_len = self.key_match_fn(child.key, key)
346
346
  if prefix_len < len(child.key):
347
347
  new_node = self._split_node(child.key, child, prefix_len)
@@ -386,7 +386,7 @@ class HiRadixCache(RadixCache):
386
386
  return new_node
387
387
 
388
388
  def _insert_helper(self, node: TreeNode, key: List, value):
389
- node.last_access_time = time.time()
389
+ node.last_access_time = time.monotonic()
390
390
  if len(key) == 0:
391
391
  return 0
392
392
 
@@ -395,7 +395,7 @@ class HiRadixCache(RadixCache):
395
395
 
396
396
  while len(key) > 0 and child_key in node.children.keys():
397
397
  node = node.children[child_key]
398
- node.last_access_time = time.time()
398
+ node.last_access_time = time.monotonic()
399
399
  prefix_len = self.key_match_fn(node.key, key)
400
400
 
401
401
  if prefix_len == len(node.key):