sglang 0.4.6.post4__py3-none-any.whl → 0.4.7__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (358)
  1. sglang/bench_offline_throughput.py +16 -10
  2. sglang/bench_one_batch.py +5 -4
  3. sglang/bench_one_batch_server.py +86 -22
  4. sglang/bench_serving.py +197 -110
  5. sglang/compile_deep_gemm.py +4 -4
  6. sglang/lang/backend/runtime_endpoint.py +24 -1
  7. sglang/profiler.py +167 -0
  8. sglang/srt/_custom_ops.py +34 -0
  9. sglang/srt/configs/internvl.py +8 -12
  10. sglang/srt/configs/model_config.py +66 -29
  11. sglang/srt/constrained/base_grammar_backend.py +5 -2
  12. sglang/srt/constrained/llguidance_backend.py +9 -8
  13. sglang/srt/constrained/outlines_backend.py +5 -4
  14. sglang/srt/constrained/xgrammar_backend.py +18 -18
  15. sglang/srt/conversation.py +47 -9
  16. sglang/srt/custom_op.py +38 -3
  17. sglang/srt/debug_utils.py +74 -0
  18. sglang/srt/disaggregation/common/__init__.py +1 -0
  19. sglang/srt/disaggregation/common/conn.py +407 -0
  20. sglang/srt/disaggregation/decode.py +187 -134
  21. sglang/srt/disaggregation/decode_schedule_batch_mixin.py +142 -0
  22. sglang/srt/disaggregation/fake/conn.py +4 -13
  23. sglang/srt/disaggregation/kv_events.py +412 -0
  24. sglang/srt/disaggregation/launch_lb.py +140 -0
  25. sglang/srt/disaggregation/mini_lb.py +84 -70
  26. sglang/srt/disaggregation/mooncake/conn.py +441 -140
  27. sglang/srt/disaggregation/mooncake/transfer_engine.py +31 -14
  28. sglang/srt/disaggregation/nixl/conn.py +124 -442
  29. sglang/srt/disaggregation/prefill.py +128 -44
  30. sglang/srt/disaggregation/utils.py +154 -6
  31. sglang/srt/distributed/device_communicators/pymscclpp.py +315 -0
  32. sglang/srt/distributed/parallel_state.py +52 -5
  33. sglang/srt/distributed/utils.py +3 -3
  34. sglang/srt/entrypoints/EngineBase.py +11 -0
  35. sglang/srt/entrypoints/engine.py +129 -12
  36. sglang/srt/entrypoints/http_server.py +21 -6
  37. sglang/srt/entrypoints/http_server_engine.py +5 -2
  38. sglang/srt/function_call/base_format_detector.py +302 -0
  39. sglang/srt/function_call/core_types.py +34 -0
  40. sglang/srt/function_call/deepseekv3_detector.py +205 -0
  41. sglang/srt/function_call/ebnf_composer.py +248 -0
  42. sglang/srt/function_call/function_call_parser.py +202 -0
  43. sglang/srt/function_call/llama32_detector.py +93 -0
  44. sglang/srt/function_call/mistral_detector.py +131 -0
  45. sglang/srt/function_call/pythonic_detector.py +229 -0
  46. sglang/srt/function_call/qwen25_detector.py +121 -0
  47. sglang/srt/function_call/utils.py +52 -0
  48. sglang/srt/hf_transformers_utils.py +50 -7
  49. sglang/srt/layers/attention/aiter_backend.py +878 -0
  50. sglang/srt/layers/attention/base_attn_backend.py +4 -0
  51. sglang/srt/layers/attention/cutlass_mla_backend.py +2 -19
  52. sglang/srt/layers/attention/flashattention_backend.py +166 -35
  53. sglang/srt/layers/attention/flashinfer_backend.py +45 -1
  54. sglang/srt/layers/attention/flashinfer_mla_backend.py +45 -5
  55. sglang/srt/layers/attention/flashmla_backend.py +340 -78
  56. sglang/srt/layers/attention/intel_amx_backend.py +128 -0
  57. sglang/srt/layers/attention/tbo_backend.py +232 -0
  58. sglang/srt/layers/attention/torch_native_backend.py +3 -0
  59. sglang/srt/layers/attention/triton_backend.py +247 -5
  60. sglang/srt/layers/attention/triton_ops/extend_attention.py +12 -4
  61. sglang/srt/layers/attention/utils.py +2 -2
  62. sglang/srt/layers/attention/vision.py +1 -1
  63. sglang/srt/layers/communicator.py +517 -0
  64. sglang/srt/layers/dp_attention.py +6 -15
  65. sglang/srt/layers/layernorm.py +30 -19
  66. sglang/srt/layers/moe/cutlass_moe.py +370 -0
  67. sglang/srt/layers/moe/cutlass_moe_params.py +169 -0
  68. sglang/srt/layers/moe/ep_moe/kernels.py +60 -17
  69. sglang/srt/layers/moe/ep_moe/layer.py +195 -87
  70. sglang/srt/layers/moe/ep_moe/token_dispatcher.py +88 -8
  71. sglang/srt/layers/moe/fused_moe_native.py +4 -0
  72. sglang/srt/layers/moe/fused_moe_triton/configs/triton_3_2_0/E=257,N=128,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8,block_shape=[128, 128].json +146 -0
  73. sglang/srt/layers/moe/fused_moe_triton/configs/triton_3_2_0/E=257,N=128,device_name=NVIDIA_H20,dtype=fp8_w8a8,block_shape=[128, 128].json +146 -0
  74. sglang/srt/layers/moe/fused_moe_triton/configs/triton_3_2_0/E=257,N=256,device_name=NVIDIA_B200,dtype=fp8_w8a8,block_shape=[128, 128].json +146 -0
  75. sglang/srt/layers/moe/fused_moe_triton/configs/triton_3_2_0/E=257,N=256,device_name=NVIDIA_H20,dtype=fp8_w8a8,block_shape=[128, 128].json +146 -0
  76. sglang/srt/layers/moe/fused_moe_triton/configs/triton_3_2_0/E=257,N=256,device_name=NVIDIA_H200,dtype=fp8_w8a8,block_shape=[128, 128].json +146 -0
  77. sglang/srt/layers/moe/fused_moe_triton/configs/triton_3_2_0/E=8,N=7168,device_name=NVIDIA_H100_80GB_HBM3.json +146 -0
  78. sglang/srt/layers/moe/fused_moe_triton/configs/triton_3_3_1/E=257,N=256,device_name=NVIDIA_B200,dtype=fp8_w8a8,block_shape=[128, 128].json +146 -0
  79. sglang/srt/layers/moe/fused_moe_triton/configs/triton_3_3_1/E=8,N=7168,device_name=NVIDIA_H100_80GB_HBM3.json +146 -0
  80. sglang/srt/layers/moe/fused_moe_triton/fused_moe.py +220 -25
  81. sglang/srt/layers/moe/fused_moe_triton/layer.py +48 -4
  82. sglang/srt/layers/moe/topk.py +107 -24
  83. sglang/srt/layers/multimodal.py +70 -0
  84. sglang/srt/layers/quantization/__init__.py +10 -4
  85. sglang/srt/layers/quantization/blockwise_int8.py +3 -0
  86. sglang/srt/layers/quantization/compressed_tensors/compressed_tensors_moe.py +5 -0
  87. sglang/srt/layers/quantization/deep_gemm.py +60 -59
  88. sglang/srt/layers/quantization/fp8.py +113 -18
  89. sglang/srt/layers/quantization/fp8_kernel.py +118 -66
  90. sglang/srt/layers/quantization/fp8_utils.py +165 -43
  91. sglang/srt/layers/quantization/gptq.py +298 -6
  92. sglang/srt/layers/quantization/int8_kernel.py +18 -5
  93. sglang/srt/layers/quantization/modelopt_quant.py +334 -7
  94. sglang/srt/layers/quantization/moe_wna16.py +3 -0
  95. sglang/srt/layers/quantization/qoq.py +244 -0
  96. sglang/srt/layers/quantization/w8a8_fp8.py +3 -0
  97. sglang/srt/layers/quantization/w8a8_int8.py +3 -0
  98. sglang/srt/layers/rotary_embedding.py +6 -12
  99. sglang/srt/layers/sampler.py +80 -79
  100. sglang/srt/layers/utils.py +6 -0
  101. sglang/srt/lora/layers.py +12 -15
  102. sglang/srt/lora/lora.py +49 -5
  103. sglang/srt/lora/lora_manager.py +20 -8
  104. sglang/srt/lora/mem_pool.py +24 -16
  105. sglang/srt/lora/utils.py +17 -13
  106. sglang/srt/managers/data_parallel_controller.py +13 -5
  107. sglang/srt/managers/eplb_algorithms/__init__.py +63 -0
  108. sglang/srt/managers/eplb_algorithms/deepseek.py +223 -0
  109. sglang/srt/managers/eplb_algorithms/deepseek_vec.py +276 -0
  110. sglang/srt/managers/eplb_manager.py +96 -0
  111. sglang/srt/managers/expert_distribution.py +878 -56
  112. sglang/srt/managers/expert_location.py +448 -0
  113. sglang/srt/managers/expert_location_dispatch.py +108 -0
  114. sglang/srt/managers/io_struct.py +29 -5
  115. sglang/srt/managers/mm_utils.py +355 -151
  116. sglang/srt/managers/multimodal_processors/base_processor.py +299 -42
  117. sglang/srt/managers/multimodal_processors/deepseek_vl_v2.py +6 -1
  118. sglang/srt/managers/multimodal_processors/gemma3.py +15 -17
  119. sglang/srt/managers/multimodal_processors/internvl.py +18 -5
  120. sglang/srt/managers/multimodal_processors/janus_pro.py +7 -1
  121. sglang/srt/managers/multimodal_processors/kimi_vl.py +14 -32
  122. sglang/srt/managers/multimodal_processors/llava.py +3 -3
  123. sglang/srt/managers/multimodal_processors/minicpm.py +27 -32
  124. sglang/srt/managers/multimodal_processors/mllama4.py +6 -0
  125. sglang/srt/managers/multimodal_processors/phi4mm.py +87 -0
  126. sglang/srt/managers/multimodal_processors/pixtral.py +9 -9
  127. sglang/srt/managers/multimodal_processors/qwen_vl.py +35 -35
  128. sglang/srt/managers/schedule_batch.py +185 -55
  129. sglang/srt/managers/schedule_policy.py +4 -5
  130. sglang/srt/managers/scheduler.py +389 -154
  131. sglang/srt/managers/session_controller.py +1 -1
  132. sglang/srt/managers/tokenizer_manager.py +231 -39
  133. sglang/srt/managers/utils.py +0 -4
  134. sglang/srt/mem_cache/base_prefix_cache.py +3 -0
  135. sglang/srt/mem_cache/chunk_cache.py +3 -1
  136. sglang/srt/mem_cache/hiradix_cache.py +4 -4
  137. sglang/srt/mem_cache/memory_pool.py +74 -52
  138. sglang/srt/mem_cache/multimodal_cache.py +45 -0
  139. sglang/srt/mem_cache/radix_cache.py +58 -5
  140. sglang/srt/metrics/collector.py +11 -2
  141. sglang/srt/mm_utils.py +10 -0
  142. sglang/srt/model_executor/cuda_graph_runner.py +87 -65
  143. sglang/srt/model_executor/expert_location_updater.py +557 -0
  144. sglang/srt/model_executor/forward_batch_info.py +39 -14
  145. sglang/srt/model_executor/model_runner.py +231 -101
  146. sglang/srt/model_loader/loader.py +10 -6
  147. sglang/srt/model_loader/utils.py +67 -1
  148. sglang/srt/models/clip.py +5 -1
  149. sglang/srt/models/deepseek_nextn.py +1 -1
  150. sglang/srt/models/deepseek_v2.py +732 -403
  151. sglang/srt/models/exaone.py +8 -3
  152. sglang/srt/models/gemma3_causal.py +7 -0
  153. sglang/srt/models/gemma3_mm.py +75 -33
  154. sglang/srt/models/idefics2.py +342 -0
  155. sglang/srt/models/kimi_vl.py +4 -4
  156. sglang/srt/models/llama.py +1 -1
  157. sglang/srt/models/llama4.py +10 -2
  158. sglang/srt/models/llava.py +26 -18
  159. sglang/srt/models/mimo_mtp.py +220 -0
  160. sglang/srt/models/minicpmo.py +7 -17
  161. sglang/srt/models/minicpmv.py +3 -295
  162. sglang/srt/models/mistral.py +71 -1
  163. sglang/srt/models/mllama.py +3 -3
  164. sglang/srt/models/phi4mm.py +512 -0
  165. sglang/srt/models/qwen2.py +133 -35
  166. sglang/srt/models/qwen2_5_vl.py +5 -3
  167. sglang/srt/models/qwen2_eagle.py +4 -1
  168. sglang/srt/models/qwen2_moe.py +206 -69
  169. sglang/srt/models/qwen2_vl.py +3 -3
  170. sglang/srt/models/qwen3.py +92 -19
  171. sglang/srt/models/qwen3_moe.py +457 -55
  172. sglang/srt/models/registry.py +9 -1
  173. sglang/srt/models/siglip.py +294 -0
  174. sglang/srt/models/transformers.py +291 -0
  175. sglang/srt/openai_api/adapter.py +114 -40
  176. sglang/srt/openai_api/protocol.py +37 -2
  177. sglang/srt/openai_api/utils.py +172 -0
  178. sglang/srt/operations.py +189 -0
  179. sglang/srt/operations_strategy.py +207 -0
  180. sglang/srt/sampling/sampling_batch_info.py +13 -1
  181. sglang/srt/sampling/sampling_params.py +2 -1
  182. sglang/srt/server_args.py +235 -38
  183. sglang/srt/speculative/build_eagle_tree.py +8 -8
  184. sglang/srt/speculative/eagle_draft_cuda_graph_runner.py +8 -11
  185. sglang/srt/speculative/eagle_draft_extend_cuda_graph_runner.py +253 -0
  186. sglang/srt/speculative/eagle_utils.py +181 -90
  187. sglang/srt/speculative/eagle_worker.py +146 -21
  188. sglang/srt/two_batch_overlap.py +635 -0
  189. sglang/srt/utils.py +197 -19
  190. sglang/test/runners.py +16 -7
  191. sglang/test/send_one.py +4 -0
  192. sglang/test/test_cutlass_moe.py +278 -0
  193. sglang/test/test_fp4_moe.py +248 -0
  194. sglang/test/test_utils.py +81 -42
  195. sglang/utils.py +2 -2
  196. sglang/version.py +1 -1
  197. {sglang-0.4.6.post4.dist-info → sglang-0.4.7.dist-info}/METADATA +31 -19
  198. sglang-0.4.7.dist-info/RECORD +699 -0
  199. {sglang-0.4.6.post4.dist-info → sglang-0.4.7.dist-info}/WHEEL +1 -1
  200. sglang/srt/function_call_parser.py +0 -858
  201. sglang/srt/platforms/interface.py +0 -371
  202. sglang-0.4.6.post4.dist-info/RECORD +0 -646
  203. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=1,N=14336,device_name=NVIDIA_A100-SXM4-80GB,dtype=int8_w8a16.json → triton_3_1_0/E=1,N=14336,device_name=NVIDIA_A100-SXM4-80GB,dtype=int8_w8a16.json} +0 -0
  204. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=1,N=14336,device_name=NVIDIA_A100-SXM4-80GB.json → triton_3_1_0/E=1,N=14336,device_name=NVIDIA_A100-SXM4-80GB.json} +0 -0
  205. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=1,N=1792,device_name=NVIDIA_A100-SXM4-80GB,dtype=int8_w8a16.json → triton_3_1_0/E=1,N=1792,device_name=NVIDIA_A100-SXM4-80GB,dtype=int8_w8a16.json} +0 -0
  206. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=1,N=1792,device_name=NVIDIA_A100-SXM4-80GB.json → triton_3_1_0/E=1,N=1792,device_name=NVIDIA_A100-SXM4-80GB.json} +0 -0
  207. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=1,N=3072,device_name=NVIDIA_A100-SXM4-80GB,dtype=int8_w8a16.json → triton_3_1_0/E=1,N=3072,device_name=NVIDIA_A100-SXM4-80GB,dtype=int8_w8a16.json} +0 -0
  208. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=1,N=3072,device_name=NVIDIA_H100_80GB_HBM3,dtype=int8_w8a16.json → triton_3_1_0/E=1,N=3072,device_name=NVIDIA_H100_80GB_HBM3,dtype=int8_w8a16.json} +0 -0
  209. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=1,N=3072,device_name=NVIDIA_H100_80GB_HBM3.json → triton_3_1_0/E=1,N=3072,device_name=NVIDIA_H100_80GB_HBM3.json} +0 -0
  210. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=1,N=3584,device_name=NVIDIA_A100-SXM4-80GB,dtype=int8_w8a16.json → triton_3_1_0/E=1,N=3584,device_name=NVIDIA_A100-SXM4-80GB,dtype=int8_w8a16.json} +0 -0
  211. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=1,N=3584,device_name=NVIDIA_A100-SXM4-80GB.json → triton_3_1_0/E=1,N=3584,device_name=NVIDIA_A100-SXM4-80GB.json} +0 -0
  212. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=1,N=7168,device_name=NVIDIA_A100-SXM4-80GB,dtype=int8_w8a16.json → triton_3_1_0/E=1,N=7168,device_name=NVIDIA_A100-SXM4-80GB,dtype=int8_w8a16.json} +0 -0
  213. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=1,N=7168,device_name=NVIDIA_A100-SXM4-80GB.json → triton_3_1_0/E=1,N=7168,device_name=NVIDIA_A100-SXM4-80GB.json} +0 -0
  214. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=144,N=512,device_name=NVIDIA_H100_80GB_HBM3.json → triton_3_1_0/E=144,N=512,device_name=NVIDIA_H100_80GB_HBM3.json} +0 -0
  215. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=16,N=1024,device_name=NVIDIA_H100_80GB_HBM3.json → triton_3_1_0/E=16,N=1024,device_name=NVIDIA_H100_80GB_HBM3.json} +0 -0
  216. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=16,N=1024,device_name=NVIDIA_H200.json → triton_3_1_0/E=16,N=1024,device_name=NVIDIA_H200.json} +0 -0
  217. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=16,N=1344,device_name=NVIDIA_A100-SXM4-40GB.json → triton_3_1_0/E=16,N=1344,device_name=NVIDIA_A100-SXM4-40GB.json} +0 -0
  218. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=16,N=1344,device_name=NVIDIA_A100-SXM4-80GB.json → triton_3_1_0/E=16,N=1344,device_name=NVIDIA_A100-SXM4-80GB.json} +0 -0
  219. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=16,N=1344,device_name=NVIDIA_H100_80GB_HBM3.json → triton_3_1_0/E=16,N=1344,device_name=NVIDIA_H100_80GB_HBM3.json} +0 -0
  220. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=16,N=14336,device_name=NVIDIA_A100-SXM4-80GB,dtype=int8_w8a16.json → triton_3_1_0/E=16,N=14336,device_name=NVIDIA_A100-SXM4-80GB,dtype=int8_w8a16.json} +0 -0
  221. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=16,N=14336,device_name=NVIDIA_A100-SXM4-80GB.json → triton_3_1_0/E=16,N=14336,device_name=NVIDIA_A100-SXM4-80GB.json} +0 -0
  222. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=16,N=1792,device_name=NVIDIA_A100-SXM4-80GB,dtype=int8_w8a16.json → triton_3_1_0/E=16,N=1792,device_name=NVIDIA_A100-SXM4-80GB,dtype=int8_w8a16.json} +0 -0
  223. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=16,N=1792,device_name=NVIDIA_A100-SXM4-80GB.json → triton_3_1_0/E=16,N=1792,device_name=NVIDIA_A100-SXM4-80GB.json} +0 -0
  224. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=16,N=2048,device_name=NVIDIA_H100_80GB_HBM3.json → triton_3_1_0/E=16,N=2048,device_name=NVIDIA_H100_80GB_HBM3.json} +0 -0
  225. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=16,N=2688,device_name=NVIDIA_A100-SXM4-80GB.json → triton_3_1_0/E=16,N=2688,device_name=NVIDIA_A100-SXM4-80GB.json} +0 -0
  226. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=16,N=2688,device_name=NVIDIA_H100_80GB_HBM3.json → triton_3_1_0/E=16,N=2688,device_name=NVIDIA_H100_80GB_HBM3.json} +0 -0
  227. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=16,N=3072,device_name=NVIDIA_A100-SXM4-80GB,dtype=int8_w8a16.json → triton_3_1_0/E=16,N=3072,device_name=NVIDIA_A100-SXM4-80GB,dtype=int8_w8a16.json} +0 -0
  228. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=16,N=3072,device_name=NVIDIA_H100_80GB_HBM3,dtype=int8_w8a16.json → triton_3_1_0/E=16,N=3072,device_name=NVIDIA_H100_80GB_HBM3,dtype=int8_w8a16.json} +0 -0
  229. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=16,N=3200,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8.json → triton_3_1_0/E=16,N=3200,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8.json} +0 -0
  230. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=16,N=3584,device_name=NVIDIA_A100-SXM4-80GB,dtype=int8_w8a16.json → triton_3_1_0/E=16,N=3584,device_name=NVIDIA_A100-SXM4-80GB,dtype=int8_w8a16.json} +0 -0
  231. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=16,N=3584,device_name=NVIDIA_A100-SXM4-80GB.json → triton_3_1_0/E=16,N=3584,device_name=NVIDIA_A100-SXM4-80GB.json} +0 -0
  232. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=16,N=6400,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8.json → triton_3_1_0/E=16,N=6400,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8.json} +0 -0
  233. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=16,N=7168,device_name=NVIDIA_A100-SXM4-80GB,dtype=int8_w8a16.json → triton_3_1_0/E=16,N=7168,device_name=NVIDIA_A100-SXM4-80GB,dtype=int8_w8a16.json} +0 -0
  234. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=16,N=7168,device_name=NVIDIA_A100-SXM4-80GB.json → triton_3_1_0/E=16,N=7168,device_name=NVIDIA_A100-SXM4-80GB.json} +0 -0
  235. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=16,N=7168,device_name=NVIDIA_H100_80GB_HBM3,dtype=int8_w8a16.json → triton_3_1_0/E=16,N=7168,device_name=NVIDIA_H100_80GB_HBM3,dtype=int8_w8a16.json} +0 -0
  236. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=16,N=800,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8.json → triton_3_1_0/E=16,N=800,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8.json} +0 -0
  237. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=160,N=192,device_name=NVIDIA_A800-SXM4-80GB.json → triton_3_1_0/E=160,N=192,device_name=NVIDIA_A800-SXM4-80GB.json} +0 -0
  238. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=20,N=2048,device_name=NVIDIA_H100_80GB_HBM3.json → triton_3_1_0/E=20,N=2048,device_name=NVIDIA_H100_80GB_HBM3.json} +0 -0
  239. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=24,N=1024,device_name=NVIDIA_H100_80GB_HBM3.json → triton_3_1_0/E=24,N=1024,device_name=NVIDIA_H100_80GB_HBM3.json} +0 -0
  240. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=256,N=128,device_name=NVIDIA_A100-SXM4-80GB,dtype=int8_w8a8,block_shape=[128, 128].json → triton_3_1_0/E=256,N=128,device_name=NVIDIA_A100-SXM4-80GB,dtype=int8_w8a8,block_shape=[128, 128].json} +0 -0
  241. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=256,N=128,device_name=NVIDIA_A100-SXM4-80GB,dtype=int8_w8a8.json → triton_3_1_0/E=256,N=128,device_name=NVIDIA_A100-SXM4-80GB,dtype=int8_w8a8.json} +0 -0
  242. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=256,N=128,device_name=NVIDIA_A800-SXM4-80GB,dtype=int8_w8a8,block_shape=[128, 128].json → triton_3_1_0/E=256,N=128,device_name=NVIDIA_A800-SXM4-80GB,dtype=int8_w8a8,block_shape=[128, 128].json} +0 -0
  243. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=256,N=128,device_name=NVIDIA_A800-SXM4-80GB,dtype=int8_w8a8.json → triton_3_1_0/E=256,N=128,device_name=NVIDIA_A800-SXM4-80GB,dtype=int8_w8a8.json} +0 -0
  244. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=256,N=128,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8,block_shape=[128, 128].json → triton_3_1_0/E=256,N=128,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8,block_shape=[128, 128].json} +0 -0
  245. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=256,N=128,device_name=NVIDIA_H20,block_shape=[128, 128].json → triton_3_1_0/E=256,N=128,device_name=NVIDIA_H20,block_shape=[128, 128].json} +0 -0
  246. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=256,N=128,device_name=NVIDIA_H20,dtype=fp8_w8a8,block_shape=[128, 128].json → triton_3_1_0/E=256,N=128,device_name=NVIDIA_H20,dtype=fp8_w8a8,block_shape=[128, 128].json} +0 -0
  247. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=256,N=128,device_name=NVIDIA_L20Y,dtype=fp8_w8a8,block_shape=[128, 128].json → triton_3_1_0/E=256,N=128,device_name=NVIDIA_L20Y,dtype=fp8_w8a8,block_shape=[128, 128].json} +0 -0
  248. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=256,N=256,device_name=AMD_Instinct_MI300X,dtype=fp8_w8a8,block_shape=[128, 128].json → triton_3_1_0/E=256,N=256,device_name=AMD_Instinct_MI300X,dtype=fp8_w8a8,block_shape=[128, 128].json} +0 -0
  249. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=256,N=256,device_name=AMD_Instinct_MI325X,dtype=fp8_w8a8,block_shape=[128, 128].json → triton_3_1_0/E=256,N=256,device_name=AMD_Instinct_MI325X,dtype=fp8_w8a8,block_shape=[128, 128].json} +0 -0
  250. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=256,N=256,device_name=AMD_Radeon_Graphics,dtype=fp8_w8a8,block_shape=[128, 128].json → triton_3_1_0/E=256,N=256,device_name=AMD_Radeon_Graphics,dtype=fp8_w8a8,block_shape=[128, 128].json} +0 -0
  251. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=256,N=256,device_name=NVIDIA_B200,dtype=fp8_w8a8,block_shape=[128, 128].json → triton_3_1_0/E=256,N=256,device_name=NVIDIA_B200,dtype=fp8_w8a8,block_shape=[128, 128].json} +0 -0
  252. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=256,N=256,device_name=NVIDIA_H20,dtype=int8_w8a8,block_shape=[128, 128].json → triton_3_1_0/E=256,N=256,device_name=NVIDIA_H20,dtype=int8_w8a8,block_shape=[128, 128].json} +0 -0
  253. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=256,N=256,device_name=NVIDIA_H200,dtype=fp8_w8a8,block_shape=[128, 128].json → triton_3_1_0/E=256,N=256,device_name=NVIDIA_H200,dtype=fp8_w8a8,block_shape=[128, 128].json} +0 -0
  254. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=256,N=64,device_name=NVIDIA_A800-SXM4-80GB.json → triton_3_1_0/E=256,N=64,device_name=NVIDIA_A800-SXM4-80GB.json} +0 -0
  255. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=256,N=64,device_name=NVIDIA_L20,dtype=int8_w8a8.json → triton_3_1_0/E=256,N=64,device_name=NVIDIA_L20,dtype=int8_w8a8.json} +0 -0
  256. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=256,N=64,device_name=NVIDIA_L40S,dtype=int8_w8a8.json → triton_3_1_0/E=256,N=64,device_name=NVIDIA_L40S,dtype=int8_w8a8.json} +0 -0
  257. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=64,N=1024,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8,block_shape=[128, 128].json → triton_3_1_0/E=64,N=1024,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8,block_shape=[128, 128].json} +0 -0
  258. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=64,N=1280,device_name=NVIDIA_A100-SXM4-80GB.json → triton_3_1_0/E=64,N=1280,device_name=NVIDIA_A100-SXM4-80GB.json} +0 -0
  259. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=64,N=1280,device_name=NVIDIA_A800-SXM4-80GB.json → triton_3_1_0/E=64,N=1280,device_name=NVIDIA_A800-SXM4-80GB.json} +0 -0
  260. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=64,N=1280,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8.json → triton_3_1_0/E=64,N=1280,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8.json} +0 -0
  261. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=64,N=1280,device_name=NVIDIA_H100_80GB_HBM3.json → triton_3_1_0/E=64,N=1280,device_name=NVIDIA_H100_80GB_HBM3.json} +0 -0
  262. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=64,N=1280,device_name=NVIDIA_H200,dtype=fp8_w8a8.json → triton_3_1_0/E=64,N=1280,device_name=NVIDIA_H200,dtype=fp8_w8a8.json} +0 -0
  263. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=64,N=1280,device_name=NVIDIA_H200.json → triton_3_1_0/E=64,N=1280,device_name=NVIDIA_H200.json} +0 -0
  264. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=64,N=2560,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8.json → triton_3_1_0/E=64,N=2560,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8.json} +0 -0
  265. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=64,N=2560,device_name=NVIDIA_H200,dtype=fp8_w8a8.json → triton_3_1_0/E=64,N=2560,device_name=NVIDIA_H200,dtype=fp8_w8a8.json} +0 -0
  266. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=64,N=2560,device_name=NVIDIA_H200.json → triton_3_1_0/E=64,N=2560,device_name=NVIDIA_H200.json} +0 -0
  267. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=64,N=320,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8.json → triton_3_1_0/E=64,N=320,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8.json} +0 -0
  268. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=64,N=320,device_name=NVIDIA_H100_80GB_HBM3.json → triton_3_1_0/E=64,N=320,device_name=NVIDIA_H100_80GB_HBM3.json} +0 -0
  269. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=64,N=320,device_name=NVIDIA_H200,dtype=fp8_w8a8.json → triton_3_1_0/E=64,N=320,device_name=NVIDIA_H200,dtype=fp8_w8a8.json} +0 -0
  270. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=64,N=320,device_name=NVIDIA_H200.json → triton_3_1_0/E=64,N=320,device_name=NVIDIA_H200.json} +0 -0
  271. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=64,N=512,device_name=NVIDIA_H200,dtype=fp8_w8a8,block_shape=[128, 128].json → triton_3_1_0/E=64,N=512,device_name=NVIDIA_H200,dtype=fp8_w8a8,block_shape=[128, 128].json} +0 -0
  272. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=64,N=640,device_name=NVIDIA_A100-SXM4-80GB.json → triton_3_1_0/E=64,N=640,device_name=NVIDIA_A100-SXM4-80GB.json} +0 -0
  273. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=64,N=640,device_name=NVIDIA_A800-SXM4-80GB.json → triton_3_1_0/E=64,N=640,device_name=NVIDIA_A800-SXM4-80GB.json} +0 -0
  274. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=64,N=640,device_name=NVIDIA_GeForce_RTX_4090,dtype=fp8_w8a8.json → triton_3_1_0/E=64,N=640,device_name=NVIDIA_GeForce_RTX_4090,dtype=fp8_w8a8.json} +0 -0
  275. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=64,N=640,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8.json → triton_3_1_0/E=64,N=640,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8.json} +0 -0
  276. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=64,N=640,device_name=NVIDIA_H100_80GB_HBM3.json → triton_3_1_0/E=64,N=640,device_name=NVIDIA_H100_80GB_HBM3.json} +0 -0
  277. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=64,N=640,device_name=NVIDIA_H200,dtype=fp8_w8a8.json → triton_3_1_0/E=64,N=640,device_name=NVIDIA_H200,dtype=fp8_w8a8.json} +0 -0
  278. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=64,N=640,device_name=NVIDIA_H200.json → triton_3_1_0/E=64,N=640,device_name=NVIDIA_H200.json} +0 -0
  279. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=8,N=14336,device_name=AMD_Instinct_MI300X.json → triton_3_1_0/E=8,N=14336,device_name=AMD_Instinct_MI300X.json} +0 -0
  280. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=8,N=14336,device_name=AMD_Instinct_MI325X.json → triton_3_1_0/E=8,N=14336,device_name=AMD_Instinct_MI325X.json} +0 -0
  281. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=8,N=14336,device_name=AMD_Radeon_Graphics.json → triton_3_1_0/E=8,N=14336,device_name=AMD_Radeon_Graphics.json} +0 -0
  282. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=8,N=14336,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8.json → triton_3_1_0/E=8,N=14336,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8.json} +0 -0
  283. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=8,N=14336,device_name=NVIDIA_H200,dtype=fp8_w8a8.json → triton_3_1_0/E=8,N=14336,device_name=NVIDIA_H200,dtype=fp8_w8a8.json} +0 -0
  284. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=8,N=14336,device_name=NVIDIA_H200.json → triton_3_1_0/E=8,N=14336,device_name=NVIDIA_H200.json} +0 -0
  285. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=8,N=1792,device_name=AMD_Instinct_MI300X.json → triton_3_1_0/E=8,N=1792,device_name=AMD_Instinct_MI300X.json} +0 -0
  286. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=8,N=1792,device_name=AMD_Instinct_MI325X.json → triton_3_1_0/E=8,N=1792,device_name=AMD_Instinct_MI325X.json} +0 -0
  287. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=8,N=1792,device_name=AMD_Radeon_Graphics.json → triton_3_1_0/E=8,N=1792,device_name=AMD_Radeon_Graphics.json} +0 -0
  288. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=8,N=1792,device_name=NVIDIA_A100-SXM4-40GB.json → triton_3_1_0/E=8,N=1792,device_name=NVIDIA_A100-SXM4-40GB.json} +0 -0
  289. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=8,N=1792,device_name=NVIDIA_A100-SXM4-80GB.json → triton_3_1_0/E=8,N=1792,device_name=NVIDIA_A100-SXM4-80GB.json} +0 -0
  290. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=8,N=1792,device_name=NVIDIA_H100_80GB_HBM3.json → triton_3_1_0/E=8,N=1792,device_name=NVIDIA_H100_80GB_HBM3.json} +0 -0
  291. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=8,N=1792,device_name=NVIDIA_H200,dtype=fp8_w8a8.json → triton_3_1_0/E=8,N=1792,device_name=NVIDIA_H200,dtype=fp8_w8a8.json} +0 -0
  292. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=8,N=1792,device_name=NVIDIA_H200.json → triton_3_1_0/E=8,N=1792,device_name=NVIDIA_H200.json} +0 -0
  293. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=8,N=2048,device_name=NVIDIA_A100-SXM4-80GB.json → triton_3_1_0/E=8,N=2048,device_name=NVIDIA_A100-SXM4-80GB.json} +0 -0
  294. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=8,N=2048,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8.json → triton_3_1_0/E=8,N=2048,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8.json} +0 -0
  295. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=8,N=2048,device_name=NVIDIA_H100_80GB_HBM3.json → triton_3_1_0/E=8,N=2048,device_name=NVIDIA_H100_80GB_HBM3.json} +0 -0
  296. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=8,N=2048,device_name=NVIDIA_H200,dtype=fp8_w8a8.json → triton_3_1_0/E=8,N=2048,device_name=NVIDIA_H200,dtype=fp8_w8a8.json} +0 -0
  297. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=8,N=2048,device_name=NVIDIA_H200.json → triton_3_1_0/E=8,N=2048,device_name=NVIDIA_H200.json} +0 -0
  298. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=8,N=3584,device_name=AMD_Instinct_MI300X.json → triton_3_1_0/E=8,N=3584,device_name=AMD_Instinct_MI300X.json} +0 -0
  299. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=8,N=3584,device_name=AMD_Instinct_MI325X.json → triton_3_1_0/E=8,N=3584,device_name=AMD_Instinct_MI325X.json} +0 -0
  300. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=8,N=3584,device_name=AMD_Radeon_Graphics.json → triton_3_1_0/E=8,N=3584,device_name=AMD_Radeon_Graphics.json} +0 -0
  301. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=8,N=3584,device_name=NVIDIA_A100-SXM4-40GB.json → triton_3_1_0/E=8,N=3584,device_name=NVIDIA_A100-SXM4-40GB.json} +0 -0
  302. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=8,N=3584,device_name=NVIDIA_A100-SXM4-80GB.json → triton_3_1_0/E=8,N=3584,device_name=NVIDIA_A100-SXM4-80GB.json} +0 -0
  303. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=8,N=3584,device_name=NVIDIA_GeForce_RTX_4090,dtype=fp8_w8a8.json → triton_3_1_0/E=8,N=3584,device_name=NVIDIA_GeForce_RTX_4090,dtype=fp8_w8a8.json} +0 -0
  304. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=8,N=3584,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8.json → triton_3_1_0/E=8,N=3584,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8.json} +0 -0
  305. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=8,N=3584,device_name=NVIDIA_H100_80GB_HBM3.json → triton_3_1_0/E=8,N=3584,device_name=NVIDIA_H100_80GB_HBM3.json} +0 -0
  306. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=8,N=3584,device_name=NVIDIA_H200,dtype=fp8_w8a8.json → triton_3_1_0/E=8,N=3584,device_name=NVIDIA_H200,dtype=fp8_w8a8.json} +0 -0
  307. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=8,N=3584,device_name=NVIDIA_H200.json → triton_3_1_0/E=8,N=3584,device_name=NVIDIA_H200.json} +0 -0
  308. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=8,N=3584,device_name=NVIDIA_L40S.json → triton_3_1_0/E=8,N=3584,device_name=NVIDIA_L40S.json} +0 -0
  309. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=8,N=4096,device_name=AMD_Instinct_MI300X,dtype=fp8_w8a8.json → triton_3_1_0/E=8,N=4096,device_name=AMD_Instinct_MI300X,dtype=fp8_w8a8.json} +0 -0
  310. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=8,N=4096,device_name=AMD_Instinct_MI325X,dtype=fp8_w8a8.json → triton_3_1_0/E=8,N=4096,device_name=AMD_Instinct_MI325X,dtype=fp8_w8a8.json} +0 -0
  311. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=8,N=4096,device_name=AMD_Radeon_Graphics,dtype=fp8_w8a8.json → triton_3_1_0/E=8,N=4096,device_name=AMD_Radeon_Graphics,dtype=fp8_w8a8.json} +0 -0
  312. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=8,N=4096,device_name=NVIDIA_A100-SXM4-80GB.json → triton_3_1_0/E=8,N=4096,device_name=NVIDIA_A100-SXM4-80GB.json} +0 -0
  313. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=8,N=4096,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8.json → triton_3_1_0/E=8,N=4096,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8.json} +0 -0
  314. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=8,N=4096,device_name=NVIDIA_H100_80GB_HBM3.json → triton_3_1_0/E=8,N=4096,device_name=NVIDIA_H100_80GB_HBM3.json} +0 -0
  315. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=8,N=4096,device_name=NVIDIA_H200,dtype=fp8_w8a8.json → triton_3_1_0/E=8,N=4096,device_name=NVIDIA_H200,dtype=fp8_w8a8.json} +0 -0
  316. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=8,N=4096,device_name=NVIDIA_H200.json → triton_3_1_0/E=8,N=4096,device_name=NVIDIA_H200.json} +0 -0
  317. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=8,N=7168,device_name=AMD_Instinct_MI300X.json → triton_3_1_0/E=8,N=7168,device_name=AMD_Instinct_MI300X.json} +0 -0
  318. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=8,N=7168,device_name=AMD_Instinct_MI325X.json → triton_3_1_0/E=8,N=7168,device_name=AMD_Instinct_MI325X.json} +0 -0
  319. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=8,N=7168,device_name=AMD_Radeon_Graphics.json → triton_3_1_0/E=8,N=7168,device_name=AMD_Radeon_Graphics.json} +0 -0
  320. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=8,N=7168,device_name=NVIDIA_A100-SXM4-80GB.json → triton_3_1_0/E=8,N=7168,device_name=NVIDIA_A100-SXM4-80GB.json} +0 -0
  321. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=8,N=7168,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8.json → triton_3_1_0/E=8,N=7168,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8.json} +0 -0
  322. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=8,N=7168,device_name=NVIDIA_H100_80GB_HBM3.json → triton_3_1_0/E=8,N=7168,device_name=NVIDIA_H100_80GB_HBM3.json} +0 -0
  323. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=8,N=7168,device_name=NVIDIA_H200,dtype=fp8_w8a8.json → triton_3_1_0/E=8,N=7168,device_name=NVIDIA_H200,dtype=fp8_w8a8.json} +0 -0
  324. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=8,N=7168,device_name=NVIDIA_H200.json → triton_3_1_0/E=8,N=7168,device_name=NVIDIA_H200.json} +0 -0
  325. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=8,N=8192,device_name=AMD_Instinct_MI300X,dtype=fp8_w8a8.json → triton_3_1_0/E=8,N=8192,device_name=AMD_Instinct_MI300X,dtype=fp8_w8a8.json} +0 -0
  326. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=8,N=8192,device_name=AMD_Instinct_MI325X,dtype=fp8_w8a8.json → triton_3_1_0/E=8,N=8192,device_name=AMD_Instinct_MI325X,dtype=fp8_w8a8.json} +0 -0
  327. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=8,N=8192,device_name=AMD_Radeon_Graphics,dtype=fp8_w8a8.json → triton_3_1_0/E=8,N=8192,device_name=AMD_Radeon_Graphics,dtype=fp8_w8a8.json} +0 -0
  328. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=8,N=8192,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8.json → triton_3_1_0/E=8,N=8192,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8.json} +0 -0
  329. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=8,N=8192,device_name=NVIDIA_H200,dtype=fp8_w8a8.json → triton_3_1_0/E=8,N=8192,device_name=NVIDIA_H200,dtype=fp8_w8a8.json} +0 -0
  330. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=128,N=192,device_name=NVIDIA_A800-SXM4-80GB.json → triton_3_2_0/E=128,N=192,device_name=NVIDIA_A800-SXM4-80GB.json} +0 -0
  331. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=128,N=192,device_name=NVIDIA_H100_80GB_HBM3.json → triton_3_2_0/E=128,N=192,device_name=NVIDIA_H100_80GB_HBM3.json} +0 -0
  332. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=128,N=192,device_name=NVIDIA_H20.json → triton_3_2_0/E=128,N=192,device_name=NVIDIA_H20.json} +0 -0
  333. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=128,N=192,device_name=NVIDIA_H200.json → triton_3_2_0/E=128,N=192,device_name=NVIDIA_H200.json} +0 -0
  334. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=128,N=384,device_name=NVIDIA_H100_80GB_HBM3.json → triton_3_2_0/E=128,N=384,device_name=NVIDIA_H100_80GB_HBM3.json} +0 -0
  335. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=128,N=384,device_name=NVIDIA_H20,dtype=fp8_w8a8,block_shape=[128, 128].json → triton_3_2_0/E=128,N=384,device_name=NVIDIA_H20,dtype=fp8_w8a8,block_shape=[128, 128].json} +0 -0
  336. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=128,N=384,device_name=NVIDIA_H20.json → triton_3_2_0/E=128,N=384,device_name=NVIDIA_H20.json} +0 -0
  337. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=128,N=384,device_name=NVIDIA_H200,dtype=fp8_w8a8,block_shape=[128, 128].json → triton_3_2_0/E=128,N=384,device_name=NVIDIA_H200,dtype=fp8_w8a8,block_shape=[128, 128].json} +0 -0
  338. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=128,N=384,device_name=NVIDIA_H200.json → triton_3_2_0/E=128,N=384,device_name=NVIDIA_H200.json} +0 -0
  339. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=128,N=512,device_name=NVIDIA_H100_80GB_HBM3.json → triton_3_2_0/E=128,N=512,device_name=NVIDIA_H100_80GB_HBM3.json} +0 -0
  340. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=128,N=768,device_name=NVIDIA_A800-SXM4-80GB.json → triton_3_2_0/E=128,N=768,device_name=NVIDIA_A800-SXM4-80GB.json} +0 -0
  341. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=128,N=768,device_name=NVIDIA_H100_80GB_HBM3.json → triton_3_2_0/E=128,N=768,device_name=NVIDIA_H100_80GB_HBM3.json} +0 -0
  342. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=128,N=768,device_name=NVIDIA_H20,dtype=fp8_w8a8,block_shape=[128, 128].json → triton_3_2_0/E=128,N=768,device_name=NVIDIA_H20,dtype=fp8_w8a8,block_shape=[128, 128].json} +0 -0
  343. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=128,N=768,device_name=NVIDIA_H20.json → triton_3_2_0/E=128,N=768,device_name=NVIDIA_H20.json} +0 -0
  344. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=128,N=768,device_name=NVIDIA_H200,dtype=fp8_w8a8,block_shape=[128, 128].json → triton_3_2_0/E=128,N=768,device_name=NVIDIA_H200,dtype=fp8_w8a8,block_shape=[128, 128].json} +0 -0
  345. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=128,N=768,device_name=NVIDIA_H200.json → triton_3_2_0/E=128,N=768,device_name=NVIDIA_H200.json} +0 -0
  346. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=128,N=96,device_name=NVIDIA_H20.json → triton_3_2_0/E=128,N=96,device_name=NVIDIA_H20.json} +0 -0
  347. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=264,N=128,device_name=NVIDIA_A800-SXM4-80GB,dtype=int8_w8a8.json → triton_3_2_0/E=264,N=128,device_name=NVIDIA_A800-SXM4-80GB,dtype=int8_w8a8.json} +0 -0
  348. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=264,N=256,device_name=NVIDIA_B200,dtype=fp8_w8a8,block_shape=[128, 128].json → triton_3_2_0/E=264,N=256,device_name=NVIDIA_B200,dtype=fp8_w8a8,block_shape=[128, 128].json} +0 -0
  349. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=264,N=256,device_name=NVIDIA_H20,dtype=fp8_w8a8,block_shape=[128, 128].json → triton_3_2_0/E=264,N=256,device_name=NVIDIA_H20,dtype=fp8_w8a8,block_shape=[128, 128].json} +0 -0
  350. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=264,N=256,device_name=NVIDIA_H200,dtype=fp8_w8a8,block_shape=[128, 128].json → triton_3_2_0/E=264,N=256,device_name=NVIDIA_H200,dtype=fp8_w8a8,block_shape=[128, 128].json} +0 -0
  351. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=272,N=128,device_name=NVIDIA_A800-SXM4-80GB,dtype=int8_w8a8.json → triton_3_2_0/E=272,N=128,device_name=NVIDIA_A800-SXM4-80GB,dtype=int8_w8a8.json} +0 -0
  352. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=272,N=128,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8,block_shape=[128, 128].json → triton_3_2_0/E=272,N=128,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8,block_shape=[128, 128].json} +0 -0
  353. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=272,N=128,device_name=NVIDIA_H20,dtype=fp8_w8a8,block_shape=[128, 128].json → triton_3_2_0/E=272,N=128,device_name=NVIDIA_H20,dtype=fp8_w8a8,block_shape=[128, 128].json} +0 -0
  354. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=272,N=64,device_name=NVIDIA_A800-SXM4-80GB.json → triton_3_2_0/E=272,N=64,device_name=NVIDIA_A800-SXM4-80GB.json} +0 -0
  355. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=288,N=64,device_name=NVIDIA_A800-SXM4-80GB.json → triton_3_2_0/E=288,N=64,device_name=NVIDIA_A800-SXM4-80GB.json} +0 -0
  356. /sglang/srt/models/{xiaomi_mimo.py → mimo.py} +0 -0
  357. {sglang-0.4.6.post4.dist-info → sglang-0.4.7.dist-info}/licenses/LICENSE +0 -0
  358. {sglang-0.4.6.post4.dist-info → sglang-0.4.7.dist-info}/top_level.txt +0 -0
@@ -9,6 +9,8 @@ import queue
9
9
  import socket
10
10
  import struct
11
11
  import threading
12
+ import time
13
+ from collections import defaultdict
12
14
  from functools import cache
13
15
  from typing import Dict, List, Optional, Tuple, Union
14
16
 
@@ -27,30 +29,33 @@ from sglang.srt.disaggregation.base.conn import (
27
29
  KVPoll,
28
30
  )
29
31
  from sglang.srt.disaggregation.mooncake.transfer_engine import MooncakeTransferEngine
30
- from sglang.srt.disaggregation.utils import DisaggregationMode
32
+ from sglang.srt.disaggregation.utils import (
33
+ DisaggregationMode,
34
+ FastQueue,
35
+ group_concurrent_contiguous,
36
+ )
31
37
  from sglang.srt.server_args import ServerArgs
32
- from sglang.srt.utils import get_free_port, get_ip, get_local_ip_by_remote
38
+ from sglang.srt.utils import (
39
+ get_free_port,
40
+ get_int_env_var,
41
+ get_ip,
42
+ get_local_ip_by_remote,
43
+ )
33
44
 
34
45
  logger = logging.getLogger(__name__)
35
46
 
36
47
 
37
- def group_concurrent_contiguous(
38
- src_indices: npt.NDArray[np.int64], dst_indices: npt.NDArray[np.int64]
39
- ) -> Tuple[List[npt.NDArray[np.int64]], List[npt.NDArray[np.int64]]]:
40
- """Vectorised NumPy implementation."""
41
- if src_indices.size == 0:
42
- return [], []
43
-
44
- brk = np.where((np.diff(src_indices) != 1) | (np.diff(dst_indices) != 1))[0] + 1
45
- src_groups = np.split(src_indices, brk)
46
- dst_groups = np.split(dst_indices, brk)
47
-
48
- src_groups = [g.tolist() for g in src_groups]
49
- dst_groups = [g.tolist() for g in dst_groups]
48
+ class KVTransferError(Exception):
49
+ def __init__(self, bootstrap_room: int, failure_reason: str):
50
+ super().__init__(failure_reason)
51
+ self.bootstrap_room = bootstrap_room
52
+ self.failure_reason = failure_reason
50
53
 
51
- return src_groups, dst_groups
54
+ def __str__(self):
55
+ return f"KVTransferError(bootstrap_room={self.bootstrap_room}): {self.failure_reason}"
52
56
 
53
57
 
58
+ # prefill
54
59
  @dataclasses.dataclass
55
60
  class TransferKVChunk:
56
61
  room: int
@@ -60,6 +65,7 @@ class TransferKVChunk:
60
65
  prefill_aux_index: Optional[int]
61
66
 
62
67
 
68
+ # decode
63
69
  @dataclasses.dataclass
64
70
  class TransferInfo:
65
71
  room: int
@@ -93,6 +99,7 @@ class TransferInfo:
93
99
  )
94
100
 
95
101
 
102
+ # decode
96
103
  @dataclasses.dataclass
97
104
  class KVArgsRegisterInfo:
98
105
  room: str
@@ -145,18 +152,55 @@ class MooncakeKVManager(BaseKVManager):
145
152
  self.server_socket = zmq.Context().socket(zmq.PULL)
146
153
  self.register_buffer_to_engine()
147
154
  if self.disaggregation_mode == DisaggregationMode.PREFILL:
148
- self.transfer_queue = queue.Queue()
149
155
  self.transfer_infos: Dict[int, Dict[str, TransferInfo]] = {}
150
156
  self.decode_kv_args_table: Dict[str, KVArgsRegisterInfo] = {}
151
157
  self.start_prefill_thread()
152
158
  self._register_to_bootstrap()
153
-
159
+ self.session_failures = defaultdict(int)
160
+ self.failed_sessions = set()
161
+ self.session_lock = threading.Lock()
154
162
  # Determine the number of threads to use for kv sender
155
163
  cpu_count = os.cpu_count()
156
- self.executor = concurrent.futures.ThreadPoolExecutor(
157
- min(cpu_count // 4, 16)
164
+ transfer_thread_pool_size = get_int_env_var(
165
+ "SGLANG_DISAGGREGATION_THREAD_POOL_SIZE",
166
+ min(max(4, int(0.75 * cpu_count) // 8), 12),
167
+ )
168
+ transfer_queue_size = get_int_env_var("SGLANG_DISAGGREGATION_QUEUE_SIZE", 4)
169
+ self.transfer_queues: List[FastQueue] = [
170
+ FastQueue() for _ in range(transfer_queue_size)
171
+ ]
172
+ assert transfer_thread_pool_size >= transfer_queue_size, (
173
+ f"The environment variable SGLANG_DISAGGREGATION_THREAD_POOL_SIZE={transfer_thread_pool_size} must be "
174
+ f"greater than or equal to SGLANG_DISAGGREGATION_QUEUE_SIZE={transfer_queue_size}."
175
+ )
176
+ self.executors = [
177
+ concurrent.futures.ThreadPoolExecutor(
178
+ transfer_thread_pool_size // transfer_queue_size
179
+ )
180
+ for _ in range(transfer_queue_size)
181
+ ]
182
+ for queue, executor in zip(self.transfer_queues, self.executors):
183
+ threading.Thread(
184
+ target=self.transfer_worker, args=(queue, executor), daemon=True
185
+ ).start()
186
+
187
+ self.bootstrap_time_out = get_int_env_var(
188
+ "SGLANG_DISAGGREGATION_BOOTSTRAP_TIMEOUT", 30
158
189
  )
159
190
  elif self.disaggregation_mode == DisaggregationMode.DECODE:
191
+ self.heartbeat_failures = {}
192
+ self.session_pool = defaultdict(requests.Session)
193
+ self.session_pool_lock = threading.Lock()
194
+ self.addr_to_rooms_tracker = defaultdict(set)
195
+ self.connection_lock = threading.Lock()
196
+ # Heartbeat interval should be at least 2 seconds
197
+ self.heartbeat_interval = max(
198
+ float(os.getenv("SGLANG_DISAGGREGATION_HEARTBEAT_INTERVAL", 5.0)), 2.0
199
+ )
200
+ # Heartbeat failure should be at least 1
201
+ self.max_failures = max(
202
+ get_int_env_var("SGLANG_DISAGGREGATION_HEARTBEAT_MAX_FAILURE", 2), 1
203
+ )
160
204
  self.start_decode_thread()
161
205
  self.connection_pool: Dict[str, Dict[str, Union[str, int]]] = {}
162
206
  self.prefill_tp_size_table: Dict[str, int] = {}
@@ -166,6 +210,9 @@ class MooncakeKVManager(BaseKVManager):
166
210
  f"Unsupported DisaggregationMode: {self.disaggregation_mode}"
167
211
  )
168
212
 
213
+ self.failure_records: Dict[int, str] = {}
214
+ self.failure_lock = threading.Lock()
215
+
169
216
  def register_buffer_to_engine(self):
170
217
  for kv_data_ptr, kv_data_len in zip(
171
218
  self.kv_args.kv_data_ptrs, self.kv_args.kv_data_lens
@@ -189,6 +236,7 @@ class MooncakeKVManager(BaseKVManager):
189
236
  prefill_kv_indices: npt.NDArray[np.int64],
190
237
  dst_kv_ptrs: list[int],
191
238
  dst_kv_indices: npt.NDArray[np.int64],
239
+ executor: concurrent.futures.ThreadPoolExecutor,
192
240
  ):
193
241
  # Group by indices
194
242
  prefill_kv_blocks, dst_kv_blocks = group_concurrent_contiguous(
@@ -220,7 +268,7 @@ class MooncakeKVManager(BaseKVManager):
220
268
  return 0
221
269
 
222
270
  futures = [
223
- self.executor.submit(
271
+ executor.submit(
224
272
  process_layer,
225
273
  src_ptr,
226
274
  dst_ptr,
@@ -232,8 +280,6 @@ class MooncakeKVManager(BaseKVManager):
232
280
  for future in concurrent.futures.as_completed(futures):
233
281
  status = future.result()
234
282
  if status != 0:
235
- # Immediate shutdown on first error (existing tasks will finish)
236
- self.executor.shutdown(wait=False)
237
283
  for f in futures:
238
284
  f.cancel()
239
285
  return status
@@ -252,23 +298,138 @@ class MooncakeKVManager(BaseKVManager):
252
298
  self.kv_args.aux_data_ptrs[0] + prefill_aux_index * aux_item_len
253
299
  )
254
300
  decode_aux_addr = dst_aux_ptrs[0] + dst_aux_index * aux_item_len
255
- # TODO: mooncake transfer engine can do async transfer. Do async later
256
- # Not sure about the amount of aux data, maybe transfer it by zmq is more effective
257
301
  status = self.engine.transfer_sync(
258
302
  mooncake_session_id, prefill_aux_addr, decode_aux_addr, aux_item_len
259
303
  )
260
304
  return status
261
305
 
262
- def sync_status_to_decode_endpoint(self, remote: str, dst_port: int, room: int):
306
+ def sync_status_to_decode_endpoint(
307
+ self, remote: str, dst_port: int, room: int, status: int
308
+ ):
263
309
  if ":" in remote:
264
310
  remote = remote.split(":")[0]
265
311
  self._connect("tcp://" + remote + ":" + str(dst_port)).send_multipart(
266
312
  [
267
313
  str(room).encode("ascii"),
268
- str(self.check_status(room)).encode("ascii"),
314
+ str(status).encode("ascii"),
269
315
  ]
270
316
  )
271
317
 
318
+ def transfer_worker(
319
+ self, queue: FastQueue, executor: concurrent.futures.ThreadPoolExecutor
320
+ ):
321
+ while True:
322
+ try:
323
+ kv_chunk: TransferKVChunk = queue.get()
324
+ reqs_to_be_processed = (
325
+ self.transfer_infos[kv_chunk.room].values()
326
+ if kv_chunk.room in self.transfer_infos
327
+ else []
328
+ )
329
+ polls = []
330
+ dst_ranks_infos = []
331
+ for req in reqs_to_be_processed:
332
+ if not req.is_dummy:
333
+ # Early exit if the request has failed
334
+ with self.session_lock:
335
+ if req.mooncake_session_id in self.failed_sessions:
336
+ self.record_failure(
337
+ kv_chunk.room,
338
+ f"Decode instance could be dead, remote mooncake session {req.mooncake_session_id} is not alive",
339
+ )
340
+ self.update_status(kv_chunk.room, KVPoll.Failed)
341
+ self.sync_status_to_decode_endpoint(
342
+ req.endpoint,
343
+ req.dst_port,
344
+ req.room,
345
+ KVPoll.Failed,
346
+ )
347
+ break
348
+
349
+ chunked_dst_kv_indice = req.dst_kv_indices[kv_chunk.index_slice]
350
+
351
+ # NOTE: This is temporarily a workaround to deal with the case where the prefill_kv_indices
352
+ # is mismatched with the dst_kv_indices when page size > 1, this should never happen.
353
+ if len(chunked_dst_kv_indice) < len(
354
+ kv_chunk.prefill_kv_indices
355
+ ):
356
+ kv_chunk.prefill_kv_indices = kv_chunk.prefill_kv_indices[
357
+ : len(chunked_dst_kv_indice)
358
+ ]
359
+ logger.warning(
360
+ f"len(chunked_dst_kv_indice) = {len(chunked_dst_kv_indice)}, len(kv_chunk.prefill_kv_indices) = {len(kv_chunk.prefill_kv_indices)}"
361
+ )
362
+
363
+ ret = self.send_kvcache(
364
+ req.mooncake_session_id,
365
+ kv_chunk.prefill_kv_indices,
366
+ self.decode_kv_args_table[
367
+ req.mooncake_session_id
368
+ ].dst_kv_ptrs,
369
+ chunked_dst_kv_indice,
370
+ executor,
371
+ )
372
+ if ret != 0:
373
+ with self.session_lock:
374
+ self.session_failures[req.mooncake_session_id] += 1
375
+ # Failures should never happen if the session is not dead, if the session fails once, mark it as failed
376
+ if self.session_failures[req.mooncake_session_id] >= 1:
377
+ self.failed_sessions.add(req.mooncake_session_id)
378
+ logger.error(
379
+ f"Session {req.mooncake_session_id} failed."
380
+ )
381
+ self.record_failure(
382
+ kv_chunk.room,
383
+ f"Failed to send kv chunk of {kv_chunk.room} to {req.endpoint}:{req.dst_port}",
384
+ )
385
+ self.update_status(kv_chunk.room, KVPoll.Failed)
386
+ self.sync_status_to_decode_endpoint(
387
+ req.endpoint, req.dst_port, req.room, KVPoll.Failed
388
+ )
389
+ break
390
+
391
+ if kv_chunk.is_last:
392
+ # Only the last chunk we need to send the aux data
393
+ ret = self.send_aux(
394
+ req.mooncake_session_id,
395
+ kv_chunk.prefill_aux_index,
396
+ self.decode_kv_args_table[
397
+ req.mooncake_session_id
398
+ ].dst_aux_ptrs,
399
+ req.dst_aux_index,
400
+ )
401
+ polls.append(True if ret == 0 else False)
402
+ dst_ranks_infos.append(
403
+ (req.endpoint, req.dst_port, req.room)
404
+ )
405
+
406
+ # Only sync status when all the dst ranks have received the kvcache
407
+ if len(polls) == req.required_dst_info_num:
408
+ status = KVPoll.Success if all(polls) else KVPoll.Failed
409
+ self.update_status(req.room, status)
410
+ for endpoint, dst_port, room in dst_ranks_infos:
411
+ self.sync_status_to_decode_endpoint(
412
+ endpoint, dst_port, room, status
413
+ )
414
+ else:
415
+ # Dummy request means the decode instance is not used, so its status can be marked as success directly
416
+ # Dummy request does not need to sync status to decode endpoint
417
+ if kv_chunk.is_last and req.room in self.request_status:
418
+ self.update_status(req.room, KVPoll.Success)
419
+
420
+ if (
421
+ kv_chunk.room not in self.request_status
422
+ or self.check_status(kv_chunk.room) == KVPoll.Success
423
+ ):
424
+ if kv_chunk.room in self.transfer_infos:
425
+ self.transfer_infos.pop(kv_chunk.room)
426
+
427
+ except Exception as e:
428
+ # NOTE(shangming): Remove this when we make sure the transfer thread is bug-free
429
+ raise RuntimeError(
430
+ f"Transfer thread failed because of {e}. Prefill instance with bootstrap_port={self.bootstrap_port} is dead."
431
+ )
432
+
272
433
  def start_prefill_thread(self):
273
434
  self.rank_port = get_free_port()
274
435
  self.server_socket.bind(f"tcp://{get_local_ip_by_remote()}:{self.rank_port}")
@@ -284,6 +445,11 @@ class MooncakeKVManager(BaseKVManager):
284
445
  self.decode_kv_args_table[mooncake_session_id] = (
285
446
  KVArgsRegisterInfo.from_zmq(waiting_req_bytes)
286
447
  )
448
+ with self.session_lock:
449
+ if mooncake_session_id in self.failed_sessions:
450
+ self.failed_sessions.remove(mooncake_session_id)
451
+ if mooncake_session_id in self.session_failures:
452
+ del self.session_failures[mooncake_session_id]
287
453
  logger.debug(
288
454
  f"Register KVArgs from {mooncake_session_id} successfully"
289
455
  )
@@ -301,77 +467,7 @@ class MooncakeKVManager(BaseKVManager):
301
467
  if len(self.transfer_infos[room]) == required_dst_info_num:
302
468
  self.update_status(room, KVPoll.WaitingForInput)
303
469
 
304
- def transfer_thread():
305
- # TODO: Shall we use KVPoll.Transferring state?
306
- while True:
307
- try:
308
- kv_chunk: TransferKVChunk = self.transfer_queue.get(timeout=0.01)
309
- reqs_to_be_processed = self.transfer_infos[kv_chunk.room].values()
310
- polls = []
311
- dst_ranks_infos = []
312
- for req in reqs_to_be_processed:
313
- if not req.is_dummy:
314
- chunked_dst_kv_indice = req.dst_kv_indices[
315
- kv_chunk.index_slice
316
- ]
317
- assert len(chunked_dst_kv_indice) == len(
318
- kv_chunk.prefill_kv_indices
319
- ), f"len(chunked_dst_kv_indice) = {len(chunked_dst_kv_indice)}, len(kv_chunk.prefill_kv_indices) = {len(kv_chunk.prefill_kv_indices)}"
320
-
321
- ret = self.send_kvcache(
322
- req.mooncake_session_id,
323
- kv_chunk.prefill_kv_indices,
324
- self.decode_kv_args_table[
325
- req.mooncake_session_id
326
- ].dst_kv_ptrs,
327
- chunked_dst_kv_indice,
328
- )
329
- if ret != 0:
330
- self.update_status(kv_chunk.room, KVPoll.Failed)
331
- self.sync_status_to_decode_endpoint(
332
- req.endpoint, req.dst_port, req.room
333
- )
334
- continue
335
-
336
- if kv_chunk.is_last:
337
- # Only the last chunk we need to send the aux data
338
- ret = self.send_aux(
339
- req.mooncake_session_id,
340
- kv_chunk.prefill_aux_index,
341
- self.decode_kv_args_table[
342
- req.mooncake_session_id
343
- ].dst_aux_ptrs,
344
- req.dst_aux_index,
345
- )
346
- polls.append(True if ret == 0 else False)
347
- dst_ranks_infos.append(
348
- (req.endpoint, req.dst_port, req.room)
349
- )
350
-
351
- # Only sync status when all the dst ranks have received the kvcache
352
- if len(polls) == req.required_dst_info_num:
353
- self.update_status(
354
- req.room,
355
- KVPoll.Success if all(polls) else KVPoll.Failed,
356
- )
357
- for endpoint, dst_port, room in dst_ranks_infos:
358
- self.sync_status_to_decode_endpoint(
359
- endpoint, dst_port, room
360
- )
361
- else:
362
- # Dummy request means the decode instance is not used, so its status can be marked as success directly
363
- # Dummy request does not need to sync status to decode endpoint
364
- if kv_chunk.is_last:
365
- self.update_status(req.room, KVPoll.Success)
366
-
367
- if self.check_status(kv_chunk.room) == KVPoll.Success:
368
- self.transfer_infos.pop(kv_chunk.room)
369
-
370
- except queue.Empty:
371
- continue
372
-
373
470
  threading.Thread(target=bootstrap_thread).start()
374
- threading.Thread(target=transfer_thread).start()
375
471
 
376
472
  def start_decode_thread(self):
377
473
  self.rank_port = get_free_port()
@@ -382,9 +478,69 @@ class MooncakeKVManager(BaseKVManager):
382
478
  (bootstrap_room, status) = self.server_socket.recv_multipart()
383
479
  status = int(status.decode("ascii"))
384
480
  bootstrap_room = int(bootstrap_room.decode("ascii"))
481
+ if status == KVPoll.Failed:
482
+ self.record_failure(
483
+ bootstrap_room,
484
+ f"Failed to get kvcache from prefill instance, it might be dead",
485
+ )
385
486
  self.update_status(bootstrap_room, status)
386
487
 
488
+ def heartbeat_checker():
489
+ while True:
490
+ time.sleep(self.heartbeat_interval)
491
+ with self.connection_lock:
492
+ addresses = list(self.prefill_dp_size_table.keys())
493
+
494
+ for bootstrap_addr in addresses:
495
+ session = None
496
+ try:
497
+ with self.session_pool_lock:
498
+ session = self.session_pool[bootstrap_addr]
499
+ response = session.get(
500
+ f"http://{bootstrap_addr}/health",
501
+ timeout=(2, 3),
502
+ headers={"Connection": "keep-alive"},
503
+ )
504
+ if response.status_code == 200:
505
+ self.heartbeat_failures[bootstrap_addr] = 0
506
+
507
+ current_rooms = self.addr_to_rooms_tracker[
508
+ bootstrap_addr
509
+ ].copy()
510
+
511
+ for bootstrap_room in current_rooms:
512
+ # Remove KVPoll.Success requests from the tracker
513
+ if bootstrap_room not in self.request_status:
514
+ self.addr_to_rooms_tracker[bootstrap_addr].discard(
515
+ bootstrap_room
516
+ )
517
+ else:
518
+ logger.info(
519
+ f"Attempting to reconnect to {bootstrap_addr}..."
520
+ )
521
+ self.heartbeat_failures[bootstrap_addr] = (
522
+ self.heartbeat_failures.get(bootstrap_addr, 0) + 1
523
+ )
524
+ with self.session_pool_lock:
525
+ if bootstrap_addr in self.session_pool:
526
+ del self.session_pool[bootstrap_addr]
527
+ except Exception:
528
+ logger.info(f"Attempting to reconnect to {bootstrap_addr}...")
529
+ self.heartbeat_failures[bootstrap_addr] = (
530
+ self.heartbeat_failures.get(bootstrap_addr, 0) + 1
531
+ )
532
+
533
+ if (
534
+ self.heartbeat_failures.get(bootstrap_addr, 0)
535
+ >= self.max_failures
536
+ ):
537
+ self._handle_node_failure(bootstrap_addr)
538
+ with self.session_pool_lock:
539
+ if bootstrap_addr in self.session_pool:
540
+ del self.session_pool[bootstrap_addr]
541
+
387
542
  threading.Thread(target=decode_thread).start()
543
+ threading.Thread(target=heartbeat_checker).start()
388
544
 
389
545
  def add_transfer_request(
390
546
  self,
@@ -397,7 +553,29 @@ class MooncakeKVManager(BaseKVManager):
397
553
  assert self.disaggregation_mode == DisaggregationMode.PREFILL
398
554
  assert not is_last or (is_last and aux_index is not None)
399
555
 
400
- self.transfer_queue.put(
556
+ if (
557
+ bootstrap_room not in self.request_status
558
+ or self.check_status(bootstrap_room) == KVPoll.Failed
559
+ ):
560
+ logger.debug(
561
+ "Request with bootstrap_room=%s already failed", bootstrap_room
562
+ )
563
+ return
564
+
565
+ if bootstrap_room not in self.transfer_infos:
566
+ # This means that the current rank is a dummy rank for this request,
567
+ # and it has already been marked as success, so there is no need to
568
+ # add further chunks into the transfer queue.
569
+ return
570
+
571
+ # NOTE(shangming): sharding according to the dst_infos to make sure
572
+ # requests with the same dst_sessions will be added into the same
573
+ # queue, which enables early abort with failed sessions.
574
+ dst_infos = self.transfer_infos[bootstrap_room].keys()
575
+ session_port_sum = sum(int(session.split(":")[1]) for session in dst_infos)
576
+ shard_idx = session_port_sum % len(self.transfer_queues)
577
+
578
+ self.transfer_queues[shard_idx].put(
401
579
  TransferKVChunk(
402
580
  room=bootstrap_room,
403
581
  prefill_kv_indices=kv_indices,
@@ -406,7 +584,6 @@ class MooncakeKVManager(BaseKVManager):
406
584
  prefill_aux_index=aux_index,
407
585
  )
408
586
  )
409
- self.update_status(bootstrap_room, KVPoll.WaitingForInput)
410
587
 
411
588
  def check_status(self, bootstrap_room: int):
412
589
  return self.request_status[bootstrap_room]
@@ -415,10 +592,17 @@ class MooncakeKVManager(BaseKVManager):
415
592
  if bootstrap_room not in self.request_status:
416
593
  self.request_status[bootstrap_room] = status
417
594
  else:
418
- # NOTE: The prefill engine could recv bootstrapping first
419
- self.request_status[bootstrap_room] = max(
420
- self.request_status[bootstrap_room], status
421
- )
595
+ # NOTE: status is only allowed to be incremented unless it is KVPoll.Failed
596
+ if status == KVPoll.Failed:
597
+ self.request_status[bootstrap_room] = KVPoll.Failed
598
+ else:
599
+ self.request_status[bootstrap_room] = max(
600
+ self.request_status[bootstrap_room], status
601
+ )
602
+
603
+ def record_failure(self, bootstrap_room: int, failure_reason: str):
604
+ with self.failure_lock:
605
+ self.failure_records[bootstrap_room] = failure_reason
422
606
 
423
607
  def get_session_id(self):
424
608
  return self.engine.get_session_id()
@@ -442,15 +626,52 @@ class MooncakeKVManager(BaseKVManager):
442
626
  }
443
627
 
444
628
  try:
445
- response = requests.put(url, json=payload)
629
+ response = requests.put(url, json=payload, timeout=5)
446
630
  if response.status_code == 200:
447
631
  logger.debug("Prefill successfully registered to bootstrap server.")
448
632
  else:
449
633
  logger.error(
450
- f"Prefill Failed to connect to bootstrap server: {response.status_code}, {response.text}"
634
+ f"Prefill instance failed to connect to bootstrap server: {response.status_code}, {response.text}"
451
635
  )
452
636
  except Exception as e:
453
- logger.error(f"Prefill Failed to register to bootstrap server: {e}")
637
+ logger.error(
638
+ f"Prefill instance failed to register to bootstrap server: {e}"
639
+ )
640
+
641
+ def _handle_node_failure(self, failed_bootstrap_addr):
642
+ with self.connection_lock:
643
+ keys_to_remove = [
644
+ k for k in self.connection_pool if k.startswith(failed_bootstrap_addr)
645
+ ]
646
+ for k in keys_to_remove:
647
+ del self.connection_pool[k]
648
+ if failed_bootstrap_addr in self.prefill_tp_size_table:
649
+ del self.prefill_tp_size_table[failed_bootstrap_addr]
650
+ if failed_bootstrap_addr in self.prefill_dp_size_table:
651
+ del self.prefill_dp_size_table[failed_bootstrap_addr]
652
+
653
+ possible_affected_rooms = self.addr_to_rooms_tracker.get(
654
+ failed_bootstrap_addr, []
655
+ )
656
+ if failed_bootstrap_addr in self.addr_to_rooms_tracker:
657
+ del self.addr_to_rooms_tracker[failed_bootstrap_addr]
658
+
659
+ # Report the requests associated with the failed bootstrap addr and mark their status as KVPoll.Failed
660
+ affected_rooms = []
661
+ for room in possible_affected_rooms:
662
+ if (
663
+ room in self.request_status
664
+ and self.check_status(room) != KVPoll.Success
665
+ ):
666
+ self.record_failure(
667
+ room,
668
+ f"Losing connection with prefill instance (bootstrap_addr: {failed_bootstrap_addr})",
669
+ )
670
+ self.update_status(room, KVPoll.Failed)
671
+ affected_rooms.append(room)
672
+ logger.error(
673
+ f"Losing connection with prefill instance (bootstrap_addr: {failed_bootstrap_addr}), affected {len(affected_rooms)} requests"
674
+ )
454
675
 
455
676
 
456
677
  class MooncakeKVSender(BaseKVSender):
@@ -463,18 +684,24 @@ class MooncakeKVSender(BaseKVSender):
463
684
  self.kv_mgr.update_status(bootstrap_room, KVPoll.Bootstrapping)
464
685
  self.aux_index = None
465
686
  self.bootstrap_server_url = bootstrap_addr
466
- self.session_id = self.kv_mgr.get_session_id()
687
+ self.conclude_state = None
688
+ self.init_time = None
689
+ # inner state
690
+ self.curr_idx = 0
467
691
 
468
692
  def init(self, num_kv_indices: int, aux_index: Optional[int] = None):
469
693
  self.num_kv_indices = num_kv_indices
470
694
  self.aux_index = aux_index
695
+ self.init_time = time.time()
471
696
 
472
697
  def send(
473
698
  self,
474
699
  kv_indices: npt.NDArray[np.int64],
475
- index_slice: slice,
476
- is_last: bool,
477
700
  ):
701
+ index_slice = slice(self.curr_idx, self.curr_idx + len(kv_indices))
702
+ self.curr_idx += len(kv_indices)
703
+ is_last = self.curr_idx == self.num_kv_indices
704
+
478
705
  if not is_last:
479
706
  self.kv_mgr.add_transfer_request(
480
707
  self.bootstrap_room, kv_indices, index_slice, False
@@ -489,10 +716,42 @@ class MooncakeKVSender(BaseKVSender):
489
716
  )
490
717
 
491
718
  def poll(self) -> KVPoll:
492
- return self.kv_mgr.check_status(self.bootstrap_room)
719
+ if self.conclude_state is None:
720
+ status = self.kv_mgr.check_status(self.bootstrap_room)
721
+ if status in (KVPoll.Success, KVPoll.Failed):
722
+ self.conclude_state = status
723
+ elif status == KVPoll.Bootstrapping:
724
+ if self.init_time is not None:
725
+ now = time.time()
726
+ elapsed = now - self.init_time
727
+ if elapsed >= self.kv_mgr.bootstrap_time_out:
728
+ self.kv_mgr.record_failure(
729
+ self.bootstrap_room,
730
+ f"Request {self.bootstrap_room} timed out after {elapsed:.1f}s in KVPoll.Bootstrapping",
731
+ )
732
+ self.conclude_state = KVPoll.Failed
733
+ return KVPoll.Failed
734
+
735
+ return status
736
+ else:
737
+ return self.conclude_state
738
+
739
+ def clear(self) -> None:
740
+ if self.bootstrap_room in self.kv_mgr.request_status:
741
+ self.kv_mgr.request_status.pop(self.bootstrap_room)
493
742
 
494
743
  def failure_exception(self):
495
- raise Exception("Fake KVSender Exception")
744
+ self.clear()
745
+
746
+ # Explicitly set the status to failure since this request has failed in another rank
747
+ if self.conclude_state is None:
748
+ self.conclude_state = KVPoll.Failed
749
+
750
+ with self.kv_mgr.failure_lock:
751
+ failure_reason = self.kv_mgr.failure_records.pop(
752
+ self.bootstrap_room, "Failed due to an unknown reason from another rank"
753
+ )
754
+ raise KVTransferError(self.bootstrap_room, failure_reason)
496
755
 
497
756
 
498
757
  class MooncakeKVReceiver(BaseKVReceiver):
@@ -506,22 +765,31 @@ class MooncakeKVReceiver(BaseKVReceiver):
506
765
  mgr: MooncakeKVManager,
507
766
  bootstrap_addr: str,
508
767
  bootstrap_room: Optional[int] = None,
768
+ data_parallel_rank: Optional[int] = None,
509
769
  ):
510
770
  self.bootstrap_room = bootstrap_room
511
771
  self.bootstrap_addr = bootstrap_addr
512
772
  self.kv_mgr = mgr
513
773
  self.session_id = self.kv_mgr.get_session_id()
514
- self.kv_mgr.update_status(bootstrap_room, KVPoll.Bootstrapping)
774
+ self.kv_mgr.update_status(self.bootstrap_room, KVPoll.Bootstrapping)
775
+ self.conclude_state = None
776
+ self.data_parallel_rank = data_parallel_rank
515
777
 
516
778
  if self.bootstrap_addr not in self.kv_mgr.prefill_dp_size_table:
517
779
  self.prefill_tp_size, self.prefill_dp_size = (
518
- self._get_prefill_dp_size_from_server()
780
+ self._get_prefill_parallel_info_from_server()
519
781
  )
520
782
  if self.prefill_tp_size is None or self.prefill_dp_size is None:
521
- logger.error(
522
- f"Could not fetch prefill parallel info for bootstrap_addr: {self.bootstrap_addr}"
783
+ self.kv_mgr.record_failure(
784
+ self.bootstrap_room,
785
+ f"Could not fetch prefill parallel info from bootstrap_addr: {self.bootstrap_addr}",
523
786
  )
787
+ self.kv_mgr.update_status(self.bootstrap_room, KVPoll.Failed)
788
+ return
524
789
  else:
790
+ logger.debug(
791
+ f"Fetch prefill parallel info from [{self.bootstrap_addr}]: DP size:{self.prefill_dp_size}, TP size:{self.prefill_tp_size}"
792
+ )
525
793
  self.kv_mgr.prefill_tp_size_table[self.bootstrap_addr] = (
526
794
  self.prefill_tp_size
527
795
  )
@@ -579,7 +847,11 @@ class MooncakeKVReceiver(BaseKVReceiver):
579
847
  self.target_tp_rank = self.target_tp_ranks[0]
580
848
  self.required_dst_info_num = 1
581
849
 
582
- self.target_dp_group = bootstrap_room % self.prefill_dp_size
850
+ if self.data_parallel_rank is not None:
851
+ logger.debug(f"Targeting DP rank: {self.data_parallel_rank}")
852
+ self.target_dp_group = self.data_parallel_rank
853
+ else:
854
+ self.target_dp_group = bootstrap_room % self.prefill_dp_size
583
855
 
584
856
  # NOTE: key distinguished by bootstrap_addr, target_dp_group, and target_tp_rank
585
857
  bootstrap_key = (
@@ -599,32 +871,35 @@ class MooncakeKVReceiver(BaseKVReceiver):
599
871
  target_tp_rank == self.target_tp_rank
600
872
  or self.target_tp_rank is None
601
873
  )
874
+ logger.debug(
875
+ f"Fetched bootstrap info: {bootstrap_info} for DP {self.target_dp_group} TP {target_tp_rank}"
876
+ )
602
877
  bootstrap_infos.append(bootstrap_info)
603
878
  else:
604
- logger.error(
605
- f"Could not fetch bootstrap info for engine rank: {self.kv_mgr.kv_args.engine_rank} and target_dp_group: {self.target_dp_group}"
879
+ self.kv_mgr.record_failure(
880
+ self.bootstrap_room,
881
+ f"Could not fetch bootstrap info for engine rank: {self.kv_mgr.kv_args.engine_rank} and target_dp_group: {self.target_dp_group}",
606
882
  )
883
+ self.kv_mgr.update_status(self.bootstrap_room, KVPoll.Failed)
884
+ return
885
+
607
886
  self.bootstrap_infos = bootstrap_infos
887
+ self.kv_mgr.connection_pool[bootstrap_key] = self.bootstrap_infos
608
888
 
609
- if len(self.bootstrap_infos) == 0:
610
- logger.error(
611
- f"Could not fetch bootstrap info for engine rank: {self.kv_mgr.kv_args.engine_rank}"
612
- )
613
- else:
614
- self.kv_mgr.connection_pool[bootstrap_key] = self.bootstrap_infos
615
- # Register kv_args only once to prefill KVManager according to the info fetched from the bootstrap server
616
- self._register_kv_args()
889
+ # Register kv_args only once to prefill KVManager according to the info fetched from the bootstrap server
890
+ self._register_kv_args()
617
891
  else:
618
892
  self.bootstrap_infos = self.kv_mgr.connection_pool[bootstrap_key]
619
893
 
620
894
  assert len(self.bootstrap_infos) > 0
621
- self.kv_mgr.update_status(bootstrap_room, KVPoll.WaitingForInput)
895
+ self.kv_mgr.addr_to_rooms_tracker[self.bootstrap_addr].add(self.bootstrap_room)
896
+ self.kv_mgr.update_status(self.bootstrap_room, KVPoll.WaitingForInput)
622
897
 
623
898
  def _get_bootstrap_info_from_server(self, engine_rank, target_dp_group):
624
899
  """Fetch the bootstrap info from the bootstrap server."""
625
900
  try:
626
901
  url = f"http://{self.bootstrap_addr}/route?engine_rank={engine_rank}&target_dp_group={target_dp_group}"
627
- response = requests.get(url)
902
+ response = requests.get(url, timeout=5)
628
903
  if response.status_code == 200:
629
904
  bootstrap_info = response.json()
630
905
  return bootstrap_info
@@ -637,7 +912,7 @@ class MooncakeKVReceiver(BaseKVReceiver):
637
912
  logger.error(f"Error fetching prefill info from bootstrap: {e}")
638
913
  return None
639
914
 
640
- def _get_prefill_dp_size_from_server(self) -> int:
915
+ def _get_prefill_parallel_info_from_server(self) -> Tuple[int, int]:
641
916
  """Fetch the prefill parallel info from the bootstrap server."""
642
917
  try:
643
918
  url = f"http://{self.bootstrap_addr}/route?engine_rank={-1}&target_dp_group={-1}"
@@ -651,10 +926,10 @@ class MooncakeKVReceiver(BaseKVReceiver):
651
926
  logger.error(
652
927
  f"Failed to get prefill parallel info: {response.status_code}, {response.text}"
653
928
  )
654
- return None
929
+ return None, None
655
930
  except Exception as e:
656
931
  logger.error(f"Error fetching prefill parallel info from bootstrap: {e}")
657
- return None
932
+ return None, None
658
933
 
659
934
  def _register_kv_args(self):
660
935
  for bootstrap_info in self.bootstrap_infos:
@@ -696,9 +971,6 @@ class MooncakeKVReceiver(BaseKVReceiver):
696
971
  self.prefill_server_url = (
697
972
  f"{bootstrap_info['rank_ip']}:{bootstrap_info['rank_port']}"
698
973
  )
699
- logger.debug(
700
- f"Fetched bootstrap info: {bootstrap_info} for engine rank: {self.kv_mgr.kv_args.engine_rank}"
701
- )
702
974
  is_dummy = bootstrap_info["is_dummy"]
703
975
 
704
976
  sock, lock = self._connect("tcp://" + self.prefill_server_url)
@@ -716,10 +988,31 @@ class MooncakeKVReceiver(BaseKVReceiver):
716
988
  )
717
989
 
718
990
  def poll(self) -> KVPoll:
719
- return self.kv_mgr.check_status(self.bootstrap_room)
991
+ if self.conclude_state is None:
992
+ status = self.kv_mgr.check_status(self.bootstrap_room)
993
+ if status in (KVPoll.Success, KVPoll.Failed):
994
+ self.conclude_state = status
995
+
996
+ return status
997
+ else:
998
+ return self.conclude_state
999
+
1000
+ def clear(self) -> None:
1001
+ if self.bootstrap_room in self.kv_mgr.request_status:
1002
+ self.kv_mgr.request_status.pop(self.bootstrap_room)
720
1003
 
721
1004
  def failure_exception(self):
722
- raise Exception("Fake KVReceiver Exception")
1005
+ self.clear()
1006
+
1007
+ # Explicitly set the status to failure since this request has failed in another rank
1008
+ if self.conclude_state is None:
1009
+ self.conclude_state = KVPoll.Failed
1010
+
1011
+ with self.kv_mgr.failure_lock:
1012
+ failure_reason = self.kv_mgr.failure_records.pop(
1013
+ self.bootstrap_room, "Failed due to an unknown reason from another rank"
1014
+ )
1015
+ raise KVTransferError(self.bootstrap_room, failure_reason)
723
1016
 
724
1017
 
725
1018
  class MooncakeKVBootstrapServer(BaseKVBootstrapServer):
@@ -743,6 +1036,10 @@ class MooncakeKVBootstrapServer(BaseKVBootstrapServer):
743
1036
 
744
1037
  def _setup_routes(self):
745
1038
  self.app.router.add_route("*", "/route", self._handle_route)
1039
+ self.app.router.add_get("/health", self._handle_health_check)
1040
+
1041
+ async def _handle_health_check(self, request):
1042
+ return web.Response(text="OK", status=200)
746
1043
 
747
1044
  async def _handle_route(self, request: web.Request):
748
1045
  method = request.method
@@ -771,14 +1068,14 @@ class MooncakeKVBootstrapServer(BaseKVBootstrapServer):
771
1068
  self.dp_size = dp_size
772
1069
 
773
1070
  tp_size_per_dp_rank = tp_size // dp_size
774
- if self.tp_size_per_dp_rank == None:
1071
+ if self.tp_size_per_dp_rank is None:
775
1072
  self.tp_size_per_dp_rank = tp_size_per_dp_rank
776
1073
 
777
- # Add lock to make sure thread-safe
778
1074
  if role == "Prefill":
779
1075
  dp_group = engine_rank // tp_size_per_dp_rank
780
1076
  tp_rank_in_dp_group = engine_rank % tp_size_per_dp_rank
781
1077
 
1078
+ # Add lock to make sure thread-safe
782
1079
  async with self.lock:
783
1080
  if dp_group not in self.prefill_port_table:
784
1081
  self.prefill_port_table[dp_group] = {}
@@ -788,7 +1085,7 @@ class MooncakeKVBootstrapServer(BaseKVBootstrapServer):
788
1085
  "rank_port": rank_port,
789
1086
  }
790
1087
  logger.debug(
791
- f"Register Prefill bootstrap: {engine_rank} with rank_ip: {rank_ip} and rank_port: {rank_port}"
1088
+ f"Register prefill bootstrap: {engine_rank} with rank_ip: {rank_ip} and rank_port: {rank_port}"
792
1089
  )
793
1090
 
794
1091
  return web.Response(text="OK", status=200)
@@ -824,7 +1121,11 @@ class MooncakeKVBootstrapServer(BaseKVBootstrapServer):
824
1121
  self._loop = asyncio.new_event_loop()
825
1122
  asyncio.set_event_loop(self._loop)
826
1123
 
827
- self._runner = web.AppRunner(self.app)
1124
+ access_log = None
1125
+ if logging.getLogger(__name__).getEffectiveLevel() <= logging.DEBUG:
1126
+ access_log = self.app.logger
1127
+
1128
+ self._runner = web.AppRunner(self.app, access_log=access_log)
828
1129
  self._loop.run_until_complete(self._runner.setup())
829
1130
 
830
1131
  site = web.TCPSite(self._runner, port=self.port)