sglang 0.4.6.post5__py3-none-any.whl → 0.4.7.post1__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (359)
  1. sglang/__init__.py +2 -0
  2. sglang/api.py +7 -0
  3. sglang/bench_offline_throughput.py +10 -4
  4. sglang/bench_one_batch_server.py +67 -11
  5. sglang/bench_serving.py +86 -75
  6. sglang/lang/backend/runtime_endpoint.py +24 -1
  7. sglang/lang/interpreter.py +40 -1
  8. sglang/lang/ir.py +27 -0
  9. sglang/math_utils.py +8 -0
  10. sglang/profiler.py +167 -0
  11. sglang/srt/_custom_ops.py +34 -0
  12. sglang/srt/configs/internvl.py +8 -12
  13. sglang/srt/configs/model_config.py +33 -1
  14. sglang/srt/constrained/base_grammar_backend.py +5 -2
  15. sglang/srt/constrained/llguidance_backend.py +9 -8
  16. sglang/srt/constrained/outlines_backend.py +5 -4
  17. sglang/srt/constrained/xgrammar_backend.py +18 -18
  18. sglang/srt/conversation.py +52 -8
  19. sglang/srt/custom_op.py +38 -3
  20. sglang/srt/debug_utils.py +74 -0
  21. sglang/srt/disaggregation/base/__init__.py +1 -1
  22. sglang/srt/disaggregation/base/conn.py +25 -11
  23. sglang/srt/disaggregation/common/__init__.py +5 -0
  24. sglang/srt/disaggregation/common/conn.py +407 -0
  25. sglang/srt/disaggregation/common/utils.py +42 -0
  26. sglang/srt/disaggregation/decode.py +261 -52
  27. sglang/srt/disaggregation/fake/__init__.py +1 -1
  28. sglang/srt/disaggregation/fake/conn.py +16 -9
  29. sglang/srt/disaggregation/kv_events.py +60 -5
  30. sglang/srt/disaggregation/launch_lb.py +140 -0
  31. sglang/srt/disaggregation/mini_lb.py +29 -48
  32. sglang/srt/disaggregation/mooncake/__init__.py +1 -1
  33. sglang/srt/disaggregation/mooncake/conn.py +446 -149
  34. sglang/srt/disaggregation/mooncake/transfer_engine.py +32 -16
  35. sglang/srt/disaggregation/nixl/__init__.py +6 -1
  36. sglang/srt/disaggregation/nixl/conn.py +134 -437
  37. sglang/srt/disaggregation/prefill.py +130 -43
  38. sglang/srt/disaggregation/utils.py +127 -86
  39. sglang/srt/distributed/device_communicators/pymscclpp.py +315 -0
  40. sglang/srt/distributed/parallel_state.py +52 -5
  41. sglang/srt/entrypoints/EngineBase.py +6 -0
  42. sglang/srt/entrypoints/engine.py +116 -5
  43. sglang/srt/entrypoints/http_server.py +28 -4
  44. sglang/srt/eplb_simulator/__init__.py +1 -0
  45. sglang/srt/eplb_simulator/reader.py +51 -0
  46. sglang/srt/function_call/base_format_detector.py +138 -86
  47. sglang/srt/function_call/deepseekv3_detector.py +54 -6
  48. sglang/srt/function_call/ebnf_composer.py +33 -19
  49. sglang/srt/function_call/function_call_parser.py +27 -0
  50. sglang/srt/function_call/llama32_detector.py +33 -14
  51. sglang/srt/function_call/mistral_detector.py +73 -26
  52. sglang/srt/function_call/pythonic_detector.py +86 -20
  53. sglang/srt/function_call/qwen25_detector.py +64 -10
  54. sglang/srt/function_call/utils.py +17 -0
  55. sglang/srt/hf_transformers_utils.py +4 -0
  56. sglang/srt/layers/activation.py +19 -0
  57. sglang/srt/layers/attention/aiter_backend.py +503 -125
  58. sglang/srt/layers/attention/base_attn_backend.py +4 -0
  59. sglang/srt/layers/attention/cutlass_mla_backend.py +40 -34
  60. sglang/srt/layers/attention/flashattention_backend.py +137 -63
  61. sglang/srt/layers/attention/flashinfer_backend.py +46 -3
  62. sglang/srt/layers/attention/flashinfer_mla_backend.py +59 -25
  63. sglang/srt/layers/attention/flashmla_backend.py +2 -10
  64. sglang/srt/layers/attention/intel_amx_backend.py +128 -0
  65. sglang/srt/layers/attention/tbo_backend.py +232 -0
  66. sglang/srt/layers/attention/torch_native_backend.py +3 -0
  67. sglang/srt/layers/attention/triton_backend.py +304 -65
  68. sglang/srt/layers/attention/triton_ops/decode_attention.py +2 -7
  69. sglang/srt/layers/attention/triton_ops/extend_attention.py +12 -4
  70. sglang/srt/layers/attention/vision.py +51 -24
  71. sglang/srt/layers/communicator.py +281 -197
  72. sglang/srt/layers/dp_attention.py +6 -5
  73. sglang/srt/layers/layernorm.py +30 -19
  74. sglang/srt/layers/linear.py +0 -4
  75. sglang/srt/layers/logits_processor.py +0 -12
  76. sglang/srt/layers/moe/cutlass_moe.py +170 -7
  77. sglang/srt/layers/moe/cutlass_moe_params.py +169 -0
  78. sglang/srt/layers/moe/ep_moe/kernels.py +33 -11
  79. sglang/srt/layers/moe/ep_moe/layer.py +136 -72
  80. sglang/srt/layers/moe/ep_moe/token_dispatcher.py +24 -45
  81. sglang/srt/layers/moe/fused_moe_native.py +4 -0
  82. sglang/srt/layers/moe/fused_moe_triton/configs/triton_3_2_0/E=257,N=128,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8,block_shape=[128, 128].json +146 -0
  83. sglang/srt/layers/moe/fused_moe_triton/configs/triton_3_2_0/E=257,N=128,device_name=NVIDIA_H20,dtype=fp8_w8a8,block_shape=[128, 128].json +146 -0
  84. sglang/srt/layers/moe/fused_moe_triton/configs/triton_3_2_0/E=257,N=256,device_name=NVIDIA_B200,dtype=fp8_w8a8,block_shape=[128, 128].json +146 -0
  85. sglang/srt/layers/moe/fused_moe_triton/configs/triton_3_2_0/E=257,N=256,device_name=NVIDIA_H20,dtype=fp8_w8a8,block_shape=[128, 128].json +146 -0
  86. sglang/srt/layers/moe/fused_moe_triton/configs/triton_3_2_0/E=257,N=256,device_name=NVIDIA_H200,dtype=fp8_w8a8,block_shape=[128, 128].json +146 -0
  87. sglang/srt/layers/moe/fused_moe_triton/configs/triton_3_2_0/E=8,N=7168,device_name=NVIDIA_H100_80GB_HBM3.json +146 -0
  88. sglang/srt/layers/moe/fused_moe_triton/configs/triton_3_3_1/E=257,N=256,device_name=NVIDIA_B200,dtype=fp8_w8a8,block_shape=[128, 128].json +146 -0
  89. sglang/srt/layers/moe/fused_moe_triton/configs/triton_3_3_1/E=8,N=7168,device_name=NVIDIA_H100_80GB_HBM3.json +146 -0
  90. sglang/srt/layers/moe/fused_moe_triton/fused_moe.py +221 -29
  91. sglang/srt/layers/moe/fused_moe_triton/layer.py +34 -4
  92. sglang/srt/layers/moe/topk.py +60 -26
  93. sglang/srt/layers/multimodal.py +3 -3
  94. sglang/srt/layers/pooler.py +56 -0
  95. sglang/srt/layers/quantization/__init__.py +3 -2
  96. sglang/srt/layers/quantization/blockwise_int8.py +3 -0
  97. sglang/srt/layers/quantization/compressed_tensors/compressed_tensors_moe.py +5 -0
  98. sglang/srt/layers/quantization/deep_gemm_wrapper/__init__.py +1 -0
  99. sglang/srt/layers/quantization/{deep_gemm.py → deep_gemm_wrapper/compile_utils.py} +69 -127
  100. sglang/srt/layers/quantization/deep_gemm_wrapper/configurer.py +32 -0
  101. sglang/srt/layers/quantization/deep_gemm_wrapper/entrypoint.py +110 -0
  102. sglang/srt/layers/quantization/fp8.py +28 -23
  103. sglang/srt/layers/quantization/fp8_kernel.py +156 -75
  104. sglang/srt/layers/quantization/fp8_utils.py +250 -69
  105. sglang/srt/layers/quantization/modelopt_quant.py +334 -7
  106. sglang/srt/layers/quantization/moe_wna16.py +3 -0
  107. sglang/srt/layers/quantization/w8a8_fp8.py +3 -0
  108. sglang/srt/layers/quantization/w8a8_int8.py +3 -0
  109. sglang/srt/layers/radix_attention.py +2 -3
  110. sglang/srt/layers/rotary_embedding.py +6 -12
  111. sglang/srt/layers/sampler.py +80 -79
  112. sglang/srt/layers/utils.py +6 -0
  113. sglang/srt/lora/layers.py +12 -15
  114. sglang/srt/lora/lora.py +49 -5
  115. sglang/srt/lora/lora_manager.py +98 -39
  116. sglang/srt/lora/mem_pool.py +28 -21
  117. sglang/srt/lora/utils.py +17 -13
  118. sglang/srt/managers/cache_controller.py +2 -1
  119. sglang/srt/managers/data_parallel_controller.py +13 -5
  120. sglang/srt/managers/eplb_algorithms/__init__.py +63 -0
  121. sglang/srt/managers/eplb_algorithms/deepseek.py +223 -0
  122. sglang/srt/managers/{deepseek_eplb.py → eplb_algorithms/deepseek_vec.py} +5 -7
  123. sglang/srt/managers/eplb_manager.py +55 -14
  124. sglang/srt/managers/expert_distribution.py +220 -46
  125. sglang/srt/managers/expert_location.py +110 -56
  126. sglang/srt/managers/expert_location_dispatch.py +23 -6
  127. sglang/srt/managers/io_struct.py +43 -8
  128. sglang/srt/managers/mm_utils.py +88 -38
  129. sglang/srt/managers/multimodal_processors/base_processor.py +190 -18
  130. sglang/srt/managers/multimodal_processors/gemma3.py +4 -31
  131. sglang/srt/managers/multimodal_processors/internvl.py +4 -0
  132. sglang/srt/managers/multimodal_processors/kimi_vl.py +15 -34
  133. sglang/srt/managers/multimodal_processors/minicpm.py +2 -1
  134. sglang/srt/managers/multimodal_processors/phi4mm.py +87 -0
  135. sglang/srt/managers/multimodal_processors/qwen_vl.py +22 -64
  136. sglang/srt/managers/multimodal_processors/vila.py +85 -0
  137. sglang/srt/managers/schedule_batch.py +173 -38
  138. sglang/srt/managers/scheduler.py +376 -127
  139. sglang/srt/managers/tokenizer_manager.py +163 -19
  140. sglang/srt/managers/utils.py +0 -4
  141. sglang/srt/mem_cache/chunk_cache.py +1 -0
  142. sglang/srt/mem_cache/hiradix_cache.py +4 -2
  143. sglang/srt/mem_cache/memory_pool.py +111 -407
  144. sglang/srt/mem_cache/memory_pool_host.py +380 -0
  145. sglang/srt/mem_cache/radix_cache.py +36 -12
  146. sglang/srt/metrics/collector.py +9 -0
  147. sglang/srt/model_executor/cuda_graph_runner.py +191 -113
  148. sglang/srt/model_executor/expert_location_updater.py +157 -22
  149. sglang/srt/model_executor/forward_batch_info.py +52 -22
  150. sglang/srt/model_executor/model_runner.py +102 -62
  151. sglang/srt/model_loader/loader.py +8 -1
  152. sglang/srt/model_loader/utils.py +67 -1
  153. sglang/srt/models/bert.py +113 -13
  154. sglang/srt/models/deepseek_nextn.py +1 -1
  155. sglang/srt/models/deepseek_v2.py +623 -290
  156. sglang/srt/models/gemma3_causal.py +7 -0
  157. sglang/srt/models/gemma3_mm.py +19 -14
  158. sglang/srt/models/idefics2.py +342 -0
  159. sglang/srt/models/internvl.py +46 -102
  160. sglang/srt/models/kimi_vl.py +4 -4
  161. sglang/srt/models/llama.py +1 -1
  162. sglang/srt/models/minicpmo.py +2 -5
  163. sglang/srt/models/minicpmv.py +3 -295
  164. sglang/srt/models/phi4mm.py +512 -0
  165. sglang/srt/models/qwen2.py +38 -9
  166. sglang/srt/models/qwen2_5_vl.py +3 -9
  167. sglang/srt/models/qwen2_eagle.py +4 -1
  168. sglang/srt/models/qwen2_moe.py +58 -191
  169. sglang/srt/models/qwen2_vl.py +3 -9
  170. sglang/srt/models/qwen3.py +41 -10
  171. sglang/srt/models/qwen3_moe.py +230 -191
  172. sglang/srt/models/registry.py +9 -1
  173. sglang/srt/models/roberta.py +117 -9
  174. sglang/srt/models/transformers.py +291 -0
  175. sglang/srt/models/vila.py +305 -0
  176. sglang/srt/openai_api/adapter.py +248 -28
  177. sglang/srt/openai_api/protocol.py +68 -3
  178. sglang/srt/openai_api/utils.py +172 -0
  179. sglang/srt/operations.py +37 -2
  180. sglang/srt/operations_strategy.py +200 -24
  181. sglang/srt/sampling/sampling_batch_info.py +37 -1
  182. sglang/srt/sampling/sampling_params.py +4 -1
  183. sglang/srt/server_args.py +381 -209
  184. sglang/srt/speculative/build_eagle_tree.py +9 -9
  185. sglang/srt/speculative/eagle_draft_cuda_graph_runner.py +12 -14
  186. sglang/srt/speculative/eagle_draft_extend_cuda_graph_runner.py +256 -0
  187. sglang/srt/speculative/eagle_utils.py +440 -200
  188. sglang/srt/speculative/eagle_worker.py +234 -63
  189. sglang/srt/two_batch_overlap.py +637 -0
  190. sglang/srt/utils.py +187 -7
  191. sglang/test/attention/test_prefix_chunk_info.py +2 -0
  192. sglang/test/runners.py +54 -10
  193. sglang/test/send_one.py +4 -0
  194. sglang/test/test_block_fp8.py +1 -0
  195. sglang/test/test_block_fp8_deep_gemm_blackwell.py +252 -0
  196. sglang/test/test_block_fp8_ep.py +1 -0
  197. sglang/test/test_cutlass_moe.py +3 -3
  198. sglang/test/test_fp4_moe.py +248 -0
  199. sglang/test/test_utils.py +82 -7
  200. sglang/utils.py +9 -0
  201. sglang/version.py +1 -1
  202. {sglang-0.4.6.post5.dist-info → sglang-0.4.7.post1.dist-info}/METADATA +17 -14
  203. {sglang-0.4.6.post5.dist-info → sglang-0.4.7.post1.dist-info}/RECORD +359 -321
  204. {sglang-0.4.6.post5.dist-info → sglang-0.4.7.post1.dist-info}/WHEEL +1 -1
  205. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=1,N=14336,device_name=NVIDIA_A100-SXM4-80GB,dtype=int8_w8a16.json → triton_3_1_0/E=1,N=14336,device_name=NVIDIA_A100-SXM4-80GB,dtype=int8_w8a16.json} +0 -0
  206. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=1,N=14336,device_name=NVIDIA_A100-SXM4-80GB.json → triton_3_1_0/E=1,N=14336,device_name=NVIDIA_A100-SXM4-80GB.json} +0 -0
  207. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=1,N=1792,device_name=NVIDIA_A100-SXM4-80GB,dtype=int8_w8a16.json → triton_3_1_0/E=1,N=1792,device_name=NVIDIA_A100-SXM4-80GB,dtype=int8_w8a16.json} +0 -0
  208. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=1,N=1792,device_name=NVIDIA_A100-SXM4-80GB.json → triton_3_1_0/E=1,N=1792,device_name=NVIDIA_A100-SXM4-80GB.json} +0 -0
  209. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=1,N=3072,device_name=NVIDIA_A100-SXM4-80GB,dtype=int8_w8a16.json → triton_3_1_0/E=1,N=3072,device_name=NVIDIA_A100-SXM4-80GB,dtype=int8_w8a16.json} +0 -0
  210. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=1,N=3072,device_name=NVIDIA_H100_80GB_HBM3,dtype=int8_w8a16.json → triton_3_1_0/E=1,N=3072,device_name=NVIDIA_H100_80GB_HBM3,dtype=int8_w8a16.json} +0 -0
  211. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=1,N=3072,device_name=NVIDIA_H100_80GB_HBM3.json → triton_3_1_0/E=1,N=3072,device_name=NVIDIA_H100_80GB_HBM3.json} +0 -0
  212. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=1,N=3584,device_name=NVIDIA_A100-SXM4-80GB,dtype=int8_w8a16.json → triton_3_1_0/E=1,N=3584,device_name=NVIDIA_A100-SXM4-80GB,dtype=int8_w8a16.json} +0 -0
  213. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=1,N=3584,device_name=NVIDIA_A100-SXM4-80GB.json → triton_3_1_0/E=1,N=3584,device_name=NVIDIA_A100-SXM4-80GB.json} +0 -0
  214. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=1,N=7168,device_name=NVIDIA_A100-SXM4-80GB,dtype=int8_w8a16.json → triton_3_1_0/E=1,N=7168,device_name=NVIDIA_A100-SXM4-80GB,dtype=int8_w8a16.json} +0 -0
  215. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=1,N=7168,device_name=NVIDIA_A100-SXM4-80GB.json → triton_3_1_0/E=1,N=7168,device_name=NVIDIA_A100-SXM4-80GB.json} +0 -0
  216. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=144,N=512,device_name=NVIDIA_H100_80GB_HBM3.json → triton_3_1_0/E=144,N=512,device_name=NVIDIA_H100_80GB_HBM3.json} +0 -0
  217. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=16,N=1024,device_name=NVIDIA_H100_80GB_HBM3.json → triton_3_1_0/E=16,N=1024,device_name=NVIDIA_H100_80GB_HBM3.json} +0 -0
  218. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=16,N=1024,device_name=NVIDIA_H200.json → triton_3_1_0/E=16,N=1024,device_name=NVIDIA_H200.json} +0 -0
  219. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=16,N=1344,device_name=NVIDIA_A100-SXM4-40GB.json → triton_3_1_0/E=16,N=1344,device_name=NVIDIA_A100-SXM4-40GB.json} +0 -0
  220. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=16,N=1344,device_name=NVIDIA_A100-SXM4-80GB.json → triton_3_1_0/E=16,N=1344,device_name=NVIDIA_A100-SXM4-80GB.json} +0 -0
  221. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=16,N=1344,device_name=NVIDIA_H100_80GB_HBM3.json → triton_3_1_0/E=16,N=1344,device_name=NVIDIA_H100_80GB_HBM3.json} +0 -0
  222. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=16,N=14336,device_name=NVIDIA_A100-SXM4-80GB,dtype=int8_w8a16.json → triton_3_1_0/E=16,N=14336,device_name=NVIDIA_A100-SXM4-80GB,dtype=int8_w8a16.json} +0 -0
  223. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=16,N=14336,device_name=NVIDIA_A100-SXM4-80GB.json → triton_3_1_0/E=16,N=14336,device_name=NVIDIA_A100-SXM4-80GB.json} +0 -0
  224. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=16,N=1792,device_name=NVIDIA_A100-SXM4-80GB,dtype=int8_w8a16.json → triton_3_1_0/E=16,N=1792,device_name=NVIDIA_A100-SXM4-80GB,dtype=int8_w8a16.json} +0 -0
  225. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=16,N=1792,device_name=NVIDIA_A100-SXM4-80GB.json → triton_3_1_0/E=16,N=1792,device_name=NVIDIA_A100-SXM4-80GB.json} +0 -0
  226. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=16,N=2048,device_name=NVIDIA_H100_80GB_HBM3.json → triton_3_1_0/E=16,N=2048,device_name=NVIDIA_H100_80GB_HBM3.json} +0 -0
  227. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=16,N=2688,device_name=NVIDIA_A100-SXM4-80GB.json → triton_3_1_0/E=16,N=2688,device_name=NVIDIA_A100-SXM4-80GB.json} +0 -0
  228. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=16,N=2688,device_name=NVIDIA_H100_80GB_HBM3.json → triton_3_1_0/E=16,N=2688,device_name=NVIDIA_H100_80GB_HBM3.json} +0 -0
  229. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=16,N=3072,device_name=NVIDIA_A100-SXM4-80GB,dtype=int8_w8a16.json → triton_3_1_0/E=16,N=3072,device_name=NVIDIA_A100-SXM4-80GB,dtype=int8_w8a16.json} +0 -0
  230. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=16,N=3072,device_name=NVIDIA_H100_80GB_HBM3,dtype=int8_w8a16.json → triton_3_1_0/E=16,N=3072,device_name=NVIDIA_H100_80GB_HBM3,dtype=int8_w8a16.json} +0 -0
  231. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=16,N=3200,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8.json → triton_3_1_0/E=16,N=3200,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8.json} +0 -0
  232. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=16,N=3584,device_name=NVIDIA_A100-SXM4-80GB,dtype=int8_w8a16.json → triton_3_1_0/E=16,N=3584,device_name=NVIDIA_A100-SXM4-80GB,dtype=int8_w8a16.json} +0 -0
  233. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=16,N=3584,device_name=NVIDIA_A100-SXM4-80GB.json → triton_3_1_0/E=16,N=3584,device_name=NVIDIA_A100-SXM4-80GB.json} +0 -0
  234. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=16,N=6400,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8.json → triton_3_1_0/E=16,N=6400,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8.json} +0 -0
  235. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=16,N=7168,device_name=NVIDIA_A100-SXM4-80GB,dtype=int8_w8a16.json → triton_3_1_0/E=16,N=7168,device_name=NVIDIA_A100-SXM4-80GB,dtype=int8_w8a16.json} +0 -0
  236. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=16,N=7168,device_name=NVIDIA_A100-SXM4-80GB.json → triton_3_1_0/E=16,N=7168,device_name=NVIDIA_A100-SXM4-80GB.json} +0 -0
  237. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=16,N=7168,device_name=NVIDIA_H100_80GB_HBM3,dtype=int8_w8a16.json → triton_3_1_0/E=16,N=7168,device_name=NVIDIA_H100_80GB_HBM3,dtype=int8_w8a16.json} +0 -0
  238. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=16,N=800,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8.json → triton_3_1_0/E=16,N=800,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8.json} +0 -0
  239. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=160,N=192,device_name=NVIDIA_A800-SXM4-80GB.json → triton_3_1_0/E=160,N=192,device_name=NVIDIA_A800-SXM4-80GB.json} +0 -0
  240. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=20,N=2048,device_name=NVIDIA_H100_80GB_HBM3.json → triton_3_1_0/E=20,N=2048,device_name=NVIDIA_H100_80GB_HBM3.json} +0 -0
  241. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=24,N=1024,device_name=NVIDIA_H100_80GB_HBM3.json → triton_3_1_0/E=24,N=1024,device_name=NVIDIA_H100_80GB_HBM3.json} +0 -0
  242. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=256,N=128,device_name=NVIDIA_A100-SXM4-80GB,dtype=int8_w8a8,block_shape=[128, 128].json → triton_3_1_0/E=256,N=128,device_name=NVIDIA_A100-SXM4-80GB,dtype=int8_w8a8,block_shape=[128, 128].json} +0 -0
  243. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=256,N=128,device_name=NVIDIA_A100-SXM4-80GB,dtype=int8_w8a8.json → triton_3_1_0/E=256,N=128,device_name=NVIDIA_A100-SXM4-80GB,dtype=int8_w8a8.json} +0 -0
  244. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=256,N=128,device_name=NVIDIA_A800-SXM4-80GB,dtype=int8_w8a8,block_shape=[128, 128].json → triton_3_1_0/E=256,N=128,device_name=NVIDIA_A800-SXM4-80GB,dtype=int8_w8a8,block_shape=[128, 128].json} +0 -0
  245. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=256,N=128,device_name=NVIDIA_A800-SXM4-80GB,dtype=int8_w8a8.json → triton_3_1_0/E=256,N=128,device_name=NVIDIA_A800-SXM4-80GB,dtype=int8_w8a8.json} +0 -0
  246. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=256,N=128,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8,block_shape=[128, 128].json → triton_3_1_0/E=256,N=128,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8,block_shape=[128, 128].json} +0 -0
  247. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=256,N=128,device_name=NVIDIA_H20,block_shape=[128, 128].json → triton_3_1_0/E=256,N=128,device_name=NVIDIA_H20,block_shape=[128, 128].json} +0 -0
  248. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=256,N=128,device_name=NVIDIA_H20,dtype=fp8_w8a8,block_shape=[128, 128].json → triton_3_1_0/E=256,N=128,device_name=NVIDIA_H20,dtype=fp8_w8a8,block_shape=[128, 128].json} +0 -0
  249. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=256,N=128,device_name=NVIDIA_L20Y,dtype=fp8_w8a8,block_shape=[128, 128].json → triton_3_1_0/E=256,N=128,device_name=NVIDIA_L20Y,dtype=fp8_w8a8,block_shape=[128, 128].json} +0 -0
  250. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=256,N=256,device_name=AMD_Instinct_MI300X,dtype=fp8_w8a8,block_shape=[128, 128].json → triton_3_1_0/E=256,N=256,device_name=AMD_Instinct_MI300X,dtype=fp8_w8a8,block_shape=[128, 128].json} +0 -0
  251. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=256,N=256,device_name=AMD_Instinct_MI325X,dtype=fp8_w8a8,block_shape=[128, 128].json → triton_3_1_0/E=256,N=256,device_name=AMD_Instinct_MI325X,dtype=fp8_w8a8,block_shape=[128, 128].json} +0 -0
  252. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=256,N=256,device_name=AMD_Radeon_Graphics,dtype=fp8_w8a8,block_shape=[128, 128].json → triton_3_1_0/E=256,N=256,device_name=AMD_Radeon_Graphics,dtype=fp8_w8a8,block_shape=[128, 128].json} +0 -0
  253. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=256,N=256,device_name=NVIDIA_B200,dtype=fp8_w8a8,block_shape=[128, 128].json → triton_3_1_0/E=256,N=256,device_name=NVIDIA_B200,dtype=fp8_w8a8,block_shape=[128, 128].json} +0 -0
  254. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=256,N=256,device_name=NVIDIA_H20,dtype=int8_w8a8,block_shape=[128, 128].json → triton_3_1_0/E=256,N=256,device_name=NVIDIA_H20,dtype=int8_w8a8,block_shape=[128, 128].json} +0 -0
  255. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=256,N=256,device_name=NVIDIA_H200,dtype=fp8_w8a8,block_shape=[128, 128].json → triton_3_1_0/E=256,N=256,device_name=NVIDIA_H200,dtype=fp8_w8a8,block_shape=[128, 128].json} +0 -0
  256. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=256,N=64,device_name=NVIDIA_A800-SXM4-80GB.json → triton_3_1_0/E=256,N=64,device_name=NVIDIA_A800-SXM4-80GB.json} +0 -0
  257. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=256,N=64,device_name=NVIDIA_L20,dtype=int8_w8a8.json → triton_3_1_0/E=256,N=64,device_name=NVIDIA_L20,dtype=int8_w8a8.json} +0 -0
  258. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=256,N=64,device_name=NVIDIA_L40S,dtype=int8_w8a8.json → triton_3_1_0/E=256,N=64,device_name=NVIDIA_L40S,dtype=int8_w8a8.json} +0 -0
  259. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=64,N=1024,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8,block_shape=[128, 128].json → triton_3_1_0/E=64,N=1024,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8,block_shape=[128, 128].json} +0 -0
  260. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=64,N=1280,device_name=NVIDIA_A100-SXM4-80GB.json → triton_3_1_0/E=64,N=1280,device_name=NVIDIA_A100-SXM4-80GB.json} +0 -0
  261. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=64,N=1280,device_name=NVIDIA_A800-SXM4-80GB.json → triton_3_1_0/E=64,N=1280,device_name=NVIDIA_A800-SXM4-80GB.json} +0 -0
  262. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=64,N=1280,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8.json → triton_3_1_0/E=64,N=1280,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8.json} +0 -0
  263. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=64,N=1280,device_name=NVIDIA_H100_80GB_HBM3.json → triton_3_1_0/E=64,N=1280,device_name=NVIDIA_H100_80GB_HBM3.json} +0 -0
  264. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=64,N=1280,device_name=NVIDIA_H200,dtype=fp8_w8a8.json → triton_3_1_0/E=64,N=1280,device_name=NVIDIA_H200,dtype=fp8_w8a8.json} +0 -0
  265. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=64,N=1280,device_name=NVIDIA_H200.json → triton_3_1_0/E=64,N=1280,device_name=NVIDIA_H200.json} +0 -0
  266. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=64,N=2560,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8.json → triton_3_1_0/E=64,N=2560,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8.json} +0 -0
  267. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=64,N=2560,device_name=NVIDIA_H200,dtype=fp8_w8a8.json → triton_3_1_0/E=64,N=2560,device_name=NVIDIA_H200,dtype=fp8_w8a8.json} +0 -0
  268. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=64,N=2560,device_name=NVIDIA_H200.json → triton_3_1_0/E=64,N=2560,device_name=NVIDIA_H200.json} +0 -0
  269. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=64,N=320,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8.json → triton_3_1_0/E=64,N=320,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8.json} +0 -0
  270. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=64,N=320,device_name=NVIDIA_H100_80GB_HBM3.json → triton_3_1_0/E=64,N=320,device_name=NVIDIA_H100_80GB_HBM3.json} +0 -0
  271. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=64,N=320,device_name=NVIDIA_H200,dtype=fp8_w8a8.json → triton_3_1_0/E=64,N=320,device_name=NVIDIA_H200,dtype=fp8_w8a8.json} +0 -0
  272. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=64,N=320,device_name=NVIDIA_H200.json → triton_3_1_0/E=64,N=320,device_name=NVIDIA_H200.json} +0 -0
  273. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=64,N=512,device_name=NVIDIA_H200,dtype=fp8_w8a8,block_shape=[128, 128].json → triton_3_1_0/E=64,N=512,device_name=NVIDIA_H200,dtype=fp8_w8a8,block_shape=[128, 128].json} +0 -0
  274. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=64,N=640,device_name=NVIDIA_A100-SXM4-80GB.json → triton_3_1_0/E=64,N=640,device_name=NVIDIA_A100-SXM4-80GB.json} +0 -0
  275. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=64,N=640,device_name=NVIDIA_A800-SXM4-80GB.json → triton_3_1_0/E=64,N=640,device_name=NVIDIA_A800-SXM4-80GB.json} +0 -0
  276. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=64,N=640,device_name=NVIDIA_GeForce_RTX_4090,dtype=fp8_w8a8.json → triton_3_1_0/E=64,N=640,device_name=NVIDIA_GeForce_RTX_4090,dtype=fp8_w8a8.json} +0 -0
  277. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=64,N=640,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8.json → triton_3_1_0/E=64,N=640,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8.json} +0 -0
  278. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=64,N=640,device_name=NVIDIA_H100_80GB_HBM3.json → triton_3_1_0/E=64,N=640,device_name=NVIDIA_H100_80GB_HBM3.json} +0 -0
  279. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=64,N=640,device_name=NVIDIA_H200,dtype=fp8_w8a8.json → triton_3_1_0/E=64,N=640,device_name=NVIDIA_H200,dtype=fp8_w8a8.json} +0 -0
  280. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=64,N=640,device_name=NVIDIA_H200.json → triton_3_1_0/E=64,N=640,device_name=NVIDIA_H200.json} +0 -0
  281. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=8,N=14336,device_name=AMD_Instinct_MI300X.json → triton_3_1_0/E=8,N=14336,device_name=AMD_Instinct_MI300X.json} +0 -0
  282. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=8,N=14336,device_name=AMD_Instinct_MI325X.json → triton_3_1_0/E=8,N=14336,device_name=AMD_Instinct_MI325X.json} +0 -0
  283. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=8,N=14336,device_name=AMD_Radeon_Graphics.json → triton_3_1_0/E=8,N=14336,device_name=AMD_Radeon_Graphics.json} +0 -0
  284. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=8,N=14336,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8.json → triton_3_1_0/E=8,N=14336,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8.json} +0 -0
  285. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=8,N=14336,device_name=NVIDIA_H200,dtype=fp8_w8a8.json → triton_3_1_0/E=8,N=14336,device_name=NVIDIA_H200,dtype=fp8_w8a8.json} +0 -0
  286. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=8,N=14336,device_name=NVIDIA_H200.json → triton_3_1_0/E=8,N=14336,device_name=NVIDIA_H200.json} +0 -0
  287. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=8,N=1792,device_name=AMD_Instinct_MI300X.json → triton_3_1_0/E=8,N=1792,device_name=AMD_Instinct_MI300X.json} +0 -0
  288. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=8,N=1792,device_name=AMD_Instinct_MI325X.json → triton_3_1_0/E=8,N=1792,device_name=AMD_Instinct_MI325X.json} +0 -0
  289. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=8,N=1792,device_name=AMD_Radeon_Graphics.json → triton_3_1_0/E=8,N=1792,device_name=AMD_Radeon_Graphics.json} +0 -0
  290. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=8,N=1792,device_name=NVIDIA_A100-SXM4-40GB.json → triton_3_1_0/E=8,N=1792,device_name=NVIDIA_A100-SXM4-40GB.json} +0 -0
  291. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=8,N=1792,device_name=NVIDIA_A100-SXM4-80GB.json → triton_3_1_0/E=8,N=1792,device_name=NVIDIA_A100-SXM4-80GB.json} +0 -0
  292. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=8,N=1792,device_name=NVIDIA_H100_80GB_HBM3.json → triton_3_1_0/E=8,N=1792,device_name=NVIDIA_H100_80GB_HBM3.json} +0 -0
  293. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=8,N=1792,device_name=NVIDIA_H200,dtype=fp8_w8a8.json → triton_3_1_0/E=8,N=1792,device_name=NVIDIA_H200,dtype=fp8_w8a8.json} +0 -0
  294. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=8,N=1792,device_name=NVIDIA_H200.json → triton_3_1_0/E=8,N=1792,device_name=NVIDIA_H200.json} +0 -0
  295. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=8,N=2048,device_name=NVIDIA_A100-SXM4-80GB.json → triton_3_1_0/E=8,N=2048,device_name=NVIDIA_A100-SXM4-80GB.json} +0 -0
  296. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=8,N=2048,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8.json → triton_3_1_0/E=8,N=2048,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8.json} +0 -0
  297. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=8,N=2048,device_name=NVIDIA_H100_80GB_HBM3.json → triton_3_1_0/E=8,N=2048,device_name=NVIDIA_H100_80GB_HBM3.json} +0 -0
  298. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=8,N=2048,device_name=NVIDIA_H200,dtype=fp8_w8a8.json → triton_3_1_0/E=8,N=2048,device_name=NVIDIA_H200,dtype=fp8_w8a8.json} +0 -0
  299. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=8,N=2048,device_name=NVIDIA_H200.json → triton_3_1_0/E=8,N=2048,device_name=NVIDIA_H200.json} +0 -0
  300. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=8,N=3584,device_name=AMD_Instinct_MI300X.json → triton_3_1_0/E=8,N=3584,device_name=AMD_Instinct_MI300X.json} +0 -0
  301. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=8,N=3584,device_name=AMD_Instinct_MI325X.json → triton_3_1_0/E=8,N=3584,device_name=AMD_Instinct_MI325X.json} +0 -0
  302. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=8,N=3584,device_name=AMD_Radeon_Graphics.json → triton_3_1_0/E=8,N=3584,device_name=AMD_Radeon_Graphics.json} +0 -0
  303. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=8,N=3584,device_name=NVIDIA_A100-SXM4-40GB.json → triton_3_1_0/E=8,N=3584,device_name=NVIDIA_A100-SXM4-40GB.json} +0 -0
  304. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=8,N=3584,device_name=NVIDIA_A100-SXM4-80GB.json → triton_3_1_0/E=8,N=3584,device_name=NVIDIA_A100-SXM4-80GB.json} +0 -0
  305. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=8,N=3584,device_name=NVIDIA_GeForce_RTX_4090,dtype=fp8_w8a8.json → triton_3_1_0/E=8,N=3584,device_name=NVIDIA_GeForce_RTX_4090,dtype=fp8_w8a8.json} +0 -0
  306. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=8,N=3584,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8.json → triton_3_1_0/E=8,N=3584,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8.json} +0 -0
  307. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=8,N=3584,device_name=NVIDIA_H100_80GB_HBM3.json → triton_3_1_0/E=8,N=3584,device_name=NVIDIA_H100_80GB_HBM3.json} +0 -0
  308. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=8,N=3584,device_name=NVIDIA_H200,dtype=fp8_w8a8.json → triton_3_1_0/E=8,N=3584,device_name=NVIDIA_H200,dtype=fp8_w8a8.json} +0 -0
  309. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=8,N=3584,device_name=NVIDIA_H200.json → triton_3_1_0/E=8,N=3584,device_name=NVIDIA_H200.json} +0 -0
  310. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=8,N=3584,device_name=NVIDIA_L40S.json → triton_3_1_0/E=8,N=3584,device_name=NVIDIA_L40S.json} +0 -0
  311. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=8,N=4096,device_name=AMD_Instinct_MI300X,dtype=fp8_w8a8.json → triton_3_1_0/E=8,N=4096,device_name=AMD_Instinct_MI300X,dtype=fp8_w8a8.json} +0 -0
  312. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=8,N=4096,device_name=AMD_Instinct_MI325X,dtype=fp8_w8a8.json → triton_3_1_0/E=8,N=4096,device_name=AMD_Instinct_MI325X,dtype=fp8_w8a8.json} +0 -0
  313. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=8,N=4096,device_name=AMD_Radeon_Graphics,dtype=fp8_w8a8.json → triton_3_1_0/E=8,N=4096,device_name=AMD_Radeon_Graphics,dtype=fp8_w8a8.json} +0 -0
  314. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=8,N=4096,device_name=NVIDIA_A100-SXM4-80GB.json → triton_3_1_0/E=8,N=4096,device_name=NVIDIA_A100-SXM4-80GB.json} +0 -0
  315. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=8,N=4096,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8.json → triton_3_1_0/E=8,N=4096,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8.json} +0 -0
  316. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=8,N=4096,device_name=NVIDIA_H100_80GB_HBM3.json → triton_3_1_0/E=8,N=4096,device_name=NVIDIA_H100_80GB_HBM3.json} +0 -0
  317. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=8,N=4096,device_name=NVIDIA_H200,dtype=fp8_w8a8.json → triton_3_1_0/E=8,N=4096,device_name=NVIDIA_H200,dtype=fp8_w8a8.json} +0 -0
  318. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=8,N=4096,device_name=NVIDIA_H200.json → triton_3_1_0/E=8,N=4096,device_name=NVIDIA_H200.json} +0 -0
  319. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=8,N=7168,device_name=AMD_Instinct_MI300X.json → triton_3_1_0/E=8,N=7168,device_name=AMD_Instinct_MI300X.json} +0 -0
  320. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=8,N=7168,device_name=AMD_Instinct_MI325X.json → triton_3_1_0/E=8,N=7168,device_name=AMD_Instinct_MI325X.json} +0 -0
  321. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=8,N=7168,device_name=AMD_Radeon_Graphics.json → triton_3_1_0/E=8,N=7168,device_name=AMD_Radeon_Graphics.json} +0 -0
  322. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=8,N=7168,device_name=NVIDIA_A100-SXM4-80GB.json → triton_3_1_0/E=8,N=7168,device_name=NVIDIA_A100-SXM4-80GB.json} +0 -0
  323. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=8,N=7168,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8.json → triton_3_1_0/E=8,N=7168,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8.json} +0 -0
  324. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=8,N=7168,device_name=NVIDIA_H100_80GB_HBM3.json → triton_3_1_0/E=8,N=7168,device_name=NVIDIA_H100_80GB_HBM3.json} +0 -0
  325. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=8,N=7168,device_name=NVIDIA_H200,dtype=fp8_w8a8.json → triton_3_1_0/E=8,N=7168,device_name=NVIDIA_H200,dtype=fp8_w8a8.json} +0 -0
  326. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=8,N=7168,device_name=NVIDIA_H200.json → triton_3_1_0/E=8,N=7168,device_name=NVIDIA_H200.json} +0 -0
  327. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=8,N=8192,device_name=AMD_Instinct_MI300X,dtype=fp8_w8a8.json → triton_3_1_0/E=8,N=8192,device_name=AMD_Instinct_MI300X,dtype=fp8_w8a8.json} +0 -0
  328. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=8,N=8192,device_name=AMD_Instinct_MI325X,dtype=fp8_w8a8.json → triton_3_1_0/E=8,N=8192,device_name=AMD_Instinct_MI325X,dtype=fp8_w8a8.json} +0 -0
  329. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=8,N=8192,device_name=AMD_Radeon_Graphics,dtype=fp8_w8a8.json → triton_3_1_0/E=8,N=8192,device_name=AMD_Radeon_Graphics,dtype=fp8_w8a8.json} +0 -0
  330. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=8,N=8192,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8.json → triton_3_1_0/E=8,N=8192,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8.json} +0 -0
  331. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=8,N=8192,device_name=NVIDIA_H200,dtype=fp8_w8a8.json → triton_3_1_0/E=8,N=8192,device_name=NVIDIA_H200,dtype=fp8_w8a8.json} +0 -0
  332. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=128,N=192,device_name=NVIDIA_A800-SXM4-80GB.json → triton_3_2_0/E=128,N=192,device_name=NVIDIA_A800-SXM4-80GB.json} +0 -0
  333. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=128,N=192,device_name=NVIDIA_H100_80GB_HBM3.json → triton_3_2_0/E=128,N=192,device_name=NVIDIA_H100_80GB_HBM3.json} +0 -0
  334. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=128,N=192,device_name=NVIDIA_H20.json → triton_3_2_0/E=128,N=192,device_name=NVIDIA_H20.json} +0 -0
  335. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=128,N=192,device_name=NVIDIA_H200.json → triton_3_2_0/E=128,N=192,device_name=NVIDIA_H200.json} +0 -0
  336. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=128,N=384,device_name=NVIDIA_H100_80GB_HBM3.json → triton_3_2_0/E=128,N=384,device_name=NVIDIA_H100_80GB_HBM3.json} +0 -0
  337. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=128,N=384,device_name=NVIDIA_H20,dtype=fp8_w8a8,block_shape=[128, 128].json → triton_3_2_0/E=128,N=384,device_name=NVIDIA_H20,dtype=fp8_w8a8,block_shape=[128, 128].json} +0 -0
  338. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=128,N=384,device_name=NVIDIA_H20.json → triton_3_2_0/E=128,N=384,device_name=NVIDIA_H20.json} +0 -0
  339. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=128,N=384,device_name=NVIDIA_H200,dtype=fp8_w8a8,block_shape=[128, 128].json → triton_3_2_0/E=128,N=384,device_name=NVIDIA_H200,dtype=fp8_w8a8,block_shape=[128, 128].json} +0 -0
  340. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=128,N=384,device_name=NVIDIA_H200.json → triton_3_2_0/E=128,N=384,device_name=NVIDIA_H200.json} +0 -0
  341. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=128,N=512,device_name=NVIDIA_H100_80GB_HBM3.json → triton_3_2_0/E=128,N=512,device_name=NVIDIA_H100_80GB_HBM3.json} +0 -0
  342. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=128,N=768,device_name=NVIDIA_A800-SXM4-80GB.json → triton_3_2_0/E=128,N=768,device_name=NVIDIA_A800-SXM4-80GB.json} +0 -0
  343. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=128,N=768,device_name=NVIDIA_H100_80GB_HBM3.json → triton_3_2_0/E=128,N=768,device_name=NVIDIA_H100_80GB_HBM3.json} +0 -0
  344. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=128,N=768,device_name=NVIDIA_H20,dtype=fp8_w8a8,block_shape=[128, 128].json → triton_3_2_0/E=128,N=768,device_name=NVIDIA_H20,dtype=fp8_w8a8,block_shape=[128, 128].json} +0 -0
  345. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=128,N=768,device_name=NVIDIA_H20.json → triton_3_2_0/E=128,N=768,device_name=NVIDIA_H20.json} +0 -0
  346. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=128,N=768,device_name=NVIDIA_H200,dtype=fp8_w8a8,block_shape=[128, 128].json → triton_3_2_0/E=128,N=768,device_name=NVIDIA_H200,dtype=fp8_w8a8,block_shape=[128, 128].json} +0 -0
  347. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=128,N=768,device_name=NVIDIA_H200.json → triton_3_2_0/E=128,N=768,device_name=NVIDIA_H200.json} +0 -0
  348. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=128,N=96,device_name=NVIDIA_H20.json → triton_3_2_0/E=128,N=96,device_name=NVIDIA_H20.json} +0 -0
  349. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=264,N=128,device_name=NVIDIA_A800-SXM4-80GB,dtype=int8_w8a8.json → triton_3_2_0/E=264,N=128,device_name=NVIDIA_A800-SXM4-80GB,dtype=int8_w8a8.json} +0 -0
  350. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=264,N=256,device_name=NVIDIA_B200,dtype=fp8_w8a8,block_shape=[128, 128].json → triton_3_2_0/E=264,N=256,device_name=NVIDIA_B200,dtype=fp8_w8a8,block_shape=[128, 128].json} +0 -0
  351. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=264,N=256,device_name=NVIDIA_H20,dtype=fp8_w8a8,block_shape=[128, 128].json → triton_3_2_0/E=264,N=256,device_name=NVIDIA_H20,dtype=fp8_w8a8,block_shape=[128, 128].json} +0 -0
  352. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=264,N=256,device_name=NVIDIA_H200,dtype=fp8_w8a8,block_shape=[128, 128].json → triton_3_2_0/E=264,N=256,device_name=NVIDIA_H200,dtype=fp8_w8a8,block_shape=[128, 128].json} +0 -0
  353. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=272,N=128,device_name=NVIDIA_A800-SXM4-80GB,dtype=int8_w8a8.json → triton_3_2_0/E=272,N=128,device_name=NVIDIA_A800-SXM4-80GB,dtype=int8_w8a8.json} +0 -0
  354. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=272,N=128,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8,block_shape=[128, 128].json → triton_3_2_0/E=272,N=128,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8,block_shape=[128, 128].json} +0 -0
  355. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=272,N=128,device_name=NVIDIA_H20,dtype=fp8_w8a8,block_shape=[128, 128].json → triton_3_2_0/E=272,N=128,device_name=NVIDIA_H20,dtype=fp8_w8a8,block_shape=[128, 128].json} +0 -0
  356. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=272,N=64,device_name=NVIDIA_A800-SXM4-80GB.json → triton_3_2_0/E=272,N=64,device_name=NVIDIA_A800-SXM4-80GB.json} +0 -0
  357. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=288,N=64,device_name=NVIDIA_A800-SXM4-80GB.json → triton_3_2_0/E=288,N=64,device_name=NVIDIA_A800-SXM4-80GB.json} +0 -0
  358. {sglang-0.4.6.post5.dist-info → sglang-0.4.7.post1.dist-info}/licenses/LICENSE +0 -0
  359. {sglang-0.4.6.post5.dist-info → sglang-0.4.7.post1.dist-info}/top_level.txt +0 -0
sglang/srt/disaggregation/mooncake/conn.py

@@ -9,6 +9,8 @@ import queue
 import socket
 import struct
 import threading
+import time
+from collections import defaultdict
 from functools import cache
 from typing import Dict, List, Optional, Tuple, Union
 
@@ -26,36 +28,38 @@ from sglang.srt.disaggregation.base.conn import (
     KVArgs,
     KVPoll,
 )
+from sglang.srt.disaggregation.common.utils import (
+    FastQueue,
+    group_concurrent_contiguous,
+)
 from sglang.srt.disaggregation.mooncake.transfer_engine import MooncakeTransferEngine
 from sglang.srt.disaggregation.utils import DisaggregationMode
 from sglang.srt.server_args import ServerArgs
-from sglang.srt.utils import get_free_port, get_ip, get_local_ip_by_remote
+from sglang.srt.utils import (
+    get_free_port,
+    get_int_env_var,
+    get_ip,
+    get_local_ip_by_remote,
+)
 
 logger = logging.getLogger(__name__)
 
 
-def group_concurrent_contiguous(
-    src_indices: npt.NDArray[np.int64], dst_indices: npt.NDArray[np.int64]
-) -> Tuple[List[npt.NDArray[np.int64]], List[npt.NDArray[np.int64]]]:
-    """Vectorised NumPy implementation."""
-    if src_indices.size == 0:
-        return [], []
-
-    brk = np.where((np.diff(src_indices) != 1) | (np.diff(dst_indices) != 1))[0] + 1
-    src_groups = np.split(src_indices, brk)
-    dst_groups = np.split(dst_indices, brk)
-
-    src_groups = [g.tolist() for g in src_groups]
-    dst_groups = [g.tolist() for g in dst_groups]
+class KVTransferError(Exception):
+    def __init__(self, bootstrap_room: int, failure_reason: str):
+        super().__init__(failure_reason)
+        self.bootstrap_room = bootstrap_room
+        self.failure_reason = failure_reason
 
-    return src_groups, dst_groups
+    def __str__(self):
+        return f"KVTransferError(bootstrap_room={self.bootstrap_room}): {self.failure_reason}"
 
 
 # prefill
 @dataclasses.dataclass
 class TransferKVChunk:
     room: int
-    prefill_kv_indices: npt.NDArray[np.int64]
+    prefill_kv_indices: npt.NDArray[np.int32]
     index_slice: slice
     is_last: bool
     prefill_aux_index: Optional[int]
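
Note: the group_concurrent_contiguous helper deleted in the hunk above is not removed from the package; it moves into the new sglang/srt/disaggregation/common/utils.py (entry 25 in the file list) and is re-imported next to FastQueue. Based on the deleted implementation shown above, it splits a pair of index arrays wherever either side stops being contiguous, so each group can be issued as a single contiguous transfer. A small self-contained illustration (array values invented for the example):

    import numpy as np

    src = np.array([0, 1, 2, 5, 6], dtype=np.int32)
    dst = np.array([10, 11, 12, 20, 21], dtype=np.int32)

    # Break wherever either sequence jumps by more than 1.
    brk = np.where((np.diff(src) != 1) | (np.diff(dst) != 1))[0] + 1
    src_groups = [g.tolist() for g in np.split(src, brk)]
    dst_groups = [g.tolist() for g in np.split(dst, brk)]

    assert src_groups == [[0, 1, 2], [5, 6]]
    assert dst_groups == [[10, 11, 12], [20, 21]]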
@@ -68,7 +72,7 @@ class TransferInfo:
     endpoint: str
     dst_port: int
     mooncake_session_id: str
-    dst_kv_indices: npt.NDArray[np.int64]
+    dst_kv_indices: npt.NDArray[np.int32]
     dst_aux_index: int
     required_dst_info_num: int
     is_dummy: bool
@@ -77,10 +81,10 @@ class TransferInfo:
     def from_zmq(cls, msg: List[bytes]):
         if msg[4] == b"" and msg[5] == b"":
             is_dummy = True
-            dst_kv_indices = np.array([], dtype=np.int64)
+            dst_kv_indices = np.array([], dtype=np.int32)
             dst_aux_index = None
         else:
-            dst_kv_indices = np.frombuffer(msg[4], dtype=np.int64)
+            dst_kv_indices = np.frombuffer(msg[4], dtype=np.int32)
             dst_aux_index = int(msg[5].decode("ascii"))
             is_dummy = False
         return cls(
@@ -148,18 +152,55 @@ class MooncakeKVManager(BaseKVManager):
         self.server_socket = zmq.Context().socket(zmq.PULL)
         self.register_buffer_to_engine()
         if self.disaggregation_mode == DisaggregationMode.PREFILL:
-            self.transfer_queue = queue.Queue()
             self.transfer_infos: Dict[int, Dict[str, TransferInfo]] = {}
             self.decode_kv_args_table: Dict[str, KVArgsRegisterInfo] = {}
             self.start_prefill_thread()
             self._register_to_bootstrap()
-
+            self.session_failures = defaultdict(int)
+            self.failed_sessions = set()
+            self.session_lock = threading.Lock()
             # Determine the number of threads to use for kv sender
             cpu_count = os.cpu_count()
-            self.executor = concurrent.futures.ThreadPoolExecutor(
-                min(cpu_count // 4, 16)
+            transfer_thread_pool_size = get_int_env_var(
+                "SGLANG_DISAGGREGATION_THREAD_POOL_SIZE",
+                min(max(4, int(0.75 * cpu_count) // 8), 12),
+            )
+            transfer_queue_size = get_int_env_var("SGLANG_DISAGGREGATION_QUEUE_SIZE", 4)
+            self.transfer_queues: List[FastQueue] = [
+                FastQueue() for _ in range(transfer_queue_size)
+            ]
+            assert transfer_thread_pool_size >= transfer_queue_size, (
+                f"The environment variable SGLANG_DISAGGREGATION_THREAD_POOL_SIZE={transfer_thread_pool_size} must be "
+                f"greater than or equal to SGLANG_DISAGGREGATION_QUEUE_SIZE={transfer_queue_size}."
+            )
+            self.executors = [
+                concurrent.futures.ThreadPoolExecutor(
+                    transfer_thread_pool_size // transfer_queue_size
+                )
+                for _ in range(transfer_queue_size)
+            ]
+            for queue, executor in zip(self.transfer_queues, self.executors):
+                threading.Thread(
+                    target=self.transfer_worker, args=(queue, executor), daemon=True
+                ).start()
+
+            self.bootstrap_time_out = get_int_env_var(
+                "SGLANG_DISAGGREGATION_BOOTSTRAP_TIMEOUT", 30
             )
         elif self.disaggregation_mode == DisaggregationMode.DECODE:
+            self.heartbeat_failures = {}
+            self.session_pool = defaultdict(requests.Session)
+            self.session_pool_lock = threading.Lock()
+            self.addr_to_rooms_tracker = defaultdict(set)
+            self.connection_lock = threading.Lock()
+            # Heartbeat interval should be at least 2 seconds
+            self.heartbeat_interval = max(
+                float(os.getenv("SGLANG_DISAGGREGATION_HEARTBEAT_INTERVAL", 5.0)), 2.0
+            )
+            # Heartbeat failure should be at least 1
+            self.max_failures = max(
+                get_int_env_var("SGLANG_DISAGGREGATION_HEARTBEAT_MAX_FAILURE", 2), 1
+            )
             self.start_decode_thread()
             self.connection_pool: Dict[str, Dict[str, Union[str, int]]] = {}
             self.prefill_tp_size_table: Dict[str, int] = {}
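
To make the new sizing logic concrete (a worked example of the expressions in the hunk above, not extra behavior from the diff): with the default SGLANG_DISAGGREGATION_QUEUE_SIZE of 4, an 8- or 32-core host gets min(max(4, int(0.75 * cpu_count) // 8), 12) = 4 sender threads, i.e. one per queue's executor, while a 128-core host gets 12 threads, i.e. three per queue:

    def default_pool_size(cpu_count: int) -> int:
        # Mirrors the default expression in the hunk above.
        return min(max(4, int(0.75 * cpu_count) // 8), 12)

    for cores in (8, 32, 128):
        pool = default_pool_size(cores)
        print(cores, pool, pool // 4)  # cores, pool size, threads per queue
    # 8   -> 4 threads, 1 per queue
    # 32  -> 4 threads, 1 per queue
    # 128 -> 12 threads, 3 per queue

The assert guarantees every FastQueue is served by at least one executor thread.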
@@ -169,6 +210,9 @@ class MooncakeKVManager(BaseKVManager):
                 f"Unsupported DisaggregationMode: {self.disaggregation_mode}"
             )
 
+        self.failure_records: Dict[int, str] = {}
+        self.failure_lock = threading.Lock()
+
     def register_buffer_to_engine(self):
         for kv_data_ptr, kv_data_len in zip(
             self.kv_args.kv_data_ptrs, self.kv_args.kv_data_lens
@@ -189,9 +233,10 @@
     def send_kvcache(
         self,
        mooncake_session_id: str,
-        prefill_kv_indices: npt.NDArray[np.int64],
+        prefill_kv_indices: npt.NDArray[np.int32],
         dst_kv_ptrs: list[int],
-        dst_kv_indices: npt.NDArray[np.int64],
+        dst_kv_indices: npt.NDArray[np.int32],
+        executor: concurrent.futures.ThreadPoolExecutor,
     ):
         # Group by indices
         prefill_kv_blocks, dst_kv_blocks = group_concurrent_contiguous(
@@ -223,7 +268,7 @@
                 return 0
 
         futures = [
-            self.executor.submit(
+            executor.submit(
                 process_layer,
                 src_ptr,
                 dst_ptr,
@@ -235,8 +280,6 @@
         for future in concurrent.futures.as_completed(futures):
            status = future.result()
            if status != 0:
-                # Immediate shutdown on first error (existing tasks will finish)
-                self.executor.shutdown(wait=False)
                 for f in futures:
                     f.cancel()
                 return status
@@ -255,23 +298,138 @@ class MooncakeKVManager(BaseKVManager):
             self.kv_args.aux_data_ptrs[0] + prefill_aux_index * aux_item_len
         )
         decode_aux_addr = dst_aux_ptrs[0] + dst_aux_index * aux_item_len
-        # TODO: mooncake transfer engine can do async transfer. Do async later
-        # Not sure about the amount of aux data, maybe transfer it by zmq is more effective
         status = self.engine.transfer_sync(
             mooncake_session_id, prefill_aux_addr, decode_aux_addr, aux_item_len
         )
         return status
 
-    def sync_status_to_decode_endpoint(self, remote: str, dst_port: int, room: int):
+    def sync_status_to_decode_endpoint(
+        self, remote: str, dst_port: int, room: int, status: int
+    ):
         if ":" in remote:
             remote = remote.split(":")[0]
         self._connect("tcp://" + remote + ":" + str(dst_port)).send_multipart(
             [
                 str(room).encode("ascii"),
-                str(self.check_status(room)).encode("ascii"),
+                str(status).encode("ascii"),
             ]
         )
 
+    def transfer_worker(
+        self, queue: FastQueue, executor: concurrent.futures.ThreadPoolExecutor
+    ):
+        while True:
+            try:
+                kv_chunk: TransferKVChunk = queue.get()
+                reqs_to_be_processed = (
+                    self.transfer_infos[kv_chunk.room].values()
+                    if kv_chunk.room in self.transfer_infos
+                    else []
+                )
+                polls = []
+                dst_ranks_infos = []
+                for req in reqs_to_be_processed:
+                    if not req.is_dummy:
+                        # Early exit if the request has failed
+                        with self.session_lock:
+                            if req.mooncake_session_id in self.failed_sessions:
+                                self.record_failure(
+                                    kv_chunk.room,
+                                    f"Decode instance could be dead, remote mooncake session {req.mooncake_session_id} is not alive",
+                                )
+                                self.update_status(kv_chunk.room, KVPoll.Failed)
+                                self.sync_status_to_decode_endpoint(
+                                    req.endpoint,
+                                    req.dst_port,
+                                    req.room,
+                                    KVPoll.Failed,
+                                )
+                                break
+
+                        chunked_dst_kv_indice = req.dst_kv_indices[kv_chunk.index_slice]
+
+                        # NOTE: This is temporarily a workaround to deal with the case where the prefill_kv_indices
+                        # is mismatched with the dst_kv_indices when page size > 1, this should never happen.
+                        if len(chunked_dst_kv_indice) < len(
+                            kv_chunk.prefill_kv_indices
+                        ):
+                            kv_chunk.prefill_kv_indices = kv_chunk.prefill_kv_indices[
+                                : len(chunked_dst_kv_indice)
+                            ]
+                            logger.warning(
+                                f"len(chunked_dst_kv_indice) = {len(chunked_dst_kv_indice)}, len(kv_chunk.prefill_kv_indices) = {len(kv_chunk.prefill_kv_indices)}"
+                            )
+
+                        ret = self.send_kvcache(
+                            req.mooncake_session_id,
+                            kv_chunk.prefill_kv_indices,
+                            self.decode_kv_args_table[
+                                req.mooncake_session_id
+                            ].dst_kv_ptrs,
+                            chunked_dst_kv_indice,
+                            executor,
+                        )
+                        if ret != 0:
+                            with self.session_lock:
+                                self.session_failures[req.mooncake_session_id] += 1
+                                # Failures should never happen if the session is not dead, if the session fails once, mark it as failed
+                                if self.session_failures[req.mooncake_session_id] >= 1:
+                                    self.failed_sessions.add(req.mooncake_session_id)
+                                    logger.error(
+                                        f"Session {req.mooncake_session_id} failed."
+                                    )
+                            self.record_failure(
+                                kv_chunk.room,
+                                f"Failed to send kv chunk of {kv_chunk.room} to {req.endpoint}:{req.dst_port}",
+                            )
+                            self.update_status(kv_chunk.room, KVPoll.Failed)
+                            self.sync_status_to_decode_endpoint(
+                                req.endpoint, req.dst_port, req.room, KVPoll.Failed
+                            )
+                            break
+
+                        if kv_chunk.is_last:
+                            # Only the last chunk we need to send the aux data
+                            ret = self.send_aux(
+                                req.mooncake_session_id,
+                                kv_chunk.prefill_aux_index,
+                                self.decode_kv_args_table[
+                                    req.mooncake_session_id
+                                ].dst_aux_ptrs,
+                                req.dst_aux_index,
+                            )
+                            polls.append(True if ret == 0 else False)
+                            dst_ranks_infos.append(
+                                (req.endpoint, req.dst_port, req.room)
+                            )
+
+                            # Only sync status when all the dst ranks have received the kvcache
+                            if len(polls) == req.required_dst_info_num:
+                                status = KVPoll.Success if all(polls) else KVPoll.Failed
+                                self.update_status(req.room, status)
+                                for endpoint, dst_port, room in dst_ranks_infos:
+                                    self.sync_status_to_decode_endpoint(
+                                        endpoint, dst_port, room, status
+                                    )
+                    else:
+                        # Dummy request means the decode instance is not used, so its status can be marked as success directly
+                        # Dummy request does not need to sync status to decode endpoint
+                        if kv_chunk.is_last and req.room in self.request_status:
+                            self.update_status(req.room, KVPoll.Success)
+
+                if (
+                    kv_chunk.room not in self.request_status
+                    or self.check_status(kv_chunk.room) == KVPoll.Success
+                ):
+                    if kv_chunk.room in self.transfer_infos:
+                        self.transfer_infos.pop(kv_chunk.room)
+
+            except Exception as e:
+                # NOTE(shangming): Remove this when we make sure the transfer thread is bug-free
+                raise RuntimeError(
+                    f"Transfer thread failed because of {e}. Prefill instance with bootstrap_port={self.bootstrap_port} is dead."
+                )
+
     def start_prefill_thread(self):
         self.rank_port = get_free_port()
         self.server_socket.bind(f"tcp://{get_local_ip_by_remote()}:{self.rank_port}")
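
FastQueue itself is defined in the new sglang/srt/disaggregation/common/utils.py (+42 lines, entry 25 above) and its body is not part of this diff. The worker loop only relies on a blocking get() with no timeout, replacing the old queue.Queue.get(timeout=0.01)/queue.Empty polling pattern removed further down. A minimal sketch of what such a queue could look like, assuming a deque guarded by a condition variable (hypothetical, not the actual implementation):

    import threading
    from collections import deque

    class FastQueue:
        # Hypothetical sketch; the real class lives in common/utils.py.
        def __init__(self):
            self._buf = deque()
            self._cond = threading.Condition()

        def put(self, item):
            with self._cond:
                self._buf.append(item)
                self._cond.notify()  # wake one blocked consumer

        def get(self):
            with self._cond:
                while not self._buf:  # block until an item arrives, no busy-wait
                    self._cond.wait()
                return self._buf.popleft()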
@@ -287,6 +445,11 @@ class MooncakeKVManager(BaseKVManager):
                     self.decode_kv_args_table[mooncake_session_id] = (
                         KVArgsRegisterInfo.from_zmq(waiting_req_bytes)
                     )
+                    with self.session_lock:
+                        if mooncake_session_id in self.failed_sessions:
+                            self.failed_sessions.remove(mooncake_session_id)
+                        if mooncake_session_id in self.session_failures:
+                            del self.session_failures[mooncake_session_id]
                     logger.debug(
                         f"Register KVArgs from {mooncake_session_id} successfully"
                     )
@@ -304,77 +467,7 @@ class MooncakeKVManager(BaseKVManager):
                     if len(self.transfer_infos[room]) == required_dst_info_num:
                         self.update_status(room, KVPoll.WaitingForInput)
 
-        def transfer_thread():
-            # TODO: Shall we use KVPoll.Transferring state?
-            while True:
-                try:
-                    kv_chunk: TransferKVChunk = self.transfer_queue.get(timeout=0.01)
-                    reqs_to_be_processed = self.transfer_infos[kv_chunk.room].values()
-                    polls = []
-                    dst_ranks_infos = []
-                    for req in reqs_to_be_processed:
-                        if not req.is_dummy:
-                            chunked_dst_kv_indice = req.dst_kv_indices[
-                                kv_chunk.index_slice
-                            ]
-                            assert len(chunked_dst_kv_indice) == len(
-                                kv_chunk.prefill_kv_indices
-                            ), f"len(chunked_dst_kv_indice) = {len(chunked_dst_kv_indice)}, len(kv_chunk.prefill_kv_indices) = {len(kv_chunk.prefill_kv_indices)}"
-
-                            ret = self.send_kvcache(
-                                req.mooncake_session_id,
-                                kv_chunk.prefill_kv_indices,
-                                self.decode_kv_args_table[
-                                    req.mooncake_session_id
-                                ].dst_kv_ptrs,
-                                chunked_dst_kv_indice,
-                            )
-                            if ret != 0:
-                                self.update_status(kv_chunk.room, KVPoll.Failed)
-                                self.sync_status_to_decode_endpoint(
-                                    req.endpoint, req.dst_port, req.room
-                                )
-                                continue
-
-                            if kv_chunk.is_last:
-                                # Only the last chunk we need to send the aux data
-                                ret = self.send_aux(
-                                    req.mooncake_session_id,
-                                    kv_chunk.prefill_aux_index,
-                                    self.decode_kv_args_table[
-                                        req.mooncake_session_id
-                                    ].dst_aux_ptrs,
-                                    req.dst_aux_index,
-                                )
-                                polls.append(True if ret == 0 else False)
-                                dst_ranks_infos.append(
-                                    (req.endpoint, req.dst_port, req.room)
-                                )
-
-                                # Only sync status when all the dst ranks have received the kvcache
-                                if len(polls) == req.required_dst_info_num:
-                                    self.update_status(
-                                        req.room,
-                                        KVPoll.Success if all(polls) else KVPoll.Failed,
-                                    )
-                                    for endpoint, dst_port, room in dst_ranks_infos:
-                                        self.sync_status_to_decode_endpoint(
-                                            endpoint, dst_port, room
-                                        )
-                        else:
-                            # Dummy request means the decode instance is not used, so its status can be marked as success directly
-                            # Dummy request does not need to sync status to decode endpoint
-                            if kv_chunk.is_last:
-                                self.update_status(req.room, KVPoll.Success)
-
-                    if self.check_status(kv_chunk.room) == KVPoll.Success:
-                        self.transfer_infos.pop(kv_chunk.room)
-
-                except queue.Empty:
-                    continue
-
         threading.Thread(target=bootstrap_thread).start()
-        threading.Thread(target=transfer_thread).start()
 
     def start_decode_thread(self):
         self.rank_port = get_free_port()
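The removed `transfer_thread` drained a single `queue.Queue` with a 10 ms timeout and swallowed `queue.Empty`, i.e. a busy poll that wakes roughly 100 times per second even when idle. A blocking `get()` (or the sharded queues this diff introduces) avoids that churn; a minimal contrast of the two shapes:

    import queue
    import threading

    q: "queue.Queue[int]" = queue.Queue()

    def busy_poll_worker() -> None:
        # Shape of the removed code: wake up every 10 ms even with no work.
        while True:
            try:
                item = q.get(timeout=0.01)
            except queue.Empty:
                continue
            print("processed", item)

    def blocking_worker() -> None:
        # Blocking get(): the thread sleeps until an item actually arrives.
        while True:
            item = q.get()
            print("processed", item)

    threading.Thread(target=blocking_worker, daemon=True).start()
    q.put(42)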
@@ -385,14 +478,74 @@ class MooncakeKVManager(BaseKVManager):
                 (bootstrap_room, status) = self.server_socket.recv_multipart()
                 status = int(status.decode("ascii"))
                 bootstrap_room = int(bootstrap_room.decode("ascii"))
+                if status == KVPoll.Failed:
+                    self.record_failure(
+                        bootstrap_room,
+                        f"Failed to get kvcache from prefill instance, it might be dead",
+                    )
                 self.update_status(bootstrap_room, status)
 
+        def heartbeat_checker():
+            while True:
+                time.sleep(self.heartbeat_interval)
+                with self.connection_lock:
+                    addresses = list(self.prefill_dp_size_table.keys())
+
+                for bootstrap_addr in addresses:
+                    session = None
+                    try:
+                        with self.session_pool_lock:
+                            session = self.session_pool[bootstrap_addr]
+                        response = session.get(
+                            f"http://{bootstrap_addr}/health",
+                            timeout=(2, 3),
+                            headers={"Connection": "keep-alive"},
+                        )
+                        if response.status_code == 200:
+                            self.heartbeat_failures[bootstrap_addr] = 0
+
+                            current_rooms = self.addr_to_rooms_tracker[
+                                bootstrap_addr
+                            ].copy()
+
+                            for bootstrap_room in current_rooms:
+                                # Remove KVPoll.Success requests from the tracker
+                                if bootstrap_room not in self.request_status:
+                                    self.addr_to_rooms_tracker[bootstrap_addr].discard(
+                                        bootstrap_room
+                                    )
+                        else:
+                            logger.info(
+                                f"Attempting to reconnect to {bootstrap_addr}..."
+                            )
+                            self.heartbeat_failures[bootstrap_addr] = (
+                                self.heartbeat_failures.get(bootstrap_addr, 0) + 1
+                            )
+                            with self.session_pool_lock:
+                                if bootstrap_addr in self.session_pool:
+                                    del self.session_pool[bootstrap_addr]
+                    except Exception:
+                        logger.info(f"Attempting to reconnect to {bootstrap_addr}...")
+                        self.heartbeat_failures[bootstrap_addr] = (
+                            self.heartbeat_failures.get(bootstrap_addr, 0) + 1
+                        )
+
+                    if (
+                        self.heartbeat_failures.get(bootstrap_addr, 0)
+                        >= self.max_failures
+                    ):
+                        self._handle_node_failure(bootstrap_addr)
+                        with self.session_pool_lock:
+                            if bootstrap_addr in self.session_pool:
+                                del self.session_pool[bootstrap_addr]
+
         threading.Thread(target=decode_thread).start()
+        threading.Thread(target=heartbeat_checker).start()
 
     def add_transfer_request(
         self,
         bootstrap_room: int,
-        kv_indices: npt.NDArray[np.int64],
+        kv_indices: npt.NDArray[np.int32],
         index_slice: slice,
         is_last: bool,
         aux_index: Optional[int] = None,
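The `heartbeat_checker` added above probes each prefill bootstrap server's `/health` route, resets the failure counter on HTTP 200, prunes finished rooms from the tracker, and escalates to `_handle_node_failure` after `max_failures` consecutive misses. A trimmed, self-contained version of that loop (the URL shape and counters follow the diff; `on_dead` is a hypothetical callback and `addresses` is assumed to be a set):

    import time
    import requests

    def heartbeat_loop(addresses: set, on_dead, interval: float = 5.0,
                       max_failures: int = 2) -> None:
        """Probe each address's /health route; escalate after repeated misses."""
        failures: dict = {}
        while True:
            time.sleep(interval)
            for addr in list(addresses):
                try:
                    resp = requests.get(f"http://{addr}/health", timeout=(2, 3))
                    ok = resp.status_code == 200
                except requests.RequestException:
                    ok = False
                failures[addr] = 0 if ok else failures.get(addr, 0) + 1
                if failures[addr] >= max_failures:
                    on_dead(addr)  # mirrors _handle_node_failure in the diff
                    addresses.discard(addr)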
@@ -400,7 +553,29 @@ class MooncakeKVManager(BaseKVManager):
         assert self.disaggregation_mode == DisaggregationMode.PREFILL
         assert not is_last or (is_last and aux_index is not None)
 
-        self.transfer_queue.put(
+        if (
+            bootstrap_room not in self.request_status
+            or self.check_status(bootstrap_room) == KVPoll.Failed
+        ):
+            logger.debug(
+                "Request with bootstrap_room=%s already failed", bootstrap_room
+            )
+            return
+
+        if bootstrap_room not in self.transfer_infos:
+            # This means that the current rank is a dummy rank for this request,
+            # and it has already been marked as success, so there is no need to
+            # add further chunks into the transfer queue.
+            return
+
+        # NOTE(shangming): sharding according to the dst_infos to make sure
+        # requests with the same dst_sessions will be added into the same
+        # queue, which enables early abort with failed sessions.
+        dst_infos = self.transfer_infos[bootstrap_room].keys()
+        session_port_sum = sum(int(session.split(":")[1]) for session in dst_infos)
+        shard_idx = session_port_sum % len(self.transfer_queues)
+
+        self.transfer_queues[shard_idx].put(
             TransferKVChunk(
                 room=bootstrap_room,
                 prefill_kv_indices=kv_indices,
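The sharding comment above encodes the key invariant: every chunk of a given request must land on the same transfer queue, so the shard key is derived from the request's destination sessions (which are stable across chunks) rather than from the chunk itself. A toy version of the port-sum hash:

    def pick_shard(dst_sessions, num_queues: int) -> int:
        """All chunks of one request share dst_sessions, hence one queue."""
        port_sum = sum(int(session.split(":")[1]) for session in dst_sessions)
        return port_sum % num_queues

    # Example with two destination sessions and four transfer queues.
    sessions = ["10.0.0.1:7001", "10.0.0.2:7005"]
    assert pick_shard(sessions, 4) == (7001 + 7005) % 4

A side effect of this choice is that all requests bound for the same destination sessions serialize behind one queue, which is exactly what lets a failed session abort everything queued behind it early.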
@@ -409,7 +584,6 @@ class MooncakeKVManager(BaseKVManager):
                 prefill_aux_index=aux_index,
             )
         )
-        self.update_status(bootstrap_room, KVPoll.WaitingForInput)
 
     def check_status(self, bootstrap_room: int):
         return self.request_status[bootstrap_room]
@@ -418,10 +592,17 @@ class MooncakeKVManager(BaseKVManager):
         if bootstrap_room not in self.request_status:
             self.request_status[bootstrap_room] = status
         else:
-            # NOTE: The prefill engine could recv bootstrapping first
-            self.request_status[bootstrap_room] = max(
-                self.request_status[bootstrap_room], status
-            )
+            # NOTE: status is only allowed to be incremented unless it is KVPoll.Failed
+            if status == KVPoll.Failed:
+                self.request_status[bootstrap_room] = KVPoll.Failed
+            else:
+                self.request_status[bootstrap_room] = max(
+                    self.request_status[bootstrap_room], status
+                )
+
+    def record_failure(self, bootstrap_room: int, failure_reason: str):
+        with self.failure_lock:
+            self.failure_records[bootstrap_room] = failure_reason
 
     def get_session_id(self):
         return self.engine.get_session_id()
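The rewritten `update_status` keeps the old monotonic rule (`max`, so a late `Bootstrapping` message cannot move a room backwards) but lets an incoming `KVPoll.Failed` bypass that rule and force the room into the failed state immediately. A compact illustration, again with illustrative enum values:

    from enum import IntEnum

    class KVPoll(IntEnum):  # illustrative values, as in the earlier sketch
        Failed = 0
        Bootstrapping = 1
        WaitingForInput = 2
        Transferring = 3
        Success = 4

    request_status: dict = {}

    def update_status(room: int, status: KVPoll) -> None:
        if room not in request_status:
            request_status[room] = status
        elif status == KVPoll.Failed:
            request_status[room] = KVPoll.Failed  # failure overrides max()
        else:
            request_status[room] = max(request_status[room], status)

    update_status(1, KVPoll.WaitingForInput)
    update_status(1, KVPoll.Bootstrapping)   # ignored: would move backwards
    assert request_status[1] == KVPoll.WaitingForInput
    update_status(1, KVPoll.Failed)          # takes effect immediately
    assert request_status[1] == KVPoll.Failed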
@@ -445,38 +626,82 @@ class MooncakeKVManager(BaseKVManager):
         }
 
         try:
-            response = requests.put(url, json=payload)
+            response = requests.put(url, json=payload, timeout=5)
             if response.status_code == 200:
                 logger.debug("Prefill successfully registered to bootstrap server.")
             else:
                 logger.error(
-                    f"Prefill Failed to connect to bootstrap server: {response.status_code}, {response.text}"
+                    f"Prefill instance failed to connect to bootstrap server: {response.status_code}, {response.text}"
                 )
         except Exception as e:
-            logger.error(f"Prefill Failed to register to bootstrap server: {e}")
+            logger.error(
+                f"Prefill instance failed to register to bootstrap server: {e}"
+            )
+
+    def _handle_node_failure(self, failed_bootstrap_addr):
+        with self.connection_lock:
+            keys_to_remove = [
+                k for k in self.connection_pool if k.startswith(failed_bootstrap_addr)
+            ]
+            for k in keys_to_remove:
+                del self.connection_pool[k]
+            if failed_bootstrap_addr in self.prefill_tp_size_table:
+                del self.prefill_tp_size_table[failed_bootstrap_addr]
+            if failed_bootstrap_addr in self.prefill_dp_size_table:
+                del self.prefill_dp_size_table[failed_bootstrap_addr]
+
+            possible_affected_rooms = self.addr_to_rooms_tracker.get(
+                failed_bootstrap_addr, []
+            )
+            if failed_bootstrap_addr in self.addr_to_rooms_tracker:
+                del self.addr_to_rooms_tracker[failed_bootstrap_addr]
+
+        # Report the requests associated with the failed bootstrap addr and mark their status as KVPoll.Failed
+        affected_rooms = []
+        for room in possible_affected_rooms:
+            if (
+                room in self.request_status
+                and self.check_status(room) != KVPoll.Success
+            ):
+                self.record_failure(
+                    room,
+                    f"Losing connection with prefill instance (bootstrap_addr: {failed_bootstrap_addr})",
+                )
+                self.update_status(room, KVPoll.Failed)
+                affected_rooms.append(room)
+        logger.error(
+            f"Losing connection with prefill instance (bootstrap_addr: {failed_bootstrap_addr}), affected {len(affected_rooms)} requests"
+        )
 
 
 class MooncakeKVSender(BaseKVSender):
 
     def __init__(
-        self, mgr: MooncakeKVManager, bootstrap_addr: str, bootstrap_room: int
+        self,
+        mgr: MooncakeKVManager,
+        bootstrap_addr: str,
+        bootstrap_room: int,
+        dest_tp_ranks: List[int],
+        pp_rank: int,
     ):
         self.kv_mgr = mgr
         self.bootstrap_room = bootstrap_room
         self.kv_mgr.update_status(bootstrap_room, KVPoll.Bootstrapping)
         self.aux_index = None
         self.bootstrap_server_url = bootstrap_addr
-        self.session_id = self.kv_mgr.get_session_id()
+        self.conclude_state = None
+        self.init_time = None
         # inner state
         self.curr_idx = 0
 
     def init(self, num_kv_indices: int, aux_index: Optional[int] = None):
         self.num_kv_indices = num_kv_indices
         self.aux_index = aux_index
+        self.init_time = time.time()
 
     def send(
         self,
-        kv_indices: npt.NDArray[np.int64],
+        kv_indices: npt.NDArray[np.int32],
     ):
         index_slice = slice(self.curr_idx, self.curr_idx + len(kv_indices))
         self.curr_idx += len(kv_indices)
@@ -496,11 +721,42 @@ class MooncakeKVSender(BaseKVSender):
         )
 
     def poll(self) -> KVPoll:
-        return self.kv_mgr.check_status(self.bootstrap_room)
+        if self.conclude_state is None:
+            status = self.kv_mgr.check_status(self.bootstrap_room)
+            if status in (KVPoll.Success, KVPoll.Failed):
+                self.conclude_state = status
+            elif status == KVPoll.Bootstrapping:
+                if self.init_time is not None:
+                    now = time.time()
+                    elapsed = now - self.init_time
+                    if elapsed >= self.kv_mgr.bootstrap_time_out:
+                        self.kv_mgr.record_failure(
+                            self.bootstrap_room,
+                            f"Request {self.bootstrap_room} timed out after {elapsed:.1f}s in KVPoll.Bootstrapping",
+                        )
+                        self.conclude_state = KVPoll.Failed
+                        return KVPoll.Failed
+
+            return status
+        else:
+            return self.conclude_state
+
+    def clear(self) -> None:
+        if self.bootstrap_room in self.kv_mgr.request_status:
+            self.kv_mgr.request_status.pop(self.bootstrap_room)
 
     def failure_exception(self):
-        # TODO: raise a real exception
-        raise Exception("Fake KVSender Exception")
+        self.clear()
+
+        # Explicitly set the status to failure since this request has failed in another rank
+        if self.conclude_state is None:
+            self.conclude_state = KVPoll.Failed
+
+        with self.kv_mgr.failure_lock:
+            failure_reason = self.kv_mgr.failure_records.pop(
+                self.bootstrap_room, "Failed due to an unknown reason from another rank"
+            )
+        raise KVTransferError(self.bootstrap_room, failure_reason)
 
 
 class MooncakeKVReceiver(BaseKVReceiver):
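`failure_exception` now pops the human-readable reason recorded via `record_failure` and raises `KVTransferError` instead of the old placeholder exception. The class definition itself is not part of this file's hunks; a plausible minimal shape consistent with the call sites above (an assumption, not the verbatim sglang definition):

    class KVTransferError(Exception):
        """Carries the bootstrap room id plus the recorded failure reason."""

        def __init__(self, bootstrap_room: int, failure_reason: str):
            super().__init__(failure_reason)
            self.bootstrap_room = bootstrap_room
            self.failure_reason = failure_reason

    # Callers can surface which request died and why:
    try:
        raise KVTransferError(123, "prefill instance unreachable")
    except KVTransferError as e:
        print(e.bootstrap_room, e.failure_reason)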
@@ -514,22 +770,31 @@ class MooncakeKVReceiver(BaseKVReceiver):
         mgr: MooncakeKVManager,
         bootstrap_addr: str,
         bootstrap_room: Optional[int] = None,
+        data_parallel_rank: Optional[int] = None,
     ):
         self.bootstrap_room = bootstrap_room
         self.bootstrap_addr = bootstrap_addr
         self.kv_mgr = mgr
         self.session_id = self.kv_mgr.get_session_id()
-        self.kv_mgr.update_status(bootstrap_room, KVPoll.Bootstrapping)
+        self.kv_mgr.update_status(self.bootstrap_room, KVPoll.Bootstrapping)
+        self.conclude_state = None
+        self.data_parallel_rank = data_parallel_rank
 
         if self.bootstrap_addr not in self.kv_mgr.prefill_dp_size_table:
             self.prefill_tp_size, self.prefill_dp_size = (
-                self._get_prefill_dp_size_from_server()
+                self._get_prefill_parallel_info_from_server()
             )
             if self.prefill_tp_size is None or self.prefill_dp_size is None:
-                logger.error(
-                    f"Could not fetch prefill parallel info for bootstrap_addr: {self.bootstrap_addr}"
+                self.kv_mgr.record_failure(
+                    self.bootstrap_room,
+                    f"Could not fetch prefill parallel info from bootstrap_addr: {self.bootstrap_addr}",
                 )
+                self.kv_mgr.update_status(self.bootstrap_room, KVPoll.Failed)
+                return
             else:
+                logger.debug(
+                    f"Fetch prefill parallel info from [{self.bootstrap_addr}]: DP size:{self.prefill_dp_size}, TP size:{self.prefill_tp_size}"
+                )
                 self.kv_mgr.prefill_tp_size_table[self.bootstrap_addr] = (
                     self.prefill_tp_size
                 )
@@ -587,7 +852,11 @@ class MooncakeKVReceiver(BaseKVReceiver):
             self.target_tp_rank = self.target_tp_ranks[0]
             self.required_dst_info_num = 1
 
-        self.target_dp_group = bootstrap_room % self.prefill_dp_size
+        if self.data_parallel_rank is not None:
+            logger.debug(f"Targeting DP rank: {self.data_parallel_rank}")
+            self.target_dp_group = self.data_parallel_rank
+        else:
+            self.target_dp_group = bootstrap_room % self.prefill_dp_size
 
         # NOTE: key distinguished by bootstrap_addr, target_dp_group, and target_tp_rank
         bootstrap_key = (
@@ -607,32 +876,35 @@ class MooncakeKVReceiver(BaseKVReceiver):
                         target_tp_rank == self.target_tp_rank
                         or self.target_tp_rank is None
                     )
+                    logger.debug(
+                        f"Fetched bootstrap info: {bootstrap_info} for DP {self.target_dp_group} TP {target_tp_rank}"
+                    )
                     bootstrap_infos.append(bootstrap_info)
                 else:
-                    logger.error(
-                        f"Could not fetch bootstrap info for engine rank: {self.kv_mgr.kv_args.engine_rank} and target_dp_group: {self.target_dp_group}"
+                    self.kv_mgr.record_failure(
+                        self.bootstrap_room,
+                        f"Could not fetch bootstrap info for engine rank: {self.kv_mgr.kv_args.engine_rank} and target_dp_group: {self.target_dp_group}",
                     )
+                    self.kv_mgr.update_status(self.bootstrap_room, KVPoll.Failed)
+                    return
+
             self.bootstrap_infos = bootstrap_infos
+            self.kv_mgr.connection_pool[bootstrap_key] = self.bootstrap_infos
 
-            if len(self.bootstrap_infos) == 0:
-                logger.error(
-                    f"Could not fetch bootstrap info for engine rank: {self.kv_mgr.kv_args.engine_rank}"
-                )
-            else:
-                self.kv_mgr.connection_pool[bootstrap_key] = self.bootstrap_infos
-                # Register kv_args only once to prefill KVManager according to the info fetched from the bootstrap server
-                self._register_kv_args()
+            # Register kv_args only once to prefill KVManager according to the info fetched from the bootstrap server
+            self._register_kv_args()
         else:
             self.bootstrap_infos = self.kv_mgr.connection_pool[bootstrap_key]
 
         assert len(self.bootstrap_infos) > 0
-        self.kv_mgr.update_status(bootstrap_room, KVPoll.WaitingForInput)
+        self.kv_mgr.addr_to_rooms_tracker[self.bootstrap_addr].add(self.bootstrap_room)
+        self.kv_mgr.update_status(self.bootstrap_room, KVPoll.WaitingForInput)
 
     def _get_bootstrap_info_from_server(self, engine_rank, target_dp_group):
         """Fetch the bootstrap info from the bootstrap server."""
         try:
             url = f"http://{self.bootstrap_addr}/route?engine_rank={engine_rank}&target_dp_group={target_dp_group}"
-            response = requests.get(url)
+            response = requests.get(url, timeout=5)
             if response.status_code == 200:
                 bootstrap_info = response.json()
                 return bootstrap_info
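With the new `data_parallel_rank` argument, a receiver can pin a request to a specific prefill DP group; absent that, it falls back to spreading rooms across groups by modulo, as before. The selection logic in isolation:

    from typing import Optional

    def choose_dp_group(bootstrap_room: int, prefill_dp_size: int,
                        data_parallel_rank: Optional[int] = None) -> int:
        """Pin to an explicit DP rank when given, else spread by room id."""
        if data_parallel_rank is not None:
            return data_parallel_rank
        return bootstrap_room % prefill_dp_size

    assert choose_dp_group(12345, 4) == 12345 % 4                 # hashed placement
    assert choose_dp_group(12345, 4, data_parallel_rank=2) == 2   # pinned placement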
@@ -645,7 +917,7 @@ class MooncakeKVReceiver(BaseKVReceiver):
             logger.error(f"Error fetching prefill info from bootstrap: {e}")
             return None
 
-    def _get_prefill_dp_size_from_server(self) -> int:
+    def _get_prefill_parallel_info_from_server(self) -> Tuple[int, int]:
         """Fetch the prefill parallel info from the bootstrap server."""
         try:
             url = f"http://{self.bootstrap_addr}/route?engine_rank={-1}&target_dp_group={-1}"
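The rename also fixes the return contract: the helper is unpacked into two variables at its call site, so the error paths below must return a 2-tuple. The old single `None` would have raised a `TypeError` during unpacking rather than ever reaching the `is None` checks:

    def old_error_path():
        return None          # old behavior on error

    def new_error_path():
        return None, None    # new behavior on error

    tp, dp = new_error_path()        # fine: both are None
    try:
        tp, dp = old_error_path()    # TypeError: cannot unpack NoneType
    except TypeError as e:
        print(e)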
@@ -659,10 +931,10 @@ class MooncakeKVReceiver(BaseKVReceiver):
                 logger.error(
                     f"Failed to get prefill parallel info: {response.status_code}, {response.text}"
                 )
-                return None
+                return None, None
         except Exception as e:
             logger.error(f"Error fetching prefill parallel info from bootstrap: {e}")
-            return None
+            return None, None
 
     def _register_kv_args(self):
         for bootstrap_info in self.bootstrap_infos:
@@ -699,14 +971,11 @@ class MooncakeKVReceiver(BaseKVReceiver):
             cls._socket_locks[endpoint] = threading.Lock()
         return cls._socket_cache[endpoint], cls._socket_locks[endpoint]
 
-    def init(self, kv_indices: npt.NDArray[np.int64], aux_index: Optional[int] = None):
+    def init(self, kv_indices: npt.NDArray[np.int32], aux_index: Optional[int] = None):
         for bootstrap_info in self.bootstrap_infos:
             self.prefill_server_url = (
                 f"{bootstrap_info['rank_ip']}:{bootstrap_info['rank_port']}"
             )
-            logger.debug(
-                f"Fetched bootstrap info: {bootstrap_info} for engine rank: {self.kv_mgr.kv_args.engine_rank}"
-            )
             is_dummy = bootstrap_info["is_dummy"]
 
             sock, lock = self._connect("tcp://" + self.prefill_server_url)
@@ -724,11 +993,31 @@ class MooncakeKVReceiver(BaseKVReceiver):
             )
 
     def poll(self) -> KVPoll:
-        return self.kv_mgr.check_status(self.bootstrap_room)
+        if self.conclude_state is None:
+            status = self.kv_mgr.check_status(self.bootstrap_room)
+            if status in (KVPoll.Success, KVPoll.Failed):
+                self.conclude_state = status
+
+            return status
+        else:
+            return self.conclude_state
+
+    def clear(self) -> None:
+        if self.bootstrap_room in self.kv_mgr.request_status:
+            self.kv_mgr.request_status.pop(self.bootstrap_room)
 
     def failure_exception(self):
-        # TODO: raise a real exception
-        raise Exception("Fake KVReceiver Exception")
+        self.clear()
+
+        # Explicitly set the status to failure since this request has failed in another rank
+        if self.conclude_state is None:
+            self.conclude_state = KVPoll.Failed
+
+        with self.kv_mgr.failure_lock:
+            failure_reason = self.kv_mgr.failure_records.pop(
+                self.bootstrap_room, "Failed due to an unknown reason from another rank"
+            )
+        raise KVTransferError(self.bootstrap_room, failure_reason)
 
 
 class MooncakeKVBootstrapServer(BaseKVBootstrapServer):
@@ -752,6 +1041,10 @@ class MooncakeKVBootstrapServer(BaseKVBootstrapServer):
 
     def _setup_routes(self):
         self.app.router.add_route("*", "/route", self._handle_route)
+        self.app.router.add_get("/health", self._handle_health_check)
+
+    async def _handle_health_check(self, request):
+        return web.Response(text="OK", status=200)
 
     async def _handle_route(self, request: web.Request):
         method = request.method
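The `/health` route is the server half of the decode-side `heartbeat_checker`: it answers 200 OK whenever the bootstrap server's event loop is alive and serving. A self-contained aiohttp app exposing the same route (module-level sketch; the port is arbitrary and not taken from the diff):

    from aiohttp import web

    async def handle_health_check(request: web.Request) -> web.Response:
        # Pure liveness probe: reachable and serving implies healthy.
        return web.Response(text="OK", status=200)

    app = web.Application()
    app.router.add_get("/health", handle_health_check)

    if __name__ == "__main__":
        web.run_app(app, port=8998)  # hypothetical port for the sketch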
@@ -780,14 +1073,14 @@ class MooncakeKVBootstrapServer(BaseKVBootstrapServer):
         self.dp_size = dp_size
 
         tp_size_per_dp_rank = tp_size // dp_size
-        if self.tp_size_per_dp_rank == None:
+        if self.tp_size_per_dp_rank is None:
             self.tp_size_per_dp_rank = tp_size_per_dp_rank
 
-        # Add lock to make sure thread-safe
         if role == "Prefill":
             dp_group = engine_rank // tp_size_per_dp_rank
             tp_rank_in_dp_group = engine_rank % tp_size_per_dp_rank
 
+            # Add lock to make sure thread-safe
             async with self.lock:
                 if dp_group not in self.prefill_port_table:
                     self.prefill_port_table[dp_group] = {}
@@ -797,7 +1090,7 @@ class MooncakeKVBootstrapServer(BaseKVBootstrapServer):
                     "rank_port": rank_port,
                 }
             logger.debug(
-                f"Register Prefill bootstrap: {engine_rank} with rank_ip: {rank_ip} and rank_port: {rank_port}"
+                f"Register prefill bootstrap: {engine_rank} with rank_ip: {rank_ip} and rank_port: {rank_port}"
             )
 
         return web.Response(text="OK", status=200)
@@ -833,7 +1126,11 @@ class MooncakeKVBootstrapServer(BaseKVBootstrapServer):
         self._loop = asyncio.new_event_loop()
         asyncio.set_event_loop(self._loop)
 
-        self._runner = web.AppRunner(self.app)
+        access_log = None
+        if logging.getLogger(__name__).getEffectiveLevel() <= logging.DEBUG:
+            access_log = self.app.logger
+
+        self._runner = web.AppRunner(self.app, access_log=access_log)
         self._loop.run_until_complete(self._runner.setup())
 
         site = web.TCPSite(self._runner, port=self.port)
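The runner change above gates aiohttp's per-request access log behind the module's log level, which matters once `/health` is polled every few seconds by every decode instance. The same pattern in isolation (passing `access_log=None` disables access logging entirely):

    import logging
    from aiohttp import web

    app = web.Application()

    # Only pay for per-request access logging when actively debugging.
    level = logging.getLogger(__name__).getEffectiveLevel()
    access_log = app.logger if level <= logging.DEBUG else None
    runner = web.AppRunner(app, access_log=access_log)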