sglang 0.4.6.post5__py3-none-any.whl → 0.4.7__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (318):
  1. sglang/bench_offline_throughput.py +10 -4
  2. sglang/bench_one_batch_server.py +67 -11
  3. sglang/bench_serving.py +85 -74
  4. sglang/lang/backend/runtime_endpoint.py +24 -1
  5. sglang/profiler.py +167 -0
  6. sglang/srt/_custom_ops.py +34 -0
  7. sglang/srt/configs/internvl.py +8 -12
  8. sglang/srt/configs/model_config.py +27 -1
  9. sglang/srt/constrained/base_grammar_backend.py +5 -2
  10. sglang/srt/constrained/llguidance_backend.py +9 -8
  11. sglang/srt/constrained/outlines_backend.py +5 -4
  12. sglang/srt/constrained/xgrammar_backend.py +18 -18
  13. sglang/srt/conversation.py +46 -8
  14. sglang/srt/custom_op.py +38 -3
  15. sglang/srt/debug_utils.py +74 -0
  16. sglang/srt/disaggregation/common/__init__.py +1 -0
  17. sglang/srt/disaggregation/common/conn.py +407 -0
  18. sglang/srt/disaggregation/decode.py +67 -3
  19. sglang/srt/disaggregation/fake/conn.py +1 -0
  20. sglang/srt/disaggregation/kv_events.py +60 -5
  21. sglang/srt/disaggregation/launch_lb.py +140 -0
  22. sglang/srt/disaggregation/mini_lb.py +29 -48
  23. sglang/srt/disaggregation/mooncake/conn.py +432 -140
  24. sglang/srt/disaggregation/mooncake/transfer_engine.py +32 -16
  25. sglang/srt/disaggregation/nixl/conn.py +124 -432
  26. sglang/srt/disaggregation/prefill.py +2 -0
  27. sglang/srt/disaggregation/utils.py +38 -1
  28. sglang/srt/distributed/device_communicators/pymscclpp.py +315 -0
  29. sglang/srt/distributed/parallel_state.py +52 -5
  30. sglang/srt/entrypoints/EngineBase.py +6 -0
  31. sglang/srt/entrypoints/engine.py +102 -5
  32. sglang/srt/entrypoints/http_server.py +15 -2
  33. sglang/srt/function_call/base_format_detector.py +138 -86
  34. sglang/srt/function_call/deepseekv3_detector.py +54 -6
  35. sglang/srt/function_call/ebnf_composer.py +33 -19
  36. sglang/srt/function_call/function_call_parser.py +27 -0
  37. sglang/srt/function_call/llama32_detector.py +33 -14
  38. sglang/srt/function_call/mistral_detector.py +73 -26
  39. sglang/srt/function_call/pythonic_detector.py +86 -20
  40. sglang/srt/function_call/qwen25_detector.py +64 -10
  41. sglang/srt/function_call/utils.py +17 -0
  42. sglang/srt/hf_transformers_utils.py +4 -0
  43. sglang/srt/layers/attention/aiter_backend.py +488 -123
  44. sglang/srt/layers/attention/base_attn_backend.py +4 -0
  45. sglang/srt/layers/attention/cutlass_mla_backend.py +2 -19
  46. sglang/srt/layers/attention/flashattention_backend.py +103 -18
  47. sglang/srt/layers/attention/flashinfer_backend.py +45 -1
  48. sglang/srt/layers/attention/flashinfer_mla_backend.py +37 -1
  49. sglang/srt/layers/attention/intel_amx_backend.py +128 -0
  50. sglang/srt/layers/attention/tbo_backend.py +232 -0
  51. sglang/srt/layers/attention/torch_native_backend.py +3 -0
  52. sglang/srt/layers/attention/triton_backend.py +244 -5
  53. sglang/srt/layers/attention/triton_ops/extend_attention.py +12 -4
  54. sglang/srt/layers/communicator.py +260 -194
  55. sglang/srt/layers/dp_attention.py +6 -5
  56. sglang/srt/layers/layernorm.py +30 -19
  57. sglang/srt/layers/moe/cutlass_moe.py +170 -7
  58. sglang/srt/layers/moe/cutlass_moe_params.py +169 -0
  59. sglang/srt/layers/moe/ep_moe/kernels.py +27 -6
  60. sglang/srt/layers/moe/ep_moe/layer.py +94 -40
  61. sglang/srt/layers/moe/ep_moe/token_dispatcher.py +13 -8
  62. sglang/srt/layers/moe/fused_moe_native.py +4 -0
  63. sglang/srt/layers/moe/fused_moe_triton/configs/triton_3_2_0/E=257,N=128,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8,block_shape=[128, 128].json +146 -0
  64. sglang/srt/layers/moe/fused_moe_triton/configs/triton_3_2_0/E=257,N=128,device_name=NVIDIA_H20,dtype=fp8_w8a8,block_shape=[128, 128].json +146 -0
  65. sglang/srt/layers/moe/fused_moe_triton/configs/triton_3_2_0/E=257,N=256,device_name=NVIDIA_B200,dtype=fp8_w8a8,block_shape=[128, 128].json +146 -0
  66. sglang/srt/layers/moe/fused_moe_triton/configs/triton_3_2_0/E=257,N=256,device_name=NVIDIA_H20,dtype=fp8_w8a8,block_shape=[128, 128].json +146 -0
  67. sglang/srt/layers/moe/fused_moe_triton/configs/triton_3_2_0/E=257,N=256,device_name=NVIDIA_H200,dtype=fp8_w8a8,block_shape=[128, 128].json +146 -0
  68. sglang/srt/layers/moe/fused_moe_triton/configs/triton_3_2_0/E=8,N=7168,device_name=NVIDIA_H100_80GB_HBM3.json +146 -0
  69. sglang/srt/layers/moe/fused_moe_triton/configs/triton_3_3_1/E=257,N=256,device_name=NVIDIA_B200,dtype=fp8_w8a8,block_shape=[128, 128].json +146 -0
  70. sglang/srt/layers/moe/fused_moe_triton/configs/triton_3_3_1/E=8,N=7168,device_name=NVIDIA_H100_80GB_HBM3.json +146 -0
  71. sglang/srt/layers/moe/fused_moe_triton/fused_moe.py +220 -25
  72. sglang/srt/layers/moe/fused_moe_triton/layer.py +34 -4
  73. sglang/srt/layers/moe/topk.py +44 -18
  74. sglang/srt/layers/multimodal.py +3 -3
  75. sglang/srt/layers/quantization/__init__.py +3 -2
  76. sglang/srt/layers/quantization/blockwise_int8.py +3 -0
  77. sglang/srt/layers/quantization/compressed_tensors/compressed_tensors_moe.py +5 -0
  78. sglang/srt/layers/quantization/deep_gemm.py +55 -56
  79. sglang/srt/layers/quantization/fp8.py +28 -23
  80. sglang/srt/layers/quantization/fp8_kernel.py +118 -66
  81. sglang/srt/layers/quantization/fp8_utils.py +165 -49
  82. sglang/srt/layers/quantization/modelopt_quant.py +334 -7
  83. sglang/srt/layers/quantization/moe_wna16.py +3 -0
  84. sglang/srt/layers/quantization/w8a8_fp8.py +3 -0
  85. sglang/srt/layers/quantization/w8a8_int8.py +3 -0
  86. sglang/srt/layers/rotary_embedding.py +6 -12
  87. sglang/srt/layers/sampler.py +80 -79
  88. sglang/srt/layers/utils.py +6 -0
  89. sglang/srt/lora/layers.py +12 -15
  90. sglang/srt/lora/lora.py +49 -5
  91. sglang/srt/lora/lora_manager.py +19 -5
  92. sglang/srt/lora/mem_pool.py +24 -16
  93. sglang/srt/lora/utils.py +17 -13
  94. sglang/srt/managers/data_parallel_controller.py +13 -5
  95. sglang/srt/managers/eplb_algorithms/__init__.py +63 -0
  96. sglang/srt/managers/eplb_algorithms/deepseek.py +223 -0
  97. sglang/srt/managers/{deepseek_eplb.py → eplb_algorithms/deepseek_vec.py} +5 -7
  98. sglang/srt/managers/eplb_manager.py +55 -14
  99. sglang/srt/managers/expert_distribution.py +220 -46
  100. sglang/srt/managers/expert_location.py +110 -56
  101. sglang/srt/managers/expert_location_dispatch.py +23 -6
  102. sglang/srt/managers/io_struct.py +15 -4
  103. sglang/srt/managers/mm_utils.py +88 -38
  104. sglang/srt/managers/multimodal_processors/base_processor.py +188 -16
  105. sglang/srt/managers/multimodal_processors/gemma3.py +4 -31
  106. sglang/srt/managers/multimodal_processors/internvl.py +4 -0
  107. sglang/srt/managers/multimodal_processors/kimi_vl.py +15 -34
  108. sglang/srt/managers/multimodal_processors/minicpm.py +2 -1
  109. sglang/srt/managers/multimodal_processors/phi4mm.py +87 -0
  110. sglang/srt/managers/multimodal_processors/qwen_vl.py +22 -64
  111. sglang/srt/managers/schedule_batch.py +140 -38
  112. sglang/srt/managers/scheduler.py +305 -112
  113. sglang/srt/managers/tokenizer_manager.py +134 -17
  114. sglang/srt/managers/utils.py +0 -4
  115. sglang/srt/metrics/collector.py +9 -0
  116. sglang/srt/model_executor/cuda_graph_runner.py +72 -61
  117. sglang/srt/model_executor/expert_location_updater.py +157 -22
  118. sglang/srt/model_executor/forward_batch_info.py +38 -17
  119. sglang/srt/model_executor/model_runner.py +96 -56
  120. sglang/srt/model_loader/utils.py +67 -1
  121. sglang/srt/models/deepseek_nextn.py +1 -1
  122. sglang/srt/models/deepseek_v2.py +609 -234
  123. sglang/srt/models/gemma3_causal.py +7 -0
  124. sglang/srt/models/gemma3_mm.py +19 -14
  125. sglang/srt/models/idefics2.py +342 -0
  126. sglang/srt/models/kimi_vl.py +4 -4
  127. sglang/srt/models/llama.py +1 -1
  128. sglang/srt/models/minicpmo.py +2 -5
  129. sglang/srt/models/minicpmv.py +3 -295
  130. sglang/srt/models/phi4mm.py +512 -0
  131. sglang/srt/models/qwen2.py +38 -9
  132. sglang/srt/models/qwen2_5_vl.py +3 -9
  133. sglang/srt/models/qwen2_eagle.py +4 -1
  134. sglang/srt/models/qwen2_moe.py +58 -191
  135. sglang/srt/models/qwen2_vl.py +3 -9
  136. sglang/srt/models/qwen3.py +41 -10
  137. sglang/srt/models/qwen3_moe.py +230 -191
  138. sglang/srt/models/registry.py +9 -1
  139. sglang/srt/models/transformers.py +291 -0
  140. sglang/srt/openai_api/adapter.py +86 -24
  141. sglang/srt/openai_api/protocol.py +31 -2
  142. sglang/srt/openai_api/utils.py +172 -0
  143. sglang/srt/operations.py +37 -2
  144. sglang/srt/operations_strategy.py +200 -24
  145. sglang/srt/sampling/sampling_batch_info.py +13 -1
  146. sglang/srt/sampling/sampling_params.py +2 -1
  147. sglang/srt/server_args.py +114 -27
  148. sglang/srt/speculative/build_eagle_tree.py +8 -8
  149. sglang/srt/speculative/eagle_draft_cuda_graph_runner.py +8 -11
  150. sglang/srt/speculative/eagle_draft_extend_cuda_graph_runner.py +253 -0
  151. sglang/srt/speculative/eagle_utils.py +51 -91
  152. sglang/srt/speculative/eagle_worker.py +101 -21
  153. sglang/srt/two_batch_overlap.py +635 -0
  154. sglang/srt/utils.py +129 -7
  155. sglang/test/runners.py +16 -7
  156. sglang/test/send_one.py +4 -0
  157. sglang/test/test_cutlass_moe.py +3 -3
  158. sglang/test/test_fp4_moe.py +248 -0
  159. sglang/test/test_utils.py +79 -6
  160. sglang/version.py +1 -1
  161. {sglang-0.4.6.post5.dist-info → sglang-0.4.7.dist-info}/METADATA +14 -11
  162. {sglang-0.4.6.post5.dist-info → sglang-0.4.7.dist-info}/RECORD +318 -291
  163. {sglang-0.4.6.post5.dist-info → sglang-0.4.7.dist-info}/WHEEL +1 -1
  164. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=1,N=14336,device_name=NVIDIA_A100-SXM4-80GB,dtype=int8_w8a16.json → triton_3_1_0/E=1,N=14336,device_name=NVIDIA_A100-SXM4-80GB,dtype=int8_w8a16.json} +0 -0
  165. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=1,N=14336,device_name=NVIDIA_A100-SXM4-80GB.json → triton_3_1_0/E=1,N=14336,device_name=NVIDIA_A100-SXM4-80GB.json} +0 -0
  166. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=1,N=1792,device_name=NVIDIA_A100-SXM4-80GB,dtype=int8_w8a16.json → triton_3_1_0/E=1,N=1792,device_name=NVIDIA_A100-SXM4-80GB,dtype=int8_w8a16.json} +0 -0
  167. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=1,N=1792,device_name=NVIDIA_A100-SXM4-80GB.json → triton_3_1_0/E=1,N=1792,device_name=NVIDIA_A100-SXM4-80GB.json} +0 -0
  168. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=1,N=3072,device_name=NVIDIA_A100-SXM4-80GB,dtype=int8_w8a16.json → triton_3_1_0/E=1,N=3072,device_name=NVIDIA_A100-SXM4-80GB,dtype=int8_w8a16.json} +0 -0
  169. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=1,N=3072,device_name=NVIDIA_H100_80GB_HBM3,dtype=int8_w8a16.json → triton_3_1_0/E=1,N=3072,device_name=NVIDIA_H100_80GB_HBM3,dtype=int8_w8a16.json} +0 -0
  170. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=1,N=3072,device_name=NVIDIA_H100_80GB_HBM3.json → triton_3_1_0/E=1,N=3072,device_name=NVIDIA_H100_80GB_HBM3.json} +0 -0
  171. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=1,N=3584,device_name=NVIDIA_A100-SXM4-80GB,dtype=int8_w8a16.json → triton_3_1_0/E=1,N=3584,device_name=NVIDIA_A100-SXM4-80GB,dtype=int8_w8a16.json} +0 -0
  172. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=1,N=3584,device_name=NVIDIA_A100-SXM4-80GB.json → triton_3_1_0/E=1,N=3584,device_name=NVIDIA_A100-SXM4-80GB.json} +0 -0
  173. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=1,N=7168,device_name=NVIDIA_A100-SXM4-80GB,dtype=int8_w8a16.json → triton_3_1_0/E=1,N=7168,device_name=NVIDIA_A100-SXM4-80GB,dtype=int8_w8a16.json} +0 -0
  174. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=1,N=7168,device_name=NVIDIA_A100-SXM4-80GB.json → triton_3_1_0/E=1,N=7168,device_name=NVIDIA_A100-SXM4-80GB.json} +0 -0
  175. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=144,N=512,device_name=NVIDIA_H100_80GB_HBM3.json → triton_3_1_0/E=144,N=512,device_name=NVIDIA_H100_80GB_HBM3.json} +0 -0
  176. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=16,N=1024,device_name=NVIDIA_H100_80GB_HBM3.json → triton_3_1_0/E=16,N=1024,device_name=NVIDIA_H100_80GB_HBM3.json} +0 -0
  177. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=16,N=1024,device_name=NVIDIA_H200.json → triton_3_1_0/E=16,N=1024,device_name=NVIDIA_H200.json} +0 -0
  178. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=16,N=1344,device_name=NVIDIA_A100-SXM4-40GB.json → triton_3_1_0/E=16,N=1344,device_name=NVIDIA_A100-SXM4-40GB.json} +0 -0
  179. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=16,N=1344,device_name=NVIDIA_A100-SXM4-80GB.json → triton_3_1_0/E=16,N=1344,device_name=NVIDIA_A100-SXM4-80GB.json} +0 -0
  180. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=16,N=1344,device_name=NVIDIA_H100_80GB_HBM3.json → triton_3_1_0/E=16,N=1344,device_name=NVIDIA_H100_80GB_HBM3.json} +0 -0
  181. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=16,N=14336,device_name=NVIDIA_A100-SXM4-80GB,dtype=int8_w8a16.json → triton_3_1_0/E=16,N=14336,device_name=NVIDIA_A100-SXM4-80GB,dtype=int8_w8a16.json} +0 -0
  182. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=16,N=14336,device_name=NVIDIA_A100-SXM4-80GB.json → triton_3_1_0/E=16,N=14336,device_name=NVIDIA_A100-SXM4-80GB.json} +0 -0
  183. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=16,N=1792,device_name=NVIDIA_A100-SXM4-80GB,dtype=int8_w8a16.json → triton_3_1_0/E=16,N=1792,device_name=NVIDIA_A100-SXM4-80GB,dtype=int8_w8a16.json} +0 -0
  184. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=16,N=1792,device_name=NVIDIA_A100-SXM4-80GB.json → triton_3_1_0/E=16,N=1792,device_name=NVIDIA_A100-SXM4-80GB.json} +0 -0
  185. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=16,N=2048,device_name=NVIDIA_H100_80GB_HBM3.json → triton_3_1_0/E=16,N=2048,device_name=NVIDIA_H100_80GB_HBM3.json} +0 -0
  186. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=16,N=2688,device_name=NVIDIA_A100-SXM4-80GB.json → triton_3_1_0/E=16,N=2688,device_name=NVIDIA_A100-SXM4-80GB.json} +0 -0
  187. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=16,N=2688,device_name=NVIDIA_H100_80GB_HBM3.json → triton_3_1_0/E=16,N=2688,device_name=NVIDIA_H100_80GB_HBM3.json} +0 -0
  188. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=16,N=3072,device_name=NVIDIA_A100-SXM4-80GB,dtype=int8_w8a16.json → triton_3_1_0/E=16,N=3072,device_name=NVIDIA_A100-SXM4-80GB,dtype=int8_w8a16.json} +0 -0
  189. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=16,N=3072,device_name=NVIDIA_H100_80GB_HBM3,dtype=int8_w8a16.json → triton_3_1_0/E=16,N=3072,device_name=NVIDIA_H100_80GB_HBM3,dtype=int8_w8a16.json} +0 -0
  190. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=16,N=3200,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8.json → triton_3_1_0/E=16,N=3200,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8.json} +0 -0
  191. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=16,N=3584,device_name=NVIDIA_A100-SXM4-80GB,dtype=int8_w8a16.json → triton_3_1_0/E=16,N=3584,device_name=NVIDIA_A100-SXM4-80GB,dtype=int8_w8a16.json} +0 -0
  192. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=16,N=3584,device_name=NVIDIA_A100-SXM4-80GB.json → triton_3_1_0/E=16,N=3584,device_name=NVIDIA_A100-SXM4-80GB.json} +0 -0
  193. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=16,N=6400,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8.json → triton_3_1_0/E=16,N=6400,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8.json} +0 -0
  194. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=16,N=7168,device_name=NVIDIA_A100-SXM4-80GB,dtype=int8_w8a16.json → triton_3_1_0/E=16,N=7168,device_name=NVIDIA_A100-SXM4-80GB,dtype=int8_w8a16.json} +0 -0
  195. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=16,N=7168,device_name=NVIDIA_A100-SXM4-80GB.json → triton_3_1_0/E=16,N=7168,device_name=NVIDIA_A100-SXM4-80GB.json} +0 -0
  196. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=16,N=7168,device_name=NVIDIA_H100_80GB_HBM3,dtype=int8_w8a16.json → triton_3_1_0/E=16,N=7168,device_name=NVIDIA_H100_80GB_HBM3,dtype=int8_w8a16.json} +0 -0
  197. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=16,N=800,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8.json → triton_3_1_0/E=16,N=800,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8.json} +0 -0
  198. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=160,N=192,device_name=NVIDIA_A800-SXM4-80GB.json → triton_3_1_0/E=160,N=192,device_name=NVIDIA_A800-SXM4-80GB.json} +0 -0
  199. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=20,N=2048,device_name=NVIDIA_H100_80GB_HBM3.json → triton_3_1_0/E=20,N=2048,device_name=NVIDIA_H100_80GB_HBM3.json} +0 -0
  200. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=24,N=1024,device_name=NVIDIA_H100_80GB_HBM3.json → triton_3_1_0/E=24,N=1024,device_name=NVIDIA_H100_80GB_HBM3.json} +0 -0
  201. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=256,N=128,device_name=NVIDIA_A100-SXM4-80GB,dtype=int8_w8a8,block_shape=[128, 128].json → triton_3_1_0/E=256,N=128,device_name=NVIDIA_A100-SXM4-80GB,dtype=int8_w8a8,block_shape=[128, 128].json} +0 -0
  202. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=256,N=128,device_name=NVIDIA_A100-SXM4-80GB,dtype=int8_w8a8.json → triton_3_1_0/E=256,N=128,device_name=NVIDIA_A100-SXM4-80GB,dtype=int8_w8a8.json} +0 -0
  203. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=256,N=128,device_name=NVIDIA_A800-SXM4-80GB,dtype=int8_w8a8,block_shape=[128, 128].json → triton_3_1_0/E=256,N=128,device_name=NVIDIA_A800-SXM4-80GB,dtype=int8_w8a8,block_shape=[128, 128].json} +0 -0
  204. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=256,N=128,device_name=NVIDIA_A800-SXM4-80GB,dtype=int8_w8a8.json → triton_3_1_0/E=256,N=128,device_name=NVIDIA_A800-SXM4-80GB,dtype=int8_w8a8.json} +0 -0
  205. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=256,N=128,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8,block_shape=[128, 128].json → triton_3_1_0/E=256,N=128,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8,block_shape=[128, 128].json} +0 -0
  206. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=256,N=128,device_name=NVIDIA_H20,block_shape=[128, 128].json → triton_3_1_0/E=256,N=128,device_name=NVIDIA_H20,block_shape=[128, 128].json} +0 -0
  207. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=256,N=128,device_name=NVIDIA_H20,dtype=fp8_w8a8,block_shape=[128, 128].json → triton_3_1_0/E=256,N=128,device_name=NVIDIA_H20,dtype=fp8_w8a8,block_shape=[128, 128].json} +0 -0
  208. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=256,N=128,device_name=NVIDIA_L20Y,dtype=fp8_w8a8,block_shape=[128, 128].json → triton_3_1_0/E=256,N=128,device_name=NVIDIA_L20Y,dtype=fp8_w8a8,block_shape=[128, 128].json} +0 -0
  209. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=256,N=256,device_name=AMD_Instinct_MI300X,dtype=fp8_w8a8,block_shape=[128, 128].json → triton_3_1_0/E=256,N=256,device_name=AMD_Instinct_MI300X,dtype=fp8_w8a8,block_shape=[128, 128].json} +0 -0
  210. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=256,N=256,device_name=AMD_Instinct_MI325X,dtype=fp8_w8a8,block_shape=[128, 128].json → triton_3_1_0/E=256,N=256,device_name=AMD_Instinct_MI325X,dtype=fp8_w8a8,block_shape=[128, 128].json} +0 -0
  211. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=256,N=256,device_name=AMD_Radeon_Graphics,dtype=fp8_w8a8,block_shape=[128, 128].json → triton_3_1_0/E=256,N=256,device_name=AMD_Radeon_Graphics,dtype=fp8_w8a8,block_shape=[128, 128].json} +0 -0
  212. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=256,N=256,device_name=NVIDIA_B200,dtype=fp8_w8a8,block_shape=[128, 128].json → triton_3_1_0/E=256,N=256,device_name=NVIDIA_B200,dtype=fp8_w8a8,block_shape=[128, 128].json} +0 -0
  213. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=256,N=256,device_name=NVIDIA_H20,dtype=int8_w8a8,block_shape=[128, 128].json → triton_3_1_0/E=256,N=256,device_name=NVIDIA_H20,dtype=int8_w8a8,block_shape=[128, 128].json} +0 -0
  214. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=256,N=256,device_name=NVIDIA_H200,dtype=fp8_w8a8,block_shape=[128, 128].json → triton_3_1_0/E=256,N=256,device_name=NVIDIA_H200,dtype=fp8_w8a8,block_shape=[128, 128].json} +0 -0
  215. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=256,N=64,device_name=NVIDIA_A800-SXM4-80GB.json → triton_3_1_0/E=256,N=64,device_name=NVIDIA_A800-SXM4-80GB.json} +0 -0
  216. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=256,N=64,device_name=NVIDIA_L20,dtype=int8_w8a8.json → triton_3_1_0/E=256,N=64,device_name=NVIDIA_L20,dtype=int8_w8a8.json} +0 -0
  217. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=256,N=64,device_name=NVIDIA_L40S,dtype=int8_w8a8.json → triton_3_1_0/E=256,N=64,device_name=NVIDIA_L40S,dtype=int8_w8a8.json} +0 -0
  218. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=64,N=1024,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8,block_shape=[128, 128].json → triton_3_1_0/E=64,N=1024,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8,block_shape=[128, 128].json} +0 -0
  219. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=64,N=1280,device_name=NVIDIA_A100-SXM4-80GB.json → triton_3_1_0/E=64,N=1280,device_name=NVIDIA_A100-SXM4-80GB.json} +0 -0
  220. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=64,N=1280,device_name=NVIDIA_A800-SXM4-80GB.json → triton_3_1_0/E=64,N=1280,device_name=NVIDIA_A800-SXM4-80GB.json} +0 -0
  221. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=64,N=1280,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8.json → triton_3_1_0/E=64,N=1280,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8.json} +0 -0
  222. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=64,N=1280,device_name=NVIDIA_H100_80GB_HBM3.json → triton_3_1_0/E=64,N=1280,device_name=NVIDIA_H100_80GB_HBM3.json} +0 -0
  223. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=64,N=1280,device_name=NVIDIA_H200,dtype=fp8_w8a8.json → triton_3_1_0/E=64,N=1280,device_name=NVIDIA_H200,dtype=fp8_w8a8.json} +0 -0
  224. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=64,N=1280,device_name=NVIDIA_H200.json → triton_3_1_0/E=64,N=1280,device_name=NVIDIA_H200.json} +0 -0
  225. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=64,N=2560,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8.json → triton_3_1_0/E=64,N=2560,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8.json} +0 -0
  226. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=64,N=2560,device_name=NVIDIA_H200,dtype=fp8_w8a8.json → triton_3_1_0/E=64,N=2560,device_name=NVIDIA_H200,dtype=fp8_w8a8.json} +0 -0
  227. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=64,N=2560,device_name=NVIDIA_H200.json → triton_3_1_0/E=64,N=2560,device_name=NVIDIA_H200.json} +0 -0
  228. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=64,N=320,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8.json → triton_3_1_0/E=64,N=320,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8.json} +0 -0
  229. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=64,N=320,device_name=NVIDIA_H100_80GB_HBM3.json → triton_3_1_0/E=64,N=320,device_name=NVIDIA_H100_80GB_HBM3.json} +0 -0
  230. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=64,N=320,device_name=NVIDIA_H200,dtype=fp8_w8a8.json → triton_3_1_0/E=64,N=320,device_name=NVIDIA_H200,dtype=fp8_w8a8.json} +0 -0
  231. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=64,N=320,device_name=NVIDIA_H200.json → triton_3_1_0/E=64,N=320,device_name=NVIDIA_H200.json} +0 -0
  232. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=64,N=512,device_name=NVIDIA_H200,dtype=fp8_w8a8,block_shape=[128, 128].json → triton_3_1_0/E=64,N=512,device_name=NVIDIA_H200,dtype=fp8_w8a8,block_shape=[128, 128].json} +0 -0
  233. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=64,N=640,device_name=NVIDIA_A100-SXM4-80GB.json → triton_3_1_0/E=64,N=640,device_name=NVIDIA_A100-SXM4-80GB.json} +0 -0
  234. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=64,N=640,device_name=NVIDIA_A800-SXM4-80GB.json → triton_3_1_0/E=64,N=640,device_name=NVIDIA_A800-SXM4-80GB.json} +0 -0
  235. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=64,N=640,device_name=NVIDIA_GeForce_RTX_4090,dtype=fp8_w8a8.json → triton_3_1_0/E=64,N=640,device_name=NVIDIA_GeForce_RTX_4090,dtype=fp8_w8a8.json} +0 -0
  236. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=64,N=640,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8.json → triton_3_1_0/E=64,N=640,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8.json} +0 -0
  237. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=64,N=640,device_name=NVIDIA_H100_80GB_HBM3.json → triton_3_1_0/E=64,N=640,device_name=NVIDIA_H100_80GB_HBM3.json} +0 -0
  238. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=64,N=640,device_name=NVIDIA_H200,dtype=fp8_w8a8.json → triton_3_1_0/E=64,N=640,device_name=NVIDIA_H200,dtype=fp8_w8a8.json} +0 -0
  239. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=64,N=640,device_name=NVIDIA_H200.json → triton_3_1_0/E=64,N=640,device_name=NVIDIA_H200.json} +0 -0
  240. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=8,N=14336,device_name=AMD_Instinct_MI300X.json → triton_3_1_0/E=8,N=14336,device_name=AMD_Instinct_MI300X.json} +0 -0
  241. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=8,N=14336,device_name=AMD_Instinct_MI325X.json → triton_3_1_0/E=8,N=14336,device_name=AMD_Instinct_MI325X.json} +0 -0
  242. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=8,N=14336,device_name=AMD_Radeon_Graphics.json → triton_3_1_0/E=8,N=14336,device_name=AMD_Radeon_Graphics.json} +0 -0
  243. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=8,N=14336,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8.json → triton_3_1_0/E=8,N=14336,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8.json} +0 -0
  244. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=8,N=14336,device_name=NVIDIA_H200,dtype=fp8_w8a8.json → triton_3_1_0/E=8,N=14336,device_name=NVIDIA_H200,dtype=fp8_w8a8.json} +0 -0
  245. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=8,N=14336,device_name=NVIDIA_H200.json → triton_3_1_0/E=8,N=14336,device_name=NVIDIA_H200.json} +0 -0
  246. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=8,N=1792,device_name=AMD_Instinct_MI300X.json → triton_3_1_0/E=8,N=1792,device_name=AMD_Instinct_MI300X.json} +0 -0
  247. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=8,N=1792,device_name=AMD_Instinct_MI325X.json → triton_3_1_0/E=8,N=1792,device_name=AMD_Instinct_MI325X.json} +0 -0
  248. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=8,N=1792,device_name=AMD_Radeon_Graphics.json → triton_3_1_0/E=8,N=1792,device_name=AMD_Radeon_Graphics.json} +0 -0
  249. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=8,N=1792,device_name=NVIDIA_A100-SXM4-40GB.json → triton_3_1_0/E=8,N=1792,device_name=NVIDIA_A100-SXM4-40GB.json} +0 -0
  250. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=8,N=1792,device_name=NVIDIA_A100-SXM4-80GB.json → triton_3_1_0/E=8,N=1792,device_name=NVIDIA_A100-SXM4-80GB.json} +0 -0
  251. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=8,N=1792,device_name=NVIDIA_H100_80GB_HBM3.json → triton_3_1_0/E=8,N=1792,device_name=NVIDIA_H100_80GB_HBM3.json} +0 -0
  252. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=8,N=1792,device_name=NVIDIA_H200,dtype=fp8_w8a8.json → triton_3_1_0/E=8,N=1792,device_name=NVIDIA_H200,dtype=fp8_w8a8.json} +0 -0
  253. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=8,N=1792,device_name=NVIDIA_H200.json → triton_3_1_0/E=8,N=1792,device_name=NVIDIA_H200.json} +0 -0
  254. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=8,N=2048,device_name=NVIDIA_A100-SXM4-80GB.json → triton_3_1_0/E=8,N=2048,device_name=NVIDIA_A100-SXM4-80GB.json} +0 -0
  255. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=8,N=2048,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8.json → triton_3_1_0/E=8,N=2048,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8.json} +0 -0
  256. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=8,N=2048,device_name=NVIDIA_H100_80GB_HBM3.json → triton_3_1_0/E=8,N=2048,device_name=NVIDIA_H100_80GB_HBM3.json} +0 -0
  257. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=8,N=2048,device_name=NVIDIA_H200,dtype=fp8_w8a8.json → triton_3_1_0/E=8,N=2048,device_name=NVIDIA_H200,dtype=fp8_w8a8.json} +0 -0
  258. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=8,N=2048,device_name=NVIDIA_H200.json → triton_3_1_0/E=8,N=2048,device_name=NVIDIA_H200.json} +0 -0
  259. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=8,N=3584,device_name=AMD_Instinct_MI300X.json → triton_3_1_0/E=8,N=3584,device_name=AMD_Instinct_MI300X.json} +0 -0
  260. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=8,N=3584,device_name=AMD_Instinct_MI325X.json → triton_3_1_0/E=8,N=3584,device_name=AMD_Instinct_MI325X.json} +0 -0
  261. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=8,N=3584,device_name=AMD_Radeon_Graphics.json → triton_3_1_0/E=8,N=3584,device_name=AMD_Radeon_Graphics.json} +0 -0
  262. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=8,N=3584,device_name=NVIDIA_A100-SXM4-40GB.json → triton_3_1_0/E=8,N=3584,device_name=NVIDIA_A100-SXM4-40GB.json} +0 -0
  263. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=8,N=3584,device_name=NVIDIA_A100-SXM4-80GB.json → triton_3_1_0/E=8,N=3584,device_name=NVIDIA_A100-SXM4-80GB.json} +0 -0
  264. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=8,N=3584,device_name=NVIDIA_GeForce_RTX_4090,dtype=fp8_w8a8.json → triton_3_1_0/E=8,N=3584,device_name=NVIDIA_GeForce_RTX_4090,dtype=fp8_w8a8.json} +0 -0
  265. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=8,N=3584,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8.json → triton_3_1_0/E=8,N=3584,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8.json} +0 -0
  266. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=8,N=3584,device_name=NVIDIA_H100_80GB_HBM3.json → triton_3_1_0/E=8,N=3584,device_name=NVIDIA_H100_80GB_HBM3.json} +0 -0
  267. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=8,N=3584,device_name=NVIDIA_H200,dtype=fp8_w8a8.json → triton_3_1_0/E=8,N=3584,device_name=NVIDIA_H200,dtype=fp8_w8a8.json} +0 -0
  268. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=8,N=3584,device_name=NVIDIA_H200.json → triton_3_1_0/E=8,N=3584,device_name=NVIDIA_H200.json} +0 -0
  269. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=8,N=3584,device_name=NVIDIA_L40S.json → triton_3_1_0/E=8,N=3584,device_name=NVIDIA_L40S.json} +0 -0
  270. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=8,N=4096,device_name=AMD_Instinct_MI300X,dtype=fp8_w8a8.json → triton_3_1_0/E=8,N=4096,device_name=AMD_Instinct_MI300X,dtype=fp8_w8a8.json} +0 -0
  271. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=8,N=4096,device_name=AMD_Instinct_MI325X,dtype=fp8_w8a8.json → triton_3_1_0/E=8,N=4096,device_name=AMD_Instinct_MI325X,dtype=fp8_w8a8.json} +0 -0
  272. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=8,N=4096,device_name=AMD_Radeon_Graphics,dtype=fp8_w8a8.json → triton_3_1_0/E=8,N=4096,device_name=AMD_Radeon_Graphics,dtype=fp8_w8a8.json} +0 -0
  273. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=8,N=4096,device_name=NVIDIA_A100-SXM4-80GB.json → triton_3_1_0/E=8,N=4096,device_name=NVIDIA_A100-SXM4-80GB.json} +0 -0
  274. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=8,N=4096,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8.json → triton_3_1_0/E=8,N=4096,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8.json} +0 -0
  275. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=8,N=4096,device_name=NVIDIA_H100_80GB_HBM3.json → triton_3_1_0/E=8,N=4096,device_name=NVIDIA_H100_80GB_HBM3.json} +0 -0
  276. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=8,N=4096,device_name=NVIDIA_H200,dtype=fp8_w8a8.json → triton_3_1_0/E=8,N=4096,device_name=NVIDIA_H200,dtype=fp8_w8a8.json} +0 -0
  277. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=8,N=4096,device_name=NVIDIA_H200.json → triton_3_1_0/E=8,N=4096,device_name=NVIDIA_H200.json} +0 -0
  278. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=8,N=7168,device_name=AMD_Instinct_MI300X.json → triton_3_1_0/E=8,N=7168,device_name=AMD_Instinct_MI300X.json} +0 -0
  279. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=8,N=7168,device_name=AMD_Instinct_MI325X.json → triton_3_1_0/E=8,N=7168,device_name=AMD_Instinct_MI325X.json} +0 -0
  280. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=8,N=7168,device_name=AMD_Radeon_Graphics.json → triton_3_1_0/E=8,N=7168,device_name=AMD_Radeon_Graphics.json} +0 -0
  281. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=8,N=7168,device_name=NVIDIA_A100-SXM4-80GB.json → triton_3_1_0/E=8,N=7168,device_name=NVIDIA_A100-SXM4-80GB.json} +0 -0
  282. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=8,N=7168,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8.json → triton_3_1_0/E=8,N=7168,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8.json} +0 -0
  283. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=8,N=7168,device_name=NVIDIA_H100_80GB_HBM3.json → triton_3_1_0/E=8,N=7168,device_name=NVIDIA_H100_80GB_HBM3.json} +0 -0
  284. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=8,N=7168,device_name=NVIDIA_H200,dtype=fp8_w8a8.json → triton_3_1_0/E=8,N=7168,device_name=NVIDIA_H200,dtype=fp8_w8a8.json} +0 -0
  285. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=8,N=7168,device_name=NVIDIA_H200.json → triton_3_1_0/E=8,N=7168,device_name=NVIDIA_H200.json} +0 -0
  286. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=8,N=8192,device_name=AMD_Instinct_MI300X,dtype=fp8_w8a8.json → triton_3_1_0/E=8,N=8192,device_name=AMD_Instinct_MI300X,dtype=fp8_w8a8.json} +0 -0
  287. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=8,N=8192,device_name=AMD_Instinct_MI325X,dtype=fp8_w8a8.json → triton_3_1_0/E=8,N=8192,device_name=AMD_Instinct_MI325X,dtype=fp8_w8a8.json} +0 -0
  288. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=8,N=8192,device_name=AMD_Radeon_Graphics,dtype=fp8_w8a8.json → triton_3_1_0/E=8,N=8192,device_name=AMD_Radeon_Graphics,dtype=fp8_w8a8.json} +0 -0
  289. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=8,N=8192,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8.json → triton_3_1_0/E=8,N=8192,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8.json} +0 -0
  290. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=8,N=8192,device_name=NVIDIA_H200,dtype=fp8_w8a8.json → triton_3_1_0/E=8,N=8192,device_name=NVIDIA_H200,dtype=fp8_w8a8.json} +0 -0
  291. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=128,N=192,device_name=NVIDIA_A800-SXM4-80GB.json → triton_3_2_0/E=128,N=192,device_name=NVIDIA_A800-SXM4-80GB.json} +0 -0
  292. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=128,N=192,device_name=NVIDIA_H100_80GB_HBM3.json → triton_3_2_0/E=128,N=192,device_name=NVIDIA_H100_80GB_HBM3.json} +0 -0
  293. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=128,N=192,device_name=NVIDIA_H20.json → triton_3_2_0/E=128,N=192,device_name=NVIDIA_H20.json} +0 -0
  294. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=128,N=192,device_name=NVIDIA_H200.json → triton_3_2_0/E=128,N=192,device_name=NVIDIA_H200.json} +0 -0
  295. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=128,N=384,device_name=NVIDIA_H100_80GB_HBM3.json → triton_3_2_0/E=128,N=384,device_name=NVIDIA_H100_80GB_HBM3.json} +0 -0
  296. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=128,N=384,device_name=NVIDIA_H20,dtype=fp8_w8a8,block_shape=[128, 128].json → triton_3_2_0/E=128,N=384,device_name=NVIDIA_H20,dtype=fp8_w8a8,block_shape=[128, 128].json} +0 -0
  297. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=128,N=384,device_name=NVIDIA_H20.json → triton_3_2_0/E=128,N=384,device_name=NVIDIA_H20.json} +0 -0
  298. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=128,N=384,device_name=NVIDIA_H200,dtype=fp8_w8a8,block_shape=[128, 128].json → triton_3_2_0/E=128,N=384,device_name=NVIDIA_H200,dtype=fp8_w8a8,block_shape=[128, 128].json} +0 -0
  299. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=128,N=384,device_name=NVIDIA_H200.json → triton_3_2_0/E=128,N=384,device_name=NVIDIA_H200.json} +0 -0
  300. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=128,N=512,device_name=NVIDIA_H100_80GB_HBM3.json → triton_3_2_0/E=128,N=512,device_name=NVIDIA_H100_80GB_HBM3.json} +0 -0
  301. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=128,N=768,device_name=NVIDIA_A800-SXM4-80GB.json → triton_3_2_0/E=128,N=768,device_name=NVIDIA_A800-SXM4-80GB.json} +0 -0
  302. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=128,N=768,device_name=NVIDIA_H100_80GB_HBM3.json → triton_3_2_0/E=128,N=768,device_name=NVIDIA_H100_80GB_HBM3.json} +0 -0
  303. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=128,N=768,device_name=NVIDIA_H20,dtype=fp8_w8a8,block_shape=[128, 128].json → triton_3_2_0/E=128,N=768,device_name=NVIDIA_H20,dtype=fp8_w8a8,block_shape=[128, 128].json} +0 -0
  304. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=128,N=768,device_name=NVIDIA_H20.json → triton_3_2_0/E=128,N=768,device_name=NVIDIA_H20.json} +0 -0
  305. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=128,N=768,device_name=NVIDIA_H200,dtype=fp8_w8a8,block_shape=[128, 128].json → triton_3_2_0/E=128,N=768,device_name=NVIDIA_H200,dtype=fp8_w8a8,block_shape=[128, 128].json} +0 -0
  306. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=128,N=768,device_name=NVIDIA_H200.json → triton_3_2_0/E=128,N=768,device_name=NVIDIA_H200.json} +0 -0
  307. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=128,N=96,device_name=NVIDIA_H20.json → triton_3_2_0/E=128,N=96,device_name=NVIDIA_H20.json} +0 -0
  308. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=264,N=128,device_name=NVIDIA_A800-SXM4-80GB,dtype=int8_w8a8.json → triton_3_2_0/E=264,N=128,device_name=NVIDIA_A800-SXM4-80GB,dtype=int8_w8a8.json} +0 -0
  309. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=264,N=256,device_name=NVIDIA_B200,dtype=fp8_w8a8,block_shape=[128, 128].json → triton_3_2_0/E=264,N=256,device_name=NVIDIA_B200,dtype=fp8_w8a8,block_shape=[128, 128].json} +0 -0
  310. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=264,N=256,device_name=NVIDIA_H20,dtype=fp8_w8a8,block_shape=[128, 128].json → triton_3_2_0/E=264,N=256,device_name=NVIDIA_H20,dtype=fp8_w8a8,block_shape=[128, 128].json} +0 -0
  311. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=264,N=256,device_name=NVIDIA_H200,dtype=fp8_w8a8,block_shape=[128, 128].json → triton_3_2_0/E=264,N=256,device_name=NVIDIA_H200,dtype=fp8_w8a8,block_shape=[128, 128].json} +0 -0
  312. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=272,N=128,device_name=NVIDIA_A800-SXM4-80GB,dtype=int8_w8a8.json → triton_3_2_0/E=272,N=128,device_name=NVIDIA_A800-SXM4-80GB,dtype=int8_w8a8.json} +0 -0
  313. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=272,N=128,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8,block_shape=[128, 128].json → triton_3_2_0/E=272,N=128,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8,block_shape=[128, 128].json} +0 -0
  314. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=272,N=128,device_name=NVIDIA_H20,dtype=fp8_w8a8,block_shape=[128, 128].json → triton_3_2_0/E=272,N=128,device_name=NVIDIA_H20,dtype=fp8_w8a8,block_shape=[128, 128].json} +0 -0
  315. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=272,N=64,device_name=NVIDIA_A800-SXM4-80GB.json → triton_3_2_0/E=272,N=64,device_name=NVIDIA_A800-SXM4-80GB.json} +0 -0
  316. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=288,N=64,device_name=NVIDIA_A800-SXM4-80GB.json → triton_3_2_0/E=288,N=64,device_name=NVIDIA_A800-SXM4-80GB.json} +0 -0
  317. {sglang-0.4.6.post5.dist-info → sglang-0.4.7.dist-info}/licenses/LICENSE +0 -0
  318. {sglang-0.4.6.post5.dist-info → sglang-0.4.7.dist-info}/top_level.txt +0 -0
@@ -9,6 +9,8 @@ import queue
9
9
  import socket
10
10
  import struct
11
11
  import threading
12
+ import time
13
+ from collections import defaultdict
12
14
  from functools import cache
13
15
  from typing import Dict, List, Optional, Tuple, Union
14
16
 
@@ -27,28 +29,30 @@ from sglang.srt.disaggregation.base.conn import (
27
29
  KVPoll,
28
30
  )
29
31
  from sglang.srt.disaggregation.mooncake.transfer_engine import MooncakeTransferEngine
30
- from sglang.srt.disaggregation.utils import DisaggregationMode
32
+ from sglang.srt.disaggregation.utils import (
33
+ DisaggregationMode,
34
+ FastQueue,
35
+ group_concurrent_contiguous,
36
+ )
31
37
  from sglang.srt.server_args import ServerArgs
32
- from sglang.srt.utils import get_free_port, get_ip, get_local_ip_by_remote
38
+ from sglang.srt.utils import (
39
+ get_free_port,
40
+ get_int_env_var,
41
+ get_ip,
42
+ get_local_ip_by_remote,
43
+ )
33
44
 
34
45
  logger = logging.getLogger(__name__)
35
46
 
36
47
 
37
- def group_concurrent_contiguous(
38
- src_indices: npt.NDArray[np.int64], dst_indices: npt.NDArray[np.int64]
39
- ) -> Tuple[List[npt.NDArray[np.int64]], List[npt.NDArray[np.int64]]]:
40
- """Vectorised NumPy implementation."""
41
- if src_indices.size == 0:
42
- return [], []
43
-
44
- brk = np.where((np.diff(src_indices) != 1) | (np.diff(dst_indices) != 1))[0] + 1
45
- src_groups = np.split(src_indices, brk)
46
- dst_groups = np.split(dst_indices, brk)
47
-
48
- src_groups = [g.tolist() for g in src_groups]
49
- dst_groups = [g.tolist() for g in dst_groups]
48
+ class KVTransferError(Exception):
49
+ def __init__(self, bootstrap_room: int, failure_reason: str):
50
+ super().__init__(failure_reason)
51
+ self.bootstrap_room = bootstrap_room
52
+ self.failure_reason = failure_reason
50
53
 
51
- return src_groups, dst_groups
54
+ def __str__(self):
55
+ return f"KVTransferError(bootstrap_room={self.bootstrap_room}): {self.failure_reason}"
52
56
 
53
57
 
54
58
  # prefill
@@ -148,18 +152,55 @@ class MooncakeKVManager(BaseKVManager):
148
152
  self.server_socket = zmq.Context().socket(zmq.PULL)
149
153
  self.register_buffer_to_engine()
150
154
  if self.disaggregation_mode == DisaggregationMode.PREFILL:
151
- self.transfer_queue = queue.Queue()
152
155
  self.transfer_infos: Dict[int, Dict[str, TransferInfo]] = {}
153
156
  self.decode_kv_args_table: Dict[str, KVArgsRegisterInfo] = {}
154
157
  self.start_prefill_thread()
155
158
  self._register_to_bootstrap()
156
-
159
+ self.session_failures = defaultdict(int)
160
+ self.failed_sessions = set()
161
+ self.session_lock = threading.Lock()
157
162
  # Determine the number of threads to use for kv sender
158
163
  cpu_count = os.cpu_count()
159
- self.executor = concurrent.futures.ThreadPoolExecutor(
160
- min(cpu_count // 4, 16)
164
+ transfer_thread_pool_size = get_int_env_var(
165
+ "SGLANG_DISAGGREGATION_THREAD_POOL_SIZE",
166
+ min(max(4, int(0.75 * cpu_count) // 8), 12),
167
+ )
168
+ transfer_queue_size = get_int_env_var("SGLANG_DISAGGREGATION_QUEUE_SIZE", 4)
169
+ self.transfer_queues: List[FastQueue] = [
170
+ FastQueue() for _ in range(transfer_queue_size)
171
+ ]
172
+ assert transfer_thread_pool_size >= transfer_queue_size, (
173
+ f"The environment variable SGLANG_DISAGGREGATION_THREAD_POOL_SIZE={transfer_thread_pool_size} must be "
174
+ f"greater than or equal to SGLANG_DISAGGREGATION_QUEUE_SIZE={transfer_queue_size}."
175
+ )
176
+ self.executors = [
177
+ concurrent.futures.ThreadPoolExecutor(
178
+ transfer_thread_pool_size // transfer_queue_size
179
+ )
180
+ for _ in range(transfer_queue_size)
181
+ ]
182
+ for queue, executor in zip(self.transfer_queues, self.executors):
183
+ threading.Thread(
184
+ target=self.transfer_worker, args=(queue, executor), daemon=True
185
+ ).start()
186
+
187
+ self.bootstrap_time_out = get_int_env_var(
188
+ "SGLANG_DISAGGREGATION_BOOTSTRAP_TIMEOUT", 30
161
189
  )
162
190
  elif self.disaggregation_mode == DisaggregationMode.DECODE:
191
+ self.heartbeat_failures = {}
192
+ self.session_pool = defaultdict(requests.Session)
193
+ self.session_pool_lock = threading.Lock()
194
+ self.addr_to_rooms_tracker = defaultdict(set)
195
+ self.connection_lock = threading.Lock()
196
+ # Heartbeat interval should be at least 2 seconds
197
+ self.heartbeat_interval = max(
198
+ float(os.getenv("SGLANG_DISAGGREGATION_HEARTBEAT_INTERVAL", 5.0)), 2.0
199
+ )
200
+ # Heartbeat failure should be at least 1
201
+ self.max_failures = max(
202
+ get_int_env_var("SGLANG_DISAGGREGATION_HEARTBEAT_MAX_FAILURE", 2), 1
203
+ )
163
204
  self.start_decode_thread()
164
205
  self.connection_pool: Dict[str, Dict[str, Union[str, int]]] = {}
165
206
  self.prefill_tp_size_table: Dict[str, int] = {}
@@ -169,6 +210,9 @@ class MooncakeKVManager(BaseKVManager):
169
210
  f"Unsupported DisaggregationMode: {self.disaggregation_mode}"
170
211
  )
171
212
 
213
+ self.failure_records: Dict[int, str] = {}
214
+ self.failure_lock = threading.Lock()
215
+
172
216
  def register_buffer_to_engine(self):
173
217
  for kv_data_ptr, kv_data_len in zip(
174
218
  self.kv_args.kv_data_ptrs, self.kv_args.kv_data_lens
@@ -192,6 +236,7 @@ class MooncakeKVManager(BaseKVManager):
192
236
  prefill_kv_indices: npt.NDArray[np.int64],
193
237
  dst_kv_ptrs: list[int],
194
238
  dst_kv_indices: npt.NDArray[np.int64],
239
+ executor: concurrent.futures.ThreadPoolExecutor,
195
240
  ):
196
241
  # Group by indices
197
242
  prefill_kv_blocks, dst_kv_blocks = group_concurrent_contiguous(
@@ -223,7 +268,7 @@ class MooncakeKVManager(BaseKVManager):
223
268
  return 0
224
269
 
225
270
  futures = [
226
- self.executor.submit(
271
+ executor.submit(
227
272
  process_layer,
228
273
  src_ptr,
229
274
  dst_ptr,
@@ -235,8 +280,6 @@ class MooncakeKVManager(BaseKVManager):
235
280
  for future in concurrent.futures.as_completed(futures):
236
281
  status = future.result()
237
282
  if status != 0:
238
- # Immediate shutdown on first error (existing tasks will finish)
239
- self.executor.shutdown(wait=False)
240
283
  for f in futures:
241
284
  f.cancel()
242
285
  return status
@@ -255,23 +298,138 @@ class MooncakeKVManager(BaseKVManager):
255
298
  self.kv_args.aux_data_ptrs[0] + prefill_aux_index * aux_item_len
256
299
  )
257
300
  decode_aux_addr = dst_aux_ptrs[0] + dst_aux_index * aux_item_len
258
- # TODO: mooncake transfer engine can do async transfer. Do async later
259
- # Not sure about the amount of aux data, maybe transfer it by zmq is more effective
260
301
  status = self.engine.transfer_sync(
261
302
  mooncake_session_id, prefill_aux_addr, decode_aux_addr, aux_item_len
262
303
  )
263
304
  return status
264
305
 
265
- def sync_status_to_decode_endpoint(self, remote: str, dst_port: int, room: int):
306
+ def sync_status_to_decode_endpoint(
307
+ self, remote: str, dst_port: int, room: int, status: int
308
+ ):
266
309
  if ":" in remote:
267
310
  remote = remote.split(":")[0]
268
311
  self._connect("tcp://" + remote + ":" + str(dst_port)).send_multipart(
269
312
  [
270
313
  str(room).encode("ascii"),
271
- str(self.check_status(room)).encode("ascii"),
314
+ str(status).encode("ascii"),
272
315
  ]
273
316
  )
274
317
 
318
+ def transfer_worker(
319
+ self, queue: FastQueue, executor: concurrent.futures.ThreadPoolExecutor
320
+ ):
321
+ while True:
322
+ try:
323
+ kv_chunk: TransferKVChunk = queue.get()
324
+ reqs_to_be_processed = (
325
+ self.transfer_infos[kv_chunk.room].values()
326
+ if kv_chunk.room in self.transfer_infos
327
+ else []
328
+ )
329
+ polls = []
330
+ dst_ranks_infos = []
331
+ for req in reqs_to_be_processed:
332
+ if not req.is_dummy:
333
+ # Early exit if the request has failed
334
+ with self.session_lock:
335
+ if req.mooncake_session_id in self.failed_sessions:
336
+ self.record_failure(
337
+ kv_chunk.room,
338
+ f"Decode instance could be dead, remote mooncake session {req.mooncake_session_id} is not alive",
339
+ )
340
+ self.update_status(kv_chunk.room, KVPoll.Failed)
341
+ self.sync_status_to_decode_endpoint(
342
+ req.endpoint,
343
+ req.dst_port,
344
+ req.room,
345
+ KVPoll.Failed,
346
+ )
347
+ break
348
+
349
+ chunked_dst_kv_indice = req.dst_kv_indices[kv_chunk.index_slice]
350
+
351
+ # NOTE: This is temporarily a workaround to deal with the case where the prefill_kv_indices
352
+ # is mismatched with the dst_kv_indices when page size > 1, this should never happen.
353
+ if len(chunked_dst_kv_indice) < len(
354
+ kv_chunk.prefill_kv_indices
355
+ ):
356
+ kv_chunk.prefill_kv_indices = kv_chunk.prefill_kv_indices[
357
+ : len(chunked_dst_kv_indice)
358
+ ]
359
+ logger.warning(
360
+ f"len(chunked_dst_kv_indice) = {len(chunked_dst_kv_indice)}, len(kv_chunk.prefill_kv_indices) = {len(kv_chunk.prefill_kv_indices)}"
361
+ )
362
+
363
+ ret = self.send_kvcache(
364
+ req.mooncake_session_id,
365
+ kv_chunk.prefill_kv_indices,
366
+ self.decode_kv_args_table[
367
+ req.mooncake_session_id
368
+ ].dst_kv_ptrs,
369
+ chunked_dst_kv_indice,
370
+ executor,
371
+ )
372
+ if ret != 0:
373
+ with self.session_lock:
374
+ self.session_failures[req.mooncake_session_id] += 1
375
+ # Failures should never happen if the session is not dead, if the session fails once, mark it as failed
376
+ if self.session_failures[req.mooncake_session_id] >= 1:
377
+ self.failed_sessions.add(req.mooncake_session_id)
378
+ logger.error(
379
+ f"Session {req.mooncake_session_id} failed."
380
+ )
381
+ self.record_failure(
382
+ kv_chunk.room,
383
+ f"Failed to send kv chunk of {kv_chunk.room} to {req.endpoint}:{req.dst_port}",
384
+ )
385
+ self.update_status(kv_chunk.room, KVPoll.Failed)
386
+ self.sync_status_to_decode_endpoint(
387
+ req.endpoint, req.dst_port, req.room, KVPoll.Failed
388
+ )
389
+ break
390
+
391
+ if kv_chunk.is_last:
392
+ # Only the last chunk we need to send the aux data
393
+ ret = self.send_aux(
394
+ req.mooncake_session_id,
395
+ kv_chunk.prefill_aux_index,
396
+ self.decode_kv_args_table[
397
+ req.mooncake_session_id
398
+ ].dst_aux_ptrs,
399
+ req.dst_aux_index,
400
+ )
401
+ polls.append(True if ret == 0 else False)
402
+ dst_ranks_infos.append(
403
+ (req.endpoint, req.dst_port, req.room)
404
+ )
405
+
406
+ # Only sync status when all the dst ranks have received the kvcache
407
+ if len(polls) == req.required_dst_info_num:
408
+ status = KVPoll.Success if all(polls) else KVPoll.Failed
409
+ self.update_status(req.room, status)
410
+ for endpoint, dst_port, room in dst_ranks_infos:
411
+ self.sync_status_to_decode_endpoint(
412
+ endpoint, dst_port, room, status
413
+ )
414
+ else:
415
+ # Dummy request means the decode instance is not used, so its status can be marked as success directly
416
+ # Dummy request does not need to sync status to decode endpoint
417
+ if kv_chunk.is_last and req.room in self.request_status:
418
+ self.update_status(req.room, KVPoll.Success)
419
+
420
+ if (
421
+ kv_chunk.room not in self.request_status
422
+ or self.check_status(kv_chunk.room) == KVPoll.Success
423
+ ):
424
+ if kv_chunk.room in self.transfer_infos:
425
+ self.transfer_infos.pop(kv_chunk.room)
426
+
427
+ except Exception as e:
428
+ # NOTE(shangming): Remove this when we make sure the transfer thread is bug-free
429
+ raise RuntimeError(
430
+ f"Transfer thread failed because of {e}. Prefill instance with bootstrap_port={self.bootstrap_port} is dead."
431
+ )
432
+
275
433
  def start_prefill_thread(self):
276
434
  self.rank_port = get_free_port()
277
435
  self.server_socket.bind(f"tcp://{get_local_ip_by_remote()}:{self.rank_port}")
@@ -287,6 +445,11 @@ class MooncakeKVManager(BaseKVManager):
287
445
  self.decode_kv_args_table[mooncake_session_id] = (
288
446
  KVArgsRegisterInfo.from_zmq(waiting_req_bytes)
289
447
  )
448
+ with self.session_lock:
449
+ if mooncake_session_id in self.failed_sessions:
450
+ self.failed_sessions.remove(mooncake_session_id)
451
+ if mooncake_session_id in self.session_failures:
452
+ del self.session_failures[mooncake_session_id]
290
453
  logger.debug(
291
454
  f"Register KVArgs from {mooncake_session_id} successfully"
292
455
  )
@@ -304,77 +467,7 @@ class MooncakeKVManager(BaseKVManager):
304
467
  if len(self.transfer_infos[room]) == required_dst_info_num:
305
468
  self.update_status(room, KVPoll.WaitingForInput)
306
469
 
307
- def transfer_thread():
308
- # TODO: Shall we use KVPoll.Transferring state?
309
- while True:
310
- try:
311
- kv_chunk: TransferKVChunk = self.transfer_queue.get(timeout=0.01)
312
- reqs_to_be_processed = self.transfer_infos[kv_chunk.room].values()
313
- polls = []
314
- dst_ranks_infos = []
315
- for req in reqs_to_be_processed:
316
- if not req.is_dummy:
317
- chunked_dst_kv_indice = req.dst_kv_indices[
318
- kv_chunk.index_slice
319
- ]
320
- assert len(chunked_dst_kv_indice) == len(
321
- kv_chunk.prefill_kv_indices
322
- ), f"len(chunked_dst_kv_indice) = {len(chunked_dst_kv_indice)}, len(kv_chunk.prefill_kv_indices) = {len(kv_chunk.prefill_kv_indices)}"
323
-
324
- ret = self.send_kvcache(
325
- req.mooncake_session_id,
326
- kv_chunk.prefill_kv_indices,
327
- self.decode_kv_args_table[
328
- req.mooncake_session_id
329
- ].dst_kv_ptrs,
330
- chunked_dst_kv_indice,
331
- )
332
- if ret != 0:
333
- self.update_status(kv_chunk.room, KVPoll.Failed)
334
- self.sync_status_to_decode_endpoint(
335
- req.endpoint, req.dst_port, req.room
336
- )
337
- continue
338
-
339
- if kv_chunk.is_last:
340
- # Only the last chunk we need to send the aux data
341
- ret = self.send_aux(
342
- req.mooncake_session_id,
343
- kv_chunk.prefill_aux_index,
344
- self.decode_kv_args_table[
345
- req.mooncake_session_id
346
- ].dst_aux_ptrs,
347
- req.dst_aux_index,
348
- )
349
- polls.append(True if ret == 0 else False)
350
- dst_ranks_infos.append(
351
- (req.endpoint, req.dst_port, req.room)
352
- )
353
-
354
- # Only sync status when all the dst ranks have received the kvcache
355
- if len(polls) == req.required_dst_info_num:
356
- self.update_status(
357
- req.room,
358
- KVPoll.Success if all(polls) else KVPoll.Failed,
359
- )
360
- for endpoint, dst_port, room in dst_ranks_infos:
361
- self.sync_status_to_decode_endpoint(
362
- endpoint, dst_port, room
363
- )
364
- else:
365
- # Dummy request means the decode instance is not used, so its status can be marked as success directly
366
- # Dummy request does not need to sync status to decode endpoint
367
- if kv_chunk.is_last:
368
- self.update_status(req.room, KVPoll.Success)
369
-
370
- if self.check_status(kv_chunk.room) == KVPoll.Success:
371
- self.transfer_infos.pop(kv_chunk.room)
372
-
373
- except queue.Empty:
374
- continue
375
-
376
470
  threading.Thread(target=bootstrap_thread).start()
377
- threading.Thread(target=transfer_thread).start()
378
471
 
379
472
  def start_decode_thread(self):
380
473
  self.rank_port = get_free_port()
@@ -385,9 +478,69 @@ class MooncakeKVManager(BaseKVManager):
385
478
  (bootstrap_room, status) = self.server_socket.recv_multipart()
386
479
  status = int(status.decode("ascii"))
387
480
  bootstrap_room = int(bootstrap_room.decode("ascii"))
481
+ if status == KVPoll.Failed:
482
+ self.record_failure(
483
+ bootstrap_room,
484
+ f"Failed to get kvcache from prefill instance, it might be dead",
485
+ )
388
486
  self.update_status(bootstrap_room, status)
389
487
 
488
+ def heartbeat_checker():
489
+ while True:
490
+ time.sleep(self.heartbeat_interval)
491
+ with self.connection_lock:
492
+ addresses = list(self.prefill_dp_size_table.keys())
493
+
494
+ for bootstrap_addr in addresses:
495
+ session = None
496
+ try:
497
+ with self.session_pool_lock:
498
+ session = self.session_pool[bootstrap_addr]
499
+ response = session.get(
500
+ f"http://{bootstrap_addr}/health",
501
+ timeout=(2, 3),
502
+ headers={"Connection": "keep-alive"},
503
+ )
504
+ if response.status_code == 200:
505
+ self.heartbeat_failures[bootstrap_addr] = 0
506
+
507
+ current_rooms = self.addr_to_rooms_tracker[
508
+ bootstrap_addr
509
+ ].copy()
510
+
511
+ for bootstrap_room in current_rooms:
512
+ # Remove KVPoll.Success requests from the tracker
513
+ if bootstrap_room not in self.request_status:
514
+ self.addr_to_rooms_tracker[bootstrap_addr].discard(
515
+ bootstrap_room
516
+ )
517
+ else:
518
+ logger.info(
519
+ f"Attempting to reconnect to {bootstrap_addr}..."
520
+ )
521
+ self.heartbeat_failures[bootstrap_addr] = (
522
+ self.heartbeat_failures.get(bootstrap_addr, 0) + 1
523
+ )
524
+ with self.session_pool_lock:
525
+ if bootstrap_addr in self.session_pool:
526
+ del self.session_pool[bootstrap_addr]
527
+ except Exception:
528
+ logger.info(f"Attempting to reconnect to {bootstrap_addr}...")
529
+ self.heartbeat_failures[bootstrap_addr] = (
530
+ self.heartbeat_failures.get(bootstrap_addr, 0) + 1
531
+ )
532
+
533
+ if (
534
+ self.heartbeat_failures.get(bootstrap_addr, 0)
535
+ >= self.max_failures
536
+ ):
537
+ self._handle_node_failure(bootstrap_addr)
538
+ with self.session_pool_lock:
539
+ if bootstrap_addr in self.session_pool:
540
+ del self.session_pool[bootstrap_addr]
541
+
390
542
  threading.Thread(target=decode_thread).start()
543
+ threading.Thread(target=heartbeat_checker).start()
391
544
 
392
545
  def add_transfer_request(
393
546
  self,
@@ -400,7 +553,29 @@ class MooncakeKVManager(BaseKVManager):
400
553
  assert self.disaggregation_mode == DisaggregationMode.PREFILL
401
554
  assert not is_last or (is_last and aux_index is not None)
402
555
 
403
- self.transfer_queue.put(
556
+ if (
557
+ bootstrap_room not in self.request_status
558
+ or self.check_status(bootstrap_room) == KVPoll.Failed
559
+ ):
560
+ logger.debug(
561
+ "Request with bootstrap_room=%s already failed", bootstrap_room
562
+ )
563
+ return
564
+
565
+ if bootstrap_room not in self.transfer_infos:
566
+ # This means that the current rank is a dummy rank for this request,
567
+ # and it has already been marked as success, so there is no need to
568
+ # add further chunks into the transfer queue.
569
+ return
570
+
571
+ # NOTE(shangming): sharding according to the dst_infos to make sure
572
+ # requests with the same dst_sessions will be added into the same
573
+ # queue, which enables early abort with failed sessions.
574
+ dst_infos = self.transfer_infos[bootstrap_room].keys()
575
+ session_port_sum = sum(int(session.split(":")[1]) for session in dst_infos)
576
+ shard_idx = session_port_sum % len(self.transfer_queues)
577
+
578
+ self.transfer_queues[shard_idx].put(
404
579
  TransferKVChunk(
405
580
  room=bootstrap_room,
406
581
  prefill_kv_indices=kv_indices,
@@ -409,7 +584,6 @@ class MooncakeKVManager(BaseKVManager):
409
584
  prefill_aux_index=aux_index,
410
585
  )
411
586
  )
412
- self.update_status(bootstrap_room, KVPoll.WaitingForInput)
413
587
 
414
588
  def check_status(self, bootstrap_room: int):
415
589
  return self.request_status[bootstrap_room]
@@ -418,10 +592,17 @@ class MooncakeKVManager(BaseKVManager):
418
592
  if bootstrap_room not in self.request_status:
419
593
  self.request_status[bootstrap_room] = status
420
594
  else:
421
- # NOTE: The prefill engine could recv bootstrapping first
422
- self.request_status[bootstrap_room] = max(
423
- self.request_status[bootstrap_room], status
424
- )
595
+ # NOTE: status is only allowed to be incremented unless it is KVPoll.Failed
596
+ if status == KVPoll.Failed:
597
+ self.request_status[bootstrap_room] = KVPoll.Failed
598
+ else:
599
+ self.request_status[bootstrap_room] = max(
600
+ self.request_status[bootstrap_room], status
601
+ )
602
+
603
+ def record_failure(self, bootstrap_room: int, failure_reason: str):
604
+ with self.failure_lock:
605
+ self.failure_records[bootstrap_room] = failure_reason
425
606
 
426
607
  def get_session_id(self):
427
608
  return self.engine.get_session_id()
@@ -445,15 +626,52 @@ class MooncakeKVManager(BaseKVManager):
445
626
  }
446
627
 
447
628
  try:
448
- response = requests.put(url, json=payload)
629
+ response = requests.put(url, json=payload, timeout=5)
449
630
  if response.status_code == 200:
450
631
  logger.debug("Prefill successfully registered to bootstrap server.")
451
632
  else:
452
633
  logger.error(
453
- f"Prefill Failed to connect to bootstrap server: {response.status_code}, {response.text}"
634
+ f"Prefill instance failed to connect to bootstrap server: {response.status_code}, {response.text}"
454
635
  )
455
636
  except Exception as e:
456
- logger.error(f"Prefill Failed to register to bootstrap server: {e}")
637
+ logger.error(
638
+ f"Prefill instance failed to register to bootstrap server: {e}"
639
+ )
640
+
641
+ def _handle_node_failure(self, failed_bootstrap_addr):
642
+ with self.connection_lock:
643
+ keys_to_remove = [
644
+ k for k in self.connection_pool if k.startswith(failed_bootstrap_addr)
645
+ ]
646
+ for k in keys_to_remove:
647
+ del self.connection_pool[k]
648
+ if failed_bootstrap_addr in self.prefill_tp_size_table:
649
+ del self.prefill_tp_size_table[failed_bootstrap_addr]
650
+ if failed_bootstrap_addr in self.prefill_dp_size_table:
651
+ del self.prefill_dp_size_table[failed_bootstrap_addr]
652
+
653
+ possible_affected_rooms = self.addr_to_rooms_tracker.get(
654
+ failed_bootstrap_addr, []
655
+ )
656
+ if failed_bootstrap_addr in self.addr_to_rooms_tracker:
657
+ del self.addr_to_rooms_tracker[failed_bootstrap_addr]
658
+
659
+ # Report the requests associated with the failed bootstrap addr and mark their status as KVPoll.Failed
660
+ affected_rooms = []
661
+ for room in possible_affected_rooms:
662
+ if (
663
+ room in self.request_status
664
+ and self.check_status(room) != KVPoll.Success
665
+ ):
666
+ self.record_failure(
667
+ room,
668
+ f"Losing connection with prefill instance (bootstrap_addr: {failed_bootstrap_addr})",
669
+ )
670
+ self.update_status(room, KVPoll.Failed)
671
+ affected_rooms.append(room)
672
+ logger.error(
673
+ f"Losing connection with prefill instance (bootstrap_addr: {failed_bootstrap_addr}), affected {len(affected_rooms)} requests"
674
+ )
457
675
 
458
676
 
459
677
  class MooncakeKVSender(BaseKVSender):
@@ -466,13 +684,15 @@ class MooncakeKVSender(BaseKVSender):
466
684
  self.kv_mgr.update_status(bootstrap_room, KVPoll.Bootstrapping)
467
685
  self.aux_index = None
468
686
  self.bootstrap_server_url = bootstrap_addr
469
- self.session_id = self.kv_mgr.get_session_id()
687
+ self.conclude_state = None
688
+ self.init_time = None
470
689
  # inner state
471
690
  self.curr_idx = 0
472
691
 
473
692
  def init(self, num_kv_indices: int, aux_index: Optional[int] = None):
474
693
  self.num_kv_indices = num_kv_indices
475
694
  self.aux_index = aux_index
695
+ self.init_time = time.time()
476
696
 
477
697
  def send(
478
698
  self,
@@ -496,11 +716,42 @@ class MooncakeKVSender(BaseKVSender):
496
716
  )
497
717
 
498
718
  def poll(self) -> KVPoll:
499
- return self.kv_mgr.check_status(self.bootstrap_room)
719
+ if self.conclude_state is None:
720
+ status = self.kv_mgr.check_status(self.bootstrap_room)
721
+ if status in (KVPoll.Success, KVPoll.Failed):
722
+ self.conclude_state = status
723
+ elif status == KVPoll.Bootstrapping:
724
+ if self.init_time is not None:
725
+ now = time.time()
726
+ elapsed = now - self.init_time
727
+ if elapsed >= self.kv_mgr.bootstrap_time_out:
728
+ self.kv_mgr.record_failure(
729
+ self.bootstrap_room,
730
+ f"Request {self.bootstrap_room} timed out after {elapsed:.1f}s in KVPoll.Bootstrapping",
731
+ )
732
+ self.conclude_state = KVPoll.Failed
733
+ return KVPoll.Failed
734
+
735
+ return status
736
+ else:
737
+ return self.conclude_state
738
+
739
+ def clear(self) -> None:
740
+ if self.bootstrap_room in self.kv_mgr.request_status:
741
+ self.kv_mgr.request_status.pop(self.bootstrap_room)
500
742
 
501
743
  def failure_exception(self):
502
- # TODO: raise a real exception
503
- raise Exception("Fake KVSender Exception")
744
+ self.clear()
745
+
746
+ # Explicitly set the status to failure since this request has failed in another rank
747
+ if self.conclude_state is None:
748
+ self.conclude_state = KVPoll.Failed
749
+
750
+ with self.kv_mgr.failure_lock:
751
+ failure_reason = self.kv_mgr.failure_records.pop(
752
+ self.bootstrap_room, "Failed due to an unknown reason from another rank"
753
+ )
754
+ raise KVTransferError(self.bootstrap_room, failure_reason)
504
755
 
505
756
 
506
757
  class MooncakeKVReceiver(BaseKVReceiver):
@@ -514,22 +765,31 @@ class MooncakeKVReceiver(BaseKVReceiver):
514
765
  mgr: MooncakeKVManager,
515
766
  bootstrap_addr: str,
516
767
  bootstrap_room: Optional[int] = None,
768
+ data_parallel_rank: Optional[int] = None,
517
769
  ):
518
770
  self.bootstrap_room = bootstrap_room
519
771
  self.bootstrap_addr = bootstrap_addr
520
772
  self.kv_mgr = mgr
521
773
  self.session_id = self.kv_mgr.get_session_id()
522
- self.kv_mgr.update_status(bootstrap_room, KVPoll.Bootstrapping)
774
+ self.kv_mgr.update_status(self.bootstrap_room, KVPoll.Bootstrapping)
775
+ self.conclude_state = None
776
+ self.data_parallel_rank = data_parallel_rank
523
777
 
524
778
  if self.bootstrap_addr not in self.kv_mgr.prefill_dp_size_table:
525
779
  self.prefill_tp_size, self.prefill_dp_size = (
526
- self._get_prefill_dp_size_from_server()
780
+ self._get_prefill_parallel_info_from_server()
527
781
  )
528
782
  if self.prefill_tp_size is None or self.prefill_dp_size is None:
529
- logger.error(
530
- f"Could not fetch prefill parallel info for bootstrap_addr: {self.bootstrap_addr}"
783
+ self.kv_mgr.record_failure(
784
+ self.bootstrap_room,
785
+ f"Could not fetch prefill parallel info from bootstrap_addr: {self.bootstrap_addr}",
531
786
  )
787
+ self.kv_mgr.update_status(self.bootstrap_room, KVPoll.Failed)
788
+ return
532
789
  else:
790
+ logger.debug(
791
+ f"Fetch prefill parallel info from [{self.bootstrap_addr}]: DP size:{self.prefill_dp_size}, TP size:{self.prefill_tp_size}"
792
+ )
533
793
  self.kv_mgr.prefill_tp_size_table[self.bootstrap_addr] = (
534
794
  self.prefill_tp_size
535
795
  )
@@ -587,7 +847,11 @@ class MooncakeKVReceiver(BaseKVReceiver):
587
847
  self.target_tp_rank = self.target_tp_ranks[0]
588
848
  self.required_dst_info_num = 1
589
849
 
590
- self.target_dp_group = bootstrap_room % self.prefill_dp_size
850
+ if self.data_parallel_rank is not None:
851
+ logger.debug(f"Targeting DP rank: {self.data_parallel_rank}")
852
+ self.target_dp_group = self.data_parallel_rank
853
+ else:
854
+ self.target_dp_group = bootstrap_room % self.prefill_dp_size
591
855
 
592
856
  # NOTE: key distinguished by bootstrap_addr, target_dp_group, and target_tp_rank
593
857
  bootstrap_key = (
@@ -607,32 +871,35 @@ class MooncakeKVReceiver(BaseKVReceiver):
607
871
  target_tp_rank == self.target_tp_rank
608
872
  or self.target_tp_rank is None
609
873
  )
874
+ logger.debug(
875
+ f"Fetched bootstrap info: {bootstrap_info} for DP {self.target_dp_group} TP {target_tp_rank}"
876
+ )
610
877
  bootstrap_infos.append(bootstrap_info)
611
878
  else:
612
- logger.error(
613
- f"Could not fetch bootstrap info for engine rank: {self.kv_mgr.kv_args.engine_rank} and target_dp_group: {self.target_dp_group}"
879
+ self.kv_mgr.record_failure(
880
+ self.bootstrap_room,
881
+ f"Could not fetch bootstrap info for engine rank: {self.kv_mgr.kv_args.engine_rank} and target_dp_group: {self.target_dp_group}",
614
882
  )
883
+ self.kv_mgr.update_status(self.bootstrap_room, KVPoll.Failed)
884
+ return
885
+
615
886
  self.bootstrap_infos = bootstrap_infos
887
+ self.kv_mgr.connection_pool[bootstrap_key] = self.bootstrap_infos
616
888
 
617
- if len(self.bootstrap_infos) == 0:
618
- logger.error(
619
- f"Could not fetch bootstrap info for engine rank: {self.kv_mgr.kv_args.engine_rank}"
620
- )
621
- else:
622
- self.kv_mgr.connection_pool[bootstrap_key] = self.bootstrap_infos
623
- # Register kv_args only once to prefill KVManager according to the info fetched from the bootstrap server
624
- self._register_kv_args()
889
+ # Register kv_args only once to prefill KVManager according to the info fetched from the bootstrap server
890
+ self._register_kv_args()
625
891
  else:
626
892
  self.bootstrap_infos = self.kv_mgr.connection_pool[bootstrap_key]
627
893
 
628
894
  assert len(self.bootstrap_infos) > 0
629
- self.kv_mgr.update_status(bootstrap_room, KVPoll.WaitingForInput)
895
+ self.kv_mgr.addr_to_rooms_tracker[self.bootstrap_addr].add(self.bootstrap_room)
896
+ self.kv_mgr.update_status(self.bootstrap_room, KVPoll.WaitingForInput)
630
897
 
631
898
  def _get_bootstrap_info_from_server(self, engine_rank, target_dp_group):
632
899
  """Fetch the bootstrap info from the bootstrap server."""
633
900
  try:
634
901
  url = f"http://{self.bootstrap_addr}/route?engine_rank={engine_rank}&target_dp_group={target_dp_group}"
635
- response = requests.get(url)
902
+ response = requests.get(url, timeout=5)
636
903
  if response.status_code == 200:
637
904
  bootstrap_info = response.json()
638
905
  return bootstrap_info
@@ -645,7 +912,7 @@ class MooncakeKVReceiver(BaseKVReceiver):
645
912
  logger.error(f"Error fetching prefill info from bootstrap: {e}")
646
913
  return None
647
914
 
648
- def _get_prefill_dp_size_from_server(self) -> int:
915
+ def _get_prefill_parallel_info_from_server(self) -> Tuple[int, int]:
649
916
  """Fetch the prefill parallel info from the bootstrap server."""
650
917
  try:
651
918
  url = f"http://{self.bootstrap_addr}/route?engine_rank={-1}&target_dp_group={-1}"
@@ -659,10 +926,10 @@ class MooncakeKVReceiver(BaseKVReceiver):
659
926
  logger.error(
660
927
  f"Failed to get prefill parallel info: {response.status_code}, {response.text}"
661
928
  )
662
- return None
929
+ return None, None
663
930
  except Exception as e:
664
931
  logger.error(f"Error fetching prefill parallel info from bootstrap: {e}")
665
- return None
932
+ return None, None
666
933
 
667
934
  def _register_kv_args(self):
668
935
  for bootstrap_info in self.bootstrap_infos:
@@ -704,9 +971,6 @@ class MooncakeKVReceiver(BaseKVReceiver):
704
971
  self.prefill_server_url = (
705
972
  f"{bootstrap_info['rank_ip']}:{bootstrap_info['rank_port']}"
706
973
  )
707
- logger.debug(
708
- f"Fetched bootstrap info: {bootstrap_info} for engine rank: {self.kv_mgr.kv_args.engine_rank}"
709
- )
710
974
  is_dummy = bootstrap_info["is_dummy"]
711
975
 
712
976
  sock, lock = self._connect("tcp://" + self.prefill_server_url)
@@ -724,11 +988,31 @@ class MooncakeKVReceiver(BaseKVReceiver):
724
988
  )
725
989
 
726
990
  def poll(self) -> KVPoll:
727
- return self.kv_mgr.check_status(self.bootstrap_room)
991
+ if self.conclude_state is None:
992
+ status = self.kv_mgr.check_status(self.bootstrap_room)
993
+ if status in (KVPoll.Success, KVPoll.Failed):
994
+ self.conclude_state = status
995
+
996
+ return status
997
+ else:
998
+ return self.conclude_state
999
+
1000
+ def clear(self) -> None:
1001
+ if self.bootstrap_room in self.kv_mgr.request_status:
1002
+ self.kv_mgr.request_status.pop(self.bootstrap_room)
728
1003
 
729
1004
  def failure_exception(self):
730
- # TODO: raise a real exception
731
- raise Exception("Fake KVReceiver Exception")
1005
+ self.clear()
1006
+
1007
+ # Explicitly set the status to failure since this request has failed in another rank
1008
+ if self.conclude_state is None:
1009
+ self.conclude_state = KVPoll.Failed
1010
+
1011
+ with self.kv_mgr.failure_lock:
1012
+ failure_reason = self.kv_mgr.failure_records.pop(
1013
+ self.bootstrap_room, "Failed due to an unknown reason from another rank"
1014
+ )
1015
+ raise KVTransferError(self.bootstrap_room, failure_reason)
732
1016
 
733
1017
 
734
1018
  class MooncakeKVBootstrapServer(BaseKVBootstrapServer):
@@ -752,6 +1036,10 @@ class MooncakeKVBootstrapServer(BaseKVBootstrapServer):
752
1036
 
753
1037
  def _setup_routes(self):
754
1038
  self.app.router.add_route("*", "/route", self._handle_route)
1039
+ self.app.router.add_get("/health", self._handle_health_check)
1040
+
1041
+ async def _handle_health_check(self, request):
1042
+ return web.Response(text="OK", status=200)
755
1043
 
756
1044
  async def _handle_route(self, request: web.Request):
757
1045
  method = request.method
@@ -780,14 +1068,14 @@ class MooncakeKVBootstrapServer(BaseKVBootstrapServer):
780
1068
  self.dp_size = dp_size
781
1069
 
782
1070
  tp_size_per_dp_rank = tp_size // dp_size
783
- if self.tp_size_per_dp_rank == None:
1071
+ if self.tp_size_per_dp_rank is None:
784
1072
  self.tp_size_per_dp_rank = tp_size_per_dp_rank
785
1073
 
786
- # Add lock to make sure thread-safe
787
1074
  if role == "Prefill":
788
1075
  dp_group = engine_rank // tp_size_per_dp_rank
789
1076
  tp_rank_in_dp_group = engine_rank % tp_size_per_dp_rank
790
1077
 
1078
+ # Add lock to make sure thread-safe
791
1079
  async with self.lock:
792
1080
  if dp_group not in self.prefill_port_table:
793
1081
  self.prefill_port_table[dp_group] = {}
@@ -797,7 +1085,7 @@ class MooncakeKVBootstrapServer(BaseKVBootstrapServer):
797
1085
  "rank_port": rank_port,
798
1086
  }
799
1087
  logger.debug(
800
- f"Register Prefill bootstrap: {engine_rank} with rank_ip: {rank_ip} and rank_port: {rank_port}"
1088
+ f"Register prefill bootstrap: {engine_rank} with rank_ip: {rank_ip} and rank_port: {rank_port}"
801
1089
  )
802
1090
 
803
1091
  return web.Response(text="OK", status=200)
@@ -833,7 +1121,11 @@ class MooncakeKVBootstrapServer(BaseKVBootstrapServer):
833
1121
  self._loop = asyncio.new_event_loop()
834
1122
  asyncio.set_event_loop(self._loop)
835
1123
 
836
- self._runner = web.AppRunner(self.app)
1124
+ access_log = None
1125
+ if logging.getLogger(__name__).getEffectiveLevel() <= logging.DEBUG:
1126
+ access_log = self.app.logger
1127
+
1128
+ self._runner = web.AppRunner(self.app, access_log=access_log)
837
1129
  self._loop.run_until_complete(self._runner.setup())
838
1130
 
839
1131
  site = web.TCPSite(self._runner, port=self.port)