sglang 0.4.6.post4__py3-none-any.whl → 0.4.7__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (358)
  1. sglang/bench_offline_throughput.py +16 -10
  2. sglang/bench_one_batch.py +5 -4
  3. sglang/bench_one_batch_server.py +86 -22
  4. sglang/bench_serving.py +197 -110
  5. sglang/compile_deep_gemm.py +4 -4
  6. sglang/lang/backend/runtime_endpoint.py +24 -1
  7. sglang/profiler.py +167 -0
  8. sglang/srt/_custom_ops.py +34 -0
  9. sglang/srt/configs/internvl.py +8 -12
  10. sglang/srt/configs/model_config.py +66 -29
  11. sglang/srt/constrained/base_grammar_backend.py +5 -2
  12. sglang/srt/constrained/llguidance_backend.py +9 -8
  13. sglang/srt/constrained/outlines_backend.py +5 -4
  14. sglang/srt/constrained/xgrammar_backend.py +18 -18
  15. sglang/srt/conversation.py +47 -9
  16. sglang/srt/custom_op.py +38 -3
  17. sglang/srt/debug_utils.py +74 -0
  18. sglang/srt/disaggregation/common/__init__.py +1 -0
  19. sglang/srt/disaggregation/common/conn.py +407 -0
  20. sglang/srt/disaggregation/decode.py +187 -134
  21. sglang/srt/disaggregation/decode_schedule_batch_mixin.py +142 -0
  22. sglang/srt/disaggregation/fake/conn.py +4 -13
  23. sglang/srt/disaggregation/kv_events.py +412 -0
  24. sglang/srt/disaggregation/launch_lb.py +140 -0
  25. sglang/srt/disaggregation/mini_lb.py +84 -70
  26. sglang/srt/disaggregation/mooncake/conn.py +441 -140
  27. sglang/srt/disaggregation/mooncake/transfer_engine.py +31 -14
  28. sglang/srt/disaggregation/nixl/conn.py +124 -442
  29. sglang/srt/disaggregation/prefill.py +128 -44
  30. sglang/srt/disaggregation/utils.py +154 -6
  31. sglang/srt/distributed/device_communicators/pymscclpp.py +315 -0
  32. sglang/srt/distributed/parallel_state.py +52 -5
  33. sglang/srt/distributed/utils.py +3 -3
  34. sglang/srt/entrypoints/EngineBase.py +11 -0
  35. sglang/srt/entrypoints/engine.py +129 -12
  36. sglang/srt/entrypoints/http_server.py +21 -6
  37. sglang/srt/entrypoints/http_server_engine.py +5 -2
  38. sglang/srt/function_call/base_format_detector.py +302 -0
  39. sglang/srt/function_call/core_types.py +34 -0
  40. sglang/srt/function_call/deepseekv3_detector.py +205 -0
  41. sglang/srt/function_call/ebnf_composer.py +248 -0
  42. sglang/srt/function_call/function_call_parser.py +202 -0
  43. sglang/srt/function_call/llama32_detector.py +93 -0
  44. sglang/srt/function_call/mistral_detector.py +131 -0
  45. sglang/srt/function_call/pythonic_detector.py +229 -0
  46. sglang/srt/function_call/qwen25_detector.py +121 -0
  47. sglang/srt/function_call/utils.py +52 -0
  48. sglang/srt/hf_transformers_utils.py +50 -7
  49. sglang/srt/layers/attention/aiter_backend.py +878 -0
  50. sglang/srt/layers/attention/base_attn_backend.py +4 -0
  51. sglang/srt/layers/attention/cutlass_mla_backend.py +2 -19
  52. sglang/srt/layers/attention/flashattention_backend.py +166 -35
  53. sglang/srt/layers/attention/flashinfer_backend.py +45 -1
  54. sglang/srt/layers/attention/flashinfer_mla_backend.py +45 -5
  55. sglang/srt/layers/attention/flashmla_backend.py +340 -78
  56. sglang/srt/layers/attention/intel_amx_backend.py +128 -0
  57. sglang/srt/layers/attention/tbo_backend.py +232 -0
  58. sglang/srt/layers/attention/torch_native_backend.py +3 -0
  59. sglang/srt/layers/attention/triton_backend.py +247 -5
  60. sglang/srt/layers/attention/triton_ops/extend_attention.py +12 -4
  61. sglang/srt/layers/attention/utils.py +2 -2
  62. sglang/srt/layers/attention/vision.py +1 -1
  63. sglang/srt/layers/communicator.py +517 -0
  64. sglang/srt/layers/dp_attention.py +6 -15
  65. sglang/srt/layers/layernorm.py +30 -19
  66. sglang/srt/layers/moe/cutlass_moe.py +370 -0
  67. sglang/srt/layers/moe/cutlass_moe_params.py +169 -0
  68. sglang/srt/layers/moe/ep_moe/kernels.py +60 -17
  69. sglang/srt/layers/moe/ep_moe/layer.py +195 -87
  70. sglang/srt/layers/moe/ep_moe/token_dispatcher.py +88 -8
  71. sglang/srt/layers/moe/fused_moe_native.py +4 -0
  72. sglang/srt/layers/moe/fused_moe_triton/configs/triton_3_2_0/E=257,N=128,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8,block_shape=[128, 128].json +146 -0
  73. sglang/srt/layers/moe/fused_moe_triton/configs/triton_3_2_0/E=257,N=128,device_name=NVIDIA_H20,dtype=fp8_w8a8,block_shape=[128, 128].json +146 -0
  74. sglang/srt/layers/moe/fused_moe_triton/configs/triton_3_2_0/E=257,N=256,device_name=NVIDIA_B200,dtype=fp8_w8a8,block_shape=[128, 128].json +146 -0
  75. sglang/srt/layers/moe/fused_moe_triton/configs/triton_3_2_0/E=257,N=256,device_name=NVIDIA_H20,dtype=fp8_w8a8,block_shape=[128, 128].json +146 -0
  76. sglang/srt/layers/moe/fused_moe_triton/configs/triton_3_2_0/E=257,N=256,device_name=NVIDIA_H200,dtype=fp8_w8a8,block_shape=[128, 128].json +146 -0
  77. sglang/srt/layers/moe/fused_moe_triton/configs/triton_3_2_0/E=8,N=7168,device_name=NVIDIA_H100_80GB_HBM3.json +146 -0
  78. sglang/srt/layers/moe/fused_moe_triton/configs/triton_3_3_1/E=257,N=256,device_name=NVIDIA_B200,dtype=fp8_w8a8,block_shape=[128, 128].json +146 -0
  79. sglang/srt/layers/moe/fused_moe_triton/configs/triton_3_3_1/E=8,N=7168,device_name=NVIDIA_H100_80GB_HBM3.json +146 -0
  80. sglang/srt/layers/moe/fused_moe_triton/fused_moe.py +220 -25
  81. sglang/srt/layers/moe/fused_moe_triton/layer.py +48 -4
  82. sglang/srt/layers/moe/topk.py +107 -24
  83. sglang/srt/layers/multimodal.py +70 -0
  84. sglang/srt/layers/quantization/__init__.py +10 -4
  85. sglang/srt/layers/quantization/blockwise_int8.py +3 -0
  86. sglang/srt/layers/quantization/compressed_tensors/compressed_tensors_moe.py +5 -0
  87. sglang/srt/layers/quantization/deep_gemm.py +60 -59
  88. sglang/srt/layers/quantization/fp8.py +113 -18
  89. sglang/srt/layers/quantization/fp8_kernel.py +118 -66
  90. sglang/srt/layers/quantization/fp8_utils.py +165 -43
  91. sglang/srt/layers/quantization/gptq.py +298 -6
  92. sglang/srt/layers/quantization/int8_kernel.py +18 -5
  93. sglang/srt/layers/quantization/modelopt_quant.py +334 -7
  94. sglang/srt/layers/quantization/moe_wna16.py +3 -0
  95. sglang/srt/layers/quantization/qoq.py +244 -0
  96. sglang/srt/layers/quantization/w8a8_fp8.py +3 -0
  97. sglang/srt/layers/quantization/w8a8_int8.py +3 -0
  98. sglang/srt/layers/rotary_embedding.py +6 -12
  99. sglang/srt/layers/sampler.py +80 -79
  100. sglang/srt/layers/utils.py +6 -0
  101. sglang/srt/lora/layers.py +12 -15
  102. sglang/srt/lora/lora.py +49 -5
  103. sglang/srt/lora/lora_manager.py +20 -8
  104. sglang/srt/lora/mem_pool.py +24 -16
  105. sglang/srt/lora/utils.py +17 -13
  106. sglang/srt/managers/data_parallel_controller.py +13 -5
  107. sglang/srt/managers/eplb_algorithms/__init__.py +63 -0
  108. sglang/srt/managers/eplb_algorithms/deepseek.py +223 -0
  109. sglang/srt/managers/eplb_algorithms/deepseek_vec.py +276 -0
  110. sglang/srt/managers/eplb_manager.py +96 -0
  111. sglang/srt/managers/expert_distribution.py +878 -56
  112. sglang/srt/managers/expert_location.py +448 -0
  113. sglang/srt/managers/expert_location_dispatch.py +108 -0
  114. sglang/srt/managers/io_struct.py +29 -5
  115. sglang/srt/managers/mm_utils.py +355 -151
  116. sglang/srt/managers/multimodal_processors/base_processor.py +299 -42
  117. sglang/srt/managers/multimodal_processors/deepseek_vl_v2.py +6 -1
  118. sglang/srt/managers/multimodal_processors/gemma3.py +15 -17
  119. sglang/srt/managers/multimodal_processors/internvl.py +18 -5
  120. sglang/srt/managers/multimodal_processors/janus_pro.py +7 -1
  121. sglang/srt/managers/multimodal_processors/kimi_vl.py +14 -32
  122. sglang/srt/managers/multimodal_processors/llava.py +3 -3
  123. sglang/srt/managers/multimodal_processors/minicpm.py +27 -32
  124. sglang/srt/managers/multimodal_processors/mllama4.py +6 -0
  125. sglang/srt/managers/multimodal_processors/phi4mm.py +87 -0
  126. sglang/srt/managers/multimodal_processors/pixtral.py +9 -9
  127. sglang/srt/managers/multimodal_processors/qwen_vl.py +35 -35
  128. sglang/srt/managers/schedule_batch.py +185 -55
  129. sglang/srt/managers/schedule_policy.py +4 -5
  130. sglang/srt/managers/scheduler.py +389 -154
  131. sglang/srt/managers/session_controller.py +1 -1
  132. sglang/srt/managers/tokenizer_manager.py +231 -39
  133. sglang/srt/managers/utils.py +0 -4
  134. sglang/srt/mem_cache/base_prefix_cache.py +3 -0
  135. sglang/srt/mem_cache/chunk_cache.py +3 -1
  136. sglang/srt/mem_cache/hiradix_cache.py +4 -4
  137. sglang/srt/mem_cache/memory_pool.py +74 -52
  138. sglang/srt/mem_cache/multimodal_cache.py +45 -0
  139. sglang/srt/mem_cache/radix_cache.py +58 -5
  140. sglang/srt/metrics/collector.py +11 -2
  141. sglang/srt/mm_utils.py +10 -0
  142. sglang/srt/model_executor/cuda_graph_runner.py +87 -65
  143. sglang/srt/model_executor/expert_location_updater.py +557 -0
  144. sglang/srt/model_executor/forward_batch_info.py +39 -14
  145. sglang/srt/model_executor/model_runner.py +231 -101
  146. sglang/srt/model_loader/loader.py +10 -6
  147. sglang/srt/model_loader/utils.py +67 -1
  148. sglang/srt/models/clip.py +5 -1
  149. sglang/srt/models/deepseek_nextn.py +1 -1
  150. sglang/srt/models/deepseek_v2.py +732 -403
  151. sglang/srt/models/exaone.py +8 -3
  152. sglang/srt/models/gemma3_causal.py +7 -0
  153. sglang/srt/models/gemma3_mm.py +75 -33
  154. sglang/srt/models/idefics2.py +342 -0
  155. sglang/srt/models/kimi_vl.py +4 -4
  156. sglang/srt/models/llama.py +1 -1
  157. sglang/srt/models/llama4.py +10 -2
  158. sglang/srt/models/llava.py +26 -18
  159. sglang/srt/models/mimo_mtp.py +220 -0
  160. sglang/srt/models/minicpmo.py +7 -17
  161. sglang/srt/models/minicpmv.py +3 -295
  162. sglang/srt/models/mistral.py +71 -1
  163. sglang/srt/models/mllama.py +3 -3
  164. sglang/srt/models/phi4mm.py +512 -0
  165. sglang/srt/models/qwen2.py +133 -35
  166. sglang/srt/models/qwen2_5_vl.py +5 -3
  167. sglang/srt/models/qwen2_eagle.py +4 -1
  168. sglang/srt/models/qwen2_moe.py +206 -69
  169. sglang/srt/models/qwen2_vl.py +3 -3
  170. sglang/srt/models/qwen3.py +92 -19
  171. sglang/srt/models/qwen3_moe.py +457 -55
  172. sglang/srt/models/registry.py +9 -1
  173. sglang/srt/models/siglip.py +294 -0
  174. sglang/srt/models/transformers.py +291 -0
  175. sglang/srt/openai_api/adapter.py +114 -40
  176. sglang/srt/openai_api/protocol.py +37 -2
  177. sglang/srt/openai_api/utils.py +172 -0
  178. sglang/srt/operations.py +189 -0
  179. sglang/srt/operations_strategy.py +207 -0
  180. sglang/srt/sampling/sampling_batch_info.py +13 -1
  181. sglang/srt/sampling/sampling_params.py +2 -1
  182. sglang/srt/server_args.py +235 -38
  183. sglang/srt/speculative/build_eagle_tree.py +8 -8
  184. sglang/srt/speculative/eagle_draft_cuda_graph_runner.py +8 -11
  185. sglang/srt/speculative/eagle_draft_extend_cuda_graph_runner.py +253 -0
  186. sglang/srt/speculative/eagle_utils.py +181 -90
  187. sglang/srt/speculative/eagle_worker.py +146 -21
  188. sglang/srt/two_batch_overlap.py +635 -0
  189. sglang/srt/utils.py +197 -19
  190. sglang/test/runners.py +16 -7
  191. sglang/test/send_one.py +4 -0
  192. sglang/test/test_cutlass_moe.py +278 -0
  193. sglang/test/test_fp4_moe.py +248 -0
  194. sglang/test/test_utils.py +81 -42
  195. sglang/utils.py +2 -2
  196. sglang/version.py +1 -1
  197. {sglang-0.4.6.post4.dist-info → sglang-0.4.7.dist-info}/METADATA +31 -19
  198. sglang-0.4.7.dist-info/RECORD +699 -0
  199. {sglang-0.4.6.post4.dist-info → sglang-0.4.7.dist-info}/WHEEL +1 -1
  200. sglang/srt/function_call_parser.py +0 -858
  201. sglang/srt/platforms/interface.py +0 -371
  202. sglang-0.4.6.post4.dist-info/RECORD +0 -646
  203. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=1,N=14336,device_name=NVIDIA_A100-SXM4-80GB,dtype=int8_w8a16.json → triton_3_1_0/E=1,N=14336,device_name=NVIDIA_A100-SXM4-80GB,dtype=int8_w8a16.json} +0 -0
  204. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=1,N=14336,device_name=NVIDIA_A100-SXM4-80GB.json → triton_3_1_0/E=1,N=14336,device_name=NVIDIA_A100-SXM4-80GB.json} +0 -0
  205. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=1,N=1792,device_name=NVIDIA_A100-SXM4-80GB,dtype=int8_w8a16.json → triton_3_1_0/E=1,N=1792,device_name=NVIDIA_A100-SXM4-80GB,dtype=int8_w8a16.json} +0 -0
  206. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=1,N=1792,device_name=NVIDIA_A100-SXM4-80GB.json → triton_3_1_0/E=1,N=1792,device_name=NVIDIA_A100-SXM4-80GB.json} +0 -0
  207. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=1,N=3072,device_name=NVIDIA_A100-SXM4-80GB,dtype=int8_w8a16.json → triton_3_1_0/E=1,N=3072,device_name=NVIDIA_A100-SXM4-80GB,dtype=int8_w8a16.json} +0 -0
  208. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=1,N=3072,device_name=NVIDIA_H100_80GB_HBM3,dtype=int8_w8a16.json → triton_3_1_0/E=1,N=3072,device_name=NVIDIA_H100_80GB_HBM3,dtype=int8_w8a16.json} +0 -0
  209. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=1,N=3072,device_name=NVIDIA_H100_80GB_HBM3.json → triton_3_1_0/E=1,N=3072,device_name=NVIDIA_H100_80GB_HBM3.json} +0 -0
  210. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=1,N=3584,device_name=NVIDIA_A100-SXM4-80GB,dtype=int8_w8a16.json → triton_3_1_0/E=1,N=3584,device_name=NVIDIA_A100-SXM4-80GB,dtype=int8_w8a16.json} +0 -0
  211. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=1,N=3584,device_name=NVIDIA_A100-SXM4-80GB.json → triton_3_1_0/E=1,N=3584,device_name=NVIDIA_A100-SXM4-80GB.json} +0 -0
  212. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=1,N=7168,device_name=NVIDIA_A100-SXM4-80GB,dtype=int8_w8a16.json → triton_3_1_0/E=1,N=7168,device_name=NVIDIA_A100-SXM4-80GB,dtype=int8_w8a16.json} +0 -0
  213. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=1,N=7168,device_name=NVIDIA_A100-SXM4-80GB.json → triton_3_1_0/E=1,N=7168,device_name=NVIDIA_A100-SXM4-80GB.json} +0 -0
  214. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=144,N=512,device_name=NVIDIA_H100_80GB_HBM3.json → triton_3_1_0/E=144,N=512,device_name=NVIDIA_H100_80GB_HBM3.json} +0 -0
  215. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=16,N=1024,device_name=NVIDIA_H100_80GB_HBM3.json → triton_3_1_0/E=16,N=1024,device_name=NVIDIA_H100_80GB_HBM3.json} +0 -0
  216. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=16,N=1024,device_name=NVIDIA_H200.json → triton_3_1_0/E=16,N=1024,device_name=NVIDIA_H200.json} +0 -0
  217. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=16,N=1344,device_name=NVIDIA_A100-SXM4-40GB.json → triton_3_1_0/E=16,N=1344,device_name=NVIDIA_A100-SXM4-40GB.json} +0 -0
  218. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=16,N=1344,device_name=NVIDIA_A100-SXM4-80GB.json → triton_3_1_0/E=16,N=1344,device_name=NVIDIA_A100-SXM4-80GB.json} +0 -0
  219. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=16,N=1344,device_name=NVIDIA_H100_80GB_HBM3.json → triton_3_1_0/E=16,N=1344,device_name=NVIDIA_H100_80GB_HBM3.json} +0 -0
  220. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=16,N=14336,device_name=NVIDIA_A100-SXM4-80GB,dtype=int8_w8a16.json → triton_3_1_0/E=16,N=14336,device_name=NVIDIA_A100-SXM4-80GB,dtype=int8_w8a16.json} +0 -0
  221. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=16,N=14336,device_name=NVIDIA_A100-SXM4-80GB.json → triton_3_1_0/E=16,N=14336,device_name=NVIDIA_A100-SXM4-80GB.json} +0 -0
  222. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=16,N=1792,device_name=NVIDIA_A100-SXM4-80GB,dtype=int8_w8a16.json → triton_3_1_0/E=16,N=1792,device_name=NVIDIA_A100-SXM4-80GB,dtype=int8_w8a16.json} +0 -0
  223. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=16,N=1792,device_name=NVIDIA_A100-SXM4-80GB.json → triton_3_1_0/E=16,N=1792,device_name=NVIDIA_A100-SXM4-80GB.json} +0 -0
  224. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=16,N=2048,device_name=NVIDIA_H100_80GB_HBM3.json → triton_3_1_0/E=16,N=2048,device_name=NVIDIA_H100_80GB_HBM3.json} +0 -0
  225. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=16,N=2688,device_name=NVIDIA_A100-SXM4-80GB.json → triton_3_1_0/E=16,N=2688,device_name=NVIDIA_A100-SXM4-80GB.json} +0 -0
  226. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=16,N=2688,device_name=NVIDIA_H100_80GB_HBM3.json → triton_3_1_0/E=16,N=2688,device_name=NVIDIA_H100_80GB_HBM3.json} +0 -0
  227. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=16,N=3072,device_name=NVIDIA_A100-SXM4-80GB,dtype=int8_w8a16.json → triton_3_1_0/E=16,N=3072,device_name=NVIDIA_A100-SXM4-80GB,dtype=int8_w8a16.json} +0 -0
  228. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=16,N=3072,device_name=NVIDIA_H100_80GB_HBM3,dtype=int8_w8a16.json → triton_3_1_0/E=16,N=3072,device_name=NVIDIA_H100_80GB_HBM3,dtype=int8_w8a16.json} +0 -0
  229. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=16,N=3200,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8.json → triton_3_1_0/E=16,N=3200,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8.json} +0 -0
  230. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=16,N=3584,device_name=NVIDIA_A100-SXM4-80GB,dtype=int8_w8a16.json → triton_3_1_0/E=16,N=3584,device_name=NVIDIA_A100-SXM4-80GB,dtype=int8_w8a16.json} +0 -0
  231. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=16,N=3584,device_name=NVIDIA_A100-SXM4-80GB.json → triton_3_1_0/E=16,N=3584,device_name=NVIDIA_A100-SXM4-80GB.json} +0 -0
  232. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=16,N=6400,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8.json → triton_3_1_0/E=16,N=6400,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8.json} +0 -0
  233. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=16,N=7168,device_name=NVIDIA_A100-SXM4-80GB,dtype=int8_w8a16.json → triton_3_1_0/E=16,N=7168,device_name=NVIDIA_A100-SXM4-80GB,dtype=int8_w8a16.json} +0 -0
  234. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=16,N=7168,device_name=NVIDIA_A100-SXM4-80GB.json → triton_3_1_0/E=16,N=7168,device_name=NVIDIA_A100-SXM4-80GB.json} +0 -0
  235. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=16,N=7168,device_name=NVIDIA_H100_80GB_HBM3,dtype=int8_w8a16.json → triton_3_1_0/E=16,N=7168,device_name=NVIDIA_H100_80GB_HBM3,dtype=int8_w8a16.json} +0 -0
  236. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=16,N=800,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8.json → triton_3_1_0/E=16,N=800,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8.json} +0 -0
  237. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=160,N=192,device_name=NVIDIA_A800-SXM4-80GB.json → triton_3_1_0/E=160,N=192,device_name=NVIDIA_A800-SXM4-80GB.json} +0 -0
  238. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=20,N=2048,device_name=NVIDIA_H100_80GB_HBM3.json → triton_3_1_0/E=20,N=2048,device_name=NVIDIA_H100_80GB_HBM3.json} +0 -0
  239. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=24,N=1024,device_name=NVIDIA_H100_80GB_HBM3.json → triton_3_1_0/E=24,N=1024,device_name=NVIDIA_H100_80GB_HBM3.json} +0 -0
  240. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=256,N=128,device_name=NVIDIA_A100-SXM4-80GB,dtype=int8_w8a8,block_shape=[128, 128].json → triton_3_1_0/E=256,N=128,device_name=NVIDIA_A100-SXM4-80GB,dtype=int8_w8a8,block_shape=[128, 128].json} +0 -0
  241. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=256,N=128,device_name=NVIDIA_A100-SXM4-80GB,dtype=int8_w8a8.json → triton_3_1_0/E=256,N=128,device_name=NVIDIA_A100-SXM4-80GB,dtype=int8_w8a8.json} +0 -0
  242. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=256,N=128,device_name=NVIDIA_A800-SXM4-80GB,dtype=int8_w8a8,block_shape=[128, 128].json → triton_3_1_0/E=256,N=128,device_name=NVIDIA_A800-SXM4-80GB,dtype=int8_w8a8,block_shape=[128, 128].json} +0 -0
  243. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=256,N=128,device_name=NVIDIA_A800-SXM4-80GB,dtype=int8_w8a8.json → triton_3_1_0/E=256,N=128,device_name=NVIDIA_A800-SXM4-80GB,dtype=int8_w8a8.json} +0 -0
  244. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=256,N=128,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8,block_shape=[128, 128].json → triton_3_1_0/E=256,N=128,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8,block_shape=[128, 128].json} +0 -0
  245. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=256,N=128,device_name=NVIDIA_H20,block_shape=[128, 128].json → triton_3_1_0/E=256,N=128,device_name=NVIDIA_H20,block_shape=[128, 128].json} +0 -0
  246. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=256,N=128,device_name=NVIDIA_H20,dtype=fp8_w8a8,block_shape=[128, 128].json → triton_3_1_0/E=256,N=128,device_name=NVIDIA_H20,dtype=fp8_w8a8,block_shape=[128, 128].json} +0 -0
  247. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=256,N=128,device_name=NVIDIA_L20Y,dtype=fp8_w8a8,block_shape=[128, 128].json → triton_3_1_0/E=256,N=128,device_name=NVIDIA_L20Y,dtype=fp8_w8a8,block_shape=[128, 128].json} +0 -0
  248. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=256,N=256,device_name=AMD_Instinct_MI300X,dtype=fp8_w8a8,block_shape=[128, 128].json → triton_3_1_0/E=256,N=256,device_name=AMD_Instinct_MI300X,dtype=fp8_w8a8,block_shape=[128, 128].json} +0 -0
  249. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=256,N=256,device_name=AMD_Instinct_MI325X,dtype=fp8_w8a8,block_shape=[128, 128].json → triton_3_1_0/E=256,N=256,device_name=AMD_Instinct_MI325X,dtype=fp8_w8a8,block_shape=[128, 128].json} +0 -0
  250. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=256,N=256,device_name=AMD_Radeon_Graphics,dtype=fp8_w8a8,block_shape=[128, 128].json → triton_3_1_0/E=256,N=256,device_name=AMD_Radeon_Graphics,dtype=fp8_w8a8,block_shape=[128, 128].json} +0 -0
  251. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=256,N=256,device_name=NVIDIA_B200,dtype=fp8_w8a8,block_shape=[128, 128].json → triton_3_1_0/E=256,N=256,device_name=NVIDIA_B200,dtype=fp8_w8a8,block_shape=[128, 128].json} +0 -0
  252. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=256,N=256,device_name=NVIDIA_H20,dtype=int8_w8a8,block_shape=[128, 128].json → triton_3_1_0/E=256,N=256,device_name=NVIDIA_H20,dtype=int8_w8a8,block_shape=[128, 128].json} +0 -0
  253. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=256,N=256,device_name=NVIDIA_H200,dtype=fp8_w8a8,block_shape=[128, 128].json → triton_3_1_0/E=256,N=256,device_name=NVIDIA_H200,dtype=fp8_w8a8,block_shape=[128, 128].json} +0 -0
  254. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=256,N=64,device_name=NVIDIA_A800-SXM4-80GB.json → triton_3_1_0/E=256,N=64,device_name=NVIDIA_A800-SXM4-80GB.json} +0 -0
  255. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=256,N=64,device_name=NVIDIA_L20,dtype=int8_w8a8.json → triton_3_1_0/E=256,N=64,device_name=NVIDIA_L20,dtype=int8_w8a8.json} +0 -0
  256. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=256,N=64,device_name=NVIDIA_L40S,dtype=int8_w8a8.json → triton_3_1_0/E=256,N=64,device_name=NVIDIA_L40S,dtype=int8_w8a8.json} +0 -0
  257. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=64,N=1024,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8,block_shape=[128, 128].json → triton_3_1_0/E=64,N=1024,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8,block_shape=[128, 128].json} +0 -0
  258. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=64,N=1280,device_name=NVIDIA_A100-SXM4-80GB.json → triton_3_1_0/E=64,N=1280,device_name=NVIDIA_A100-SXM4-80GB.json} +0 -0
  259. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=64,N=1280,device_name=NVIDIA_A800-SXM4-80GB.json → triton_3_1_0/E=64,N=1280,device_name=NVIDIA_A800-SXM4-80GB.json} +0 -0
  260. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=64,N=1280,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8.json → triton_3_1_0/E=64,N=1280,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8.json} +0 -0
  261. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=64,N=1280,device_name=NVIDIA_H100_80GB_HBM3.json → triton_3_1_0/E=64,N=1280,device_name=NVIDIA_H100_80GB_HBM3.json} +0 -0
  262. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=64,N=1280,device_name=NVIDIA_H200,dtype=fp8_w8a8.json → triton_3_1_0/E=64,N=1280,device_name=NVIDIA_H200,dtype=fp8_w8a8.json} +0 -0
  263. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=64,N=1280,device_name=NVIDIA_H200.json → triton_3_1_0/E=64,N=1280,device_name=NVIDIA_H200.json} +0 -0
  264. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=64,N=2560,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8.json → triton_3_1_0/E=64,N=2560,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8.json} +0 -0
  265. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=64,N=2560,device_name=NVIDIA_H200,dtype=fp8_w8a8.json → triton_3_1_0/E=64,N=2560,device_name=NVIDIA_H200,dtype=fp8_w8a8.json} +0 -0
  266. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=64,N=2560,device_name=NVIDIA_H200.json → triton_3_1_0/E=64,N=2560,device_name=NVIDIA_H200.json} +0 -0
  267. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=64,N=320,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8.json → triton_3_1_0/E=64,N=320,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8.json} +0 -0
  268. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=64,N=320,device_name=NVIDIA_H100_80GB_HBM3.json → triton_3_1_0/E=64,N=320,device_name=NVIDIA_H100_80GB_HBM3.json} +0 -0
  269. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=64,N=320,device_name=NVIDIA_H200,dtype=fp8_w8a8.json → triton_3_1_0/E=64,N=320,device_name=NVIDIA_H200,dtype=fp8_w8a8.json} +0 -0
  270. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=64,N=320,device_name=NVIDIA_H200.json → triton_3_1_0/E=64,N=320,device_name=NVIDIA_H200.json} +0 -0
  271. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=64,N=512,device_name=NVIDIA_H200,dtype=fp8_w8a8,block_shape=[128, 128].json → triton_3_1_0/E=64,N=512,device_name=NVIDIA_H200,dtype=fp8_w8a8,block_shape=[128, 128].json} +0 -0
  272. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=64,N=640,device_name=NVIDIA_A100-SXM4-80GB.json → triton_3_1_0/E=64,N=640,device_name=NVIDIA_A100-SXM4-80GB.json} +0 -0
  273. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=64,N=640,device_name=NVIDIA_A800-SXM4-80GB.json → triton_3_1_0/E=64,N=640,device_name=NVIDIA_A800-SXM4-80GB.json} +0 -0
  274. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=64,N=640,device_name=NVIDIA_GeForce_RTX_4090,dtype=fp8_w8a8.json → triton_3_1_0/E=64,N=640,device_name=NVIDIA_GeForce_RTX_4090,dtype=fp8_w8a8.json} +0 -0
  275. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=64,N=640,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8.json → triton_3_1_0/E=64,N=640,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8.json} +0 -0
  276. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=64,N=640,device_name=NVIDIA_H100_80GB_HBM3.json → triton_3_1_0/E=64,N=640,device_name=NVIDIA_H100_80GB_HBM3.json} +0 -0
  277. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=64,N=640,device_name=NVIDIA_H200,dtype=fp8_w8a8.json → triton_3_1_0/E=64,N=640,device_name=NVIDIA_H200,dtype=fp8_w8a8.json} +0 -0
  278. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=64,N=640,device_name=NVIDIA_H200.json → triton_3_1_0/E=64,N=640,device_name=NVIDIA_H200.json} +0 -0
  279. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=8,N=14336,device_name=AMD_Instinct_MI300X.json → triton_3_1_0/E=8,N=14336,device_name=AMD_Instinct_MI300X.json} +0 -0
  280. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=8,N=14336,device_name=AMD_Instinct_MI325X.json → triton_3_1_0/E=8,N=14336,device_name=AMD_Instinct_MI325X.json} +0 -0
  281. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=8,N=14336,device_name=AMD_Radeon_Graphics.json → triton_3_1_0/E=8,N=14336,device_name=AMD_Radeon_Graphics.json} +0 -0
  282. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=8,N=14336,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8.json → triton_3_1_0/E=8,N=14336,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8.json} +0 -0
  283. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=8,N=14336,device_name=NVIDIA_H200,dtype=fp8_w8a8.json → triton_3_1_0/E=8,N=14336,device_name=NVIDIA_H200,dtype=fp8_w8a8.json} +0 -0
  284. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=8,N=14336,device_name=NVIDIA_H200.json → triton_3_1_0/E=8,N=14336,device_name=NVIDIA_H200.json} +0 -0
  285. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=8,N=1792,device_name=AMD_Instinct_MI300X.json → triton_3_1_0/E=8,N=1792,device_name=AMD_Instinct_MI300X.json} +0 -0
  286. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=8,N=1792,device_name=AMD_Instinct_MI325X.json → triton_3_1_0/E=8,N=1792,device_name=AMD_Instinct_MI325X.json} +0 -0
  287. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=8,N=1792,device_name=AMD_Radeon_Graphics.json → triton_3_1_0/E=8,N=1792,device_name=AMD_Radeon_Graphics.json} +0 -0
  288. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=8,N=1792,device_name=NVIDIA_A100-SXM4-40GB.json → triton_3_1_0/E=8,N=1792,device_name=NVIDIA_A100-SXM4-40GB.json} +0 -0
  289. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=8,N=1792,device_name=NVIDIA_A100-SXM4-80GB.json → triton_3_1_0/E=8,N=1792,device_name=NVIDIA_A100-SXM4-80GB.json} +0 -0
  290. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=8,N=1792,device_name=NVIDIA_H100_80GB_HBM3.json → triton_3_1_0/E=8,N=1792,device_name=NVIDIA_H100_80GB_HBM3.json} +0 -0
  291. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=8,N=1792,device_name=NVIDIA_H200,dtype=fp8_w8a8.json → triton_3_1_0/E=8,N=1792,device_name=NVIDIA_H200,dtype=fp8_w8a8.json} +0 -0
  292. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=8,N=1792,device_name=NVIDIA_H200.json → triton_3_1_0/E=8,N=1792,device_name=NVIDIA_H200.json} +0 -0
  293. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=8,N=2048,device_name=NVIDIA_A100-SXM4-80GB.json → triton_3_1_0/E=8,N=2048,device_name=NVIDIA_A100-SXM4-80GB.json} +0 -0
  294. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=8,N=2048,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8.json → triton_3_1_0/E=8,N=2048,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8.json} +0 -0
  295. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=8,N=2048,device_name=NVIDIA_H100_80GB_HBM3.json → triton_3_1_0/E=8,N=2048,device_name=NVIDIA_H100_80GB_HBM3.json} +0 -0
  296. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=8,N=2048,device_name=NVIDIA_H200,dtype=fp8_w8a8.json → triton_3_1_0/E=8,N=2048,device_name=NVIDIA_H200,dtype=fp8_w8a8.json} +0 -0
  297. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=8,N=2048,device_name=NVIDIA_H200.json → triton_3_1_0/E=8,N=2048,device_name=NVIDIA_H200.json} +0 -0
  298. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=8,N=3584,device_name=AMD_Instinct_MI300X.json → triton_3_1_0/E=8,N=3584,device_name=AMD_Instinct_MI300X.json} +0 -0
  299. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=8,N=3584,device_name=AMD_Instinct_MI325X.json → triton_3_1_0/E=8,N=3584,device_name=AMD_Instinct_MI325X.json} +0 -0
  300. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=8,N=3584,device_name=AMD_Radeon_Graphics.json → triton_3_1_0/E=8,N=3584,device_name=AMD_Radeon_Graphics.json} +0 -0
  301. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=8,N=3584,device_name=NVIDIA_A100-SXM4-40GB.json → triton_3_1_0/E=8,N=3584,device_name=NVIDIA_A100-SXM4-40GB.json} +0 -0
  302. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=8,N=3584,device_name=NVIDIA_A100-SXM4-80GB.json → triton_3_1_0/E=8,N=3584,device_name=NVIDIA_A100-SXM4-80GB.json} +0 -0
  303. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=8,N=3584,device_name=NVIDIA_GeForce_RTX_4090,dtype=fp8_w8a8.json → triton_3_1_0/E=8,N=3584,device_name=NVIDIA_GeForce_RTX_4090,dtype=fp8_w8a8.json} +0 -0
  304. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=8,N=3584,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8.json → triton_3_1_0/E=8,N=3584,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8.json} +0 -0
  305. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=8,N=3584,device_name=NVIDIA_H100_80GB_HBM3.json → triton_3_1_0/E=8,N=3584,device_name=NVIDIA_H100_80GB_HBM3.json} +0 -0
  306. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=8,N=3584,device_name=NVIDIA_H200,dtype=fp8_w8a8.json → triton_3_1_0/E=8,N=3584,device_name=NVIDIA_H200,dtype=fp8_w8a8.json} +0 -0
  307. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=8,N=3584,device_name=NVIDIA_H200.json → triton_3_1_0/E=8,N=3584,device_name=NVIDIA_H200.json} +0 -0
  308. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=8,N=3584,device_name=NVIDIA_L40S.json → triton_3_1_0/E=8,N=3584,device_name=NVIDIA_L40S.json} +0 -0
  309. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=8,N=4096,device_name=AMD_Instinct_MI300X,dtype=fp8_w8a8.json → triton_3_1_0/E=8,N=4096,device_name=AMD_Instinct_MI300X,dtype=fp8_w8a8.json} +0 -0
  310. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=8,N=4096,device_name=AMD_Instinct_MI325X,dtype=fp8_w8a8.json → triton_3_1_0/E=8,N=4096,device_name=AMD_Instinct_MI325X,dtype=fp8_w8a8.json} +0 -0
  311. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=8,N=4096,device_name=AMD_Radeon_Graphics,dtype=fp8_w8a8.json → triton_3_1_0/E=8,N=4096,device_name=AMD_Radeon_Graphics,dtype=fp8_w8a8.json} +0 -0
  312. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=8,N=4096,device_name=NVIDIA_A100-SXM4-80GB.json → triton_3_1_0/E=8,N=4096,device_name=NVIDIA_A100-SXM4-80GB.json} +0 -0
  313. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=8,N=4096,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8.json → triton_3_1_0/E=8,N=4096,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8.json} +0 -0
  314. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=8,N=4096,device_name=NVIDIA_H100_80GB_HBM3.json → triton_3_1_0/E=8,N=4096,device_name=NVIDIA_H100_80GB_HBM3.json} +0 -0
  315. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=8,N=4096,device_name=NVIDIA_H200,dtype=fp8_w8a8.json → triton_3_1_0/E=8,N=4096,device_name=NVIDIA_H200,dtype=fp8_w8a8.json} +0 -0
  316. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=8,N=4096,device_name=NVIDIA_H200.json → triton_3_1_0/E=8,N=4096,device_name=NVIDIA_H200.json} +0 -0
  317. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=8,N=7168,device_name=AMD_Instinct_MI300X.json → triton_3_1_0/E=8,N=7168,device_name=AMD_Instinct_MI300X.json} +0 -0
  318. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=8,N=7168,device_name=AMD_Instinct_MI325X.json → triton_3_1_0/E=8,N=7168,device_name=AMD_Instinct_MI325X.json} +0 -0
  319. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=8,N=7168,device_name=AMD_Radeon_Graphics.json → triton_3_1_0/E=8,N=7168,device_name=AMD_Radeon_Graphics.json} +0 -0
  320. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=8,N=7168,device_name=NVIDIA_A100-SXM4-80GB.json → triton_3_1_0/E=8,N=7168,device_name=NVIDIA_A100-SXM4-80GB.json} +0 -0
  321. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=8,N=7168,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8.json → triton_3_1_0/E=8,N=7168,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8.json} +0 -0
  322. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=8,N=7168,device_name=NVIDIA_H100_80GB_HBM3.json → triton_3_1_0/E=8,N=7168,device_name=NVIDIA_H100_80GB_HBM3.json} +0 -0
  323. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=8,N=7168,device_name=NVIDIA_H200,dtype=fp8_w8a8.json → triton_3_1_0/E=8,N=7168,device_name=NVIDIA_H200,dtype=fp8_w8a8.json} +0 -0
  324. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=8,N=7168,device_name=NVIDIA_H200.json → triton_3_1_0/E=8,N=7168,device_name=NVIDIA_H200.json} +0 -0
  325. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=8,N=8192,device_name=AMD_Instinct_MI300X,dtype=fp8_w8a8.json → triton_3_1_0/E=8,N=8192,device_name=AMD_Instinct_MI300X,dtype=fp8_w8a8.json} +0 -0
  326. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=8,N=8192,device_name=AMD_Instinct_MI325X,dtype=fp8_w8a8.json → triton_3_1_0/E=8,N=8192,device_name=AMD_Instinct_MI325X,dtype=fp8_w8a8.json} +0 -0
  327. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=8,N=8192,device_name=AMD_Radeon_Graphics,dtype=fp8_w8a8.json → triton_3_1_0/E=8,N=8192,device_name=AMD_Radeon_Graphics,dtype=fp8_w8a8.json} +0 -0
  328. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=8,N=8192,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8.json → triton_3_1_0/E=8,N=8192,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8.json} +0 -0
  329. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=8,N=8192,device_name=NVIDIA_H200,dtype=fp8_w8a8.json → triton_3_1_0/E=8,N=8192,device_name=NVIDIA_H200,dtype=fp8_w8a8.json} +0 -0
  330. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=128,N=192,device_name=NVIDIA_A800-SXM4-80GB.json → triton_3_2_0/E=128,N=192,device_name=NVIDIA_A800-SXM4-80GB.json} +0 -0
  331. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=128,N=192,device_name=NVIDIA_H100_80GB_HBM3.json → triton_3_2_0/E=128,N=192,device_name=NVIDIA_H100_80GB_HBM3.json} +0 -0
  332. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=128,N=192,device_name=NVIDIA_H20.json → triton_3_2_0/E=128,N=192,device_name=NVIDIA_H20.json} +0 -0
  333. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=128,N=192,device_name=NVIDIA_H200.json → triton_3_2_0/E=128,N=192,device_name=NVIDIA_H200.json} +0 -0
  334. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=128,N=384,device_name=NVIDIA_H100_80GB_HBM3.json → triton_3_2_0/E=128,N=384,device_name=NVIDIA_H100_80GB_HBM3.json} +0 -0
  335. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=128,N=384,device_name=NVIDIA_H20,dtype=fp8_w8a8,block_shape=[128, 128].json → triton_3_2_0/E=128,N=384,device_name=NVIDIA_H20,dtype=fp8_w8a8,block_shape=[128, 128].json} +0 -0
  336. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=128,N=384,device_name=NVIDIA_H20.json → triton_3_2_0/E=128,N=384,device_name=NVIDIA_H20.json} +0 -0
  337. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=128,N=384,device_name=NVIDIA_H200,dtype=fp8_w8a8,block_shape=[128, 128].json → triton_3_2_0/E=128,N=384,device_name=NVIDIA_H200,dtype=fp8_w8a8,block_shape=[128, 128].json} +0 -0
  338. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=128,N=384,device_name=NVIDIA_H200.json → triton_3_2_0/E=128,N=384,device_name=NVIDIA_H200.json} +0 -0
  339. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=128,N=512,device_name=NVIDIA_H100_80GB_HBM3.json → triton_3_2_0/E=128,N=512,device_name=NVIDIA_H100_80GB_HBM3.json} +0 -0
  340. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=128,N=768,device_name=NVIDIA_A800-SXM4-80GB.json → triton_3_2_0/E=128,N=768,device_name=NVIDIA_A800-SXM4-80GB.json} +0 -0
  341. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=128,N=768,device_name=NVIDIA_H100_80GB_HBM3.json → triton_3_2_0/E=128,N=768,device_name=NVIDIA_H100_80GB_HBM3.json} +0 -0
  342. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=128,N=768,device_name=NVIDIA_H20,dtype=fp8_w8a8,block_shape=[128, 128].json → triton_3_2_0/E=128,N=768,device_name=NVIDIA_H20,dtype=fp8_w8a8,block_shape=[128, 128].json} +0 -0
  343. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=128,N=768,device_name=NVIDIA_H20.json → triton_3_2_0/E=128,N=768,device_name=NVIDIA_H20.json} +0 -0
  344. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=128,N=768,device_name=NVIDIA_H200,dtype=fp8_w8a8,block_shape=[128, 128].json → triton_3_2_0/E=128,N=768,device_name=NVIDIA_H200,dtype=fp8_w8a8,block_shape=[128, 128].json} +0 -0
  345. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=128,N=768,device_name=NVIDIA_H200.json → triton_3_2_0/E=128,N=768,device_name=NVIDIA_H200.json} +0 -0
  346. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=128,N=96,device_name=NVIDIA_H20.json → triton_3_2_0/E=128,N=96,device_name=NVIDIA_H20.json} +0 -0
  347. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=264,N=128,device_name=NVIDIA_A800-SXM4-80GB,dtype=int8_w8a8.json → triton_3_2_0/E=264,N=128,device_name=NVIDIA_A800-SXM4-80GB,dtype=int8_w8a8.json} +0 -0
  348. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=264,N=256,device_name=NVIDIA_B200,dtype=fp8_w8a8,block_shape=[128, 128].json → triton_3_2_0/E=264,N=256,device_name=NVIDIA_B200,dtype=fp8_w8a8,block_shape=[128, 128].json} +0 -0
  349. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=264,N=256,device_name=NVIDIA_H20,dtype=fp8_w8a8,block_shape=[128, 128].json → triton_3_2_0/E=264,N=256,device_name=NVIDIA_H20,dtype=fp8_w8a8,block_shape=[128, 128].json} +0 -0
  350. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=264,N=256,device_name=NVIDIA_H200,dtype=fp8_w8a8,block_shape=[128, 128].json → triton_3_2_0/E=264,N=256,device_name=NVIDIA_H200,dtype=fp8_w8a8,block_shape=[128, 128].json} +0 -0
  351. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=272,N=128,device_name=NVIDIA_A800-SXM4-80GB,dtype=int8_w8a8.json → triton_3_2_0/E=272,N=128,device_name=NVIDIA_A800-SXM4-80GB,dtype=int8_w8a8.json} +0 -0
  352. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=272,N=128,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8,block_shape=[128, 128].json → triton_3_2_0/E=272,N=128,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8,block_shape=[128, 128].json} +0 -0
  353. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=272,N=128,device_name=NVIDIA_H20,dtype=fp8_w8a8,block_shape=[128, 128].json → triton_3_2_0/E=272,N=128,device_name=NVIDIA_H20,dtype=fp8_w8a8,block_shape=[128, 128].json} +0 -0
  354. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=272,N=64,device_name=NVIDIA_A800-SXM4-80GB.json → triton_3_2_0/E=272,N=64,device_name=NVIDIA_A800-SXM4-80GB.json} +0 -0
  355. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=288,N=64,device_name=NVIDIA_A800-SXM4-80GB.json → triton_3_2_0/E=288,N=64,device_name=NVIDIA_A800-SXM4-80GB.json} +0 -0
  356. /sglang/srt/models/{xiaomi_mimo.py → mimo.py} +0 -0
  357. {sglang-0.4.6.post4.dist-info → sglang-0.4.7.dist-info}/licenses/LICENSE +0 -0
  358. {sglang-0.4.6.post4.dist-info → sglang-0.4.7.dist-info}/top_level.txt +0 -0
sglang/srt/disaggregation/nixl/conn.py
@@ -18,50 +18,23 @@ import requests
 import zmq
 from aiohttp import web

-from sglang.srt.disaggregation.base.conn import (
-    BaseKVBootstrapServer,
-    BaseKVManager,
-    BaseKVReceiver,
-    BaseKVSender,
-    KVArgs,
-    KVPoll,
+from sglang.srt.disaggregation.base.conn import BaseKVSender, KVArgs, KVPoll
+from sglang.srt.disaggregation.common.conn import (
+    CommonKVBootstrapServer,
+    CommonKVManager,
+    CommonKVReceiver,
+)
+from sglang.srt.disaggregation.utils import (
+    DisaggregationMode,
+    group_concurrent_contiguous,
 )
-from sglang.srt.disaggregation.utils import DisaggregationMode
 from sglang.srt.server_args import ServerArgs
-from sglang.srt.utils import get_free_port, get_ip, get_local_ip_by_remote
+from sglang.srt.utils import get_local_ip_by_remote

 logger = logging.getLogger(__name__)

 NixlEngineInfo: TypeAlias = Dict[str, Union[str, int]]

-
-# From Mooncake backend.
-def group_concurrent_contiguous(
-    src_indices: npt.NDArray[np.int64], dst_indices: npt.NDArray[np.int64]
-) -> Tuple[List[npt.NDArray[np.int64]], List[npt.NDArray[np.int64]]]:
-    src_groups = []
-    dst_groups = []
-    current_src = [src_indices[0]]
-    current_dst = [dst_indices[0]]
-
-    for i in range(1, len(src_indices)):
-        src_contiguous = src_indices[i] == src_indices[i - 1] + 1
-        dst_contiguous = dst_indices[i] == dst_indices[i - 1] + 1
-        if src_contiguous and dst_contiguous:
-            current_src.append(src_indices[i])
-            current_dst.append(dst_indices[i])
-        else:
-            src_groups.append(current_src)
-            dst_groups.append(current_dst)
-            current_src = [src_indices[i]]
-            current_dst = [dst_indices[i]]
-
-    src_groups.append(current_src)
-    dst_groups.append(current_dst)
-
-    return src_groups, dst_groups
-
-
 GUARD = "NixlMsgGuard".encode("ascii")


@@ -71,42 +44,32 @@ class TransferInfo:
     endpoint: str
     dst_port: int
     agent_metadata: bytes
+    agent_name: str
     dst_kv_ptrs: list[int]
     dst_kv_indices: npt.NDArray[np.int64]
     dst_aux_ptrs: list[int]
     dst_aux_index: int
     dst_gpu_id: int
+    required_dst_info_num: int

     def is_dummy(self):
-        return self.endpoint == ""
+        return self.dst_kv_indices.size == 0

     @classmethod
     def from_zmq(cls, msg: List[bytes]):
-        if len(msg) == 1:
-            # dummy msg
-            return cls(
-                room=int(msg[0].decode("ascii")),
-                endpoint="",
-                dst_port=0,
-                agent_metadata=b"",
-                dst_kv_ptrs=[],
-                dst_kv_indices=np.array([], dtype=np.int64),
-                dst_aux_ptrs=[],
-                dst_aux_index=0,
-                dst_gpu_id=0,
-            )
-        else:
-            return cls(
-                room=int(msg[0].decode("ascii")),
-                endpoint=msg[1].decode("ascii"),
-                dst_port=int(msg[2].decode("ascii")),
-                agent_metadata=msg[3],
-                dst_kv_ptrs=list(struct.unpack(f"{len(msg[4])//8}Q", msg[4])),
-                dst_kv_indices=np.frombuffer(msg[5], dtype=np.int64),
-                dst_aux_ptrs=list(struct.unpack(f"{len(msg[6])//8}Q", msg[6])),
-                dst_aux_index=int(msg[7].decode("ascii")),
-                dst_gpu_id=int(msg[8].decode("ascii")),
-            )
+        return cls(
+            room=int(msg[0].decode("ascii")),
+            endpoint=msg[1].decode("ascii"),
+            dst_port=int(msg[2].decode("ascii")),
+            agent_metadata=msg[3],
+            agent_name=msg[4].decode("ascii"),
+            dst_kv_ptrs=list(struct.unpack(f"{len(msg[5])//8}Q", msg[5])),
+            dst_kv_indices=np.frombuffer(msg[6], dtype=np.int64),
+            dst_aux_ptrs=list(struct.unpack(f"{len(msg[7])//8}Q", msg[7])),
+            dst_aux_index=int(msg[8].decode("ascii")),
+            dst_gpu_id=int(msg[9].decode("ascii")),
+            required_dst_info_num=int(msg[10].decode("ascii")),
+        )


 @dataclasses.dataclass
@@ -126,7 +89,7 @@ class TransferStatus:
         return self.num_kvs_expected == len(self.received_kvs) and self.received_aux


-class NixlKVManager(BaseKVManager):
+class NixlKVManager(CommonKVManager):
     def __init__(
         self,
         args: KVArgs,
@@ -134,6 +97,7 @@ class NixlKVManager(BaseKVManager):
         server_args: ServerArgs,
         is_mla_backend: Optional[bool] = False,
     ):
+        super().__init__(args, disaggregation_mode, server_args, is_mla_backend)
         try:
             from nixl._api import nixl_agent
         except ImportError as e:
@@ -143,38 +107,15 @@ class NixlKVManager(BaseKVManager):
                 "to run SGLang with NixlTransferEngine."
             ) from e
         self.agent = nixl_agent(str(uuid.uuid4()))
-        self.kv_args = args
-        self.disaggregation_mode = disaggregation_mode
-        # for p/d multi node infer
-        self.bootstrap_port = server_args.disaggregation_bootstrap_port
-        self.dist_init_addr = server_args.dist_init_addr
-        self.tp_size = server_args.tp_size
-
-        self.tp_rank = args.engine_rank
-        self.enable_dp_attention = server_args.enable_dp_attention
-        if self.enable_dp_attention:
-            assert (
-                server_args.dp_size > 1
-            ), "If dp_attention is enabled, dp size must be greater than 1 in disaggregation mode."
-            self.dp_size = server_args.dp_size
-            self.tp_size_of_dp = server_args.tp_size // server_args.dp_size
-            self.attn_tp_rank = args.engine_rank % self.tp_size_of_dp
-            self.dp_rank = args.engine_rank // self.tp_size_of_dp
-
-        self.rank_port = None
         self.server_socket = zmq.Context().socket(zmq.PULL)
         self.register_buffer_to_engine()

-        self.rank_port = get_free_port()
         if self.disaggregation_mode == DisaggregationMode.PREFILL:
+            self.request_status = {}
             self.transfer_infos: Dict[int, TransferInfo] = {}
-            self.condition = threading.Condition()
-            self.peer_names: Dict[int, str] = {}
+            self.peer_names: Dict[str, str] = {}
             self._start_bootstrap_thread()
-            self._register_to_bootstrap()
         elif self.disaggregation_mode == DisaggregationMode.DECODE:
-            # bootstrap key -> (remote_engine_rank -> possible remote source info)
-            self.prefill_peer_infos: Dict[str, list[Dict[int, NixlEngineInfo]]] = {}
             self.transfer_statuses: Dict[int, TransferStatus] = defaultdict(
                 TransferStatus
             )
@@ -183,6 +124,18 @@ class NixlKVManager(BaseKVManager):
                 f"Unsupported DisaggregationMode: {self.disaggregation_mode}"
             )

+    def check_status(self, bootstrap_room: int):
+        return self.request_status[bootstrap_room]
+
+    def update_status(self, bootstrap_room: int, status: KVPoll):
+        if bootstrap_room not in self.request_status:
+            self.request_status[bootstrap_room] = status
+        else:
+            # NOTE: The prefill engine could recv bootstrapping first
+            self.request_status[bootstrap_room] = max(
+                self.request_status[bootstrap_room], status
+            )
+
     def register_buffer_to_engine(self):
         kv_addrs = []
         for kv_data_ptr, kv_data_len in zip(
@@ -203,16 +156,10 @@ class NixlKVManager(BaseKVManager):
         if not self.aux_descs:
             raise Exception("NIXL memory registration failed for aux tensors")

-    @cache
-    def _connect(self, endpoint: str):
-        socket = zmq.Context().socket(zmq.PUSH)
-        socket.connect(endpoint)
-        return socket
-
-    def _add_remote(self, room: int, agent_metadata: bytes):
-        if room not in self.peer_names:
-            self.peer_names[room] = self.agent.add_remote_agent(agent_metadata)
-        return self.peer_names[room]
+    def _add_remote(self, agent_name: str, agent_metadata: bytes):
+        if agent_name not in self.peer_names:
+            self.peer_names[agent_name] = self.agent.add_remote_agent(agent_metadata)
+        return self.peer_names[agent_name]

     def send_kvcache(
         self,
@@ -310,40 +257,38 @@ class NixlKVManager(BaseKVManager):
         assert self.disaggregation_mode == DisaggregationMode.PREFILL
         assert not is_last or (is_last and aux_index is not None)

-        # Wait for transfer info to be populated by bootstrap thread.
-        with self.condition:
-            self.condition.wait_for(lambda: bootstrap_room in self.transfer_infos)
-            req = self.transfer_infos[bootstrap_room]
-            assert bootstrap_room == req.room
+        reqs_to_be_processed = self.transfer_infos[bootstrap_room].values()
+        handles = []
+        for req in reqs_to_be_processed:
+            assert bootstrap_room == req.room
+            if req.is_dummy():
+                continue

-        if req.is_dummy():
-            return []
+            peer_name = self._add_remote(req.agent_name, req.agent_metadata)
+            chunked_dst_kv_indice = req.dst_kv_indices[index_slice]
+            assert len(chunked_dst_kv_indice) == len(kv_indices)

-        peer_name = self._add_remote(bootstrap_room, req.agent_metadata)
-        chunked_dst_kv_indice = req.dst_kv_indices[index_slice]
-        assert len(chunked_dst_kv_indice) == len(kv_indices)
-
-        notif = "_".join([str(req.room), "kv", str(chunk_id), str(int(is_last))])
-        kv_xfer_handle = self.send_kvcache(
-            peer_name,
-            kv_indices,
-            req.dst_kv_ptrs,
-            chunked_dst_kv_indice,
-            req.dst_gpu_id,
-            notif,
-        )
-        handles = [kv_xfer_handle]
-        # Only the last chunk we need to send the aux data.
-        if is_last:
-            assert aux_index is not None
-            aux_xfer_handle = self.send_aux(
+            notif = "_".join([str(req.room), "kv", str(chunk_id), str(int(is_last))])
+            kv_xfer_handle = self.send_kvcache(
                 peer_name,
-                aux_index,
-                req.dst_aux_ptrs,
-                req.dst_aux_index,
-                str(req.room) + "_aux",
+                kv_indices,
+                req.dst_kv_ptrs,
+                chunked_dst_kv_indice,
+                req.dst_gpu_id,
+                notif,
             )
-            handles.append(aux_xfer_handle)
+            handles.append(kv_xfer_handle)
+            # Only the last chunk we need to send the aux data.
+            if is_last:
+                assert aux_index is not None
+                aux_xfer_handle = self.send_aux(
+                    peer_name,
+                    aux_index,
+                    req.dst_aux_ptrs,
+                    req.dst_aux_index,
+                    str(req.room) + "_aux",
+                )
+                handles.append(aux_xfer_handle)
         return handles

     def update_transfer_status(self):
@@ -358,7 +303,7 @@ class NixlKVManager(BaseKVManager):
                 room = int(components[0])
                 if components[1] == "kv":
                     chunk_id = int(components[2])
-                    is_last = bool(components[3])
+                    is_last = bool(int(components[3]))
                     self.transfer_statuses[room].received_kvs.add(chunk_id)
                     if is_last:
                         self.transfer_statuses[room].num_kvs_expected = chunk_id + 1
@@ -370,34 +315,6 @@ class NixlKVManager(BaseKVManager):
             return False
         return self.transfer_statuses[room].is_done()

-    def _register_to_bootstrap(self):
-        """Register KVSender to bootstrap server via HTTP POST."""
-        if self.dist_init_addr:
-            ip_address = socket.gethostbyname(self.dist_init_addr.split(":")[0])
-        else:
-            ip_address = get_ip()
-
-        bootstrap_server_url = f"{ip_address}:{self.bootstrap_port}"
-        url = f"http://{bootstrap_server_url}/route"
-        payload = {
-            "role": "Prefill",
-            "rank_ip": get_local_ip_by_remote(),
-            "rank_port": self.rank_port,
-            "engine_rank": self.kv_args.engine_rank,
-            "agent_name": self.agent.name,
-        }
-
-        try:
-            response = requests.put(url, json=payload)
-            if response.status_code == 200:
-                logger.debug("Prefill successfully registered to bootstrap server.")
-            else:
-                logger.error(
-                    f"Prefill Failed to connect to bootstrap server: {response.status_code}, {response.text}"
-                )
-        except Exception as e:
-            logger.error(f"Prefill Failed to register to bootstrap server: {e}")
-
     def _start_bootstrap_thread(self):
         self.server_socket.bind(f"tcp://{get_local_ip_by_remote()}:{self.rank_port}")

@@ -413,12 +330,20 @@ class NixlKVManager(BaseKVManager):
                 ), f"First message should be {GUARD}. Foreign traffic?"
                 waiting_req_bytes = waiting_req_bytes[1:]
                 room = waiting_req_bytes[0].decode("ascii")
-                if room == "None":
-                    continue
+
+                required_dst_info_num = int(waiting_req_bytes[10].decode("ascii"))
                 room = int(room)
-                with self.condition:
-                    self.transfer_infos[room] = TransferInfo.from_zmq(waiting_req_bytes)
-                    self.condition.notify_all()
+                agent_name = waiting_req_bytes[4].decode("ascii")
+                if room not in self.transfer_infos:
+                    self.transfer_infos[room] = {}
+                self.transfer_infos[room][agent_name] = TransferInfo.from_zmq(
+                    waiting_req_bytes
+                )
+
+                logger.debug(f"got info {room=} {agent_name=} {required_dst_info_num=}")
+                if len(self.transfer_infos[room]) == required_dst_info_num:
+                    logger.debug(f"{room=} is bootstrapped")
+                    self.update_status(room, KVPoll.WaitingForInput)

         threading.Thread(target=bootstrap_thread).start()

@@ -433,6 +358,9 @@ class NixlKVSender(BaseKVSender):
         self.xfer_handles = []
         self.has_sent = False
         self.chunk_id = 0
+        self.kv_mgr.update_status(self.bootstrap_room, KVPoll.Bootstrapping)
+        # inner state
+        self.curr_idx = 0

     def init(self, num_kv_indices: int, aux_index: Optional[int] = None):
         self.num_kv_indices = num_kv_indices
@@ -441,9 +369,11 @@ class NixlKVSender(BaseKVSender):
     def send(
         self,
         kv_indices: npt.NDArray[np.int64],
-        index_slice: slice,
-        is_last: bool,
     ):
+        index_slice = slice(self.curr_idx, self.curr_idx + len(kv_indices))
+        self.curr_idx += len(kv_indices)
+        is_last = self.curr_idx == self.num_kv_indices
+
         new_xfer_handles = self.kv_mgr.add_transfer_request(
             self.bootstrap_room,
             kv_indices,
@@ -459,7 +389,7 @@ class NixlKVSender(BaseKVSender):

     def poll(self) -> KVPoll:
         if not self.has_sent:
-            return KVPoll.WaitingForInput  # type: ignore
+            return self.kv_mgr.check_status(self.bootstrap_room)
         states = [self.kv_mgr.agent.check_xfer_state(x) for x in self.xfer_handles]
         if all([x == "DONE" for x in states]):
             return KVPoll.Success  # type: ignore
@@ -471,128 +401,28 @@ class NixlKVSender(BaseKVSender):
         raise Exception("Fake KVSender Exception")


-class NixlKVReceiver(BaseKVReceiver):
-
+class NixlKVReceiver(CommonKVReceiver):
     def __init__(
         self,
         mgr: NixlKVManager,
         bootstrap_addr: str,
         bootstrap_room: Optional[int] = None,
+        data_parallel_rank: Optional[int] = None,
     ):
-        self.bootstrap_room = bootstrap_room
-        self.bootstrap_addr = bootstrap_addr
-        self.kv_mgr = mgr
         self.started_transfer = False
-
-        # NOTE: key distinguished by bootstrap_addr and engine_rank
-        bootstrap_key = f"{self.bootstrap_addr}_{self.kv_mgr.kv_args.engine_rank}"
-
-        if bootstrap_key not in self.kv_mgr.prefill_peer_infos:
-            self.bootstrap_info = self._get_bootstrap_info_from_server(
-                self.kv_mgr.kv_args.engine_rank
-            )
-            if self.bootstrap_info is None:
-                logger.error(
-                    f"Could not fetch bootstrap info for engine rank: {self.kv_mgr.kv_args.engine_rank}"
-                )
-            else:
-                self.kv_mgr.prefill_peer_infos[bootstrap_key] = self.bootstrap_info
-        else:
-            self.bootstrap_info = self.kv_mgr.prefill_peer_infos[bootstrap_key]
-        assert self.bootstrap_info is not None
-
-    # return a list of remotes in a dict, [(remote_engine_rank -> NixlEngineInfo), ...]
-    # In each dict, there are multiple possible remotes named "equal sources".
-    # We only need to select one to split the traffic. i.e. we totally select len(list) remotes.
-    def _get_bootstrap_info_from_server(
-        self, engine_rank
-    ) -> Optional[List[Dict[int, NixlEngineInfo]]]:
-        """Fetch the bootstrap info from the bootstrap server."""
-        try:
-            if self.kv_mgr.enable_dp_attention:
-                url = f"http://{self.bootstrap_addr}/route"
-                response = requests.get(url)
-                if response.status_code != 200:
-                    logger.error(
-                        f"Failed to get prefill server info: {response.status_code}, {response.text}"
-                    )
-                    return None
-
-                bootstrap_info = response.json()
-                assert isinstance(bootstrap_info, dict)
-                bootstrap_info = {int(k): v for k, v in bootstrap_info.items()}
-
-                # split out who need to send to this rank.
-                # currently for dpsk mla model, those ranks share the same latent cache.
-                # pick one as the real source
-
-                prefill_tp_size = len(bootstrap_info.keys())
-
-                assert (
-                    prefill_tp_size >= self.kv_mgr.tp_size_of_dp
-                ), f"Only support Prefill TP size >= Decode TP size of DP, now we have {prefill_tp_size} vs {self.kv_mgr.tp_size_of_dp}"
-
-                num_remote_tp_rank_we_managed = (
-                    prefill_tp_size // self.kv_mgr.tp_size_of_dp
-                )
-
-                # We handle [num * self.attn_tp_rank, num * self.attn_tp_rank + num)
-                remote_tp_ranks = list(range(0, prefill_tp_size))
-                # split it into tp_size_of_dp parts and get our part
-                remote_tp_ranks_grouped = [
-                    remote_tp_ranks[i : i + num_remote_tp_rank_we_managed]
-                    for i in range(0, prefill_tp_size, self.kv_mgr.tp_size_of_dp)
-                ]
-                managed_ranks = remote_tp_ranks_grouped[self.kv_mgr.attn_tp_rank]
-
-                assert len(managed_ranks) == num_remote_tp_rank_we_managed
-
-                logger.debug(
-                    f"Rank {self.kv_mgr.kv_args.engine_rank} source can be {managed_ranks}"
-                )
-
-                return [
-                    {
-                        rk: bootstrap_info[rk]
-                        for rk in bootstrap_info.keys()
-                        if rk in managed_ranks
-                    }
-                ]
-            else:
-                url = f"http://{self.bootstrap_addr}/route?engine_rank={engine_rank}"
-                response = requests.get(url)
-                if response.status_code == 200:
-                    bootstrap_info = response.json()
-                    return [{engine_rank: bootstrap_info}]
-                else:
-                    logger.error(
-                        f"Failed to get prefill server info: {response.status_code}, {response.text}"
-                    )
-                    return None
-        except Exception as e:
-            logger.error(f"Error fetching prefill info from bootstrap: {e}")
-            return None
-
-    @cache
-    def _connect(self, endpoint: str):
-        socket = zmq.Context().socket(zmq.PUSH)
-        socket.connect(endpoint)
-        return socket
+        super().__init__(mgr, bootstrap_addr, bootstrap_room, data_parallel_rank)

     def init(self, kv_indices: npt.NDArray[np.int64], aux_index: Optional[int] = None):
-
-        assert self.bootstrap_info is not None
-        assert self.bootstrap_room is not None
-
-        for equal_sources in self.bootstrap_info:
-            remote_rank = list(equal_sources.keys())[
-                self.bootstrap_room % len(equal_sources)
-            ]
-            self.prefill_server_url = f"{equal_sources[remote_rank]['rank_ip']}:{equal_sources[remote_rank]['rank_port']}"
+        for bootstrap_info in self.bootstrap_infos:
+            self.prefill_server_url = (
+                f"{bootstrap_info['rank_ip']}:{bootstrap_info['rank_port']}"
+            )
             logger.debug(
-                f"Fetched bootstrap info for engine rank: {self.kv_mgr.kv_args.engine_rank}, source: {remote_rank}, all: {list(equal_sources.keys())}"
+                f"Fetched bootstrap info: {bootstrap_info} for engine rank: {self.kv_mgr.kv_args.engine_rank}"
             )
+            is_dummy = bootstrap_info["is_dummy"]

+            # TODO: send_kv_args earlier
             packed_kv_data_ptrs = b"".join(
                 struct.pack("Q", ptr) for ptr in self.kv_mgr.kv_args.kv_data_ptrs
             )
@@ -603,30 +433,22 @@ class NixlKVReceiver(BaseKVReceiver):
             logger.debug(
                 f"Sending to {self.prefill_server_url} with bootstrap room {self.bootstrap_room}"
             )
-            self._connect("tcp://" + self.prefill_server_url).send_multipart(
-                [
-                    GUARD,
-                    str(self.bootstrap_room).encode("ascii"),
-                    get_local_ip_by_remote().encode("ascii"),
-                    str(self.kv_mgr.rank_port).encode("ascii"),
-                    self.kv_mgr.agent.get_agent_metadata(),
-                    packed_kv_data_ptrs,
-                    kv_indices.tobytes(),
-                    packed_aux_data_ptrs,
-                    str(aux_index).encode("ascii"),
-                    str(self.kv_mgr.kv_args.gpu_id).encode("ascii"),
-                ]
-            )
-
-            for dummy_rank in equal_sources.keys():
-                if dummy_rank == remote_rank:
-                    continue
-                dummy_info = equal_sources[dummy_rank]
-                dummy_url = f"{dummy_info['rank_ip']}:{dummy_info['rank_port']}"
-                self._connect("tcp://" + dummy_url).send_multipart(
+            sock, lock = self._connect("tcp://" + self.prefill_server_url)
+            with lock:
+                sock.send_multipart(
                     [
                         GUARD,
                         str(self.bootstrap_room).encode("ascii"),
+                        get_local_ip_by_remote().encode("ascii"),
+                        str(self.kv_mgr.rank_port).encode("ascii"),
+                        self.kv_mgr.agent.get_agent_metadata(),
+                        self.kv_mgr.agent.name.encode("ascii"),
+                        packed_kv_data_ptrs,
+                        kv_indices.tobytes() if not is_dummy else b"",
+                        packed_aux_data_ptrs,
+                        str(aux_index).encode("ascii"),
+                        str(self.kv_mgr.kv_args.gpu_id).encode("ascii"),
+                        str(self.required_dst_info_num).encode("ascii"),
                     ]
                 )

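Read together with the manager-side hunk earlier in this file, the multipart message above defines the decode -> prefill handshake layout. The field names below are descriptive labels added here for illustration (they are not defined in the package); the indices are those seen in bootstrap_thread() once the leading GUARD frame is stripped:

# Frame order of the ZMQ handshake sent by NixlKVReceiver.init() above.
# Indices refer to waiting_req_bytes on the prefill side after GUARD is dropped.
HANDSHAKE_FRAMES = [
    "bootstrap_room",         # [0]
    "decode_ip",              # [1]
    "decode_rank_port",       # [2]
    "nixl_agent_metadata",    # [3]
    "nixl_agent_name",        # [4]  keys transfer_infos[room]
    "packed_kv_data_ptrs",    # [5]
    "kv_indices_bytes",       # [6]  empty (b"") for dummy receivers
    "packed_aux_data_ptrs",   # [7]
    "aux_index",              # [8]
    "gpu_id",                 # [9]
    "required_dst_info_num",  # [10] bootstrap completion threshold
]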
@@ -642,152 +464,12 @@ class NixlKVReceiver(BaseKVReceiver):
             return KVPoll.Success  # type: ignore
         return KVPoll.WaitingForInput  # type: ignore

+    def _register_kv_args(self):
+        pass
+
     def failure_exception(self):
         raise Exception("Fake KVReceiver Exception")


-class NixlKVBootstrapServer(BaseKVBootstrapServer):
-    def __init__(self, port: int):
-        logger.debug(f"NixlKVBootstrapServer started on port {port}")
-        self.port = port
-        self.app = web.Application()
-        self.store = dict()
-        self.lock = asyncio.Lock()
-        self._setup_routes()
-        self.prefill_port_table: Dict[int, Dict[str, Union[str, int]]] = {}
-
-        # Start bootstrap server
-        self.thread = threading.Thread(target=self._run_server, daemon=True)
-        self.run()
-
-    def run(self):
-        self.thread.start()
-
-    def _setup_routes(self):
-        self.app.router.add_route("*", "/metadata", self._handle_metadata)
-        self.app.router.add_route("*", "/route", self._handle_route)
-
-    async def _handle_metadata(self, request: web.Request):
-        key = request.query.get("key", "")
-
-        if request.method == "GET":
-            return await self._handle_metadata_get(key)
-        elif request.method == "PUT":
-            return await self._handle_metadata_put(key, request)
-        elif request.method == "DELETE":
-            return await self._handle_metadata_delete(key)
-        return web.Response(
-            text="Method not allowed", status=405, content_type="application/json"
-        )
-
-    async def _handle_metadata_get(self, key):
-        async with self.lock:
-            value = self.store.get(key)
-        if value is None:
-            return web.Response(
-                text="metadata not found", status=404, content_type="application/json"
-            )
-        return web.Response(body=value, status=200, content_type="application/json")
-
-    async def _handle_metadata_put(self, key, request):
-        data = await request.read()
-        async with self.lock:
-            self.store[key] = data
-        return web.Response(
-            text="metadata updated", status=200, content_type="application/json"
-        )
-
-    async def _handle_metadata_delete(self, key):
-        async with self.lock:
-            if key not in self.store:
-                return web.Response(
-                    text="metadata not found",
-                    status=404,
-                    content_type="application/json",
-                )
-            del self.store[key]
-        return web.Response(
-            text="metadata deleted", status=200, content_type="application/json"
-        )
-
-    async def _handle_route(self, request: web.Request):
-        method = request.method
-        if method == "PUT":
-            return await self._handle_route_put(request)
-        elif method == "GET":
-            return await self._handle_route_get(request)
-        else:
-            return web.Response(
-                text="Method not allowed", status=405, content_type="application/json"
-            )
-
-    async def _handle_route_put(self, request: web.Request):
-        data = await request.json()
-        role = data["role"]
-        rank_ip = data["rank_ip"]
-        rank_port = int(data["rank_port"])
-        engine_rank = int(data["engine_rank"])
-        agent_name = data["agent_name"]
-
-        if role == "Prefill":
-            async with self.lock:
-                self.prefill_port_table[engine_rank] = {
-                    "rank_ip": rank_ip,
-                    "rank_port": rank_port,
-                    "agent_name": agent_name,
-                }
-            logger.info(
-                f"Registered Prefill boostrap: {engine_rank} with rank_ip: {rank_ip} and rank_port: {rank_port} and name: {agent_name}"
-            )
-
-        return web.Response(text="OK", status=200)
-
-    async def _handle_route_get(self, request: web.Request):
-        engine_rank = request.query.get("engine_rank")
-        if not engine_rank:
-            logger.debug(
-                f"No engine_rank specified, return all {len(self.prefill_port_table)} engine infos as a dict"
-            )
-            # Return a dict of all engine_rank
-            async with self.lock:
-                bootstrap_info = self.prefill_port_table
-            return web.json_response(bootstrap_info, status=200)
-
-        # Find corresponding prefill info
-        async with self.lock:
-            bootstrap_info = self.prefill_port_table.get(int(engine_rank))
-        if bootstrap_info is not None:
-            return web.json_response(bootstrap_info, status=200)
-        else:
-            return web.Response(text="Not Found", status=404)
-
-    def _run_server(self):
-        try:
-            # Event Loop
-            self._loop = asyncio.new_event_loop()
-            asyncio.set_event_loop(self._loop)
-
-            self._runner = web.AppRunner(self.app)
-            self._loop.run_until_complete(self._runner.setup())
-
-            site = web.TCPSite(self._runner, port=self.port)
-            self._loop.run_until_complete(site.start())
-            self._loop.run_forever()
-        except Exception as e:
-            logger.error(f"Server error: {str(e)}")
-        finally:
-            # Cleanup
-            self._loop.run_until_complete(self._runner.cleanup())
-            self._loop.close()
-
-    def close(self):
-        """Shutdown"""
-        if self._loop is not None and self._loop.is_running():
-            self._loop.call_soon_threadsafe(self._loop.stop)
-            logger.info("Stopping server loop...")
-
-        if self.thread.is_alive():
-            self.thread.join(timeout=2)
-            logger.info("Server thread stopped")
-
-    def poll(self) -> KVPoll: ...
+class NixlKVBootstrapServer(CommonKVBootstrapServer):
+    pass