sglang 0.4.6.post5__py3-none-any.whl → 0.4.7.post1__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (359)
  1. sglang/__init__.py +2 -0
  2. sglang/api.py +7 -0
  3. sglang/bench_offline_throughput.py +10 -4
  4. sglang/bench_one_batch_server.py +67 -11
  5. sglang/bench_serving.py +86 -75
  6. sglang/lang/backend/runtime_endpoint.py +24 -1
  7. sglang/lang/interpreter.py +40 -1
  8. sglang/lang/ir.py +27 -0
  9. sglang/math_utils.py +8 -0
  10. sglang/profiler.py +167 -0
  11. sglang/srt/_custom_ops.py +34 -0
  12. sglang/srt/configs/internvl.py +8 -12
  13. sglang/srt/configs/model_config.py +33 -1
  14. sglang/srt/constrained/base_grammar_backend.py +5 -2
  15. sglang/srt/constrained/llguidance_backend.py +9 -8
  16. sglang/srt/constrained/outlines_backend.py +5 -4
  17. sglang/srt/constrained/xgrammar_backend.py +18 -18
  18. sglang/srt/conversation.py +52 -8
  19. sglang/srt/custom_op.py +38 -3
  20. sglang/srt/debug_utils.py +74 -0
  21. sglang/srt/disaggregation/base/__init__.py +1 -1
  22. sglang/srt/disaggregation/base/conn.py +25 -11
  23. sglang/srt/disaggregation/common/__init__.py +5 -0
  24. sglang/srt/disaggregation/common/conn.py +407 -0
  25. sglang/srt/disaggregation/common/utils.py +42 -0
  26. sglang/srt/disaggregation/decode.py +261 -52
  27. sglang/srt/disaggregation/fake/__init__.py +1 -1
  28. sglang/srt/disaggregation/fake/conn.py +16 -9
  29. sglang/srt/disaggregation/kv_events.py +60 -5
  30. sglang/srt/disaggregation/launch_lb.py +140 -0
  31. sglang/srt/disaggregation/mini_lb.py +29 -48
  32. sglang/srt/disaggregation/mooncake/__init__.py +1 -1
  33. sglang/srt/disaggregation/mooncake/conn.py +446 -149
  34. sglang/srt/disaggregation/mooncake/transfer_engine.py +32 -16
  35. sglang/srt/disaggregation/nixl/__init__.py +6 -1
  36. sglang/srt/disaggregation/nixl/conn.py +134 -437
  37. sglang/srt/disaggregation/prefill.py +130 -43
  38. sglang/srt/disaggregation/utils.py +127 -86
  39. sglang/srt/distributed/device_communicators/pymscclpp.py +315 -0
  40. sglang/srt/distributed/parallel_state.py +52 -5
  41. sglang/srt/entrypoints/EngineBase.py +6 -0
  42. sglang/srt/entrypoints/engine.py +116 -5
  43. sglang/srt/entrypoints/http_server.py +28 -4
  44. sglang/srt/eplb_simulator/__init__.py +1 -0
  45. sglang/srt/eplb_simulator/reader.py +51 -0
  46. sglang/srt/function_call/base_format_detector.py +138 -86
  47. sglang/srt/function_call/deepseekv3_detector.py +54 -6
  48. sglang/srt/function_call/ebnf_composer.py +33 -19
  49. sglang/srt/function_call/function_call_parser.py +27 -0
  50. sglang/srt/function_call/llama32_detector.py +33 -14
  51. sglang/srt/function_call/mistral_detector.py +73 -26
  52. sglang/srt/function_call/pythonic_detector.py +86 -20
  53. sglang/srt/function_call/qwen25_detector.py +64 -10
  54. sglang/srt/function_call/utils.py +17 -0
  55. sglang/srt/hf_transformers_utils.py +4 -0
  56. sglang/srt/layers/activation.py +19 -0
  57. sglang/srt/layers/attention/aiter_backend.py +503 -125
  58. sglang/srt/layers/attention/base_attn_backend.py +4 -0
  59. sglang/srt/layers/attention/cutlass_mla_backend.py +40 -34
  60. sglang/srt/layers/attention/flashattention_backend.py +137 -63
  61. sglang/srt/layers/attention/flashinfer_backend.py +46 -3
  62. sglang/srt/layers/attention/flashinfer_mla_backend.py +59 -25
  63. sglang/srt/layers/attention/flashmla_backend.py +2 -10
  64. sglang/srt/layers/attention/intel_amx_backend.py +128 -0
  65. sglang/srt/layers/attention/tbo_backend.py +232 -0
  66. sglang/srt/layers/attention/torch_native_backend.py +3 -0
  67. sglang/srt/layers/attention/triton_backend.py +304 -65
  68. sglang/srt/layers/attention/triton_ops/decode_attention.py +2 -7
  69. sglang/srt/layers/attention/triton_ops/extend_attention.py +12 -4
  70. sglang/srt/layers/attention/vision.py +51 -24
  71. sglang/srt/layers/communicator.py +281 -197
  72. sglang/srt/layers/dp_attention.py +6 -5
  73. sglang/srt/layers/layernorm.py +30 -19
  74. sglang/srt/layers/linear.py +0 -4
  75. sglang/srt/layers/logits_processor.py +0 -12
  76. sglang/srt/layers/moe/cutlass_moe.py +170 -7
  77. sglang/srt/layers/moe/cutlass_moe_params.py +169 -0
  78. sglang/srt/layers/moe/ep_moe/kernels.py +33 -11
  79. sglang/srt/layers/moe/ep_moe/layer.py +136 -72
  80. sglang/srt/layers/moe/ep_moe/token_dispatcher.py +24 -45
  81. sglang/srt/layers/moe/fused_moe_native.py +4 -0
  82. sglang/srt/layers/moe/fused_moe_triton/configs/triton_3_2_0/E=257,N=128,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8,block_shape=[128, 128].json +146 -0
  83. sglang/srt/layers/moe/fused_moe_triton/configs/triton_3_2_0/E=257,N=128,device_name=NVIDIA_H20,dtype=fp8_w8a8,block_shape=[128, 128].json +146 -0
  84. sglang/srt/layers/moe/fused_moe_triton/configs/triton_3_2_0/E=257,N=256,device_name=NVIDIA_B200,dtype=fp8_w8a8,block_shape=[128, 128].json +146 -0
  85. sglang/srt/layers/moe/fused_moe_triton/configs/triton_3_2_0/E=257,N=256,device_name=NVIDIA_H20,dtype=fp8_w8a8,block_shape=[128, 128].json +146 -0
  86. sglang/srt/layers/moe/fused_moe_triton/configs/triton_3_2_0/E=257,N=256,device_name=NVIDIA_H200,dtype=fp8_w8a8,block_shape=[128, 128].json +146 -0
  87. sglang/srt/layers/moe/fused_moe_triton/configs/triton_3_2_0/E=8,N=7168,device_name=NVIDIA_H100_80GB_HBM3.json +146 -0
  88. sglang/srt/layers/moe/fused_moe_triton/configs/triton_3_3_1/E=257,N=256,device_name=NVIDIA_B200,dtype=fp8_w8a8,block_shape=[128, 128].json +146 -0
  89. sglang/srt/layers/moe/fused_moe_triton/configs/triton_3_3_1/E=8,N=7168,device_name=NVIDIA_H100_80GB_HBM3.json +146 -0
  90. sglang/srt/layers/moe/fused_moe_triton/fused_moe.py +221 -29
  91. sglang/srt/layers/moe/fused_moe_triton/layer.py +34 -4
  92. sglang/srt/layers/moe/topk.py +60 -26
  93. sglang/srt/layers/multimodal.py +3 -3
  94. sglang/srt/layers/pooler.py +56 -0
  95. sglang/srt/layers/quantization/__init__.py +3 -2
  96. sglang/srt/layers/quantization/blockwise_int8.py +3 -0
  97. sglang/srt/layers/quantization/compressed_tensors/compressed_tensors_moe.py +5 -0
  98. sglang/srt/layers/quantization/deep_gemm_wrapper/__init__.py +1 -0
  99. sglang/srt/layers/quantization/{deep_gemm.py → deep_gemm_wrapper/compile_utils.py} +69 -127
  100. sglang/srt/layers/quantization/deep_gemm_wrapper/configurer.py +32 -0
  101. sglang/srt/layers/quantization/deep_gemm_wrapper/entrypoint.py +110 -0
  102. sglang/srt/layers/quantization/fp8.py +28 -23
  103. sglang/srt/layers/quantization/fp8_kernel.py +156 -75
  104. sglang/srt/layers/quantization/fp8_utils.py +250 -69
  105. sglang/srt/layers/quantization/modelopt_quant.py +334 -7
  106. sglang/srt/layers/quantization/moe_wna16.py +3 -0
  107. sglang/srt/layers/quantization/w8a8_fp8.py +3 -0
  108. sglang/srt/layers/quantization/w8a8_int8.py +3 -0
  109. sglang/srt/layers/radix_attention.py +2 -3
  110. sglang/srt/layers/rotary_embedding.py +6 -12
  111. sglang/srt/layers/sampler.py +80 -79
  112. sglang/srt/layers/utils.py +6 -0
  113. sglang/srt/lora/layers.py +12 -15
  114. sglang/srt/lora/lora.py +49 -5
  115. sglang/srt/lora/lora_manager.py +98 -39
  116. sglang/srt/lora/mem_pool.py +28 -21
  117. sglang/srt/lora/utils.py +17 -13
  118. sglang/srt/managers/cache_controller.py +2 -1
  119. sglang/srt/managers/data_parallel_controller.py +13 -5
  120. sglang/srt/managers/eplb_algorithms/__init__.py +63 -0
  121. sglang/srt/managers/eplb_algorithms/deepseek.py +223 -0
  122. sglang/srt/managers/{deepseek_eplb.py → eplb_algorithms/deepseek_vec.py} +5 -7
  123. sglang/srt/managers/eplb_manager.py +55 -14
  124. sglang/srt/managers/expert_distribution.py +220 -46
  125. sglang/srt/managers/expert_location.py +110 -56
  126. sglang/srt/managers/expert_location_dispatch.py +23 -6
  127. sglang/srt/managers/io_struct.py +43 -8
  128. sglang/srt/managers/mm_utils.py +88 -38
  129. sglang/srt/managers/multimodal_processors/base_processor.py +190 -18
  130. sglang/srt/managers/multimodal_processors/gemma3.py +4 -31
  131. sglang/srt/managers/multimodal_processors/internvl.py +4 -0
  132. sglang/srt/managers/multimodal_processors/kimi_vl.py +15 -34
  133. sglang/srt/managers/multimodal_processors/minicpm.py +2 -1
  134. sglang/srt/managers/multimodal_processors/phi4mm.py +87 -0
  135. sglang/srt/managers/multimodal_processors/qwen_vl.py +22 -64
  136. sglang/srt/managers/multimodal_processors/vila.py +85 -0
  137. sglang/srt/managers/schedule_batch.py +173 -38
  138. sglang/srt/managers/scheduler.py +376 -127
  139. sglang/srt/managers/tokenizer_manager.py +163 -19
  140. sglang/srt/managers/utils.py +0 -4
  141. sglang/srt/mem_cache/chunk_cache.py +1 -0
  142. sglang/srt/mem_cache/hiradix_cache.py +4 -2
  143. sglang/srt/mem_cache/memory_pool.py +111 -407
  144. sglang/srt/mem_cache/memory_pool_host.py +380 -0
  145. sglang/srt/mem_cache/radix_cache.py +36 -12
  146. sglang/srt/metrics/collector.py +9 -0
  147. sglang/srt/model_executor/cuda_graph_runner.py +191 -113
  148. sglang/srt/model_executor/expert_location_updater.py +157 -22
  149. sglang/srt/model_executor/forward_batch_info.py +52 -22
  150. sglang/srt/model_executor/model_runner.py +102 -62
  151. sglang/srt/model_loader/loader.py +8 -1
  152. sglang/srt/model_loader/utils.py +67 -1
  153. sglang/srt/models/bert.py +113 -13
  154. sglang/srt/models/deepseek_nextn.py +1 -1
  155. sglang/srt/models/deepseek_v2.py +623 -290
  156. sglang/srt/models/gemma3_causal.py +7 -0
  157. sglang/srt/models/gemma3_mm.py +19 -14
  158. sglang/srt/models/idefics2.py +342 -0
  159. sglang/srt/models/internvl.py +46 -102
  160. sglang/srt/models/kimi_vl.py +4 -4
  161. sglang/srt/models/llama.py +1 -1
  162. sglang/srt/models/minicpmo.py +2 -5
  163. sglang/srt/models/minicpmv.py +3 -295
  164. sglang/srt/models/phi4mm.py +512 -0
  165. sglang/srt/models/qwen2.py +38 -9
  166. sglang/srt/models/qwen2_5_vl.py +3 -9
  167. sglang/srt/models/qwen2_eagle.py +4 -1
  168. sglang/srt/models/qwen2_moe.py +58 -191
  169. sglang/srt/models/qwen2_vl.py +3 -9
  170. sglang/srt/models/qwen3.py +41 -10
  171. sglang/srt/models/qwen3_moe.py +230 -191
  172. sglang/srt/models/registry.py +9 -1
  173. sglang/srt/models/roberta.py +117 -9
  174. sglang/srt/models/transformers.py +291 -0
  175. sglang/srt/models/vila.py +305 -0
  176. sglang/srt/openai_api/adapter.py +248 -28
  177. sglang/srt/openai_api/protocol.py +68 -3
  178. sglang/srt/openai_api/utils.py +172 -0
  179. sglang/srt/operations.py +37 -2
  180. sglang/srt/operations_strategy.py +200 -24
  181. sglang/srt/sampling/sampling_batch_info.py +37 -1
  182. sglang/srt/sampling/sampling_params.py +4 -1
  183. sglang/srt/server_args.py +381 -209
  184. sglang/srt/speculative/build_eagle_tree.py +9 -9
  185. sglang/srt/speculative/eagle_draft_cuda_graph_runner.py +12 -14
  186. sglang/srt/speculative/eagle_draft_extend_cuda_graph_runner.py +256 -0
  187. sglang/srt/speculative/eagle_utils.py +440 -200
  188. sglang/srt/speculative/eagle_worker.py +234 -63
  189. sglang/srt/two_batch_overlap.py +637 -0
  190. sglang/srt/utils.py +187 -7
  191. sglang/test/attention/test_prefix_chunk_info.py +2 -0
  192. sglang/test/runners.py +54 -10
  193. sglang/test/send_one.py +4 -0
  194. sglang/test/test_block_fp8.py +1 -0
  195. sglang/test/test_block_fp8_deep_gemm_blackwell.py +252 -0
  196. sglang/test/test_block_fp8_ep.py +1 -0
  197. sglang/test/test_cutlass_moe.py +3 -3
  198. sglang/test/test_fp4_moe.py +248 -0
  199. sglang/test/test_utils.py +82 -7
  200. sglang/utils.py +9 -0
  201. sglang/version.py +1 -1
  202. {sglang-0.4.6.post5.dist-info → sglang-0.4.7.post1.dist-info}/METADATA +17 -14
  203. {sglang-0.4.6.post5.dist-info → sglang-0.4.7.post1.dist-info}/RECORD +359 -321
  204. {sglang-0.4.6.post5.dist-info → sglang-0.4.7.post1.dist-info}/WHEEL +1 -1
  205. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=1,N=14336,device_name=NVIDIA_A100-SXM4-80GB,dtype=int8_w8a16.json → triton_3_1_0/E=1,N=14336,device_name=NVIDIA_A100-SXM4-80GB,dtype=int8_w8a16.json} +0 -0
  206. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=1,N=14336,device_name=NVIDIA_A100-SXM4-80GB.json → triton_3_1_0/E=1,N=14336,device_name=NVIDIA_A100-SXM4-80GB.json} +0 -0
  207. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=1,N=1792,device_name=NVIDIA_A100-SXM4-80GB,dtype=int8_w8a16.json → triton_3_1_0/E=1,N=1792,device_name=NVIDIA_A100-SXM4-80GB,dtype=int8_w8a16.json} +0 -0
  208. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=1,N=1792,device_name=NVIDIA_A100-SXM4-80GB.json → triton_3_1_0/E=1,N=1792,device_name=NVIDIA_A100-SXM4-80GB.json} +0 -0
  209. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=1,N=3072,device_name=NVIDIA_A100-SXM4-80GB,dtype=int8_w8a16.json → triton_3_1_0/E=1,N=3072,device_name=NVIDIA_A100-SXM4-80GB,dtype=int8_w8a16.json} +0 -0
  210. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=1,N=3072,device_name=NVIDIA_H100_80GB_HBM3,dtype=int8_w8a16.json → triton_3_1_0/E=1,N=3072,device_name=NVIDIA_H100_80GB_HBM3,dtype=int8_w8a16.json} +0 -0
  211. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=1,N=3072,device_name=NVIDIA_H100_80GB_HBM3.json → triton_3_1_0/E=1,N=3072,device_name=NVIDIA_H100_80GB_HBM3.json} +0 -0
  212. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=1,N=3584,device_name=NVIDIA_A100-SXM4-80GB,dtype=int8_w8a16.json → triton_3_1_0/E=1,N=3584,device_name=NVIDIA_A100-SXM4-80GB,dtype=int8_w8a16.json} +0 -0
  213. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=1,N=3584,device_name=NVIDIA_A100-SXM4-80GB.json → triton_3_1_0/E=1,N=3584,device_name=NVIDIA_A100-SXM4-80GB.json} +0 -0
  214. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=1,N=7168,device_name=NVIDIA_A100-SXM4-80GB,dtype=int8_w8a16.json → triton_3_1_0/E=1,N=7168,device_name=NVIDIA_A100-SXM4-80GB,dtype=int8_w8a16.json} +0 -0
  215. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=1,N=7168,device_name=NVIDIA_A100-SXM4-80GB.json → triton_3_1_0/E=1,N=7168,device_name=NVIDIA_A100-SXM4-80GB.json} +0 -0
  216. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=144,N=512,device_name=NVIDIA_H100_80GB_HBM3.json → triton_3_1_0/E=144,N=512,device_name=NVIDIA_H100_80GB_HBM3.json} +0 -0
  217. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=16,N=1024,device_name=NVIDIA_H100_80GB_HBM3.json → triton_3_1_0/E=16,N=1024,device_name=NVIDIA_H100_80GB_HBM3.json} +0 -0
  218. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=16,N=1024,device_name=NVIDIA_H200.json → triton_3_1_0/E=16,N=1024,device_name=NVIDIA_H200.json} +0 -0
  219. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=16,N=1344,device_name=NVIDIA_A100-SXM4-40GB.json → triton_3_1_0/E=16,N=1344,device_name=NVIDIA_A100-SXM4-40GB.json} +0 -0
  220. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=16,N=1344,device_name=NVIDIA_A100-SXM4-80GB.json → triton_3_1_0/E=16,N=1344,device_name=NVIDIA_A100-SXM4-80GB.json} +0 -0
  221. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=16,N=1344,device_name=NVIDIA_H100_80GB_HBM3.json → triton_3_1_0/E=16,N=1344,device_name=NVIDIA_H100_80GB_HBM3.json} +0 -0
  222. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=16,N=14336,device_name=NVIDIA_A100-SXM4-80GB,dtype=int8_w8a16.json → triton_3_1_0/E=16,N=14336,device_name=NVIDIA_A100-SXM4-80GB,dtype=int8_w8a16.json} +0 -0
  223. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=16,N=14336,device_name=NVIDIA_A100-SXM4-80GB.json → triton_3_1_0/E=16,N=14336,device_name=NVIDIA_A100-SXM4-80GB.json} +0 -0
  224. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=16,N=1792,device_name=NVIDIA_A100-SXM4-80GB,dtype=int8_w8a16.json → triton_3_1_0/E=16,N=1792,device_name=NVIDIA_A100-SXM4-80GB,dtype=int8_w8a16.json} +0 -0
  225. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=16,N=1792,device_name=NVIDIA_A100-SXM4-80GB.json → triton_3_1_0/E=16,N=1792,device_name=NVIDIA_A100-SXM4-80GB.json} +0 -0
  226. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=16,N=2048,device_name=NVIDIA_H100_80GB_HBM3.json → triton_3_1_0/E=16,N=2048,device_name=NVIDIA_H100_80GB_HBM3.json} +0 -0
  227. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=16,N=2688,device_name=NVIDIA_A100-SXM4-80GB.json → triton_3_1_0/E=16,N=2688,device_name=NVIDIA_A100-SXM4-80GB.json} +0 -0
  228. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=16,N=2688,device_name=NVIDIA_H100_80GB_HBM3.json → triton_3_1_0/E=16,N=2688,device_name=NVIDIA_H100_80GB_HBM3.json} +0 -0
  229. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=16,N=3072,device_name=NVIDIA_A100-SXM4-80GB,dtype=int8_w8a16.json → triton_3_1_0/E=16,N=3072,device_name=NVIDIA_A100-SXM4-80GB,dtype=int8_w8a16.json} +0 -0
  230. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=16,N=3072,device_name=NVIDIA_H100_80GB_HBM3,dtype=int8_w8a16.json → triton_3_1_0/E=16,N=3072,device_name=NVIDIA_H100_80GB_HBM3,dtype=int8_w8a16.json} +0 -0
  231. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=16,N=3200,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8.json → triton_3_1_0/E=16,N=3200,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8.json} +0 -0
  232. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=16,N=3584,device_name=NVIDIA_A100-SXM4-80GB,dtype=int8_w8a16.json → triton_3_1_0/E=16,N=3584,device_name=NVIDIA_A100-SXM4-80GB,dtype=int8_w8a16.json} +0 -0
  233. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=16,N=3584,device_name=NVIDIA_A100-SXM4-80GB.json → triton_3_1_0/E=16,N=3584,device_name=NVIDIA_A100-SXM4-80GB.json} +0 -0
  234. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=16,N=6400,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8.json → triton_3_1_0/E=16,N=6400,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8.json} +0 -0
  235. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=16,N=7168,device_name=NVIDIA_A100-SXM4-80GB,dtype=int8_w8a16.json → triton_3_1_0/E=16,N=7168,device_name=NVIDIA_A100-SXM4-80GB,dtype=int8_w8a16.json} +0 -0
  236. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=16,N=7168,device_name=NVIDIA_A100-SXM4-80GB.json → triton_3_1_0/E=16,N=7168,device_name=NVIDIA_A100-SXM4-80GB.json} +0 -0
  237. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=16,N=7168,device_name=NVIDIA_H100_80GB_HBM3,dtype=int8_w8a16.json → triton_3_1_0/E=16,N=7168,device_name=NVIDIA_H100_80GB_HBM3,dtype=int8_w8a16.json} +0 -0
  238. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=16,N=800,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8.json → triton_3_1_0/E=16,N=800,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8.json} +0 -0
  239. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=160,N=192,device_name=NVIDIA_A800-SXM4-80GB.json → triton_3_1_0/E=160,N=192,device_name=NVIDIA_A800-SXM4-80GB.json} +0 -0
  240. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=20,N=2048,device_name=NVIDIA_H100_80GB_HBM3.json → triton_3_1_0/E=20,N=2048,device_name=NVIDIA_H100_80GB_HBM3.json} +0 -0
  241. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=24,N=1024,device_name=NVIDIA_H100_80GB_HBM3.json → triton_3_1_0/E=24,N=1024,device_name=NVIDIA_H100_80GB_HBM3.json} +0 -0
  242. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=256,N=128,device_name=NVIDIA_A100-SXM4-80GB,dtype=int8_w8a8,block_shape=[128, 128].json → triton_3_1_0/E=256,N=128,device_name=NVIDIA_A100-SXM4-80GB,dtype=int8_w8a8,block_shape=[128, 128].json} +0 -0
  243. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=256,N=128,device_name=NVIDIA_A100-SXM4-80GB,dtype=int8_w8a8.json → triton_3_1_0/E=256,N=128,device_name=NVIDIA_A100-SXM4-80GB,dtype=int8_w8a8.json} +0 -0
  244. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=256,N=128,device_name=NVIDIA_A800-SXM4-80GB,dtype=int8_w8a8,block_shape=[128, 128].json → triton_3_1_0/E=256,N=128,device_name=NVIDIA_A800-SXM4-80GB,dtype=int8_w8a8,block_shape=[128, 128].json} +0 -0
  245. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=256,N=128,device_name=NVIDIA_A800-SXM4-80GB,dtype=int8_w8a8.json → triton_3_1_0/E=256,N=128,device_name=NVIDIA_A800-SXM4-80GB,dtype=int8_w8a8.json} +0 -0
  246. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=256,N=128,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8,block_shape=[128, 128].json → triton_3_1_0/E=256,N=128,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8,block_shape=[128, 128].json} +0 -0
  247. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=256,N=128,device_name=NVIDIA_H20,block_shape=[128, 128].json → triton_3_1_0/E=256,N=128,device_name=NVIDIA_H20,block_shape=[128, 128].json} +0 -0
  248. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=256,N=128,device_name=NVIDIA_H20,dtype=fp8_w8a8,block_shape=[128, 128].json → triton_3_1_0/E=256,N=128,device_name=NVIDIA_H20,dtype=fp8_w8a8,block_shape=[128, 128].json} +0 -0
  249. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=256,N=128,device_name=NVIDIA_L20Y,dtype=fp8_w8a8,block_shape=[128, 128].json → triton_3_1_0/E=256,N=128,device_name=NVIDIA_L20Y,dtype=fp8_w8a8,block_shape=[128, 128].json} +0 -0
  250. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=256,N=256,device_name=AMD_Instinct_MI300X,dtype=fp8_w8a8,block_shape=[128, 128].json → triton_3_1_0/E=256,N=256,device_name=AMD_Instinct_MI300X,dtype=fp8_w8a8,block_shape=[128, 128].json} +0 -0
  251. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=256,N=256,device_name=AMD_Instinct_MI325X,dtype=fp8_w8a8,block_shape=[128, 128].json → triton_3_1_0/E=256,N=256,device_name=AMD_Instinct_MI325X,dtype=fp8_w8a8,block_shape=[128, 128].json} +0 -0
  252. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=256,N=256,device_name=AMD_Radeon_Graphics,dtype=fp8_w8a8,block_shape=[128, 128].json → triton_3_1_0/E=256,N=256,device_name=AMD_Radeon_Graphics,dtype=fp8_w8a8,block_shape=[128, 128].json} +0 -0
  253. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=256,N=256,device_name=NVIDIA_B200,dtype=fp8_w8a8,block_shape=[128, 128].json → triton_3_1_0/E=256,N=256,device_name=NVIDIA_B200,dtype=fp8_w8a8,block_shape=[128, 128].json} +0 -0
  254. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=256,N=256,device_name=NVIDIA_H20,dtype=int8_w8a8,block_shape=[128, 128].json → triton_3_1_0/E=256,N=256,device_name=NVIDIA_H20,dtype=int8_w8a8,block_shape=[128, 128].json} +0 -0
  255. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=256,N=256,device_name=NVIDIA_H200,dtype=fp8_w8a8,block_shape=[128, 128].json → triton_3_1_0/E=256,N=256,device_name=NVIDIA_H200,dtype=fp8_w8a8,block_shape=[128, 128].json} +0 -0
  256. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=256,N=64,device_name=NVIDIA_A800-SXM4-80GB.json → triton_3_1_0/E=256,N=64,device_name=NVIDIA_A800-SXM4-80GB.json} +0 -0
  257. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=256,N=64,device_name=NVIDIA_L20,dtype=int8_w8a8.json → triton_3_1_0/E=256,N=64,device_name=NVIDIA_L20,dtype=int8_w8a8.json} +0 -0
  258. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=256,N=64,device_name=NVIDIA_L40S,dtype=int8_w8a8.json → triton_3_1_0/E=256,N=64,device_name=NVIDIA_L40S,dtype=int8_w8a8.json} +0 -0
  259. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=64,N=1024,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8,block_shape=[128, 128].json → triton_3_1_0/E=64,N=1024,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8,block_shape=[128, 128].json} +0 -0
  260. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=64,N=1280,device_name=NVIDIA_A100-SXM4-80GB.json → triton_3_1_0/E=64,N=1280,device_name=NVIDIA_A100-SXM4-80GB.json} +0 -0
  261. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=64,N=1280,device_name=NVIDIA_A800-SXM4-80GB.json → triton_3_1_0/E=64,N=1280,device_name=NVIDIA_A800-SXM4-80GB.json} +0 -0
  262. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=64,N=1280,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8.json → triton_3_1_0/E=64,N=1280,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8.json} +0 -0
  263. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=64,N=1280,device_name=NVIDIA_H100_80GB_HBM3.json → triton_3_1_0/E=64,N=1280,device_name=NVIDIA_H100_80GB_HBM3.json} +0 -0
  264. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=64,N=1280,device_name=NVIDIA_H200,dtype=fp8_w8a8.json → triton_3_1_0/E=64,N=1280,device_name=NVIDIA_H200,dtype=fp8_w8a8.json} +0 -0
  265. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=64,N=1280,device_name=NVIDIA_H200.json → triton_3_1_0/E=64,N=1280,device_name=NVIDIA_H200.json} +0 -0
  266. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=64,N=2560,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8.json → triton_3_1_0/E=64,N=2560,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8.json} +0 -0
  267. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=64,N=2560,device_name=NVIDIA_H200,dtype=fp8_w8a8.json → triton_3_1_0/E=64,N=2560,device_name=NVIDIA_H200,dtype=fp8_w8a8.json} +0 -0
  268. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=64,N=2560,device_name=NVIDIA_H200.json → triton_3_1_0/E=64,N=2560,device_name=NVIDIA_H200.json} +0 -0
  269. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=64,N=320,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8.json → triton_3_1_0/E=64,N=320,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8.json} +0 -0
  270. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=64,N=320,device_name=NVIDIA_H100_80GB_HBM3.json → triton_3_1_0/E=64,N=320,device_name=NVIDIA_H100_80GB_HBM3.json} +0 -0
  271. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=64,N=320,device_name=NVIDIA_H200,dtype=fp8_w8a8.json → triton_3_1_0/E=64,N=320,device_name=NVIDIA_H200,dtype=fp8_w8a8.json} +0 -0
  272. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=64,N=320,device_name=NVIDIA_H200.json → triton_3_1_0/E=64,N=320,device_name=NVIDIA_H200.json} +0 -0
  273. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=64,N=512,device_name=NVIDIA_H200,dtype=fp8_w8a8,block_shape=[128, 128].json → triton_3_1_0/E=64,N=512,device_name=NVIDIA_H200,dtype=fp8_w8a8,block_shape=[128, 128].json} +0 -0
  274. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=64,N=640,device_name=NVIDIA_A100-SXM4-80GB.json → triton_3_1_0/E=64,N=640,device_name=NVIDIA_A100-SXM4-80GB.json} +0 -0
  275. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=64,N=640,device_name=NVIDIA_A800-SXM4-80GB.json → triton_3_1_0/E=64,N=640,device_name=NVIDIA_A800-SXM4-80GB.json} +0 -0
  276. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=64,N=640,device_name=NVIDIA_GeForce_RTX_4090,dtype=fp8_w8a8.json → triton_3_1_0/E=64,N=640,device_name=NVIDIA_GeForce_RTX_4090,dtype=fp8_w8a8.json} +0 -0
  277. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=64,N=640,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8.json → triton_3_1_0/E=64,N=640,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8.json} +0 -0
  278. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=64,N=640,device_name=NVIDIA_H100_80GB_HBM3.json → triton_3_1_0/E=64,N=640,device_name=NVIDIA_H100_80GB_HBM3.json} +0 -0
  279. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=64,N=640,device_name=NVIDIA_H200,dtype=fp8_w8a8.json → triton_3_1_0/E=64,N=640,device_name=NVIDIA_H200,dtype=fp8_w8a8.json} +0 -0
  280. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=64,N=640,device_name=NVIDIA_H200.json → triton_3_1_0/E=64,N=640,device_name=NVIDIA_H200.json} +0 -0
  281. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=8,N=14336,device_name=AMD_Instinct_MI300X.json → triton_3_1_0/E=8,N=14336,device_name=AMD_Instinct_MI300X.json} +0 -0
  282. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=8,N=14336,device_name=AMD_Instinct_MI325X.json → triton_3_1_0/E=8,N=14336,device_name=AMD_Instinct_MI325X.json} +0 -0
  283. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=8,N=14336,device_name=AMD_Radeon_Graphics.json → triton_3_1_0/E=8,N=14336,device_name=AMD_Radeon_Graphics.json} +0 -0
  284. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=8,N=14336,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8.json → triton_3_1_0/E=8,N=14336,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8.json} +0 -0
  285. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=8,N=14336,device_name=NVIDIA_H200,dtype=fp8_w8a8.json → triton_3_1_0/E=8,N=14336,device_name=NVIDIA_H200,dtype=fp8_w8a8.json} +0 -0
  286. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=8,N=14336,device_name=NVIDIA_H200.json → triton_3_1_0/E=8,N=14336,device_name=NVIDIA_H200.json} +0 -0
  287. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=8,N=1792,device_name=AMD_Instinct_MI300X.json → triton_3_1_0/E=8,N=1792,device_name=AMD_Instinct_MI300X.json} +0 -0
  288. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=8,N=1792,device_name=AMD_Instinct_MI325X.json → triton_3_1_0/E=8,N=1792,device_name=AMD_Instinct_MI325X.json} +0 -0
  289. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=8,N=1792,device_name=AMD_Radeon_Graphics.json → triton_3_1_0/E=8,N=1792,device_name=AMD_Radeon_Graphics.json} +0 -0
  290. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=8,N=1792,device_name=NVIDIA_A100-SXM4-40GB.json → triton_3_1_0/E=8,N=1792,device_name=NVIDIA_A100-SXM4-40GB.json} +0 -0
  291. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=8,N=1792,device_name=NVIDIA_A100-SXM4-80GB.json → triton_3_1_0/E=8,N=1792,device_name=NVIDIA_A100-SXM4-80GB.json} +0 -0
  292. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=8,N=1792,device_name=NVIDIA_H100_80GB_HBM3.json → triton_3_1_0/E=8,N=1792,device_name=NVIDIA_H100_80GB_HBM3.json} +0 -0
  293. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=8,N=1792,device_name=NVIDIA_H200,dtype=fp8_w8a8.json → triton_3_1_0/E=8,N=1792,device_name=NVIDIA_H200,dtype=fp8_w8a8.json} +0 -0
  294. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=8,N=1792,device_name=NVIDIA_H200.json → triton_3_1_0/E=8,N=1792,device_name=NVIDIA_H200.json} +0 -0
  295. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=8,N=2048,device_name=NVIDIA_A100-SXM4-80GB.json → triton_3_1_0/E=8,N=2048,device_name=NVIDIA_A100-SXM4-80GB.json} +0 -0
  296. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=8,N=2048,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8.json → triton_3_1_0/E=8,N=2048,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8.json} +0 -0
  297. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=8,N=2048,device_name=NVIDIA_H100_80GB_HBM3.json → triton_3_1_0/E=8,N=2048,device_name=NVIDIA_H100_80GB_HBM3.json} +0 -0
  298. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=8,N=2048,device_name=NVIDIA_H200,dtype=fp8_w8a8.json → triton_3_1_0/E=8,N=2048,device_name=NVIDIA_H200,dtype=fp8_w8a8.json} +0 -0
  299. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=8,N=2048,device_name=NVIDIA_H200.json → triton_3_1_0/E=8,N=2048,device_name=NVIDIA_H200.json} +0 -0
  300. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=8,N=3584,device_name=AMD_Instinct_MI300X.json → triton_3_1_0/E=8,N=3584,device_name=AMD_Instinct_MI300X.json} +0 -0
  301. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=8,N=3584,device_name=AMD_Instinct_MI325X.json → triton_3_1_0/E=8,N=3584,device_name=AMD_Instinct_MI325X.json} +0 -0
  302. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=8,N=3584,device_name=AMD_Radeon_Graphics.json → triton_3_1_0/E=8,N=3584,device_name=AMD_Radeon_Graphics.json} +0 -0
  303. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=8,N=3584,device_name=NVIDIA_A100-SXM4-40GB.json → triton_3_1_0/E=8,N=3584,device_name=NVIDIA_A100-SXM4-40GB.json} +0 -0
  304. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=8,N=3584,device_name=NVIDIA_A100-SXM4-80GB.json → triton_3_1_0/E=8,N=3584,device_name=NVIDIA_A100-SXM4-80GB.json} +0 -0
  305. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=8,N=3584,device_name=NVIDIA_GeForce_RTX_4090,dtype=fp8_w8a8.json → triton_3_1_0/E=8,N=3584,device_name=NVIDIA_GeForce_RTX_4090,dtype=fp8_w8a8.json} +0 -0
  306. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=8,N=3584,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8.json → triton_3_1_0/E=8,N=3584,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8.json} +0 -0
  307. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=8,N=3584,device_name=NVIDIA_H100_80GB_HBM3.json → triton_3_1_0/E=8,N=3584,device_name=NVIDIA_H100_80GB_HBM3.json} +0 -0
  308. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=8,N=3584,device_name=NVIDIA_H200,dtype=fp8_w8a8.json → triton_3_1_0/E=8,N=3584,device_name=NVIDIA_H200,dtype=fp8_w8a8.json} +0 -0
  309. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=8,N=3584,device_name=NVIDIA_H200.json → triton_3_1_0/E=8,N=3584,device_name=NVIDIA_H200.json} +0 -0
  310. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=8,N=3584,device_name=NVIDIA_L40S.json → triton_3_1_0/E=8,N=3584,device_name=NVIDIA_L40S.json} +0 -0
  311. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=8,N=4096,device_name=AMD_Instinct_MI300X,dtype=fp8_w8a8.json → triton_3_1_0/E=8,N=4096,device_name=AMD_Instinct_MI300X,dtype=fp8_w8a8.json} +0 -0
  312. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=8,N=4096,device_name=AMD_Instinct_MI325X,dtype=fp8_w8a8.json → triton_3_1_0/E=8,N=4096,device_name=AMD_Instinct_MI325X,dtype=fp8_w8a8.json} +0 -0
  313. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=8,N=4096,device_name=AMD_Radeon_Graphics,dtype=fp8_w8a8.json → triton_3_1_0/E=8,N=4096,device_name=AMD_Radeon_Graphics,dtype=fp8_w8a8.json} +0 -0
  314. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=8,N=4096,device_name=NVIDIA_A100-SXM4-80GB.json → triton_3_1_0/E=8,N=4096,device_name=NVIDIA_A100-SXM4-80GB.json} +0 -0
  315. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=8,N=4096,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8.json → triton_3_1_0/E=8,N=4096,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8.json} +0 -0
  316. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=8,N=4096,device_name=NVIDIA_H100_80GB_HBM3.json → triton_3_1_0/E=8,N=4096,device_name=NVIDIA_H100_80GB_HBM3.json} +0 -0
  317. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=8,N=4096,device_name=NVIDIA_H200,dtype=fp8_w8a8.json → triton_3_1_0/E=8,N=4096,device_name=NVIDIA_H200,dtype=fp8_w8a8.json} +0 -0
  318. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=8,N=4096,device_name=NVIDIA_H200.json → triton_3_1_0/E=8,N=4096,device_name=NVIDIA_H200.json} +0 -0
  319. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=8,N=7168,device_name=AMD_Instinct_MI300X.json → triton_3_1_0/E=8,N=7168,device_name=AMD_Instinct_MI300X.json} +0 -0
  320. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=8,N=7168,device_name=AMD_Instinct_MI325X.json → triton_3_1_0/E=8,N=7168,device_name=AMD_Instinct_MI325X.json} +0 -0
  321. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=8,N=7168,device_name=AMD_Radeon_Graphics.json → triton_3_1_0/E=8,N=7168,device_name=AMD_Radeon_Graphics.json} +0 -0
  322. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=8,N=7168,device_name=NVIDIA_A100-SXM4-80GB.json → triton_3_1_0/E=8,N=7168,device_name=NVIDIA_A100-SXM4-80GB.json} +0 -0
  323. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=8,N=7168,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8.json → triton_3_1_0/E=8,N=7168,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8.json} +0 -0
  324. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=8,N=7168,device_name=NVIDIA_H100_80GB_HBM3.json → triton_3_1_0/E=8,N=7168,device_name=NVIDIA_H100_80GB_HBM3.json} +0 -0
  325. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=8,N=7168,device_name=NVIDIA_H200,dtype=fp8_w8a8.json → triton_3_1_0/E=8,N=7168,device_name=NVIDIA_H200,dtype=fp8_w8a8.json} +0 -0
  326. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=8,N=7168,device_name=NVIDIA_H200.json → triton_3_1_0/E=8,N=7168,device_name=NVIDIA_H200.json} +0 -0
  327. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=8,N=8192,device_name=AMD_Instinct_MI300X,dtype=fp8_w8a8.json → triton_3_1_0/E=8,N=8192,device_name=AMD_Instinct_MI300X,dtype=fp8_w8a8.json} +0 -0
  328. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=8,N=8192,device_name=AMD_Instinct_MI325X,dtype=fp8_w8a8.json → triton_3_1_0/E=8,N=8192,device_name=AMD_Instinct_MI325X,dtype=fp8_w8a8.json} +0 -0
  329. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=8,N=8192,device_name=AMD_Radeon_Graphics,dtype=fp8_w8a8.json → triton_3_1_0/E=8,N=8192,device_name=AMD_Radeon_Graphics,dtype=fp8_w8a8.json} +0 -0
  330. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=8,N=8192,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8.json → triton_3_1_0/E=8,N=8192,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8.json} +0 -0
  331. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=8,N=8192,device_name=NVIDIA_H200,dtype=fp8_w8a8.json → triton_3_1_0/E=8,N=8192,device_name=NVIDIA_H200,dtype=fp8_w8a8.json} +0 -0
  332. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=128,N=192,device_name=NVIDIA_A800-SXM4-80GB.json → triton_3_2_0/E=128,N=192,device_name=NVIDIA_A800-SXM4-80GB.json} +0 -0
  333. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=128,N=192,device_name=NVIDIA_H100_80GB_HBM3.json → triton_3_2_0/E=128,N=192,device_name=NVIDIA_H100_80GB_HBM3.json} +0 -0
  334. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=128,N=192,device_name=NVIDIA_H20.json → triton_3_2_0/E=128,N=192,device_name=NVIDIA_H20.json} +0 -0
  335. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=128,N=192,device_name=NVIDIA_H200.json → triton_3_2_0/E=128,N=192,device_name=NVIDIA_H200.json} +0 -0
  336. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=128,N=384,device_name=NVIDIA_H100_80GB_HBM3.json → triton_3_2_0/E=128,N=384,device_name=NVIDIA_H100_80GB_HBM3.json} +0 -0
  337. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=128,N=384,device_name=NVIDIA_H20,dtype=fp8_w8a8,block_shape=[128, 128].json → triton_3_2_0/E=128,N=384,device_name=NVIDIA_H20,dtype=fp8_w8a8,block_shape=[128, 128].json} +0 -0
  338. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=128,N=384,device_name=NVIDIA_H20.json → triton_3_2_0/E=128,N=384,device_name=NVIDIA_H20.json} +0 -0
  339. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=128,N=384,device_name=NVIDIA_H200,dtype=fp8_w8a8,block_shape=[128, 128].json → triton_3_2_0/E=128,N=384,device_name=NVIDIA_H200,dtype=fp8_w8a8,block_shape=[128, 128].json} +0 -0
  340. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=128,N=384,device_name=NVIDIA_H200.json → triton_3_2_0/E=128,N=384,device_name=NVIDIA_H200.json} +0 -0
  341. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=128,N=512,device_name=NVIDIA_H100_80GB_HBM3.json → triton_3_2_0/E=128,N=512,device_name=NVIDIA_H100_80GB_HBM3.json} +0 -0
  342. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=128,N=768,device_name=NVIDIA_A800-SXM4-80GB.json → triton_3_2_0/E=128,N=768,device_name=NVIDIA_A800-SXM4-80GB.json} +0 -0
  343. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=128,N=768,device_name=NVIDIA_H100_80GB_HBM3.json → triton_3_2_0/E=128,N=768,device_name=NVIDIA_H100_80GB_HBM3.json} +0 -0
  344. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=128,N=768,device_name=NVIDIA_H20,dtype=fp8_w8a8,block_shape=[128, 128].json → triton_3_2_0/E=128,N=768,device_name=NVIDIA_H20,dtype=fp8_w8a8,block_shape=[128, 128].json} +0 -0
  345. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=128,N=768,device_name=NVIDIA_H20.json → triton_3_2_0/E=128,N=768,device_name=NVIDIA_H20.json} +0 -0
  346. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=128,N=768,device_name=NVIDIA_H200,dtype=fp8_w8a8,block_shape=[128, 128].json → triton_3_2_0/E=128,N=768,device_name=NVIDIA_H200,dtype=fp8_w8a8,block_shape=[128, 128].json} +0 -0
  347. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=128,N=768,device_name=NVIDIA_H200.json → triton_3_2_0/E=128,N=768,device_name=NVIDIA_H200.json} +0 -0
  348. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=128,N=96,device_name=NVIDIA_H20.json → triton_3_2_0/E=128,N=96,device_name=NVIDIA_H20.json} +0 -0
  349. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=264,N=128,device_name=NVIDIA_A800-SXM4-80GB,dtype=int8_w8a8.json → triton_3_2_0/E=264,N=128,device_name=NVIDIA_A800-SXM4-80GB,dtype=int8_w8a8.json} +0 -0
  350. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=264,N=256,device_name=NVIDIA_B200,dtype=fp8_w8a8,block_shape=[128, 128].json → triton_3_2_0/E=264,N=256,device_name=NVIDIA_B200,dtype=fp8_w8a8,block_shape=[128, 128].json} +0 -0
  351. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=264,N=256,device_name=NVIDIA_H20,dtype=fp8_w8a8,block_shape=[128, 128].json → triton_3_2_0/E=264,N=256,device_name=NVIDIA_H20,dtype=fp8_w8a8,block_shape=[128, 128].json} +0 -0
  352. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=264,N=256,device_name=NVIDIA_H200,dtype=fp8_w8a8,block_shape=[128, 128].json → triton_3_2_0/E=264,N=256,device_name=NVIDIA_H200,dtype=fp8_w8a8,block_shape=[128, 128].json} +0 -0
  353. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=272,N=128,device_name=NVIDIA_A800-SXM4-80GB,dtype=int8_w8a8.json → triton_3_2_0/E=272,N=128,device_name=NVIDIA_A800-SXM4-80GB,dtype=int8_w8a8.json} +0 -0
  354. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=272,N=128,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8,block_shape=[128, 128].json → triton_3_2_0/E=272,N=128,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8,block_shape=[128, 128].json} +0 -0
  355. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=272,N=128,device_name=NVIDIA_H20,dtype=fp8_w8a8,block_shape=[128, 128].json → triton_3_2_0/E=272,N=128,device_name=NVIDIA_H20,dtype=fp8_w8a8,block_shape=[128, 128].json} +0 -0
  356. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=272,N=64,device_name=NVIDIA_A800-SXM4-80GB.json → triton_3_2_0/E=272,N=64,device_name=NVIDIA_A800-SXM4-80GB.json} +0 -0
  357. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=288,N=64,device_name=NVIDIA_A800-SXM4-80GB.json → triton_3_2_0/E=288,N=64,device_name=NVIDIA_A800-SXM4-80GB.json} +0 -0
  358. {sglang-0.4.6.post5.dist-info → sglang-0.4.7.post1.dist-info}/licenses/LICENSE +0 -0
  359. {sglang-0.4.6.post5.dist-info → sglang-0.4.7.post1.dist-info}/top_level.txt +0 -0
sglang/srt/disaggregation/nixl/conn.py
@@ -18,40 +18,21 @@ import requests
 import zmq
 from aiohttp import web
 
-from sglang.srt.disaggregation.base.conn import (
-    BaseKVBootstrapServer,
-    BaseKVManager,
-    BaseKVReceiver,
-    BaseKVSender,
-    KVArgs,
-    KVPoll,
+from sglang.srt.disaggregation.base.conn import BaseKVSender, KVArgs, KVPoll
+from sglang.srt.disaggregation.common.conn import (
+    CommonKVBootstrapServer,
+    CommonKVManager,
+    CommonKVReceiver,
 )
+from sglang.srt.disaggregation.common.utils import group_concurrent_contiguous
 from sglang.srt.disaggregation.utils import DisaggregationMode
 from sglang.srt.server_args import ServerArgs
-from sglang.srt.utils import get_free_port, get_ip, get_local_ip_by_remote
+from sglang.srt.utils import get_local_ip_by_remote
 
 logger = logging.getLogger(__name__)
 
 NixlEngineInfo: TypeAlias = Dict[str, Union[str, int]]
 
-
-def group_concurrent_contiguous(
-    src_indices: npt.NDArray[np.int64], dst_indices: npt.NDArray[np.int64]
-) -> Tuple[List[npt.NDArray[np.int64]], List[npt.NDArray[np.int64]]]:
-    """Vectorised NumPy implementation."""
-    if src_indices.size == 0:
-        return [], []
-
-    brk = np.where((np.diff(src_indices) != 1) | (np.diff(dst_indices) != 1))[0] + 1
-    src_groups = np.split(src_indices, brk)
-    dst_groups = np.split(dst_indices, brk)
-
-    src_groups = [g.tolist() for g in src_groups]
-    dst_groups = [g.tolist() for g in dst_groups]
-
-    return src_groups, dst_groups
-
-
 GUARD = "NixlMsgGuard".encode("ascii")
 
 
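Note on the hunk above: the vectorised grouping helper is no longer defined in this file; it is imported from sglang/srt/disaggregation/common/utils.py instead. Below is a minimal standalone sketch of the same logic, copied from the deleted lines above, with an illustrative call (the example arrays are not from the package):

    # Sketch of the relocated helper (logic copied from the deletion above).
    import numpy as np

    def group_concurrent_contiguous(src_indices, dst_indices):
        # Split both index arrays wherever either sequence stops being contiguous.
        if src_indices.size == 0:
            return [], []
        brk = np.where((np.diff(src_indices) != 1) | (np.diff(dst_indices) != 1))[0] + 1
        src_groups = [g.tolist() for g in np.split(src_indices, brk)]
        dst_groups = [g.tolist() for g in np.split(dst_indices, brk)]
        return src_groups, dst_groups

    # Example: two contiguous runs -> two groups on each side.
    src = np.array([3, 4, 5, 9, 10], dtype=np.int64)
    dst = np.array([0, 1, 2, 7, 8], dtype=np.int64)
    print(group_concurrent_contiguous(src, dst))
    # ([[3, 4, 5], [9, 10]], [[0, 1, 2], [7, 8]])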
@@ -61,42 +42,32 @@ class TransferInfo:
     endpoint: str
     dst_port: int
     agent_metadata: bytes
+    agent_name: str
     dst_kv_ptrs: list[int]
-    dst_kv_indices: npt.NDArray[np.int64]
+    dst_kv_indices: npt.NDArray[np.int32]
     dst_aux_ptrs: list[int]
     dst_aux_index: int
     dst_gpu_id: int
+    required_dst_info_num: int
 
     def is_dummy(self):
-        return self.endpoint == ""
+        return self.dst_kv_indices.size == 0
 
     @classmethod
     def from_zmq(cls, msg: List[bytes]):
-        if len(msg) == 1:
-            # dummy msg
-            return cls(
-                room=int(msg[0].decode("ascii")),
-                endpoint="",
-                dst_port=0,
-                agent_metadata=b"",
-                dst_kv_ptrs=[],
-                dst_kv_indices=np.array([], dtype=np.int64),
-                dst_aux_ptrs=[],
-                dst_aux_index=0,
-                dst_gpu_id=0,
-            )
-        else:
-            return cls(
-                room=int(msg[0].decode("ascii")),
-                endpoint=msg[1].decode("ascii"),
-                dst_port=int(msg[2].decode("ascii")),
-                agent_metadata=msg[3],
-                dst_kv_ptrs=list(struct.unpack(f"{len(msg[4])//8}Q", msg[4])),
-                dst_kv_indices=np.frombuffer(msg[5], dtype=np.int64),
-                dst_aux_ptrs=list(struct.unpack(f"{len(msg[6])//8}Q", msg[6])),
-                dst_aux_index=int(msg[7].decode("ascii")),
-                dst_gpu_id=int(msg[8].decode("ascii")),
-            )
+        return cls(
+            room=int(msg[0].decode("ascii")),
+            endpoint=msg[1].decode("ascii"),
+            dst_port=int(msg[2].decode("ascii")),
+            agent_metadata=msg[3],
+            agent_name=msg[4].decode("ascii"),
+            dst_kv_ptrs=list(struct.unpack(f"{len(msg[5])//8}Q", msg[5])),
+            dst_kv_indices=np.frombuffer(msg[6], dtype=np.int32),
+            dst_aux_ptrs=list(struct.unpack(f"{len(msg[7])//8}Q", msg[7])),
+            dst_aux_index=int(msg[8].decode("ascii")),
+            dst_gpu_id=int(msg[9].decode("ascii")),
+            required_dst_info_num=int(msg[10].decode("ascii")),
+        )
 
 
 @dataclasses.dataclass
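The new from_zmq above implies an 11-frame wire format (after the guard frame): agent_name travels as frame 4, required_dst_info_num as frame 10, and KV indices are serialized as int32 rather than int64. The sketch below shows one way a decode rank could pack a matching message; the frame order and dtypes follow the parser above, but the packing function itself is illustrative and not the package's sender code:

    # Illustrative packing that matches TransferInfo.from_zmq above.
    import struct
    import numpy as np

    def pack_transfer_info(room, endpoint, dst_port, agent_metadata, agent_name,
                           dst_kv_ptrs, dst_kv_indices, dst_aux_ptrs,
                           dst_aux_index, dst_gpu_id, required_dst_info_num):
        return [
            str(room).encode("ascii"),                              # msg[0]
            endpoint.encode("ascii"),                               # msg[1]
            str(dst_port).encode("ascii"),                          # msg[2]
            agent_metadata,                                         # msg[3] raw bytes
            agent_name.encode("ascii"),                             # msg[4]
            struct.pack(f"{len(dst_kv_ptrs)}Q", *dst_kv_ptrs),      # msg[5]
            np.asarray(dst_kv_indices, dtype=np.int32).tobytes(),   # msg[6]
            struct.pack(f"{len(dst_aux_ptrs)}Q", *dst_aux_ptrs),    # msg[7]
            str(dst_aux_index).encode("ascii"),                     # msg[8]
            str(dst_gpu_id).encode("ascii"),                        # msg[9]
            str(required_dst_info_num).encode("ascii"),             # msg[10]
        ]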
@@ -116,7 +87,7 @@ class TransferStatus:
         return self.num_kvs_expected == len(self.received_kvs) and self.received_aux
 
 
-class NixlKVManager(BaseKVManager):
+class NixlKVManager(CommonKVManager):
     def __init__(
         self,
         args: KVArgs,
@@ -124,6 +95,7 @@ class NixlKVManager(BaseKVManager):
         server_args: ServerArgs,
         is_mla_backend: Optional[bool] = False,
     ):
+        super().__init__(args, disaggregation_mode, server_args, is_mla_backend)
         try:
             from nixl._api import nixl_agent
         except ImportError as e:
@@ -133,38 +105,15 @@ class NixlKVManager(BaseKVManager):
                 "to run SGLang with NixlTransferEngine."
             ) from e
         self.agent = nixl_agent(str(uuid.uuid4()))
-        self.kv_args = args
-        self.disaggregation_mode = disaggregation_mode
-        # for p/d multi node infer
-        self.bootstrap_port = server_args.disaggregation_bootstrap_port
-        self.dist_init_addr = server_args.dist_init_addr
-        self.tp_size = server_args.tp_size
-
-        self.tp_rank = args.engine_rank
-        self.enable_dp_attention = server_args.enable_dp_attention
-        if self.enable_dp_attention:
-            assert (
-                server_args.dp_size > 1
-            ), "If dp_attention is enabled, dp size must be greater than 1 in disaggregation mode."
-            self.dp_size = server_args.dp_size
-            self.tp_size_of_dp = server_args.tp_size // server_args.dp_size
-            self.attn_tp_rank = args.engine_rank % self.tp_size_of_dp
-            self.dp_rank = args.engine_rank // self.tp_size_of_dp
-
-        self.rank_port = None
         self.server_socket = zmq.Context().socket(zmq.PULL)
         self.register_buffer_to_engine()
 
-        self.rank_port = get_free_port()
         if self.disaggregation_mode == DisaggregationMode.PREFILL:
+            self.request_status = {}
             self.transfer_infos: Dict[int, TransferInfo] = {}
-            self.condition = threading.Condition()
-            self.peer_names: Dict[int, str] = {}
+            self.peer_names: Dict[str, str] = {}
             self._start_bootstrap_thread()
-            self._register_to_bootstrap()
         elif self.disaggregation_mode == DisaggregationMode.DECODE:
-            # bootstrap key -> (remote_engine_rank -> possible remote source info)
-            self.prefill_peer_infos: Dict[str, list[Dict[int, NixlEngineInfo]]] = {}
             self.transfer_statuses: Dict[int, TransferStatus] = defaultdict(
                 TransferStatus
             )
@@ -173,6 +122,18 @@ class NixlKVManager(BaseKVManager):
                 f"Unsupported DisaggregationMode: {self.disaggregation_mode}"
             )
 
+    def check_status(self, bootstrap_room: int):
+        return self.request_status[bootstrap_room]
+
+    def update_status(self, bootstrap_room: int, status: KVPoll):
+        if bootstrap_room not in self.request_status:
+            self.request_status[bootstrap_room] = status
+        else:
+            # NOTE: The prefill engine could recv bootstrapping first
+            self.request_status[bootstrap_room] = max(
+                self.request_status[bootstrap_room], status
+            )
+
     def register_buffer_to_engine(self):
         kv_addrs = []
         for kv_data_ptr, kv_data_len in zip(
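check_status/update_status above keep a per-room status and never move it backwards: max() keeps whichever state is further along, so a sender that registers with Bootstrapping after the bootstrap thread has already marked the room WaitingForInput does not roll it back. A small self-contained illustration follows; the enum is a stand-in with assumed values, not the real KVPoll from sglang.srt.disaggregation.base.conn:

    # Stand-in enum; the actual KVPoll member values are an assumption here.
    from enum import IntEnum

    class FakeKVPoll(IntEnum):
        Bootstrapping = 1
        WaitingForInput = 2
        Transferring = 3
        Success = 4

    request_status = {}

    def update_status(room, status):
        # Keep whichever status is further along; a late "Bootstrapping" must not
        # roll back a room that is already "WaitingForInput".
        request_status[room] = max(request_status.get(room, status), status)

    update_status(7, FakeKVPoll.WaitingForInput)  # bootstrap thread saw all peers
    update_status(7, FakeKVPoll.Bootstrapping)    # sender constructed afterwards
    print(request_status[7].name)                 # -> WaitingForInput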
@@ -193,23 +154,17 @@ class NixlKVManager(BaseKVManager):
         if not self.aux_descs:
             raise Exception("NIXL memory registration failed for aux tensors")
 
-    @cache
-    def _connect(self, endpoint: str):
-        socket = zmq.Context().socket(zmq.PUSH)
-        socket.connect(endpoint)
-        return socket
-
-    def _add_remote(self, room: int, agent_metadata: bytes):
-        if room not in self.peer_names:
-            self.peer_names[room] = self.agent.add_remote_agent(agent_metadata)
-        return self.peer_names[room]
+    def _add_remote(self, agent_name: str, agent_metadata: bytes):
+        if agent_name not in self.peer_names:
+            self.peer_names[agent_name] = self.agent.add_remote_agent(agent_metadata)
+        return self.peer_names[agent_name]
 
     def send_kvcache(
         self,
         peer_name: str,
-        prefill_kv_indices: npt.NDArray[np.int64],
+        prefill_kv_indices: npt.NDArray[np.int32],
         dst_kv_ptrs: list[int],
-        dst_kv_indices: npt.NDArray[np.int64],
+        dst_kv_indices: npt.NDArray[np.int32],
         dst_gpu_id: int,
         notif: str,
     ):
@@ -291,7 +246,7 @@ class NixlKVManager(BaseKVManager):
     def add_transfer_request(
         self,
         bootstrap_room: int,
-        kv_indices: npt.NDArray[np.int64],
+        kv_indices: npt.NDArray[np.int32],
         index_slice: slice,
         is_last: bool,
         chunk_id: int,
@@ -300,40 +255,38 @@ class NixlKVManager(BaseKVManager):
         assert self.disaggregation_mode == DisaggregationMode.PREFILL
         assert not is_last or (is_last and aux_index is not None)
 
-        # Wait for transfer info to be populated by bootstrap thread.
-        with self.condition:
-            self.condition.wait_for(lambda: bootstrap_room in self.transfer_infos)
-            req = self.transfer_infos[bootstrap_room]
-            assert bootstrap_room == req.room
-
-        if req.is_dummy():
-            return []
+        reqs_to_be_processed = self.transfer_infos[bootstrap_room].values()
+        handles = []
+        for req in reqs_to_be_processed:
+            assert bootstrap_room == req.room
+            if req.is_dummy():
+                continue
 
-        peer_name = self._add_remote(bootstrap_room, req.agent_metadata)
-        chunked_dst_kv_indice = req.dst_kv_indices[index_slice]
-        assert len(chunked_dst_kv_indice) == len(kv_indices)
+            peer_name = self._add_remote(req.agent_name, req.agent_metadata)
+            chunked_dst_kv_indice = req.dst_kv_indices[index_slice]
+            assert len(chunked_dst_kv_indice) == len(kv_indices)
 
-        notif = "_".join([str(req.room), "kv", str(chunk_id), str(int(is_last))])
-        kv_xfer_handle = self.send_kvcache(
-            peer_name,
-            kv_indices,
-            req.dst_kv_ptrs,
-            chunked_dst_kv_indice,
-            req.dst_gpu_id,
-            notif,
-        )
-        handles = [kv_xfer_handle]
-        # Only the last chunk we need to send the aux data.
-        if is_last:
-            assert aux_index is not None
-            aux_xfer_handle = self.send_aux(
+            notif = "_".join([str(req.room), "kv", str(chunk_id), str(int(is_last))])
+            kv_xfer_handle = self.send_kvcache(
                 peer_name,
-                aux_index,
-                req.dst_aux_ptrs,
-                req.dst_aux_index,
-                str(req.room) + "_aux",
+                kv_indices,
+                req.dst_kv_ptrs,
+                chunked_dst_kv_indice,
+                req.dst_gpu_id,
+                notif,
             )
-            handles.append(aux_xfer_handle)
+            handles.append(kv_xfer_handle)
+            # Only the last chunk we need to send the aux data.
+            if is_last:
+                assert aux_index is not None
+                aux_xfer_handle = self.send_aux(
+                    peer_name,
+                    aux_index,
+                    req.dst_aux_ptrs,
+                    req.dst_aux_index,
+                    str(req.room) + "_aux",
+                )
+                handles.append(aux_xfer_handle)
         return handles
 
     def update_transfer_status(self):
@@ -348,7 +301,7 @@ class NixlKVManager(BaseKVManager):
             room = int(components[0])
             if components[1] == "kv":
                 chunk_id = int(components[2])
-                is_last = bool(components[3])
+                is_last = bool(int(components[3]))
                 self.transfer_statuses[room].received_kvs.add(chunk_id)
                 if is_last:
                     self.transfer_statuses[room].num_kvs_expected = chunk_id + 1
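The one-line change above fixes how the is_last flag is parsed from the notification string: the components are strings, and bool() of any non-empty string is True, so a trailing "0" still evaluated truthy and non-final chunks were treated as final. Casting to int first restores the intended meaning, as this short illustration shows:

    # Why the int() cast matters: notification components are strings.
    components = "42_kv_3_0".split("_")   # room=42, kind=kv, chunk_id=3, is_last=0
    print(bool(components[3]))            # True  -- any non-empty string is truthy
    print(bool(int(components[3])))       # False -- the intended value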
@@ -360,34 +313,6 @@ class NixlKVManager(BaseKVManager):
             return False
         return self.transfer_statuses[room].is_done()
 
-    def _register_to_bootstrap(self):
-        """Register KVSender to bootstrap server via HTTP POST."""
-        if self.dist_init_addr:
-            ip_address = socket.gethostbyname(self.dist_init_addr.split(":")[0])
-        else:
-            ip_address = get_ip()
-
-        bootstrap_server_url = f"{ip_address}:{self.bootstrap_port}"
-        url = f"http://{bootstrap_server_url}/route"
-        payload = {
-            "role": "Prefill",
-            "rank_ip": get_local_ip_by_remote(),
-            "rank_port": self.rank_port,
-            "engine_rank": self.kv_args.engine_rank,
-            "agent_name": self.agent.name,
-        }
-
-        try:
-            response = requests.put(url, json=payload)
-            if response.status_code == 200:
-                logger.debug("Prefill successfully registered to bootstrap server.")
-            else:
-                logger.error(
-                    f"Prefill Failed to connect to bootstrap server: {response.status_code}, {response.text}"
-                )
-        except Exception as e:
-            logger.error(f"Prefill Failed to register to bootstrap server: {e}")
-
     def _start_bootstrap_thread(self):
         self.server_socket.bind(f"tcp://{get_local_ip_by_remote()}:{self.rank_port}")
 
@@ -403,19 +328,34 @@ class NixlKVManager(BaseKVManager):
                 ), f"First message should be {GUARD}. Foreign traffic?"
                 waiting_req_bytes = waiting_req_bytes[1:]
                 room = waiting_req_bytes[0].decode("ascii")
-                if room == "None":
-                    continue
+
+                required_dst_info_num = int(waiting_req_bytes[10].decode("ascii"))
                 room = int(room)
-                with self.condition:
-                    self.transfer_infos[room] = TransferInfo.from_zmq(waiting_req_bytes)
-                    self.condition.notify_all()
+                agent_name = waiting_req_bytes[4].decode("ascii")
+                if room not in self.transfer_infos:
+                    self.transfer_infos[room] = {}
+                self.transfer_infos[room][agent_name] = TransferInfo.from_zmq(
+                    waiting_req_bytes
+                )
+
+                logger.debug(f"got info {room=} {agent_name=} {required_dst_info_num=}")
+                if len(self.transfer_infos[room]) == required_dst_info_num:
+                    logger.debug(f"{room=} is bootstrapped")
+                    self.update_status(room, KVPoll.WaitingForInput)
 
         threading.Thread(target=bootstrap_thread).start()
 
 
 class NixlKVSender(BaseKVSender):
 
-    def __init__(self, mgr: NixlKVManager, bootstrap_addr: str, bootstrap_room: int):
+    def __init__(
+        self,
+        mgr: NixlKVManager,
+        bootstrap_addr: str,
+        bootstrap_room: int,
+        dest_tp_ranks: List[int],
+        pp_rank: int,
+    ):
         self.kv_mgr = mgr
         self.bootstrap_room = bootstrap_room
         self.aux_index = None
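In the hunk above, the bootstrap thread now files one TransferInfo per remote agent and only marks a room as WaitingForInput once required_dst_info_num distinct agents have registered. The gating pattern in isolation, with stand-in names rather than the package API:

    # Gating pattern used above: a room becomes ready only after every expected
    # destination rank has sent its TransferInfo. Names here are stand-ins.
    from collections import defaultdict

    transfer_infos = defaultdict(dict)   # room -> {agent_name: info}

    def on_bootstrap_message(room, agent_name, info, required_dst_info_num):
        transfer_infos[room][agent_name] = info
        if len(transfer_infos[room]) == required_dst_info_num:
            return "WaitingForInput"     # all peers registered; room is bootstrapped
        return "Bootstrapping"

    print(on_bootstrap_message(7, "agent-a", object(), 2))  # Bootstrapping
    print(on_bootstrap_message(7, "agent-b", object(), 2))  # WaitingForInput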
@@ -423,6 +363,9 @@ class NixlKVSender(BaseKVSender):
         self.xfer_handles = []
         self.has_sent = False
         self.chunk_id = 0
+        self.kv_mgr.update_status(self.bootstrap_room, KVPoll.Bootstrapping)
+        # inner state
+        self.curr_idx = 0
 
     def init(self, num_kv_indices: int, aux_index: Optional[int] = None):
         self.num_kv_indices = num_kv_indices
@@ -430,10 +373,12 @@ class NixlKVSender(BaseKVSender):
         self.aux_index = aux_index
 
     def send(
         self,
-        kv_indices: npt.NDArray[np.int64],
-        index_slice: slice,
-        is_last: bool,
+        kv_indices: npt.NDArray[np.int32],
     ):
+        index_slice = slice(self.curr_idx, self.curr_idx + len(kv_indices))
+        self.curr_idx += len(kv_indices)
+        is_last = self.curr_idx == self.num_kv_indices
+
         new_xfer_handles = self.kv_mgr.add_transfer_request(
             self.bootstrap_room,
@@ -449,7 +394,7 @@ class NixlKVSender(BaseKVSender):
 
     def poll(self) -> KVPoll:
         if not self.has_sent:
-            return KVPoll.WaitingForInput  # type: ignore
+            return self.kv_mgr.check_status(self.bootstrap_room)
         states = [self.kv_mgr.agent.check_xfer_state(x) for x in self.xfer_handles]
         if all([x == "DONE" for x in states]):
             return KVPoll.Success  # type: ignore
@@ -461,128 +406,28 @@ class NixlKVSender(BaseKVSender):
461
406
  raise Exception("Fake KVSender Exception")
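With this change `send()` no longer receives `index_slice` and `is_last` from the caller; both are derived from the sender's running offset `curr_idx` against the `num_kv_indices` given to `init()`, and `poll()` reflects the bootstrap status until the first chunk has been sent. A minimal usage sketch under that contract; the chunk size, variable names, and the polling loop are illustrative assumptions, not code from this package:

    # Assumes `sender` is a NixlKVSender created on the prefill side, `kv_indices`
    # the full int32 index array for one request, and `aux_idx` its aux slot.
    sender.init(num_kv_indices=len(kv_indices), aux_index=aux_idx)
    for start in range(0, len(kv_indices), 256):     # arbitrary chunk size
        sender.send(kv_indices[start : start + 256]) # offsets are tracked internally
    while sender.poll() != KVPoll.Success:           # Bootstrapping / WaitingForInput
        pass                                         # until every NIXL handle is DONE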
 
 
-class NixlKVReceiver(BaseKVReceiver):
-
+class NixlKVReceiver(CommonKVReceiver):
     def __init__(
         self,
         mgr: NixlKVManager,
         bootstrap_addr: str,
         bootstrap_room: Optional[int] = None,
+        data_parallel_rank: Optional[int] = None,
     ):
-        self.bootstrap_room = bootstrap_room
-        self.bootstrap_addr = bootstrap_addr
-        self.kv_mgr = mgr
         self.started_transfer = False
+        super().__init__(mgr, bootstrap_addr, bootstrap_room, data_parallel_rank)
 
-        # NOTE: key distinguished by bootstrap_addr and engine_rank
-        bootstrap_key = f"{self.bootstrap_addr}_{self.kv_mgr.kv_args.engine_rank}"
-
-        if bootstrap_key not in self.kv_mgr.prefill_peer_infos:
-            self.bootstrap_info = self._get_bootstrap_info_from_server(
-                self.kv_mgr.kv_args.engine_rank
+    def init(self, kv_indices: npt.NDArray[np.int32], aux_index: Optional[int] = None):
+        for bootstrap_info in self.bootstrap_infos:
+            self.prefill_server_url = (
+                f"{bootstrap_info['rank_ip']}:{bootstrap_info['rank_port']}"
             )
-            if self.bootstrap_info is None:
-                logger.error(
-                    f"Could not fetch bootstrap info for engine rank: {self.kv_mgr.kv_args.engine_rank}"
-                )
-            else:
-                self.kv_mgr.prefill_peer_infos[bootstrap_key] = self.bootstrap_info
-        else:
-            self.bootstrap_info = self.kv_mgr.prefill_peer_infos[bootstrap_key]
-        assert self.bootstrap_info is not None
-
-    # return a list of remotes in a dict, [(remote_engine_rank -> NixlEngineInfo), ...]
-    # In each dict, there are multiple possible remotes named "equal sources".
-    # We only need to select one to split the traffic. i.e. we totally select len(list) remotes.
-    def _get_bootstrap_info_from_server(
-        self, engine_rank
-    ) -> Optional[List[Dict[int, NixlEngineInfo]]]:
-        """Fetch the bootstrap info from the bootstrap server."""
-        try:
-            if self.kv_mgr.enable_dp_attention:
-                url = f"http://{self.bootstrap_addr}/route"
-                response = requests.get(url)
-                if response.status_code != 200:
-                    logger.error(
-                        f"Failed to get prefill server info: {response.status_code}, {response.text}"
-                    )
-                    return None
-
-                bootstrap_info = response.json()
-                assert isinstance(bootstrap_info, dict)
-                bootstrap_info = {int(k): v for k, v in bootstrap_info.items()}
-
-                # split out who need to send to this rank.
-                # currently for dpsk mla model, those ranks share the same latent cache.
-                # pick one as the real source
-
-                prefill_tp_size = len(bootstrap_info.keys())
-
-                assert (
-                    prefill_tp_size >= self.kv_mgr.tp_size_of_dp
-                ), f"Only support Prefill TP size >= Decode TP size of DP, now we have {prefill_tp_size} vs {self.kv_mgr.tp_size_of_dp}"
-
-                num_remote_tp_rank_we_managed = (
-                    prefill_tp_size // self.kv_mgr.tp_size_of_dp
-                )
-
-                # We handle [num * self.attn_tp_rank, num * self.attn_tp_rank + num)
-                remote_tp_ranks = list(range(0, prefill_tp_size))
-                # split it into tp_size_of_dp parts and get our part
-                remote_tp_ranks_grouped = [
-                    remote_tp_ranks[i : i + num_remote_tp_rank_we_managed]
-                    for i in range(0, prefill_tp_size, self.kv_mgr.tp_size_of_dp)
-                ]
-                managed_ranks = remote_tp_ranks_grouped[self.kv_mgr.attn_tp_rank]
-
-                assert len(managed_ranks) == num_remote_tp_rank_we_managed
-
-                logger.debug(
-                    f"Rank {self.kv_mgr.kv_args.engine_rank} source can be {managed_ranks}"
-                )
-
-                return [
-                    {
-                        rk: bootstrap_info[rk]
-                        for rk in bootstrap_info.keys()
-                        if rk in managed_ranks
-                    }
-                ]
-            else:
-                url = f"http://{self.bootstrap_addr}/route?engine_rank={engine_rank}"
-                response = requests.get(url)
-                if response.status_code == 200:
-                    bootstrap_info = response.json()
-                    return [{engine_rank: bootstrap_info}]
-                else:
-                    logger.error(
-                        f"Failed to get prefill server info: {response.status_code}, {response.text}"
-                    )
-                    return None
-        except Exception as e:
-            logger.error(f"Error fetching prefill info from bootstrap: {e}")
-            return None
-
-    @cache
-    def _connect(self, endpoint: str):
-        socket = zmq.Context().socket(zmq.PUSH)
-        socket.connect(endpoint)
-        return socket
-
-    def init(self, kv_indices: npt.NDArray[np.int64], aux_index: Optional[int] = None):
-
-        assert self.bootstrap_info is not None
-        assert self.bootstrap_room is not None
-
-        for equal_sources in self.bootstrap_info:
-            remote_rank = list(equal_sources.keys())[
-                self.bootstrap_room % len(equal_sources)
-            ]
-            self.prefill_server_url = f"{equal_sources[remote_rank]['rank_ip']}:{equal_sources[remote_rank]['rank_port']}"
             logger.debug(
-                f"Fetched bootstrap info for engine rank: {self.kv_mgr.kv_args.engine_rank}, source: {remote_rank}, all: {list(equal_sources.keys())}"
+                f"Fetched bootstrap info: {bootstrap_info} for engine rank: {self.kv_mgr.kv_args.engine_rank}"
             )
+            is_dummy = bootstrap_info["is_dummy"]
 
+            # TODO: send_kv_args earlier
             packed_kv_data_ptrs = b"".join(
                 struct.pack("Q", ptr) for ptr in self.kv_mgr.kv_args.kv_data_ptrs
             )
@@ -593,30 +438,22 @@ class NixlKVReceiver(BaseKVReceiver):
             logger.debug(
                 f"Sending to {self.prefill_server_url} with bootstrap room {self.bootstrap_room}"
            )
-            self._connect("tcp://" + self.prefill_server_url).send_multipart(
-                [
-                    GUARD,
-                    str(self.bootstrap_room).encode("ascii"),
-                    get_local_ip_by_remote().encode("ascii"),
-                    str(self.kv_mgr.rank_port).encode("ascii"),
-                    self.kv_mgr.agent.get_agent_metadata(),
-                    packed_kv_data_ptrs,
-                    kv_indices.tobytes(),
-                    packed_aux_data_ptrs,
-                    str(aux_index).encode("ascii"),
-                    str(self.kv_mgr.kv_args.gpu_id).encode("ascii"),
-                ]
-            )
-
-            for dummy_rank in equal_sources.keys():
-                if dummy_rank == remote_rank:
-                    continue
-                dummy_info = equal_sources[dummy_rank]
-                dummy_url = f"{dummy_info['rank_ip']}:{dummy_info['rank_port']}"
-                self._connect("tcp://" + dummy_url).send_multipart(
+            sock, lock = self._connect("tcp://" + self.prefill_server_url)
+            with lock:
+                sock.send_multipart(
                     [
                         GUARD,
                         str(self.bootstrap_room).encode("ascii"),
+                        get_local_ip_by_remote().encode("ascii"),
+                        str(self.kv_mgr.rank_port).encode("ascii"),
+                        self.kv_mgr.agent.get_agent_metadata(),
+                        self.kv_mgr.agent.name.encode("ascii"),
+                        packed_kv_data_ptrs,
+                        kv_indices.tobytes() if not is_dummy else b"",
+                        packed_aux_data_ptrs,
+                        str(aux_index).encode("ascii"),
+                        str(self.kv_mgr.kv_args.gpu_id).encode("ascii"),
+                        str(self.required_dst_info_num).encode("ascii"),
                     ]
                 )
 
@@ -632,152 +469,12 @@ class NixlKVReceiver(BaseKVReceiver):
             return KVPoll.Success  # type: ignore
         return KVPoll.WaitingForInput  # type: ignore
 
+    def _register_kv_args(self):
+        pass
+
     def failure_exception(self):
         raise Exception("Fake KVReceiver Exception")
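`init()` above sends the handshake to every prefill peer in `self.bootstrap_infos` (supplied by `CommonKVReceiver`): peers marked `is_dummy` receive an empty `kv_indices` frame, while the selected source gets the real indices, so each prefill rank can still account for the number of decode peers it expects via `required_dst_info_num`. A rough decode-side sketch of the resulting call sequence; the constructor arguments, `mgr`, `room_id`, `aux_idx`, and the waiting loop are assumptions for illustration only:

    # `kv_indices` is assumed to be the int32 page-index array prepared by the
    # decode scheduler for this request.
    receiver = NixlKVReceiver(mgr, bootstrap_addr="10.0.0.1:8998", bootstrap_room=room_id)
    receiver.init(kv_indices, aux_index=aux_idx)
    while receiver.poll() != KVPoll.Success:
        pass  # the prefill side drives the NIXL writes; the receiver only waits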
 
 
-class NixlKVBootstrapServer(BaseKVBootstrapServer):
-    def __init__(self, port: int):
-        logger.debug(f"NixlKVBootstrapServer started on port {port}")
-        self.port = port
-        self.app = web.Application()
-        self.store = dict()
-        self.lock = asyncio.Lock()
-        self._setup_routes()
-        self.prefill_port_table: Dict[int, Dict[str, Union[str, int]]] = {}
-
-        # Start bootstrap server
-        self.thread = threading.Thread(target=self._run_server, daemon=True)
-        self.run()
-
-    def run(self):
-        self.thread.start()
-
-    def _setup_routes(self):
-        self.app.router.add_route("*", "/metadata", self._handle_metadata)
-        self.app.router.add_route("*", "/route", self._handle_route)
-
-    async def _handle_metadata(self, request: web.Request):
-        key = request.query.get("key", "")
-
-        if request.method == "GET":
-            return await self._handle_metadata_get(key)
-        elif request.method == "PUT":
-            return await self._handle_metadata_put(key, request)
-        elif request.method == "DELETE":
-            return await self._handle_metadata_delete(key)
-        return web.Response(
-            text="Method not allowed", status=405, content_type="application/json"
-        )
-
-    async def _handle_metadata_get(self, key):
-        async with self.lock:
-            value = self.store.get(key)
-        if value is None:
-            return web.Response(
-                text="metadata not found", status=404, content_type="application/json"
-            )
-        return web.Response(body=value, status=200, content_type="application/json")
-
-    async def _handle_metadata_put(self, key, request):
-        data = await request.read()
-        async with self.lock:
-            self.store[key] = data
-        return web.Response(
-            text="metadata updated", status=200, content_type="application/json"
-        )
-
-    async def _handle_metadata_delete(self, key):
-        async with self.lock:
-            if key not in self.store:
-                return web.Response(
-                    text="metadata not found",
-                    status=404,
-                    content_type="application/json",
-                )
-            del self.store[key]
-        return web.Response(
-            text="metadata deleted", status=200, content_type="application/json"
        )
-
-    async def _handle_route(self, request: web.Request):
-        method = request.method
-        if method == "PUT":
-            return await self._handle_route_put(request)
-        elif method == "GET":
-            return await self._handle_route_get(request)
-        else:
-            return web.Response(
-                text="Method not allowed", status=405, content_type="application/json"
-            )
-
-    async def _handle_route_put(self, request: web.Request):
-        data = await request.json()
-        role = data["role"]
-        rank_ip = data["rank_ip"]
-        rank_port = int(data["rank_port"])
-        engine_rank = int(data["engine_rank"])
-        agent_name = data["agent_name"]
-
-        if role == "Prefill":
-            async with self.lock:
-                self.prefill_port_table[engine_rank] = {
-                    "rank_ip": rank_ip,
-                    "rank_port": rank_port,
-                    "agent_name": agent_name,
-                }
-            logger.info(
-                f"Registered Prefill boostrap: {engine_rank} with rank_ip: {rank_ip} and rank_port: {rank_port} and name: {agent_name}"
-            )
-
-        return web.Response(text="OK", status=200)
-
-    async def _handle_route_get(self, request: web.Request):
-        engine_rank = request.query.get("engine_rank")
-        if not engine_rank:
-            logger.debug(
-                f"No engine_rank specified, return all {len(self.prefill_port_table)} engine infos as a dict"
-            )
-            # Return a dict of all engine_rank
-            async with self.lock:
-                bootstrap_info = self.prefill_port_table
-            return web.json_response(bootstrap_info, status=200)
-
-        # Find corresponding prefill info
-        async with self.lock:
-            bootstrap_info = self.prefill_port_table.get(int(engine_rank))
-        if bootstrap_info is not None:
-            return web.json_response(bootstrap_info, status=200)
-        else:
-            return web.Response(text="Not Found", status=404)
-
-    def _run_server(self):
-        try:
-            # Event Loop
-            self._loop = asyncio.new_event_loop()
-            asyncio.set_event_loop(self._loop)
-
-            self._runner = web.AppRunner(self.app)
-            self._loop.run_until_complete(self._runner.setup())
-
-            site = web.TCPSite(self._runner, port=self.port)
-            self._loop.run_until_complete(site.start())
-            self._loop.run_forever()
-        except Exception as e:
-            logger.error(f"Server error: {str(e)}")
-        finally:
-            # Cleanup
-            self._loop.run_until_complete(self._runner.cleanup())
-            self._loop.close()
-
-    def close(self):
-        """Shutdown"""
-        if self._loop is not None and self._loop.is_running():
-            self._loop.call_soon_threadsafe(self._loop.stop)
-            logger.info("Stopping server loop...")
-
-        if self.thread.is_alive():
-            self.thread.join(timeout=2)
-            logger.info("Server thread stopped")
-
-    def poll(self) -> KVPoll: ...
+class NixlKVBootstrapServer(CommonKVBootstrapServer):
+    pass