sglang 0.4.6.post4__py3-none-any.whl → 0.4.7__py3-none-any.whl
This diff shows the changes between two publicly released versions of the package, as published to a supported registry. The information in this diff is provided for informational purposes only and reflects the package versions as they appear in their respective public registries.
- sglang/bench_offline_throughput.py +16 -10
- sglang/bench_one_batch.py +5 -4
- sglang/bench_one_batch_server.py +86 -22
- sglang/bench_serving.py +197 -110
- sglang/compile_deep_gemm.py +4 -4
- sglang/lang/backend/runtime_endpoint.py +24 -1
- sglang/profiler.py +167 -0
- sglang/srt/_custom_ops.py +34 -0
- sglang/srt/configs/internvl.py +8 -12
- sglang/srt/configs/model_config.py +66 -29
- sglang/srt/constrained/base_grammar_backend.py +5 -2
- sglang/srt/constrained/llguidance_backend.py +9 -8
- sglang/srt/constrained/outlines_backend.py +5 -4
- sglang/srt/constrained/xgrammar_backend.py +18 -18
- sglang/srt/conversation.py +47 -9
- sglang/srt/custom_op.py +38 -3
- sglang/srt/debug_utils.py +74 -0
- sglang/srt/disaggregation/common/__init__.py +1 -0
- sglang/srt/disaggregation/common/conn.py +407 -0
- sglang/srt/disaggregation/decode.py +187 -134
- sglang/srt/disaggregation/decode_schedule_batch_mixin.py +142 -0
- sglang/srt/disaggregation/fake/conn.py +4 -13
- sglang/srt/disaggregation/kv_events.py +412 -0
- sglang/srt/disaggregation/launch_lb.py +140 -0
- sglang/srt/disaggregation/mini_lb.py +84 -70
- sglang/srt/disaggregation/mooncake/conn.py +441 -140
- sglang/srt/disaggregation/mooncake/transfer_engine.py +31 -14
- sglang/srt/disaggregation/nixl/conn.py +124 -442
- sglang/srt/disaggregation/prefill.py +128 -44
- sglang/srt/disaggregation/utils.py +154 -6
- sglang/srt/distributed/device_communicators/pymscclpp.py +315 -0
- sglang/srt/distributed/parallel_state.py +52 -5
- sglang/srt/distributed/utils.py +3 -3
- sglang/srt/entrypoints/EngineBase.py +11 -0
- sglang/srt/entrypoints/engine.py +129 -12
- sglang/srt/entrypoints/http_server.py +21 -6
- sglang/srt/entrypoints/http_server_engine.py +5 -2
- sglang/srt/function_call/base_format_detector.py +302 -0
- sglang/srt/function_call/core_types.py +34 -0
- sglang/srt/function_call/deepseekv3_detector.py +205 -0
- sglang/srt/function_call/ebnf_composer.py +248 -0
- sglang/srt/function_call/function_call_parser.py +202 -0
- sglang/srt/function_call/llama32_detector.py +93 -0
- sglang/srt/function_call/mistral_detector.py +131 -0
- sglang/srt/function_call/pythonic_detector.py +229 -0
- sglang/srt/function_call/qwen25_detector.py +121 -0
- sglang/srt/function_call/utils.py +52 -0
- sglang/srt/hf_transformers_utils.py +50 -7
- sglang/srt/layers/attention/aiter_backend.py +878 -0
- sglang/srt/layers/attention/base_attn_backend.py +4 -0
- sglang/srt/layers/attention/cutlass_mla_backend.py +2 -19
- sglang/srt/layers/attention/flashattention_backend.py +166 -35
- sglang/srt/layers/attention/flashinfer_backend.py +45 -1
- sglang/srt/layers/attention/flashinfer_mla_backend.py +45 -5
- sglang/srt/layers/attention/flashmla_backend.py +340 -78
- sglang/srt/layers/attention/intel_amx_backend.py +128 -0
- sglang/srt/layers/attention/tbo_backend.py +232 -0
- sglang/srt/layers/attention/torch_native_backend.py +3 -0
- sglang/srt/layers/attention/triton_backend.py +247 -5
- sglang/srt/layers/attention/triton_ops/extend_attention.py +12 -4
- sglang/srt/layers/attention/utils.py +2 -2
- sglang/srt/layers/attention/vision.py +1 -1
- sglang/srt/layers/communicator.py +517 -0
- sglang/srt/layers/dp_attention.py +6 -15
- sglang/srt/layers/layernorm.py +30 -19
- sglang/srt/layers/moe/cutlass_moe.py +370 -0
- sglang/srt/layers/moe/cutlass_moe_params.py +169 -0
- sglang/srt/layers/moe/ep_moe/kernels.py +60 -17
- sglang/srt/layers/moe/ep_moe/layer.py +195 -87
- sglang/srt/layers/moe/ep_moe/token_dispatcher.py +88 -8
- sglang/srt/layers/moe/fused_moe_native.py +4 -0
- sglang/srt/layers/moe/fused_moe_triton/configs/triton_3_2_0/E=257,N=128,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8,block_shape=[128, 128].json +146 -0
- sglang/srt/layers/moe/fused_moe_triton/configs/triton_3_2_0/E=257,N=128,device_name=NVIDIA_H20,dtype=fp8_w8a8,block_shape=[128, 128].json +146 -0
- sglang/srt/layers/moe/fused_moe_triton/configs/triton_3_2_0/E=257,N=256,device_name=NVIDIA_B200,dtype=fp8_w8a8,block_shape=[128, 128].json +146 -0
- sglang/srt/layers/moe/fused_moe_triton/configs/triton_3_2_0/E=257,N=256,device_name=NVIDIA_H20,dtype=fp8_w8a8,block_shape=[128, 128].json +146 -0
- sglang/srt/layers/moe/fused_moe_triton/configs/triton_3_2_0/E=257,N=256,device_name=NVIDIA_H200,dtype=fp8_w8a8,block_shape=[128, 128].json +146 -0
- sglang/srt/layers/moe/fused_moe_triton/configs/triton_3_2_0/E=8,N=7168,device_name=NVIDIA_H100_80GB_HBM3.json +146 -0
- sglang/srt/layers/moe/fused_moe_triton/configs/triton_3_3_1/E=257,N=256,device_name=NVIDIA_B200,dtype=fp8_w8a8,block_shape=[128, 128].json +146 -0
- sglang/srt/layers/moe/fused_moe_triton/configs/triton_3_3_1/E=8,N=7168,device_name=NVIDIA_H100_80GB_HBM3.json +146 -0
- sglang/srt/layers/moe/fused_moe_triton/fused_moe.py +220 -25
- sglang/srt/layers/moe/fused_moe_triton/layer.py +48 -4
- sglang/srt/layers/moe/topk.py +107 -24
- sglang/srt/layers/multimodal.py +70 -0
- sglang/srt/layers/quantization/__init__.py +10 -4
- sglang/srt/layers/quantization/blockwise_int8.py +3 -0
- sglang/srt/layers/quantization/compressed_tensors/compressed_tensors_moe.py +5 -0
- sglang/srt/layers/quantization/deep_gemm.py +60 -59
- sglang/srt/layers/quantization/fp8.py +113 -18
- sglang/srt/layers/quantization/fp8_kernel.py +118 -66
- sglang/srt/layers/quantization/fp8_utils.py +165 -43
- sglang/srt/layers/quantization/gptq.py +298 -6
- sglang/srt/layers/quantization/int8_kernel.py +18 -5
- sglang/srt/layers/quantization/modelopt_quant.py +334 -7
- sglang/srt/layers/quantization/moe_wna16.py +3 -0
- sglang/srt/layers/quantization/qoq.py +244 -0
- sglang/srt/layers/quantization/w8a8_fp8.py +3 -0
- sglang/srt/layers/quantization/w8a8_int8.py +3 -0
- sglang/srt/layers/rotary_embedding.py +6 -12
- sglang/srt/layers/sampler.py +80 -79
- sglang/srt/layers/utils.py +6 -0
- sglang/srt/lora/layers.py +12 -15
- sglang/srt/lora/lora.py +49 -5
- sglang/srt/lora/lora_manager.py +20 -8
- sglang/srt/lora/mem_pool.py +24 -16
- sglang/srt/lora/utils.py +17 -13
- sglang/srt/managers/data_parallel_controller.py +13 -5
- sglang/srt/managers/eplb_algorithms/__init__.py +63 -0
- sglang/srt/managers/eplb_algorithms/deepseek.py +223 -0
- sglang/srt/managers/eplb_algorithms/deepseek_vec.py +276 -0
- sglang/srt/managers/eplb_manager.py +96 -0
- sglang/srt/managers/expert_distribution.py +878 -56
- sglang/srt/managers/expert_location.py +448 -0
- sglang/srt/managers/expert_location_dispatch.py +108 -0
- sglang/srt/managers/io_struct.py +29 -5
- sglang/srt/managers/mm_utils.py +355 -151
- sglang/srt/managers/multimodal_processors/base_processor.py +299 -42
- sglang/srt/managers/multimodal_processors/deepseek_vl_v2.py +6 -1
- sglang/srt/managers/multimodal_processors/gemma3.py +15 -17
- sglang/srt/managers/multimodal_processors/internvl.py +18 -5
- sglang/srt/managers/multimodal_processors/janus_pro.py +7 -1
- sglang/srt/managers/multimodal_processors/kimi_vl.py +14 -32
- sglang/srt/managers/multimodal_processors/llava.py +3 -3
- sglang/srt/managers/multimodal_processors/minicpm.py +27 -32
- sglang/srt/managers/multimodal_processors/mllama4.py +6 -0
- sglang/srt/managers/multimodal_processors/phi4mm.py +87 -0
- sglang/srt/managers/multimodal_processors/pixtral.py +9 -9
- sglang/srt/managers/multimodal_processors/qwen_vl.py +35 -35
- sglang/srt/managers/schedule_batch.py +185 -55
- sglang/srt/managers/schedule_policy.py +4 -5
- sglang/srt/managers/scheduler.py +389 -154
- sglang/srt/managers/session_controller.py +1 -1
- sglang/srt/managers/tokenizer_manager.py +231 -39
- sglang/srt/managers/utils.py +0 -4
- sglang/srt/mem_cache/base_prefix_cache.py +3 -0
- sglang/srt/mem_cache/chunk_cache.py +3 -1
- sglang/srt/mem_cache/hiradix_cache.py +4 -4
- sglang/srt/mem_cache/memory_pool.py +74 -52
- sglang/srt/mem_cache/multimodal_cache.py +45 -0
- sglang/srt/mem_cache/radix_cache.py +58 -5
- sglang/srt/metrics/collector.py +11 -2
- sglang/srt/mm_utils.py +10 -0
- sglang/srt/model_executor/cuda_graph_runner.py +87 -65
- sglang/srt/model_executor/expert_location_updater.py +557 -0
- sglang/srt/model_executor/forward_batch_info.py +39 -14
- sglang/srt/model_executor/model_runner.py +231 -101
- sglang/srt/model_loader/loader.py +10 -6
- sglang/srt/model_loader/utils.py +67 -1
- sglang/srt/models/clip.py +5 -1
- sglang/srt/models/deepseek_nextn.py +1 -1
- sglang/srt/models/deepseek_v2.py +732 -403
- sglang/srt/models/exaone.py +8 -3
- sglang/srt/models/gemma3_causal.py +7 -0
- sglang/srt/models/gemma3_mm.py +75 -33
- sglang/srt/models/idefics2.py +342 -0
- sglang/srt/models/kimi_vl.py +4 -4
- sglang/srt/models/llama.py +1 -1
- sglang/srt/models/llama4.py +10 -2
- sglang/srt/models/llava.py +26 -18
- sglang/srt/models/mimo_mtp.py +220 -0
- sglang/srt/models/minicpmo.py +7 -17
- sglang/srt/models/minicpmv.py +3 -295
- sglang/srt/models/mistral.py +71 -1
- sglang/srt/models/mllama.py +3 -3
- sglang/srt/models/phi4mm.py +512 -0
- sglang/srt/models/qwen2.py +133 -35
- sglang/srt/models/qwen2_5_vl.py +5 -3
- sglang/srt/models/qwen2_eagle.py +4 -1
- sglang/srt/models/qwen2_moe.py +206 -69
- sglang/srt/models/qwen2_vl.py +3 -3
- sglang/srt/models/qwen3.py +92 -19
- sglang/srt/models/qwen3_moe.py +457 -55
- sglang/srt/models/registry.py +9 -1
- sglang/srt/models/siglip.py +294 -0
- sglang/srt/models/transformers.py +291 -0
- sglang/srt/openai_api/adapter.py +114 -40
- sglang/srt/openai_api/protocol.py +37 -2
- sglang/srt/openai_api/utils.py +172 -0
- sglang/srt/operations.py +189 -0
- sglang/srt/operations_strategy.py +207 -0
- sglang/srt/sampling/sampling_batch_info.py +13 -1
- sglang/srt/sampling/sampling_params.py +2 -1
- sglang/srt/server_args.py +235 -38
- sglang/srt/speculative/build_eagle_tree.py +8 -8
- sglang/srt/speculative/eagle_draft_cuda_graph_runner.py +8 -11
- sglang/srt/speculative/eagle_draft_extend_cuda_graph_runner.py +253 -0
- sglang/srt/speculative/eagle_utils.py +181 -90
- sglang/srt/speculative/eagle_worker.py +146 -21
- sglang/srt/two_batch_overlap.py +635 -0
- sglang/srt/utils.py +197 -19
- sglang/test/runners.py +16 -7
- sglang/test/send_one.py +4 -0
- sglang/test/test_cutlass_moe.py +278 -0
- sglang/test/test_fp4_moe.py +248 -0
- sglang/test/test_utils.py +81 -42
- sglang/utils.py +2 -2
- sglang/version.py +1 -1
- {sglang-0.4.6.post4.dist-info → sglang-0.4.7.dist-info}/METADATA +31 -19
- sglang-0.4.7.dist-info/RECORD +699 -0
- {sglang-0.4.6.post4.dist-info → sglang-0.4.7.dist-info}/WHEEL +1 -1
- sglang/srt/function_call_parser.py +0 -858
- sglang/srt/platforms/interface.py +0 -371
- sglang-0.4.6.post4.dist-info/RECORD +0 -646
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=1,N=14336,device_name=NVIDIA_A100-SXM4-80GB,dtype=int8_w8a16.json → triton_3_1_0/E=1,N=14336,device_name=NVIDIA_A100-SXM4-80GB,dtype=int8_w8a16.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=1,N=14336,device_name=NVIDIA_A100-SXM4-80GB.json → triton_3_1_0/E=1,N=14336,device_name=NVIDIA_A100-SXM4-80GB.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=1,N=1792,device_name=NVIDIA_A100-SXM4-80GB,dtype=int8_w8a16.json → triton_3_1_0/E=1,N=1792,device_name=NVIDIA_A100-SXM4-80GB,dtype=int8_w8a16.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=1,N=1792,device_name=NVIDIA_A100-SXM4-80GB.json → triton_3_1_0/E=1,N=1792,device_name=NVIDIA_A100-SXM4-80GB.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=1,N=3072,device_name=NVIDIA_A100-SXM4-80GB,dtype=int8_w8a16.json → triton_3_1_0/E=1,N=3072,device_name=NVIDIA_A100-SXM4-80GB,dtype=int8_w8a16.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=1,N=3072,device_name=NVIDIA_H100_80GB_HBM3,dtype=int8_w8a16.json → triton_3_1_0/E=1,N=3072,device_name=NVIDIA_H100_80GB_HBM3,dtype=int8_w8a16.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=1,N=3072,device_name=NVIDIA_H100_80GB_HBM3.json → triton_3_1_0/E=1,N=3072,device_name=NVIDIA_H100_80GB_HBM3.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=1,N=3584,device_name=NVIDIA_A100-SXM4-80GB,dtype=int8_w8a16.json → triton_3_1_0/E=1,N=3584,device_name=NVIDIA_A100-SXM4-80GB,dtype=int8_w8a16.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=1,N=3584,device_name=NVIDIA_A100-SXM4-80GB.json → triton_3_1_0/E=1,N=3584,device_name=NVIDIA_A100-SXM4-80GB.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=1,N=7168,device_name=NVIDIA_A100-SXM4-80GB,dtype=int8_w8a16.json → triton_3_1_0/E=1,N=7168,device_name=NVIDIA_A100-SXM4-80GB,dtype=int8_w8a16.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=1,N=7168,device_name=NVIDIA_A100-SXM4-80GB.json → triton_3_1_0/E=1,N=7168,device_name=NVIDIA_A100-SXM4-80GB.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=144,N=512,device_name=NVIDIA_H100_80GB_HBM3.json → triton_3_1_0/E=144,N=512,device_name=NVIDIA_H100_80GB_HBM3.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=16,N=1024,device_name=NVIDIA_H100_80GB_HBM3.json → triton_3_1_0/E=16,N=1024,device_name=NVIDIA_H100_80GB_HBM3.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=16,N=1024,device_name=NVIDIA_H200.json → triton_3_1_0/E=16,N=1024,device_name=NVIDIA_H200.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=16,N=1344,device_name=NVIDIA_A100-SXM4-40GB.json → triton_3_1_0/E=16,N=1344,device_name=NVIDIA_A100-SXM4-40GB.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=16,N=1344,device_name=NVIDIA_A100-SXM4-80GB.json → triton_3_1_0/E=16,N=1344,device_name=NVIDIA_A100-SXM4-80GB.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=16,N=1344,device_name=NVIDIA_H100_80GB_HBM3.json → triton_3_1_0/E=16,N=1344,device_name=NVIDIA_H100_80GB_HBM3.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=16,N=14336,device_name=NVIDIA_A100-SXM4-80GB,dtype=int8_w8a16.json → triton_3_1_0/E=16,N=14336,device_name=NVIDIA_A100-SXM4-80GB,dtype=int8_w8a16.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=16,N=14336,device_name=NVIDIA_A100-SXM4-80GB.json → triton_3_1_0/E=16,N=14336,device_name=NVIDIA_A100-SXM4-80GB.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=16,N=1792,device_name=NVIDIA_A100-SXM4-80GB,dtype=int8_w8a16.json → triton_3_1_0/E=16,N=1792,device_name=NVIDIA_A100-SXM4-80GB,dtype=int8_w8a16.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=16,N=1792,device_name=NVIDIA_A100-SXM4-80GB.json → triton_3_1_0/E=16,N=1792,device_name=NVIDIA_A100-SXM4-80GB.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=16,N=2048,device_name=NVIDIA_H100_80GB_HBM3.json → triton_3_1_0/E=16,N=2048,device_name=NVIDIA_H100_80GB_HBM3.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=16,N=2688,device_name=NVIDIA_A100-SXM4-80GB.json → triton_3_1_0/E=16,N=2688,device_name=NVIDIA_A100-SXM4-80GB.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=16,N=2688,device_name=NVIDIA_H100_80GB_HBM3.json → triton_3_1_0/E=16,N=2688,device_name=NVIDIA_H100_80GB_HBM3.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=16,N=3072,device_name=NVIDIA_A100-SXM4-80GB,dtype=int8_w8a16.json → triton_3_1_0/E=16,N=3072,device_name=NVIDIA_A100-SXM4-80GB,dtype=int8_w8a16.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=16,N=3072,device_name=NVIDIA_H100_80GB_HBM3,dtype=int8_w8a16.json → triton_3_1_0/E=16,N=3072,device_name=NVIDIA_H100_80GB_HBM3,dtype=int8_w8a16.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=16,N=3200,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8.json → triton_3_1_0/E=16,N=3200,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=16,N=3584,device_name=NVIDIA_A100-SXM4-80GB,dtype=int8_w8a16.json → triton_3_1_0/E=16,N=3584,device_name=NVIDIA_A100-SXM4-80GB,dtype=int8_w8a16.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=16,N=3584,device_name=NVIDIA_A100-SXM4-80GB.json → triton_3_1_0/E=16,N=3584,device_name=NVIDIA_A100-SXM4-80GB.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=16,N=6400,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8.json → triton_3_1_0/E=16,N=6400,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=16,N=7168,device_name=NVIDIA_A100-SXM4-80GB,dtype=int8_w8a16.json → triton_3_1_0/E=16,N=7168,device_name=NVIDIA_A100-SXM4-80GB,dtype=int8_w8a16.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=16,N=7168,device_name=NVIDIA_A100-SXM4-80GB.json → triton_3_1_0/E=16,N=7168,device_name=NVIDIA_A100-SXM4-80GB.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=16,N=7168,device_name=NVIDIA_H100_80GB_HBM3,dtype=int8_w8a16.json → triton_3_1_0/E=16,N=7168,device_name=NVIDIA_H100_80GB_HBM3,dtype=int8_w8a16.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=16,N=800,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8.json → triton_3_1_0/E=16,N=800,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=160,N=192,device_name=NVIDIA_A800-SXM4-80GB.json → triton_3_1_0/E=160,N=192,device_name=NVIDIA_A800-SXM4-80GB.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=20,N=2048,device_name=NVIDIA_H100_80GB_HBM3.json → triton_3_1_0/E=20,N=2048,device_name=NVIDIA_H100_80GB_HBM3.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=24,N=1024,device_name=NVIDIA_H100_80GB_HBM3.json → triton_3_1_0/E=24,N=1024,device_name=NVIDIA_H100_80GB_HBM3.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=256,N=128,device_name=NVIDIA_A100-SXM4-80GB,dtype=int8_w8a8,block_shape=[128, 128].json → triton_3_1_0/E=256,N=128,device_name=NVIDIA_A100-SXM4-80GB,dtype=int8_w8a8,block_shape=[128, 128].json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=256,N=128,device_name=NVIDIA_A100-SXM4-80GB,dtype=int8_w8a8.json → triton_3_1_0/E=256,N=128,device_name=NVIDIA_A100-SXM4-80GB,dtype=int8_w8a8.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=256,N=128,device_name=NVIDIA_A800-SXM4-80GB,dtype=int8_w8a8,block_shape=[128, 128].json → triton_3_1_0/E=256,N=128,device_name=NVIDIA_A800-SXM4-80GB,dtype=int8_w8a8,block_shape=[128, 128].json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=256,N=128,device_name=NVIDIA_A800-SXM4-80GB,dtype=int8_w8a8.json → triton_3_1_0/E=256,N=128,device_name=NVIDIA_A800-SXM4-80GB,dtype=int8_w8a8.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=256,N=128,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8,block_shape=[128, 128].json → triton_3_1_0/E=256,N=128,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8,block_shape=[128, 128].json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=256,N=128,device_name=NVIDIA_H20,block_shape=[128, 128].json → triton_3_1_0/E=256,N=128,device_name=NVIDIA_H20,block_shape=[128, 128].json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=256,N=128,device_name=NVIDIA_H20,dtype=fp8_w8a8,block_shape=[128, 128].json → triton_3_1_0/E=256,N=128,device_name=NVIDIA_H20,dtype=fp8_w8a8,block_shape=[128, 128].json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=256,N=128,device_name=NVIDIA_L20Y,dtype=fp8_w8a8,block_shape=[128, 128].json → triton_3_1_0/E=256,N=128,device_name=NVIDIA_L20Y,dtype=fp8_w8a8,block_shape=[128, 128].json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=256,N=256,device_name=AMD_Instinct_MI300X,dtype=fp8_w8a8,block_shape=[128, 128].json → triton_3_1_0/E=256,N=256,device_name=AMD_Instinct_MI300X,dtype=fp8_w8a8,block_shape=[128, 128].json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=256,N=256,device_name=AMD_Instinct_MI325X,dtype=fp8_w8a8,block_shape=[128, 128].json → triton_3_1_0/E=256,N=256,device_name=AMD_Instinct_MI325X,dtype=fp8_w8a8,block_shape=[128, 128].json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=256,N=256,device_name=AMD_Radeon_Graphics,dtype=fp8_w8a8,block_shape=[128, 128].json → triton_3_1_0/E=256,N=256,device_name=AMD_Radeon_Graphics,dtype=fp8_w8a8,block_shape=[128, 128].json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=256,N=256,device_name=NVIDIA_B200,dtype=fp8_w8a8,block_shape=[128, 128].json → triton_3_1_0/E=256,N=256,device_name=NVIDIA_B200,dtype=fp8_w8a8,block_shape=[128, 128].json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=256,N=256,device_name=NVIDIA_H20,dtype=int8_w8a8,block_shape=[128, 128].json → triton_3_1_0/E=256,N=256,device_name=NVIDIA_H20,dtype=int8_w8a8,block_shape=[128, 128].json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=256,N=256,device_name=NVIDIA_H200,dtype=fp8_w8a8,block_shape=[128, 128].json → triton_3_1_0/E=256,N=256,device_name=NVIDIA_H200,dtype=fp8_w8a8,block_shape=[128, 128].json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=256,N=64,device_name=NVIDIA_A800-SXM4-80GB.json → triton_3_1_0/E=256,N=64,device_name=NVIDIA_A800-SXM4-80GB.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=256,N=64,device_name=NVIDIA_L20,dtype=int8_w8a8.json → triton_3_1_0/E=256,N=64,device_name=NVIDIA_L20,dtype=int8_w8a8.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=256,N=64,device_name=NVIDIA_L40S,dtype=int8_w8a8.json → triton_3_1_0/E=256,N=64,device_name=NVIDIA_L40S,dtype=int8_w8a8.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=64,N=1024,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8,block_shape=[128, 128].json → triton_3_1_0/E=64,N=1024,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8,block_shape=[128, 128].json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=64,N=1280,device_name=NVIDIA_A100-SXM4-80GB.json → triton_3_1_0/E=64,N=1280,device_name=NVIDIA_A100-SXM4-80GB.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=64,N=1280,device_name=NVIDIA_A800-SXM4-80GB.json → triton_3_1_0/E=64,N=1280,device_name=NVIDIA_A800-SXM4-80GB.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=64,N=1280,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8.json → triton_3_1_0/E=64,N=1280,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=64,N=1280,device_name=NVIDIA_H100_80GB_HBM3.json → triton_3_1_0/E=64,N=1280,device_name=NVIDIA_H100_80GB_HBM3.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=64,N=1280,device_name=NVIDIA_H200,dtype=fp8_w8a8.json → triton_3_1_0/E=64,N=1280,device_name=NVIDIA_H200,dtype=fp8_w8a8.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=64,N=1280,device_name=NVIDIA_H200.json → triton_3_1_0/E=64,N=1280,device_name=NVIDIA_H200.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=64,N=2560,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8.json → triton_3_1_0/E=64,N=2560,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=64,N=2560,device_name=NVIDIA_H200,dtype=fp8_w8a8.json → triton_3_1_0/E=64,N=2560,device_name=NVIDIA_H200,dtype=fp8_w8a8.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=64,N=2560,device_name=NVIDIA_H200.json → triton_3_1_0/E=64,N=2560,device_name=NVIDIA_H200.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=64,N=320,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8.json → triton_3_1_0/E=64,N=320,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=64,N=320,device_name=NVIDIA_H100_80GB_HBM3.json → triton_3_1_0/E=64,N=320,device_name=NVIDIA_H100_80GB_HBM3.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=64,N=320,device_name=NVIDIA_H200,dtype=fp8_w8a8.json → triton_3_1_0/E=64,N=320,device_name=NVIDIA_H200,dtype=fp8_w8a8.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=64,N=320,device_name=NVIDIA_H200.json → triton_3_1_0/E=64,N=320,device_name=NVIDIA_H200.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=64,N=512,device_name=NVIDIA_H200,dtype=fp8_w8a8,block_shape=[128, 128].json → triton_3_1_0/E=64,N=512,device_name=NVIDIA_H200,dtype=fp8_w8a8,block_shape=[128, 128].json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=64,N=640,device_name=NVIDIA_A100-SXM4-80GB.json → triton_3_1_0/E=64,N=640,device_name=NVIDIA_A100-SXM4-80GB.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=64,N=640,device_name=NVIDIA_A800-SXM4-80GB.json → triton_3_1_0/E=64,N=640,device_name=NVIDIA_A800-SXM4-80GB.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=64,N=640,device_name=NVIDIA_GeForce_RTX_4090,dtype=fp8_w8a8.json → triton_3_1_0/E=64,N=640,device_name=NVIDIA_GeForce_RTX_4090,dtype=fp8_w8a8.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=64,N=640,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8.json → triton_3_1_0/E=64,N=640,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=64,N=640,device_name=NVIDIA_H100_80GB_HBM3.json → triton_3_1_0/E=64,N=640,device_name=NVIDIA_H100_80GB_HBM3.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=64,N=640,device_name=NVIDIA_H200,dtype=fp8_w8a8.json → triton_3_1_0/E=64,N=640,device_name=NVIDIA_H200,dtype=fp8_w8a8.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=64,N=640,device_name=NVIDIA_H200.json → triton_3_1_0/E=64,N=640,device_name=NVIDIA_H200.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=8,N=14336,device_name=AMD_Instinct_MI300X.json → triton_3_1_0/E=8,N=14336,device_name=AMD_Instinct_MI300X.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=8,N=14336,device_name=AMD_Instinct_MI325X.json → triton_3_1_0/E=8,N=14336,device_name=AMD_Instinct_MI325X.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=8,N=14336,device_name=AMD_Radeon_Graphics.json → triton_3_1_0/E=8,N=14336,device_name=AMD_Radeon_Graphics.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=8,N=14336,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8.json → triton_3_1_0/E=8,N=14336,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=8,N=14336,device_name=NVIDIA_H200,dtype=fp8_w8a8.json → triton_3_1_0/E=8,N=14336,device_name=NVIDIA_H200,dtype=fp8_w8a8.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=8,N=14336,device_name=NVIDIA_H200.json → triton_3_1_0/E=8,N=14336,device_name=NVIDIA_H200.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=8,N=1792,device_name=AMD_Instinct_MI300X.json → triton_3_1_0/E=8,N=1792,device_name=AMD_Instinct_MI300X.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=8,N=1792,device_name=AMD_Instinct_MI325X.json → triton_3_1_0/E=8,N=1792,device_name=AMD_Instinct_MI325X.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=8,N=1792,device_name=AMD_Radeon_Graphics.json → triton_3_1_0/E=8,N=1792,device_name=AMD_Radeon_Graphics.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=8,N=1792,device_name=NVIDIA_A100-SXM4-40GB.json → triton_3_1_0/E=8,N=1792,device_name=NVIDIA_A100-SXM4-40GB.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=8,N=1792,device_name=NVIDIA_A100-SXM4-80GB.json → triton_3_1_0/E=8,N=1792,device_name=NVIDIA_A100-SXM4-80GB.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=8,N=1792,device_name=NVIDIA_H100_80GB_HBM3.json → triton_3_1_0/E=8,N=1792,device_name=NVIDIA_H100_80GB_HBM3.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=8,N=1792,device_name=NVIDIA_H200,dtype=fp8_w8a8.json → triton_3_1_0/E=8,N=1792,device_name=NVIDIA_H200,dtype=fp8_w8a8.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=8,N=1792,device_name=NVIDIA_H200.json → triton_3_1_0/E=8,N=1792,device_name=NVIDIA_H200.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=8,N=2048,device_name=NVIDIA_A100-SXM4-80GB.json → triton_3_1_0/E=8,N=2048,device_name=NVIDIA_A100-SXM4-80GB.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=8,N=2048,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8.json → triton_3_1_0/E=8,N=2048,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=8,N=2048,device_name=NVIDIA_H100_80GB_HBM3.json → triton_3_1_0/E=8,N=2048,device_name=NVIDIA_H100_80GB_HBM3.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=8,N=2048,device_name=NVIDIA_H200,dtype=fp8_w8a8.json → triton_3_1_0/E=8,N=2048,device_name=NVIDIA_H200,dtype=fp8_w8a8.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=8,N=2048,device_name=NVIDIA_H200.json → triton_3_1_0/E=8,N=2048,device_name=NVIDIA_H200.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=8,N=3584,device_name=AMD_Instinct_MI300X.json → triton_3_1_0/E=8,N=3584,device_name=AMD_Instinct_MI300X.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=8,N=3584,device_name=AMD_Instinct_MI325X.json → triton_3_1_0/E=8,N=3584,device_name=AMD_Instinct_MI325X.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=8,N=3584,device_name=AMD_Radeon_Graphics.json → triton_3_1_0/E=8,N=3584,device_name=AMD_Radeon_Graphics.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=8,N=3584,device_name=NVIDIA_A100-SXM4-40GB.json → triton_3_1_0/E=8,N=3584,device_name=NVIDIA_A100-SXM4-40GB.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=8,N=3584,device_name=NVIDIA_A100-SXM4-80GB.json → triton_3_1_0/E=8,N=3584,device_name=NVIDIA_A100-SXM4-80GB.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=8,N=3584,device_name=NVIDIA_GeForce_RTX_4090,dtype=fp8_w8a8.json → triton_3_1_0/E=8,N=3584,device_name=NVIDIA_GeForce_RTX_4090,dtype=fp8_w8a8.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=8,N=3584,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8.json → triton_3_1_0/E=8,N=3584,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=8,N=3584,device_name=NVIDIA_H100_80GB_HBM3.json → triton_3_1_0/E=8,N=3584,device_name=NVIDIA_H100_80GB_HBM3.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=8,N=3584,device_name=NVIDIA_H200,dtype=fp8_w8a8.json → triton_3_1_0/E=8,N=3584,device_name=NVIDIA_H200,dtype=fp8_w8a8.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=8,N=3584,device_name=NVIDIA_H200.json → triton_3_1_0/E=8,N=3584,device_name=NVIDIA_H200.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=8,N=3584,device_name=NVIDIA_L40S.json → triton_3_1_0/E=8,N=3584,device_name=NVIDIA_L40S.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=8,N=4096,device_name=AMD_Instinct_MI300X,dtype=fp8_w8a8.json → triton_3_1_0/E=8,N=4096,device_name=AMD_Instinct_MI300X,dtype=fp8_w8a8.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=8,N=4096,device_name=AMD_Instinct_MI325X,dtype=fp8_w8a8.json → triton_3_1_0/E=8,N=4096,device_name=AMD_Instinct_MI325X,dtype=fp8_w8a8.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=8,N=4096,device_name=AMD_Radeon_Graphics,dtype=fp8_w8a8.json → triton_3_1_0/E=8,N=4096,device_name=AMD_Radeon_Graphics,dtype=fp8_w8a8.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=8,N=4096,device_name=NVIDIA_A100-SXM4-80GB.json → triton_3_1_0/E=8,N=4096,device_name=NVIDIA_A100-SXM4-80GB.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=8,N=4096,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8.json → triton_3_1_0/E=8,N=4096,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=8,N=4096,device_name=NVIDIA_H100_80GB_HBM3.json → triton_3_1_0/E=8,N=4096,device_name=NVIDIA_H100_80GB_HBM3.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=8,N=4096,device_name=NVIDIA_H200,dtype=fp8_w8a8.json → triton_3_1_0/E=8,N=4096,device_name=NVIDIA_H200,dtype=fp8_w8a8.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=8,N=4096,device_name=NVIDIA_H200.json → triton_3_1_0/E=8,N=4096,device_name=NVIDIA_H200.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=8,N=7168,device_name=AMD_Instinct_MI300X.json → triton_3_1_0/E=8,N=7168,device_name=AMD_Instinct_MI300X.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=8,N=7168,device_name=AMD_Instinct_MI325X.json → triton_3_1_0/E=8,N=7168,device_name=AMD_Instinct_MI325X.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=8,N=7168,device_name=AMD_Radeon_Graphics.json → triton_3_1_0/E=8,N=7168,device_name=AMD_Radeon_Graphics.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=8,N=7168,device_name=NVIDIA_A100-SXM4-80GB.json → triton_3_1_0/E=8,N=7168,device_name=NVIDIA_A100-SXM4-80GB.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=8,N=7168,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8.json → triton_3_1_0/E=8,N=7168,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=8,N=7168,device_name=NVIDIA_H100_80GB_HBM3.json → triton_3_1_0/E=8,N=7168,device_name=NVIDIA_H100_80GB_HBM3.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=8,N=7168,device_name=NVIDIA_H200,dtype=fp8_w8a8.json → triton_3_1_0/E=8,N=7168,device_name=NVIDIA_H200,dtype=fp8_w8a8.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=8,N=7168,device_name=NVIDIA_H200.json → triton_3_1_0/E=8,N=7168,device_name=NVIDIA_H200.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=8,N=8192,device_name=AMD_Instinct_MI300X,dtype=fp8_w8a8.json → triton_3_1_0/E=8,N=8192,device_name=AMD_Instinct_MI300X,dtype=fp8_w8a8.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=8,N=8192,device_name=AMD_Instinct_MI325X,dtype=fp8_w8a8.json → triton_3_1_0/E=8,N=8192,device_name=AMD_Instinct_MI325X,dtype=fp8_w8a8.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=8,N=8192,device_name=AMD_Radeon_Graphics,dtype=fp8_w8a8.json → triton_3_1_0/E=8,N=8192,device_name=AMD_Radeon_Graphics,dtype=fp8_w8a8.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=8,N=8192,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8.json → triton_3_1_0/E=8,N=8192,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=8,N=8192,device_name=NVIDIA_H200,dtype=fp8_w8a8.json → triton_3_1_0/E=8,N=8192,device_name=NVIDIA_H200,dtype=fp8_w8a8.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=128,N=192,device_name=NVIDIA_A800-SXM4-80GB.json → triton_3_2_0/E=128,N=192,device_name=NVIDIA_A800-SXM4-80GB.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=128,N=192,device_name=NVIDIA_H100_80GB_HBM3.json → triton_3_2_0/E=128,N=192,device_name=NVIDIA_H100_80GB_HBM3.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=128,N=192,device_name=NVIDIA_H20.json → triton_3_2_0/E=128,N=192,device_name=NVIDIA_H20.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=128,N=192,device_name=NVIDIA_H200.json → triton_3_2_0/E=128,N=192,device_name=NVIDIA_H200.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=128,N=384,device_name=NVIDIA_H100_80GB_HBM3.json → triton_3_2_0/E=128,N=384,device_name=NVIDIA_H100_80GB_HBM3.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=128,N=384,device_name=NVIDIA_H20,dtype=fp8_w8a8,block_shape=[128, 128].json → triton_3_2_0/E=128,N=384,device_name=NVIDIA_H20,dtype=fp8_w8a8,block_shape=[128, 128].json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=128,N=384,device_name=NVIDIA_H20.json → triton_3_2_0/E=128,N=384,device_name=NVIDIA_H20.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=128,N=384,device_name=NVIDIA_H200,dtype=fp8_w8a8,block_shape=[128, 128].json → triton_3_2_0/E=128,N=384,device_name=NVIDIA_H200,dtype=fp8_w8a8,block_shape=[128, 128].json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=128,N=384,device_name=NVIDIA_H200.json → triton_3_2_0/E=128,N=384,device_name=NVIDIA_H200.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=128,N=512,device_name=NVIDIA_H100_80GB_HBM3.json → triton_3_2_0/E=128,N=512,device_name=NVIDIA_H100_80GB_HBM3.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=128,N=768,device_name=NVIDIA_A800-SXM4-80GB.json → triton_3_2_0/E=128,N=768,device_name=NVIDIA_A800-SXM4-80GB.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=128,N=768,device_name=NVIDIA_H100_80GB_HBM3.json → triton_3_2_0/E=128,N=768,device_name=NVIDIA_H100_80GB_HBM3.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=128,N=768,device_name=NVIDIA_H20,dtype=fp8_w8a8,block_shape=[128, 128].json → triton_3_2_0/E=128,N=768,device_name=NVIDIA_H20,dtype=fp8_w8a8,block_shape=[128, 128].json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=128,N=768,device_name=NVIDIA_H20.json → triton_3_2_0/E=128,N=768,device_name=NVIDIA_H20.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=128,N=768,device_name=NVIDIA_H200,dtype=fp8_w8a8,block_shape=[128, 128].json → triton_3_2_0/E=128,N=768,device_name=NVIDIA_H200,dtype=fp8_w8a8,block_shape=[128, 128].json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=128,N=768,device_name=NVIDIA_H200.json → triton_3_2_0/E=128,N=768,device_name=NVIDIA_H200.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=128,N=96,device_name=NVIDIA_H20.json → triton_3_2_0/E=128,N=96,device_name=NVIDIA_H20.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=264,N=128,device_name=NVIDIA_A800-SXM4-80GB,dtype=int8_w8a8.json → triton_3_2_0/E=264,N=128,device_name=NVIDIA_A800-SXM4-80GB,dtype=int8_w8a8.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=264,N=256,device_name=NVIDIA_B200,dtype=fp8_w8a8,block_shape=[128, 128].json → triton_3_2_0/E=264,N=256,device_name=NVIDIA_B200,dtype=fp8_w8a8,block_shape=[128, 128].json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=264,N=256,device_name=NVIDIA_H20,dtype=fp8_w8a8,block_shape=[128, 128].json → triton_3_2_0/E=264,N=256,device_name=NVIDIA_H20,dtype=fp8_w8a8,block_shape=[128, 128].json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=264,N=256,device_name=NVIDIA_H200,dtype=fp8_w8a8,block_shape=[128, 128].json → triton_3_2_0/E=264,N=256,device_name=NVIDIA_H200,dtype=fp8_w8a8,block_shape=[128, 128].json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=272,N=128,device_name=NVIDIA_A800-SXM4-80GB,dtype=int8_w8a8.json → triton_3_2_0/E=272,N=128,device_name=NVIDIA_A800-SXM4-80GB,dtype=int8_w8a8.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=272,N=128,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8,block_shape=[128, 128].json → triton_3_2_0/E=272,N=128,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8,block_shape=[128, 128].json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=272,N=128,device_name=NVIDIA_H20,dtype=fp8_w8a8,block_shape=[128, 128].json → triton_3_2_0/E=272,N=128,device_name=NVIDIA_H20,dtype=fp8_w8a8,block_shape=[128, 128].json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=272,N=64,device_name=NVIDIA_A800-SXM4-80GB.json → triton_3_2_0/E=272,N=64,device_name=NVIDIA_A800-SXM4-80GB.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=288,N=64,device_name=NVIDIA_A800-SXM4-80GB.json → triton_3_2_0/E=288,N=64,device_name=NVIDIA_A800-SXM4-80GB.json} +0 -0
- /sglang/srt/models/{xiaomi_mimo.py → mimo.py} +0 -0
- {sglang-0.4.6.post4.dist-info → sglang-0.4.7.dist-info}/licenses/LICENSE +0 -0
- {sglang-0.4.6.post4.dist-info → sglang-0.4.7.dist-info}/top_level.txt +0 -0
@@ -9,6 +9,8 @@ import queue
|
|
9
9
|
import socket
|
10
10
|
import struct
|
11
11
|
import threading
|
12
|
+
import time
|
13
|
+
from collections import defaultdict
|
12
14
|
from functools import cache
|
13
15
|
from typing import Dict, List, Optional, Tuple, Union
|
14
16
|
|
@@ -27,30 +29,33 @@ from sglang.srt.disaggregation.base.conn import (
|
|
27
29
|
KVPoll,
|
28
30
|
)
|
29
31
|
from sglang.srt.disaggregation.mooncake.transfer_engine import MooncakeTransferEngine
|
30
|
-
from sglang.srt.disaggregation.utils import
|
32
|
+
from sglang.srt.disaggregation.utils import (
|
33
|
+
DisaggregationMode,
|
34
|
+
FastQueue,
|
35
|
+
group_concurrent_contiguous,
|
36
|
+
)
|
31
37
|
from sglang.srt.server_args import ServerArgs
|
32
|
-
from sglang.srt.utils import
|
38
|
+
from sglang.srt.utils import (
|
39
|
+
get_free_port,
|
40
|
+
get_int_env_var,
|
41
|
+
get_ip,
|
42
|
+
get_local_ip_by_remote,
|
43
|
+
)
|
33
44
|
|
34
45
|
logger = logging.getLogger(__name__)
|
35
46
|
|
36
47
|
|
37
|
-
|
38
|
-
|
39
|
-
)
|
40
|
-
|
41
|
-
|
42
|
-
return [], []
|
43
|
-
|
44
|
-
brk = np.where((np.diff(src_indices) != 1) | (np.diff(dst_indices) != 1))[0] + 1
|
45
|
-
src_groups = np.split(src_indices, brk)
|
46
|
-
dst_groups = np.split(dst_indices, brk)
|
47
|
-
|
48
|
-
src_groups = [g.tolist() for g in src_groups]
|
49
|
-
dst_groups = [g.tolist() for g in dst_groups]
|
48
|
+
class KVTransferError(Exception):
|
49
|
+
def __init__(self, bootstrap_room: int, failure_reason: str):
|
50
|
+
super().__init__(failure_reason)
|
51
|
+
self.bootstrap_room = bootstrap_room
|
52
|
+
self.failure_reason = failure_reason
|
50
53
|
|
51
|
-
|
54
|
+
def __str__(self):
|
55
|
+
return f"KVTransferError(bootstrap_room={self.bootstrap_room}): {self.failure_reason}"
|
52
56
|
|
53
57
|
|
58
|
+
# prefill
|
54
59
|
@dataclasses.dataclass
|
55
60
|
class TransferKVChunk:
|
56
61
|
room: int
|
@@ -60,6 +65,7 @@ class TransferKVChunk:
|
|
60
65
|
prefill_aux_index: Optional[int]
|
61
66
|
|
62
67
|
|
68
|
+
# decode
|
63
69
|
@dataclasses.dataclass
|
64
70
|
class TransferInfo:
|
65
71
|
room: int
|
@@ -93,6 +99,7 @@ class TransferInfo:
|
|
93
99
|
)
|
94
100
|
|
95
101
|
|
102
|
+
# decode
|
96
103
|
@dataclasses.dataclass
|
97
104
|
class KVArgsRegisterInfo:
|
98
105
|
room: str
|
@@ -145,18 +152,55 @@ class MooncakeKVManager(BaseKVManager):
|
|
145
152
|
self.server_socket = zmq.Context().socket(zmq.PULL)
|
146
153
|
self.register_buffer_to_engine()
|
147
154
|
if self.disaggregation_mode == DisaggregationMode.PREFILL:
|
148
|
-
self.transfer_queue = queue.Queue()
|
149
155
|
self.transfer_infos: Dict[int, Dict[str, TransferInfo]] = {}
|
150
156
|
self.decode_kv_args_table: Dict[str, KVArgsRegisterInfo] = {}
|
151
157
|
self.start_prefill_thread()
|
152
158
|
self._register_to_bootstrap()
|
153
|
-
|
159
|
+
self.session_failures = defaultdict(int)
|
160
|
+
self.failed_sessions = set()
|
161
|
+
self.session_lock = threading.Lock()
|
154
162
|
# Determine the number of threads to use for kv sender
|
155
163
|
cpu_count = os.cpu_count()
|
156
|
-
|
157
|
-
|
164
|
+
transfer_thread_pool_size = get_int_env_var(
|
165
|
+
"SGLANG_DISAGGREGATION_THREAD_POOL_SIZE",
|
166
|
+
min(max(4, int(0.75 * cpu_count) // 8), 12),
|
167
|
+
)
|
168
|
+
transfer_queue_size = get_int_env_var("SGLANG_DISAGGREGATION_QUEUE_SIZE", 4)
|
169
|
+
self.transfer_queues: List[FastQueue] = [
|
170
|
+
FastQueue() for _ in range(transfer_queue_size)
|
171
|
+
]
|
172
|
+
assert transfer_thread_pool_size >= transfer_queue_size, (
|
173
|
+
f"The environment variable SGLANG_DISAGGREGATION_THREAD_POOL_SIZE={transfer_thread_pool_size} must be "
|
174
|
+
f"greater than or equal to SGLANG_DISAGGREGATION_QUEUE_SIZE={transfer_queue_size}."
|
175
|
+
)
|
176
|
+
self.executors = [
|
177
|
+
concurrent.futures.ThreadPoolExecutor(
|
178
|
+
transfer_thread_pool_size // transfer_queue_size
|
179
|
+
)
|
180
|
+
for _ in range(transfer_queue_size)
|
181
|
+
]
|
182
|
+
for queue, executor in zip(self.transfer_queues, self.executors):
|
183
|
+
threading.Thread(
|
184
|
+
target=self.transfer_worker, args=(queue, executor), daemon=True
|
185
|
+
).start()
|
186
|
+
|
187
|
+
self.bootstrap_time_out = get_int_env_var(
|
188
|
+
"SGLANG_DISAGGREGATION_BOOTSTRAP_TIMEOUT", 30
|
158
189
|
)
|
159
190
|
elif self.disaggregation_mode == DisaggregationMode.DECODE:
|
191
|
+
self.heartbeat_failures = {}
|
192
|
+
self.session_pool = defaultdict(requests.Session)
|
193
|
+
self.session_pool_lock = threading.Lock()
|
194
|
+
self.addr_to_rooms_tracker = defaultdict(set)
|
195
|
+
self.connection_lock = threading.Lock()
|
196
|
+
# Heartbeat interval should be at least 2 seconds
|
197
|
+
self.heartbeat_interval = max(
|
198
|
+
float(os.getenv("SGLANG_DISAGGREGATION_HEARTBEAT_INTERVAL", 5.0)), 2.0
|
199
|
+
)
|
200
|
+
# Heartbeat failure should be at least 1
|
201
|
+
self.max_failures = max(
|
202
|
+
get_int_env_var("SGLANG_DISAGGREGATION_HEARTBEAT_MAX_FAILURE", 2), 1
|
203
|
+
)
|
160
204
|
self.start_decode_thread()
|
161
205
|
self.connection_pool: Dict[str, Dict[str, Union[str, int]]] = {}
|
162
206
|
self.prefill_tp_size_table: Dict[str, int] = {}
|
@@ -166,6 +210,9 @@ class MooncakeKVManager(BaseKVManager):
|
|
166
210
|
f"Unsupported DisaggregationMode: {self.disaggregation_mode}"
|
167
211
|
)
|
168
212
|
|
213
|
+
self.failure_records: Dict[int, str] = {}
|
214
|
+
self.failure_lock = threading.Lock()
|
215
|
+
|
169
216
|
def register_buffer_to_engine(self):
|
170
217
|
for kv_data_ptr, kv_data_len in zip(
|
171
218
|
self.kv_args.kv_data_ptrs, self.kv_args.kv_data_lens
|
@@ -189,6 +236,7 @@ class MooncakeKVManager(BaseKVManager):
|
|
189
236
|
prefill_kv_indices: npt.NDArray[np.int64],
|
190
237
|
dst_kv_ptrs: list[int],
|
191
238
|
dst_kv_indices: npt.NDArray[np.int64],
|
239
|
+
executor: concurrent.futures.ThreadPoolExecutor,
|
192
240
|
):
|
193
241
|
# Group by indices
|
194
242
|
prefill_kv_blocks, dst_kv_blocks = group_concurrent_contiguous(
|
@@ -220,7 +268,7 @@ class MooncakeKVManager(BaseKVManager):
|
|
220
268
|
return 0
|
221
269
|
|
222
270
|
futures = [
|
223
|
-
|
271
|
+
executor.submit(
|
224
272
|
process_layer,
|
225
273
|
src_ptr,
|
226
274
|
dst_ptr,
|
@@ -232,8 +280,6 @@ class MooncakeKVManager(BaseKVManager):
|
|
232
280
|
for future in concurrent.futures.as_completed(futures):
|
233
281
|
status = future.result()
|
234
282
|
if status != 0:
|
235
|
-
# Immediate shutdown on first error (existing tasks will finish)
|
236
|
-
self.executor.shutdown(wait=False)
|
237
283
|
for f in futures:
|
238
284
|
f.cancel()
|
239
285
|
return status
|
@@ -252,23 +298,138 @@ class MooncakeKVManager(BaseKVManager):
|
|
252
298
|
self.kv_args.aux_data_ptrs[0] + prefill_aux_index * aux_item_len
|
253
299
|
)
|
254
300
|
decode_aux_addr = dst_aux_ptrs[0] + dst_aux_index * aux_item_len
|
255
|
-
# TODO: mooncake transfer engine can do async transfer. Do async later
|
256
|
-
# Not sure about the amount of aux data, maybe transfer it by zmq is more effective
|
257
301
|
status = self.engine.transfer_sync(
|
258
302
|
mooncake_session_id, prefill_aux_addr, decode_aux_addr, aux_item_len
|
259
303
|
)
|
260
304
|
return status
|
261
305
|
|
262
|
-
def sync_status_to_decode_endpoint(
|
306
|
+
def sync_status_to_decode_endpoint(
|
307
|
+
self, remote: str, dst_port: int, room: int, status: int
|
308
|
+
):
|
263
309
|
if ":" in remote:
|
264
310
|
remote = remote.split(":")[0]
|
265
311
|
self._connect("tcp://" + remote + ":" + str(dst_port)).send_multipart(
|
266
312
|
[
|
267
313
|
str(room).encode("ascii"),
|
268
|
-
str(
|
314
|
+
str(status).encode("ascii"),
|
269
315
|
]
|
270
316
|
)
|
271
317
|
|
318
|
+
def transfer_worker(
|
319
|
+
self, queue: FastQueue, executor: concurrent.futures.ThreadPoolExecutor
|
320
|
+
):
|
321
|
+
while True:
|
322
|
+
try:
|
323
|
+
kv_chunk: TransferKVChunk = queue.get()
|
324
|
+
reqs_to_be_processed = (
|
325
|
+
self.transfer_infos[kv_chunk.room].values()
|
326
|
+
if kv_chunk.room in self.transfer_infos
|
327
|
+
else []
|
328
|
+
)
|
329
|
+
polls = []
|
330
|
+
dst_ranks_infos = []
|
331
|
+
for req in reqs_to_be_processed:
|
332
|
+
if not req.is_dummy:
|
333
|
+
# Early exit if the request has failed
|
334
|
+
with self.session_lock:
|
335
|
+
if req.mooncake_session_id in self.failed_sessions:
|
336
|
+
self.record_failure(
|
337
|
+
kv_chunk.room,
|
338
|
+
f"Decode instance could be dead, remote mooncake session {req.mooncake_session_id} is not alive",
|
339
|
+
)
|
340
|
+
self.update_status(kv_chunk.room, KVPoll.Failed)
|
341
|
+
self.sync_status_to_decode_endpoint(
|
342
|
+
req.endpoint,
|
343
|
+
req.dst_port,
|
344
|
+
req.room,
|
345
|
+
KVPoll.Failed,
|
346
|
+
)
|
347
|
+
break
|
348
|
+
|
349
|
+
chunked_dst_kv_indice = req.dst_kv_indices[kv_chunk.index_slice]
|
350
|
+
|
351
|
+
# NOTE: This is temporarily a workaround to deal with the case where the prefill_kv_indices
|
352
|
+
# is mismatched with the dst_kv_indices when page size > 1, this should never happen.
|
353
|
+
if len(chunked_dst_kv_indice) < len(
|
354
|
+
kv_chunk.prefill_kv_indices
|
355
|
+
):
|
356
|
+
kv_chunk.prefill_kv_indices = kv_chunk.prefill_kv_indices[
|
357
|
+
: len(chunked_dst_kv_indice)
|
358
|
+
]
|
359
|
+
logger.warning(
|
360
|
+
f"len(chunked_dst_kv_indice) = {len(chunked_dst_kv_indice)}, len(kv_chunk.prefill_kv_indices) = {len(kv_chunk.prefill_kv_indices)}"
|
361
|
+
)
|
362
|
+
|
363
|
+
ret = self.send_kvcache(
|
364
|
+
req.mooncake_session_id,
|
365
|
+
kv_chunk.prefill_kv_indices,
|
366
|
+
self.decode_kv_args_table[
|
367
|
+
req.mooncake_session_id
|
368
|
+
].dst_kv_ptrs,
|
369
|
+
chunked_dst_kv_indice,
|
370
|
+
executor,
|
371
|
+
)
|
372
|
+
if ret != 0:
|
373
|
+
with self.session_lock:
|
374
|
+
self.session_failures[req.mooncake_session_id] += 1
|
375
|
+
# Failures should never happen if the session is not dead, if the session fails once, mark it as failed
|
376
|
+
if self.session_failures[req.mooncake_session_id] >= 1:
|
377
|
+
self.failed_sessions.add(req.mooncake_session_id)
|
378
|
+
logger.error(
|
379
|
+
f"Session {req.mooncake_session_id} failed."
|
380
|
+
)
|
381
|
+
self.record_failure(
|
382
|
+
kv_chunk.room,
|
383
|
+
f"Failed to send kv chunk of {kv_chunk.room} to {req.endpoint}:{req.dst_port}",
|
384
|
+
)
|
385
|
+
self.update_status(kv_chunk.room, KVPoll.Failed)
|
386
|
+
self.sync_status_to_decode_endpoint(
|
387
|
+
req.endpoint, req.dst_port, req.room, KVPoll.Failed
|
388
|
+
)
|
389
|
+
break
|
390
|
+
|
391
|
+
if kv_chunk.is_last:
|
392
|
+
# Only the last chunk we need to send the aux data
|
393
|
+
ret = self.send_aux(
|
394
|
+
req.mooncake_session_id,
|
395
|
+
kv_chunk.prefill_aux_index,
|
396
|
+
self.decode_kv_args_table[
|
397
|
+
req.mooncake_session_id
|
398
|
+
].dst_aux_ptrs,
|
399
|
+
req.dst_aux_index,
|
400
|
+
)
|
401
|
+
polls.append(True if ret == 0 else False)
|
402
|
+
dst_ranks_infos.append(
|
403
|
+
(req.endpoint, req.dst_port, req.room)
|
404
|
+
)
|
405
|
+
|
406
|
+
# Only sync status when all the dst ranks have received the kvcache
|
407
|
+
if len(polls) == req.required_dst_info_num:
|
408
|
+
status = KVPoll.Success if all(polls) else KVPoll.Failed
|
409
|
+
self.update_status(req.room, status)
|
410
|
+
for endpoint, dst_port, room in dst_ranks_infos:
|
411
|
+
self.sync_status_to_decode_endpoint(
|
412
|
+
endpoint, dst_port, room, status
|
413
|
+
)
|
414
|
+
else:
|
415
|
+
# Dummy request means the decode instance is not used, so its status can be marked as success directly
|
416
|
+
# Dummy request does not need to sync status to decode endpoint
|
417
|
+
if kv_chunk.is_last and req.room in self.request_status:
|
418
|
+
self.update_status(req.room, KVPoll.Success)
|
419
|
+
|
420
|
+
if (
|
421
|
+
kv_chunk.room not in self.request_status
|
422
|
+
or self.check_status(kv_chunk.room) == KVPoll.Success
|
423
|
+
):
|
424
|
+
if kv_chunk.room in self.transfer_infos:
|
425
|
+
self.transfer_infos.pop(kv_chunk.room)
|
426
|
+
|
427
|
+
except Exception as e:
|
428
|
+
# NOTE(shangming): Remove this when we make sure the transfer thread is bug-free
|
429
|
+
raise RuntimeError(
|
430
|
+
f"Transfer thread failed because of {e}. Prefill instance with bootstrap_port={self.bootstrap_port} is dead."
|
431
|
+
)
|
432
|
+
|
272
433
|
def start_prefill_thread(self):
|
273
434
|
self.rank_port = get_free_port()
|
274
435
|
self.server_socket.bind(f"tcp://{get_local_ip_by_remote()}:{self.rank_port}")
|
@@ -284,6 +445,11 @@ class MooncakeKVManager(BaseKVManager):
|
|
284
445
|
self.decode_kv_args_table[mooncake_session_id] = (
|
285
446
|
KVArgsRegisterInfo.from_zmq(waiting_req_bytes)
|
286
447
|
)
|
448
|
+
with self.session_lock:
|
449
|
+
if mooncake_session_id in self.failed_sessions:
|
450
|
+
self.failed_sessions.remove(mooncake_session_id)
|
451
|
+
if mooncake_session_id in self.session_failures:
|
452
|
+
del self.session_failures[mooncake_session_id]
|
287
453
|
logger.debug(
|
288
454
|
f"Register KVArgs from {mooncake_session_id} successfully"
|
289
455
|
)
|
@@ -301,77 +467,7 @@ class MooncakeKVManager(BaseKVManager):
|
|
301
467
|
if len(self.transfer_infos[room]) == required_dst_info_num:
|
302
468
|
self.update_status(room, KVPoll.WaitingForInput)
|
303
469
|
|
304
|
-
def transfer_thread():
|
305
|
-
# TODO: Shall we use KVPoll.Transferring state?
|
306
|
-
while True:
|
307
|
-
try:
|
308
|
-
kv_chunk: TransferKVChunk = self.transfer_queue.get(timeout=0.01)
|
309
|
-
reqs_to_be_processed = self.transfer_infos[kv_chunk.room].values()
|
310
|
-
polls = []
|
311
|
-
dst_ranks_infos = []
|
312
|
-
for req in reqs_to_be_processed:
|
313
|
-
if not req.is_dummy:
|
314
|
-
chunked_dst_kv_indice = req.dst_kv_indices[
|
315
|
-
kv_chunk.index_slice
|
316
|
-
]
|
317
|
-
assert len(chunked_dst_kv_indice) == len(
|
318
|
-
kv_chunk.prefill_kv_indices
|
319
|
-
), f"len(chunked_dst_kv_indice) = {len(chunked_dst_kv_indice)}, len(kv_chunk.prefill_kv_indices) = {len(kv_chunk.prefill_kv_indices)}"
|
320
|
-
|
321
|
-
ret = self.send_kvcache(
|
322
|
-
req.mooncake_session_id,
|
323
|
-
kv_chunk.prefill_kv_indices,
|
324
|
-
self.decode_kv_args_table[
|
325
|
-
req.mooncake_session_id
|
326
|
-
].dst_kv_ptrs,
|
327
|
-
chunked_dst_kv_indice,
|
328
|
-
)
|
329
|
-
if ret != 0:
|
330
|
-
self.update_status(kv_chunk.room, KVPoll.Failed)
|
331
|
-
self.sync_status_to_decode_endpoint(
|
332
|
-
req.endpoint, req.dst_port, req.room
|
333
|
-
)
|
334
|
-
continue
|
335
|
-
|
336
|
-
if kv_chunk.is_last:
|
337
|
-
# Only the last chunk we need to send the aux data
|
338
|
-
ret = self.send_aux(
|
339
|
-
req.mooncake_session_id,
|
340
|
-
kv_chunk.prefill_aux_index,
|
341
|
-
self.decode_kv_args_table[
|
342
|
-
req.mooncake_session_id
|
343
|
-
].dst_aux_ptrs,
|
344
|
-
req.dst_aux_index,
|
345
|
-
)
|
346
|
-
polls.append(True if ret == 0 else False)
|
347
|
-
dst_ranks_infos.append(
|
348
|
-
(req.endpoint, req.dst_port, req.room)
|
349
|
-
)
|
350
|
-
|
351
|
-
# Only sync status when all the dst ranks have received the kvcache
|
352
|
-
if len(polls) == req.required_dst_info_num:
|
353
|
-
self.update_status(
|
354
|
-
req.room,
|
355
|
-
KVPoll.Success if all(polls) else KVPoll.Failed,
|
356
|
-
)
|
357
|
-
for endpoint, dst_port, room in dst_ranks_infos:
|
358
|
-
self.sync_status_to_decode_endpoint(
|
359
|
-
endpoint, dst_port, room
|
360
|
-
)
|
361
|
-
else:
|
362
|
-
# Dummy request means the decode instance is not used, so its status can be marked as success directly
|
363
|
-
# Dummy request does not need to sync status to decode endpoint
|
364
|
-
if kv_chunk.is_last:
|
365
|
-
self.update_status(req.room, KVPoll.Success)
|
366
|
-
|
367
|
-
if self.check_status(kv_chunk.room) == KVPoll.Success:
|
368
|
-
self.transfer_infos.pop(kv_chunk.room)
|
369
|
-
|
370
|
-
except queue.Empty:
|
371
|
-
continue
|
372
|
-
|
373
470
|
threading.Thread(target=bootstrap_thread).start()
|
374
|
-
threading.Thread(target=transfer_thread).start()
|
375
471
|
|
376
472
|
def start_decode_thread(self):
|
377
473
|
self.rank_port = get_free_port()
|
```diff
@@ -382,9 +478,69 @@ class MooncakeKVManager(BaseKVManager):
                 (bootstrap_room, status) = self.server_socket.recv_multipart()
                 status = int(status.decode("ascii"))
                 bootstrap_room = int(bootstrap_room.decode("ascii"))
+                if status == KVPoll.Failed:
+                    self.record_failure(
+                        bootstrap_room,
+                        f"Failed to get kvcache from prefill instance, it might be dead",
+                    )
                 self.update_status(bootstrap_room, status)

+        def heartbeat_checker():
+            while True:
+                time.sleep(self.heartbeat_interval)
+                with self.connection_lock:
+                    addresses = list(self.prefill_dp_size_table.keys())
+
+                for bootstrap_addr in addresses:
+                    session = None
+                    try:
+                        with self.session_pool_lock:
+                            session = self.session_pool[bootstrap_addr]
+                        response = session.get(
+                            f"http://{bootstrap_addr}/health",
+                            timeout=(2, 3),
+                            headers={"Connection": "keep-alive"},
+                        )
+                        if response.status_code == 200:
+                            self.heartbeat_failures[bootstrap_addr] = 0
+
+                            current_rooms = self.addr_to_rooms_tracker[
+                                bootstrap_addr
+                            ].copy()
+
+                            for bootstrap_room in current_rooms:
+                                # Remove KVPoll.Success requests from the tracker
+                                if bootstrap_room not in self.request_status:
+                                    self.addr_to_rooms_tracker[bootstrap_addr].discard(
+                                        bootstrap_room
+                                    )
+                        else:
+                            logger.info(
+                                f"Attempting to reconnect to {bootstrap_addr}..."
+                            )
+                            self.heartbeat_failures[bootstrap_addr] = (
+                                self.heartbeat_failures.get(bootstrap_addr, 0) + 1
+                            )
+                            with self.session_pool_lock:
+                                if bootstrap_addr in self.session_pool:
+                                    del self.session_pool[bootstrap_addr]
+                    except Exception:
+                        logger.info(f"Attempting to reconnect to {bootstrap_addr}...")
+                        self.heartbeat_failures[bootstrap_addr] = (
+                            self.heartbeat_failures.get(bootstrap_addr, 0) + 1
+                        )
+
+                    if (
+                        self.heartbeat_failures.get(bootstrap_addr, 0)
+                        >= self.max_failures
+                    ):
+                        self._handle_node_failure(bootstrap_addr)
+                        with self.session_pool_lock:
+                            if bootstrap_addr in self.session_pool:
+                                del self.session_pool[bootstrap_addr]
+
         threading.Thread(target=decode_thread).start()
+        threading.Thread(target=heartbeat_checker).start()

     def add_transfer_request(
         self,
```
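The new `heartbeat_checker` probes each known prefill bootstrap address over HTTP, resets its failure counter on a 200 response, and escalates to `_handle_node_failure` after `max_failures` consecutive misses. A stripped-down sketch of the same pattern (the interval, threshold, and `on_dead` callback are placeholders, not the values sglang uses; `addresses` is assumed to be a mutable set):

```python
import time

import requests


def heartbeat_loop(addresses: set, interval_s: float = 5.0,
                   max_failures: int = 3, on_dead=print) -> None:
    """Probe each address; after max_failures consecutive misses, call on_dead."""
    failures = {}
    while True:
        time.sleep(interval_s)
        for addr in list(addresses):
            try:
                r = requests.get(f"http://{addr}/health", timeout=(2, 3))
                if r.status_code == 200:
                    failures[addr] = 0  # healthy: reset the counter
                    continue
            except requests.RequestException:
                pass  # fall through to the failure path below
            failures[addr] = failures.get(addr, 0) + 1
            if failures[addr] >= max_failures:
                on_dead(addr)  # e.g. drop pooled sessions, fail open requests
                addresses.discard(addr)
```

The `(2, 3)` value is the standard `requests` (connect, read) timeout pair, matching the probe in the diff.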
```diff
@@ -397,7 +553,29 @@ class MooncakeKVManager(BaseKVManager):
         assert self.disaggregation_mode == DisaggregationMode.PREFILL
         assert not is_last or (is_last and aux_index is not None)

-        self.transfer_queue.put(
+        if (
+            bootstrap_room not in self.request_status
+            or self.check_status(bootstrap_room) == KVPoll.Failed
+        ):
+            logger.debug(
+                "Request with bootstrap_room=%s already failed", bootstrap_room
+            )
+            return
+
+        if bootstrap_room not in self.transfer_infos:
+            # This means that the current rank is a dummy rank for this request,
+            # and it has already been marked as success, so there is no need to
+            # add further chunks into the transfer queue.
+            return
+
+        # NOTE(shangming): sharding according to the dst_infos to make sure
+        # requests with the same dst_sessions will be added into the same
+        # queue, which enables early abort with failed sessions.
+        dst_infos = self.transfer_infos[bootstrap_room].keys()
+        session_port_sum = sum(int(session.split(":")[1]) for session in dst_infos)
+        shard_idx = session_port_sum % len(self.transfer_queues)
+
+        self.transfer_queues[shard_idx].put(
             TransferKVChunk(
                 room=bootstrap_room,
                 prefill_kv_indices=kv_indices,
```
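The NOTE(shangming) comment is the key design point here: the shard index depends only on the destination session ports, so every chunk of a request with the same set of destination sessions lands in the same queue, and a failed session can abort its queue early without touching chunks bound elsewhere. A small model of that mapping (the session strings are hypothetical; only the modulo trick is taken from the diff):

```python
def shard_for(dst_sessions: list, num_queues: int) -> int:
    """Map a request to a transfer queue by summing its destination ports."""
    port_sum = sum(int(s.split(":")[1]) for s in dst_sessions)
    return port_sum % num_queues


# Requests sharing the same destination sessions always share a queue.
a = shard_for(["10.0.0.1:7001", "10.0.0.2:7002"], num_queues=4)
b = shard_for(["10.0.0.1:7001", "10.0.0.2:7002"], num_queues=4)
assert a == b
```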
```diff
@@ -406,7 +584,6 @@ class MooncakeKVManager(BaseKVManager):
                 prefill_aux_index=aux_index,
             )
         )
-        self.update_status(bootstrap_room, KVPoll.WaitingForInput)

     def check_status(self, bootstrap_room: int):
         return self.request_status[bootstrap_room]
```
```diff
@@ -415,10 +592,17 @@ class MooncakeKVManager(BaseKVManager):
         if bootstrap_room not in self.request_status:
             self.request_status[bootstrap_room] = status
         else:
-            # NOTE:
-
-            self.request_status[bootstrap_room]
-
+            # NOTE: status is only allowed to be incremented unless it is KVPoll.Failed
+            if status == KVPoll.Failed:
+                self.request_status[bootstrap_room] = KVPoll.Failed
+            else:
+                self.request_status[bootstrap_room] = max(
+                    self.request_status[bootstrap_room], status
+                )
+
+    def record_failure(self, bootstrap_room: int, failure_reason: str):
+        with self.failure_lock:
+            self.failure_records[bootstrap_room] = failure_reason

     def get_session_id(self):
         return self.engine.get_session_id()
```
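`update_status` now enforces a one-way ratchet: a room's status may only move forward (via `max`), except that `KVPoll.Failed` overrides anything and sticks. A compact model of that rule, assuming the `KVPoll` states are ordered integers with `Failed` below the others, as in sglang:

```python
# Assumed ordering: Failed sits below all in-progress and terminal states.
FAILED = 0
BOOTSTRAPPING, WAITING_FOR_INPUT, TRANSFERRING, SUCCESS = 1, 2, 3, 4


def next_status(current: int, incoming: int) -> int:
    """Failure is sticky and immediate; otherwise status only increases."""
    if incoming == FAILED:
        return FAILED
    return max(current, incoming)


assert next_status(WAITING_FOR_INPUT, BOOTSTRAPPING) == WAITING_FOR_INPUT  # stale update ignored
assert next_status(TRANSFERRING, FAILED) == FAILED                         # failure always wins
```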
```diff
@@ -442,15 +626,52 @@ class MooncakeKVManager(BaseKVManager):
         }

         try:
-            response = requests.put(url, json=payload)
+            response = requests.put(url, json=payload, timeout=5)
             if response.status_code == 200:
                 logger.debug("Prefill successfully registered to bootstrap server.")
             else:
                 logger.error(
-                    f"Prefill
+                    f"Prefill instance failed to connect to bootstrap server: {response.status_code}, {response.text}"
                 )
         except Exception as e:
-            logger.error(
+            logger.error(
+                f"Prefill instance failed to register to bootstrap server: {e}"
+            )
+
+    def _handle_node_failure(self, failed_bootstrap_addr):
+        with self.connection_lock:
+            keys_to_remove = [
+                k for k in self.connection_pool if k.startswith(failed_bootstrap_addr)
+            ]
+            for k in keys_to_remove:
+                del self.connection_pool[k]
+            if failed_bootstrap_addr in self.prefill_tp_size_table:
+                del self.prefill_tp_size_table[failed_bootstrap_addr]
+            if failed_bootstrap_addr in self.prefill_dp_size_table:
+                del self.prefill_dp_size_table[failed_bootstrap_addr]
+
+            possible_affected_rooms = self.addr_to_rooms_tracker.get(
+                failed_bootstrap_addr, []
+            )
+            if failed_bootstrap_addr in self.addr_to_rooms_tracker:
+                del self.addr_to_rooms_tracker[failed_bootstrap_addr]
+
+        # Report the requests associated with the failed bootstrap addr and mark their status as KVPoll.Failed
+        affected_rooms = []
+        for room in possible_affected_rooms:
+            if (
+                room in self.request_status
+                and self.check_status(room) != KVPoll.Success
+            ):
+                self.record_failure(
+                    room,
+                    f"Losing connection with prefill instance (bootstrap_addr: {failed_bootstrap_addr})",
+                )
+                self.update_status(room, KVPoll.Failed)
+                affected_rooms.append(room)
+        logger.error(
+            f"Losing connection with prefill instance (bootstrap_addr: {failed_bootstrap_addr}), affected {len(affected_rooms)} requests"
+        )


 class MooncakeKVSender(BaseKVSender):
```
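Both the registration and info-fetch calls now pass an explicit `timeout` to `requests`; without one, `requests` can block indefinitely on a dead peer, which would wedge these background threads. For reference (the URL below is a hypothetical example), a scalar timeout caps the connect phase and each read separately, while a tuple sets the two independently:

```python
import requests

try:
    # Scalar: a 5 s cap applied to the connect and to each socket read.
    requests.put("http://bootstrap.example:8080/route", json={}, timeout=5)
    # Tuple: 2 s to connect, 3 s per read, as the heartbeat probe uses.
    requests.get("http://bootstrap.example:8080/health", timeout=(2, 3))
except requests.RequestException as e:
    print(f"bootstrap unreachable: {e}")
```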
```diff
@@ -463,18 +684,24 @@ class MooncakeKVSender(BaseKVSender):
         self.kv_mgr.update_status(bootstrap_room, KVPoll.Bootstrapping)
         self.aux_index = None
         self.bootstrap_server_url = bootstrap_addr
-        self.
+        self.conclude_state = None
+        self.init_time = None
+        # inner state
+        self.curr_idx = 0

     def init(self, num_kv_indices: int, aux_index: Optional[int] = None):
         self.num_kv_indices = num_kv_indices
         self.aux_index = aux_index
+        self.init_time = time.time()

     def send(
         self,
         kv_indices: npt.NDArray[np.int64],
-        index_slice: slice,
-        is_last: bool,
     ):
+        index_slice = slice(self.curr_idx, self.curr_idx + len(kv_indices))
+        self.curr_idx += len(kv_indices)
+        is_last = self.curr_idx == self.num_kv_indices
+
         if not is_last:
             self.kv_mgr.add_transfer_request(
                 self.bootstrap_room, kv_indices, index_slice, False
```
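`send()` no longer takes `index_slice` and `is_last` from the caller; the sender derives them from a running `curr_idx` and marks the final chunk when the cumulative count reaches `num_kv_indices`. A self-contained sketch of that bookkeeping (plain lists stand in for the numpy index arrays; the class name is illustrative):

```python
class ChunkCursor:
    """Track how much of a fixed-size transfer has been enqueued so far."""

    def __init__(self, num_kv_indices: int):
        self.num_kv_indices = num_kv_indices
        self.curr_idx = 0

    def next_chunk(self, kv_indices: list):
        """Return (slice into the full index array, whether this chunk is last)."""
        index_slice = slice(self.curr_idx, self.curr_idx + len(kv_indices))
        self.curr_idx += len(kv_indices)
        is_last = self.curr_idx == self.num_kv_indices
        return index_slice, is_last


cursor = ChunkCursor(num_kv_indices=5)
assert cursor.next_chunk([0, 1, 2]) == (slice(0, 3), False)
assert cursor.next_chunk([3, 4]) == (slice(3, 5), True)  # final chunk detected
```

This moves chunk bookkeeping out of the caller and into the sender, so callers of `send()` only hand over the next batch of indices.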
```diff
@@ -489,10 +716,42 @@ class MooncakeKVSender(BaseKVSender):
             )

     def poll(self) -> KVPoll:
-
+        if self.conclude_state is None:
+            status = self.kv_mgr.check_status(self.bootstrap_room)
+            if status in (KVPoll.Success, KVPoll.Failed):
+                self.conclude_state = status
+            elif status == KVPoll.Bootstrapping:
+                if self.init_time is not None:
+                    now = time.time()
+                    elapsed = now - self.init_time
+                    if elapsed >= self.kv_mgr.bootstrap_time_out:
+                        self.kv_mgr.record_failure(
+                            self.bootstrap_room,
+                            f"Request {self.bootstrap_room} timed out after {elapsed:.1f}s in KVPoll.Bootstrapping",
+                        )
+                        self.conclude_state = KVPoll.Failed
+                        return KVPoll.Failed
+
+            return status
+        else:
+            return self.conclude_state
+
+    def clear(self) -> None:
+        if self.bootstrap_room in self.kv_mgr.request_status:
+            self.kv_mgr.request_status.pop(self.bootstrap_room)

     def failure_exception(self):
-
+        self.clear()
+
+        # Explicitly set the status to failure since this request has failed in another rank
+        if self.conclude_state is None:
+            self.conclude_state = KVPoll.Failed
+
+        with self.kv_mgr.failure_lock:
+            failure_reason = self.kv_mgr.failure_records.pop(
+                self.bootstrap_room, "Failed due to an unknown reason from another rank"
+            )
+        raise KVTransferError(self.bootstrap_room, failure_reason)
```
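Timeout detection lives in `poll()` rather than in a separate timer thread: each poll while still `Bootstrapping` compares the elapsed time since `init()` against the manager's `bootstrap_time_out` and concludes the request as failed once exceeded. A minimal sketch of polling-side timeout, with string states and a hypothetical `TIMEOUT_S` standing in for `bootstrap_time_out`:

```python
import time

TIMEOUT_S = 30.0  # stand-in for kv_mgr.bootstrap_time_out


class BootstrapPoller:
    def __init__(self):
        self.init_time = time.time()
        self.concluded = None  # latched terminal state

    def poll(self, status: str) -> str:
        if self.concluded is not None:
            return self.concluded  # terminal states are sticky
        if status == "bootstrapping" and time.time() - self.init_time >= TIMEOUT_S:
            self.concluded = "failed"  # conclude once, report forever after
            return "failed"
        if status in ("success", "failed"):
            self.concluded = status
        return status
```

Latching the concluded state keeps later polls cheap and stable even after the manager-side status entry is cleaned up by `clear()`.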
```diff
@@ -506,22 +765,31 @@ class MooncakeKVReceiver(BaseKVReceiver):
         mgr: MooncakeKVManager,
         bootstrap_addr: str,
         bootstrap_room: Optional[int] = None,
+        data_parallel_rank: Optional[int] = None,
     ):
         self.bootstrap_room = bootstrap_room
         self.bootstrap_addr = bootstrap_addr
         self.kv_mgr = mgr
         self.session_id = self.kv_mgr.get_session_id()
-        self.kv_mgr.update_status(bootstrap_room, KVPoll.Bootstrapping)
+        self.kv_mgr.update_status(self.bootstrap_room, KVPoll.Bootstrapping)
+        self.conclude_state = None
+        self.data_parallel_rank = data_parallel_rank

         if self.bootstrap_addr not in self.kv_mgr.prefill_dp_size_table:
             self.prefill_tp_size, self.prefill_dp_size = (
-                self.
+                self._get_prefill_parallel_info_from_server()
             )
             if self.prefill_tp_size is None or self.prefill_dp_size is None:
-
-
+                self.kv_mgr.record_failure(
+                    self.bootstrap_room,
+                    f"Could not fetch prefill parallel info from bootstrap_addr: {self.bootstrap_addr}",
                 )
+                self.kv_mgr.update_status(self.bootstrap_room, KVPoll.Failed)
+                return
             else:
+                logger.debug(
+                    f"Fetch prefill parallel info from [{self.bootstrap_addr}]: DP size:{self.prefill_dp_size}, TP size:{self.prefill_tp_size}"
+                )
                 self.kv_mgr.prefill_tp_size_table[self.bootstrap_addr] = (
                     self.prefill_tp_size
                 )
```
```diff
@@ -579,7 +847,11 @@ class MooncakeKVReceiver(BaseKVReceiver):
             self.target_tp_rank = self.target_tp_ranks[0]
             self.required_dst_info_num = 1

-        self.
+        if self.data_parallel_rank is not None:
+            logger.debug(f"Targeting DP rank: {self.data_parallel_rank}")
+            self.target_dp_group = self.data_parallel_rank
+        else:
+            self.target_dp_group = bootstrap_room % self.prefill_dp_size

         # NOTE: key distinguished by bootstrap_addr, target_dp_group, and target_tp_rank
         bootstrap_key = (
```
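A receiver can now pin a specific prefill data-parallel group via the new `data_parallel_rank` argument; otherwise the group is still picked by taking the room id modulo `prefill_dp_size`. The selection rule in isolation (function name is illustrative):

```python
from typing import Optional


def pick_dp_group(bootstrap_room: int, prefill_dp_size: int,
                  data_parallel_rank: Optional[int] = None) -> int:
    """An explicit DP rank wins; otherwise spread rooms across groups by modulo."""
    if data_parallel_rank is not None:
        return data_parallel_rank
    return bootstrap_room % prefill_dp_size


assert pick_dp_group(10, 4) == 2                          # hashed placement
assert pick_dp_group(10, 4, data_parallel_rank=1) == 1    # pinned placement
```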
```diff
@@ -599,32 +871,35 @@ class MooncakeKVReceiver(BaseKVReceiver):
                         target_tp_rank == self.target_tp_rank
                         or self.target_tp_rank is None
                     )
+                    logger.debug(
+                        f"Fetched bootstrap info: {bootstrap_info} for DP {self.target_dp_group} TP {target_tp_rank}"
+                    )
                     bootstrap_infos.append(bootstrap_info)
                 else:
-
-
+                    self.kv_mgr.record_failure(
+                        self.bootstrap_room,
+                        f"Could not fetch bootstrap info for engine rank: {self.kv_mgr.kv_args.engine_rank} and target_dp_group: {self.target_dp_group}",
                     )
+                    self.kv_mgr.update_status(self.bootstrap_room, KVPoll.Failed)
+                    return
+
             self.bootstrap_infos = bootstrap_infos
+            self.kv_mgr.connection_pool[bootstrap_key] = self.bootstrap_infos

-
-
-                    f"Could not fetch bootstrap info for engine rank: {self.kv_mgr.kv_args.engine_rank}"
-                )
-            else:
-                self.kv_mgr.connection_pool[bootstrap_key] = self.bootstrap_infos
-                # Register kv_args only once to prefill KVManager according to the info fetched from the bootstrap server
-                self._register_kv_args()
+            # Register kv_args only once to prefill KVManager according to the info fetched from the bootstrap server
+            self._register_kv_args()
         else:
             self.bootstrap_infos = self.kv_mgr.connection_pool[bootstrap_key]

         assert len(self.bootstrap_infos) > 0
-        self.kv_mgr.
+        self.kv_mgr.addr_to_rooms_tracker[self.bootstrap_addr].add(self.bootstrap_room)
+        self.kv_mgr.update_status(self.bootstrap_room, KVPoll.WaitingForInput)

     def _get_bootstrap_info_from_server(self, engine_rank, target_dp_group):
         """Fetch the bootstrap info from the bootstrap server."""
         try:
             url = f"http://{self.bootstrap_addr}/route?engine_rank={engine_rank}&target_dp_group={target_dp_group}"
-            response = requests.get(url)
+            response = requests.get(url, timeout=5)
             if response.status_code == 200:
                 bootstrap_info = response.json()
                 return bootstrap_info
```
```diff
@@ -637,7 +912,7 @@ class MooncakeKVReceiver(BaseKVReceiver):
             logger.error(f"Error fetching prefill info from bootstrap: {e}")
             return None

-    def
+    def _get_prefill_parallel_info_from_server(self) -> Tuple[int, int]:
         """Fetch the prefill parallel info from the bootstrap server."""
         try:
             url = f"http://{self.bootstrap_addr}/route?engine_rank={-1}&target_dp_group={-1}"
```
```diff
@@ -651,10 +926,10 @@ class MooncakeKVReceiver(BaseKVReceiver):
                 logger.error(
                     f"Failed to get prefill parallel info: {response.status_code}, {response.text}"
                 )
-                return None
+                return None, None
         except Exception as e:
             logger.error(f"Error fetching prefill parallel info from bootstrap: {e}")
-            return None
+            return None, None

     def _register_kv_args(self):
         for bootstrap_info in self.bootstrap_infos:
```
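The `return None` to `return None, None` change matters because the caller unpacks the result into two names (`self.prefill_tp_size, self.prefill_dp_size = ...`): a bare `None` would raise `TypeError` at the unpack site instead of flowing into the `is None` checks. A toy illustration:

```python
def fetch_parallel_info(ok: bool):
    # Returning a pair keeps an unpacking caller on the graceful error path.
    return (8, 2) if ok else (None, None)


tp, dp = fetch_parallel_info(ok=False)
if tp is None or dp is None:
    print("could not fetch prefill parallel info")  # graceful failure

# With `return None` instead, the unpack itself would blow up:
#   tp, dp = None
#   TypeError: cannot unpack non-iterable NoneType object
```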
```diff
@@ -696,9 +971,6 @@ class MooncakeKVReceiver(BaseKVReceiver):
             self.prefill_server_url = (
                 f"{bootstrap_info['rank_ip']}:{bootstrap_info['rank_port']}"
             )
-            logger.debug(
-                f"Fetched bootstrap info: {bootstrap_info} for engine rank: {self.kv_mgr.kv_args.engine_rank}"
-            )
             is_dummy = bootstrap_info["is_dummy"]

             sock, lock = self._connect("tcp://" + self.prefill_server_url)
```
```diff
@@ -716,10 +988,31 @@ class MooncakeKVReceiver(BaseKVReceiver):
         )

     def poll(self) -> KVPoll:
-
+        if self.conclude_state is None:
+            status = self.kv_mgr.check_status(self.bootstrap_room)
+            if status in (KVPoll.Success, KVPoll.Failed):
+                self.conclude_state = status
+
+            return status
+        else:
+            return self.conclude_state
+
+    def clear(self) -> None:
+        if self.bootstrap_room in self.kv_mgr.request_status:
+            self.kv_mgr.request_status.pop(self.bootstrap_room)

     def failure_exception(self):
-
+        self.clear()
+
+        # Explicitly set the status to failure since this request has failed in another rank
+        if self.conclude_state is None:
+            self.conclude_state = KVPoll.Failed
+
+        with self.kv_mgr.failure_lock:
+            failure_reason = self.kv_mgr.failure_records.pop(
+                self.bootstrap_room, "Failed due to an unknown reason from another rank"
+            )
+        raise KVTransferError(self.bootstrap_room, failure_reason)


 class MooncakeKVBootstrapServer(BaseKVBootstrapServer):
```
```diff
@@ -743,6 +1036,10 @@ class MooncakeKVBootstrapServer(BaseKVBootstrapServer):

     def _setup_routes(self):
         self.app.router.add_route("*", "/route", self._handle_route)
+        self.app.router.add_get("/health", self._handle_health_check)
+
+    async def _handle_health_check(self, request):
+        return web.Response(text="OK", status=200)

     async def _handle_route(self, request: web.Request):
         method = request.method
```
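The bootstrap server gains the `/health` route that the decode-side `heartbeat_checker` above probes. A self-contained aiohttp server exposing the same kind of liveness endpoint (the app and port here are arbitrary, not sglang's):

```python
from aiohttp import web


async def handle_health_check(request: web.Request) -> web.Response:
    # Liveness only: a 200 means the event loop is up and serving requests.
    return web.Response(text="OK", status=200)


app = web.Application()
app.router.add_get("/health", handle_health_check)

if __name__ == "__main__":
    web.run_app(app, port=8080)  # probe with: curl http://localhost:8080/health
```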
```diff
@@ -771,14 +1068,14 @@ class MooncakeKVBootstrapServer(BaseKVBootstrapServer):
         self.dp_size = dp_size

         tp_size_per_dp_rank = tp_size // dp_size
-        if self.tp_size_per_dp_rank
+        if self.tp_size_per_dp_rank is None:
             self.tp_size_per_dp_rank = tp_size_per_dp_rank

-        # Add lock to make sure thread-safe
         if role == "Prefill":
             dp_group = engine_rank // tp_size_per_dp_rank
             tp_rank_in_dp_group = engine_rank % tp_size_per_dp_rank

+            # Add lock to make sure thread-safe
             async with self.lock:
                 if dp_group not in self.prefill_port_table:
                     self.prefill_port_table[dp_group] = {}
```
```diff
@@ -788,7 +1085,7 @@ class MooncakeKVBootstrapServer(BaseKVBootstrapServer):
                     "rank_port": rank_port,
                 }
                 logger.debug(
-                    f"Register
+                    f"Register prefill bootstrap: {engine_rank} with rank_ip: {rank_ip} and rank_port: {rank_port}"
                 )

         return web.Response(text="OK", status=200)
```
```diff
@@ -824,7 +1121,11 @@ class MooncakeKVBootstrapServer(BaseKVBootstrapServer):
         self._loop = asyncio.new_event_loop()
         asyncio.set_event_loop(self._loop)

-        self._runner = web.AppRunner(self.app)
+        access_log = None
+        if logging.getLogger(__name__).getEffectiveLevel() <= logging.DEBUG:
+            access_log = self.app.logger
+
+        self._runner = web.AppRunner(self.app, access_log=access_log)
         self._loop.run_until_complete(self._runner.setup())

         site = web.TCPSite(self._runner, port=self.port)
```