sglang 0.4.6.post5__py3-none-any.whl → 0.4.7.post1__py3-none-any.whl
This diff shows the changes between two publicly released versions of the package, as published to a supported registry. It is provided for informational purposes only.
- sglang/__init__.py +2 -0
- sglang/api.py +7 -0
- sglang/bench_offline_throughput.py +10 -4
- sglang/bench_one_batch_server.py +67 -11
- sglang/bench_serving.py +86 -75
- sglang/lang/backend/runtime_endpoint.py +24 -1
- sglang/lang/interpreter.py +40 -1
- sglang/lang/ir.py +27 -0
- sglang/math_utils.py +8 -0
- sglang/profiler.py +167 -0
- sglang/srt/_custom_ops.py +34 -0
- sglang/srt/configs/internvl.py +8 -12
- sglang/srt/configs/model_config.py +33 -1
- sglang/srt/constrained/base_grammar_backend.py +5 -2
- sglang/srt/constrained/llguidance_backend.py +9 -8
- sglang/srt/constrained/outlines_backend.py +5 -4
- sglang/srt/constrained/xgrammar_backend.py +18 -18
- sglang/srt/conversation.py +52 -8
- sglang/srt/custom_op.py +38 -3
- sglang/srt/debug_utils.py +74 -0
- sglang/srt/disaggregation/base/__init__.py +1 -1
- sglang/srt/disaggregation/base/conn.py +25 -11
- sglang/srt/disaggregation/common/__init__.py +5 -0
- sglang/srt/disaggregation/common/conn.py +407 -0
- sglang/srt/disaggregation/common/utils.py +42 -0
- sglang/srt/disaggregation/decode.py +261 -52
- sglang/srt/disaggregation/fake/__init__.py +1 -1
- sglang/srt/disaggregation/fake/conn.py +16 -9
- sglang/srt/disaggregation/kv_events.py +60 -5
- sglang/srt/disaggregation/launch_lb.py +140 -0
- sglang/srt/disaggregation/mini_lb.py +29 -48
- sglang/srt/disaggregation/mooncake/__init__.py +1 -1
- sglang/srt/disaggregation/mooncake/conn.py +446 -149
- sglang/srt/disaggregation/mooncake/transfer_engine.py +32 -16
- sglang/srt/disaggregation/nixl/__init__.py +6 -1
- sglang/srt/disaggregation/nixl/conn.py +134 -437
- sglang/srt/disaggregation/prefill.py +130 -43
- sglang/srt/disaggregation/utils.py +127 -86
- sglang/srt/distributed/device_communicators/pymscclpp.py +315 -0
- sglang/srt/distributed/parallel_state.py +52 -5
- sglang/srt/entrypoints/EngineBase.py +6 -0
- sglang/srt/entrypoints/engine.py +116 -5
- sglang/srt/entrypoints/http_server.py +28 -4
- sglang/srt/eplb_simulator/__init__.py +1 -0
- sglang/srt/eplb_simulator/reader.py +51 -0
- sglang/srt/function_call/base_format_detector.py +138 -86
- sglang/srt/function_call/deepseekv3_detector.py +54 -6
- sglang/srt/function_call/ebnf_composer.py +33 -19
- sglang/srt/function_call/function_call_parser.py +27 -0
- sglang/srt/function_call/llama32_detector.py +33 -14
- sglang/srt/function_call/mistral_detector.py +73 -26
- sglang/srt/function_call/pythonic_detector.py +86 -20
- sglang/srt/function_call/qwen25_detector.py +64 -10
- sglang/srt/function_call/utils.py +17 -0
- sglang/srt/hf_transformers_utils.py +4 -0
- sglang/srt/layers/activation.py +19 -0
- sglang/srt/layers/attention/aiter_backend.py +503 -125
- sglang/srt/layers/attention/base_attn_backend.py +4 -0
- sglang/srt/layers/attention/cutlass_mla_backend.py +40 -34
- sglang/srt/layers/attention/flashattention_backend.py +137 -63
- sglang/srt/layers/attention/flashinfer_backend.py +46 -3
- sglang/srt/layers/attention/flashinfer_mla_backend.py +59 -25
- sglang/srt/layers/attention/flashmla_backend.py +2 -10
- sglang/srt/layers/attention/intel_amx_backend.py +128 -0
- sglang/srt/layers/attention/tbo_backend.py +232 -0
- sglang/srt/layers/attention/torch_native_backend.py +3 -0
- sglang/srt/layers/attention/triton_backend.py +304 -65
- sglang/srt/layers/attention/triton_ops/decode_attention.py +2 -7
- sglang/srt/layers/attention/triton_ops/extend_attention.py +12 -4
- sglang/srt/layers/attention/vision.py +51 -24
- sglang/srt/layers/communicator.py +281 -197
- sglang/srt/layers/dp_attention.py +6 -5
- sglang/srt/layers/layernorm.py +30 -19
- sglang/srt/layers/linear.py +0 -4
- sglang/srt/layers/logits_processor.py +0 -12
- sglang/srt/layers/moe/cutlass_moe.py +170 -7
- sglang/srt/layers/moe/cutlass_moe_params.py +169 -0
- sglang/srt/layers/moe/ep_moe/kernels.py +33 -11
- sglang/srt/layers/moe/ep_moe/layer.py +136 -72
- sglang/srt/layers/moe/ep_moe/token_dispatcher.py +24 -45
- sglang/srt/layers/moe/fused_moe_native.py +4 -0
- sglang/srt/layers/moe/fused_moe_triton/configs/triton_3_2_0/E=257,N=128,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8,block_shape=[128, 128].json +146 -0
- sglang/srt/layers/moe/fused_moe_triton/configs/triton_3_2_0/E=257,N=128,device_name=NVIDIA_H20,dtype=fp8_w8a8,block_shape=[128, 128].json +146 -0
- sglang/srt/layers/moe/fused_moe_triton/configs/triton_3_2_0/E=257,N=256,device_name=NVIDIA_B200,dtype=fp8_w8a8,block_shape=[128, 128].json +146 -0
- sglang/srt/layers/moe/fused_moe_triton/configs/triton_3_2_0/E=257,N=256,device_name=NVIDIA_H20,dtype=fp8_w8a8,block_shape=[128, 128].json +146 -0
- sglang/srt/layers/moe/fused_moe_triton/configs/triton_3_2_0/E=257,N=256,device_name=NVIDIA_H200,dtype=fp8_w8a8,block_shape=[128, 128].json +146 -0
- sglang/srt/layers/moe/fused_moe_triton/configs/triton_3_2_0/E=8,N=7168,device_name=NVIDIA_H100_80GB_HBM3.json +146 -0
- sglang/srt/layers/moe/fused_moe_triton/configs/triton_3_3_1/E=257,N=256,device_name=NVIDIA_B200,dtype=fp8_w8a8,block_shape=[128, 128].json +146 -0
- sglang/srt/layers/moe/fused_moe_triton/configs/triton_3_3_1/E=8,N=7168,device_name=NVIDIA_H100_80GB_HBM3.json +146 -0
- sglang/srt/layers/moe/fused_moe_triton/fused_moe.py +221 -29
- sglang/srt/layers/moe/fused_moe_triton/layer.py +34 -4
- sglang/srt/layers/moe/topk.py +60 -26
- sglang/srt/layers/multimodal.py +3 -3
- sglang/srt/layers/pooler.py +56 -0
- sglang/srt/layers/quantization/__init__.py +3 -2
- sglang/srt/layers/quantization/blockwise_int8.py +3 -0
- sglang/srt/layers/quantization/compressed_tensors/compressed_tensors_moe.py +5 -0
- sglang/srt/layers/quantization/deep_gemm_wrapper/__init__.py +1 -0
- sglang/srt/layers/quantization/{deep_gemm.py → deep_gemm_wrapper/compile_utils.py} +69 -127
- sglang/srt/layers/quantization/deep_gemm_wrapper/configurer.py +32 -0
- sglang/srt/layers/quantization/deep_gemm_wrapper/entrypoint.py +110 -0
- sglang/srt/layers/quantization/fp8.py +28 -23
- sglang/srt/layers/quantization/fp8_kernel.py +156 -75
- sglang/srt/layers/quantization/fp8_utils.py +250 -69
- sglang/srt/layers/quantization/modelopt_quant.py +334 -7
- sglang/srt/layers/quantization/moe_wna16.py +3 -0
- sglang/srt/layers/quantization/w8a8_fp8.py +3 -0
- sglang/srt/layers/quantization/w8a8_int8.py +3 -0
- sglang/srt/layers/radix_attention.py +2 -3
- sglang/srt/layers/rotary_embedding.py +6 -12
- sglang/srt/layers/sampler.py +80 -79
- sglang/srt/layers/utils.py +6 -0
- sglang/srt/lora/layers.py +12 -15
- sglang/srt/lora/lora.py +49 -5
- sglang/srt/lora/lora_manager.py +98 -39
- sglang/srt/lora/mem_pool.py +28 -21
- sglang/srt/lora/utils.py +17 -13
- sglang/srt/managers/cache_controller.py +2 -1
- sglang/srt/managers/data_parallel_controller.py +13 -5
- sglang/srt/managers/eplb_algorithms/__init__.py +63 -0
- sglang/srt/managers/eplb_algorithms/deepseek.py +223 -0
- sglang/srt/managers/{deepseek_eplb.py → eplb_algorithms/deepseek_vec.py} +5 -7
- sglang/srt/managers/eplb_manager.py +55 -14
- sglang/srt/managers/expert_distribution.py +220 -46
- sglang/srt/managers/expert_location.py +110 -56
- sglang/srt/managers/expert_location_dispatch.py +23 -6
- sglang/srt/managers/io_struct.py +43 -8
- sglang/srt/managers/mm_utils.py +88 -38
- sglang/srt/managers/multimodal_processors/base_processor.py +190 -18
- sglang/srt/managers/multimodal_processors/gemma3.py +4 -31
- sglang/srt/managers/multimodal_processors/internvl.py +4 -0
- sglang/srt/managers/multimodal_processors/kimi_vl.py +15 -34
- sglang/srt/managers/multimodal_processors/minicpm.py +2 -1
- sglang/srt/managers/multimodal_processors/phi4mm.py +87 -0
- sglang/srt/managers/multimodal_processors/qwen_vl.py +22 -64
- sglang/srt/managers/multimodal_processors/vila.py +85 -0
- sglang/srt/managers/schedule_batch.py +173 -38
- sglang/srt/managers/scheduler.py +376 -127
- sglang/srt/managers/tokenizer_manager.py +163 -19
- sglang/srt/managers/utils.py +0 -4
- sglang/srt/mem_cache/chunk_cache.py +1 -0
- sglang/srt/mem_cache/hiradix_cache.py +4 -2
- sglang/srt/mem_cache/memory_pool.py +111 -407
- sglang/srt/mem_cache/memory_pool_host.py +380 -0
- sglang/srt/mem_cache/radix_cache.py +36 -12
- sglang/srt/metrics/collector.py +9 -0
- sglang/srt/model_executor/cuda_graph_runner.py +191 -113
- sglang/srt/model_executor/expert_location_updater.py +157 -22
- sglang/srt/model_executor/forward_batch_info.py +52 -22
- sglang/srt/model_executor/model_runner.py +102 -62
- sglang/srt/model_loader/loader.py +8 -1
- sglang/srt/model_loader/utils.py +67 -1
- sglang/srt/models/bert.py +113 -13
- sglang/srt/models/deepseek_nextn.py +1 -1
- sglang/srt/models/deepseek_v2.py +623 -290
- sglang/srt/models/gemma3_causal.py +7 -0
- sglang/srt/models/gemma3_mm.py +19 -14
- sglang/srt/models/idefics2.py +342 -0
- sglang/srt/models/internvl.py +46 -102
- sglang/srt/models/kimi_vl.py +4 -4
- sglang/srt/models/llama.py +1 -1
- sglang/srt/models/minicpmo.py +2 -5
- sglang/srt/models/minicpmv.py +3 -295
- sglang/srt/models/phi4mm.py +512 -0
- sglang/srt/models/qwen2.py +38 -9
- sglang/srt/models/qwen2_5_vl.py +3 -9
- sglang/srt/models/qwen2_eagle.py +4 -1
- sglang/srt/models/qwen2_moe.py +58 -191
- sglang/srt/models/qwen2_vl.py +3 -9
- sglang/srt/models/qwen3.py +41 -10
- sglang/srt/models/qwen3_moe.py +230 -191
- sglang/srt/models/registry.py +9 -1
- sglang/srt/models/roberta.py +117 -9
- sglang/srt/models/transformers.py +291 -0
- sglang/srt/models/vila.py +305 -0
- sglang/srt/openai_api/adapter.py +248 -28
- sglang/srt/openai_api/protocol.py +68 -3
- sglang/srt/openai_api/utils.py +172 -0
- sglang/srt/operations.py +37 -2
- sglang/srt/operations_strategy.py +200 -24
- sglang/srt/sampling/sampling_batch_info.py +37 -1
- sglang/srt/sampling/sampling_params.py +4 -1
- sglang/srt/server_args.py +381 -209
- sglang/srt/speculative/build_eagle_tree.py +9 -9
- sglang/srt/speculative/eagle_draft_cuda_graph_runner.py +12 -14
- sglang/srt/speculative/eagle_draft_extend_cuda_graph_runner.py +256 -0
- sglang/srt/speculative/eagle_utils.py +440 -200
- sglang/srt/speculative/eagle_worker.py +234 -63
- sglang/srt/two_batch_overlap.py +637 -0
- sglang/srt/utils.py +187 -7
- sglang/test/attention/test_prefix_chunk_info.py +2 -0
- sglang/test/runners.py +54 -10
- sglang/test/send_one.py +4 -0
- sglang/test/test_block_fp8.py +1 -0
- sglang/test/test_block_fp8_deep_gemm_blackwell.py +252 -0
- sglang/test/test_block_fp8_ep.py +1 -0
- sglang/test/test_cutlass_moe.py +3 -3
- sglang/test/test_fp4_moe.py +248 -0
- sglang/test/test_utils.py +82 -7
- sglang/utils.py +9 -0
- sglang/version.py +1 -1
- {sglang-0.4.6.post5.dist-info → sglang-0.4.7.post1.dist-info}/METADATA +17 -14
- {sglang-0.4.6.post5.dist-info → sglang-0.4.7.post1.dist-info}/RECORD +359 -321
- {sglang-0.4.6.post5.dist-info → sglang-0.4.7.post1.dist-info}/WHEEL +1 -1
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=1,N=14336,device_name=NVIDIA_A100-SXM4-80GB,dtype=int8_w8a16.json → triton_3_1_0/E=1,N=14336,device_name=NVIDIA_A100-SXM4-80GB,dtype=int8_w8a16.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=1,N=14336,device_name=NVIDIA_A100-SXM4-80GB.json → triton_3_1_0/E=1,N=14336,device_name=NVIDIA_A100-SXM4-80GB.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=1,N=1792,device_name=NVIDIA_A100-SXM4-80GB,dtype=int8_w8a16.json → triton_3_1_0/E=1,N=1792,device_name=NVIDIA_A100-SXM4-80GB,dtype=int8_w8a16.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=1,N=1792,device_name=NVIDIA_A100-SXM4-80GB.json → triton_3_1_0/E=1,N=1792,device_name=NVIDIA_A100-SXM4-80GB.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=1,N=3072,device_name=NVIDIA_A100-SXM4-80GB,dtype=int8_w8a16.json → triton_3_1_0/E=1,N=3072,device_name=NVIDIA_A100-SXM4-80GB,dtype=int8_w8a16.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=1,N=3072,device_name=NVIDIA_H100_80GB_HBM3,dtype=int8_w8a16.json → triton_3_1_0/E=1,N=3072,device_name=NVIDIA_H100_80GB_HBM3,dtype=int8_w8a16.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=1,N=3072,device_name=NVIDIA_H100_80GB_HBM3.json → triton_3_1_0/E=1,N=3072,device_name=NVIDIA_H100_80GB_HBM3.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=1,N=3584,device_name=NVIDIA_A100-SXM4-80GB,dtype=int8_w8a16.json → triton_3_1_0/E=1,N=3584,device_name=NVIDIA_A100-SXM4-80GB,dtype=int8_w8a16.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=1,N=3584,device_name=NVIDIA_A100-SXM4-80GB.json → triton_3_1_0/E=1,N=3584,device_name=NVIDIA_A100-SXM4-80GB.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=1,N=7168,device_name=NVIDIA_A100-SXM4-80GB,dtype=int8_w8a16.json → triton_3_1_0/E=1,N=7168,device_name=NVIDIA_A100-SXM4-80GB,dtype=int8_w8a16.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=1,N=7168,device_name=NVIDIA_A100-SXM4-80GB.json → triton_3_1_0/E=1,N=7168,device_name=NVIDIA_A100-SXM4-80GB.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=144,N=512,device_name=NVIDIA_H100_80GB_HBM3.json → triton_3_1_0/E=144,N=512,device_name=NVIDIA_H100_80GB_HBM3.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=16,N=1024,device_name=NVIDIA_H100_80GB_HBM3.json → triton_3_1_0/E=16,N=1024,device_name=NVIDIA_H100_80GB_HBM3.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=16,N=1024,device_name=NVIDIA_H200.json → triton_3_1_0/E=16,N=1024,device_name=NVIDIA_H200.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=16,N=1344,device_name=NVIDIA_A100-SXM4-40GB.json → triton_3_1_0/E=16,N=1344,device_name=NVIDIA_A100-SXM4-40GB.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=16,N=1344,device_name=NVIDIA_A100-SXM4-80GB.json → triton_3_1_0/E=16,N=1344,device_name=NVIDIA_A100-SXM4-80GB.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=16,N=1344,device_name=NVIDIA_H100_80GB_HBM3.json → triton_3_1_0/E=16,N=1344,device_name=NVIDIA_H100_80GB_HBM3.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=16,N=14336,device_name=NVIDIA_A100-SXM4-80GB,dtype=int8_w8a16.json → triton_3_1_0/E=16,N=14336,device_name=NVIDIA_A100-SXM4-80GB,dtype=int8_w8a16.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=16,N=14336,device_name=NVIDIA_A100-SXM4-80GB.json → triton_3_1_0/E=16,N=14336,device_name=NVIDIA_A100-SXM4-80GB.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=16,N=1792,device_name=NVIDIA_A100-SXM4-80GB,dtype=int8_w8a16.json → triton_3_1_0/E=16,N=1792,device_name=NVIDIA_A100-SXM4-80GB,dtype=int8_w8a16.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=16,N=1792,device_name=NVIDIA_A100-SXM4-80GB.json → triton_3_1_0/E=16,N=1792,device_name=NVIDIA_A100-SXM4-80GB.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=16,N=2048,device_name=NVIDIA_H100_80GB_HBM3.json → triton_3_1_0/E=16,N=2048,device_name=NVIDIA_H100_80GB_HBM3.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=16,N=2688,device_name=NVIDIA_A100-SXM4-80GB.json → triton_3_1_0/E=16,N=2688,device_name=NVIDIA_A100-SXM4-80GB.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=16,N=2688,device_name=NVIDIA_H100_80GB_HBM3.json → triton_3_1_0/E=16,N=2688,device_name=NVIDIA_H100_80GB_HBM3.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=16,N=3072,device_name=NVIDIA_A100-SXM4-80GB,dtype=int8_w8a16.json → triton_3_1_0/E=16,N=3072,device_name=NVIDIA_A100-SXM4-80GB,dtype=int8_w8a16.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=16,N=3072,device_name=NVIDIA_H100_80GB_HBM3,dtype=int8_w8a16.json → triton_3_1_0/E=16,N=3072,device_name=NVIDIA_H100_80GB_HBM3,dtype=int8_w8a16.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=16,N=3200,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8.json → triton_3_1_0/E=16,N=3200,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=16,N=3584,device_name=NVIDIA_A100-SXM4-80GB,dtype=int8_w8a16.json → triton_3_1_0/E=16,N=3584,device_name=NVIDIA_A100-SXM4-80GB,dtype=int8_w8a16.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=16,N=3584,device_name=NVIDIA_A100-SXM4-80GB.json → triton_3_1_0/E=16,N=3584,device_name=NVIDIA_A100-SXM4-80GB.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=16,N=6400,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8.json → triton_3_1_0/E=16,N=6400,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=16,N=7168,device_name=NVIDIA_A100-SXM4-80GB,dtype=int8_w8a16.json → triton_3_1_0/E=16,N=7168,device_name=NVIDIA_A100-SXM4-80GB,dtype=int8_w8a16.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=16,N=7168,device_name=NVIDIA_A100-SXM4-80GB.json → triton_3_1_0/E=16,N=7168,device_name=NVIDIA_A100-SXM4-80GB.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=16,N=7168,device_name=NVIDIA_H100_80GB_HBM3,dtype=int8_w8a16.json → triton_3_1_0/E=16,N=7168,device_name=NVIDIA_H100_80GB_HBM3,dtype=int8_w8a16.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=16,N=800,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8.json → triton_3_1_0/E=16,N=800,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=160,N=192,device_name=NVIDIA_A800-SXM4-80GB.json → triton_3_1_0/E=160,N=192,device_name=NVIDIA_A800-SXM4-80GB.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=20,N=2048,device_name=NVIDIA_H100_80GB_HBM3.json → triton_3_1_0/E=20,N=2048,device_name=NVIDIA_H100_80GB_HBM3.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=24,N=1024,device_name=NVIDIA_H100_80GB_HBM3.json → triton_3_1_0/E=24,N=1024,device_name=NVIDIA_H100_80GB_HBM3.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=256,N=128,device_name=NVIDIA_A100-SXM4-80GB,dtype=int8_w8a8,block_shape=[128, 128].json → triton_3_1_0/E=256,N=128,device_name=NVIDIA_A100-SXM4-80GB,dtype=int8_w8a8,block_shape=[128, 128].json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=256,N=128,device_name=NVIDIA_A100-SXM4-80GB,dtype=int8_w8a8.json → triton_3_1_0/E=256,N=128,device_name=NVIDIA_A100-SXM4-80GB,dtype=int8_w8a8.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=256,N=128,device_name=NVIDIA_A800-SXM4-80GB,dtype=int8_w8a8,block_shape=[128, 128].json → triton_3_1_0/E=256,N=128,device_name=NVIDIA_A800-SXM4-80GB,dtype=int8_w8a8,block_shape=[128, 128].json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=256,N=128,device_name=NVIDIA_A800-SXM4-80GB,dtype=int8_w8a8.json → triton_3_1_0/E=256,N=128,device_name=NVIDIA_A800-SXM4-80GB,dtype=int8_w8a8.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=256,N=128,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8,block_shape=[128, 128].json → triton_3_1_0/E=256,N=128,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8,block_shape=[128, 128].json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=256,N=128,device_name=NVIDIA_H20,block_shape=[128, 128].json → triton_3_1_0/E=256,N=128,device_name=NVIDIA_H20,block_shape=[128, 128].json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=256,N=128,device_name=NVIDIA_H20,dtype=fp8_w8a8,block_shape=[128, 128].json → triton_3_1_0/E=256,N=128,device_name=NVIDIA_H20,dtype=fp8_w8a8,block_shape=[128, 128].json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=256,N=128,device_name=NVIDIA_L20Y,dtype=fp8_w8a8,block_shape=[128, 128].json → triton_3_1_0/E=256,N=128,device_name=NVIDIA_L20Y,dtype=fp8_w8a8,block_shape=[128, 128].json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=256,N=256,device_name=AMD_Instinct_MI300X,dtype=fp8_w8a8,block_shape=[128, 128].json → triton_3_1_0/E=256,N=256,device_name=AMD_Instinct_MI300X,dtype=fp8_w8a8,block_shape=[128, 128].json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=256,N=256,device_name=AMD_Instinct_MI325X,dtype=fp8_w8a8,block_shape=[128, 128].json → triton_3_1_0/E=256,N=256,device_name=AMD_Instinct_MI325X,dtype=fp8_w8a8,block_shape=[128, 128].json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=256,N=256,device_name=AMD_Radeon_Graphics,dtype=fp8_w8a8,block_shape=[128, 128].json → triton_3_1_0/E=256,N=256,device_name=AMD_Radeon_Graphics,dtype=fp8_w8a8,block_shape=[128, 128].json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=256,N=256,device_name=NVIDIA_B200,dtype=fp8_w8a8,block_shape=[128, 128].json → triton_3_1_0/E=256,N=256,device_name=NVIDIA_B200,dtype=fp8_w8a8,block_shape=[128, 128].json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=256,N=256,device_name=NVIDIA_H20,dtype=int8_w8a8,block_shape=[128, 128].json → triton_3_1_0/E=256,N=256,device_name=NVIDIA_H20,dtype=int8_w8a8,block_shape=[128, 128].json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=256,N=256,device_name=NVIDIA_H200,dtype=fp8_w8a8,block_shape=[128, 128].json → triton_3_1_0/E=256,N=256,device_name=NVIDIA_H200,dtype=fp8_w8a8,block_shape=[128, 128].json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=256,N=64,device_name=NVIDIA_A800-SXM4-80GB.json → triton_3_1_0/E=256,N=64,device_name=NVIDIA_A800-SXM4-80GB.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=256,N=64,device_name=NVIDIA_L20,dtype=int8_w8a8.json → triton_3_1_0/E=256,N=64,device_name=NVIDIA_L20,dtype=int8_w8a8.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=256,N=64,device_name=NVIDIA_L40S,dtype=int8_w8a8.json → triton_3_1_0/E=256,N=64,device_name=NVIDIA_L40S,dtype=int8_w8a8.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=64,N=1024,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8,block_shape=[128, 128].json → triton_3_1_0/E=64,N=1024,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8,block_shape=[128, 128].json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=64,N=1280,device_name=NVIDIA_A100-SXM4-80GB.json → triton_3_1_0/E=64,N=1280,device_name=NVIDIA_A100-SXM4-80GB.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=64,N=1280,device_name=NVIDIA_A800-SXM4-80GB.json → triton_3_1_0/E=64,N=1280,device_name=NVIDIA_A800-SXM4-80GB.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=64,N=1280,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8.json → triton_3_1_0/E=64,N=1280,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=64,N=1280,device_name=NVIDIA_H100_80GB_HBM3.json → triton_3_1_0/E=64,N=1280,device_name=NVIDIA_H100_80GB_HBM3.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=64,N=1280,device_name=NVIDIA_H200,dtype=fp8_w8a8.json → triton_3_1_0/E=64,N=1280,device_name=NVIDIA_H200,dtype=fp8_w8a8.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=64,N=1280,device_name=NVIDIA_H200.json → triton_3_1_0/E=64,N=1280,device_name=NVIDIA_H200.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=64,N=2560,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8.json → triton_3_1_0/E=64,N=2560,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=64,N=2560,device_name=NVIDIA_H200,dtype=fp8_w8a8.json → triton_3_1_0/E=64,N=2560,device_name=NVIDIA_H200,dtype=fp8_w8a8.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=64,N=2560,device_name=NVIDIA_H200.json → triton_3_1_0/E=64,N=2560,device_name=NVIDIA_H200.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=64,N=320,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8.json → triton_3_1_0/E=64,N=320,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=64,N=320,device_name=NVIDIA_H100_80GB_HBM3.json → triton_3_1_0/E=64,N=320,device_name=NVIDIA_H100_80GB_HBM3.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=64,N=320,device_name=NVIDIA_H200,dtype=fp8_w8a8.json → triton_3_1_0/E=64,N=320,device_name=NVIDIA_H200,dtype=fp8_w8a8.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=64,N=320,device_name=NVIDIA_H200.json → triton_3_1_0/E=64,N=320,device_name=NVIDIA_H200.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=64,N=512,device_name=NVIDIA_H200,dtype=fp8_w8a8,block_shape=[128, 128].json → triton_3_1_0/E=64,N=512,device_name=NVIDIA_H200,dtype=fp8_w8a8,block_shape=[128, 128].json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=64,N=640,device_name=NVIDIA_A100-SXM4-80GB.json → triton_3_1_0/E=64,N=640,device_name=NVIDIA_A100-SXM4-80GB.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=64,N=640,device_name=NVIDIA_A800-SXM4-80GB.json → triton_3_1_0/E=64,N=640,device_name=NVIDIA_A800-SXM4-80GB.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=64,N=640,device_name=NVIDIA_GeForce_RTX_4090,dtype=fp8_w8a8.json → triton_3_1_0/E=64,N=640,device_name=NVIDIA_GeForce_RTX_4090,dtype=fp8_w8a8.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=64,N=640,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8.json → triton_3_1_0/E=64,N=640,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=64,N=640,device_name=NVIDIA_H100_80GB_HBM3.json → triton_3_1_0/E=64,N=640,device_name=NVIDIA_H100_80GB_HBM3.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=64,N=640,device_name=NVIDIA_H200,dtype=fp8_w8a8.json → triton_3_1_0/E=64,N=640,device_name=NVIDIA_H200,dtype=fp8_w8a8.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=64,N=640,device_name=NVIDIA_H200.json → triton_3_1_0/E=64,N=640,device_name=NVIDIA_H200.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=8,N=14336,device_name=AMD_Instinct_MI300X.json → triton_3_1_0/E=8,N=14336,device_name=AMD_Instinct_MI300X.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=8,N=14336,device_name=AMD_Instinct_MI325X.json → triton_3_1_0/E=8,N=14336,device_name=AMD_Instinct_MI325X.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=8,N=14336,device_name=AMD_Radeon_Graphics.json → triton_3_1_0/E=8,N=14336,device_name=AMD_Radeon_Graphics.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=8,N=14336,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8.json → triton_3_1_0/E=8,N=14336,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=8,N=14336,device_name=NVIDIA_H200,dtype=fp8_w8a8.json → triton_3_1_0/E=8,N=14336,device_name=NVIDIA_H200,dtype=fp8_w8a8.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=8,N=14336,device_name=NVIDIA_H200.json → triton_3_1_0/E=8,N=14336,device_name=NVIDIA_H200.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=8,N=1792,device_name=AMD_Instinct_MI300X.json → triton_3_1_0/E=8,N=1792,device_name=AMD_Instinct_MI300X.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=8,N=1792,device_name=AMD_Instinct_MI325X.json → triton_3_1_0/E=8,N=1792,device_name=AMD_Instinct_MI325X.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=8,N=1792,device_name=AMD_Radeon_Graphics.json → triton_3_1_0/E=8,N=1792,device_name=AMD_Radeon_Graphics.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=8,N=1792,device_name=NVIDIA_A100-SXM4-40GB.json → triton_3_1_0/E=8,N=1792,device_name=NVIDIA_A100-SXM4-40GB.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=8,N=1792,device_name=NVIDIA_A100-SXM4-80GB.json → triton_3_1_0/E=8,N=1792,device_name=NVIDIA_A100-SXM4-80GB.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=8,N=1792,device_name=NVIDIA_H100_80GB_HBM3.json → triton_3_1_0/E=8,N=1792,device_name=NVIDIA_H100_80GB_HBM3.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=8,N=1792,device_name=NVIDIA_H200,dtype=fp8_w8a8.json → triton_3_1_0/E=8,N=1792,device_name=NVIDIA_H200,dtype=fp8_w8a8.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=8,N=1792,device_name=NVIDIA_H200.json → triton_3_1_0/E=8,N=1792,device_name=NVIDIA_H200.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=8,N=2048,device_name=NVIDIA_A100-SXM4-80GB.json → triton_3_1_0/E=8,N=2048,device_name=NVIDIA_A100-SXM4-80GB.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=8,N=2048,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8.json → triton_3_1_0/E=8,N=2048,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=8,N=2048,device_name=NVIDIA_H100_80GB_HBM3.json → triton_3_1_0/E=8,N=2048,device_name=NVIDIA_H100_80GB_HBM3.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=8,N=2048,device_name=NVIDIA_H200,dtype=fp8_w8a8.json → triton_3_1_0/E=8,N=2048,device_name=NVIDIA_H200,dtype=fp8_w8a8.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=8,N=2048,device_name=NVIDIA_H200.json → triton_3_1_0/E=8,N=2048,device_name=NVIDIA_H200.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=8,N=3584,device_name=AMD_Instinct_MI300X.json → triton_3_1_0/E=8,N=3584,device_name=AMD_Instinct_MI300X.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=8,N=3584,device_name=AMD_Instinct_MI325X.json → triton_3_1_0/E=8,N=3584,device_name=AMD_Instinct_MI325X.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=8,N=3584,device_name=AMD_Radeon_Graphics.json → triton_3_1_0/E=8,N=3584,device_name=AMD_Radeon_Graphics.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=8,N=3584,device_name=NVIDIA_A100-SXM4-40GB.json → triton_3_1_0/E=8,N=3584,device_name=NVIDIA_A100-SXM4-40GB.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=8,N=3584,device_name=NVIDIA_A100-SXM4-80GB.json → triton_3_1_0/E=8,N=3584,device_name=NVIDIA_A100-SXM4-80GB.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=8,N=3584,device_name=NVIDIA_GeForce_RTX_4090,dtype=fp8_w8a8.json → triton_3_1_0/E=8,N=3584,device_name=NVIDIA_GeForce_RTX_4090,dtype=fp8_w8a8.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=8,N=3584,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8.json → triton_3_1_0/E=8,N=3584,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=8,N=3584,device_name=NVIDIA_H100_80GB_HBM3.json → triton_3_1_0/E=8,N=3584,device_name=NVIDIA_H100_80GB_HBM3.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=8,N=3584,device_name=NVIDIA_H200,dtype=fp8_w8a8.json → triton_3_1_0/E=8,N=3584,device_name=NVIDIA_H200,dtype=fp8_w8a8.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=8,N=3584,device_name=NVIDIA_H200.json → triton_3_1_0/E=8,N=3584,device_name=NVIDIA_H200.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=8,N=3584,device_name=NVIDIA_L40S.json → triton_3_1_0/E=8,N=3584,device_name=NVIDIA_L40S.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=8,N=4096,device_name=AMD_Instinct_MI300X,dtype=fp8_w8a8.json → triton_3_1_0/E=8,N=4096,device_name=AMD_Instinct_MI300X,dtype=fp8_w8a8.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=8,N=4096,device_name=AMD_Instinct_MI325X,dtype=fp8_w8a8.json → triton_3_1_0/E=8,N=4096,device_name=AMD_Instinct_MI325X,dtype=fp8_w8a8.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=8,N=4096,device_name=AMD_Radeon_Graphics,dtype=fp8_w8a8.json → triton_3_1_0/E=8,N=4096,device_name=AMD_Radeon_Graphics,dtype=fp8_w8a8.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=8,N=4096,device_name=NVIDIA_A100-SXM4-80GB.json → triton_3_1_0/E=8,N=4096,device_name=NVIDIA_A100-SXM4-80GB.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=8,N=4096,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8.json → triton_3_1_0/E=8,N=4096,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=8,N=4096,device_name=NVIDIA_H100_80GB_HBM3.json → triton_3_1_0/E=8,N=4096,device_name=NVIDIA_H100_80GB_HBM3.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=8,N=4096,device_name=NVIDIA_H200,dtype=fp8_w8a8.json → triton_3_1_0/E=8,N=4096,device_name=NVIDIA_H200,dtype=fp8_w8a8.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=8,N=4096,device_name=NVIDIA_H200.json → triton_3_1_0/E=8,N=4096,device_name=NVIDIA_H200.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=8,N=7168,device_name=AMD_Instinct_MI300X.json → triton_3_1_0/E=8,N=7168,device_name=AMD_Instinct_MI300X.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=8,N=7168,device_name=AMD_Instinct_MI325X.json → triton_3_1_0/E=8,N=7168,device_name=AMD_Instinct_MI325X.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=8,N=7168,device_name=AMD_Radeon_Graphics.json → triton_3_1_0/E=8,N=7168,device_name=AMD_Radeon_Graphics.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=8,N=7168,device_name=NVIDIA_A100-SXM4-80GB.json → triton_3_1_0/E=8,N=7168,device_name=NVIDIA_A100-SXM4-80GB.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=8,N=7168,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8.json → triton_3_1_0/E=8,N=7168,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=8,N=7168,device_name=NVIDIA_H100_80GB_HBM3.json → triton_3_1_0/E=8,N=7168,device_name=NVIDIA_H100_80GB_HBM3.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=8,N=7168,device_name=NVIDIA_H200,dtype=fp8_w8a8.json → triton_3_1_0/E=8,N=7168,device_name=NVIDIA_H200,dtype=fp8_w8a8.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=8,N=7168,device_name=NVIDIA_H200.json → triton_3_1_0/E=8,N=7168,device_name=NVIDIA_H200.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=8,N=8192,device_name=AMD_Instinct_MI300X,dtype=fp8_w8a8.json → triton_3_1_0/E=8,N=8192,device_name=AMD_Instinct_MI300X,dtype=fp8_w8a8.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=8,N=8192,device_name=AMD_Instinct_MI325X,dtype=fp8_w8a8.json → triton_3_1_0/E=8,N=8192,device_name=AMD_Instinct_MI325X,dtype=fp8_w8a8.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=8,N=8192,device_name=AMD_Radeon_Graphics,dtype=fp8_w8a8.json → triton_3_1_0/E=8,N=8192,device_name=AMD_Radeon_Graphics,dtype=fp8_w8a8.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=8,N=8192,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8.json → triton_3_1_0/E=8,N=8192,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=8,N=8192,device_name=NVIDIA_H200,dtype=fp8_w8a8.json → triton_3_1_0/E=8,N=8192,device_name=NVIDIA_H200,dtype=fp8_w8a8.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=128,N=192,device_name=NVIDIA_A800-SXM4-80GB.json → triton_3_2_0/E=128,N=192,device_name=NVIDIA_A800-SXM4-80GB.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=128,N=192,device_name=NVIDIA_H100_80GB_HBM3.json → triton_3_2_0/E=128,N=192,device_name=NVIDIA_H100_80GB_HBM3.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=128,N=192,device_name=NVIDIA_H20.json → triton_3_2_0/E=128,N=192,device_name=NVIDIA_H20.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=128,N=192,device_name=NVIDIA_H200.json → triton_3_2_0/E=128,N=192,device_name=NVIDIA_H200.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=128,N=384,device_name=NVIDIA_H100_80GB_HBM3.json → triton_3_2_0/E=128,N=384,device_name=NVIDIA_H100_80GB_HBM3.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=128,N=384,device_name=NVIDIA_H20,dtype=fp8_w8a8,block_shape=[128, 128].json → triton_3_2_0/E=128,N=384,device_name=NVIDIA_H20,dtype=fp8_w8a8,block_shape=[128, 128].json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=128,N=384,device_name=NVIDIA_H20.json → triton_3_2_0/E=128,N=384,device_name=NVIDIA_H20.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=128,N=384,device_name=NVIDIA_H200,dtype=fp8_w8a8,block_shape=[128, 128].json → triton_3_2_0/E=128,N=384,device_name=NVIDIA_H200,dtype=fp8_w8a8,block_shape=[128, 128].json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=128,N=384,device_name=NVIDIA_H200.json → triton_3_2_0/E=128,N=384,device_name=NVIDIA_H200.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=128,N=512,device_name=NVIDIA_H100_80GB_HBM3.json → triton_3_2_0/E=128,N=512,device_name=NVIDIA_H100_80GB_HBM3.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=128,N=768,device_name=NVIDIA_A800-SXM4-80GB.json → triton_3_2_0/E=128,N=768,device_name=NVIDIA_A800-SXM4-80GB.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=128,N=768,device_name=NVIDIA_H100_80GB_HBM3.json → triton_3_2_0/E=128,N=768,device_name=NVIDIA_H100_80GB_HBM3.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=128,N=768,device_name=NVIDIA_H20,dtype=fp8_w8a8,block_shape=[128, 128].json → triton_3_2_0/E=128,N=768,device_name=NVIDIA_H20,dtype=fp8_w8a8,block_shape=[128, 128].json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=128,N=768,device_name=NVIDIA_H20.json → triton_3_2_0/E=128,N=768,device_name=NVIDIA_H20.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=128,N=768,device_name=NVIDIA_H200,dtype=fp8_w8a8,block_shape=[128, 128].json → triton_3_2_0/E=128,N=768,device_name=NVIDIA_H200,dtype=fp8_w8a8,block_shape=[128, 128].json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=128,N=768,device_name=NVIDIA_H200.json → triton_3_2_0/E=128,N=768,device_name=NVIDIA_H200.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=128,N=96,device_name=NVIDIA_H20.json → triton_3_2_0/E=128,N=96,device_name=NVIDIA_H20.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=264,N=128,device_name=NVIDIA_A800-SXM4-80GB,dtype=int8_w8a8.json → triton_3_2_0/E=264,N=128,device_name=NVIDIA_A800-SXM4-80GB,dtype=int8_w8a8.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=264,N=256,device_name=NVIDIA_B200,dtype=fp8_w8a8,block_shape=[128, 128].json → triton_3_2_0/E=264,N=256,device_name=NVIDIA_B200,dtype=fp8_w8a8,block_shape=[128, 128].json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=264,N=256,device_name=NVIDIA_H20,dtype=fp8_w8a8,block_shape=[128, 128].json → triton_3_2_0/E=264,N=256,device_name=NVIDIA_H20,dtype=fp8_w8a8,block_shape=[128, 128].json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=264,N=256,device_name=NVIDIA_H200,dtype=fp8_w8a8,block_shape=[128, 128].json → triton_3_2_0/E=264,N=256,device_name=NVIDIA_H200,dtype=fp8_w8a8,block_shape=[128, 128].json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=272,N=128,device_name=NVIDIA_A800-SXM4-80GB,dtype=int8_w8a8.json → triton_3_2_0/E=272,N=128,device_name=NVIDIA_A800-SXM4-80GB,dtype=int8_w8a8.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=272,N=128,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8,block_shape=[128, 128].json → triton_3_2_0/E=272,N=128,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8,block_shape=[128, 128].json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=272,N=128,device_name=NVIDIA_H20,dtype=fp8_w8a8,block_shape=[128, 128].json → triton_3_2_0/E=272,N=128,device_name=NVIDIA_H20,dtype=fp8_w8a8,block_shape=[128, 128].json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=272,N=64,device_name=NVIDIA_A800-SXM4-80GB.json → triton_3_2_0/E=272,N=64,device_name=NVIDIA_A800-SXM4-80GB.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=288,N=64,device_name=NVIDIA_A800-SXM4-80GB.json → triton_3_2_0/E=288,N=64,device_name=NVIDIA_A800-SXM4-80GB.json} +0 -0
- {sglang-0.4.6.post5.dist-info → sglang-0.4.7.post1.dist-info}/licenses/LICENSE +0 -0
- {sglang-0.4.6.post5.dist-info → sglang-0.4.7.post1.dist-info}/top_level.txt +0 -0
sglang/srt/disaggregation/mooncake/conn.py

```diff
@@ -9,6 +9,8 @@ import queue
 import socket
 import struct
 import threading
+import time
+from collections import defaultdict
 from functools import cache
 from typing import Dict, List, Optional, Tuple, Union
 
@@ -26,36 +28,38 @@ from sglang.srt.disaggregation.base.conn import (
     KVArgs,
     KVPoll,
 )
+from sglang.srt.disaggregation.common.utils import (
+    FastQueue,
+    group_concurrent_contiguous,
+)
 from sglang.srt.disaggregation.mooncake.transfer_engine import MooncakeTransferEngine
 from sglang.srt.disaggregation.utils import DisaggregationMode
 from sglang.srt.server_args import ServerArgs
-from sglang.srt.utils import
+from sglang.srt.utils import (
+    get_free_port,
+    get_int_env_var,
+    get_ip,
+    get_local_ip_by_remote,
+)
 
 logger = logging.getLogger(__name__)
 
 
-
-
-)
-
-
-        return [], []
-
-    brk = np.where((np.diff(src_indices) != 1) | (np.diff(dst_indices) != 1))[0] + 1
-    src_groups = np.split(src_indices, brk)
-    dst_groups = np.split(dst_indices, brk)
-
-    src_groups = [g.tolist() for g in src_groups]
-    dst_groups = [g.tolist() for g in dst_groups]
+class KVTransferError(Exception):
+    def __init__(self, bootstrap_room: int, failure_reason: str):
+        super().__init__(failure_reason)
+        self.bootstrap_room = bootstrap_room
+        self.failure_reason = failure_reason
 
-
+    def __str__(self):
+        return f"KVTransferError(bootstrap_room={self.bootstrap_room}): {self.failure_reason}"
 
 
 # prefill
 @dataclasses.dataclass
 class TransferKVChunk:
     room: int
-    prefill_kv_indices: npt.NDArray[np.
+    prefill_kv_indices: npt.NDArray[np.int32]
     index_slice: slice
     is_last: bool
     prefill_aux_index: Optional[int]
```
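The new `KVTransferError` carries the `bootstrap_room` alongside the failure reason, so a failed transfer can be traced back to the request it belonged to. A minimal usage sketch (hypothetical call site; the values are illustrative, not from the diff):

```python
# Hypothetical catch site for the KVTransferError class added above.
try:
    raise KVTransferError(bootstrap_room=42, failure_reason="session not alive")
except KVTransferError as e:
    print(e.bootstrap_room)  # 42
    print(e)  # KVTransferError(bootstrap_room=42): session not alive
```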
```diff
@@ -68,7 +72,7 @@ class TransferInfo:
     endpoint: str
     dst_port: int
     mooncake_session_id: str
-    dst_kv_indices: npt.NDArray[np.
+    dst_kv_indices: npt.NDArray[np.int32]
     dst_aux_index: int
     required_dst_info_num: int
     is_dummy: bool
@@ -77,10 +81,10 @@ class TransferInfo:
     def from_zmq(cls, msg: List[bytes]):
         if msg[4] == b"" and msg[5] == b"":
             is_dummy = True
-            dst_kv_indices = np.array([], dtype=np.
+            dst_kv_indices = np.array([], dtype=np.int32)
             dst_aux_index = None
         else:
-            dst_kv_indices = np.frombuffer(msg[4], dtype=np.
+            dst_kv_indices = np.frombuffer(msg[4], dtype=np.int32)
             dst_aux_index = int(msg[5].decode("ascii"))
             is_dummy = False
         return cls(
```
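`TransferInfo.from_zmq` now parses `dst_kv_indices` as `int32`. A minimal sketch of the wire format this implies, with the surrounding ZMQ frames (`msg[0..3]`) omitted and all values illustrative:

```python
import numpy as np

# Sender side (decode instance): raw int32 buffer plus an ASCII aux index.
dst_kv_indices = np.array([7, 8, 9, 42], dtype=np.int32)
msg4 = dst_kv_indices.tobytes()
msg5 = str(3).encode("ascii")

# Receiver side, mirroring from_zmq above.
recovered = np.frombuffer(msg4, dtype=np.int32)
aux_index = int(msg5.decode("ascii"))
assert recovered.tolist() == [7, 8, 9, 42] and aux_index == 3
```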
```diff
@@ -148,18 +152,55 @@ class MooncakeKVManager(BaseKVManager):
         self.server_socket = zmq.Context().socket(zmq.PULL)
         self.register_buffer_to_engine()
         if self.disaggregation_mode == DisaggregationMode.PREFILL:
-            self.transfer_queue = queue.Queue()
             self.transfer_infos: Dict[int, Dict[str, TransferInfo]] = {}
             self.decode_kv_args_table: Dict[str, KVArgsRegisterInfo] = {}
             self.start_prefill_thread()
             self._register_to_bootstrap()
-
+            self.session_failures = defaultdict(int)
+            self.failed_sessions = set()
+            self.session_lock = threading.Lock()
             # Determine the number of threads to use for kv sender
             cpu_count = os.cpu_count()
-
-
+            transfer_thread_pool_size = get_int_env_var(
+                "SGLANG_DISAGGREGATION_THREAD_POOL_SIZE",
+                min(max(4, int(0.75 * cpu_count) // 8), 12),
+            )
+            transfer_queue_size = get_int_env_var("SGLANG_DISAGGREGATION_QUEUE_SIZE", 4)
+            self.transfer_queues: List[FastQueue] = [
+                FastQueue() for _ in range(transfer_queue_size)
+            ]
+            assert transfer_thread_pool_size >= transfer_queue_size, (
+                f"The environment variable SGLANG_DISAGGREGATION_THREAD_POOL_SIZE={transfer_thread_pool_size} must be "
+                f"greater than or equal to SGLANG_DISAGGREGATION_QUEUE_SIZE={transfer_queue_size}."
+            )
+            self.executors = [
+                concurrent.futures.ThreadPoolExecutor(
+                    transfer_thread_pool_size // transfer_queue_size
+                )
+                for _ in range(transfer_queue_size)
+            ]
+            for queue, executor in zip(self.transfer_queues, self.executors):
+                threading.Thread(
+                    target=self.transfer_worker, args=(queue, executor), daemon=True
+                ).start()
+
+            self.bootstrap_time_out = get_int_env_var(
+                "SGLANG_DISAGGREGATION_BOOTSTRAP_TIMEOUT", 30
             )
         elif self.disaggregation_mode == DisaggregationMode.DECODE:
+            self.heartbeat_failures = {}
+            self.session_pool = defaultdict(requests.Session)
+            self.session_pool_lock = threading.Lock()
+            self.addr_to_rooms_tracker = defaultdict(set)
+            self.connection_lock = threading.Lock()
+            # Heartbeat interval should be at least 2 seconds
+            self.heartbeat_interval = max(
+                float(os.getenv("SGLANG_DISAGGREGATION_HEARTBEAT_INTERVAL", 5.0)), 2.0
+            )
+            # Heartbeat failure should be at least 1
+            self.max_failures = max(
+                get_int_env_var("SGLANG_DISAGGREGATION_HEARTBEAT_MAX_FAILURE", 2), 1
+            )
             self.start_decode_thread()
             self.connection_pool: Dict[str, Dict[str, Union[str, int]]] = {}
             self.prefill_tp_size_table: Dict[str, int] = {}
```
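The prefill constructor replaces the single `queue.Queue` with several `FastQueue` shards, each backed by its own executor. The default thread-pool size is `min(max(4, int(0.75 * cpu_count) // 8), 12)`; a standalone sketch of that arithmetic (the env-var names come from the hunk above, the helper itself is illustrative):

```python
def default_transfer_pool_size(cpu_count: int) -> int:
    # Same arithmetic as the SGLANG_DISAGGREGATION_THREAD_POOL_SIZE default:
    # 75% of the CPUs divided by 8, clamped to the range [4, 12].
    return min(max(4, int(0.75 * cpu_count) // 8), 12)

# 16 CPUs -> 4 threads, 64 -> 6, 128 -> 12 (clamped at the cap).
for cpus in (16, 64, 128):
    print(cpus, default_transfer_pool_size(cpus))

# With the default queue size of 4, each FastQueue's executor then gets
# pool_size // 4 threads (e.g. 12 // 4 = 3 on a 128-CPU machine).
```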
```diff
@@ -169,6 +210,9 @@ class MooncakeKVManager(BaseKVManager):
                 f"Unsupported DisaggregationMode: {self.disaggregation_mode}"
             )
 
+        self.failure_records: Dict[int, str] = {}
+        self.failure_lock = threading.Lock()
+
     def register_buffer_to_engine(self):
         for kv_data_ptr, kv_data_len in zip(
             self.kv_args.kv_data_ptrs, self.kv_args.kv_data_lens
@@ -189,9 +233,10 @@ class MooncakeKVManager(BaseKVManager):
     def send_kvcache(
         self,
         mooncake_session_id: str,
-        prefill_kv_indices: npt.NDArray[np.
+        prefill_kv_indices: npt.NDArray[np.int32],
         dst_kv_ptrs: list[int],
-        dst_kv_indices: npt.NDArray[np.
+        dst_kv_indices: npt.NDArray[np.int32],
+        executor: concurrent.futures.ThreadPoolExecutor,
     ):
         # Group by indices
         prefill_kv_blocks, dst_kv_blocks = group_concurrent_contiguous(
@@ -223,7 +268,7 @@ class MooncakeKVManager(BaseKVManager):
             return 0
 
         futures = [
-
+            executor.submit(
                 process_layer,
                 src_ptr,
                 dst_ptr,
@@ -235,8 +280,6 @@ class MooncakeKVManager(BaseKVManager):
         for future in concurrent.futures.as_completed(futures):
             status = future.result()
             if status != 0:
-                # Immediate shutdown on first error (existing tasks will finish)
-                self.executor.shutdown(wait=False)
                 for f in futures:
                     f.cancel()
                 return status
@@ -255,23 +298,138 @@ class MooncakeKVManager(BaseKVManager):
             self.kv_args.aux_data_ptrs[0] + prefill_aux_index * aux_item_len
         )
         decode_aux_addr = dst_aux_ptrs[0] + dst_aux_index * aux_item_len
-        # TODO: mooncake transfer engine can do async transfer. Do async later
-        # Not sure about the amount of aux data, maybe transfer it by zmq is more effective
         status = self.engine.transfer_sync(
             mooncake_session_id, prefill_aux_addr, decode_aux_addr, aux_item_len
         )
         return status
 
-    def sync_status_to_decode_endpoint(
+    def sync_status_to_decode_endpoint(
+        self, remote: str, dst_port: int, room: int, status: int
+    ):
         if ":" in remote:
             remote = remote.split(":")[0]
         self._connect("tcp://" + remote + ":" + str(dst_port)).send_multipart(
             [
                 str(room).encode("ascii"),
-                str(
+                str(status).encode("ascii"),
             ]
         )
 
+    def transfer_worker(
+        self, queue: FastQueue, executor: concurrent.futures.ThreadPoolExecutor
+    ):
+        while True:
+            try:
+                kv_chunk: TransferKVChunk = queue.get()
+                reqs_to_be_processed = (
+                    self.transfer_infos[kv_chunk.room].values()
+                    if kv_chunk.room in self.transfer_infos
+                    else []
+                )
+                polls = []
+                dst_ranks_infos = []
+                for req in reqs_to_be_processed:
+                    if not req.is_dummy:
+                        # Early exit if the request has failed
+                        with self.session_lock:
+                            if req.mooncake_session_id in self.failed_sessions:
+                                self.record_failure(
+                                    kv_chunk.room,
+                                    f"Decode instance could be dead, remote mooncake session {req.mooncake_session_id} is not alive",
+                                )
+                                self.update_status(kv_chunk.room, KVPoll.Failed)
+                                self.sync_status_to_decode_endpoint(
+                                    req.endpoint,
+                                    req.dst_port,
+                                    req.room,
+                                    KVPoll.Failed,
+                                )
+                                break
+
+                        chunked_dst_kv_indice = req.dst_kv_indices[kv_chunk.index_slice]
+
+                        # NOTE: This is temporarily a workaround to deal with the case where the prefill_kv_indices
+                        # is mismatched with the dst_kv_indices when page size > 1, this should never happen.
+                        if len(chunked_dst_kv_indice) < len(
+                            kv_chunk.prefill_kv_indices
+                        ):
+                            kv_chunk.prefill_kv_indices = kv_chunk.prefill_kv_indices[
+                                : len(chunked_dst_kv_indice)
+                            ]
+                            logger.warning(
+                                f"len(chunked_dst_kv_indice) = {len(chunked_dst_kv_indice)}, len(kv_chunk.prefill_kv_indices) = {len(kv_chunk.prefill_kv_indices)}"
+                            )
+
+                        ret = self.send_kvcache(
+                            req.mooncake_session_id,
+                            kv_chunk.prefill_kv_indices,
+                            self.decode_kv_args_table[
+                                req.mooncake_session_id
+                            ].dst_kv_ptrs,
+                            chunked_dst_kv_indice,
+                            executor,
+                        )
+                        if ret != 0:
+                            with self.session_lock:
+                                self.session_failures[req.mooncake_session_id] += 1
+                                # Failures should never happen if the session is not dead, if the session fails once, mark it as failed
+                                if self.session_failures[req.mooncake_session_id] >= 1:
+                                    self.failed_sessions.add(req.mooncake_session_id)
+                                    logger.error(
+                                        f"Session {req.mooncake_session_id} failed."
+                                    )
+                            self.record_failure(
+                                kv_chunk.room,
+                                f"Failed to send kv chunk of {kv_chunk.room} to {req.endpoint}:{req.dst_port}",
+                            )
+                            self.update_status(kv_chunk.room, KVPoll.Failed)
+                            self.sync_status_to_decode_endpoint(
+                                req.endpoint, req.dst_port, req.room, KVPoll.Failed
+                            )
+                            break
+
+                        if kv_chunk.is_last:
+                            # Only the last chunk we need to send the aux data
+                            ret = self.send_aux(
+                                req.mooncake_session_id,
+                                kv_chunk.prefill_aux_index,
+                                self.decode_kv_args_table[
+                                    req.mooncake_session_id
+                                ].dst_aux_ptrs,
+                                req.dst_aux_index,
+                            )
+                            polls.append(True if ret == 0 else False)
+                            dst_ranks_infos.append(
+                                (req.endpoint, req.dst_port, req.room)
+                            )
+
+                            # Only sync status when all the dst ranks have received the kvcache
+                            if len(polls) == req.required_dst_info_num:
+                                status = KVPoll.Success if all(polls) else KVPoll.Failed
+                                self.update_status(req.room, status)
+                                for endpoint, dst_port, room in dst_ranks_infos:
+                                    self.sync_status_to_decode_endpoint(
+                                        endpoint, dst_port, room, status
+                                    )
+                    else:
+                        # Dummy request means the decode instance is not used, so its status can be marked as success directly
+                        # Dummy request does not need to sync status to decode endpoint
+                        if kv_chunk.is_last and req.room in self.request_status:
+                            self.update_status(req.room, KVPoll.Success)
+
+                if (
+                    kv_chunk.room not in self.request_status
+                    or self.check_status(kv_chunk.room) == KVPoll.Success
+                ):
+                    if kv_chunk.room in self.transfer_infos:
+                        self.transfer_infos.pop(kv_chunk.room)
+
+            except Exception as e:
+                # NOTE(shangming): Remove this when we make sure the transfer thread is bug-free
+                raise RuntimeError(
+                    f"Transfer thread failed because of {e}. Prefill instance with bootstrap_port={self.bootstrap_port} is dead."
+                )
+
     def start_prefill_thread(self):
         self.rank_port = get_free_port()
         self.server_socket.bind(f"tcp://{get_local_ip_by_remote()}:{self.rank_port}")
```
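`transfer_worker` is the dispatcher loop for one queue shard; the constructor change above starts one per `FastQueue`, each with a private `ThreadPoolExecutor`. A condensed, self-contained sketch of that sharded-queue pattern (using `queue.Queue` in place of sglang's `FastQueue`; the names and job shape here are illustrative):

```python
import concurrent.futures
import queue
import threading

NUM_SHARDS = 4         # cf. SGLANG_DISAGGREGATION_QUEUE_SIZE
THREADS_PER_SHARD = 3  # transfer_thread_pool_size // transfer_queue_size

def worker(q: queue.Queue, ex: concurrent.futures.ThreadPoolExecutor) -> None:
    while True:
        job = q.get()  # blocking pop, like FastQueue.get()
        # Fan the per-layer copies out to this shard's private executor,
        # so a slow transfer only stalls its own shard.
        futures = [ex.submit(fn) for fn in job]
        for f in concurrent.futures.as_completed(futures):
            f.result()

queues = [queue.Queue() for _ in range(NUM_SHARDS)]
executors = [
    concurrent.futures.ThreadPoolExecutor(THREADS_PER_SHARD)
    for _ in range(NUM_SHARDS)
]
for q, ex in zip(queues, executors):
    threading.Thread(target=worker, args=(q, ex), daemon=True).start()

# A job is a list of callables; enqueue it on some shard:
queues[0].put([lambda: None])
```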
```diff
@@ -287,6 +445,11 @@ class MooncakeKVManager(BaseKVManager):
                     self.decode_kv_args_table[mooncake_session_id] = (
                         KVArgsRegisterInfo.from_zmq(waiting_req_bytes)
                     )
+                    with self.session_lock:
+                        if mooncake_session_id in self.failed_sessions:
+                            self.failed_sessions.remove(mooncake_session_id)
+                        if mooncake_session_id in self.session_failures:
+                            del self.session_failures[mooncake_session_id]
                     logger.debug(
                         f"Register KVArgs from {mooncake_session_id} successfully"
                     )
@@ -304,77 +467,7 @@ class MooncakeKVManager(BaseKVManager):
                 if len(self.transfer_infos[room]) == required_dst_info_num:
                     self.update_status(room, KVPoll.WaitingForInput)

-        def transfer_thread():
-            # TODO: Shall we use KVPoll.Transferring state?
-            while True:
-                try:
-                    kv_chunk: TransferKVChunk = self.transfer_queue.get(timeout=0.01)
-                    reqs_to_be_processed = self.transfer_infos[kv_chunk.room].values()
-                    polls = []
-                    dst_ranks_infos = []
-                    for req in reqs_to_be_processed:
-                        if not req.is_dummy:
-                            chunked_dst_kv_indice = req.dst_kv_indices[
-                                kv_chunk.index_slice
-                            ]
-                            assert len(chunked_dst_kv_indice) == len(
-                                kv_chunk.prefill_kv_indices
-                            ), f"len(chunked_dst_kv_indice) = {len(chunked_dst_kv_indice)}, len(kv_chunk.prefill_kv_indices) = {len(kv_chunk.prefill_kv_indices)}"
-
-                            ret = self.send_kvcache(
-                                req.mooncake_session_id,
-                                kv_chunk.prefill_kv_indices,
-                                self.decode_kv_args_table[
-                                    req.mooncake_session_id
-                                ].dst_kv_ptrs,
-                                chunked_dst_kv_indice,
-                            )
-                            if ret != 0:
-                                self.update_status(kv_chunk.room, KVPoll.Failed)
-                                self.sync_status_to_decode_endpoint(
-                                    req.endpoint, req.dst_port, req.room
-                                )
-                                continue
-
-                            if kv_chunk.is_last:
-                                # Only the last chunk we need to send the aux data
-                                ret = self.send_aux(
-                                    req.mooncake_session_id,
-                                    kv_chunk.prefill_aux_index,
-                                    self.decode_kv_args_table[
-                                        req.mooncake_session_id
-                                    ].dst_aux_ptrs,
-                                    req.dst_aux_index,
-                                )
-                                polls.append(True if ret == 0 else False)
-                                dst_ranks_infos.append(
-                                    (req.endpoint, req.dst_port, req.room)
-                                )
-
-                                # Only sync status when all the dst ranks have received the kvcache
-                                if len(polls) == req.required_dst_info_num:
-                                    self.update_status(
-                                        req.room,
-                                        KVPoll.Success if all(polls) else KVPoll.Failed,
-                                    )
-                                    for endpoint, dst_port, room in dst_ranks_infos:
-                                        self.sync_status_to_decode_endpoint(
-                                            endpoint, dst_port, room
-                                        )
-                        else:
-                            # Dummy request means the decode instance is not used, so its status can be marked as success directly
-                            # Dummy request does not need to sync status to decode endpoint
-                            if kv_chunk.is_last:
-                                self.update_status(req.room, KVPoll.Success)
-
-                    if self.check_status(kv_chunk.room) == KVPoll.Success:
-                        self.transfer_infos.pop(kv_chunk.room)
-
-                except queue.Empty:
-                    continue
-
         threading.Thread(target=bootstrap_thread).start()
-        threading.Thread(target=transfer_thread).start()

     def start_decode_thread(self):
         self.rank_port = get_free_port()
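With this removal, the prefill side no longer runs a single transfer_thread draining one transfer_queue. As the add_transfer_request hunk further down shows, chunks are now pushed onto one of several sharded self.transfer_queues, presumably drained by per-shard worker threads added elsewhere in this file.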
@@ -385,14 +478,74 @@ class MooncakeKVManager(BaseKVManager):
                 (bootstrap_room, status) = self.server_socket.recv_multipart()
                 status = int(status.decode("ascii"))
                 bootstrap_room = int(bootstrap_room.decode("ascii"))
+                if status == KVPoll.Failed:
+                    self.record_failure(
+                        bootstrap_room,
+                        f"Failed to get kvcache from prefill instance, it might be dead",
+                    )
                 self.update_status(bootstrap_room, status)

+        def heartbeat_checker():
+            while True:
+                time.sleep(self.heartbeat_interval)
+                with self.connection_lock:
+                    addresses = list(self.prefill_dp_size_table.keys())
+
+                for bootstrap_addr in addresses:
+                    session = None
+                    try:
+                        with self.session_pool_lock:
+                            session = self.session_pool[bootstrap_addr]
+                        response = session.get(
+                            f"http://{bootstrap_addr}/health",
+                            timeout=(2, 3),
+                            headers={"Connection": "keep-alive"},
+                        )
+                        if response.status_code == 200:
+                            self.heartbeat_failures[bootstrap_addr] = 0
+
+                            current_rooms = self.addr_to_rooms_tracker[
+                                bootstrap_addr
+                            ].copy()
+
+                            for bootstrap_room in current_rooms:
+                                # Remove KVPoll.Success requests from the tracker
+                                if bootstrap_room not in self.request_status:
+                                    self.addr_to_rooms_tracker[bootstrap_addr].discard(
+                                        bootstrap_room
+                                    )
+                        else:
+                            logger.info(
+                                f"Attempting to reconnect to {bootstrap_addr}..."
+                            )
+                            self.heartbeat_failures[bootstrap_addr] = (
+                                self.heartbeat_failures.get(bootstrap_addr, 0) + 1
+                            )
+                            with self.session_pool_lock:
+                                if bootstrap_addr in self.session_pool:
+                                    del self.session_pool[bootstrap_addr]
+                    except Exception:
+                        logger.info(f"Attempting to reconnect to {bootstrap_addr}...")
+                        self.heartbeat_failures[bootstrap_addr] = (
+                            self.heartbeat_failures.get(bootstrap_addr, 0) + 1
+                        )
+
+                    if (
+                        self.heartbeat_failures.get(bootstrap_addr, 0)
+                        >= self.max_failures
+                    ):
+                        self._handle_node_failure(bootstrap_addr)
+                        with self.session_pool_lock:
+                            if bootstrap_addr in self.session_pool:
+                                del self.session_pool[bootstrap_addr]
+
         threading.Thread(target=decode_thread).start()
+        threading.Thread(target=heartbeat_checker).start()

     def add_transfer_request(
         self,
         bootstrap_room: int,
-        kv_indices: npt.NDArray[np.int64],
+        kv_indices: npt.NDArray[np.int32],
         index_slice: slice,
         is_last: bool,
         aux_index: Optional[int] = None,
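The heartbeat_checker added above probes each known prefill bootstrap address over HTTP and calls _handle_node_failure once max_failures consecutive probes miss. A standalone sketch of the same loop, assuming the peer exposes the /health route added later in this diff (addresses, interval, and the threshold are illustrative):

    import time
    import requests

    def heartbeat_loop(addresses, interval=5.0, max_failures=2):
        failures = {}
        while True:
            time.sleep(interval)
            for addr in addresses:
                try:
                    # (connect, read) timeouts, matching the tuple form in the diff
                    resp = requests.get(f"http://{addr}/health", timeout=(2, 3))
                    ok = resp.status_code == 200
                except requests.RequestException:
                    ok = False
                failures[addr] = 0 if ok else failures.get(addr, 0) + 1
                if failures[addr] >= max_failures:
                    print(f"{addr} presumed dead after {max_failures} misses")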
@@ -400,7 +553,29 @@ class MooncakeKVManager(BaseKVManager):
         assert self.disaggregation_mode == DisaggregationMode.PREFILL
         assert not is_last or (is_last and aux_index is not None)

-        self.transfer_queue.put(
+        if (
+            bootstrap_room not in self.request_status
+            or self.check_status(bootstrap_room) == KVPoll.Failed
+        ):
+            logger.debug(
+                "Request with bootstrap_room=%s already failed", bootstrap_room
+            )
+            return
+
+        if bootstrap_room not in self.transfer_infos:
+            # This means that the current rank is a dummy rank for this request,
+            # and it has already been marked as success, so there is no need to
+            # add further chunks into the transfer queue.
+            return
+
+        # NOTE(shangming): sharding according to the dst_infos to make sure
+        # requests with the same dst_sessions will be added into the same
+        # queue, which enables early abort with failed sessions.
+        dst_infos = self.transfer_infos[bootstrap_room].keys()
+        session_port_sum = sum(int(session.split(":")[1]) for session in dst_infos)
+        shard_idx = session_port_sum % len(self.transfer_queues)
+
+        self.transfer_queues[shard_idx].put(
             TransferKVChunk(
                 room=bootstrap_room,
                 prefill_kv_indices=kv_indices,
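The shard index is the sum of destination session ports modulo the queue count, so every chunk of a given request lands on the same queue (its destination set is fixed per bootstrap_room), and a failed session can be aborted early on exactly one queue. A sketch with hypothetical "ip:port" session IDs, which is the shape the split(":")[1] implies:

    import queue

    NUM_SHARDS = 4
    transfer_queues = [queue.Queue() for _ in range(NUM_SHARDS)]

    def shard_for(dst_sessions):
        # Same destination set -> same shard index, deterministically.
        port_sum = sum(int(s.split(":")[1]) for s in dst_sessions)
        return port_sum % NUM_SHARDS

    idx = shard_for(["10.0.0.1:7001", "10.0.0.2:7002"])
    transfer_queues[idx].put(("room-42", "chunk-payload"))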
@@ -409,7 +584,6 @@ class MooncakeKVManager(BaseKVManager):
                 prefill_aux_index=aux_index,
             )
         )
-        self.update_status(bootstrap_room, KVPoll.WaitingForInput)

     def check_status(self, bootstrap_room: int):
         return self.request_status[bootstrap_room]
@@ -418,10 +592,17 @@ class MooncakeKVManager(BaseKVManager):
         if bootstrap_room not in self.request_status:
             self.request_status[bootstrap_room] = status
         else:
-            # NOTE:
-            self.request_status[bootstrap_room] = max(
-                self.request_status[bootstrap_room], status
-            )
+            # NOTE: status is only allowed to be incremented unless it is KVPoll.Failed
+            if status == KVPoll.Failed:
+                self.request_status[bootstrap_room] = KVPoll.Failed
+            else:
+                self.request_status[bootstrap_room] = max(
+                    self.request_status[bootstrap_room], status
+                )
+
+    def record_failure(self, bootstrap_room: int, failure_reason: str):
+        with self.failure_lock:
+            self.failure_records[bootstrap_room] = failure_reason

     def get_session_id(self):
         return self.engine.get_session_id()
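update_status is now monotonic with a sticky failure: KVPoll.Failed always wins, and anything else can only move the stored status forward via max(). The explicit Failed branch matters because, judging by this logic, Failed does not compare highest in KVPoll's ordering. A sketch with hypothetical enum values chosen to make that point:

    from enum import IntEnum

    # Hypothetical numbering: if Failed is the lowest value, max() alone
    # could never record a failure, hence the explicit branch.
    class KVPoll(IntEnum):
        Failed = 0
        Bootstrapping = 1
        WaitingForInput = 2
        Transferring = 3
        Success = 4

    def update_status(table, room, status):
        if room not in table:
            table[room] = status
        elif status == KVPoll.Failed:
            table[room] = KVPoll.Failed  # sticky: failure is terminal
        else:
            table[room] = max(table[room], status)  # otherwise only move forward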
@@ -445,38 +626,82 @@ class MooncakeKVManager(BaseKVManager):
         }

         try:
-            response = requests.put(url, json=payload)
+            response = requests.put(url, json=payload, timeout=5)
             if response.status_code == 200:
                 logger.debug("Prefill successfully registered to bootstrap server.")
             else:
                 logger.error(
-                    f"Prefill
+                    f"Prefill instance failed to connect to bootstrap server: {response.status_code}, {response.text}"
                 )
         except Exception as e:
-            logger.error(
+            logger.error(
+                f"Prefill instance failed to register to bootstrap server: {e}"
+            )
+
+    def _handle_node_failure(self, failed_bootstrap_addr):
+        with self.connection_lock:
+            keys_to_remove = [
+                k for k in self.connection_pool if k.startswith(failed_bootstrap_addr)
+            ]
+            for k in keys_to_remove:
+                del self.connection_pool[k]
+            if failed_bootstrap_addr in self.prefill_tp_size_table:
+                del self.prefill_tp_size_table[failed_bootstrap_addr]
+            if failed_bootstrap_addr in self.prefill_dp_size_table:
+                del self.prefill_dp_size_table[failed_bootstrap_addr]
+
+            possible_affected_rooms = self.addr_to_rooms_tracker.get(
+                failed_bootstrap_addr, []
+            )
+            if failed_bootstrap_addr in self.addr_to_rooms_tracker:
+                del self.addr_to_rooms_tracker[failed_bootstrap_addr]
+
+        # Report the requests associated with the failed bootstrap addr and mark their status as KVPoll.Failed
+        affected_rooms = []
+        for room in possible_affected_rooms:
+            if (
+                room in self.request_status
+                and self.check_status(room) != KVPoll.Success
+            ):
+                self.record_failure(
+                    room,
+                    f"Losing connection with prefill instance (bootstrap_addr: {failed_bootstrap_addr})",
+                )
+                self.update_status(room, KVPoll.Failed)
+                affected_rooms.append(room)
+        logger.error(
+            f"Losing connection with prefill instance (bootstrap_addr: {failed_bootstrap_addr}), affected {len(affected_rooms)} requests"
+        )


 class MooncakeKVSender(BaseKVSender):

     def __init__(
-        self, mgr: MooncakeKVManager, bootstrap_addr: str, bootstrap_room: int
+        self,
+        mgr: MooncakeKVManager,
+        bootstrap_addr: str,
+        bootstrap_room: int,
+        dest_tp_ranks: List[int],
+        pp_rank: int,
     ):
         self.kv_mgr = mgr
         self.bootstrap_room = bootstrap_room
         self.kv_mgr.update_status(bootstrap_room, KVPoll.Bootstrapping)
         self.aux_index = None
         self.bootstrap_server_url = bootstrap_addr
-        self.
+        self.conclude_state = None
+        self.init_time = None
         # inner state
         self.curr_idx = 0

     def init(self, num_kv_indices: int, aux_index: Optional[int] = None):
         self.num_kv_indices = num_kv_indices
         self.aux_index = aux_index
+        self.init_time = time.time()

     def send(
         self,
-        kv_indices: npt.NDArray[np.int64],
+        kv_indices: npt.NDArray[np.int32],
     ):
         index_slice = slice(self.curr_idx, self.curr_idx + len(kv_indices))
         self.curr_idx += len(kv_indices)
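_handle_node_failure sweeps every table keyed by the dead address; connection-pool entries are matched by string prefix, which relies on bootstrap keys starting with the bootstrap address (the key construction itself is truncated in this diff). A reduced sketch of the sweep:

    def purge_failed_addr(connection_pool, size_tables, failed_addr):
        # Assumes pool keys begin with the bootstrap address.
        stale = [k for k in connection_pool if k.startswith(failed_addr)]
        for k in stale:
            del connection_pool[k]
        for table in size_tables:  # e.g. the tp-size and dp-size tables
            table.pop(failed_addr, None)
        return stale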
@@ -496,11 +721,42 @@ class MooncakeKVSender(BaseKVSender):
         )

     def poll(self) -> KVPoll:
-        return self.kv_mgr.check_status(self.bootstrap_room)
+        if self.conclude_state is None:
+            status = self.kv_mgr.check_status(self.bootstrap_room)
+            if status in (KVPoll.Success, KVPoll.Failed):
+                self.conclude_state = status
+            elif status == KVPoll.Bootstrapping:
+                if self.init_time is not None:
+                    now = time.time()
+                    elapsed = now - self.init_time
+                    if elapsed >= self.kv_mgr.bootstrap_time_out:
+                        self.kv_mgr.record_failure(
+                            self.bootstrap_room,
+                            f"Request {self.bootstrap_room} timed out after {elapsed:.1f}s in KVPoll.Bootstrapping",
+                        )
+                        self.conclude_state = KVPoll.Failed
+                        return KVPoll.Failed
+
+            return status
+        else:
+            return self.conclude_state
+
+    def clear(self) -> None:
+        if self.bootstrap_room in self.kv_mgr.request_status:
+            self.kv_mgr.request_status.pop(self.bootstrap_room)

     def failure_exception(self):
-        raise Exception("Fake KVSender Exception")
-
+        self.clear()
+
+        # Explicitly set the status to failure since this request has failed in another rank
+        if self.conclude_state is None:
+            self.conclude_state = KVPoll.Failed
+
+        with self.kv_mgr.failure_lock:
+            failure_reason = self.kv_mgr.failure_records.pop(
+                self.bootstrap_room, "Failed due to an unknown reason from another rank"
+            )
+        raise KVTransferError(self.bootstrap_room, failure_reason)


 class MooncakeKVReceiver(BaseKVReceiver):
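poll() now latches terminal outcomes in conclude_state and converts a bootstrap that has stalled past bootstrap_time_out into an explicit failure instead of spinning forever. The same latch-plus-deadline shape in isolation (the class and state strings are stand-ins for the diff's KVPoll values):

    import time

    class TimeoutLatch:
        def __init__(self, timeout_s: float):
            self.timeout_s = timeout_s
            self.start = time.time()
            self.concluded = None  # latched terminal state, if any

        def poll(self, current: str) -> str:
            if self.concluded is not None:
                return self.concluded  # terminal states never regress
            if current in ("Success", "Failed"):
                self.concluded = current
            elif current == "Bootstrapping":
                if time.time() - self.start >= self.timeout_s:
                    self.concluded = "Failed"  # stalled bootstrap becomes a failure
                    return "Failed"
            return current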
@@ -514,22 +770,31 @@ class MooncakeKVReceiver(BaseKVReceiver):
         mgr: MooncakeKVManager,
         bootstrap_addr: str,
         bootstrap_room: Optional[int] = None,
+        data_parallel_rank: Optional[int] = None,
     ):
         self.bootstrap_room = bootstrap_room
         self.bootstrap_addr = bootstrap_addr
         self.kv_mgr = mgr
         self.session_id = self.kv_mgr.get_session_id()
-        self.kv_mgr.update_status(bootstrap_room, KVPoll.Bootstrapping)
+        self.kv_mgr.update_status(self.bootstrap_room, KVPoll.Bootstrapping)
+        self.conclude_state = None
+        self.data_parallel_rank = data_parallel_rank

         if self.bootstrap_addr not in self.kv_mgr.prefill_dp_size_table:
             self.prefill_tp_size, self.prefill_dp_size = (
-                self.
+                self._get_prefill_parallel_info_from_server()
             )
             if self.prefill_tp_size is None or self.prefill_dp_size is None:
-                logger.error(
-                    f"Could not fetch prefill parallel info from bootstrap_addr: {self.bootstrap_addr}"
+                self.kv_mgr.record_failure(
+                    self.bootstrap_room,
+                    f"Could not fetch prefill parallel info from bootstrap_addr: {self.bootstrap_addr}",
                 )
+                self.kv_mgr.update_status(self.bootstrap_room, KVPoll.Failed)
+                return
             else:
+                logger.debug(
+                    f"Fetch prefill parallel info from [{self.bootstrap_addr}]: DP size:{self.prefill_dp_size}, TP size:{self.prefill_tp_size}"
+                )
                 self.kv_mgr.prefill_tp_size_table[self.bootstrap_addr] = (
                     self.prefill_tp_size
                 )
@@ -587,7 +852,11 @@ class MooncakeKVReceiver(BaseKVReceiver):
             self.target_tp_rank = self.target_tp_ranks[0]
             self.required_dst_info_num = 1

-        self.target_dp_group = bootstrap_room % self.prefill_dp_size
+        if self.data_parallel_rank is not None:
+            logger.debug(f"Targeting DP rank: {self.data_parallel_rank}")
+            self.target_dp_group = self.data_parallel_rank
+        else:
+            self.target_dp_group = bootstrap_room % self.prefill_dp_size

         # NOTE: key distinguished by bootstrap_addr, target_dp_group, and target_tp_rank
         bootstrap_key = (
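A receiver can now pin an explicit prefill DP rank; otherwise the target group is still derived from the room ID, which spreads load evenly when room IDs are effectively random. In sketch form:

    from typing import Optional

    def pick_target_dp_group(room: int, prefill_dp_size: int,
                             dp_rank: Optional[int] = None) -> int:
        # An explicitly requested data-parallel rank wins; otherwise hash
        # the room id onto the prefill instance's DP groups.
        return dp_rank if dp_rank is not None else room % prefill_dp_size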
@@ -607,32 +876,35 @@ class MooncakeKVReceiver(BaseKVReceiver):
                             target_tp_rank == self.target_tp_rank
                             or self.target_tp_rank is None
                         )
+                    logger.debug(
+                        f"Fetched bootstrap info: {bootstrap_info} for DP {self.target_dp_group} TP {target_tp_rank}"
+                    )
                     bootstrap_infos.append(bootstrap_info)
                 else:
-                    logger.error(
-                        f"Could not fetch bootstrap info for engine rank: {self.kv_mgr.kv_args.engine_rank}"
+                    self.kv_mgr.record_failure(
+                        self.bootstrap_room,
+                        f"Could not fetch bootstrap info for engine rank: {self.kv_mgr.kv_args.engine_rank} and target_dp_group: {self.target_dp_group}",
                     )
+                    self.kv_mgr.update_status(self.bootstrap_room, KVPoll.Failed)
+                    return
+
             self.bootstrap_infos = bootstrap_infos
+            self.kv_mgr.connection_pool[bootstrap_key] = self.bootstrap_infos

-            if len(self.bootstrap_infos) == 0:
-                logger.error(
-                    f"Could not fetch bootstrap info for engine rank: {self.kv_mgr.kv_args.engine_rank}"
-                )
-            else:
-                self.kv_mgr.connection_pool[bootstrap_key] = self.bootstrap_infos
-                # Register kv_args only once to prefill KVManager according to the info fetched from the bootstrap server
-                self._register_kv_args()
+            # Register kv_args only once to prefill KVManager according to the info fetched from the bootstrap server
+            self._register_kv_args()
         else:
             self.bootstrap_infos = self.kv_mgr.connection_pool[bootstrap_key]

         assert len(self.bootstrap_infos) > 0
-        self.kv_mgr.update_status(self.bootstrap_room, KVPoll.WaitingForInput)
+        self.kv_mgr.addr_to_rooms_tracker[self.bootstrap_addr].add(self.bootstrap_room)
+        self.kv_mgr.update_status(self.bootstrap_room, KVPoll.WaitingForInput)

     def _get_bootstrap_info_from_server(self, engine_rank, target_dp_group):
         """Fetch the bootstrap info from the bootstrap server."""
         try:
             url = f"http://{self.bootstrap_addr}/route?engine_rank={engine_rank}&target_dp_group={target_dp_group}"
-            response = requests.get(url)
+            response = requests.get(url, timeout=5)
             if response.status_code == 200:
                 bootstrap_info = response.json()
                 return bootstrap_info
@@ -645,7 +917,7 @@ class MooncakeKVReceiver(BaseKVReceiver):
             logger.error(f"Error fetching prefill info from bootstrap: {e}")
             return None

-    def _get_prefill_parallel_info_from_server(self):
+    def _get_prefill_parallel_info_from_server(self) -> Tuple[int, int]:
         """Fetch the prefill parallel info from the bootstrap server."""
         try:
             url = f"http://{self.bootstrap_addr}/route?engine_rank={-1}&target_dp_group={-1}"
@@ -659,10 +931,10 @@ class MooncakeKVReceiver(BaseKVReceiver):
                 logger.error(
                     f"Failed to get prefill parallel info: {response.status_code}, {response.text}"
                 )
-                return None
+                return None, None
         except Exception as e:
             logger.error(f"Error fetching prefill parallel info from bootstrap: {e}")
-            return None
+            return None, None

     def _register_kv_args(self):
         for bootstrap_info in self.bootstrap_infos:
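Both error paths now return a (None, None) pair, so the tuple unpacking at the call site (self.prefill_tp_size, self.prefill_dp_size = ...) cannot raise on failure; a bare return None there would fail with "cannot unpack non-iterable NoneType". For instance:

    from typing import Optional, Tuple

    def fetch_parallel_info(ok: bool) -> Tuple[Optional[int], Optional[int]]:
        if ok:
            return 8, 2  # hypothetical (tp_size, dp_size)
        return None, None  # still unpackable on failure

    tp, dp = fetch_parallel_info(ok=False)
    if tp is None or dp is None:
        print("could not fetch prefill parallel info")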
@@ -699,14 +971,11 @@ class MooncakeKVReceiver(BaseKVReceiver):
             cls._socket_locks[endpoint] = threading.Lock()
         return cls._socket_cache[endpoint], cls._socket_locks[endpoint]

-    def init(self, kv_indices: npt.NDArray[np.int64], aux_index: Optional[int] = None):
+    def init(self, kv_indices: npt.NDArray[np.int32], aux_index: Optional[int] = None):
         for bootstrap_info in self.bootstrap_infos:
             self.prefill_server_url = (
                 f"{bootstrap_info['rank_ip']}:{bootstrap_info['rank_port']}"
             )
-            logger.debug(
-                f"Fetched bootstrap info: {bootstrap_info} for engine rank: {self.kv_mgr.kv_args.engine_rank}"
-            )
             is_dummy = bootstrap_info["is_dummy"]

             sock, lock = self._connect("tcp://" + self.prefill_server_url)
@@ -724,11 +993,31 @@ class MooncakeKVReceiver(BaseKVReceiver):
         )

     def poll(self) -> KVPoll:
-        return self.kv_mgr.check_status(self.bootstrap_room)
+        if self.conclude_state is None:
+            status = self.kv_mgr.check_status(self.bootstrap_room)
+            if status in (KVPoll.Success, KVPoll.Failed):
+                self.conclude_state = status
+
+            return status
+        else:
+            return self.conclude_state
+
+    def clear(self) -> None:
+        if self.bootstrap_room in self.kv_mgr.request_status:
+            self.kv_mgr.request_status.pop(self.bootstrap_room)

     def failure_exception(self):
-        raise Exception("Fake KVReceiver Exception")
-
+        self.clear()
+
+        # Explicitly set the status to failure since this request has failed in another rank
+        if self.conclude_state is None:
+            self.conclude_state = KVPoll.Failed
+
+        with self.kv_mgr.failure_lock:
+            failure_reason = self.kv_mgr.failure_records.pop(
+                self.bootstrap_room, "Failed due to an unknown reason from another rank"
+            )
+        raise KVTransferError(self.bootstrap_room, failure_reason)


 class MooncakeKVBootstrapServer(BaseKVBootstrapServer):
@@ -752,6 +1041,10 @@ class MooncakeKVBootstrapServer(BaseKVBootstrapServer):

     def _setup_routes(self):
         self.app.router.add_route("*", "/route", self._handle_route)
+        self.app.router.add_get("/health", self._handle_health_check)
+
+    async def _handle_health_check(self, request):
+        return web.Response(text="OK", status=200)

     async def _handle_route(self, request: web.Request):
         method = request.method
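The trivial /health route added here is what the heartbeat_checker above probes. A self-contained aiohttp server exposing an equivalent route (the port is illustrative):

    from aiohttp import web

    async def handle_health_check(request: web.Request) -> web.Response:
        return web.Response(text="OK", status=200)

    app = web.Application()
    app.router.add_get("/health", handle_health_check)

    if __name__ == "__main__":
        web.run_app(app, port=8998)  # curl http://localhost:8998/health -> OK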
@@ -780,14 +1073,14 @@ class MooncakeKVBootstrapServer(BaseKVBootstrapServer):
         self.dp_size = dp_size

         tp_size_per_dp_rank = tp_size // dp_size
-        if self.tp_size_per_dp_rank
+        if self.tp_size_per_dp_rank is None:
             self.tp_size_per_dp_rank = tp_size_per_dp_rank

-        # Add lock to make sure thread-safe
         if role == "Prefill":
             dp_group = engine_rank // tp_size_per_dp_rank
             tp_rank_in_dp_group = engine_rank % tp_size_per_dp_rank

+            # Add lock to make sure thread-safe
             async with self.lock:
                 if dp_group not in self.prefill_port_table:
                     self.prefill_port_table[dp_group] = {}
@@ -797,7 +1090,7 @@ class MooncakeKVBootstrapServer(BaseKVBootstrapServer):
                     "rank_port": rank_port,
                 }
             logger.debug(
-                f"Register
+                f"Register prefill bootstrap: {engine_rank} with rank_ip: {rank_ip} and rank_port: {rank_port}"
             )

         return web.Response(text="OK", status=200)
@@ -833,7 +1126,11 @@ class MooncakeKVBootstrapServer(BaseKVBootstrapServer):
         self._loop = asyncio.new_event_loop()
         asyncio.set_event_loop(self._loop)

-        self._runner = web.AppRunner(self.app)
+        access_log = None
+        if logging.getLogger(__name__).getEffectiveLevel() <= logging.DEBUG:
+            access_log = self.app.logger
+
+        self._runner = web.AppRunner(self.app, access_log=access_log)
         self._loop.run_until_complete(self._runner.setup())

         site = web.TCPSite(self._runner, port=self.port)
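Per-request access logging is now enabled only when this module's logger is at DEBUG level; passing access_log=None to AppRunner turns off aiohttp's access log entirely, which avoids per-request logging overhead on a busy bootstrap server. The pattern in isolation:

    import logging
    from aiohttp import web

    app = web.Application()

    # aiohttp writes an access-log line for every request unless access_log is None.
    access_log = None
    if logging.getLogger(__name__).getEffectiveLevel() <= logging.DEBUG:
        access_log = app.logger

    runner = web.AppRunner(app, access_log=access_log)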