sglang 0.5.2rc2__py3-none-any.whl → 0.5.3__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- sglang/bench_one_batch.py +7 -9
- sglang/bench_one_batch_server.py +330 -31
- sglang/bench_serving.py +267 -32
- sglang/global_config.py +2 -2
- sglang/lang/backend/runtime_endpoint.py +1 -1
- sglang/launch_server.py +14 -0
- sglang/profiler.py +2 -2
- sglang/srt/batch_invariant_ops/__init__.py +27 -0
- sglang/srt/batch_invariant_ops/batch_invariant_ops.py +549 -0
- sglang/srt/configs/__init__.py +8 -0
- sglang/srt/configs/device_config.py +3 -1
- sglang/srt/configs/dots_ocr.py +64 -0
- sglang/srt/configs/dots_vlm.py +139 -0
- sglang/srt/configs/falcon_h1.py +360 -0
- sglang/srt/configs/load_config.py +9 -0
- sglang/srt/configs/model_config.py +181 -82
- sglang/srt/configs/qwen3_next.py +326 -0
- sglang/srt/configs/qwen3_vl.py +586 -0
- sglang/srt/connector/__init__.py +8 -1
- sglang/srt/connector/remote_instance.py +82 -0
- sglang/srt/constrained/base_grammar_backend.py +49 -12
- sglang/srt/constrained/llguidance_backend.py +0 -1
- sglang/srt/constrained/outlines_backend.py +0 -1
- sglang/srt/constrained/outlines_jump_forward.py +1 -1
- sglang/srt/constrained/xgrammar_backend.py +30 -9
- sglang/srt/custom_op.py +11 -1
- sglang/srt/debug_utils/dump_comparator.py +81 -44
- sglang/srt/debug_utils/dump_loader.py +97 -0
- sglang/srt/debug_utils/dumper.py +21 -6
- sglang/srt/debug_utils/text_comparator.py +73 -11
- sglang/srt/disaggregation/ascend/conn.py +2 -2
- sglang/srt/disaggregation/ascend/transfer_engine.py +47 -9
- sglang/srt/disaggregation/base/conn.py +1 -1
- sglang/srt/disaggregation/common/conn.py +279 -108
- sglang/srt/disaggregation/decode.py +71 -19
- sglang/srt/disaggregation/decode_kvcache_offload_manager.py +185 -0
- sglang/srt/disaggregation/decode_schedule_batch_mixin.py +29 -17
- sglang/srt/disaggregation/fake/conn.py +1 -1
- sglang/srt/disaggregation/mini_lb.py +6 -445
- sglang/srt/disaggregation/mooncake/conn.py +55 -537
- sglang/srt/disaggregation/nixl/conn.py +326 -53
- sglang/srt/disaggregation/prefill.py +36 -17
- sglang/srt/disaggregation/utils.py +40 -54
- sglang/srt/distributed/device_communicators/all_reduce_utils.py +16 -0
- sglang/srt/distributed/device_communicators/shm_broadcast.py +4 -2
- sglang/srt/distributed/device_communicators/symm_mem.py +164 -0
- sglang/srt/distributed/parallel_state.py +156 -80
- sglang/srt/entrypoints/engine.py +59 -18
- sglang/srt/entrypoints/grpc_request_manager.py +855 -0
- sglang/srt/entrypoints/grpc_server.py +810 -0
- sglang/srt/entrypoints/http_server.py +130 -59
- sglang/srt/entrypoints/openai/protocol.py +112 -4
- sglang/srt/entrypoints/openai/serving_base.py +65 -3
- sglang/srt/entrypoints/openai/serving_chat.py +204 -55
- sglang/srt/entrypoints/openai/serving_completions.py +14 -3
- sglang/srt/entrypoints/openai/serving_embedding.py +9 -3
- sglang/srt/entrypoints/openai/serving_rerank.py +3 -1
- sglang/srt/entrypoints/openai/serving_responses.py +48 -3
- sglang/srt/entrypoints/openai/serving_score.py +1 -0
- sglang/srt/environ.py +285 -0
- sglang/srt/eplb/eplb_manager.py +2 -2
- sglang/srt/eplb/expert_distribution.py +26 -13
- sglang/srt/eplb/expert_location.py +38 -8
- sglang/srt/eplb/expert_location_updater.py +1 -1
- sglang/srt/function_call/base_format_detector.py +3 -6
- sglang/srt/function_call/ebnf_composer.py +11 -9
- sglang/srt/function_call/function_call_parser.py +9 -2
- sglang/srt/function_call/glm4_moe_detector.py +4 -4
- sglang/srt/function_call/gpt_oss_detector.py +23 -0
- sglang/srt/function_call/json_array_parser.py +63 -0
- sglang/srt/function_call/kimik2_detector.py +17 -4
- sglang/srt/function_call/qwen3_coder_detector.py +1 -1
- sglang/srt/function_call/utils.py +96 -5
- sglang/srt/grpc/__init__.py +1 -0
- sglang/srt/grpc/compile_proto.py +245 -0
- sglang/srt/grpc/sglang_scheduler_pb2.py +111 -0
- sglang/srt/grpc/sglang_scheduler_pb2.pyi +434 -0
- sglang/srt/grpc/sglang_scheduler_pb2_grpc.py +239 -0
- sglang/srt/layers/activation.py +143 -9
- sglang/srt/layers/attention/aiter_backend.py +14 -15
- sglang/srt/layers/attention/ascend_backend.py +115 -9
- sglang/srt/layers/attention/attention_registry.py +206 -0
- sglang/srt/layers/attention/base_attn_backend.py +12 -3
- sglang/srt/layers/attention/cutlass_mla_backend.py +3 -3
- sglang/srt/layers/attention/dual_chunk_flashattention_backend.py +1 -1
- sglang/srt/layers/attention/fla/chunk.py +242 -0
- sglang/srt/layers/attention/fla/chunk_delta_h.py +314 -0
- sglang/srt/layers/attention/fla/chunk_o.py +178 -0
- sglang/srt/layers/attention/fla/chunk_scaled_dot_kkt.py +151 -0
- sglang/srt/layers/attention/fla/cumsum.py +300 -0
- sglang/srt/layers/attention/fla/fused_recurrent.py +640 -0
- sglang/srt/layers/attention/fla/fused_sigmoid_gating_recurrent.py +232 -0
- sglang/srt/layers/attention/fla/index.py +37 -0
- sglang/srt/layers/attention/fla/l2norm.py +150 -0
- sglang/srt/layers/attention/fla/layernorm_gated.py +326 -0
- sglang/srt/layers/attention/fla/op.py +66 -0
- sglang/srt/layers/attention/fla/solve_tril.py +465 -0
- sglang/srt/layers/attention/fla/utils.py +331 -0
- sglang/srt/layers/attention/fla/wy_fast.py +158 -0
- sglang/srt/layers/attention/flashattention_backend.py +41 -8
- sglang/srt/layers/attention/flashinfer_backend.py +118 -198
- sglang/srt/layers/attention/flashinfer_mla_backend.py +27 -27
- sglang/srt/layers/attention/flashmla_backend.py +7 -5
- sglang/srt/layers/attention/hybrid_attn_backend.py +68 -53
- sglang/srt/layers/attention/hybrid_linear_attn_backend.py +602 -0
- sglang/srt/layers/attention/intel_amx_backend.py +3 -0
- sglang/srt/layers/attention/mamba/causal_conv1d.py +129 -0
- sglang/srt/layers/attention/mamba/causal_conv1d_triton.py +969 -0
- sglang/srt/layers/attention/mamba/mamba.py +629 -0
- sglang/srt/layers/attention/mamba/mamba_utils.py +81 -0
- sglang/srt/layers/attention/mamba/ops/__init__.py +2 -0
- sglang/srt/layers/attention/mamba/ops/layernorm_gated.py +172 -0
- sglang/srt/layers/attention/mamba/ops/mamba_ssm.py +442 -0
- sglang/srt/layers/attention/mamba/ops/ssd_bmm.py +264 -0
- sglang/srt/layers/attention/mamba/ops/ssd_chunk_scan.py +622 -0
- sglang/srt/layers/attention/mamba/ops/ssd_chunk_state.py +757 -0
- sglang/srt/layers/attention/mamba/ops/ssd_combined.py +262 -0
- sglang/srt/layers/attention/mamba/ops/ssd_state_passing.py +275 -0
- sglang/srt/layers/attention/npu_ops/mla_preprocess.py +393 -0
- sglang/srt/layers/attention/nsa/dequant_k_cache.py +163 -0
- sglang/srt/layers/attention/nsa/index_buf_accessor.py +354 -0
- sglang/srt/layers/attention/nsa/nsa_indexer.py +761 -0
- sglang/srt/layers/attention/nsa/quant_k_cache.py +255 -0
- sglang/srt/layers/attention/nsa/tilelang_kernel.py +785 -0
- sglang/srt/layers/attention/nsa/transform_index.py +144 -0
- sglang/srt/layers/attention/nsa/utils.py +24 -0
- sglang/srt/layers/attention/nsa_backend.py +887 -0
- sglang/srt/layers/attention/tbo_backend.py +6 -6
- sglang/srt/layers/attention/torch_flex_backend.py +325 -0
- sglang/srt/layers/attention/torch_native_backend.py +12 -6
- sglang/srt/layers/attention/triton_backend.py +57 -7
- sglang/srt/layers/attention/trtllm_mha_backend.py +5 -7
- sglang/srt/layers/attention/trtllm_mla_backend.py +276 -39
- sglang/srt/layers/attention/vision.py +58 -0
- sglang/srt/layers/attention/wave_backend.py +4 -4
- sglang/srt/layers/attention/wave_ops/decode_attention.py +2 -4
- sglang/srt/layers/attention/wave_ops/extend_attention.py +1 -3
- sglang/srt/layers/communicator.py +8 -0
- sglang/srt/layers/dp_attention.py +41 -2
- sglang/srt/layers/elementwise.py +3 -1
- sglang/srt/layers/layernorm.py +34 -15
- sglang/srt/layers/linear.py +55 -7
- sglang/srt/layers/logits_processor.py +44 -12
- sglang/srt/layers/moe/__init__.py +2 -1
- sglang/srt/layers/moe/cutlass_w4a8_moe.py +3 -3
- sglang/srt/layers/moe/ep_moe/kernels.py +2 -2
- sglang/srt/layers/moe/ep_moe/layer.py +256 -63
- sglang/srt/layers/moe/flashinfer_cutedsl_moe.py +183 -0
- sglang/srt/layers/moe/fused_moe_native.py +5 -3
- sglang/srt/layers/moe/fused_moe_triton/configs/{triton_3_4_0/E=128,N=768,device_name=NVIDIA_B200,dtype=fp8_w8a8,block_shape=[128, 128].json → triton_3_3_1/E=128,N=768,device_name=NVIDIA_H20,dtype=fp8_w8a8,block_shape=[128, 128].json} +35 -35
- sglang/srt/layers/moe/fused_moe_triton/configs/triton_3_4_0/E=128,N=352,device_name=NVIDIA_RTX_5880_Ada_Generation,dtype=fp8_w8a8.json +146 -0
- sglang/srt/layers/moe/fused_moe_triton/configs/triton_3_4_0/E=256,N=256,device_name=NVIDIA_H800,dtype=fp8_w8a8,block_shape=[128, 128].json +146 -0
- sglang/srt/layers/moe/fused_moe_triton/configs/triton_3_4_0/E=256,N=512,device_name=NVIDIA_H20.json +146 -0
- sglang/srt/layers/moe/fused_moe_triton/configs/triton_3_4_0/E=512,N=128,device_name=NVIDIA_H100_80GB_HBM3.json +146 -0
- sglang/srt/layers/moe/fused_moe_triton/configs/triton_3_4_0/E=512,N=128,device_name=NVIDIA_H20-3e.json +146 -0
- sglang/srt/layers/moe/fused_moe_triton/configs/triton_3_4_0/E=512,N=128,device_name=NVIDIA_H200.json +146 -0
- sglang/srt/layers/moe/fused_moe_triton/configs/triton_3_4_0/E=512,N=128,device_name=NVIDIA_H800,dtype=fp8_w8a8,block_shape=[128, 128].json +146 -0
- sglang/srt/layers/moe/fused_moe_triton/configs/triton_3_4_0/E=512,N=256,device_name=NVIDIA_B200.json +146 -0
- sglang/srt/layers/moe/fused_moe_triton/configs/triton_3_4_0/E=512,N=256,device_name=NVIDIA_H20-3e.json +146 -0
- sglang/srt/layers/moe/fused_moe_triton/configs/triton_3_4_0/E=512,N=256,device_name=NVIDIA_H200.json +146 -0
- sglang/srt/layers/moe/fused_moe_triton/configs/triton_3_4_0/E=512,N=64,device_name=NVIDIA_H100_80GB_HBM3.json +146 -0
- sglang/srt/layers/moe/fused_moe_triton/configs/triton_3_4_0/E=512,N=64,device_name=NVIDIA_H200.json +146 -0
- sglang/srt/layers/moe/fused_moe_triton/fused_moe.py +5 -2
- sglang/srt/layers/moe/fused_moe_triton/fused_moe_triton_config.py +7 -3
- sglang/srt/layers/moe/fused_moe_triton/fused_moe_triton_kernels.py +23 -20
- sglang/srt/layers/moe/fused_moe_triton/layer.py +71 -70
- sglang/srt/layers/moe/moe_runner/__init__.py +2 -1
- sglang/srt/layers/moe/moe_runner/base.py +274 -1
- sglang/srt/layers/moe/moe_runner/runner.py +80 -0
- sglang/srt/layers/moe/moe_runner/triton.py +448 -0
- sglang/srt/layers/moe/token_dispatcher/__init__.py +16 -4
- sglang/srt/layers/moe/token_dispatcher/{base_dispatcher.py → base.py} +67 -17
- sglang/srt/layers/moe/token_dispatcher/deepep.py +118 -56
- sglang/srt/layers/moe/token_dispatcher/standard.py +44 -2
- sglang/srt/layers/moe/topk.py +30 -9
- sglang/srt/layers/moe/utils.py +22 -6
- sglang/srt/layers/parameter.py +23 -6
- sglang/srt/layers/quantization/awq.py +19 -7
- sglang/srt/layers/quantization/base_config.py +11 -6
- sglang/srt/layers/quantization/blockwise_int8.py +38 -27
- sglang/srt/layers/quantization/compressed_tensors/compressed_tensors.py +1 -0
- sglang/srt/layers/quantization/compressed_tensors/compressed_tensors_moe.py +50 -30
- sglang/srt/layers/quantization/compressed_tensors/schemes/__init__.py +2 -0
- sglang/srt/layers/quantization/compressed_tensors/schemes/compressed_tensors_w8a8_fp8.py +13 -1
- sglang/srt/layers/quantization/compressed_tensors/schemes/compressed_tensors_w8a8_int8.py +173 -0
- sglang/srt/layers/quantization/deep_gemm_wrapper/configurer.py +2 -10
- sglang/srt/layers/quantization/deep_gemm_wrapper/entrypoint.py +27 -0
- sglang/srt/layers/quantization/fp8.py +78 -49
- sglang/srt/layers/quantization/fp8_utils.py +51 -32
- sglang/srt/layers/quantization/gptq.py +25 -17
- sglang/srt/layers/quantization/modelopt_quant.py +190 -55
- sglang/srt/layers/quantization/moe_wna16.py +21 -18
- sglang/srt/layers/quantization/mxfp4.py +74 -42
- sglang/srt/layers/quantization/quark/quark_moe.py +48 -30
- sglang/srt/layers/quantization/unquant.py +135 -47
- sglang/srt/layers/quantization/w4afp8.py +26 -17
- sglang/srt/layers/quantization/w8a8_fp8.py +35 -20
- sglang/srt/layers/quantization/w8a8_int8.py +91 -41
- sglang/srt/layers/rotary_embedding.py +78 -31
- sglang/srt/layers/sampler.py +213 -21
- sglang/srt/layers/utils.py +23 -0
- sglang/srt/lora/backend/base_backend.py +50 -8
- sglang/srt/lora/backend/chunked_backend.py +348 -0
- sglang/srt/lora/backend/triton_backend.py +99 -5
- sglang/srt/lora/layers.py +32 -0
- sglang/srt/lora/lora.py +8 -3
- sglang/srt/lora/lora_manager.py +52 -118
- sglang/srt/lora/mem_pool.py +25 -11
- sglang/srt/lora/triton_ops/__init__.py +4 -0
- sglang/srt/lora/triton_ops/chunked_sgmv_expand.py +214 -0
- sglang/srt/lora/triton_ops/chunked_sgmv_shrink.py +174 -0
- sglang/srt/lora/utils.py +22 -11
- sglang/srt/managers/async_dynamic_batch_tokenizer.py +170 -0
- sglang/srt/managers/cache_controller.py +199 -301
- sglang/srt/managers/data_parallel_controller.py +115 -80
- sglang/srt/managers/detokenizer_manager.py +19 -15
- sglang/srt/managers/disagg_service.py +46 -0
- sglang/srt/managers/io_struct.py +340 -109
- sglang/srt/managers/mm_utils.py +44 -6
- sglang/srt/managers/multi_tokenizer_mixin.py +357 -407
- sglang/srt/managers/multimodal_processor.py +1 -2
- sglang/srt/managers/overlap_utils.py +53 -0
- sglang/srt/managers/schedule_batch.py +240 -138
- sglang/srt/managers/schedule_policy.py +144 -17
- sglang/srt/managers/scheduler.py +502 -209
- sglang/srt/managers/scheduler_input_blocker.py +1 -1
- sglang/srt/managers/scheduler_metrics_mixin.py +99 -126
- sglang/srt/managers/scheduler_output_processor_mixin.py +75 -22
- sglang/srt/managers/scheduler_profiler_mixin.py +6 -6
- sglang/srt/managers/scheduler_update_weights_mixin.py +7 -0
- sglang/srt/managers/tokenizer_communicator_mixin.py +675 -0
- sglang/srt/managers/tokenizer_manager.py +320 -632
- sglang/srt/managers/tp_worker.py +81 -22
- sglang/srt/managers/tp_worker_overlap_thread.py +71 -56
- sglang/srt/managers/utils.py +1 -45
- sglang/srt/mem_cache/allocator.py +14 -20
- sglang/srt/mem_cache/allocator_ascend.py +41 -27
- sglang/srt/mem_cache/base_prefix_cache.py +1 -1
- sglang/srt/mem_cache/chunk_cache.py +8 -1
- sglang/srt/mem_cache/evict_policy.py +23 -0
- sglang/srt/mem_cache/hicache_storage.py +43 -24
- sglang/srt/mem_cache/hiradix_cache.py +222 -75
- sglang/srt/mem_cache/memory_pool.py +535 -58
- sglang/srt/mem_cache/memory_pool_host.py +239 -228
- sglang/srt/mem_cache/radix_cache.py +222 -73
- sglang/srt/mem_cache/radix_cache_cpp.py +11 -8
- sglang/srt/mem_cache/storage/__init__.py +10 -0
- sglang/srt/mem_cache/storage/aibrix_kvcache/aibrix_kvcache_storage.py +151 -0
- sglang/srt/mem_cache/storage/aibrix_kvcache/unit_test.py +109 -0
- sglang/srt/mem_cache/storage/backend_factory.py +223 -0
- sglang/srt/mem_cache/storage/eic/eic_storage.py +778 -0
- sglang/srt/mem_cache/storage/eic/test_unit.py +115 -0
- sglang/srt/mem_cache/storage/hf3fs/hf3fs_client.py +164 -0
- sglang/srt/mem_cache/storage/hf3fs/{client_hf3fs.py → hf3fs_usrbio_client.py} +5 -1
- sglang/srt/mem_cache/storage/hf3fs/storage_hf3fs.py +259 -62
- sglang/srt/mem_cache/storage/lmcache/lmc_radix_cache.py +284 -0
- sglang/srt/mem_cache/storage/lmcache/unit_test.py +121 -0
- sglang/srt/mem_cache/storage/mooncake_store/mooncake_store.py +166 -17
- sglang/srt/mem_cache/swa_radix_cache.py +25 -36
- sglang/srt/metrics/collector.py +511 -132
- sglang/srt/metrics/func_timer.py +2 -7
- sglang/srt/metrics/startup_func_log_and_timer.py +150 -0
- sglang/srt/metrics/utils.py +8 -1
- sglang/srt/model_executor/cpu_graph_runner.py +640 -0
- sglang/srt/model_executor/cuda_graph_runner.py +52 -37
- sglang/srt/model_executor/forward_batch_info.py +82 -40
- sglang/srt/model_executor/model_runner.py +432 -157
- sglang/srt/model_executor/npu_graph_runner.py +12 -5
- sglang/srt/model_loader/__init__.py +9 -3
- sglang/srt/model_loader/loader.py +133 -5
- sglang/srt/model_loader/remote_instance_weight_loader_utils.py +69 -0
- sglang/srt/model_loader/weight_utils.py +158 -3
- sglang/srt/models/apertus.py +686 -0
- sglang/srt/models/bailing_moe.py +820 -217
- sglang/srt/models/bailing_moe_nextn.py +168 -0
- sglang/srt/models/deepseek_nextn.py +6 -1
- sglang/srt/models/deepseek_v2.py +607 -130
- sglang/srt/models/dots_ocr.py +173 -0
- sglang/srt/models/dots_vlm.py +174 -0
- sglang/srt/models/dots_vlm_vit.py +337 -0
- sglang/srt/models/ernie4.py +1 -1
- sglang/srt/models/falcon_h1.py +576 -0
- sglang/srt/models/gemma3_causal.py +0 -2
- sglang/srt/models/gemma3_mm.py +1 -1
- sglang/srt/models/gemma3n_mm.py +2 -2
- sglang/srt/models/glm4_moe.py +4 -4
- sglang/srt/models/glm4_moe_nextn.py +2 -2
- sglang/srt/models/glm4v.py +5 -3
- sglang/srt/models/glm4v_moe.py +4 -1
- sglang/srt/models/gpt_oss.py +8 -31
- sglang/srt/models/kimi_vl_moonvit.py +2 -2
- sglang/srt/models/llama.py +4 -0
- sglang/srt/models/llama4.py +9 -0
- sglang/srt/models/llama_eagle3.py +13 -0
- sglang/srt/models/longcat_flash.py +3 -3
- sglang/srt/models/longcat_flash_nextn.py +1 -1
- sglang/srt/models/mllama4.py +40 -4
- sglang/srt/models/opt.py +637 -0
- sglang/srt/models/qwen2_5_vl.py +29 -5
- sglang/srt/models/qwen2_audio.py +1 -1
- sglang/srt/models/qwen2_moe.py +120 -13
- sglang/srt/models/qwen2_vl.py +1 -1
- sglang/srt/models/qwen3.py +18 -3
- sglang/srt/models/qwen3_moe.py +32 -4
- sglang/srt/models/qwen3_next.py +1069 -0
- sglang/srt/models/qwen3_next_mtp.py +112 -0
- sglang/srt/models/qwen3_vl.py +787 -0
- sglang/srt/models/qwen3_vl_moe.py +471 -0
- sglang/srt/models/registry.py +15 -3
- sglang/srt/models/sarashina2_vision.py +269 -0
- sglang/srt/models/solar.py +505 -0
- sglang/srt/models/starcoder2.py +357 -0
- sglang/srt/models/step3_vl.py +1 -1
- sglang/srt/models/torch_native_llama.py +9 -2
- sglang/srt/models/utils.py +51 -0
- sglang/srt/multimodal/processors/base_processor.py +15 -7
- sglang/srt/multimodal/processors/dots_vlm.py +98 -0
- sglang/srt/multimodal/processors/glm4v.py +9 -9
- sglang/srt/multimodal/processors/internvl.py +153 -129
- sglang/srt/multimodal/processors/qwen_vl.py +23 -6
- sglang/srt/multimodal/processors/sarashina2_vision.py +81 -0
- sglang/srt/offloader.py +27 -3
- sglang/srt/parser/jinja_template_utils.py +6 -0
- sglang/srt/sampling/sampling_batch_info.py +38 -17
- sglang/srt/sampling/sampling_params.py +7 -0
- sglang/srt/server_args.py +966 -267
- sglang/srt/server_args_config_parser.py +146 -0
- sglang/srt/single_batch_overlap.py +151 -0
- sglang/srt/speculative/cpp_ngram/ngram.cpp +374 -0
- sglang/srt/speculative/cpp_ngram/ngram.h +110 -0
- sglang/srt/speculative/cpp_ngram/ngram_cache.py +138 -0
- sglang/srt/speculative/cpp_ngram/ngram_cache_binding.cpp +43 -0
- sglang/srt/speculative/cpp_ngram/param.h +125 -0
- sglang/srt/speculative/cpp_ngram/queue.h +71 -0
- sglang/srt/speculative/eagle_draft_cuda_graph_runner.py +7 -1
- sglang/srt/speculative/eagle_draft_extend_cuda_graph_runner.py +13 -2
- sglang/srt/speculative/{eagle_utils.py → eagle_info.py} +207 -757
- sglang/srt/speculative/eagle_worker.py +99 -28
- sglang/srt/speculative/ngram_utils.py +428 -0
- sglang/srt/speculative/ngram_worker.py +245 -0
- sglang/srt/speculative/spec_info.py +52 -0
- sglang/srt/speculative/spec_utils.py +606 -0
- sglang/srt/speculative/standalone_worker.py +109 -0
- sglang/srt/torch_memory_saver_adapter.py +5 -7
- sglang/srt/tracing/trace.py +578 -0
- sglang/srt/two_batch_overlap.py +8 -5
- sglang/srt/utils/__init__.py +2 -0
- sglang/srt/{utils.py → utils/common.py} +433 -77
- sglang/srt/{hf_transformers_utils.py → utils/hf_transformers_utils.py} +53 -5
- sglang/srt/{patch_torch.py → utils/patch_torch.py} +8 -0
- sglang/srt/utils/rpd_utils.py +452 -0
- sglang/srt/utils/slow_rank_detector.py +71 -0
- sglang/srt/warmup.py +8 -4
- sglang/srt/weight_sync/utils.py +2 -2
- sglang/test/attention/test_trtllm_mla_backend.py +169 -5
- sglang/test/get_logits_ut.py +57 -0
- sglang/test/run_eval.py +79 -11
- sglang/test/runners.py +5 -1
- sglang/test/simple_eval_common.py +5 -2
- sglang/test/simple_eval_mmmu_vlm.py +441 -0
- sglang/test/test_block_fp8.py +2 -2
- sglang/test/test_cutlass_moe.py +24 -6
- sglang/test/test_deterministic.py +297 -0
- sglang/test/test_disaggregation_utils.py +77 -0
- sglang/test/test_fp4_moe.py +370 -1
- sglang/test/test_programs.py +1 -1
- sglang/test/test_utils.py +383 -5
- sglang/utils.py +21 -1
- sglang/version.py +1 -1
- {sglang-0.5.2rc2.dist-info → sglang-0.5.3.dist-info}/METADATA +69 -124
- {sglang-0.5.2rc2.dist-info → sglang-0.5.3.dist-info}/RECORD +375 -245
- sglang/srt/disaggregation/launch_lb.py +0 -118
- sglang/srt/mem_cache/lora_radix_cache.py +0 -421
- /sglang/srt/{poll_based_barrier.py → utils/poll_based_barrier.py} +0 -0
- {sglang-0.5.2rc2.dist-info → sglang-0.5.3.dist-info}/WHEEL +0 -0
- {sglang-0.5.2rc2.dist-info → sglang-0.5.3.dist-info}/licenses/LICENSE +0 -0
- {sglang-0.5.2rc2.dist-info → sglang-0.5.3.dist-info}/top_level.txt +0 -0
sglang/srt/managers/data_parallel_controller.py

@@ -13,16 +13,15 @@
 # ==============================================================================
 """A controller that dispatches requests to multiple data parallel workers."""

+import faulthandler
 import logging
 import multiprocessing as mp
 import signal
-import struct
-import sys
 import threading
 import time
+from collections import deque
 from enum import Enum, auto
-from
-from typing import Dict, List
+from typing import List

 import psutil
 import setproctitle
@@ -33,14 +32,19 @@ from sglang.srt.managers.io_struct import (
     BlockReqInput,
     TokenizedEmbeddingReqInput,
     TokenizedGenerateReqInput,
+    WatchLoadUpdateReq,
 )
 from sglang.srt.managers.schedule_batch import Req
 from sglang.srt.managers.scheduler import run_scheduler_process
-from sglang.srt.managers.utils import DPBalanceMeta
 from sglang.srt.server_args import PortArgs, ServerArgs
 from sglang.srt.torch_memory_saver_adapter import TorchMemorySaverAdapter
-from sglang.srt.utils import
-
+from sglang.srt.utils import (
+    bind_port,
+    configure_logger,
+    get_zmq_socket,
+    kill_itself_when_parent_died,
+)
+from sglang.utils import TypeBasedDispatcher, get_exception_traceback

 logger = logging.getLogger(__name__)

@@ -61,18 +65,48 @@ class LoadBalanceMethod(Enum):
             raise ValueError(f"Invalid load balance method: {method}") from exc


+class DPBudget:
+    def __init__(self):
+        # TODO: support minimum tokens method
+        self.budget_queue = deque()
+
+    def update_budget(self, load_update: WatchLoadUpdateReq):
+        """Update the budget queue.
+        Use num_reqs instead of num_waiting_reqs to balance decode running batch.
+        """
+        loads = load_update.loads
+        self.budget_queue.clear()
+
+        num_reqs = [load.num_reqs for load in loads]
+        if not num_reqs:
+            return
+
+        max_num_reqs = max(num_reqs)
+        if all(x == max_num_reqs for x in num_reqs):
+            return
+
+        while any(x != num_reqs[0] for x in num_reqs):
+            min_load = min(num_reqs)
+            min_indices = [i for i, x in enumerate(num_reqs) if x == min_load]
+            second_min_load = min(x for x in num_reqs if x > min_load)
+            self.budget_queue.extend(
+                [loads[i].dp_rank for i in min_indices] * (second_min_load - min_load)
+            )
+            for idx in min_indices:
+                num_reqs[idx] = second_min_load
+
+    def dispatch(self):
+        if self.budget_queue:
+            return self.budget_queue.popleft()
+        return None
+
+
 class DataParallelController:
     """A controller that dispatches requests to multiple data parallel workers."""

-    def __init__(
-        self,
-        server_args: ServerArgs,
-        port_args: PortArgs,
-        dp_balance_meta: DPBalanceMeta,
-    ) -> None:
+    def __init__(self, server_args: ServerArgs, port_args: PortArgs) -> None:
         # for dp balance
         self.global_balance_id = 0
-        self.balance_meta = dp_balance_meta

         # Parse args
         self.max_total_num_tokens = None
@@ -98,9 +132,12 @@ class DataParallelController:
         }
         self.dispatching = dispatch_lookup[self.load_balance_method]

+        # Load balance budget
+        self.dp_budget = DPBudget()
+
         # Launch data parallel workers
         self.scheduler_procs = []
-        self.workers = [None] * server_args.dp_size
+        self.workers: List[zmq.Socket] = [None] * server_args.dp_size

         if server_args.enable_dp_attention:
             dp_port_args = self.launch_dp_attention_schedulers(server_args, port_args)
@@ -121,6 +158,31 @@ class DataParallelController:

         self.max_req_input_len = None

+        self.init_dispatcher()
+
+    def send_to_all_workers(self, obj):
+        for worker in self.workers:
+            worker.send_pyobj(obj)
+
+    def send_control_message(self, obj):
+        # Send control messages to first worker of tp group
+        for worker in self.workers[:: self.control_message_step]:
+            worker.send_pyobj(obj)
+
+    def handle_load_update_req(self, obj):
+        self.dp_budget.update_budget(obj)
+
+    def init_dispatcher(self):
+        self._request_dispatcher = TypeBasedDispatcher(
+            [
+                (TokenizedGenerateReqInput, self.dispatching),
+                (TokenizedEmbeddingReqInput, self.dispatching),
+                (BlockReqInput, self.send_to_all_workers),
+                (WatchLoadUpdateReq, self.handle_load_update_req),
+            ]
+        )
+        self._request_dispatcher.add_fallback_fn(self.send_control_message)
+
     def launch_dp_schedulers(self, server_args, port_args):
         base_gpu_id = 0

@@ -147,7 +209,9 @@ class DataParallelController:
                 args=(server_args, tmp_port_args, base_gpu_id, dp_rank, ready_event),
             )
             threads.append(thread)
-            base_gpu_id +=
+            base_gpu_id += (
+                server_args.tp_size * server_args.pp_size * server_args.gpu_id_step
+            )

         # Free all sockets before starting the threads to launch TP workers
         for sock in sockets:
@@ -250,7 +314,6 @@ class DataParallelController:
                         pp_rank,
                         dp_rank,
                         writer,
-                        self.balance_meta,
                     ),
                 )
                 with memory_saver_adapter.configure_subprocess():
@@ -266,52 +329,43 @@ class DataParallelController:
         self.max_total_num_tokens = scheduler_info[0]["max_total_num_tokens"]
         self.max_req_input_len = scheduler_info[0]["max_req_input_len"]

+    def maybe_external_dp_rank_routing(self, req: Req):
+        if req.data_parallel_rank is not None:
+            logger.debug(f"Direct routing to DP rank {req.data_parallel_rank}")
+            self.workers[req.data_parallel_rank].send_pyobj(req)
+            return True
+        return False
+
     def round_robin_scheduler(self, req: Req):
+        if self.maybe_external_dp_rank_routing(req):
+            return
+
         if self.server_args.disaggregation_mode == "null":
-
-
-                self.workers
-
-                self.workers[self.round_robin_counter].send_pyobj(req)
-                self.round_robin_counter = (self.round_robin_counter + 1) % len(
-                    self.workers
-                )
+            self.workers[self.round_robin_counter].send_pyobj(req)
+            self.round_robin_counter = (self.round_robin_counter + 1) % len(
+                self.workers
+            )
         else:
-
-
-
-
-
-
-
-
+            self.workers[req.bootstrap_room % len(self.workers)].send_pyobj(req)
+
+    def shortest_queue_scheduler(self, req):
+        if self.maybe_external_dp_rank_routing(req):
+            return
+        target_worker = self.dp_budget.dispatch()
+        if target_worker is None:
+            self.round_robin_scheduler(req)
+        else:
+            self.workers[target_worker].send_pyobj(req)

     def minimum_tokens_scheduler(self, req):
-
-
-        def get_next_global_balance_id() -> int:
-            INT32_MAX = 2147483647
-            current_id = self.global_balance_id
-            self.global_balance_id = (self.global_balance_id + 1) % INT32_MAX
-            return current_id
-
-        req.dp_balance_id = get_next_global_balance_id()
-        with self.balance_meta.mutex:
-            # 1. local_tokens represents the tokens currently inferring on the worker,
-            # while onfly refers to the requests dispatched by the dispatcher but not yet received by the scheduler.
-            onfly_info = self.balance_meta.get_shared_onfly()
-            local_tokens = self.balance_meta.get_shared_local_tokens()
-            total_tokens = [
-                local_token + sum(onfly_dict.values())
-                for local_token, onfly_dict in zip(local_tokens, onfly_info)
-            ]
-            target_worker = total_tokens.index(min(total_tokens))
-            onfly_info[target_worker][req.dp_balance_id] = len(req.input_ids)
-            # 2. write the new onfly info to the shm
-            self.balance_meta.set_shared_onfly_info(onfly_info)
+        if self.maybe_external_dp_rank_routing(req):
+            return

-
-
+        logger.warning(
+            "The 'minimum_tokens' load balancing method is deprecated for now and will introduced later."
+            "Fall back to 'round_robin_scheduler'"
+        )
+        self.round_robin_scheduler(req)

     def event_loop(self):
         while True:
@@ -320,22 +374,7 @@ class DataParallelController:
                     recv_req = self.recv_from_tokenizer.recv_pyobj(zmq.NOBLOCK)
                 except zmq.ZMQError:
                     break
-
-                if isinstance(
-                    recv_req,
-                    (
-                        TokenizedGenerateReqInput,
-                        TokenizedEmbeddingReqInput,
-                    ),
-                ):
-                    self.dispatching(recv_req)
-                elif isinstance(recv_req, BlockReqInput):
-                    for worker in self.workers:
-                        worker.send_pyobj(recv_req)
-                else:
-                    # Send other control messages to first worker of tp group
-                    for worker in self.workers[:: self.control_message_step]:
-                        worker.send_pyobj(recv_req)
+                self._request_dispatcher(recv_req)


 def run_data_parallel_controller_process(
@@ -343,15 +382,14 @@ def run_data_parallel_controller_process(
     port_args: PortArgs,
     pipe_writer,
 ):
+    kill_itself_when_parent_died()
     setproctitle.setproctitle("sglang::data_parallel_controller")
+    faulthandler.enable()
     configure_logger(server_args)
     parent_process = psutil.Process().parent()
-    balance_meta = DPBalanceMeta(server_args.dp_size)

     try:
-        controller = DataParallelController(
-            server_args, port_args, dp_balance_meta=balance_meta
-        )
+        controller = DataParallelController(server_args, port_args)
         pipe_writer.send(
             {
                 "status": "ready",
@@ -370,6 +408,3 @@ def run_data_parallel_controller_process(
         traceback = get_exception_traceback()
         logger.error(f"DataParallelController hit an exception: {traceback}")
         parent_process.send_signal(signal.SIGQUIT)
-    finally:
-        # we need to destruct mp.Manager() in balance_meta
-        balance_meta.destructor()
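The DPBudget class added above drives the new shortest_queue policy with a simple level-filling scheme: each WatchLoadUpdateReq snapshot is turned into a queue of DP ranks by repeatedly topping up the least-loaded ranks until every rank reports the same num_reqs; dispatch() then pops ranks from that queue, and shortest_queue_scheduler falls back to round-robin once the queue is empty. Below is a minimal standalone sketch of the same leveling logic; the Load dataclass is a hypothetical stand-in for the per-rank entries carried by WatchLoadUpdateReq.loads, not sglang's actual type.

from collections import deque
from dataclasses import dataclass
from typing import Deque, List, Optional


@dataclass
class Load:
    # Hypothetical stand-in for the per-rank load entries in WatchLoadUpdateReq.loads.
    dp_rank: int
    num_reqs: int


def build_budget(loads: List[Load]) -> Deque[int]:
    # Mirrors DPBudget.update_budget: level every rank up to the current maximum load.
    budget: Deque[int] = deque()
    num_reqs = [load.num_reqs for load in loads]
    if not num_reqs or all(x == max(num_reqs) for x in num_reqs):
        return budget
    while any(x != num_reqs[0] for x in num_reqs):
        min_load = min(num_reqs)
        min_indices = [i for i, x in enumerate(num_reqs) if x == min_load]
        second_min_load = min(x for x in num_reqs if x > min_load)
        # Each least-loaded rank gets (second_min_load - min_load) slots in the queue.
        budget.extend(
            [loads[i].dp_rank for i in min_indices] * (second_min_load - min_load)
        )
        for idx in min_indices:
            num_reqs[idx] = second_min_load
    return budget


def dispatch(budget: Deque[int]) -> Optional[int]:
    # Mirrors DPBudget.dispatch: pop the next rank, or None to fall back to round-robin.
    return budget.popleft() if budget else None


if __name__ == "__main__":
    # Ranks 1 and 2 are behind rank 0, so they absorb the next requests first.
    budget = build_budget(
        [Load(dp_rank=0, num_reqs=5), Load(dp_rank=1, num_reqs=2), Load(dp_rank=2, num_reqs=4)]
    )
    print(list(budget))  # [1, 1, 1, 2]: rank 1 catches up by three requests, rank 2 by one

Because update_budget clears and rebuilds the queue on every load report, stale budget entries never outlive a reporting interval.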
sglang/srt/managers/detokenizer_manager.py

@@ -24,17 +24,16 @@ import psutil
 import setproctitle
 import zmq

-from sglang.srt.hf_transformers_utils import get_tokenizer
 from sglang.srt.managers.io_struct import (
-
+    BatchEmbeddingOutput,
     BatchMultimodalDecodeReq,
-
-
-
+    BatchMultimodalOutput,
+    BatchStrOutput,
+    BatchTokenIDOutput,
     FreezeGCReq,
     MultiTokenizerRegisterReq,
 )
-from sglang.srt.managers.multi_tokenizer_mixin import
+from sglang.srt.managers.multi_tokenizer_mixin import MultiHttpWorkerDetokenizerMixin
 from sglang.srt.server_args import PortArgs, ServerArgs
 from sglang.srt.utils import (
     configure_logger,
@@ -42,6 +41,7 @@ from sglang.srt.utils import (
     get_zmq_socket,
     kill_itself_when_parent_died,
 )
+from sglang.srt.utils.hf_transformers_utils import get_tokenizer
 from sglang.utils import (
     TypeBasedDispatcher,
     find_printable_text,
@@ -69,7 +69,7 @@ class DecodeStatus:
     sent_offset: int = 0


-class DetokenizerManager(
+class DetokenizerManager(MultiHttpWorkerDetokenizerMixin):
     """DetokenizerManager is a process that detokenizes the token ids."""

     def __init__(
@@ -101,8 +101,8 @@ class DetokenizerManager(MultiTokenizerMixin):

         self._request_dispatcher = TypeBasedDispatcher(
             [
-                (
-                (
+                (BatchEmbeddingOutput, self.handle_batch_embedding_out),
+                (BatchTokenIDOutput, self.handle_batch_token_id_out),
                 (BatchMultimodalDecodeReq, self.handle_multimodal_decode_req),
                 (MultiTokenizerRegisterReq, lambda x: x),
                 (FreezeGCReq, self.handle_freeze_gc_req),
@@ -145,11 +145,11 @@ class DetokenizerManager(MultiTokenizerMixin):
             return output[:-1]
         return output

-    def handle_batch_embedding_out(self, recv_obj:
+    def handle_batch_embedding_out(self, recv_obj: BatchEmbeddingOutput):
         # If it is embedding model, no detokenization is needed.
         return recv_obj

-    def handle_batch_token_id_out(self, recv_obj:
+    def handle_batch_token_id_out(self, recv_obj: BatchTokenIDOutput):
         bs = len(recv_obj.rids)

         # Initialize decode status
@@ -224,7 +224,7 @@ class DetokenizerManager(MultiTokenizerMixin):
             s.sent_offset = len(output_str)
             output_strs.append(incremental_output)

-        return
+        return BatchStrOutput(
             rids=recv_obj.rids,
             finished_reasons=recv_obj.finished_reasons,
             output_strs=output_strs,
@@ -246,17 +246,21 @@ class DetokenizerManager(MultiTokenizerMixin):
             output_token_ids_logprobs_val=recv_obj.output_token_ids_logprobs_val,
             output_token_ids_logprobs_idx=recv_obj.output_token_ids_logprobs_idx,
             output_hidden_states=recv_obj.output_hidden_states,
+            placeholder_tokens_idx=None,
+            placeholder_tokens_val=None,
         )

     def handle_multimodal_decode_req(self, recv_obj: BatchMultimodalDecodeReq):
         outputs = self.tokenizer.detokenize(recv_obj)
-        return
+        return BatchMultimodalOutput(
             rids=recv_obj.rids,
             finished_reasons=recv_obj.finished_reasons,
             outputs=outputs,
             prompt_tokens=recv_obj.prompt_tokens,
             completion_tokens=recv_obj.completion_tokens,
             cached_tokens=recv_obj.cached_tokens,
+            placeholder_tokens_idx=None,
+            placeholder_tokens_val=None,
         )

     def handle_freeze_gc_req(self, recv_req: FreezeGCReq):
@@ -289,11 +293,11 @@ def run_detokenizer_process(
     try:
         manager = DetokenizerManager(server_args, port_args)
         if server_args.tokenizer_worker_num > 1:
-            manager.
+            manager.multi_http_worker_event_loop()
         else:
             manager.event_loop()
     except Exception:
-        manager.
+        manager.maybe_clear_socket_mapping()
         traceback = get_exception_traceback()
         logger.error(f"DetokenizerManager hit an exception: {traceback}")
         parent_process.send_signal(signal.SIGQUIT)
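Both managers above route incoming ZMQ messages through sglang.utils.TypeBasedDispatcher: handlers are registered per message type, and in the data parallel controller add_fallback_fn catches anything unregistered, which replaces the old isinstance chain in its event loop. The sketch below illustrates that dispatch pattern only; SimpleTypeDispatcher and the toy message classes are hypothetical stand-ins, not sglang's actual implementation.

from typing import Any, Callable, List, Optional, Tuple, Type


class SimpleTypeDispatcher:
    # Illustrative stand-in for the dispatch pattern used by TypeBasedDispatcher above.
    def __init__(self, mapping: List[Tuple[Type, Callable[[Any], Any]]]):
        self._mapping = mapping
        self._fallback: Optional[Callable[[Any], Any]] = None

    def add_fallback_fn(self, fn: Callable[[Any], Any]) -> None:
        # Anything that matches no registered type goes here (e.g. control messages).
        self._fallback = fn

    def __call__(self, obj: Any) -> Any:
        for ty, fn in self._mapping:
            if isinstance(obj, ty):
                return fn(obj)
        if self._fallback is not None:
            return self._fallback(obj)
        raise ValueError(f"Unhandled request type: {type(obj)}")


class GenerateReq:  # toy message type for the demo
    pass


class ControlMsg:  # toy message type for the demo
    pass


if __name__ == "__main__":
    dispatcher = SimpleTypeDispatcher([(GenerateReq, lambda r: "dispatched to a worker")])
    dispatcher.add_fallback_fn(lambda r: "broadcast as a control message")
    print(dispatcher(GenerateReq()))  # dispatched to a worker
    print(dispatcher(ControlMsg()))   # broadcast as a control message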
sglang/srt/managers/disagg_service.py (new file)

@@ -0,0 +1,46 @@
+"""Start bootstrap/kv-store-related server"""
+
+import os
+from typing import Type
+
+from sglang.srt.disaggregation.base import BaseKVBootstrapServer
+from sglang.srt.disaggregation.utils import (
+    DisaggregationMode,
+    KVClassType,
+    TransferBackend,
+    get_kv_class,
+)
+from sglang.srt.server_args import ServerArgs
+
+
+def start_disagg_service(
+    server_args: ServerArgs,
+):
+    # Start kv boostrap server on prefill
+    disagg_mode = DisaggregationMode(server_args.disaggregation_mode)
+    transfer_backend = TransferBackend(server_args.disaggregation_transfer_backend)
+
+    if disagg_mode == DisaggregationMode.PREFILL:
+        # only start bootstrap server on prefill tm
+        kv_bootstrap_server_class: Type[BaseKVBootstrapServer] = get_kv_class(
+            transfer_backend, KVClassType.BOOTSTRAP_SERVER
+        )
+        bootstrap_server: BaseKVBootstrapServer = kv_bootstrap_server_class(
+            host=server_args.host,
+            port=server_args.disaggregation_bootstrap_port,
+        )
+        is_create_store = (
+            server_args.node_rank == 0 and transfer_backend == TransferBackend.ASCEND
+        )
+        if is_create_store:
+            try:
+                from mf_adapter import create_config_store
+
+                ascend_url = os.getenv("ASCEND_MF_STORE_URL")
+                create_config_store(ascend_url)
+            except Exception as e:
+                error_message = f"Failed create mf store, invalid ascend_url."
+                error_message += f" With exception {e}"
+                raise error_message
+
+        return bootstrap_server