sglang 0.5.3rc2__py3-none-any.whl → 0.5.4__py3-none-any.whl
This diff compares the contents of two publicly released package versions as published to a supported registry. It is provided for informational purposes only and reflects the packages exactly as they appear in their public registry.
- sglang/bench_one_batch.py +47 -28
- sglang/bench_one_batch_server.py +41 -25
- sglang/bench_serving.py +330 -156
- sglang/check_env.py +1 -1
- sglang/compile_deep_gemm.py +6 -2
- sglang/global_config.py +1 -25
- sglang/lang/api.py +6 -0
- sglang/lang/interpreter.py +1 -0
- sglang/lang/ir.py +13 -0
- sglang/launch_server.py +8 -15
- sglang/profiler.py +18 -1
- sglang/srt/_custom_ops.py +1 -1
- sglang/srt/batch_invariant_ops/batch_invariant_ops.py +4 -6
- sglang/srt/checkpoint_engine/checkpoint_engine_worker.py +142 -0
- sglang/srt/compilation/backend.py +437 -0
- sglang/srt/compilation/compilation_config.py +20 -0
- sglang/srt/compilation/compilation_counter.py +47 -0
- sglang/srt/compilation/compile.py +210 -0
- sglang/srt/compilation/compiler_interface.py +503 -0
- sglang/srt/compilation/cuda_piecewise_backend.py +228 -0
- sglang/srt/compilation/fix_functionalization.py +134 -0
- sglang/srt/compilation/fx_utils.py +83 -0
- sglang/srt/compilation/inductor_pass.py +140 -0
- sglang/srt/compilation/pass_manager.py +66 -0
- sglang/srt/compilation/piecewise_context_manager.py +40 -0
- sglang/srt/compilation/weak_ref_tensor_jit.py +16 -0
- sglang/srt/configs/__init__.py +4 -0
- sglang/srt/configs/deepseek_ocr.py +262 -0
- sglang/srt/configs/deepseekvl2.py +194 -96
- sglang/srt/configs/dots_vlm.py +2 -7
- sglang/srt/configs/falcon_h1.py +13 -64
- sglang/srt/configs/load_config.py +25 -2
- sglang/srt/configs/mamba_utils.py +117 -0
- sglang/srt/configs/model_config.py +134 -23
- sglang/srt/configs/modelopt_config.py +30 -0
- sglang/srt/configs/nemotron_h.py +286 -0
- sglang/srt/configs/olmo3.py +105 -0
- sglang/srt/configs/points_v15_chat.py +29 -0
- sglang/srt/configs/qwen3_next.py +11 -47
- sglang/srt/configs/qwen3_omni.py +613 -0
- sglang/srt/configs/qwen3_vl.py +0 -10
- sglang/srt/connector/remote_instance.py +1 -1
- sglang/srt/constrained/base_grammar_backend.py +5 -1
- sglang/srt/constrained/llguidance_backend.py +5 -0
- sglang/srt/constrained/outlines_backend.py +1 -1
- sglang/srt/constrained/reasoner_grammar_backend.py +9 -6
- sglang/srt/constrained/utils.py +12 -0
- sglang/srt/constrained/xgrammar_backend.py +20 -11
- sglang/srt/disaggregation/ascend/transfer_engine.py +1 -1
- sglang/srt/disaggregation/base/conn.py +17 -4
- sglang/srt/disaggregation/common/conn.py +4 -2
- sglang/srt/disaggregation/decode.py +123 -31
- sglang/srt/disaggregation/decode_kvcache_offload_manager.py +1 -1
- sglang/srt/disaggregation/fake/conn.py +11 -3
- sglang/srt/disaggregation/mooncake/conn.py +157 -19
- sglang/srt/disaggregation/nixl/conn.py +69 -24
- sglang/srt/disaggregation/prefill.py +96 -270
- sglang/srt/distributed/device_communicators/all_reduce_utils.py +4 -4
- sglang/srt/distributed/device_communicators/custom_all_reduce.py +6 -6
- sglang/srt/distributed/device_communicators/pymscclpp.py +2 -2
- sglang/srt/distributed/device_communicators/pynccl.py +24 -12
- sglang/srt/distributed/device_communicators/pynccl_allocator.py +2 -2
- sglang/srt/distributed/device_communicators/symm_mem.py +1 -1
- sglang/srt/distributed/naive_distributed.py +5 -4
- sglang/srt/distributed/parallel_state.py +70 -19
- sglang/srt/elastic_ep/elastic_ep.py +74 -0
- sglang/srt/entrypoints/context.py +3 -2
- sglang/srt/entrypoints/engine.py +66 -66
- sglang/srt/entrypoints/grpc_server.py +431 -234
- sglang/srt/entrypoints/harmony_utils.py +2 -2
- sglang/srt/entrypoints/http_server.py +120 -8
- sglang/srt/entrypoints/http_server_engine.py +1 -7
- sglang/srt/entrypoints/openai/protocol.py +225 -37
- sglang/srt/entrypoints/openai/serving_base.py +49 -2
- sglang/srt/entrypoints/openai/serving_chat.py +29 -74
- sglang/srt/entrypoints/openai/serving_classify.py +204 -0
- sglang/srt/entrypoints/openai/serving_completions.py +15 -1
- sglang/srt/entrypoints/openai/serving_responses.py +5 -2
- sglang/srt/entrypoints/openai/serving_tokenize.py +144 -0
- sglang/srt/environ.py +42 -4
- sglang/srt/eplb/eplb_algorithms/__init__.py +18 -1
- sglang/srt/eplb/eplb_algorithms/deepseek.py +0 -2
- sglang/srt/eplb/eplb_algorithms/elasticity_aware.py +87 -0
- sglang/srt/eplb/expert_distribution.py +3 -4
- sglang/srt/eplb/expert_location_dispatch.py +2 -2
- sglang/srt/eplb/expert_location_updater.py +2 -2
- sglang/srt/function_call/base_format_detector.py +17 -18
- sglang/srt/function_call/function_call_parser.py +18 -14
- sglang/srt/function_call/glm4_moe_detector.py +1 -5
- sglang/srt/function_call/gpt_oss_detector.py +1 -1
- sglang/srt/function_call/json_array_parser.py +0 -2
- sglang/srt/function_call/utils.py +2 -2
- sglang/srt/grpc/compile_proto.py +3 -3
- sglang/srt/{entrypoints → grpc}/grpc_request_manager.py +112 -52
- sglang/srt/grpc/health_servicer.py +189 -0
- sglang/srt/grpc/scheduler_launcher.py +181 -0
- sglang/srt/grpc/sglang_scheduler_pb2.py +78 -70
- sglang/srt/grpc/sglang_scheduler_pb2.pyi +66 -10
- sglang/srt/grpc/sglang_scheduler_pb2_grpc.py +89 -1
- sglang/srt/layers/activation.py +4 -1
- sglang/srt/layers/attention/aiter_backend.py +3 -3
- sglang/srt/layers/attention/ascend_backend.py +17 -1
- sglang/srt/layers/attention/attention_registry.py +43 -23
- sglang/srt/layers/attention/base_attn_backend.py +20 -1
- sglang/srt/layers/attention/double_sparsity_backend.py +2 -2
- sglang/srt/layers/attention/fla/chunk.py +0 -1
- sglang/srt/layers/attention/fla/chunk_o.py +1 -1
- sglang/srt/layers/attention/fla/index.py +0 -2
- sglang/srt/layers/attention/fla/layernorm_gated.py +50 -32
- sglang/srt/layers/attention/fla/utils.py +0 -3
- sglang/srt/layers/attention/fla/wy_fast.py +0 -2
- sglang/srt/layers/attention/flashattention_backend.py +12 -8
- sglang/srt/layers/attention/flashinfer_backend.py +248 -21
- sglang/srt/layers/attention/flashinfer_mla_backend.py +20 -18
- sglang/srt/layers/attention/flashmla_backend.py +2 -2
- sglang/srt/layers/attention/hybrid_attn_backend.py +1 -1
- sglang/srt/layers/attention/hybrid_linear_attn_backend.py +165 -62
- sglang/srt/layers/attention/intel_amx_backend.py +1 -1
- sglang/srt/layers/attention/mamba/causal_conv1d.py +1 -1
- sglang/srt/layers/attention/mamba/causal_conv1d_triton.py +9 -5
- sglang/srt/layers/attention/mamba/mamba.py +189 -241
- sglang/srt/layers/attention/mamba/mamba2_metadata.py +211 -0
- sglang/srt/layers/attention/mamba/mixer2_rms_norm_gated.py +120 -0
- sglang/srt/layers/attention/mamba/ops/ssd_bmm.py +0 -50
- sglang/srt/layers/attention/mamba/ops/ssd_chunk_scan.py +0 -60
- sglang/srt/layers/attention/mamba/ops/ssd_chunk_state.py +0 -111
- sglang/srt/layers/attention/mamba/ops/ssd_combined.py +0 -1
- sglang/srt/layers/attention/mamba/ops/ssd_state_passing.py +0 -11
- sglang/srt/layers/attention/npu_ops/mla_preprocess.py +1 -1
- sglang/srt/layers/attention/nsa/nsa_indexer.py +40 -83
- sglang/srt/layers/attention/nsa/triton_kernel.py +136 -0
- sglang/srt/layers/attention/nsa/utils.py +0 -1
- sglang/srt/layers/attention/nsa_backend.py +404 -90
- sglang/srt/layers/attention/triton_backend.py +208 -34
- sglang/srt/layers/attention/triton_ops/double_sparsity_attention.py +2 -2
- sglang/srt/layers/attention/triton_ops/extend_attention.py +539 -44
- sglang/srt/layers/attention/trtllm_mha_backend.py +2 -2
- sglang/srt/layers/attention/trtllm_mla_backend.py +361 -30
- sglang/srt/layers/attention/utils.py +11 -7
- sglang/srt/layers/attention/vision.py +3 -3
- sglang/srt/layers/attention/xpu_backend.py +1028 -0
- sglang/srt/layers/communicator.py +11 -7
- sglang/srt/layers/{quantization/deep_gemm_wrapper → deep_gemm_wrapper}/compile_utils.py +4 -8
- sglang/srt/layers/{quantization/deep_gemm_wrapper → deep_gemm_wrapper}/configurer.py +4 -3
- sglang/srt/layers/{quantization/deep_gemm_wrapper → deep_gemm_wrapper}/entrypoint.py +3 -3
- sglang/srt/layers/dp_attention.py +17 -0
- sglang/srt/layers/layernorm.py +45 -15
- sglang/srt/layers/linear.py +9 -1
- sglang/srt/layers/logits_processor.py +147 -17
- sglang/srt/layers/modelopt_utils.py +11 -0
- sglang/srt/layers/moe/cutlass_moe.py +0 -2
- sglang/srt/layers/moe/cutlass_w4a8_moe.py +213 -21
- sglang/srt/layers/moe/ep_moe/kernels.py +35 -457
- sglang/srt/layers/moe/ep_moe/layer.py +119 -397
- sglang/srt/layers/moe/flashinfer_cutedsl_moe.py +1 -1
- sglang/srt/layers/moe/fused_moe_triton/configs/triton_3_4_0/E=128,N=192,device_name=NVIDIA_H200,dtype=fp8_w8a8.json +146 -0
- sglang/srt/layers/moe/fused_moe_triton/configs/triton_3_4_0/E=256,N=256,device_name=NVIDIA_B200.json +146 -0
- sglang/srt/layers/moe/fused_moe_triton/fused_moe_triton_config.py +11 -3
- sglang/srt/layers/moe/fused_moe_triton/layer.py +76 -70
- sglang/srt/layers/moe/fused_moe_triton/triton_kernels_moe.py +18 -42
- sglang/srt/layers/moe/moe_runner/deep_gemm.py +304 -0
- sglang/srt/layers/moe/moe_runner/runner.py +3 -0
- sglang/srt/layers/moe/moe_runner/triton.py +3 -1
- sglang/srt/layers/moe/rocm_moe_utils.py +0 -1
- sglang/srt/layers/moe/router.py +51 -15
- sglang/srt/layers/moe/token_dispatcher/__init__.py +10 -0
- sglang/srt/layers/moe/token_dispatcher/base.py +1 -1
- sglang/srt/layers/moe/token_dispatcher/deepep.py +110 -97
- sglang/srt/layers/moe/token_dispatcher/mooncake.py +386 -0
- sglang/srt/layers/moe/token_dispatcher/standard.py +46 -0
- sglang/srt/layers/moe/topk.py +3 -2
- sglang/srt/layers/moe/utils.py +17 -1
- sglang/srt/layers/quantization/__init__.py +2 -53
- sglang/srt/layers/quantization/awq.py +183 -6
- sglang/srt/layers/quantization/awq_triton.py +29 -0
- sglang/srt/layers/quantization/base_config.py +20 -1
- sglang/srt/layers/quantization/compressed_tensors/__init__.py +7 -0
- sglang/srt/layers/quantization/compressed_tensors/compressed_tensors.py +20 -49
- sglang/srt/layers/quantization/compressed_tensors/compressed_tensors_moe.py +421 -70
- sglang/srt/layers/quantization/compressed_tensors/schemes/__init__.py +3 -0
- sglang/srt/layers/quantization/compressed_tensors/schemes/compressed_tensors_w8a16_fp8.py +4 -22
- sglang/srt/layers/quantization/compressed_tensors/schemes/compressed_tensors_wNa16.py +339 -0
- sglang/srt/layers/quantization/fp8.py +84 -18
- sglang/srt/layers/quantization/fp8_kernel.py +55 -10
- sglang/srt/layers/quantization/fp8_utils.py +42 -14
- sglang/srt/layers/quantization/fpgemm_fp8.py +2 -3
- sglang/srt/layers/quantization/gptq.py +0 -1
- sglang/srt/layers/quantization/int8_kernel.py +18 -2
- sglang/srt/layers/quantization/marlin_utils.py +12 -0
- sglang/srt/layers/quantization/modelopt_quant.py +125 -100
- sglang/srt/layers/quantization/mxfp4.py +5 -30
- sglang/srt/layers/quantization/petit.py +1 -1
- sglang/srt/layers/quantization/quark/quark.py +3 -1
- sglang/srt/layers/quantization/quark/quark_moe.py +3 -3
- sglang/srt/layers/quantization/quark/schemes/quark_w4a4_mxfp4.py +0 -7
- sglang/srt/layers/quantization/unquant.py +1 -4
- sglang/srt/layers/quantization/utils.py +0 -1
- sglang/srt/layers/quantization/w4afp8.py +51 -20
- sglang/srt/layers/quantization/w8a8_int8.py +30 -24
- sglang/srt/layers/radix_attention.py +59 -9
- sglang/srt/layers/rotary_embedding.py +673 -16
- sglang/srt/layers/sampler.py +36 -16
- sglang/srt/layers/sparse_pooler.py +98 -0
- sglang/srt/layers/utils.py +0 -1
- sglang/srt/layers/vocab_parallel_embedding.py +4 -1
- sglang/srt/lora/backend/triton_backend.py +0 -1
- sglang/srt/lora/eviction_policy.py +139 -0
- sglang/srt/lora/lora_manager.py +24 -9
- sglang/srt/lora/lora_registry.py +1 -1
- sglang/srt/lora/mem_pool.py +40 -16
- sglang/srt/lora/triton_ops/chunked_sgmv_expand.py +1 -1
- sglang/srt/lora/triton_ops/chunked_sgmv_shrink.py +4 -2
- sglang/srt/managers/cache_controller.py +48 -17
- sglang/srt/managers/data_parallel_controller.py +146 -42
- sglang/srt/managers/detokenizer_manager.py +40 -13
- sglang/srt/managers/io_struct.py +66 -16
- sglang/srt/managers/mm_utils.py +20 -18
- sglang/srt/managers/multi_tokenizer_mixin.py +66 -81
- sglang/srt/managers/overlap_utils.py +96 -19
- sglang/srt/managers/schedule_batch.py +241 -511
- sglang/srt/managers/schedule_policy.py +15 -2
- sglang/srt/managers/scheduler.py +399 -499
- sglang/srt/managers/scheduler_metrics_mixin.py +55 -8
- sglang/srt/managers/scheduler_output_processor_mixin.py +317 -111
- sglang/srt/managers/scheduler_pp_mixin.py +341 -0
- sglang/srt/managers/scheduler_profiler_mixin.py +57 -10
- sglang/srt/managers/scheduler_runtime_checker_mixin.py +217 -0
- sglang/srt/managers/scheduler_update_weights_mixin.py +33 -14
- sglang/srt/managers/tokenizer_communicator_mixin.py +71 -55
- sglang/srt/managers/tokenizer_manager.py +378 -90
- sglang/srt/managers/tp_worker.py +212 -161
- sglang/srt/managers/utils.py +78 -2
- sglang/srt/mem_cache/allocator.py +7 -2
- sglang/srt/mem_cache/allocator_ascend.py +2 -2
- sglang/srt/mem_cache/base_prefix_cache.py +2 -2
- sglang/srt/mem_cache/chunk_cache.py +13 -2
- sglang/srt/mem_cache/common.py +480 -0
- sglang/srt/mem_cache/evict_policy.py +16 -1
- sglang/srt/mem_cache/hicache_storage.py +4 -1
- sglang/srt/mem_cache/hiradix_cache.py +16 -3
- sglang/srt/mem_cache/mamba_radix_cache.py +993 -0
- sglang/srt/mem_cache/memory_pool.py +435 -219
- sglang/srt/mem_cache/memory_pool_host.py +0 -1
- sglang/srt/mem_cache/multimodal_cache.py +0 -1
- sglang/srt/mem_cache/radix_cache.py +53 -19
- sglang/srt/mem_cache/radix_cache_cpp.py +19 -14
- sglang/srt/mem_cache/storage/aibrix_kvcache/aibrix_kvcache_storage.py +8 -2
- sglang/srt/mem_cache/storage/aibrix_kvcache/unit_test.py +1 -13
- sglang/srt/mem_cache/storage/backend_factory.py +2 -2
- sglang/srt/mem_cache/storage/eic/eic_storage.py +5 -6
- sglang/srt/mem_cache/storage/hf3fs/hf3fs_client.py +0 -1
- sglang/srt/mem_cache/storage/hf3fs/storage_hf3fs.py +9 -3
- sglang/srt/mem_cache/storage/lmcache/lmc_radix_cache.py +5 -3
- sglang/srt/mem_cache/storage/mooncake_store/mooncake_store.py +101 -17
- sglang/srt/mem_cache/storage/nixl/hicache_nixl.py +38 -9
- sglang/srt/mem_cache/storage/nixl/nixl_utils.py +1 -1
- sglang/srt/mem_cache/storage/nixl/test_hicache_nixl_storage.py +17 -2
- sglang/srt/mem_cache/swa_radix_cache.py +92 -26
- sglang/srt/metrics/collector.py +31 -0
- sglang/srt/metrics/func_timer.py +1 -1
- sglang/srt/model_executor/cuda_graph_runner.py +43 -5
- sglang/srt/model_executor/forward_batch_info.py +28 -23
- sglang/srt/model_executor/model_runner.py +379 -139
- sglang/srt/model_executor/npu_graph_runner.py +2 -3
- sglang/srt/model_executor/piecewise_cuda_graph_runner.py +539 -0
- sglang/srt/model_loader/__init__.py +1 -1
- sglang/srt/model_loader/loader.py +424 -27
- sglang/srt/model_loader/utils.py +0 -1
- sglang/srt/model_loader/weight_utils.py +47 -28
- sglang/srt/models/apertus.py +2 -3
- sglang/srt/models/arcee.py +2 -2
- sglang/srt/models/bailing_moe.py +13 -52
- sglang/srt/models/bailing_moe_nextn.py +3 -4
- sglang/srt/models/bert.py +1 -1
- sglang/srt/models/deepseek_nextn.py +19 -3
- sglang/srt/models/deepseek_ocr.py +1516 -0
- sglang/srt/models/deepseek_v2.py +273 -98
- sglang/srt/models/dots_ocr.py +0 -2
- sglang/srt/models/dots_vlm.py +0 -1
- sglang/srt/models/dots_vlm_vit.py +1 -1
- sglang/srt/models/falcon_h1.py +13 -19
- sglang/srt/models/gemma3_mm.py +16 -0
- sglang/srt/models/gemma3n_mm.py +1 -2
- sglang/srt/models/glm4_moe.py +14 -37
- sglang/srt/models/glm4_moe_nextn.py +2 -2
- sglang/srt/models/glm4v.py +2 -1
- sglang/srt/models/glm4v_moe.py +5 -5
- sglang/srt/models/gpt_oss.py +5 -5
- sglang/srt/models/grok.py +10 -23
- sglang/srt/models/hunyuan.py +2 -7
- sglang/srt/models/interns1.py +0 -1
- sglang/srt/models/kimi_vl.py +1 -7
- sglang/srt/models/kimi_vl_moonvit.py +3 -1
- sglang/srt/models/llama.py +2 -2
- sglang/srt/models/llama_eagle3.py +1 -1
- sglang/srt/models/longcat_flash.py +5 -22
- sglang/srt/models/longcat_flash_nextn.py +3 -14
- sglang/srt/models/mimo.py +2 -13
- sglang/srt/models/mimo_mtp.py +1 -2
- sglang/srt/models/minicpmo.py +7 -5
- sglang/srt/models/mixtral.py +1 -4
- sglang/srt/models/mllama.py +1 -1
- sglang/srt/models/mllama4.py +13 -3
- sglang/srt/models/nemotron_h.py +511 -0
- sglang/srt/models/olmo2.py +31 -4
- sglang/srt/models/opt.py +5 -5
- sglang/srt/models/phi.py +1 -1
- sglang/srt/models/phi4mm.py +1 -1
- sglang/srt/models/phimoe.py +0 -1
- sglang/srt/models/pixtral.py +0 -3
- sglang/srt/models/points_v15_chat.py +186 -0
- sglang/srt/models/qwen.py +0 -1
- sglang/srt/models/qwen2_5_vl.py +3 -3
- sglang/srt/models/qwen2_audio.py +2 -15
- sglang/srt/models/qwen2_moe.py +15 -12
- sglang/srt/models/qwen2_vl.py +5 -2
- sglang/srt/models/qwen3_moe.py +19 -35
- sglang/srt/models/qwen3_next.py +7 -12
- sglang/srt/models/qwen3_next_mtp.py +3 -4
- sglang/srt/models/qwen3_omni_moe.py +661 -0
- sglang/srt/models/qwen3_vl.py +37 -33
- sglang/srt/models/qwen3_vl_moe.py +57 -185
- sglang/srt/models/roberta.py +55 -3
- sglang/srt/models/sarashina2_vision.py +0 -1
- sglang/srt/models/step3_vl.py +3 -5
- sglang/srt/models/utils.py +11 -1
- sglang/srt/multimodal/processors/base_processor.py +6 -2
- sglang/srt/multimodal/processors/deepseek_ocr.py +37 -0
- sglang/srt/multimodal/processors/deepseek_vl_v2.py +0 -3
- sglang/srt/multimodal/processors/dots_vlm.py +0 -1
- sglang/srt/multimodal/processors/glm4v.py +1 -5
- sglang/srt/multimodal/processors/internvl.py +0 -2
- sglang/srt/multimodal/processors/janus_pro.py +0 -1
- sglang/srt/multimodal/processors/mllama4.py +0 -8
- sglang/srt/multimodal/processors/phi4mm.py +0 -1
- sglang/srt/multimodal/processors/points_v15_chat.py +52 -0
- sglang/srt/multimodal/processors/qwen_vl.py +75 -16
- sglang/srt/multimodal/processors/step3_vl.py +1 -1
- sglang/srt/parser/conversation.py +41 -0
- sglang/srt/parser/reasoning_parser.py +0 -1
- sglang/srt/sampling/custom_logit_processor.py +77 -2
- sglang/srt/sampling/sampling_batch_info.py +17 -22
- sglang/srt/sampling/sampling_params.py +70 -2
- sglang/srt/server_args.py +577 -73
- sglang/srt/server_args_config_parser.py +1 -1
- sglang/srt/single_batch_overlap.py +38 -28
- sglang/srt/speculative/base_spec_worker.py +34 -0
- sglang/srt/speculative/draft_utils.py +226 -0
- sglang/srt/speculative/eagle_draft_cuda_graph_runner.py +24 -7
- sglang/srt/speculative/eagle_draft_extend_cuda_graph_runner.py +23 -2
- sglang/srt/speculative/eagle_info.py +57 -18
- sglang/srt/speculative/eagle_info_v2.py +458 -0
- sglang/srt/speculative/eagle_utils.py +138 -0
- sglang/srt/speculative/eagle_worker.py +83 -280
- sglang/srt/speculative/eagle_worker_v2.py +702 -0
- sglang/srt/speculative/{ngram_utils.py → ngram_info.py} +14 -9
- sglang/srt/speculative/ngram_worker.py +12 -11
- sglang/srt/speculative/spec_info.py +2 -0
- sglang/srt/speculative/spec_utils.py +38 -3
- sglang/srt/speculative/standalone_worker.py +4 -14
- sglang/srt/tokenizer/tiktoken_tokenizer.py +2 -2
- sglang/srt/two_batch_overlap.py +28 -14
- sglang/srt/utils/__init__.py +1 -1
- sglang/srt/{bench_utils.py → utils/bench_utils.py} +4 -2
- sglang/srt/utils/common.py +192 -47
- sglang/srt/utils/hf_transformers_utils.py +40 -17
- sglang/srt/{host_shared_memory.py → utils/host_shared_memory.py} +0 -1
- sglang/srt/{offloader.py → utils/offloader.py} +4 -4
- sglang/srt/utils/profile_merger.py +199 -0
- sglang/test/attention/test_flashattn_backend.py +1 -1
- sglang/test/attention/test_flashattn_mla_backend.py +0 -1
- sglang/test/attention/test_prefix_chunk_info.py +0 -2
- sglang/test/attention/test_trtllm_mla_backend.py +221 -53
- sglang/test/few_shot_gsm8k_engine.py +2 -4
- sglang/test/kit_matched_stop.py +157 -0
- sglang/test/longbench_v2/__init__.py +1 -0
- sglang/test/longbench_v2/test_longbench_v2_eval.py +238 -0
- sglang/test/longbench_v2/validate_longbench_v2.py +337 -0
- sglang/test/longbench_v2/validate_longbench_v2_standalone.py +306 -0
- sglang/test/run_eval.py +41 -0
- sglang/test/runners.py +2 -0
- sglang/test/send_one.py +42 -7
- sglang/test/simple_eval_common.py +3 -0
- sglang/test/simple_eval_gpqa.py +0 -1
- sglang/test/simple_eval_humaneval.py +0 -3
- sglang/test/simple_eval_longbench_v2.py +344 -0
- sglang/test/test_block_fp8.py +1 -2
- sglang/test/test_block_fp8_deep_gemm_blackwell.py +0 -1
- sglang/test/test_cutlass_moe.py +1 -2
- sglang/test/test_cutlass_w4a8_moe.py +10 -20
- sglang/test/test_deterministic.py +232 -99
- sglang/test/test_deterministic_utils.py +73 -0
- sglang/test/test_disaggregation_utils.py +81 -0
- sglang/test/test_marlin_moe.py +0 -1
- sglang/test/test_utils.py +85 -20
- sglang/version.py +1 -1
- {sglang-0.5.3rc2.dist-info → sglang-0.5.4.dist-info}/METADATA +45 -33
- {sglang-0.5.3rc2.dist-info → sglang-0.5.4.dist-info}/RECORD +404 -345
- sglang/srt/layers/attention/mamba/mamba_utils.py +0 -81
- sglang/srt/managers/tp_worker_overlap_thread.py +0 -311
- sglang/srt/speculative/build_eagle_tree.py +0 -427
- sglang/test/test_block_fp8_ep.py +0 -358
- /sglang/srt/layers/{quantization/deep_gemm_wrapper → deep_gemm_wrapper}/__init__.py +0 -0
- /sglang/srt/{aio_rwlock.py → utils/aio_rwlock.py} +0 -0
- /sglang/srt/{torch_memory_saver_adapter.py → utils/torch_memory_saver_adapter.py} +0 -0
- {sglang-0.5.3rc2.dist-info → sglang-0.5.4.dist-info}/WHEEL +0 -0
- {sglang-0.5.3rc2.dist-info → sglang-0.5.4.dist-info}/licenses/LICENSE +0 -0
- {sglang-0.5.3rc2.dist-info → sglang-0.5.4.dist-info}/top_level.txt +0 -0
sglang/srt/server_args.py
CHANGED
|
@@ -13,6 +13,8 @@
|
|
|
13
13
|
# ==============================================================================
|
|
14
14
|
"""The arguments of the server."""
|
|
15
15
|
|
|
16
|
+
from __future__ import annotations
|
|
17
|
+
|
|
16
18
|
import argparse
|
|
17
19
|
import dataclasses
|
|
18
20
|
import json
|
|
@@ -20,7 +22,9 @@ import logging
|
|
|
20
22
|
import os
|
|
21
23
|
import random
|
|
22
24
|
import tempfile
|
|
23
|
-
from typing import List, Literal, Optional, Union
|
|
25
|
+
from typing import Dict, List, Literal, Optional, Union
|
|
26
|
+
|
|
27
|
+
import orjson
|
|
24
28
|
|
|
25
29
|
from sglang.srt.connector import ConnectorType
|
|
26
30
|
from sglang.srt.function_call.function_call_parser import FunctionCallParser
|
|
@@ -32,6 +36,7 @@ from sglang.srt.utils import (
|
|
|
32
36
|
configure_ipv6,
|
|
33
37
|
get_device,
|
|
34
38
|
get_device_memory_capacity,
|
|
39
|
+
get_device_sm,
|
|
35
40
|
is_cuda,
|
|
36
41
|
is_flashinfer_available,
|
|
37
42
|
is_hip,
|
|
@@ -40,6 +45,7 @@ from sglang.srt.utils import (
|
|
|
40
45
|
is_remote_url,
|
|
41
46
|
is_sm90_supported,
|
|
42
47
|
is_sm100_supported,
|
|
48
|
+
is_sm120_supported,
|
|
43
49
|
is_triton_kernels_available,
|
|
44
50
|
is_valid_ipv6_address,
|
|
45
51
|
json_list_type,
|
|
@@ -51,6 +57,7 @@ from sglang.utils import is_in_ci
|
|
|
51
57
|
|
|
52
58
|
logger = logging.getLogger(__name__)
|
|
53
59
|
|
|
60
|
+
|
|
54
61
|
# Define constants
|
|
55
62
|
LOAD_FORMAT_CHOICES = [
|
|
56
63
|
"auto",
|
|
@@ -76,6 +83,7 @@ QUANTIZATION_CHOICES = [
|
|
|
76
83
|
"bitsandbytes",
|
|
77
84
|
"gguf",
|
|
78
85
|
"modelopt",
|
|
86
|
+
"modelopt_fp8",
|
|
79
87
|
"modelopt_fp4",
|
|
80
88
|
"petit_nvfp4",
|
|
81
89
|
"w8a8_int8",
|
|
@@ -84,6 +92,7 @@ QUANTIZATION_CHOICES = [
|
|
|
84
92
|
"qoq",
|
|
85
93
|
"w4afp8",
|
|
86
94
|
"mxfp4",
|
|
95
|
+
"compressed-tensors", # for Ktransformers
|
|
87
96
|
]
|
|
88
97
|
|
|
89
98
|
ATTENTION_BACKEND_CHOICES = [
|
|
@@ -107,6 +116,7 @@ ATTENTION_BACKEND_CHOICES = [
|
|
|
107
116
|
# Other platforms
|
|
108
117
|
"intel_amx",
|
|
109
118
|
"ascend",
|
|
119
|
+
"intel_xpu",
|
|
110
120
|
]
|
|
111
121
|
|
|
112
122
|
LORA_BACKEND_CHOICES = ["triton", "csgmv"]
|
|
@@ -117,10 +127,24 @@ GRAMMAR_BACKEND_CHOICES = ["xgrammar", "outlines", "llguidance", "none"]
|
|
|
117
127
|
|
|
118
128
|
DETERMINISTIC_ATTENTION_BACKEND_CHOICES = ["flashinfer", "fa3", "triton"]
|
|
119
129
|
|
|
120
|
-
|
|
130
|
+
DEFAULT_LORA_EVICTION_POLICY = "lru"
|
|
131
|
+
|
|
132
|
+
NSA_CHOICES = ["flashmla_sparse", "flashmla_kv", "fa3", "tilelang", "aiter"]
|
|
121
133
|
|
|
122
134
|
RADIX_EVICTION_POLICY_CHOICES = ["lru", "lfu"]
|
|
123
135
|
|
|
136
|
+
MOE_RUNNER_BACKEND_CHOICES = [
|
|
137
|
+
"auto",
|
|
138
|
+
"deep_gemm",
|
|
139
|
+
"triton",
|
|
140
|
+
"triton_kernel",
|
|
141
|
+
"flashinfer_trtllm",
|
|
142
|
+
"flashinfer_cutlass",
|
|
143
|
+
"flashinfer_mxfp4",
|
|
144
|
+
"flashinfer_cutedsl",
|
|
145
|
+
"cutlass",
|
|
146
|
+
]
|
|
147
|
+
|
|
124
148
|
|
|
125
149
|
# Allow external code to add more choices
|
|
126
150
|
def add_load_format_choices(choices):
|
|
@@ -143,6 +167,10 @@ def add_grammar_backend_choices(choices):
|
|
|
143
167
|
GRAMMAR_BACKEND_CHOICES.extend(choices)
|
|
144
168
|
|
|
145
169
|
|
|
170
|
+
def add_moe_runner_backend_choices(choices):
|
|
171
|
+
MOE_RUNNER_BACKEND_CHOICES.extend(choices)
|
|
172
|
+
|
|
173
|
+
|
|
146
174
|
def add_deterministic_attention_backend_choices(choices):
|
|
147
175
|
DETERMINISTIC_ATTENTION_BACKEND_CHOICES.extend(choices)
|
|
148
176
|
|
|
@@ -162,6 +190,11 @@ class ServerArgs:
|
|
|
162
190
|
load_format: str = "auto"
|
|
163
191
|
model_loader_extra_config: str = "{}"
|
|
164
192
|
trust_remote_code: bool = False
|
|
193
|
+
modelopt_quant: Optional[Union[str, Dict]] = None
|
|
194
|
+
modelopt_checkpoint_restore_path: Optional[str] = None
|
|
195
|
+
modelopt_checkpoint_save_path: Optional[str] = None
|
|
196
|
+
modelopt_export_path: Optional[str] = None
|
|
197
|
+
quantize_and_serve: bool = False
|
|
165
198
|
context_length: Optional[int] = None
|
|
166
199
|
is_embedding: bool = False
|
|
167
200
|
enable_multimodal: Optional[bool] = None
|
|
@@ -171,9 +204,11 @@ class ServerArgs:
|
|
|
171
204
|
# HTTP server
|
|
172
205
|
host: str = "127.0.0.1"
|
|
173
206
|
port: int = 30000
|
|
207
|
+
grpc_mode: bool = False
|
|
174
208
|
skip_server_warmup: bool = False
|
|
175
209
|
warmups: Optional[str] = None
|
|
176
210
|
nccl_port: Optional[int] = None
|
|
211
|
+
checkpoint_engine_wait_weights_before_ready: bool = False
|
|
177
212
|
|
|
178
213
|
# Quantization and data type
|
|
179
214
|
dtype: str = "auto"
|
|
@@ -191,6 +226,7 @@ class ServerArgs:
|
|
|
191
226
|
max_prefill_tokens: int = 16384
|
|
192
227
|
schedule_policy: str = "fcfs"
|
|
193
228
|
enable_priority_scheduling: bool = False
|
|
229
|
+
abort_on_priority_when_disabled: bool = False
|
|
194
230
|
schedule_low_priority_values_first: bool = False
|
|
195
231
|
priority_scheduling_preemption_threshold: int = 10
|
|
196
232
|
schedule_conservativeness: float = 1.0
|
|
@@ -202,13 +238,16 @@ class ServerArgs:
|
|
|
202
238
|
|
|
203
239
|
# Runtime options
|
|
204
240
|
device: Optional[str] = None
|
|
241
|
+
elastic_ep_backend: Literal[None, "mooncake"] = None
|
|
242
|
+
mooncake_ib_device: Optional[str] = None
|
|
205
243
|
tp_size: int = 1
|
|
206
244
|
pp_size: int = 1
|
|
207
|
-
|
|
245
|
+
pp_max_micro_batch_size: Optional[int] = None
|
|
208
246
|
stream_interval: int = 1
|
|
209
247
|
stream_output: bool = False
|
|
210
248
|
random_seed: Optional[int] = None
|
|
211
249
|
constrained_json_whitespace_pattern: Optional[str] = None
|
|
250
|
+
constrained_json_disable_any_whitespace: bool = False
|
|
212
251
|
watchdog_timeout: float = 300
|
|
213
252
|
dist_timeout: Optional[int] = None # timeout for torch.distributed
|
|
214
253
|
download_dir: Optional[str] = None
|
|
@@ -251,6 +290,7 @@ class ServerArgs:
|
|
|
251
290
|
reasoning_parser: Optional[str] = None
|
|
252
291
|
tool_call_parser: Optional[str] = None
|
|
253
292
|
tool_server: Optional[str] = None
|
|
293
|
+
sampling_defaults: str = "model"
|
|
254
294
|
|
|
255
295
|
# Data parallelism
|
|
256
296
|
dp_size: int = 1
|
|
@@ -277,6 +317,7 @@ class ServerArgs:
|
|
|
277
317
|
] = None
|
|
278
318
|
max_loaded_loras: Optional[int] = None
|
|
279
319
|
max_loras_per_batch: int = 8
|
|
320
|
+
lora_eviction_policy: str = DEFAULT_LORA_EVICTION_POLICY
|
|
280
321
|
lora_backend: str = "triton"
|
|
281
322
|
max_lora_chunk_size: Optional[int] = 16
|
|
282
323
|
|
|
@@ -287,13 +328,15 @@ class ServerArgs:
|
|
|
287
328
|
sampling_backend: Optional[str] = None
|
|
288
329
|
grammar_backend: Optional[str] = None
|
|
289
330
|
mm_attention_backend: Optional[str] = None
|
|
290
|
-
|
|
291
|
-
|
|
331
|
+
nsa_prefill_backend: str = "flashmla_sparse"
|
|
332
|
+
nsa_decode_backend: str = "fa3"
|
|
292
333
|
|
|
293
334
|
# Speculative decoding
|
|
335
|
+
enable_beta_spec: bool = False
|
|
294
336
|
speculative_algorithm: Optional[str] = None
|
|
295
337
|
speculative_draft_model_path: Optional[str] = None
|
|
296
338
|
speculative_draft_model_revision: Optional[str] = None
|
|
339
|
+
speculative_draft_load_format: Optional[str] = None
|
|
297
340
|
speculative_num_steps: Optional[int] = None
|
|
298
341
|
speculative_eagle_topk: Optional[int] = None
|
|
299
342
|
speculative_num_draft_tokens: Optional[int] = None
|
|
@@ -312,15 +355,8 @@ class ServerArgs:
|
|
|
312
355
|
|
|
313
356
|
# Expert parallelism
|
|
314
357
|
ep_size: int = 1
|
|
315
|
-
moe_a2a_backend: Literal["none", "deepep"] = "none"
|
|
316
|
-
moe_runner_backend:
|
|
317
|
-
"auto",
|
|
318
|
-
"triton",
|
|
319
|
-
"triton_kernel",
|
|
320
|
-
"flashinfer_trtllm",
|
|
321
|
-
"flashinfer_cutlass",
|
|
322
|
-
"flashinfer_mxfp4",
|
|
323
|
-
] = "auto"
|
|
358
|
+
moe_a2a_backend: Literal["none", "deepep", "mooncake"] = "none"
|
|
359
|
+
moe_runner_backend: str = "auto"
|
|
324
360
|
flashinfer_mxfp4_moe_precision: Literal["default", "bf16"] = "default"
|
|
325
361
|
enable_flashinfer_allreduce_fusion: bool = False
|
|
326
362
|
deepep_mode: Literal["auto", "normal", "low_latency"] = "auto"
|
|
@@ -343,6 +379,7 @@ class ServerArgs:
|
|
|
343
379
|
# Mamba cache
|
|
344
380
|
max_mamba_cache_size: Optional[int] = None
|
|
345
381
|
mamba_ssm_dtype: str = "float32"
|
|
382
|
+
mamba_full_memory_ratio: float = 0.9
|
|
346
383
|
|
|
347
384
|
# Hierarchical cache
|
|
348
385
|
enable_hierarchical_cache: bool = False
|
|
@@ -357,6 +394,13 @@ class ServerArgs:
|
|
|
357
394
|
# LMCache
|
|
358
395
|
enable_lmcache: bool = False
|
|
359
396
|
|
|
397
|
+
# Ktransformers
|
|
398
|
+
kt_amx_weight_path: Optional[str] = None
|
|
399
|
+
kt_amx_method: Optional[str] = None
|
|
400
|
+
kt_cpuinfer: Optional[int] = None
|
|
401
|
+
kt_threadpool_count: Optional[int] = None
|
|
402
|
+
kt_num_gpu_experts: Optional[int] = None
|
|
403
|
+
|
|
360
404
|
# Double Sparsity
|
|
361
405
|
enable_double_sparsity: bool = False
|
|
362
406
|
ds_channel_config_path: Optional[str] = None
|
|
@@ -372,6 +416,12 @@ class ServerArgs:
|
|
|
372
416
|
offload_prefetch_step: int = 1
|
|
373
417
|
offload_mode: str = "cpu"
|
|
374
418
|
|
|
419
|
+
# Scoring configuration
|
|
420
|
+
# Delimiter token ID used to combine Query and Items into a single sequence for multi-item scoring.
|
|
421
|
+
# Format: Query<delimiter>Item1<delimiter>Item2<delimiter>...
|
|
422
|
+
# This enables efficient batch processing of multiple items against a single query.
|
|
423
|
+
multi_item_scoring_delimiter: Optional[Union[int]] = None
|
|
424
|
+
|
|
375
425
|
# Optimization/debug options
|
|
376
426
|
disable_radix_cache: bool = False
|
|
377
427
|
cuda_graph_max_bs: Optional[int] = None
|
|
@@ -384,6 +434,7 @@ class ServerArgs:
|
|
|
384
434
|
enable_symm_mem: bool = False
|
|
385
435
|
disable_flashinfer_cutlass_moe_fp4_allgather: bool = False
|
|
386
436
|
enable_tokenizer_batch_encode: bool = False
|
|
437
|
+
disable_tokenizer_batch_decode: bool = False
|
|
387
438
|
disable_outlines_disk_cache: bool = False
|
|
388
439
|
disable_custom_all_reduce: bool = False
|
|
389
440
|
enable_mscclpp: bool = False
|
|
@@ -396,7 +447,11 @@ class ServerArgs:
|
|
|
396
447
|
enable_single_batch_overlap: bool = False
|
|
397
448
|
tbo_token_distribution_threshold: float = 0.48
|
|
398
449
|
enable_torch_compile: bool = False
|
|
450
|
+
enable_piecewise_cuda_graph: bool = False
|
|
399
451
|
torch_compile_max_bs: int = 32
|
|
452
|
+
piecewise_cuda_graph_max_tokens: int = 4096
|
|
453
|
+
piecewise_cuda_graph_tokens: Optional[List[int]] = None
|
|
454
|
+
piecewise_cuda_graph_compiler: str = "eager"
|
|
400
455
|
torchao_config: str = ""
|
|
401
456
|
enable_nan_detection: bool = False
|
|
402
457
|
enable_p2p_check: bool = False
|
|
@@ -428,7 +483,6 @@ class ServerArgs:
|
|
|
428
483
|
debug_tensor_dump_output_folder: Optional[str] = None
|
|
429
484
|
debug_tensor_dump_input_file: Optional[str] = None
|
|
430
485
|
debug_tensor_dump_inject: bool = False
|
|
431
|
-
debug_tensor_dump_prefill_only: bool = False
|
|
432
486
|
|
|
433
487
|
# PD disaggregation: can be "null" (not disaggregated), "prefill" (prefill-only), or "decode" (decode-only)
|
|
434
488
|
disaggregation_mode: Literal["null", "prefill", "decode"] = "null"
|
|
@@ -452,12 +506,31 @@ class ServerArgs:
|
|
|
452
506
|
|
|
453
507
|
# For PD-Multiplexing
|
|
454
508
|
enable_pdmux: bool = False
|
|
455
|
-
|
|
509
|
+
pdmux_config_path: Optional[str] = None
|
|
510
|
+
sm_group_num: int = 8
|
|
511
|
+
|
|
512
|
+
def get_attention_backends(server_args):
|
|
513
|
+
prefill_attention_backend_str = (
|
|
514
|
+
server_args.prefill_attention_backend
|
|
515
|
+
if server_args.prefill_attention_backend
|
|
516
|
+
else server_args.attention_backend
|
|
517
|
+
)
|
|
518
|
+
decode_attention_backend_str = (
|
|
519
|
+
server_args.decode_attention_backend
|
|
520
|
+
if server_args.decode_attention_backend
|
|
521
|
+
else server_args.attention_backend
|
|
522
|
+
)
|
|
523
|
+
return prefill_attention_backend_str, decode_attention_backend_str
|
|
456
524
|
|
|
457
525
|
def __post_init__(self):
|
|
458
526
|
"""
|
|
459
527
|
Orchestrates the handling of various server arguments, ensuring proper configuration and validation.
|
|
460
528
|
"""
|
|
529
|
+
|
|
530
|
+
if self.model_path.lower() in ["none", "dummy"]:
|
|
531
|
+
# Skip for dummy models
|
|
532
|
+
return
|
|
533
|
+
|
|
461
534
|
# Handle deprecated arguments.
|
|
462
535
|
self._handle_deprecated_args()
|
|
463
536
|
|
|
@@ -484,12 +557,15 @@ class ServerArgs:
|
|
|
484
557
|
self._handle_amd_specifics()
|
|
485
558
|
self._handle_grammar_backend()
|
|
486
559
|
|
|
560
|
+
# Handle Ktransformers specific configs
|
|
561
|
+
self._handle_ktransformers_configs()
|
|
562
|
+
|
|
487
563
|
# Handle data parallelism.
|
|
488
564
|
self._handle_data_parallelism()
|
|
489
565
|
|
|
490
566
|
# Handle MoE configurations.
|
|
491
567
|
self._handle_moe_kernel_config()
|
|
492
|
-
self.
|
|
568
|
+
self._handle_a2a_moe()
|
|
493
569
|
self._handle_eplb_and_dispatch()
|
|
494
570
|
self._handle_expert_distribution_metrics()
|
|
495
571
|
|
|
@@ -526,8 +602,33 @@ class ServerArgs:
|
|
|
526
602
|
# Handle any other necessary validations.
|
|
527
603
|
self._handle_other_validations()
|
|
528
604
|
|
|
605
|
+
# Handle elastic expert parallelism.
|
|
606
|
+
self._handle_elastic_ep()
|
|
607
|
+
|
|
529
608
|
def _handle_deprecated_args(self):
|
|
530
|
-
|
|
609
|
+
# handle deprecated tool call parsers
|
|
610
|
+
deprecated_tool_call_parsers = {"qwen25": "qwen", "glm45": "glm"}
|
|
611
|
+
if self.tool_call_parser in deprecated_tool_call_parsers:
|
|
612
|
+
logger.warning(
|
|
613
|
+
f"The tool_call_parser '{self.tool_call_parser}' is deprecated. Please use '{deprecated_tool_call_parsers[self.tool_call_parser]}' instead."
|
|
614
|
+
)
|
|
615
|
+
self.tool_call_parser = deprecated_tool_call_parsers[self.tool_call_parser]
|
|
616
|
+
|
|
617
|
+
def _handle_ktransformers_configs(self):
|
|
618
|
+
from sglang.srt.layers.quantization.compressed_tensors.compressed_tensors_moe import (
|
|
619
|
+
CompressedTensorsWNA16AMXEPMoEMethod,
|
|
620
|
+
override_config,
|
|
621
|
+
)
|
|
622
|
+
|
|
623
|
+
override_config(
|
|
624
|
+
CompressedTensorsWNA16AMXEPMoEMethod,
|
|
625
|
+
self.kt_num_gpu_experts,
|
|
626
|
+
self.kt_cpuinfer,
|
|
627
|
+
self.kt_threadpool_count,
|
|
628
|
+
self.kt_amx_weight_path,
|
|
629
|
+
self.kt_amx_method,
|
|
630
|
+
self.chunked_prefill_size,
|
|
631
|
+
)
|
|
531
632
|
|
|
532
633
|
def _handle_missing_default_values(self):
|
|
533
634
|
if self.tokenizer_path is None:
|
|
@@ -571,6 +672,16 @@ class ServerArgs:
|
|
|
571
672
|
self.chunked_prefill_size = 2048
|
|
572
673
|
if self.cuda_graph_max_bs is None:
|
|
573
674
|
self.cuda_graph_max_bs = 8
|
|
675
|
+
elif is_npu() and gpu_mem < 32 * 1024:
|
|
676
|
+
# Atlas A2B4
|
|
677
|
+
# (chunked_prefill_size 32k, cuda_graph_max_bs 16 if tp < 4 else 64)
|
|
678
|
+
if self.chunked_prefill_size is None:
|
|
679
|
+
self.chunked_prefill_size = 32768
|
|
680
|
+
if self.cuda_graph_max_bs is None:
|
|
681
|
+
if self.tp_size < 4:
|
|
682
|
+
self.cuda_graph_max_bs = 16
|
|
683
|
+
else:
|
|
684
|
+
self.cuda_graph_max_bs = 64
|
|
574
685
|
elif gpu_mem < 35 * 1024:
|
|
575
686
|
# A10, 4090, 5090
|
|
576
687
|
# (chunked_prefill_size 2k, cuda_graph_max_bs 16 if tp < 4 else 80)
|
|
@@ -594,6 +705,16 @@ class ServerArgs:
|
|
|
594
705
|
self.cuda_graph_max_bs = 32
|
|
595
706
|
else:
|
|
596
707
|
self.cuda_graph_max_bs = 160
|
|
708
|
+
elif is_npu() and gpu_mem < 64 * 1024:
|
|
709
|
+
# Atlas A2 and Atlas A3
|
|
710
|
+
# (chunked_prefill_size 32k, cuda_graph_max_bs 64 if tp < 4 else 128)
|
|
711
|
+
if self.chunked_prefill_size is None:
|
|
712
|
+
self.chunked_prefill_size = 32768
|
|
713
|
+
if self.cuda_graph_max_bs is None:
|
|
714
|
+
if self.tp_size < 4:
|
|
715
|
+
self.cuda_graph_max_bs = 64
|
|
716
|
+
else:
|
|
717
|
+
self.cuda_graph_max_bs = 128
|
|
597
718
|
elif gpu_mem < 90 * 1024:
|
|
598
719
|
# H100, A100
|
|
599
720
|
# (chunked_prefill_size 8k, cuda_graph_max_bs 256 if tp < 4 else 512)
|
|
@@ -634,6 +755,11 @@ class ServerArgs:
|
|
|
634
755
|
else:
|
|
635
756
|
self.cuda_graph_max_bs = max(self.cuda_graph_bs)
|
|
636
757
|
|
|
758
|
+
if self.piecewise_cuda_graph_tokens is None:
|
|
759
|
+
self.piecewise_cuda_graph_tokens = (
|
|
760
|
+
self._generate_piecewise_cuda_graph_tokens()
|
|
761
|
+
)
|
|
762
|
+
|
|
637
763
|
if self.mem_fraction_static is None:
|
|
638
764
|
# Constant meta data (e.g., from attention backend)
|
|
639
765
|
reserved_mem = 512
|
|
@@ -712,6 +838,25 @@ class ServerArgs:
|
|
|
712
838
|
|
|
713
839
|
return capture_bs
|
|
714
840
|
|
|
841
|
+
def _generate_piecewise_cuda_graph_tokens(self):
|
|
842
|
+
"""
|
|
843
|
+
Generate the list of batch sizes for piecewise CUDA graph capture
|
|
844
|
+
based on piecewise_cuda_graph_max_tokens.
|
|
845
|
+
"""
|
|
846
|
+
capture_sizes = (
|
|
847
|
+
list(range(4, 33, 4))
|
|
848
|
+
+ list(range(48, 257, 16))
|
|
849
|
+
+ list(range(288, 513, 32))
|
|
850
|
+
+ list(range(640, 4096 + 1, 128))
|
|
851
|
+
+ list(range(4352, self.piecewise_cuda_graph_max_tokens + 1, 256))
|
|
852
|
+
)
|
|
853
|
+
|
|
854
|
+
capture_sizes = [
|
|
855
|
+
s for s in capture_sizes if s <= self.piecewise_cuda_graph_max_tokens
|
|
856
|
+
]
|
|
857
|
+
|
|
858
|
+
return capture_sizes
|
|
859
|
+
|
|
715
860
|
def _handle_hpu_backends(self):
|
|
716
861
|
if self.device == "hpu":
|
|
717
862
|
self.attention_backend = "torch_native"
|
|
@@ -731,21 +876,54 @@ class ServerArgs:
|
|
|
731
876
|
|
|
732
877
|
hf_config = self.get_hf_config()
|
|
733
878
|
model_arch = hf_config.architectures[0]
|
|
734
|
-
if model_arch in ["
|
|
735
|
-
if
|
|
879
|
+
if model_arch in ["DeepseekV3ForCausalLM"] and not is_deepseek_nsa(hf_config):
|
|
880
|
+
if is_cuda() and is_sm100_supported():
|
|
881
|
+
if (
|
|
882
|
+
self.attention_backend is None
|
|
883
|
+
and self.prefill_attention_backend is None
|
|
884
|
+
and self.decode_attention_backend is None
|
|
885
|
+
):
|
|
886
|
+
self.attention_backend = "trtllm_mla"
|
|
887
|
+
logger.info(
|
|
888
|
+
"Use trtllm_mla as attention backend on sm100 for DeepseekV3ForCausalLM"
|
|
889
|
+
)
|
|
890
|
+
if not self.enable_dp_attention:
|
|
891
|
+
self.enable_flashinfer_allreduce_fusion = True
|
|
892
|
+
logger.info(
|
|
893
|
+
"Enable FlashInfer AllReduce Fusion on sm100 for DeepseekV3ForCausalLM"
|
|
894
|
+
)
|
|
895
|
+
if (
|
|
896
|
+
self.quantization == "modelopt_fp4"
|
|
897
|
+
and self.moe_runner_backend == "auto"
|
|
898
|
+
):
|
|
899
|
+
self.moe_runner_backend = "flashinfer_trtllm"
|
|
900
|
+
logger.info(
|
|
901
|
+
"Use flashinfer_trtllm as moe runner backend on sm100 for DeepseekV3ForCausalLM"
|
|
902
|
+
)
|
|
903
|
+
|
|
904
|
+
elif model_arch in ["GptOssForCausalLM"]:
|
|
905
|
+
if (
|
|
906
|
+
self.attention_backend is None
|
|
907
|
+
and self.prefill_attention_backend is None
|
|
908
|
+
and self.decode_attention_backend is None
|
|
909
|
+
):
|
|
736
910
|
if is_cuda() and is_sm100_supported():
|
|
737
911
|
self.attention_backend = "trtllm_mha"
|
|
738
912
|
elif is_cuda() and is_sm90_supported():
|
|
739
913
|
self.attention_backend = "fa3"
|
|
740
914
|
else:
|
|
741
915
|
self.attention_backend = "triton"
|
|
742
|
-
|
|
743
|
-
|
|
744
|
-
|
|
745
|
-
)
|
|
916
|
+
|
|
917
|
+
supported_backends = ["triton", "trtllm_mha", "fa3", "fa4"]
|
|
918
|
+
prefill_attn_backend, decode_attn_backend = self.get_attention_backends()
|
|
746
919
|
assert (
|
|
747
|
-
|
|
748
|
-
|
|
920
|
+
prefill_attn_backend in supported_backends
|
|
921
|
+
and decode_attn_backend in supported_backends
|
|
922
|
+
), (
|
|
923
|
+
f"GptOssForCausalLM requires one of {supported_backends} attention backend, but got the following backends\n"
|
|
924
|
+
f"- Prefill: {prefill_attn_backend}\n"
|
|
925
|
+
f"- Decode: {decode_attn_backend}\n"
|
|
926
|
+
)
|
|
749
927
|
|
|
750
928
|
if is_sm100_supported():
|
|
751
929
|
if not self.enable_dp_attention:
|
|
@@ -802,6 +980,31 @@ class ServerArgs:
|
|
|
802
980
|
f"Disable hybrid SWA memory for {model_arch} as it is not yet supported."
|
|
803
981
|
)
|
|
804
982
|
self.disable_hybrid_swa_memory = True
|
|
983
|
+
elif model_arch in ["Olmo2ForCausalLM"]:
|
|
984
|
+
# FIXME: https://github.com/sgl-project/sglang/pull/7367 is not compatible with Olmo3 model.
|
|
985
|
+
logger.warning(
|
|
986
|
+
f"Disabling hybrid SWA memory for {model_arch} as it is not yet supported."
|
|
987
|
+
)
|
|
988
|
+
self.disable_hybrid_swa_memory = True
|
|
989
|
+
|
|
990
|
+
if self.attention_backend is None:
|
|
991
|
+
if is_cuda() and is_sm100_supported():
|
|
992
|
+
self.attention_backend = "trtllm_mha"
|
|
993
|
+
elif is_cuda() and get_device_sm() >= 80:
|
|
994
|
+
self.attention_backend = "fa3"
|
|
995
|
+
else:
|
|
996
|
+
self.attention_backend = "triton"
|
|
997
|
+
|
|
998
|
+
# Flashinfer appears to degrade performance when sliding window attention
|
|
999
|
+
# is used for the Olmo2 architecture. Olmo2 does not use sliding window attention
|
|
1000
|
+
# but Olmo3 does.
|
|
1001
|
+
assert (
|
|
1002
|
+
self.attention_backend != "flashinfer"
|
|
1003
|
+
), "FlashInfer backend can significantly degrade the performance of Olmo3 models."
|
|
1004
|
+
|
|
1005
|
+
logger.info(
|
|
1006
|
+
f"Using {self.attention_backend} as attention backend for {model_arch}."
|
|
1007
|
+
)
|
|
805
1008
|
|
|
806
1009
|
if is_deepseek_nsa(hf_config):
|
|
807
1010
|
if (
|
|
@@ -820,9 +1023,6 @@ class ServerArgs:
|
|
|
820
1023
|
self.page_size = 64
|
|
821
1024
|
logger.warning("Setting page size to 64 for DeepSeek NSA.")
|
|
822
1025
|
|
|
823
|
-
self.mem_fraction_static = 0.8
|
|
824
|
-
logger.warning("Setting mem fraction static to 0.8 for DeepSeek NSA.")
|
|
825
|
-
|
|
826
1026
|
# For Hopper, we support both bf16 and fp8 kv cache; for Blackwell, we support fp8 only currently
|
|
827
1027
|
import torch
|
|
828
1028
|
|
|
@@ -832,10 +1032,10 @@ class ServerArgs:
|
|
|
832
1032
|
logger.warning("Setting KV cache dtype to fp8.")
|
|
833
1033
|
|
|
834
1034
|
if self.kv_cache_dtype == "fp8_e4m3":
|
|
835
|
-
self.
|
|
836
|
-
self.
|
|
1035
|
+
self.nsa_prefill_backend = "flashmla_kv"
|
|
1036
|
+
self.nsa_decode_backend = "flashmla_kv"
|
|
837
1037
|
logger.warning(
|
|
838
|
-
"Setting NSA backend to
|
|
1038
|
+
"Setting NSA backend to flashmla_kv for FP8 KV Cache."
|
|
839
1039
|
)
|
|
840
1040
|
|
|
841
1041
|
# Logging env vars for NSA
|
|
@@ -934,6 +1134,22 @@ class ServerArgs:
|
|
|
934
1134
|
self.enable_mixed_chunk = False
|
|
935
1135
|
self.disable_radix_cache = True
|
|
936
1136
|
|
|
1137
|
+
if self.attention_backend == "intel_xpu":
|
|
1138
|
+
if self.page_size not in [32, 64, 128]:
|
|
1139
|
+
logger.warning(
|
|
1140
|
+
f"Intel XPU attention backend only supports page_size of 32, 64 or 128, changing page_size from {self.page_size} to 128."
|
|
1141
|
+
)
|
|
1142
|
+
self.page_size = 128
|
|
1143
|
+
if self.attention_backend == "fa4" or self.decode_attention_backend == "fa4":
|
|
1144
|
+
raise ValueError(
|
|
1145
|
+
"FA4 backend is only supported for prefill. Please use `--prefill-attention-backend fa4` instead."
|
|
1146
|
+
)
|
|
1147
|
+
if self.prefill_attention_backend == "fa4":
|
|
1148
|
+
logger.warning(
|
|
1149
|
+
f"FA4 backend only supports page size 128, changing page_size from {self.page_size} to 128."
|
|
1150
|
+
)
|
|
1151
|
+
self.page_size = 128
|
|
1152
|
+
|
|
937
1153
|
def _handle_page_size(self):
|
|
938
1154
|
if self.page_size is None:
|
|
939
1155
|
self.page_size = 1
|
|
@@ -983,7 +1199,7 @@ class ServerArgs:
|
|
|
983
1199
|
"FlashInfer TRTLLM MoE is enabled. --disable-shared-experts-fusion is automatically set."
|
|
984
1200
|
)
|
|
985
1201
|
|
|
986
|
-
def
|
|
1202
|
+
def _handle_a2a_moe(self):
|
|
987
1203
|
if self.moe_a2a_backend == "deepep":
|
|
988
1204
|
if self.deepep_mode == "normal":
|
|
989
1205
|
logger.warning("Cuda graph is disabled because deepep_mode=`normal`")
|
|
@@ -993,6 +1209,12 @@ class ServerArgs:
|
|
|
993
1209
|
f"DeepEP MoE is enabled. The expert parallel size is adjusted to be the same as the tensor parallel size[{self.tp_size}]."
|
|
994
1210
|
)
|
|
995
1211
|
|
|
1212
|
+
if self.moe_a2a_backend == "mooncake":
|
|
1213
|
+
self.ep_size = self.tp_size
|
|
1214
|
+
logger.warning(
|
|
1215
|
+
f"Mooncake MoE is enabled. The expert parallel size is adjusted to be the same as the tensor parallel size[{self.tp_size}]."
|
|
1216
|
+
)
|
|
1217
|
+
|
|
996
1218
|
def _handle_eplb_and_dispatch(self):
|
|
997
1219
|
if self.enable_eplb and (self.expert_distribution_recorder_mode is None):
|
|
998
1220
|
self.expert_distribution_recorder_mode = "stat"
|
|
@@ -1008,6 +1230,15 @@ class ServerArgs:
|
|
|
1008
1230
|
if self.enable_eplb:
|
|
1009
1231
|
assert self.ep_size > 1
|
|
1010
1232
|
|
|
1233
|
+
def _handle_elastic_ep(self):
|
|
1234
|
+
if self.elastic_ep_backend is not None:
|
|
1235
|
+
if self.enable_eplb:
|
|
1236
|
+
if self.eplb_algorithm == "auto":
|
|
1237
|
+
self.eplb_algorithm = "elasticity_aware"
|
|
1238
|
+
assert (
|
|
1239
|
+
self.eplb_algorithm == "elasticity_aware"
|
|
1240
|
+
), "Elastic EP requires eplb_algorithm to be set to 'auto' or 'elasticity_aware'."
|
|
1241
|
+
|
|
1011
1242
|
def _handle_expert_distribution_metrics(self):
|
|
1012
1243
|
if self.enable_expert_distribution_metrics and (
|
|
1013
1244
|
self.expert_distribution_recorder_mode is None
|
|
@@ -1058,11 +1289,22 @@ class ServerArgs:
|
|
|
1058
1289
|
)
|
|
1059
1290
|
if self.max_running_requests is None:
|
|
1060
1291
|
self.max_running_requests = 48
|
|
1061
|
-
|
|
1062
|
-
|
|
1063
|
-
|
|
1064
|
-
|
|
1065
|
-
|
|
1292
|
+
logger.warning(
|
|
1293
|
+
"Max running requests is reset to 48 for speculative decoding."
|
|
1294
|
+
)
|
|
1295
|
+
|
|
1296
|
+
if self.speculative_algorithm == "EAGLE" and self.enable_beta_spec:
|
|
1297
|
+
self.disable_overlap_schedule = False
|
|
1298
|
+
logger.warning(
|
|
1299
|
+
"Beta spec is enabled for eagle speculative decoding and overlap schedule is turned on."
|
|
1300
|
+
)
|
|
1301
|
+
|
|
1302
|
+
if not self.enable_beta_spec:
|
|
1303
|
+
self.disable_overlap_schedule = True
|
|
1304
|
+
logger.warning(
|
|
1305
|
+
"Overlap scheduler is disabled because of using eagle3 and standalone speculative decoding."
|
|
1306
|
+
)
|
|
1307
|
+
|
|
1066
1308
|
if self.enable_mixed_chunk:
|
|
1067
1309
|
self.enable_mixed_chunk = False
|
|
1068
1310
|
logger.warning(
|
|
@@ -1216,6 +1458,26 @@ class ServerArgs:
|
|
|
1216
1458
|
"Please choose one tokenizer batching approach."
|
|
1217
1459
|
)
|
|
1218
1460
|
|
|
1461
|
+
if self.skip_tokenizer_init:
|
|
1462
|
+
if self.tokenizer_worker_num != 1:
|
|
1463
|
+
logger.warning(
|
|
1464
|
+
"skip_tokenizer_init=True disables tokenizer workers; forcing tokenizer_worker_num=1 "
|
|
1465
|
+
f"(requested {self.tokenizer_worker_num})."
|
|
1466
|
+
)
|
|
1467
|
+
self.tokenizer_worker_num = 1
|
|
1468
|
+
|
|
1469
|
+
if self.enable_tokenizer_batch_encode:
|
|
1470
|
+
logger.warning(
|
|
1471
|
+
"skip_tokenizer_init=True ignores --enable-tokenizer-batch-encode; disabling it."
|
|
1472
|
+
)
|
|
1473
|
+
self.enable_tokenizer_batch_encode = False
|
|
1474
|
+
|
|
1475
|
+
if self.enable_dynamic_batch_tokenizer:
|
|
1476
|
+
logger.warning(
|
|
1477
|
+
"skip_tokenizer_init=True ignores --enable-dynamic-batch-tokenizer; disabling it."
|
|
1478
|
+
)
|
|
1479
|
+
self.enable_dynamic_batch_tokenizer = False
|
|
1480
|
+
|
|
1219
1481
|
def _handle_environment_variables(self):
|
|
1220
1482
|
os.environ["SGLANG_ENABLE_TORCH_COMPILE"] = (
|
|
1221
1483
|
"1" if self.enable_torch_compile else "0"
|
|
@@ -1261,13 +1523,27 @@ class ServerArgs:
|
|
|
1261
1523
|
)
|
|
1262
1524
|
|
|
1263
1525
|
# Check attention backend
|
|
1264
|
-
if self.attention_backend
|
|
1526
|
+
if self.attention_backend is None:
|
|
1527
|
+
# User didn't specify attention backend, fallback based on GPU architecture
|
|
1528
|
+
if is_sm100_supported() or is_sm120_supported():
|
|
1529
|
+
# Blackwell and newer architectures
|
|
1530
|
+
self.attention_backend = "flashinfer"
|
|
1531
|
+
else:
|
|
1532
|
+
# Hopper (SM90) and older architectures
|
|
1533
|
+
self.attention_backend = "fa3"
|
|
1534
|
+
logger.warning(
|
|
1535
|
+
f"Attention backend not specified. Falling back to '{self.attention_backend}' for deterministic inference. "
|
|
1536
|
+
f"You can explicitly set --attention-backend to one of {DETERMINISTIC_ATTENTION_BACKEND_CHOICES}."
|
|
1537
|
+
)
|
|
1538
|
+
elif self.attention_backend not in DETERMINISTIC_ATTENTION_BACKEND_CHOICES:
|
|
1539
|
+
# User explicitly specified an incompatible attention backend
|
|
1265
1540
|
raise ValueError(
|
|
1266
|
-
f"Currently only {DETERMINISTIC_ATTENTION_BACKEND_CHOICES} attention backends are supported for deterministic inference
|
|
1541
|
+
f"Currently only {DETERMINISTIC_ATTENTION_BACKEND_CHOICES} attention backends are supported for deterministic inference, "
|
|
1542
|
+
f"but you explicitly specified '{self.attention_backend}'."
|
|
1267
1543
|
)
|
|
1268
1544
|
|
|
1269
|
-
# Currently, only FA3 supports radix cache. Support for other backends is in progress
|
|
1270
|
-
if self.attention_backend
|
|
1545
|
+
# Currently, only FA3 and Triton supports radix cache. Support for other backends is in progress
|
|
1546
|
+
if self.attention_backend not in ["fa3", "triton"]:
|
|
1271
1547
|
self.disable_radix_cache = True
|
|
1272
1548
|
logger.warning(
|
|
1273
1549
|
f"Currently radix cache is not compatible with {self.attention_backend} attention backend for deterministic inference. It will be supported in the future."
|
|
@@ -1286,6 +1562,7 @@ class ServerArgs:
|
|
|
1286
1562
|
|
|
1287
1563
|
@staticmethod
|
|
1288
1564
|
def add_cli_args(parser: argparse.ArgumentParser):
|
|
1565
|
+
|
|
1289
1566
|
# Model and tokenizer
|
|
1290
1567
|
parser.add_argument(
|
|
1291
1568
|
"--model-path",
|
|
@@ -1405,6 +1682,11 @@ class ServerArgs:
|
|
|
1405
1682
|
default=ServerArgs.port,
|
|
1406
1683
|
help="The port of the HTTP server.",
|
|
1407
1684
|
)
|
|
1685
|
+
parser.add_argument(
|
|
1686
|
+
"--grpc-mode",
|
|
1687
|
+
action="store_true",
|
|
1688
|
+
help="If set, use gRPC server instead of HTTP server.",
|
|
1689
|
+
)
|
|
1408
1690
|
parser.add_argument(
|
|
1409
1691
|
"--skip-server-warmup",
|
|
1410
1692
|
action="store_true",
|
|
@@ -1423,6 +1705,12 @@ class ServerArgs:
|
|
|
1423
1705
|
default=ServerArgs.nccl_port,
|
|
1424
1706
|
help="The port for NCCL distributed environment setup. Defaults to a random port.",
|
|
1425
1707
|
)
|
|
1708
|
+
parser.add_argument(
|
|
1709
|
+
"--checkpoint-engine-wait-weights-before-ready",
|
|
1710
|
+
action="store_true",
|
|
1711
|
+
help="If set, the server will wait for initial weights to be loaded via checkpoint-engine or other update methods "
|
|
1712
|
+
"before serving inference requests.",
|
|
1713
|
+
)
|
|
1426
1714
|
|
|
1427
1715
|
# Quantization and data type
|
|
1428
1716
|
parser.add_argument(
|
|
@@ -1455,12 +1743,51 @@ class ServerArgs:
|
|
|
1455
1743
|
"KV cache dtype is FP8. Otherwise, KV cache scaling factors "
|
|
1456
1744
|
"default to 1.0, which may cause accuracy issues. ",
|
|
1457
1745
|
)
|
|
1746
|
+
parser.add_argument(
|
|
1747
|
+
"--modelopt-quant",
|
|
1748
|
+
type=str,
|
|
1749
|
+
default=ServerArgs.modelopt_quant,
|
|
1750
|
+
help="The ModelOpt quantization configuration. "
|
|
1751
|
+
"Supported values: 'fp8', 'int4_awq', 'w4a8_awq', 'nvfp4', 'nvfp4_awq'. "
|
|
1752
|
+
"This requires the NVIDIA Model Optimizer library to be installed: pip install nvidia-modelopt",
|
|
1753
|
+
)
|
|
1754
|
+
parser.add_argument(
|
|
1755
|
+
"--modelopt-checkpoint-restore-path",
|
|
1756
|
+
type=str,
|
|
1757
|
+
default=ServerArgs.modelopt_checkpoint_restore_path,
|
|
1758
|
+
help="Path to restore a previously saved ModelOpt quantized checkpoint. "
|
|
1759
|
+
"If provided, the quantization process will be skipped and the model "
|
|
1760
|
+
"will be loaded from this checkpoint.",
|
|
1761
|
+
)
|
|
1762
|
+
parser.add_argument(
|
|
1763
|
+
"--modelopt-checkpoint-save-path",
|
|
1764
|
+
type=str,
|
|
1765
|
+
default=ServerArgs.modelopt_checkpoint_save_path,
|
|
1766
|
+
help="Path to save the ModelOpt quantized checkpoint after quantization. "
|
|
1767
|
+
"This allows reusing the quantized model in future runs.",
|
|
1768
|
+
)
|
|
1769
|
+
parser.add_argument(
|
|
1770
|
+
"--modelopt-export-path",
|
|
1771
|
+
type=str,
|
|
1772
|
+
default=ServerArgs.modelopt_export_path,
|
|
1773
|
+
help="Path to export the quantized model in HuggingFace format after ModelOpt quantization. "
|
|
1774
|
+
"The exported model can then be used directly with SGLang for inference. "
|
|
1775
|
+
"If not provided, the model will not be exported.",
|
|
1776
|
+
)
|
|
1777
|
+
parser.add_argument(
|
|
1778
|
+
"--quantize-and-serve",
|
|
1779
|
+
action="store_true",
|
|
1780
|
+
default=ServerArgs.quantize_and_serve,
|
|
1781
|
+
help="Quantize the model with ModelOpt and immediately serve it without exporting. "
|
|
1782
|
+
"This is useful for development and prototyping. For production, it's recommended "
|
|
1783
|
+
"to use separate quantization and deployment steps.",
|
|
1784
|
+
)
|
|
1458
1785
|
parser.add_argument(
|
|
1459
1786
|
"--kv-cache-dtype",
|
|
1460
1787
|
type=str,
|
|
1461
1788
|
default=ServerArgs.kv_cache_dtype,
|
|
1462
|
-
choices=["auto", "fp8_e5m2", "fp8_e4m3"],
|
|
1463
|
-
help='Data type for kv cache storage. "auto" will use model data type. "fp8_e5m2" and "fp8_e4m3"
|
|
1789
|
+
choices=["auto", "fp8_e5m2", "fp8_e4m3", "bf16", "bfloat16"],
|
|
1790
|
+
help='Data type for kv cache storage. "auto" will use model data type. "bf16" or "bfloat16" for BF16 KV cache. "fp8_e5m2" and "fp8_e4m3" are supported for CUDA 11.8+.',
|
|
1464
1791
|
)
|
|
1465
1792
|
parser.add_argument(
|
|
1466
1793
|
"--enable-fp32-lm-head",
|
|
@@ -1519,6 +1846,12 @@ class ServerArgs:
             default=ServerArgs.enable_priority_scheduling,
             help="Enable priority scheduling. Requests with higher priority integer values will be scheduled first by default.",
         )
+        parser.add_argument(
+            "--abort-on-priority-when-disabled",
+            action="store_true",
+            default=ServerArgs.abort_on_priority_when_disabled,
+            help="If set, abort requests that specify a priority when priority scheduling is disabled.",
+        )
         parser.add_argument(
             "--schedule-low-priority-values-first",
             action="store_true",
@@ -1575,6 +1908,21 @@ class ServerArgs:
             default=ServerArgs.device,
             help="The device to use ('cuda', 'xpu', 'hpu', 'npu', 'cpu'). Defaults to auto-detection if not specified.",
         )
+        parser.add_argument(
+            "--elastic-ep-backend",
+            type=str,
+            default=ServerArgs.elastic_ep_backend,
+            choices=["none", "mooncake"],
+            help="Specify the collective communication backend for elastic EP. Currently supports 'mooncake'.",
+        )
+        parser.add_argument(
+            "--mooncake-ib-device",
+            type=str,
+            default=ServerArgs.mooncake_ib_device,
+            help="The InfiniBand devices for Mooncake Backend transfer, accepts multiple comma-separated devices "
+            "(e.g., --mooncake-ib-device mlx5_0,mlx5_1). "
+            "Default is None, which triggers automatic device detection when Mooncake Backend is enabled.",
+        )
         parser.add_argument(
             "--tensor-parallel-size",
             "--tp-size",
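
A hypothetical launch sketch for the new elastic-EP options, assuming a multi-GPU MoE deployment; the model path, tp size, and IB device names are placeholders (the device string mirrors the example given in the help text above).

    import subprocess, sys

    # Hypothetical invocation; adjust model, parallelism, and devices to your setup.
    subprocess.run(
        [
            sys.executable, "-m", "sglang.launch_server",
            "--model-path", "deepseek-ai/DeepSeek-V3",  # placeholder
            "--tp-size", "8",
            "--elastic-ep-backend", "mooncake",
            "--mooncake-ib-device", "mlx5_0,mlx5_1",
        ],
        check=True,
    )
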
@@ -1590,9 +1938,9 @@ class ServerArgs:
             help="The pipeline parallelism size.",
         )
         parser.add_argument(
-            "--max-micro-batch-size",
+            "--pp-max-micro-batch-size",
             type=int,
-            default=ServerArgs.
+            default=ServerArgs.pp_max_micro_batch_size,
             help="The maximum micro batch size in pipeline parallelism.",
         )
         parser.add_argument(
@@ -1616,7 +1964,12 @@ class ServerArgs:
             "--constrained-json-whitespace-pattern",
             type=str,
             default=ServerArgs.constrained_json_whitespace_pattern,
-            help="(outlines
+            help="(outlines and llguidance backends only) Regex pattern for syntactic whitespaces allowed in JSON constrained output. For example, to allow the model generate consecutive whitespaces, set the pattern to [\n\t ]*",
+        )
+        parser.add_argument(
+            "--constrained-json-disable-any-whitespace",
+            action="store_true",
+            help="(xgrammar and llguidance backends only) Enforce compact representation in JSON constrained output.",
         )
         parser.add_argument(
             "--watchdog-timeout",
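
Illustrative only: the two JSON-constraint knobs above pull in opposite directions (allow loose whitespace vs. force compact output), and which one applies depends on the grammar backend, as the help strings note. A minimal parsing sketch, with the model name as a placeholder and the raw-string pattern mirroring the help text:

    from sglang.srt.server_args import prepare_server_args

    # Allow loose whitespace (outlines/llguidance backends) ...
    args = prepare_server_args(
        [
            "--model-path", "Qwen/Qwen2.5-7B-Instruct",  # placeholder
            "--constrained-json-whitespace-pattern", r"[\n\t ]*",
        ]
    )
    # ... or pass "--constrained-json-disable-any-whitespace" instead to force
    # compact JSON with the xgrammar/llguidance backends.
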
@@ -1857,6 +2210,16 @@ class ServerArgs:
             default=ServerArgs.tool_call_parser,
             help=f"Specify the parser for handling tool-call interactions. Options include: {tool_call_parser_choices}.",
         )
+        parser.add_argument(
+            "--sampling-defaults",
+            type=str,
+            choices=["openai", "model"],
+            default=ServerArgs.sampling_defaults,
+            help="Where to get default sampling parameters. "
+            "'openai' uses SGLang/OpenAI defaults (temperature=1.0, top_p=1.0, etc.). "
+            "'model' uses the model's generation_config.json to get the recommended "
+            "sampling parameters if available. Default is 'model'.",
+        )
         parser.add_argument(
             "--tool-server",
             type=str,
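
A hypothetical sketch of what the 'model' setting of --sampling-defaults implies: prefer values from the model's generation_config.json and fall back to the OpenAI-style defaults named in the help text. The helper name and file handling below are illustrative, not SGLang's actual implementation.

    import json
    from pathlib import Path

    def resolve_default_temperature(model_dir: str, sampling_defaults: str) -> float:
        openai_default = 1.0
        if sampling_defaults == "openai":
            return openai_default
        # sampling_defaults == "model": consult generation_config.json if present.
        gen_cfg = Path(model_dir) / "generation_config.json"
        if gen_cfg.is_file():
            cfg = json.loads(gen_cfg.read_text())
            return float(cfg.get("temperature", openai_default))
        return openai_default
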
@@ -1966,6 +2329,13 @@ class ServerArgs:
             default=ServerArgs.max_loaded_loras,
             help="If specified, it limits the maximum number of LoRA adapters loaded in CPU memory at a time. The value must be greater than or equal to `--max-loras-per-batch`.",
         )
+        parser.add_argument(
+            "--lora-eviction-policy",
+            type=str,
+            default=DEFAULT_LORA_EVICTION_POLICY,
+            choices=["lru", "fifo"],
+            help="LoRA adapter eviction policy when memory pool is full. 'lru': Least Recently Used (default, better cache efficiency). 'fifo': First-In-First-Out.",
+        )
         parser.add_argument(
             "--lora-backend",
             type=str,
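
A toy sketch of the difference between the two eviction policies; the class and its behavior are illustrative only, since SGLang's real LoRA memory pool also deals with pinned adapters, reference counts, and GPU buffers.

    from collections import OrderedDict

    class ToyLoRAPool:
        def __init__(self, capacity: int, policy: str = "lru"):
            self.capacity, self.policy = capacity, policy
            self.adapters: "OrderedDict[str, object]" = OrderedDict()

        def use(self, name: str, weights: object) -> None:
            if name in self.adapters:
                if self.policy == "lru":
                    self.adapters.move_to_end(name)  # refresh recency; fifo keeps insertion order
                return
            if len(self.adapters) >= self.capacity:
                self.adapters.popitem(last=False)  # evict least-recently-used / first-in
            self.adapters[name] = weights
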
@@ -2025,19 +2395,20 @@ class ServerArgs:
             help="Set multimodal attention backend.",
         )
         parser.add_argument(
-            "--nsa-prefill",
-            default=ServerArgs.
+            "--nsa-prefill-backend",
+            default=ServerArgs.nsa_prefill_backend,
             type=str,
             choices=NSA_CHOICES,
         )
         parser.add_argument(
-            "--nsa-decode",
-            default=ServerArgs.
+            "--nsa-decode-backend",
+            default=ServerArgs.nsa_decode_backend,
             type=str,
             choices=NSA_CHOICES,
         )

         # Speculative decoding
+        parser.add_argument("--enable-beta-spec", action="store_true")
         parser.add_argument(
             "--speculative-algorithm",
             type=str,
@@ -2058,6 +2429,15 @@ class ServerArgs:
             "name, a tag name, or a commit id. If unspecified, will use "
             "the default version.",
         )
+        parser.add_argument(
+            "--speculative-draft-load-format",
+            type=str,
+            default=ServerArgs.speculative_draft_load_format,
+            choices=LOAD_FORMAT_CHOICES,
+            help="The format of the draft model weights to load. "
+            "If not specified, will use the same format as --load-format. "
+            "Use 'dummy' to initialize draft model weights with random values for profiling.",
+        )
         parser.add_argument(
             "--speculative-num-steps",
             type=int,
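
Illustrative only: profiling speculative decoding with a randomly initialized draft model via the new --speculative-draft-load-format flag. The model name is a placeholder, and a real run also needs the usual speculative-decoding arguments (draft model path, number of steps, and so on) that are not shown here.

    from sglang.srt.server_args import prepare_server_args

    args = prepare_server_args(
        [
            "--model-path", "meta-llama/Llama-3.1-70B-Instruct",  # placeholder
            "--speculative-algorithm", "EAGLE",
            "--speculative-draft-load-format", "dummy",
        ]
    )
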
@@ -2158,22 +2538,14 @@ class ServerArgs:
         parser.add_argument(
             "--moe-a2a-backend",
             type=str,
-            choices=["none", "deepep"],
+            choices=["none", "deepep", "mooncake"],
             default=ServerArgs.moe_a2a_backend,
             help="Choose the backend for MoE A2A.",
         )
         parser.add_argument(
             "--moe-runner-backend",
             type=str,
-            choices=[
-                "auto",
-                "triton",
-                "triton_kernel",
-                "flashinfer_trtllm",
-                "flashinfer_cutlass",
-                "flashinfer_mxfp4",
-                "flashinfer_cutedsl",
-            ],
+            choices=MOE_RUNNER_BACKEND_CHOICES,
             default=ServerArgs.moe_runner_backend,
             help="Choose the runner backend for MoE.",
         )
@@ -2287,6 +2659,12 @@ class ServerArgs:
             choices=["float32", "bfloat16"],
             help="The data type of the SSM states in mamba cache.",
         )
+        parser.add_argument(
+            "--mamba-full-memory-ratio",
+            type=float,
+            default=ServerArgs.mamba_full_memory_ratio,
+            help="The ratio of mamba state memory to full kv cache memory.",
+        )

         # Hierarchical cache
         parser.add_argument(
@@ -2364,6 +2742,35 @@ class ServerArgs:
             help="Using LMCache as an alternative hierarchical cache solution",
         )

+        # Ktransformer server args
+        parser.add_argument(
+            "--kt-amx-weight-path",
+            type=str,
+            help="[ktransformers parameter] The path of the quantized expert weights for amx kernel. A local folder.",
+        )
+        parser.add_argument(
+            "--kt-amx-method",
+            type=str,
+            default="AMXINT4",
+            help="[ktransformers parameter] Quantization formats for CPU execution.",
+        )
+        parser.add_argument(
+            "--kt-cpuinfer",
+            type=int,
+            help="[ktransformers parameter] The number of CPUInfer threads.",
+        )
+        parser.add_argument(
+            "--kt-threadpool-count",
+            type=int,
+            default=2,
+            help="[ktransformers parameter] One-to-one with the number of NUMA nodes (one thread pool per NUMA).",
+        )
+        parser.add_argument(
+            "--kt-num-gpu-experts",
+            type=int,
+            help="[ktransformers parameter] The number of GPU experts.",
+        )
+
         # Double Sparsity
         parser.add_argument(
             "--enable-double-sparsity",
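
A hypothetical launch sketch using the ktransformers parameters added above for mixed CPU/GPU expert execution; every path and count here is a placeholder chosen for illustration.

    import subprocess, sys

    subprocess.run(
        [
            sys.executable, "-m", "sglang.launch_server",
            "--model-path", "deepseek-ai/DeepSeek-V3",    # placeholder
            "--kt-amx-weight-path", "/data/amx_experts",  # placeholder folder
            "--kt-amx-method", "AMXINT4",
            "--kt-cpuinfer", "48",
            "--kt-threadpool-count", "2",
            "--kt-num-gpu-experts", "8",
        ],
        check=True,
    )
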
@@ -2433,6 +2840,14 @@ class ServerArgs:
             help="Mode of offloading.",
         )

+        # Args for multi-item-scoring
+        parser.add_argument(
+            "--multi-item-scoring-delimiter",
+            type=int,
+            default=ServerArgs.multi_item_scoring_delimiter,
+            help="Delimiter token ID for multi-item scoring. Used to combine Query and Items into a single sequence: Query<delimiter>Item1<delimiter>Item2<delimiter>... This enables efficient batch processing of multiple items against a single query.",
+        )
+
         # Optimization/debug options
         parser.add_argument(
             "--disable-radix-cache",
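
A hypothetical sketch of the sequence layout described in the help text (Query<delimiter>Item1<delimiter>Item2<delimiter>...); the function and the token ids below are made up for illustration, and the server-side packing may differ in detail.

    def build_multi_item_sequence(query_ids, item_id_lists, delimiter_id):
        seq = list(query_ids)
        for item_ids in item_id_lists:
            seq.append(delimiter_id)
            seq.extend(item_ids)
        return seq

    # e.g. with a placeholder delimiter token id of 128002:
    # build_multi_item_sequence([1, 2, 3], [[10, 11], [20]], 128002)
    # -> [1, 2, 3, 128002, 10, 11, 128002, 20]
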
@@ -2491,6 +2906,11 @@ class ServerArgs:
             action="store_true",
             help="Enable batch tokenization for improved performance when processing multiple text inputs. Do not use with image inputs, pre-tokenized input_ids, or input_embeds.",
         )
+        parser.add_argument(
+            "--disable-tokenizer-batch-decode",
+            action="store_true",
+            help="Disable batch decoding when decoding multiple completions.",
+        )
         parser.add_argument(
             "--disable-outlines-disk-cache",
             action="store_true",
@@ -2552,12 +2972,36 @@ class ServerArgs:
             action="store_true",
             help="Optimize the model with torch.compile. Experimental feature.",
         )
+        parser.add_argument(
+            "--enable-piecewise-cuda-graph",
+            action="store_true",
+            help="Optimize the model with piecewise cuda graph for extend/prefill only. Experimental feature.",
+        )
+        parser.add_argument(
+            "--piecewise-cuda-graph-tokens",
+            type=json_list_type,
+            default=ServerArgs.piecewise_cuda_graph_tokens,
+            help="Set the list of tokens when using piecewise cuda graph.",
+        )
+        parser.add_argument(
+            "--piecewise-cuda-graph-compiler",
+            type=str,
+            default=ServerArgs.piecewise_cuda_graph_compiler,
+            help="Set the compiler for piecewise cuda graph. Choices are: eager, inductor.",
+            choices=["eager", "inductor"],
+        )
         parser.add_argument(
             "--torch-compile-max-bs",
             type=int,
             default=ServerArgs.torch_compile_max_bs,
             help="Set the maximum batch size when using torch compile.",
         )
+        parser.add_argument(
+            "--piecewise-cuda-graph-max-tokens",
+            type=int,
+            default=ServerArgs.piecewise_cuda_graph_max_tokens,
+            help="Set the maximum tokens when using piecewise cuda graph.",
+        )
         parser.add_argument(
             "--torchao-config",
             type=str,
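
Illustrative only: enabling the experimental piecewise CUDA graph path and bounding its captured token count. The model path and limits are placeholders; the feature is marked experimental in its own help text.

    from sglang.srt.server_args import prepare_server_args

    args = prepare_server_args(
        [
            "--model-path", "meta-llama/Llama-3.1-8B-Instruct",  # placeholder
            "--enable-piecewise-cuda-graph",
            "--piecewise-cuda-graph-compiler", "inductor",
            "--piecewise-cuda-graph-max-tokens", "8192",
        ]
    )
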
@@ -2687,11 +3131,6 @@ class ServerArgs:
             default=ServerArgs.debug_tensor_dump_inject,
             help="Inject the outputs from jax as the input of every layer.",
         )
-        parser.add_argument(
-            "--debug-tensor-dump-prefill-only",
-            action="store_true",
-            help="Only dump the tensors for prefill requests (i.e. batch size > 1).",
-        )
         parser.add_argument(
             "--enable-dynamic-batch-tokenizer",
             action="store_true",
@@ -2813,6 +3252,12 @@ class ServerArgs:
             action="store_true",
             help="Enable PD-Multiplexing, PD running on greenctx stream.",
         )
+        parser.add_argument(
+            "--pdmux-config-path",
+            type=str,
+            default=None,
+            help="The path of the PD-Multiplexing config file.",
+        )

         parser.add_argument(
             "--sm-group-num",
@@ -2894,7 +3339,7 @@ class ServerArgs:
             self.model_path,
             trust_remote_code=self.trust_remote_code,
             revision=self.revision,
-            model_override_args=
+            model_override_args=orjson.loads(self.json_model_override_args),
             **kwargs,
         )
         return hf_config
@@ -2941,7 +3386,34 @@ class ServerArgs:
             self.chunked_prefill_size % self.page_size == 0
         ), "chunked_prefill_size must be divisible by page_size"

-        # Check
+        # Check pdmux
+        if self.enable_pdmux:
+            assert (
+                self.pp_size == 1
+            ), "PD-Multiplexing is only supported with pipeline parallelism disabled (pp_size=1)."
+            assert (
+                self.chunked_prefill_size == -1
+            ), "PD-Multiplexing is not compatible with chunked prefill."
+            assert (
+                self.disaggregation_mode == "null"
+            ), "PD-Multiplexing is not compatible with disaggregation mode."
+            assert (
+                self.disable_overlap_schedule
+            ), "PD-Multiplexing is not compatible with overlap schedule."
+
+            # NOTE: CUDA Green Context may encounter potential issues with CudaGraph on torch 2.7.x – 2.8.x, leading to performance degradation.
+            import torch
+
+            parts = torch.__version__.split("+", 1)[0].split(".")
+            major = int(parts[0]) if len(parts) > 0 and parts[0].isdigit() else 0
+            minor = int(parts[1]) if len(parts) > 1 and parts[1].isdigit() else 0
+            if (major, minor) > (2, 6):
+                logger.warning(
+                    "WARNING: PD-Multiplexing may experience performance degradation with torch versions > 2.6.x.\n"
+                    f" Current torch version is {torch.__version__}.\n"
+                    " Please manually install torch 2.6.x."
+                )
+
         assert self.tokenizer_worker_num > 0, "Tokenizer worker num must >= 1"
         self.validate_buckets_rule(
             "--prompt-tokens-buckets", self.prompt_tokens_buckets
@@ -2957,6 +3429,17 @@ class ServerArgs:
             "lof",
         ], f"To use priority scheduling, schedule_policy must be 'fcfs' or 'lof'. '{self.schedule_policy}' is not supported."

+        # Check multi-item scoring
+        if self.multi_item_scoring_delimiter is not None:
+            assert self.disable_radix_cache, (
+                "Multi-item scoring requires radix cache to be disabled. "
+                "Please set --disable-radix-cache when using --multi-item-scoring-delimiter."
+            )
+            assert self.chunked_prefill_size == -1, (
+                "Multi-item scoring requires chunked prefill to be disabled. "
+                "Please set --chunked-prefill-size -1 when using --multi-item-scoring-delimiter."
+            )
+
     def check_lora_server_args(self):
         assert self.max_loras_per_batch > 0, "max_loras_per_batch must be positive"

@@ -3141,6 +3624,22 @@ class ServerArgs:
         )


+# NOTE: This is a global variable to hold the server args for scheduler.
+_global_server_args: Optional[ServerArgs] = None
+
+
+def set_global_server_args_for_scheduler(server_args: ServerArgs):
+    global _global_server_args
+    _global_server_args = server_args
+
+
+def get_global_server_args() -> ServerArgs:
+    if _global_server_args is None:
+        raise ValueError("Global server args is not set yet!")
+
+    return _global_server_args
+
+
 def prepare_server_args(argv: List[str]) -> ServerArgs:
     """
     Prepare the server arguments from the command line arguments.
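
Minimal usage sketch of the module-level accessors added above, assuming scheduler-side code sets them once after argument parsing; the model path is a placeholder.

    from sglang.srt.server_args import (
        get_global_server_args,
        prepare_server_args,
        set_global_server_args_for_scheduler,
    )

    server_args = prepare_server_args(["--model-path", "placeholder/model"])
    set_global_server_args_for_scheduler(server_args)

    # Later, anywhere in scheduler code:
    assert get_global_server_args() is server_args
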
@@ -3175,11 +3674,12 @@ def prepare_server_args(argv: List[str]) -> ServerArgs:
     parser = argparse.ArgumentParser()
     ServerArgs.add_cli_args(parser)
     raw_args = parser.parse_args(argv)
-
-    return
+
+    return ServerArgs.from_cli_args(raw_args)


 ZMQ_TCP_PORT_DELTA = 233
+DP_ATTENTION_HANDSHAKE_PORT_DELTA = 5


 @dataclasses.dataclass
@@ -3204,7 +3704,11 @@ class PortArgs:
     tokenizer_worker_ipc_name: Optional[str]

     @staticmethod
-    def init_new(
+    def init_new(
+        server_args: ServerArgs,
+        dp_rank: Optional[int] = None,
+        worker_ports: Optional[List[int]] = None,
+    ) -> PortArgs:
         if server_args.nccl_port is None:
             nccl_port = server_args.port + random.randint(100, 1000)
             while True:
@@ -3251,8 +3755,8 @@ class PortArgs:
             # TokenizerManager to DataParallelController
             scheduler_input_port = port_base + 4
         else:
-
-
+            assert worker_ports is not None
+            scheduler_input_port = worker_ports[dp_rank]
         return PortArgs(
             tokenizer_ipc_name=f"tcp://{dist_init_host}:{port_base}",
             scheduler_input_ipc_name=f"tcp://{dist_init_host}:{scheduler_input_port}",