sglang 0.4.7__py3-none-any.whl → 0.4.8__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- sglang/__init__.py +2 -0
- sglang/api.py +7 -0
- sglang/bench_one_batch.py +8 -6
- sglang/bench_serving.py +1 -1
- sglang/lang/interpreter.py +40 -1
- sglang/lang/ir.py +27 -0
- sglang/math_utils.py +8 -0
- sglang/srt/_custom_ops.py +2 -2
- sglang/srt/code_completion_parser.py +2 -44
- sglang/srt/configs/model_config.py +6 -0
- sglang/srt/constants.py +3 -0
- sglang/srt/conversation.py +19 -3
- sglang/srt/custom_op.py +5 -1
- sglang/srt/disaggregation/base/__init__.py +1 -1
- sglang/srt/disaggregation/base/conn.py +25 -11
- sglang/srt/disaggregation/common/__init__.py +5 -1
- sglang/srt/disaggregation/common/utils.py +42 -0
- sglang/srt/disaggregation/decode.py +211 -72
- sglang/srt/disaggregation/decode_schedule_batch_mixin.py +4 -3
- sglang/srt/disaggregation/fake/__init__.py +1 -1
- sglang/srt/disaggregation/fake/conn.py +15 -9
- sglang/srt/disaggregation/mini_lb.py +34 -4
- sglang/srt/disaggregation/mooncake/__init__.py +1 -1
- sglang/srt/disaggregation/mooncake/conn.py +30 -29
- sglang/srt/disaggregation/nixl/__init__.py +6 -1
- sglang/srt/disaggregation/nixl/conn.py +17 -12
- sglang/srt/disaggregation/prefill.py +144 -55
- sglang/srt/disaggregation/utils.py +155 -123
- sglang/srt/distributed/parallel_state.py +12 -4
- sglang/srt/entrypoints/engine.py +37 -29
- sglang/srt/entrypoints/http_server.py +153 -72
- sglang/srt/entrypoints/http_server_engine.py +0 -3
- sglang/srt/entrypoints/openai/__init__.py +0 -0
- sglang/srt/{openai_api → entrypoints/openai}/protocol.py +84 -10
- sglang/srt/entrypoints/openai/serving_base.py +149 -0
- sglang/srt/entrypoints/openai/serving_chat.py +921 -0
- sglang/srt/entrypoints/openai/serving_completions.py +424 -0
- sglang/srt/entrypoints/openai/serving_embedding.py +169 -0
- sglang/srt/entrypoints/openai/serving_rerank.py +102 -0
- sglang/srt/entrypoints/openai/serving_score.py +61 -0
- sglang/srt/entrypoints/openai/usage_processor.py +81 -0
- sglang/srt/entrypoints/openai/utils.py +72 -0
- sglang/srt/eplb_simulator/__init__.py +1 -0
- sglang/srt/eplb_simulator/reader.py +51 -0
- sglang/srt/function_call/base_format_detector.py +7 -4
- sglang/srt/function_call/deepseekv3_detector.py +1 -1
- sglang/srt/function_call/ebnf_composer.py +64 -10
- sglang/srt/function_call/function_call_parser.py +6 -6
- sglang/srt/function_call/llama32_detector.py +1 -1
- sglang/srt/function_call/mistral_detector.py +1 -1
- sglang/srt/function_call/pythonic_detector.py +1 -1
- sglang/srt/function_call/qwen25_detector.py +1 -1
- sglang/srt/{openai_api/utils.py → jinja_template_utils.py} +6 -5
- sglang/srt/layers/activation.py +40 -3
- sglang/srt/layers/attention/aiter_backend.py +20 -4
- sglang/srt/layers/attention/base_attn_backend.py +1 -1
- sglang/srt/layers/attention/cutlass_mla_backend.py +39 -15
- sglang/srt/layers/attention/flashattention_backend.py +71 -72
- sglang/srt/layers/attention/flashinfer_backend.py +10 -8
- sglang/srt/layers/attention/flashinfer_mla_backend.py +29 -28
- sglang/srt/layers/attention/flashmla_backend.py +7 -12
- sglang/srt/layers/attention/tbo_backend.py +3 -3
- sglang/srt/layers/attention/triton_backend.py +138 -130
- sglang/srt/layers/attention/triton_ops/decode_attention.py +2 -7
- sglang/srt/layers/attention/vision.py +51 -24
- sglang/srt/layers/communicator.py +28 -10
- sglang/srt/layers/dp_attention.py +11 -2
- sglang/srt/layers/layernorm.py +29 -2
- sglang/srt/layers/linear.py +0 -4
- sglang/srt/layers/logits_processor.py +2 -14
- sglang/srt/layers/moe/ep_moe/kernels.py +165 -7
- sglang/srt/layers/moe/ep_moe/layer.py +249 -33
- sglang/srt/layers/moe/ep_moe/token_dispatcher.py +11 -37
- sglang/srt/layers/moe/fused_moe_triton/configs/triton_3_3_1/E=128,N=384,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8,block_shape=[128, 128].json +146 -0
- sglang/srt/layers/moe/fused_moe_triton/fused_moe.py +7 -4
- sglang/srt/layers/moe/fused_moe_triton/layer.py +75 -12
- sglang/srt/layers/moe/topk.py +107 -12
- sglang/srt/layers/pooler.py +56 -0
- sglang/srt/layers/quantization/compressed_tensors/compressed_tensors_moe.py +6 -2
- sglang/srt/layers/quantization/deep_gemm_wrapper/__init__.py +1 -0
- sglang/srt/layers/quantization/{deep_gemm.py → deep_gemm_wrapper/compile_utils.py} +23 -80
- sglang/srt/layers/quantization/deep_gemm_wrapper/configurer.py +32 -0
- sglang/srt/layers/quantization/deep_gemm_wrapper/entrypoint.py +110 -0
- sglang/srt/layers/quantization/fp8.py +25 -17
- sglang/srt/layers/quantization/fp8_kernel.py +44 -15
- sglang/srt/layers/quantization/fp8_utils.py +87 -22
- sglang/srt/layers/quantization/modelopt_quant.py +62 -8
- sglang/srt/layers/quantization/utils.py +5 -2
- sglang/srt/layers/radix_attention.py +2 -3
- sglang/srt/layers/rotary_embedding.py +42 -2
- sglang/srt/layers/sampler.py +1 -1
- sglang/srt/lora/lora_manager.py +249 -105
- sglang/srt/lora/mem_pool.py +53 -50
- sglang/srt/lora/utils.py +1 -1
- sglang/srt/managers/cache_controller.py +33 -14
- sglang/srt/managers/io_struct.py +31 -10
- sglang/srt/managers/multimodal_processors/base_processor.py +2 -2
- sglang/srt/managers/multimodal_processors/vila.py +85 -0
- sglang/srt/managers/schedule_batch.py +79 -37
- sglang/srt/managers/schedule_policy.py +70 -56
- sglang/srt/managers/scheduler.py +220 -79
- sglang/srt/managers/template_manager.py +226 -0
- sglang/srt/managers/tokenizer_manager.py +40 -10
- sglang/srt/managers/tp_worker.py +12 -2
- sglang/srt/managers/tp_worker_overlap_thread.py +11 -0
- sglang/srt/mem_cache/{paged_allocator.py → allocator.py} +125 -34
- sglang/srt/mem_cache/base_prefix_cache.py +52 -8
- sglang/srt/mem_cache/chunk_cache.py +11 -15
- sglang/srt/mem_cache/hiradix_cache.py +38 -25
- sglang/srt/mem_cache/memory_pool.py +213 -505
- sglang/srt/mem_cache/memory_pool_host.py +380 -0
- sglang/srt/mem_cache/radix_cache.py +56 -28
- sglang/srt/model_executor/cuda_graph_runner.py +198 -100
- sglang/srt/model_executor/forward_batch_info.py +32 -10
- sglang/srt/model_executor/model_runner.py +28 -12
- sglang/srt/model_loader/loader.py +16 -2
- sglang/srt/model_loader/weight_utils.py +11 -2
- sglang/srt/models/bert.py +113 -13
- sglang/srt/models/deepseek_nextn.py +29 -27
- sglang/srt/models/deepseek_v2.py +213 -173
- sglang/srt/models/glm4.py +312 -0
- sglang/srt/models/internvl.py +46 -102
- sglang/srt/models/mimo_mtp.py +2 -18
- sglang/srt/models/roberta.py +117 -9
- sglang/srt/models/vila.py +305 -0
- sglang/srt/reasoning_parser.py +21 -11
- sglang/srt/sampling/sampling_batch_info.py +24 -0
- sglang/srt/sampling/sampling_params.py +2 -0
- sglang/srt/server_args.py +351 -238
- sglang/srt/speculative/build_eagle_tree.py +1 -1
- sglang/srt/speculative/eagle_draft_cuda_graph_runner.py +131 -9
- sglang/srt/speculative/eagle_draft_extend_cuda_graph_runner.py +130 -14
- sglang/srt/speculative/eagle_utils.py +468 -116
- sglang/srt/speculative/eagle_worker.py +258 -84
- sglang/srt/torch_memory_saver_adapter.py +19 -15
- sglang/srt/two_batch_overlap.py +4 -2
- sglang/srt/utils.py +235 -11
- sglang/test/attention/test_prefix_chunk_info.py +2 -0
- sglang/test/runners.py +38 -3
- sglang/test/test_block_fp8.py +1 -0
- sglang/test/test_block_fp8_deep_gemm_blackwell.py +252 -0
- sglang/test/test_block_fp8_ep.py +2 -0
- sglang/test/test_utils.py +4 -1
- sglang/utils.py +9 -0
- sglang/version.py +1 -1
- {sglang-0.4.7.dist-info → sglang-0.4.8.dist-info}/METADATA +8 -14
- {sglang-0.4.7.dist-info → sglang-0.4.8.dist-info}/RECORD +150 -128
- sglang/srt/entrypoints/verl_engine.py +0 -179
- sglang/srt/openai_api/adapter.py +0 -1990
- {sglang-0.4.7.dist-info → sglang-0.4.8.dist-info}/WHEEL +0 -0
- {sglang-0.4.7.dist-info → sglang-0.4.8.dist-info}/licenses/LICENSE +0 -0
- {sglang-0.4.7.dist-info → sglang-0.4.8.dist-info}/top_level.txt +0 -0
sglang/srt/server_args.py
CHANGED
@@ -28,7 +28,6 @@ from sglang.srt.utils import (
     configure_ipv6,
     get_device,
     get_device_memory_capacity,
-    is_cuda,
     is_flashinfer_available,
     is_hip,
     is_port_available,
@@ -91,6 +90,7 @@ class ServerArgs:
     download_dir: Optional[str] = None
     base_gpu_id: int = 0
     gpu_id_step: int = 1
+    sleep_on_idle: bool = False

     # Logging
     log_level: str = "info"
@@ -112,14 +112,12 @@ class ServerArgs:
     file_storage_path: str = "sglang_storage"
     enable_cache_report: bool = False
     reasoning_parser: Optional[str] = None
+    tool_call_parser: Optional[str] = None

     # Data parallelism
     dp_size: int = 1
     load_balance_method: str = "round_robin"

-    # Expert parallelism
-    ep_size: int = 1
-
     # Multi-node distributed serving
     dist_init_addr: Optional[str] = None
     nnodes: int = 1
@@ -138,6 +136,7 @@ class ServerArgs:
     attention_backend: Optional[str] = None
     sampling_backend: Optional[str] = None
     grammar_backend: Optional[str] = None
+    mm_attention_backend: Optional[str] = None

     # Speculative decoding
     speculative_algorithm: Optional[str] = None
@@ -149,6 +148,27 @@ class ServerArgs:
     speculative_accept_threshold_acc: float = 1.0
     speculative_token_map: Optional[str] = None

+    # Expert parallelism
+    ep_size: int = 1
+    enable_ep_moe: bool = False
+    enable_deepep_moe: bool = False
+    enable_flashinfer_moe: bool = False
+    deepep_mode: Optional[Literal["auto", "normal", "low_latency"]] = "auto"
+    ep_num_redundant_experts: int = 0
+    ep_dispatch_algorithm: Optional[Literal["static", "dynamic", "fake"]] = None
+    init_expert_location: str = "trivial"
+    enable_eplb: bool = False
+    eplb_algorithm: str = "auto"
+    eplb_rebalance_num_iterations: int = 1000
+    eplb_rebalance_layers_per_chunk: Optional[int] = None
+    expert_distribution_recorder_mode: Optional[
+        Literal["stat", "stat_approx", "per_pass", "per_token"]
+    ] = None
+    expert_distribution_recorder_buffer_size: Optional[int] = None
+    enable_expert_distribution_metrics: bool = False
+    deepep_config: Optional[str] = None
+    moe_dense_tp_size: Optional[int] = None
+
     # Double Sparsity
     enable_double_sparsity: bool = False
     ds_channel_config_path: Optional[str] = None
@@ -159,38 +179,24 @@ class ServerArgs:

     # Optimization/debug options
     disable_radix_cache: bool = False
+    cuda_graph_max_bs: Optional[int] = None
+    cuda_graph_bs: Optional[List[int]] = None
     disable_cuda_graph: bool = False
     disable_cuda_graph_padding: bool = False
+    enable_profile_cuda_graph: bool = False
     enable_nccl_nvls: bool = False
     enable_tokenizer_batch_encode: bool = False
     disable_outlines_disk_cache: bool = False
     disable_custom_all_reduce: bool = False
     enable_mscclpp: bool = False
     disable_overlap_schedule: bool = False
+    disable_overlap_cg_plan: bool = False
     enable_mixed_chunk: bool = False
     enable_dp_attention: bool = False
     enable_dp_lm_head: bool = False
     enable_two_batch_overlap: bool = False
-    enable_ep_moe: bool = False
-    enable_deepep_moe: bool = False
-    deepep_mode: Optional[Literal["auto", "normal", "low_latency"]] = "auto"
-    ep_num_redundant_experts: int = 0
-    ep_dispatch_algorithm: Optional[Literal["static", "dynamic", "fake"]] = None
-    init_expert_location: str = "trivial"
-    enable_eplb: bool = False
-    eplb_algorithm: str = "auto"
-    eplb_rebalance_num_iterations: int = 1000
-    eplb_rebalance_layers_per_chunk: Optional[int] = None
-    expert_distribution_recorder_mode: Optional[
-        Literal["stat", "stat_approx", "per_pass", "per_token"]
-    ] = None
-    expert_distribution_recorder_buffer_size: Optional[int] = None
-    enable_expert_distribution_metrics: bool = False
-    deepep_config: Optional[str] = None
     enable_torch_compile: bool = False
     torch_compile_max_bs: int = 32
-    cuda_graph_max_bs: Optional[int] = None
-    cuda_graph_bs: Optional[List[int]] = None
     torchao_config: str = ""
     enable_nan_detection: bool = False
     enable_p2p_check: bool = False
@@ -201,31 +207,38 @@ class ServerArgs:
     enable_memory_saver: bool = False
     allow_auto_truncate: bool = False
     enable_custom_logit_processor: bool = False
-    tool_call_parser: Optional[str] = None
     enable_hierarchical_cache: bool = False
     hicache_ratio: float = 2.0
     hicache_size: int = 0
     hicache_write_policy: str = "write_through_selective"
     flashinfer_mla_disable_ragged: bool = False
-    warmups: Optional[str] = None
-    moe_dense_tp_size: Optional[int] = None
     disable_shared_experts_fusion: bool = False
     disable_chunked_prefix_cache: bool = False
     disable_fast_image_processor: bool = False
-
+    enable_return_hidden_states: bool = False
+    warmups: Optional[str] = None

     # Debug tensor dumps
     debug_tensor_dump_output_folder: Optional[str] = None
     debug_tensor_dump_input_file: Optional[str] = None
     debug_tensor_dump_inject: bool = False
+    debug_tensor_dump_prefill_only: bool = False

     # For PD disaggregation: can be "null" (not disaggregated), "prefill" (prefill-only), or "decode" (decode-only)
     disaggregation_mode: str = "null"
-    disaggregation_bootstrap_port: int = 8998
     disaggregation_transfer_backend: str = "mooncake"
+    disaggregation_bootstrap_port: int = 8998
+    disaggregation_decode_tp: Optional[int] = None
+    disaggregation_decode_dp: Optional[int] = None
+    disaggregation_prefill_pp: Optional[int] = 1
     disaggregation_ib_device: Optional[str] = None
+    num_reserved_decode_tokens: int = 512  # used for decode kv cache offload in PD
     pdlb_url: Optional[str] = None

+    # For model weight update
+    custom_weight_loader: Optional[List[str]] = None
+    weight_loader_disable_mmap: bool = False
+
     def __post_init__(self):
         # Expert parallelism
         if self.enable_ep_moe:
@@ -233,7 +246,15 @@ class ServerArgs:
             logger.warning(
                 f"EP MoE is enabled. The expert parallel size is adjusted to be the same as the tensor parallel size[{self.tp_size}]."
             )
-
+        if self.enable_flashinfer_moe:
+            assert (
+                self.quantization == "modelopt_fp4"
+            ), "modelopt_fp4 quantization is required for Flashinfer MOE"
+            os.environ["TRTLLM_ENABLE_PDL"] = "1"
+            self.disable_shared_experts_fusion = True
+            logger.warning(
+                f"Flashinfer MoE is enabled. Shared expert fusion is disabled."
+            )
         # Set missing default values
         if self.tokenizer_path is None:
             self.tokenizer_path = self.model_path
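For reference, the new enable_flashinfer_moe branch can be read as a standalone guard. A minimal sketch of its behavior (illustrative only, mirroring the hunk above rather than any sglang API):

import os

def check_flashinfer_moe(quantization: str) -> None:
    # The FlashInfer CUTLASS MoE path only supports modelopt_fp4 quantization.
    assert (
        quantization == "modelopt_fp4"
    ), "modelopt_fp4 quantization is required for Flashinfer MOE"
    # Side effects in the real branch: programmatic dependent launch is enabled
    # for the TRT-LLM kernels, and shared expert fusion is force-disabled.
    os.environ["TRTLLM_ENABLE_PDL"] = "1"

check_flashinfer_moe("modelopt_fp4")  # passes; any other value raises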
@@ -249,51 +270,72 @@ class ServerArgs:

         gpu_mem = get_device_memory_capacity(self.device)

-        # Set mem fraction static
+        # Set mem fraction static
         if self.mem_fraction_static is None:
-            …
+            if gpu_mem is not None:
+                # GPU memory capacity = model weights + KV cache pool + activations + cuda graph buffers
+                # mem_fraction_static = (model weights + KV cache pool) / GPU memory capacity.
+
+                # We want mem_fraction_static to be as large as possible but still has enough room
+                # for activations and cuda graph buffers. We use the following heuristic to
+                # compute the needed size for activations and cuda graph buffers:
+                # - The size of the activation depends on the chunked_prefill_size and model size.
+                # - The size of cuda graph buffers depends on the cuda graph capture range and model size.
+                # For GPUs with more memory, we use a larger chunked_prefill_size and
+                # capture more cuda graphs, so they need to reserve more memory.
+                parallel_size = self.tp_size * self.pp_size
+
+                if gpu_mem < 20 * 1024:
+                    # T4, 4080. (chunked_prefill_size 2k, cuda_graph_max_bs 8)
+                    reserved_mem = (2.8 + parallel_size / 10) * 1024
+                elif gpu_mem < 35 * 1024:
+                    # A10, L40, 4090, 5090. (chunked_prefill_size 2k, cuda_graph_max_bs 8)
+                    reserved_mem = (2.8 + parallel_size / 10) * 1024
+                elif gpu_mem < 90 * 1024:
+                    # H100, A100. (chunked_prefill_size 8k, cuda_graph_max_bs 160)
+                    reserved_mem = (9.5 + parallel_size / 2) * 1024
+                elif gpu_mem < 100 * 1024:
+                    # H20. (chunked_prefill_size 8k, cuda_graph_max_bs 256)
+                    reserved_mem = (12 + parallel_size / 2) * 1024
+                elif gpu_mem < 160 * 1024:
+                    # H200. (chunked_prefill_size 8k, cuda_graph_max_bs 256)
+                    reserved_mem = (12 + parallel_size / 2) * 1024
                 else:
-            …
-            if gpu_mem is not None and gpu_mem > 180 * 1000 and is_cuda():
-                self.mem_fraction_static = 0.79
-            elif gpu_mem is not None and gpu_mem > 96 * 1024:
-                mem_fraction = self.mem_fraction_static
-                # 15 GB + additional 3GB for cuda graph
-                reserve_mem = 1024 * 18
-                # need reserve more memory for spec cuda graph
+                    # B200, MI300. (chunked_prefill_size 16k, cuda_graph_max_bs 512)
+                    reserved_mem = 32 * 1024
+
                 if self.speculative_algorithm is not None:
-            …
+                    # draft model and larger cuda graph buffers
+                    reserved_mem += 2 * 1024
+                if self.enable_dp_attention:
+                    reserved_mem += 4 * 1024
+
+                self.mem_fraction_static = round((gpu_mem - reserved_mem) / gpu_mem, 3)
             else:
-
-                self.mem_fraction_static *= 0.95
+                self.mem_fraction_static = 0.88

         # Set chunked prefill size, which depends on the gpu memory capacity
         if self.chunked_prefill_size is None:
-            if gpu_mem is not None
-            …
+            if gpu_mem is not None:
+                if gpu_mem < 35 * 1024:  # A10, L40, 4090
+                    self.chunked_prefill_size = 2048
+                elif gpu_mem < 160 * 1024:  # H100, H200, A100, H20
+                    self.chunked_prefill_size = 8192
+                else:  # B200, MI300
+                    self.chunked_prefill_size = 16384
             else:
-                self.chunked_prefill_size =
+                self.chunked_prefill_size = 4096
         assert self.chunked_prefill_size % self.page_size == 0

+        # Set cuda graph max batch size
+        if self.cuda_graph_max_bs is None:
+            # Based on detailed statistics, when serving TP1/TP2 models on lower-end GPUs with HBM<25G, you can either disable cuda graph or set `cuda_graph_max_bs` to a very small value to reduce the memory overhead of creating cuda graphs, with almost no impact on performance. However, when serving models with TP4 or TP8, we need to enable cuda graph to maintain high performance. In this case, we can set `cuda_graph_max_bs` to 80 (half of the default value 160) to reduce the memory overhead of creating cuda graphs. Looking at the logs from TP4 serving of qwen2-72b, a value of 80 is sufficient and can reduce the memory overhead of creating cuda graphs on lower-end GPUs compared to the original 160, avoiding OOM issues.
+            if gpu_mem is not None and gpu_mem < 35 * 1024:
+                if self.tp_size < 4:
+                    self.cuda_graph_max_bs = 8
+                else:
+                    self.cuda_graph_max_bs = 80
+
         assert self.moe_dense_tp_size in {
             1,
             None,
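The relocated mem_fraction_static logic is now a pure function of GPU memory and parallelism, so it can be sanity-checked outside the dataclass. A standalone sketch (values in MB, matching the 1024-based thresholds in the hunk; the speculative and dp_attention arguments mirror the two surcharges):

def estimate_mem_fraction_static(gpu_mem_mb, tp_size=1, pp_size=1,
                                 speculative=False, dp_attention=False):
    parallel_size = tp_size * pp_size
    if gpu_mem_mb < 20 * 1024:        # T4, 4080
        reserved = (2.8 + parallel_size / 10) * 1024
    elif gpu_mem_mb < 35 * 1024:      # A10, L40, 4090, 5090
        reserved = (2.8 + parallel_size / 10) * 1024
    elif gpu_mem_mb < 90 * 1024:      # H100, A100
        reserved = (9.5 + parallel_size / 2) * 1024
    elif gpu_mem_mb < 100 * 1024:     # H20
        reserved = (12 + parallel_size / 2) * 1024
    elif gpu_mem_mb < 160 * 1024:     # H200
        reserved = (12 + parallel_size / 2) * 1024
    else:                             # B200, MI300
        reserved = 32 * 1024
    if speculative:                   # draft model + larger cuda graph buffers
        reserved += 2 * 1024
    if dp_attention:
        reserved += 4 * 1024
    return round((gpu_mem_mb - reserved) / gpu_mem_mb, 3)

# e.g. a single 80 GB H100 (~81920 MB) at TP=8:
# reserved = (9.5 + 4) * 1024 = 13824 MB -> mem_fraction_static = 0.831
print(estimate_mem_fraction_static(80 * 1024, tp_size=8))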
@@ -311,15 +353,6 @@ class ServerArgs:
             )
             self.page_size = 128

-        # Set cuda graph max batch size
-        if self.cuda_graph_max_bs is None:
-            # Based on detailed statistics, when serving TP1/TP2 models on lower-end GPUs with HBM<25G, you can either disable cuda graph or set `cuda_graph_max_bs` to a very small value to reduce the memory overhead of creating cuda graphs, with almost no impact on performance. However, when serving models with TP4 or TP8, we need to enable cuda graph to maintain high performance. In this case, we can set `cuda_graph_max_bs` to 80 (half of the default value 160) to reduce the memory overhead of creating cuda graphs. Looking at the logs from TP4 serving of qwen2-72b, a value of 80 is sufficient and can reduce the memory overhead of creating cuda graphs on lower-end GPUs compared to the original 160, avoiding OOM issues.
-            if gpu_mem is not None and gpu_mem < 25_000:
-                if self.tp_size < 4:
-                    self.cuda_graph_max_bs = 8
-                else:
-                    self.cuda_graph_max_bs = 80
-
         # Set kernel backends for hpu device
         if self.device == "hpu":
             self.attention_backend = "torch_native"
@@ -364,7 +397,6 @@ class ServerArgs:
             ), "Please enable dp attention when setting enable_dp_attention. "

         # DeepEP MoE
-        self.enable_sp_layernorm = False
         if self.enable_deepep_moe:
             if self.deepep_mode == "auto":
                 assert (
@@ -374,9 +406,6 @@ class ServerArgs:
                 logger.warning("Cuda graph is disabled because deepep_mode=`normal`")
                 self.disable_cuda_graph = True
             self.ep_size = self.tp_size
-            self.enable_sp_layernorm = (
-                self.dp_size < self.tp_size if self.enable_dp_attention else True
-            )
             logger.warning(
                 f"DeepEP MoE is enabled. The expert parallel size is adjusted to be the same as the tensor parallel size[{self.tp_size}]."
             )
@@ -390,7 +419,7 @@ class ServerArgs:
         if self.enable_eplb and (self.expert_distribution_recorder_mode is None):
             self.expert_distribution_recorder_mode = "stat"
             logger.info(
-                …
+                "EPLB is enabled. The expert_distribution_recorder_mode is automatically set."
             )

         if (self.enable_eplb or (self.init_expert_location is not None)) and (
@@ -398,7 +427,7 @@ class ServerArgs:
         ):
             self.ep_dispatch_algorithm = "static"
             logger.info(
-                …
+                "EPLB is enabled or init_expert_location is provided. ep_dispatch_algorithm is configured."
             )

         if self.enable_expert_distribution_metrics and (
@@ -488,12 +517,27 @@ class ServerArgs:
             self.triton_attention_num_kv_splits = 16

         # PD disaggregation
-        if self.disaggregation_mode == "decode":
-            …
+        if self.disaggregation_mode == "decode":
+            assert (
+                self.disaggregation_decode_tp is None
+            ), "Cannot set --disaggregation-decode-tp for the decode engine."
+            assert (
+                self.disaggregation_decode_dp is None
+            ), "Cannot set --disaggregation-decode-dp for the decode engine."
+
             self.disable_radix_cache = True
             logger.warning("KV cache is forced as chunk cache for decode server")
+        elif self.disaggregation_mode == "prefill":
+            if self.disaggregation_decode_tp is None:
+                self.disaggregation_decode_tp = self.tp_size
+            if self.disaggregation_decode_dp is None:
+                self.disaggregation_decode_dp = self.dp_size
+
+            self.disaggregation_prefill_pp = self.pp_size
+            self.validate_disagg_tp_size(self.tp_size, self.disaggregation_decode_tp)
+
+            self.disable_cuda_graph = True
+            logger.warning("Cuda graph is disabled for prefill server")

         os.environ["SGLANG_ENABLE_TORCH_COMPILE"] = (
             "1" if self.enable_torch_compile else "0"
@@ -503,6 +547,17 @@ class ServerArgs:
             "1" if self.disable_outlines_disk_cache else "0"
         )

+        if self.custom_weight_loader is None:
+            self.custom_weight_loader = []
+
+    def validate_disagg_tp_size(self, prefill_tp: int, decode_tp: int):
+        larger_tp = max(decode_tp, prefill_tp)
+        smaller_tp = min(decode_tp, prefill_tp)
+        assert larger_tp % smaller_tp == 0, (
+            "Different tp size is supported only when one tp is multiple of the other. "
+            f"decode_tp={decode_tp}, prefill_tp={prefill_tp}"
+        )
+
     @staticmethod
     def add_cli_args(parser: argparse.ArgumentParser):
         # Model and port args
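The new validate_disagg_tp_size helper only requires that one tp size divide the other, in either direction. Copied out of the hunk above for a quick standalone check:

def validate_disagg_tp_size(prefill_tp, decode_tp):
    larger_tp = max(decode_tp, prefill_tp)
    smaller_tp = min(decode_tp, prefill_tp)
    assert larger_tp % smaller_tp == 0, (
        "Different tp size is supported only when one tp is multiple of the other. "
        f"decode_tp={decode_tp}, prefill_tp={prefill_tp}"
    )

validate_disagg_tp_size(8, 2)    # ok: 8 is a multiple of 2
validate_disagg_tp_size(2, 8)    # ok: divisibility is checked both ways
# validate_disagg_tp_size(4, 6)  # AssertionError: 6 % 4 != 0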
@@ -519,10 +574,16 @@ class ServerArgs:
             help="The path of the tokenizer.",
         )
         parser.add_argument(
-            "--host",
+            "--host",
+            type=str,
+            default=ServerArgs.host,
+            help="The host of the HTTP server.",
         )
         parser.add_argument(
-            "--port",
+            "--port",
+            type=int,
+            default=ServerArgs.port,
+            help="The port of the HTTP server.",
         )
         parser.add_argument(
             "--tokenizer-mode",
@@ -677,6 +738,18 @@ class ServerArgs:
             "name, a tag name, or a commit id. If unspecified, will use "
             "the default version.",
         )
+        parser.add_argument(
+            "--impl",
+            type=str,
+            default=ServerArgs.impl,
+            help="Which implementation of the model to use.\n\n"
+            '* "auto" will try to use the SGLang implementation if it exists '
+            "and fall back to the Transformers implementation if no SGLang "
+            "implementation is available.\n"
+            '* "sglang" will use the SGLang model implementation.\n'
+            '* "transformers" will use the Transformers model '
+            "implementation.\n",
+        )

         # Memory and scheduling
         parser.add_argument(
@@ -735,18 +808,6 @@ class ServerArgs:
             default=ServerArgs.page_size,
             help="The number of tokens in a page.",
         )
-        parser.add_argument(
-            "--impl",
-            type=str,
-            default=ServerArgs.impl,
-            help="Which implementation of the model to use.\n\n"
-            '* "auto" will try to use the SGLang implementation if it exists '
-            "and fall back to the Transformers implementation if no SGLang "
-            "implementation is available.\n"
-            '* "sglang" will use the SGLang model implementation.\n'
-            '* "transformers" will use the Transformers model '
-            "implementation.\n",
-        )

         # Other runtime options
         parser.add_argument(
@@ -822,6 +883,11 @@ class ServerArgs:
             default=ServerArgs.gpu_id_step,
             help="The delta between consecutive GPU IDs that are used. For example, setting it to 2 will use GPU 0,2,4,...",
         )
+        parser.add_argument(
+            "--sleep-on-idle",
+            action="store_true",
+            help="Reduce CPU usage when sglang is idle.",
+        )

         # Logging
         parser.add_argument(
@@ -929,6 +995,13 @@ class ServerArgs:
             default=ServerArgs.reasoning_parser,
             help=f"Specify the parser for reasoning models, supported parsers are: {list(ReasoningParser.DetectorMap.keys())}.",
         )
+        parser.add_argument(
+            "--tool-call-parser",
+            type=str,
+            choices=["qwen25", "mistral", "llama3", "deepseekv3", "pythonic"],
+            default=ServerArgs.tool_call_parser,
+            help="Specify the parser for handling tool-call interactions. Options include: 'qwen25', 'mistral', 'llama3', 'deepseekv3', and 'pythonic'.",
+        )

         # Data parallelism
         parser.add_argument(
@@ -949,15 +1022,6 @@ class ServerArgs:
             ],
         )

-        # Expert parallelism
-        parser.add_argument(
-            "--expert-parallel-size",
-            "--ep-size",
-            type=int,
-            default=ServerArgs.ep_size,
-            help="The expert parallelism size.",
-        )
-
         # Multi-node distributed serving
         parser.add_argument(
             "--dist-init-addr",
@@ -1038,21 +1102,6 @@ class ServerArgs:
             default=ServerArgs.grammar_backend,
             help="Choose the backend for grammar-guided decoding.",
         )
-        parser.add_argument(
-            "--enable-flashinfer-mla",
-            action=DeprecatedAction,
-            help="--enable-flashinfer-mla is deprecated. Please use '--attention-backend flashinfer' instead.",
-        )
-        parser.add_argument(
-            "--enable-flashmla",
-            action=DeprecatedAction,
-            help="--enable-flashmla is deprecated. Please use '--attention-backend flashmla' instead.",
-        )
-        parser.add_argument(
-            "--flashinfer-mla-disable-ragged",
-            action="store_true",
-            help="Not using ragged prefill wrapper when running flashinfer mla",
-        )

         # Speculative decoding
         parser.add_argument(
@@ -1102,6 +1151,114 @@ class ServerArgs:
             help="The path of the draft model's small vocab table.",
             default=ServerArgs.speculative_token_map,
         )
+        parser.add_argument(
+            "--mm-attention-backend",
+            type=str,
+            choices=["sdpa", "fa3", "triton_attn"],
+            default=ServerArgs.mm_attention_backend,
+            help="Set multimodal attention backend.",
+        )
+
+        # Expert parallelism
+        parser.add_argument(
+            "--expert-parallel-size",
+            "--ep-size",
+            type=int,
+            default=ServerArgs.ep_size,
+            help="The expert parallelism size.",
+        )
+        parser.add_argument(
+            "--enable-ep-moe",
+            action="store_true",
+            help="Enabling expert parallelism for moe. The ep size is equal to the tp size.",
+        )
+        parser.add_argument(
+            "--enable-flashinfer-moe",
+            action="store_true",
+            help="Enable FlashInfer CUTLASS MoE backend for modelopt_fp4 quant on Blackwell. Supports MoE-EP with --enable-ep-moe",
+        )
+        parser.add_argument(
+            "--enable-deepep-moe",
+            action="store_true",
+            help="Enabling DeepEP MoE implementation for EP MoE.",
+        )
+        parser.add_argument(
+            "--deepep-mode",
+            type=str,
+            choices=["normal", "low_latency", "auto"],
+            default="auto",
+            help="Select the mode when enable DeepEP MoE, could be `normal`, `low_latency` or `auto`. Default is `auto`, which means `low_latency` for decode batch and `normal` for prefill batch.",
+        )
+        parser.add_argument(
+            "--ep-num-redundant-experts",
+            type=int,
+            default=ServerArgs.ep_num_redundant_experts,
+            help="Allocate this number of redundant experts in expert parallel.",
+        )
+        parser.add_argument(
+            "--ep-dispatch-algorithm",
+            type=str,
+            default=ServerArgs.ep_dispatch_algorithm,
+            help="The algorithm to choose ranks for redundant experts in expert parallel.",
+        )
+        parser.add_argument(
+            "--init-expert-location",
+            type=str,
+            default=ServerArgs.init_expert_location,
+            help="Initial location of EP experts.",
+        )
+        parser.add_argument(
+            "--enable-eplb",
+            action="store_true",
+            help="Enable EPLB algorithm",
+        )
+        parser.add_argument(
+            "--eplb-algorithm",
+            type=str,
+            default=ServerArgs.eplb_algorithm,
+            help="Chosen EPLB algorithm",
+        )
+        parser.add_argument(
+            "--eplb-rebalance-num-iterations",
+            type=int,
+            default=ServerArgs.eplb_rebalance_num_iterations,
+            help="Number of iterations to automatically trigger a EPLB re-balance.",
+        )
+        parser.add_argument(
+            "--eplb-rebalance-layers-per-chunk",
+            type=int,
+            default=ServerArgs.eplb_rebalance_layers_per_chunk,
+            help="Number of layers to rebalance per forward pass.",
+        )
+        parser.add_argument(
+            "--expert-distribution-recorder-mode",
+            type=str,
+            default=ServerArgs.expert_distribution_recorder_mode,
+            help="Mode of expert distribution recorder.",
+        )
+        parser.add_argument(
+            "--expert-distribution-recorder-buffer-size",
+            type=int,
+            default=ServerArgs.expert_distribution_recorder_buffer_size,
+            help="Circular buffer size of expert distribution recorder. Set to -1 to denote infinite buffer.",
+        )
+        parser.add_argument(
+            "--enable-expert-distribution-metrics",
+            action="store_true",
+            help="Enable logging metrics for expert balancedness",
+        )
+        parser.add_argument(
+            "--deepep-config",
+            type=str,
+            default=ServerArgs.deepep_config,
+            help="Tuned DeepEP config suitable for your own cluster. It can be either a string with JSON content or a file path.",
+        )
+        parser.add_argument(
+            "--moe-dense-tp-size",
+            type=int,
+            default=ServerArgs.moe_dense_tp_size,
+            help="TP size for MoE dense MLP layers. This flag is useful when, with large TP size, there are errors caused by weights in MLP layers having dimension smaller than the min dimension GEMM supports.",
+        )

         # Double Sparsity
         parser.add_argument(
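Combined with the __post_init__ hunks earlier in this file, these relocated flags compose as in the following sketch (not executed here; it assumes a CUDA machine with eight GPUs, and the model path is only a placeholder):

from sglang.srt.server_args import ServerArgs

args = ServerArgs(
    model_path="deepseek-ai/DeepSeek-V2-Lite",  # placeholder model
    tp_size=8,
    enable_deepep_moe=True,      # __post_init__ sets ep_size = tp_size
    deepep_mode="low_latency",   # deepep_mode="normal" would also disable cuda graph
    ep_num_redundant_experts=16,
    enable_eplb=True,            # per the hunks above, this auto-selects
)                                # expert_distribution_recorder_mode="stat"
assert args.ep_size == args.tp_size == 8
assert args.expert_distribution_recorder_mode == "stat"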
@@ -1146,6 +1303,18 @@ class ServerArgs:
             action="store_true",
             help="Disable RadixAttention for prefix caching.",
         )
+        parser.add_argument(
+            "--cuda-graph-max-bs",
+            type=int,
+            default=ServerArgs.cuda_graph_max_bs,
+            help="Set the maximum batch size for cuda graph. It will extend the cuda graph capture batch size to this value.",
+        )
+        parser.add_argument(
+            "--cuda-graph-bs",
+            type=int,
+            nargs="+",
+            help="Set the list of batch sizes for cuda graph.",
+        )
         parser.add_argument(
             "--disable-cuda-graph",
             action="store_true",
@@ -1156,6 +1325,11 @@ class ServerArgs:
             action="store_true",
             help="Disable cuda graph when padding is needed. Still uses cuda graph when padding is not needed.",
         )
+        parser.add_argument(
+            "--enable-profile-cuda-graph",
+            action="store_true",
+            help="Enable profiling of cuda graph capture.",
+        )
         parser.add_argument(
             "--enable-nccl-nvls",
             action="store_true",
@@ -1186,6 +1360,11 @@ class ServerArgs:
             action="store_true",
             help="Disable the overlap scheduler, which overlaps the CPU scheduler with GPU model worker.",
         )
+        parser.add_argument(
+            "--disable-overlap-cg-plan",
+            action="store_true",
+            help="Disable the overlap optimization for cudagraph preparation in eagle verify.",
+        )
         parser.add_argument(
             "--enable-mixed-chunk",
             action="store_true",
@@ -1201,11 +1380,6 @@ class ServerArgs:
             action="store_true",
             help="Enable vocabulary parallel across the attention TP group to avoid all-gather across DP groups, optimizing performance under DP attention.",
         )
-        parser.add_argument(
-            "--enable-ep-moe",
-            action="store_true",
-            help="Enabling expert parallelism for moe. The ep size is equal to the tp size.",
-        )
         parser.add_argument(
             "--enable-two-batch-overlap",
             action="store_true",
@@ -1222,18 +1396,6 @@ class ServerArgs:
             default=ServerArgs.torch_compile_max_bs,
             help="Set the maximum batch size when using torch compile.",
         )
-        parser.add_argument(
-            "--cuda-graph-max-bs",
-            type=int,
-            default=ServerArgs.cuda_graph_max_bs,
-            help="Set the maximum batch size for cuda graph. It will extend the cuda graph capture batch size to this value.",
-        )
-        parser.add_argument(
-            "--cuda-graph-bs",
-            type=int,
-            nargs="+",
-            help="Set the list of batch sizes for cuda graph.",
-        )
         parser.add_argument(
             "--torchao-config",
             type=str,
@@ -1290,13 +1452,6 @@ class ServerArgs:
             action="store_true",
             help="Enable users to pass custom logit processors to the server (disabled by default for security)",
         )
-        parser.add_argument(
-            "--tool-call-parser",
-            type=str,
-            choices=["qwen25", "mistral", "llama3", "deepseekv3", "pythonic"],
-            default=ServerArgs.tool_call_parser,
-            help="Specify the parser for handling tool-call interactions. Options include: 'qwen25', 'mistral', 'llama3', 'deepseekv3', and 'pythonic'.",
-        )
         parser.add_argument(
             "--enable-hierarchical-cache",
             action="store_true",
@@ -1322,86 +1477,9 @@ class ServerArgs:
             help="The write policy of hierarchical cache.",
         )
         parser.add_argument(
-            "--enable-deepep-moe",
-            action="store_true",
-            help="Enabling DeepEP MoE implementation for EP MoE.",
-        )
-        parser.add_argument(
-            "--moe-dense-tp-size",
-            type=int,
-            default=ServerArgs.moe_dense_tp_size,
-            help="TP size for MoE dense MLP layers. This flag is useful when, with large TP size, there are errors caused by weights in MLP layers having dimension smaller than the min dimension GEMM supports.",
-        )
-        parser.add_argument(
-            "--deepep-mode",
-            type=str,
-            choices=["normal", "low_latency", "auto"],
-            default="auto",
-            help="Select the mode when enable DeepEP MoE, could be `normal`, `low_latency` or `auto`. Default is `auto`, which means `low_latency` for decode batch and `normal` for prefill batch.",
-        )
-        parser.add_argument(
-            "--ep-num-redundant-experts",
-            type=int,
-            default=ServerArgs.ep_num_redundant_experts,
-            help="Allocate this number of redundant experts in expert parallel.",
-        )
-        parser.add_argument(
-            "--ep-dispatch-algorithm",
-            type=str,
-            default=ServerArgs.ep_dispatch_algorithm,
-            help="The algorithm to choose ranks for redundant experts in expert parallel.",
-        )
-        parser.add_argument(
-            "--init-expert-location",
-            type=str,
-            default=ServerArgs.init_expert_location,
-            help="Initial location of EP experts.",
-        )
-        parser.add_argument(
-            "--enable-eplb",
-            action="store_true",
-            help="Enable EPLB algorithm",
-        )
-        parser.add_argument(
-            "--eplb-algorithm",
-            type=str,
-            default=ServerArgs.eplb_algorithm,
-            help="Chosen EPLB algorithm",
-        )
-        parser.add_argument(
-            "--eplb-rebalance-num-iterations",
-            type=int,
-            default=ServerArgs.eplb_rebalance_num_iterations,
-            help="Number of iterations to automatically trigger a EPLB re-balance.",
-        )
-        parser.add_argument(
-            "--eplb-rebalance-layers-per-chunk",
-            type=int,
-            default=ServerArgs.eplb_rebalance_layers_per_chunk,
-            help="Number of layers to rebalance per forward pass.",
-        )
-        parser.add_argument(
-            "--expert-distribution-recorder-mode",
-            type=str,
-            default=ServerArgs.expert_distribution_recorder_mode,
-            help="Mode of expert distribution recorder.",
-        )
-        parser.add_argument(
-            "--expert-distribution-recorder-buffer-size",
-            type=int,
-            default=ServerArgs.expert_distribution_recorder_buffer_size,
-            help="Circular buffer size of expert distribution recorder. Set to -1 to denote infinite buffer.",
-        )
-        parser.add_argument(
-            "--enable-expert-distribution-metrics",
+            "--flashinfer-mla-disable-ragged",
             action="store_true",
-            help="Enable logging metrics for expert balancedness",
-        )
-        parser.add_argument(
-            "--deepep-config",
-            type=str,
-            default=ServerArgs.deepep_config,
-            help="Tuned DeepEP config suitable for your own cluster. It can be either a string with JSON content or a file path.",
+            help="Not using ragged prefill wrapper when running flashinfer mla",
         )
         parser.add_argument(
             "--disable-shared-experts-fusion",
@@ -1418,8 +1496,11 @@ class ServerArgs:
             action="store_true",
             help="Adopt base image processor instead of fast image processor.",
         )
-
-
+        parser.add_argument(
+            "--enable-return-hidden-states",
+            action="store_true",
+            help="Enable returning hidden states with responses.",
+        )
         parser.add_argument(
             "--warmups",
             type=str,
@@ -1447,6 +1528,11 @@ class ServerArgs:
             default=ServerArgs.debug_tensor_dump_inject,
             help="Inject the outputs from jax as the input of every layer.",
         )
+        parser.add_argument(
+            "--debug-tensor-dump-prefill-only",
+            action="store_true",
+            help="Only dump the tensors for prefill requests (i.e. batch size > 1).",
+        )

         # Disaggregation
         parser.add_argument(
@@ -1456,6 +1542,13 @@ class ServerArgs:
             choices=["null", "prefill", "decode"],
             help='Only used for PD disaggregation. "prefill" for prefill-only server, and "decode" for decode-only server. If not specified, it is not PD disaggregated',
         )
+        parser.add_argument(
+            "--disaggregation-transfer-backend",
+            type=str,
+            default=ServerArgs.disaggregation_transfer_backend,
+            choices=["mooncake", "nixl"],
+            help="The backend for disaggregation transfer. Default is mooncake.",
+        )
         parser.add_argument(
             "--disaggregation-bootstrap-port",
             type=int,
@@ -1463,11 +1556,22 @@ class ServerArgs:
             help="Bootstrap server port on the prefill server. Default is 8998.",
         )
         parser.add_argument(
-            "--disaggregation-transfer-backend",
-            type=str,
-            default=ServerArgs.disaggregation_transfer_backend,
-            …
+            "--disaggregation-decode-tp",
+            type=int,
+            default=ServerArgs.disaggregation_decode_tp,
+            help="Decode tp size. If not set, it matches the tp size of the current engine. This is only set on the prefill server.",
+        )
+        parser.add_argument(
+            "--disaggregation-decode-dp",
+            type=int,
+            default=ServerArgs.disaggregation_decode_dp,
+            help="Decode dp size. If not set, it matches the dp size of the current engine. This is only set on the prefill server.",
+        )
+        parser.add_argument(
+            "--disaggregation-prefill-pp",
+            type=int,
+            default=ServerArgs.disaggregation_prefill_pp,
+            help="Prefill pp size. If not set, it is default to 1. This is only set on the decode server.",
         )
         parser.add_argument(
             "--disaggregation-ib-device",
@@ -1477,19 +1581,29 @@ class ServerArgs:
             "or multiple comma-separated devices (e.g., --disaggregation-ib-device mlx5_0,mlx5_1). "
             "Default is None, which triggers automatic device detection when mooncake backend is enabled.",
         )
+        parser.add_argument(
+            "--num-reserved-decode-tokens",
+            type=int,
+            default=ServerArgs.num_reserved_decode_tokens,
+            help="Number of decode tokens that will have memory reserved when adding new request to the running batch.",
+        )
         parser.add_argument(
             "--pdlb-url",
             type=str,
             default=None,
             help="The URL of the PD disaggregation load balancer. If set, the prefill/decode server will register with the load balancer.",
         )
-
         parser.add_argument(
-            "--
+            "--custom-weight-loader",
             type=str,
-
-            default=
-            help="
+            nargs="*",
+            default=None,
+            help="The custom dataloader which used to update the model. Should be set with a valid import path, such as my_package.weight_load_func",
+        )
+        parser.add_argument(
+            "--weight-loader-disable-mmap",
+            action="store_true",
+            help="Disable mmap while loading weight using safetensors.",
         )

     @classmethod
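The --custom-weight-loader help above expects a dotted import path such as my_package.weight_load_func. A minimal resolver for that convention (an illustration of the flag's contract, not sglang's actual loading code):

import importlib

def resolve_weight_loader(import_path: str):
    # "my_package.weight_load_func" -> import my_package, return weight_load_func
    module_name, _, func_name = import_path.rpartition(".")
    module = importlib.import_module(module_name)
    return getattr(module, func_name)

# loader = resolve_weight_loader("my_package.weight_load_func")  # hypothetical path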
@@ -1615,9 +1729,8 @@ class PortArgs:
         dist_init_host, dist_init_port = dist_init_addr
         port_base = int(dist_init_port) + 1
         if dp_rank is None:
-            scheduler_input_port = (
-                port_base + 3
-            )  # TokenizerManager to DataParallelController
+            # TokenizerManager to DataParallelController
+            scheduler_input_port = port_base + 3
         else:
             scheduler_input_port = port_base + 3 + 1 + dp_rank
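The PortArgs change collapses a multi-line assignment but keeps the same port layout: the scheduler input port sits 3 above port_base (itself dist_init_port + 1), shifted by a further 1 + dp_rank per data-parallel replica. A standalone sketch with a placeholder port:

def scheduler_input_port(dist_init_port: int, dp_rank=None) -> int:
    port_base = dist_init_port + 1
    if dp_rank is None:
        # TokenizerManager to DataParallelController
        return port_base + 3
    return port_base + 3 + 1 + dp_rank

assert scheduler_input_port(5000) == 5004             # port_base 5001 + 3
assert scheduler_input_port(5000, dp_rank=2) == 5007  # 5001 + 3 + 1 + 2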