sglang 0.4.4.post1__py3-none-any.whl → 0.4.4.post3__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (185)
  1. sglang/__init__.py +2 -0
  2. sglang/api.py +6 -0
  3. sglang/bench_one_batch.py +1 -1
  4. sglang/bench_one_batch_server.py +1 -1
  5. sglang/bench_serving.py +26 -4
  6. sglang/check_env.py +3 -4
  7. sglang/lang/backend/openai.py +18 -5
  8. sglang/lang/chat_template.py +28 -7
  9. sglang/lang/interpreter.py +7 -3
  10. sglang/lang/ir.py +10 -0
  11. sglang/srt/_custom_ops.py +1 -1
  12. sglang/srt/code_completion_parser.py +174 -0
  13. sglang/srt/configs/__init__.py +2 -6
  14. sglang/srt/configs/deepseekvl2.py +676 -0
  15. sglang/srt/configs/janus_pro.py +3 -4
  16. sglang/srt/configs/load_config.py +1 -0
  17. sglang/srt/configs/model_config.py +49 -8
  18. sglang/srt/configs/utils.py +25 -0
  19. sglang/srt/connector/__init__.py +51 -0
  20. sglang/srt/connector/base_connector.py +112 -0
  21. sglang/srt/connector/redis.py +85 -0
  22. sglang/srt/connector/s3.py +122 -0
  23. sglang/srt/connector/serde/__init__.py +31 -0
  24. sglang/srt/connector/serde/safe_serde.py +29 -0
  25. sglang/srt/connector/serde/serde.py +43 -0
  26. sglang/srt/connector/utils.py +35 -0
  27. sglang/srt/conversation.py +88 -0
  28. sglang/srt/disaggregation/conn.py +81 -0
  29. sglang/srt/disaggregation/decode.py +495 -0
  30. sglang/srt/disaggregation/mini_lb.py +285 -0
  31. sglang/srt/disaggregation/prefill.py +249 -0
  32. sglang/srt/disaggregation/utils.py +44 -0
  33. sglang/srt/distributed/device_communicators/custom_all_reduce.py +1 -1
  34. sglang/srt/distributed/parallel_state.py +42 -8
  35. sglang/srt/entrypoints/engine.py +55 -5
  36. sglang/srt/entrypoints/http_server.py +78 -13
  37. sglang/srt/entrypoints/verl_engine.py +2 -0
  38. sglang/srt/function_call_parser.py +133 -55
  39. sglang/srt/hf_transformers_utils.py +28 -3
  40. sglang/srt/layers/activation.py +4 -2
  41. sglang/srt/layers/attention/base_attn_backend.py +1 -1
  42. sglang/srt/layers/attention/flashattention_backend.py +434 -0
  43. sglang/srt/layers/attention/flashinfer_backend.py +1 -1
  44. sglang/srt/layers/attention/flashmla_backend.py +284 -0
  45. sglang/srt/layers/attention/triton_backend.py +171 -38
  46. sglang/srt/layers/attention/triton_ops/decode_attention.py +94 -31
  47. sglang/srt/layers/attention/triton_ops/extend_attention.py +14 -5
  48. sglang/srt/layers/attention/utils.py +53 -0
  49. sglang/srt/layers/attention/vision.py +9 -28
  50. sglang/srt/layers/dp_attention.py +41 -19
  51. sglang/srt/layers/layernorm.py +24 -2
  52. sglang/srt/layers/linear.py +17 -5
  53. sglang/srt/layers/logits_processor.py +25 -7
  54. sglang/srt/layers/moe/ep_moe/kernels.py +110 -11
  55. sglang/srt/layers/moe/ep_moe/layer.py +273 -1
  56. sglang/srt/layers/moe/ep_moe/token_dispatcher.py +416 -0
  57. sglang/srt/layers/moe/fused_moe_native.py +2 -1
  58. sglang/srt/layers/moe/fused_moe_triton/configs/E=256,N=64,device_name=NVIDIA_L20,dtype=int8_w8a8.json +146 -0
  59. sglang/srt/layers/moe/fused_moe_triton/configs/E=256,N=64,device_name=NVIDIA_L40S,dtype=int8_w8a8.json +146 -0
  60. sglang/srt/layers/moe/fused_moe_triton/configs/E=64,N=1024,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8,block_shape=[128, 128].json +146 -0
  61. sglang/srt/layers/moe/fused_moe_triton/configs/E=64,N=512,device_name=NVIDIA_H200,dtype=fp8_w8a8,block_shape=[128, 128].json +146 -0
  62. sglang/srt/layers/moe/fused_moe_triton/fused_moe.py +23 -32
  63. sglang/srt/layers/moe/fused_moe_triton/layer.py +1 -2
  64. sglang/srt/layers/moe/topk.py +60 -20
  65. sglang/srt/layers/parameter.py +1 -1
  66. sglang/srt/layers/quantization/__init__.py +80 -53
  67. sglang/srt/layers/quantization/awq.py +200 -0
  68. sglang/srt/layers/quantization/base_config.py +5 -0
  69. sglang/srt/layers/quantization/blockwise_int8.py +1 -1
  70. sglang/srt/layers/quantization/compressed_tensors/__init__.py +0 -0
  71. sglang/srt/layers/quantization/compressed_tensors/compressed_tensors.py +652 -0
  72. sglang/srt/layers/quantization/compressed_tensors/compressed_tensors_moe.py +658 -0
  73. sglang/srt/layers/quantization/compressed_tensors/schemes/__init__.py +9 -0
  74. sglang/srt/layers/quantization/compressed_tensors/schemes/compressed_tensors_scheme.py +56 -0
  75. sglang/srt/layers/quantization/compressed_tensors/schemes/compressed_tensors_w8a8_fp8.py +162 -0
  76. sglang/srt/layers/quantization/compressed_tensors/utils.py +218 -0
  77. sglang/srt/layers/quantization/fp8.py +76 -34
  78. sglang/srt/layers/quantization/fp8_kernel.py +25 -8
  79. sglang/srt/layers/quantization/fp8_utils.py +284 -28
  80. sglang/srt/layers/quantization/gptq.py +36 -19
  81. sglang/srt/layers/quantization/kv_cache.py +98 -0
  82. sglang/srt/layers/quantization/modelopt_quant.py +9 -7
  83. sglang/srt/layers/quantization/utils.py +153 -0
  84. sglang/srt/layers/quantization/w8a8_fp8.py +70 -19
  85. sglang/srt/layers/rotary_embedding.py +78 -87
  86. sglang/srt/layers/sampler.py +1 -1
  87. sglang/srt/lora/backend/base_backend.py +4 -4
  88. sglang/srt/lora/backend/flashinfer_backend.py +12 -9
  89. sglang/srt/lora/backend/triton_backend.py +5 -8
  90. sglang/srt/lora/layers.py +87 -33
  91. sglang/srt/lora/lora.py +2 -22
  92. sglang/srt/lora/lora_manager.py +67 -30
  93. sglang/srt/lora/mem_pool.py +117 -52
  94. sglang/srt/lora/triton_ops/gate_up_lora_b.py +10 -4
  95. sglang/srt/lora/triton_ops/qkv_lora_b.py +8 -3
  96. sglang/srt/lora/triton_ops/sgemm_lora_a.py +16 -5
  97. sglang/srt/lora/triton_ops/sgemm_lora_b.py +11 -6
  98. sglang/srt/lora/utils.py +18 -1
  99. sglang/srt/managers/cache_controller.py +2 -5
  100. sglang/srt/managers/data_parallel_controller.py +30 -8
  101. sglang/srt/managers/expert_distribution.py +81 -0
  102. sglang/srt/managers/io_struct.py +43 -5
  103. sglang/srt/managers/mm_utils.py +373 -0
  104. sglang/srt/managers/multimodal_processor.py +68 -0
  105. sglang/srt/managers/multimodal_processors/base_processor.py +275 -0
  106. sglang/srt/managers/multimodal_processors/clip.py +63 -0
  107. sglang/srt/managers/multimodal_processors/deepseek_vl_v2.py +119 -0
  108. sglang/srt/managers/multimodal_processors/gemma3.py +83 -0
  109. sglang/srt/managers/{image_processors → multimodal_processors}/janus_pro.py +20 -15
  110. sglang/srt/managers/{image_processors → multimodal_processors}/llava.py +10 -15
  111. sglang/srt/managers/multimodal_processors/minicpm.py +167 -0
  112. sglang/srt/managers/{image_processors → multimodal_processors}/mlama.py +7 -8
  113. sglang/srt/managers/{image_processors → multimodal_processors}/qwen_vl.py +28 -22
  114. sglang/srt/managers/schedule_batch.py +134 -30
  115. sglang/srt/managers/scheduler.py +290 -31
  116. sglang/srt/managers/session_controller.py +1 -1
  117. sglang/srt/managers/tokenizer_manager.py +59 -24
  118. sglang/srt/managers/tp_worker.py +4 -1
  119. sglang/srt/managers/tp_worker_overlap_thread.py +3 -3
  120. sglang/srt/managers/utils.py +6 -1
  121. sglang/srt/mem_cache/hiradix_cache.py +18 -7
  122. sglang/srt/mem_cache/memory_pool.py +255 -98
  123. sglang/srt/mem_cache/paged_allocator.py +2 -2
  124. sglang/srt/mem_cache/radix_cache.py +4 -4
  125. sglang/srt/model_executor/cuda_graph_runner.py +36 -21
  126. sglang/srt/model_executor/forward_batch_info.py +68 -11
  127. sglang/srt/model_executor/model_runner.py +75 -8
  128. sglang/srt/model_loader/loader.py +171 -3
  129. sglang/srt/model_loader/weight_utils.py +51 -3
  130. sglang/srt/models/clip.py +563 -0
  131. sglang/srt/models/deepseek_janus_pro.py +31 -88
  132. sglang/srt/models/deepseek_nextn.py +22 -10
  133. sglang/srt/models/deepseek_v2.py +329 -73
  134. sglang/srt/models/deepseek_vl2.py +358 -0
  135. sglang/srt/models/gemma3_causal.py +694 -0
  136. sglang/srt/models/gemma3_mm.py +468 -0
  137. sglang/srt/models/llama.py +47 -7
  138. sglang/srt/models/llama_eagle.py +1 -0
  139. sglang/srt/models/llama_eagle3.py +196 -0
  140. sglang/srt/models/llava.py +3 -3
  141. sglang/srt/models/llavavid.py +3 -3
  142. sglang/srt/models/minicpmo.py +1995 -0
  143. sglang/srt/models/minicpmv.py +62 -137
  144. sglang/srt/models/mllama.py +4 -4
  145. sglang/srt/models/phi3_small.py +1 -1
  146. sglang/srt/models/qwen2.py +3 -0
  147. sglang/srt/models/qwen2_5_vl.py +68 -146
  148. sglang/srt/models/qwen2_classification.py +75 -0
  149. sglang/srt/models/qwen2_moe.py +9 -1
  150. sglang/srt/models/qwen2_vl.py +25 -63
  151. sglang/srt/openai_api/adapter.py +201 -104
  152. sglang/srt/openai_api/protocol.py +33 -7
  153. sglang/srt/patch_torch.py +71 -0
  154. sglang/srt/sampling/sampling_batch_info.py +1 -1
  155. sglang/srt/sampling/sampling_params.py +6 -6
  156. sglang/srt/server_args.py +114 -14
  157. sglang/srt/speculative/build_eagle_tree.py +7 -347
  158. sglang/srt/speculative/eagle_draft_cuda_graph_runner.py +41 -5
  159. sglang/srt/speculative/eagle_utils.py +208 -252
  160. sglang/srt/speculative/eagle_worker.py +140 -54
  161. sglang/srt/speculative/spec_info.py +6 -1
  162. sglang/srt/torch_memory_saver_adapter.py +22 -0
  163. sglang/srt/utils.py +215 -21
  164. sglang/test/__init__.py +0 -0
  165. sglang/test/attention/__init__.py +0 -0
  166. sglang/test/attention/test_flashattn_backend.py +312 -0
  167. sglang/test/runners.py +29 -2
  168. sglang/test/test_activation.py +2 -1
  169. sglang/test/test_block_fp8.py +5 -4
  170. sglang/test/test_block_fp8_ep.py +2 -1
  171. sglang/test/test_dynamic_grad_mode.py +58 -0
  172. sglang/test/test_layernorm.py +3 -2
  173. sglang/test/test_utils.py +56 -5
  174. sglang/utils.py +31 -0
  175. sglang/version.py +1 -1
  176. {sglang-0.4.4.post1.dist-info → sglang-0.4.4.post3.dist-info}/METADATA +16 -8
  177. {sglang-0.4.4.post1.dist-info → sglang-0.4.4.post3.dist-info}/RECORD +180 -132
  178. {sglang-0.4.4.post1.dist-info → sglang-0.4.4.post3.dist-info}/WHEEL +1 -1
  179. sglang/srt/configs/qwen2_5_vl_config.py +0 -1006
  180. sglang/srt/managers/image_processor.py +0 -55
  181. sglang/srt/managers/image_processors/base_image_processor.py +0 -219
  182. sglang/srt/managers/image_processors/minicpmv.py +0 -86
  183. sglang/srt/managers/multi_modality_padding.py +0 -134
  184. {sglang-0.4.4.post1.dist-info → sglang-0.4.4.post3.dist-info/licenses}/LICENSE +0 -0
  185. {sglang-0.4.4.post1.dist-info → sglang-0.4.4.post3.dist-info}/top_level.txt +0 -0
sglang/srt/openai_api/protocol.py CHANGED
@@ -16,7 +16,7 @@
  import time
  from typing import Dict, List, Optional, Union

- from pydantic import BaseModel, Field
+ from pydantic import BaseModel, Field, root_validator
  from typing_extensions import Literal


@@ -28,6 +28,7 @@ class ModelCard(BaseModel):
  created: int = Field(default_factory=lambda: int(time.time()))
  owned_by: str = "sglang"
  root: Optional[str] = None
+ max_model_len: Optional[int] = None


  class ModelList(BaseModel):
@@ -187,7 +188,7 @@ class CompletionResponseChoice(BaseModel):
  index: int
  text: str
  logprobs: Optional[LogProbs] = None
- finish_reason: Optional[str] = None
+ finish_reason: Literal["stop", "length", "content_filter"]
  matched_stop: Union[None, int, str] = None


@@ -204,7 +205,7 @@ class CompletionResponseStreamChoice(BaseModel):
  index: int
  text: str
  logprobs: Optional[LogProbs] = None
- finish_reason: Optional[str] = None
+ finish_reason: Optional[Literal["stop", "length", "content_filter"]] = None
  matched_stop: Union[None, int, str] = None


@@ -227,14 +228,25 @@ class ChatCompletionMessageContentImageURL(BaseModel):
  detail: Optional[Literal["auto", "low", "high"]] = "auto"


+ class ChatCompletionMessageContentAudioURL(BaseModel):
+ url: str
+
+
  class ChatCompletionMessageContentImagePart(BaseModel):
  type: Literal["image_url"]
  image_url: ChatCompletionMessageContentImageURL
  modalities: Optional[Literal["image", "multi-images", "video"]] = "image"


+ class ChatCompletionMessageContentAudioPart(BaseModel):
+ type: Literal["audio_url"]
+ audio_url: ChatCompletionMessageContentAudioURL
+
+
  ChatCompletionMessageContentPart = Union[
- ChatCompletionMessageContentTextPart, ChatCompletionMessageContentImagePart
+ ChatCompletionMessageContentTextPart,
+ ChatCompletionMessageContentImagePart,
+ ChatCompletionMessageContentAudioPart,
  ]


@@ -276,6 +288,7 @@ class Function(BaseModel):
  description: Optional[str] = Field(default=None, examples=[None])
  name: Optional[str] = None
  parameters: Optional[object] = None
+ strict: bool = False


  class Tool(BaseModel):
@@ -310,7 +323,7 @@ class ChatCompletionRequest(BaseModel):
  max_tokens: Optional[int] = None
  n: int = 1
  presence_penalty: float = 0.0
- response_format: Union[ResponseFormat, StructuralTagResponseFormat] = None
+ response_format: Optional[Union[ResponseFormat, StructuralTagResponseFormat]] = None
  seed: Optional[int] = None
  stop: Optional[Union[str, List[str]]] = None
  stream: bool = False
@@ -323,6 +336,15 @@ class ChatCompletionRequest(BaseModel):
  default="auto", examples=["none"]
  ) # noqa

+ @root_validator(pre=True)
+ def set_tool_choice_default(cls, values):
+ if values.get("tool_choice") is None:
+ if values.get("tools") is None:
+ values["tool_choice"] = "none"
+ else:
+ values["tool_choice"] = "auto"
+ return values
+
  # Extra parameters for SRT backend only and will be ignored by OpenAI models.
  top_k: int = -1
  min_p: float = 0.0
@@ -366,7 +388,9 @@ class ChatCompletionResponseChoice(BaseModel):
  index: int
  message: ChatMessage
  logprobs: Optional[Union[LogProbs, ChoiceLogprobs]] = None
- finish_reason: str
+ finish_reason: Literal[
+ "stop", "length", "tool_calls", "content_filter", "function_call"
+ ]
  matched_stop: Union[None, int, str] = None


@@ -390,7 +414,9 @@ class ChatCompletionResponseStreamChoice(BaseModel):
  index: int
  delta: DeltaMessage
  logprobs: Optional[Union[LogProbs, ChoiceLogprobs]] = None
- finish_reason: Optional[str] = None
+ finish_reason: Optional[
+ Literal["stop", "length", "tool_calls", "content_filter", "function_call"]
+ ] = None
  matched_stop: Union[None, int, str] = None
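
The protocol changes above can be exercised with a request like the following. This is a minimal sketch against the OpenAI-compatible chat endpoint, assuming a locally running sglang server on its default port; the model name and audio URL are placeholders, and the field names come from the models in this diff.

    import requests

    payload = {
        "model": "placeholder-model",
        "messages": [
            {
                "role": "user",
                "content": [
                    {"type": "text", "text": "What is said in this clip?"},
                    # New in this release: audio parts alongside text/image parts.
                    {"type": "audio_url", "audio_url": {"url": "https://example.com/clip.wav"}},
                ],
            }
        ],
        # "tool_choice" is omitted: the new root_validator fills in "none" when no
        # "tools" are given, and "auto" when they are.
    }
    resp = requests.post("http://127.0.0.1:30000/v1/chat/completions", json=payload)
    # finish_reason is now a Literal such as "stop", "length", or "tool_calls".
    print(resp.json()["choices"][0]["finish_reason"])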
sglang/srt/patch_torch.py ADDED
@@ -0,0 +1,71 @@
+ # Copyright 2023-2024 SGLang Team
+ # Licensed under the Apache License, Version 2.0 (the "License");
+ # you may not use this file except in compliance with the License.
+ # You may obtain a copy of the License at
+ #
+ # http://www.apache.org/licenses/LICENSE-2.0
+ #
+ # Unless required by applicable law or agreed to in writing, software
+ # distributed under the License is distributed on an "AS IS" BASIS,
+ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ # See the License for the specific language governing permissions and
+ # limitations under the License.
+ # ==============================================================================
+ from typing import Callable, Union
+
+ import torch
+ from torch.multiprocessing import reductions
+
+
+ def monkey_patch_torch_reductions():
+ """Monkey patching before Torch https://github.com/pytorch/pytorch/pull/149248 is fixed"""
+
+ if hasattr(reductions, "_reduce_tensor_original"):
+ return
+
+ reductions._reduce_tensor_original = reductions.reduce_tensor
+ reductions._rebuild_cuda_tensor_original = reductions.rebuild_cuda_tensor
+
+ reductions.reduce_tensor = _reduce_tensor_modified
+ reductions.rebuild_cuda_tensor = _rebuild_cuda_tensor_modified
+
+ reductions.init_reductions()
+
+
+ # The signature has not been changed for years, and we will not need this when the next version is released,
+ # so it looks safe to use a constant.
+ _REDUCE_TENSOR_ARG_DEVICE_INDEX = 6
+
+
+ def _reduce_tensor_modified(*args, **kwargs):
+ output_fn, output_args = reductions._reduce_tensor_original(*args, **kwargs)
+ output_args = _modify_tuple(
+ output_args, _REDUCE_TENSOR_ARG_DEVICE_INDEX, _device_to_uuid
+ )
+ return output_fn, output_args
+
+
+ def _rebuild_cuda_tensor_modified(*args):
+ args = _modify_tuple(args, _REDUCE_TENSOR_ARG_DEVICE_INDEX, _device_from_maybe_uuid)
+ return reductions._rebuild_cuda_tensor_original(*args)
+
+
+ def _device_to_uuid(device: int) -> str:
+ return str(torch.cuda.get_device_properties(device).uuid)
+
+
+ def _device_from_maybe_uuid(device_maybe_uuid: Union[int, str]) -> int:
+ if isinstance(device_maybe_uuid, int):
+ return device_maybe_uuid
+
+ if isinstance(device_maybe_uuid, str):
+ for device in range(torch.cuda.device_count()):
+ if str(torch.cuda.get_device_properties(device).uuid) == device_maybe_uuid:
+ return device
+ raise Exception("Invalid device_uuid=" + device_maybe_uuid)
+
+ raise Exception(f"Unknown type: {device_maybe_uuid=}")
+
+
+ def _modify_tuple(t, index: int, modifier: Callable):
+ return *t[:index], modifier(t[index]), *t[index + 1 :]
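
A minimal usage sketch for the new patch_torch module, assuming both sides of a torch.multiprocessing exchange apply the patch so that a shared CUDA tensor's device is pickled as a GPU UUID and mapped back to a local index on the receiving side; the worker function and tensor values are illustrative.

    import torch
    import torch.multiprocessing as mp
    from sglang.srt.patch_torch import monkey_patch_torch_reductions

    def consumer(queue):  # hypothetical worker
        monkey_patch_torch_reductions()  # patch the rebuild side as well
        t = queue.get()                  # rebuilt via the UUID-aware path
        print(t.device, t.sum().item())

    if __name__ == "__main__":
        monkey_patch_torch_reductions()  # idempotent: guarded by _reduce_tensor_original
        ctx = mp.get_context("spawn")
        queue = ctx.Queue()
        proc = ctx.Process(target=consumer, args=(queue,))
        proc.start()
        shared = torch.ones(4, device="cuda:0")  # keep a reference while the consumer fetches it
        queue.put(shared)
        proc.join()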
sglang/srt/sampling/sampling_batch_info.py CHANGED
@@ -306,7 +306,7 @@ class SamplingBatchInfo:
  ]:
  self_val = getattr(self, item, None)
  other_val = getattr(other, item, None)
- setattr(self, item, torch.concat([self_val, other_val]))
+ setattr(self, item, torch.cat([self_val, other_val]))

  self.is_all_greedy |= other.is_all_greedy
  self.need_min_p_sampling |= other.need_min_p_sampling
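
For reference, the merge above concatenates the per-request sampling tensors of two batches along the batch dimension; torch.cat is the canonical spelling (torch.concat is only an alias). A toy illustration with made-up values:

    import torch

    temperatures_a = torch.tensor([[0.7], [1.0]])  # batch of 2 requests
    temperatures_b = torch.tensor([[0.2]])         # batch of 1 request
    merged = torch.cat([temperatures_a, temperatures_b])
    print(merged.shape)  # torch.Size([3, 1])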
sglang/srt/sampling/sampling_params.py CHANGED
@@ -77,7 +77,7 @@ class SamplingParams:
  self.custom_params = custom_params

  # Process some special cases
- if self.temperature < _SAMPLING_EPS:
+ if 0 <= self.temperature < _SAMPLING_EPS:
  # top_k = 1 means greedy sampling
  self.temperature = 1.0
  self.top_k = 1
@@ -93,9 +93,9 @@ class SamplingParams:
  raise ValueError(f"top_p must be in (0, 1], got {self.top_p}.")
  if not 0.0 <= self.min_p <= 1.0:
  raise ValueError(f"min_p must be in [0, 1], got {self.min_p}.")
- if self.top_k < -1 or self.top_k == 0:
+ if self.top_k < 1 or self.top_k == -1:
  raise ValueError(
- f"top_k must be -1 (disable), or at least 1, " f"got {self.top_k}."
+ f"top_k must be -1 (disable) or at least 1, got {self.top_k}."
  )
  if not -2.0 <= self.frequency_penalty <= 2.0:
  raise ValueError(
@@ -108,12 +108,12 @@ class SamplingParams:
  )
  if not 0.0 <= self.repetition_penalty <= 2.0:
  raise ValueError(
- "repetition_penalty must be in (0, 2], got "
+ "repetition_penalty must be in [0, 2], got "
  f"{self.repetition_penalty}."
  )
  if not 0 <= self.min_new_tokens:
  raise ValueError(
- f"min_new_tokens must be in (0, max_new_tokens], got "
+ f"min_new_tokens must be in [0, max_new_tokens], got "
  f"{self.min_new_tokens}."
  )
  if self.max_new_tokens is not None:
@@ -123,7 +123,7 @@ class SamplingParams:
  )
  if not self.min_new_tokens <= self.max_new_tokens:
  raise ValueError(
- f"min_new_tokens must be in (0, max_new_tokens({self.max_new_tokens})], got "
+ f"min_new_tokens must be in [0, max_new_tokens({self.max_new_tokens})], got "
  f"{self.min_new_tokens}."
  )
  grammars = [
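
A standalone sketch of the special-case handling above (not the real SamplingParams class): a non-negative temperature below the sampling epsilon is rewritten to greedy decoding. The epsilon value here is an assumption for illustration.

    _SAMPLING_EPS = 1e-6  # assumed value; the real constant is defined in sampling_params.py

    def normalize(temperature: float, top_k: int):
        if 0 <= temperature < _SAMPLING_EPS:
            # Greedy: neutralize temperature and pin top_k to 1.
            temperature, top_k = 1.0, 1
        return temperature, top_k

    print(normalize(0.0, -1))   # (1.0, 1)
    print(normalize(0.8, 50))   # (0.8, 50)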
sglang/srt/server_args.py CHANGED
@@ -16,6 +16,7 @@
  import argparse
  import dataclasses
  import logging
+ import os
  import random
  import tempfile
  from typing import List, Optional
@@ -23,13 +24,16 @@ from typing import List, Optional
  from sglang.srt.hf_transformers_utils import check_gguf_file
  from sglang.srt.reasoning_parser import ReasoningParser
  from sglang.srt.utils import (
+ configure_ipv6,
  get_amdgpu_memory_capacity,
+ get_device,
  get_hpu_memory_capacity,
  get_nvgpu_memory_capacity,
  is_cuda,
  is_flashinfer_available,
  is_hip,
  is_port_available,
+ is_remote_url,
  is_valid_ipv6_address,
  nullable_str,
  )
@@ -49,11 +53,12 @@ class ServerArgs:
  dtype: str = "auto"
  kv_cache_dtype: str = "auto"
  quantization: Optional[str] = None
- quantization_param_path: nullable_str = None
+ quantization_param_path: Optional[str] = None
  context_length: Optional[int] = None
- device: str = "cuda"
+ device: Optional[str] = None
  served_model_name: Optional[str] = None
  chat_template: Optional[str] = None
+ completion_template: Optional[str] = None
  is_embedding: bool = False
  revision: Optional[str] = None

@@ -122,7 +127,7 @@ class ServerArgs:
  # Kernel backend
  attention_backend: Optional[str] = None
  sampling_backend: Optional[str] = None
- grammar_backend: Optional[str] = "outlines"
+ grammar_backend: Optional[str] = "xgrammar"

  # Speculative decoding
  speculative_algorithm: Optional[str] = None
@@ -136,7 +141,7 @@ class ServerArgs:

  # Double Sparsity
  enable_double_sparsity: bool = False
- ds_channel_config_path: str = None
+ ds_channel_config_path: Optional[str] = None
  ds_heavy_channel_num: int = 32
  ds_heavy_token_num: int = 256
  ds_heavy_channel_type: str = "qk"
@@ -154,6 +159,7 @@ class ServerArgs:
  enable_mixed_chunk: bool = False
  enable_dp_attention: bool = False
  enable_ep_moe: bool = False
+ enable_deepep_moe: bool = False
  enable_torch_compile: bool = False
  torch_compile_max_bs: int = 32
  cuda_graph_max_bs: Optional[int] = None
@@ -168,9 +174,11 @@ class ServerArgs:
  enable_memory_saver: bool = False
  allow_auto_truncate: bool = False
  enable_custom_logit_processor: bool = False
- tool_call_parser: str = None
+ tool_call_parser: Optional[str] = None
  enable_hierarchical_cache: bool = False
+ hicache_ratio: float = 2.0
  enable_flashinfer_mla: bool = False
+ enable_flashmla: bool = False
  flashinfer_mla_disable_ragged: bool = False
  warmups: Optional[str] = None

@@ -179,11 +187,18 @@ class ServerArgs:
  debug_tensor_dump_input_file: Optional[str] = None
  debug_tensor_dump_inject: bool = False

+ # For PD disaggregation: can be "null" (not disaggregated), "prefill" (prefill-only), or "decode" (decode-only)
+ disaggregation_mode: str = "null"
+ disaggregation_bootstrap_port: int = 8998
+
  def __post_init__(self):
  # Set missing default values
  if self.tokenizer_path is None:
  self.tokenizer_path = self.model_path

+ if self.device is None:
+ self.device = get_device()
+
  if self.served_model_name is None:
  self.served_model_name = self.model_path

@@ -222,6 +237,11 @@ class ServerArgs:

  assert self.chunked_prefill_size % self.page_size == 0

+ if self.enable_flashmla is True:
+ logger.warning(
+ "FlashMLA only supports a page_size of 64, change page_size to 64."
+ )
+ self.page_size = 64
  # Set cuda graph max batch size
  if self.cuda_graph_max_bs is None:
  # Based on detailed statistics, when serving TP1/TP2 models on lower-end GPUs with HBM<25G, you can either disable cuda graph or set `cuda_graph_max_bs` to a very small value to reduce the memory overhead of creating cuda graphs, with almost no impact on performance. However, when serving models with TP4 or TP8, we need to enable cuda graph to maintain high performance. In this case, we can set `cuda_graph_max_bs` to 80 (half of the default value 160) to reduce the memory overhead of creating cuda graphs. Looking at the logs from TP4 serving of qwen2-72b, a value of 80 is sufficient and can reduce the memory overhead of creating cuda graphs on lower-end GPUs compared to the original 160, avoiding OOM issues.
@@ -272,15 +292,28 @@ class ServerArgs:
  f"DP attention is enabled. The chunked prefill size is adjusted to {self.chunked_prefill_size} to avoid MoE kernel issues. "
  )

+ self.enable_sp_layernorm = False
+ # DeepEP MoE
+ if self.enable_deepep_moe:
+ self.ep_size = self.tp_size
+ self.enable_sp_layernorm = (
+ self.dp_size < self.tp_size if self.enable_dp_attention else True
+ )
+ logger.info(
+ f"DeepEP MoE is enabled. The expert parallel size is adjusted to be the same as the tensor parallel size[{self.tp_size}]."
+ )
+
  # Speculative Decoding
  if self.speculative_algorithm == "NEXTN":
  # NEXTN shares the same implementation of EAGLE
  self.speculative_algorithm = "EAGLE"

- if self.speculative_algorithm == "EAGLE":
+ if (
+ self.speculative_algorithm == "EAGLE"
+ or self.speculative_algorithm == "EAGLE3"
+ ):
  if self.max_running_requests is None:
  self.max_running_requests = 32
- self.disable_cuda_graph_padding = True
  self.disable_overlap_schedule = True
  logger.info(
  "Overlap scheduler is disabled because of using "
@@ -296,10 +329,29 @@ class ServerArgs:
  ) and check_gguf_file(self.model_path):
  self.quantization = self.load_format = "gguf"

+ if is_remote_url(self.model_path):
+ self.load_format = "remote"
+
  # AMD-specific Triton attention KV splits default number
  if is_hip():
  self.triton_attention_num_kv_splits = 16

+ # PD disaggregation
+ if self.disaggregation_mode == "prefill":
+ self.disable_cuda_graph = True
+ logger.warning("KV cache is forced as chunk cache for decode server")
+ self.disable_overlap_schedule = True
+ logger.warning("Overlap scheduler is disabled for prefill server")
+ elif self.disaggregation_mode == "decode":
+ self.disable_radix_cache = True
+ logger.warning("Cuda graph is disabled for prefill server")
+ self.disable_overlap_schedule = True
+ logger.warning("Overlap scheduler is disabled for decode server")
+
+ os.environ["SGLANG_ENABLE_TORCH_COMPILE"] = (
+ "1" if self.enable_torch_compile else "0"
+ )
+
  @staticmethod
  def add_cli_args(parser: argparse.ArgumentParser):
  # Model and port args
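
The __post_init__ branches above can be observed directly on the dataclass. A hedged sketch, assuming a machine where ServerArgs can be constructed (its __post_init__ also probes the local device and memory); the model path is a placeholder, and the printed values follow from the branches shown in this diff.

    from sglang.srt.server_args import ServerArgs

    # Prefill-only server: cuda graph and the overlap scheduler are turned off.
    args = ServerArgs(model_path="placeholder/model", disaggregation_mode="prefill")
    print(args.disable_cuda_graph, args.disable_overlap_schedule)  # True True

    # FlashMLA: the page size is forced to 64.
    args = ServerArgs(model_path="placeholder/model", enable_flashmla=True)
    print(args.page_size)  # 64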
@@ -345,9 +397,11 @@ class ServerArgs:
  "safetensors",
  "npcache",
  "dummy",
+ "sharded_state",
  "gguf",
  "bitsandbytes",
  "layered",
+ "remote",
  ],
  help="The format of the model weights to load. "
  '"auto" will try to load the weights in the safetensors format '
@@ -429,9 +483,8 @@ class ServerArgs:
  parser.add_argument(
  "--device",
  type=str,
- default="cuda",
- choices=["cuda", "xpu", "hpu", "cpu"],
- help="The device type.",
+ default=ServerArgs.device,
+ help="The device to use ('cuda', 'xpu', 'hpu', 'cpu'). Defaults to auto-detection if not specified.",
  )
  parser.add_argument(
  "--served-model-name",
@@ -445,6 +498,12 @@ class ServerArgs:
  default=ServerArgs.chat_template,
  help="The buliltin chat template name or the path of the chat template file. This is only used for OpenAI-compatible API server.",
  )
+ parser.add_argument(
+ "--completion-template",
+ type=str,
+ default=ServerArgs.completion_template,
+ help="The buliltin completion template name or the path of the completion template file. This is only used for OpenAI-compatible API server. only for code completion currently.",
+ )
  parser.add_argument(
  "--is-embedding",
  action="store_true",
@@ -722,7 +781,7 @@ class ServerArgs:
  parser.add_argument(
  "--attention-backend",
  type=str,
- choices=["flashinfer", "triton", "torch_native"],
+ choices=["flashinfer", "triton", "torch_native", "fa3"],
  default=ServerArgs.attention_backend,
  help="Choose the kernels for attention layers.",
  )
@@ -745,6 +804,11 @@ class ServerArgs:
  action="store_true",
  help="Enable FlashInfer MLA optimization",
  )
+ parser.add_argument(
+ "--enable-flashmla",
+ action="store_true",
+ help="Enable FlashMLA decode optimization",
+ )
  parser.add_argument(
  "--flashinfer-mla-disable-ragged",
  action="store_true",
@@ -755,7 +819,7 @@ class ServerArgs:
  parser.add_argument(
  "--speculative-algorithm",
  type=str,
- choices=["EAGLE", "NEXTN"],
+ choices=["EAGLE", "EAGLE3", "NEXTN"],
  help="Speculative algorithm.",
  )
  parser.add_argument(
@@ -984,6 +1048,18 @@ class ServerArgs:
  action="store_true",
  help="Enable hierarchical cache",
  )
+ parser.add_argument(
+ "--hicache-ratio",
+ type=float,
+ required=False,
+ default=ServerArgs.hicache_ratio,
+ help="The ratio of the size of host KV cache memory pool to the size of device pool.",
+ )
+ parser.add_argument(
+ "--enable-deepep-moe",
+ action="store_true",
+ help="Enabling DeepEP MoE implementation for EP MoE.",
+ )

  # Server warmups
  parser.add_argument(
@@ -1014,6 +1090,21 @@ class ServerArgs:
  help="Inject the outputs from jax as the input of every layer.",
  )

+ # Disaggregation
+ parser.add_argument(
+ "--disaggregation-mode",
+ type=str,
+ default="null",
+ choices=["null", "prefill", "decode"],
+ help='Only used for PD disaggregation. "prefill" for prefill-only server, and "decode" for decode-only server. If not specified, it is not PD disaggregated',
+ )
+ parser.add_argument(
+ "--disaggregation-bootstrap-port",
+ type=int,
+ default=ServerArgs.disaggregation_bootstrap_port,
+ help="Bootstrap server port on the prefill server. Default is 8998.",
+ )
+
  @classmethod
  def from_cli_args(cls, args: argparse.Namespace):
  args.tp_size = args.tensor_parallel_size
@@ -1088,6 +1179,9 @@ class PortArgs:
  # The port for nccl initialization (torch.dist)
  nccl_port: int

+ # The ipc filename for rpc call between Engine and Scheduler
+ rpc_ipc_name: str
+
  @staticmethod
  def init_new(server_args, dp_rank: Optional[int] = None) -> "PortArgs":
  port = server_args.port + random.randint(100, 1000)
@@ -1106,13 +1200,18 @@ class PortArgs:
  scheduler_input_ipc_name=f"ipc://{tempfile.NamedTemporaryFile(delete=False).name}",
  detokenizer_ipc_name=f"ipc://{tempfile.NamedTemporaryFile(delete=False).name}",
  nccl_port=port,
+ rpc_ipc_name=f"ipc://{tempfile.NamedTemporaryFile(delete=False).name}",
  )
  else:
  # DP attention. Use TCP + port to handle both single-node and multi-node.
  if server_args.nnodes == 1 and server_args.dist_init_addr is None:
  dist_init_addr = ("127.0.0.1", server_args.port + ZMQ_TCP_PORT_DELTA)
+ elif server_args.dist_init_addr.startswith("["): # ipv6 address
+ port_num, host = configure_ipv6(server_args.dist_init_addr)
+ dist_init_addr = (host, str(port_num))
  else:
  dist_init_addr = server_args.dist_init_addr.split(":")
+
  assert (
  len(dist_init_addr) == 2
  ), "please provide --dist-init-addr as host:port of head node"
@@ -1121,16 +1220,17 @@ class PortArgs:
  port_base = int(dist_init_port) + 1
  if dp_rank is None:
  scheduler_input_port = (
- port_base + 2
+ port_base + 3
  ) # TokenizerManager to DataParallelController
  else:
- scheduler_input_port = port_base + 2 + 1 + dp_rank
+ scheduler_input_port = port_base + 3 + 1 + dp_rank

  return PortArgs(
  tokenizer_ipc_name=f"tcp://{dist_init_host}:{port_base}",
  scheduler_input_ipc_name=f"tcp://{dist_init_host}:{scheduler_input_port}",
  detokenizer_ipc_name=f"tcp://{dist_init_host}:{port_base + 1}",
  nccl_port=port,
+ rpc_ipc_name=f"tcp://{dist_init_host}:{port_base + 2}",
  )
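
The net effect of the PortArgs changes, in the TCP (DP attention) branch, is one extra slot in the port layout for the new Engine-to-Scheduler rpc channel. A small sketch with an illustrative dist-init port:

    dist_init_port = 5000                       # placeholder --dist-init-addr port
    port_base = dist_init_port + 1
    layout = {
        "tokenizer_ipc_name": port_base,            # 5001
        "detokenizer_ipc_name": port_base + 1,      # 5002
        "rpc_ipc_name": port_base + 2,              # 5003, new in this release
        "scheduler_input_ipc_name": port_base + 3,  # 5004, shifted from port_base + 2
    }
    print(layout)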