PyPI - sglang - Versions diffs - 0.4.9__py3-none-any.whl → 0.4.9.post2__py3-none-any.whl - Mend

sglang 0.4.9py3-none-any.whl → 0.4.9.post2py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Files changed (99) hide show

sglang/bench_serving.py +2 -2
sglang/srt/configs/model_config.py +36 -2
sglang/srt/conversation.py +56 -3
sglang/srt/disaggregation/ascend/__init__.py +6 -0
sglang/srt/disaggregation/ascend/conn.py +44 -0
sglang/srt/disaggregation/ascend/transfer_engine.py +58 -0
sglang/srt/disaggregation/mooncake/conn.py +50 -18
sglang/srt/disaggregation/mooncake/transfer_engine.py +17 -8
sglang/srt/disaggregation/utils.py +25 -3
sglang/srt/entrypoints/engine.py +1 -1
sglang/srt/entrypoints/http_server.py +1 -0
sglang/srt/entrypoints/http_server_engine.py +1 -1
sglang/srt/entrypoints/openai/protocol.py +11 -0
sglang/srt/entrypoints/openai/serving_chat.py +7 -0
sglang/srt/function_call/function_call_parser.py +2 -0
sglang/srt/function_call/kimik2_detector.py +220 -0
sglang/srt/hf_transformers_utils.py +18 -0
sglang/srt/jinja_template_utils.py +8 -0
sglang/srt/layers/communicator.py +20 -5
sglang/srt/layers/flashinfer_comm_fusion.py +3 -3
sglang/srt/layers/layernorm.py +2 -2
sglang/srt/layers/linear.py +12 -2
sglang/srt/layers/moe/cutlass_w4a8_moe.py +215 -0
sglang/srt/layers/moe/ep_moe/kernels.py +60 -1
sglang/srt/layers/moe/ep_moe/layer.py +141 -2
sglang/srt/layers/moe/fused_moe_triton/fused_moe.py +2 -0
sglang/srt/layers/moe/fused_moe_triton/layer.py +141 -59
sglang/srt/layers/moe/fused_moe_triton/triton_kernels_moe.py +176 -0
sglang/srt/layers/moe/topk.py +8 -2
sglang/srt/layers/parameter.py +19 -3
sglang/srt/layers/quantization/__init__.py +2 -0
sglang/srt/layers/quantization/fp8.py +28 -7
sglang/srt/layers/quantization/fp8_kernel.py +2 -2
sglang/srt/layers/quantization/modelopt_quant.py +244 -1
sglang/srt/layers/quantization/moe_wna16.py +1 -2
sglang/srt/layers/quantization/w4afp8.py +264 -0
sglang/srt/layers/quantization/w8a8_int8.py +738 -14
sglang/srt/layers/vocab_parallel_embedding.py +9 -3
sglang/srt/lora/triton_ops/gate_up_lora_b.py +30 -19
sglang/srt/lora/triton_ops/qkv_lora_b.py +30 -19
sglang/srt/lora/triton_ops/sgemm_lora_a.py +27 -11
sglang/srt/lora/triton_ops/sgemm_lora_b.py +27 -15
sglang/srt/managers/cache_controller.py +41 -195
sglang/srt/managers/io_struct.py +35 -3
sglang/srt/managers/mm_utils.py +59 -96
sglang/srt/managers/schedule_batch.py +17 -6
sglang/srt/managers/scheduler.py +38 -6
sglang/srt/managers/tokenizer_manager.py +16 -0
sglang/srt/mem_cache/hiradix_cache.py +2 -0
sglang/srt/mem_cache/memory_pool.py +176 -101
sglang/srt/mem_cache/memory_pool_host.py +6 -109
sglang/srt/mem_cache/radix_cache.py +8 -4
sglang/srt/model_executor/forward_batch_info.py +13 -1
sglang/srt/model_loader/loader.py +23 -12
sglang/srt/models/deepseek_janus_pro.py +1 -1
sglang/srt/models/deepseek_v2.py +78 -19
sglang/srt/models/deepseek_vl2.py +1 -1
sglang/srt/models/gemma3_mm.py +1 -1
sglang/srt/models/gemma3n_mm.py +6 -3
sglang/srt/models/internvl.py +8 -2
sglang/srt/models/kimi_vl.py +8 -2
sglang/srt/models/llama.py +2 -0
sglang/srt/models/llava.py +3 -1
sglang/srt/models/llavavid.py +1 -1
sglang/srt/models/minicpmo.py +1 -2
sglang/srt/models/minicpmv.py +1 -1
sglang/srt/models/mixtral_quant.py +4 -0
sglang/srt/models/mllama4.py +372 -82
sglang/srt/models/phi4mm.py +8 -2
sglang/srt/models/phimoe.py +553 -0
sglang/srt/models/qwen2.py +2 -0
sglang/srt/models/qwen2_5_vl.py +10 -7
sglang/srt/models/qwen2_vl.py +12 -1
sglang/srt/models/vila.py +8 -2
sglang/srt/multimodal/mm_utils.py +2 -2
sglang/srt/multimodal/processors/base_processor.py +197 -137
sglang/srt/multimodal/processors/deepseek_vl_v2.py +1 -1
sglang/srt/multimodal/processors/gemma3.py +4 -2
sglang/srt/multimodal/processors/gemma3n.py +1 -1
sglang/srt/multimodal/processors/internvl.py +1 -1
sglang/srt/multimodal/processors/janus_pro.py +1 -1
sglang/srt/multimodal/processors/kimi_vl.py +1 -1
sglang/srt/multimodal/processors/minicpm.py +4 -3
sglang/srt/multimodal/processors/mllama4.py +63 -61
sglang/srt/multimodal/processors/phi4mm.py +1 -1
sglang/srt/multimodal/processors/pixtral.py +1 -1
sglang/srt/multimodal/processors/qwen_vl.py +203 -80
sglang/srt/multimodal/processors/vila.py +1 -1
sglang/srt/server_args.py +26 -4
sglang/srt/two_batch_overlap.py +3 -0
sglang/srt/utils.py +191 -48
sglang/test/test_cutlass_w4a8_moe.py +281 -0
sglang/utils.py +5 -5
sglang/version.py +1 -1
{sglang-0.4.9.dist-info → sglang-0.4.9.post2.dist-info}/METADATA +6 -4
{sglang-0.4.9.dist-info → sglang-0.4.9.post2.dist-info}/RECORD +99 -90
{sglang-0.4.9.dist-info → sglang-0.4.9.post2.dist-info}/WHEEL +0 -0
{sglang-0.4.9.dist-info → sglang-0.4.9.post2.dist-info}/licenses/LICENSE +0 -0
{sglang-0.4.9.dist-info → sglang-0.4.9.post2.dist-info}/top_level.txt +0 -0

sglang/bench_serving.py CHANGED Viewed

@@ -814,9 +814,9 @@ def sample_mmmu_requests(
         List of tuples (prompt, prompt_token_len, output_token_len).
     """
     try:
-        import base64
         import io
+        import pybase64
         from datasets import load_dataset
     except ImportError:
         raise ImportError("Please install datasets: pip install datasets")
@@ -867,7 +867,7 @@ def sample_mmmu_requests(
                     # Encode image to base64
                     buffered = io.BytesIO()
                     image.save(buffered, format="JPEG")
-                    img_str = base64.b64encode(buffered.getvalue()).decode("utf-8")
+                    img_str = pybase64.b64encode(buffered.getvalue()).decode("utf-8")
                     image_data = f"data:image/jpeg;base64,{img_str}"
                 else:
                     continue

sglang/srt/configs/model_config.py CHANGED Viewed

@@ -25,6 +25,7 @@ from transformers import PretrainedConfig
 from sglang.srt.hf_transformers_utils import (
     get_config,
     get_context_length,
+    get_generation_config,
     get_hf_text_config,
 )
 from sglang.srt.layers.quantization import QUANTIZATION_METHODS
@@ -83,6 +84,13 @@ class ModelConfig:
             **kwargs,
         )
+        self.hf_generation_config = get_generation_config(
+            self.model_path,
+            trust_remote_code=trust_remote_code,
+            revision=revision,
+            **kwargs,
+        )
         self.hf_text_config = get_hf_text_config(self.hf_config)
         self.attention_chunk_size = getattr(
             self.hf_text_config, "attention_chunk_size", None
@@ -359,7 +367,17 @@ class ModelConfig:
                 if hf_api.file_exists(self.model_path, "hf_quant_config.json"):
                     quant_cfg = modelopt_quant_config
             elif os.path.exists(os.path.join(self.model_path, "hf_quant_config.json")):
-                quant_cfg = modelopt_quant_config
+                quant_config_file = os.path.join(
+                    self.model_path, "hf_quant_config.json"
+                )
+                with open(quant_config_file) as f:
+                    quant_config_dict = json.load(f)
+                json_quant_configs = quant_config_dict["quantization"]
+                quant_algo = json_quant_configs.get("quant_algo", None)
+                if quant_algo == "MIXED_PRECISION":
+                    quant_cfg = {"quant_method": "w4afp8"}
+                else:
+                    quant_cfg = modelopt_quant_config
         return quant_cfg
     # adapted from https://github.com/vllm-project/vllm/blob/v0.6.4.post1/vllm/config.py
@@ -389,6 +407,7 @@ class ModelConfig:
             "w8a8_fp8",
             "moe_wna16",
             "qoq",
+            "w4afp8",
         ]
         compatible_quantization_methods = {
             "modelopt_fp4": ["modelopt"],
@@ -402,7 +421,9 @@ class ModelConfig:
         quant_cfg = self._parse_quant_hf_config()
         if quant_cfg is not None:
-            quant_method = quant_cfg.get("quant_method", "").lower()
+            quant_method = quant_cfg.get(
+                "quant_method", "" if not self.quantization else self.quantization
+            ).lower()
             # Detect which checkpoint is it
             for _, method in QUANTIZATION_METHODS.items():
@@ -454,6 +475,19 @@ class ModelConfig:
         if eos_ids:
             # it can be either int or list of int
             eos_ids = {eos_ids} if isinstance(eos_ids, int) else set(eos_ids)
+        if eos_ids is None:
+            eos_ids = set()
+        if self.hf_generation_config:
+            generation_eos_ids = getattr(
+                self.hf_generation_config, "eos_token_id", None
+            )
+            if generation_eos_ids:
+                generation_eos_ids = (
+                    {generation_eos_ids}
+                    if isinstance(generation_eos_ids, int)
+                    else set(generation_eos_ids)
+                )
+                eos_ids = eos_ids | generation_eos_ids
         return eos_ids
     def maybe_pull_model_tokenizer_from_remote(self) -> None:

sglang/srt/conversation.py CHANGED Viewed

@@ -88,9 +88,11 @@ class Conversation:
     stop_str: Union[str, List[str]] = None
     # The string that represents an image token in the prompt
     image_token: str = "<image>"
+    video_token: str = "<video>"
     audio_token: str = "<audio>"
     image_data: Optional[List[str]] = None
+    video_data: Optional[List[str]] = None
     modalities: Optional[List[str]] = None
     stop_token_ids: Optional[int] = None
@@ -380,11 +382,15 @@ class Conversation:
         self.messages.append([role, message])
     def append_image(self, image: str):
-        """Append a new message."""
+        """Append a new image."""
         self.image_data.append(image)
+    def append_video(self, video: str):
+        """Append a new video."""
+        self.video_data.append(video)
     def append_audio(self, audio: str):
-        """Append a new message."""
+        """Append a new audio."""
         self.audio_data.append(audio)
     def update_last_message(self, message: str):
@@ -433,6 +439,7 @@ class Conversation:
             sep2=self.sep2,
             stop_str=self.stop_str,
             image_token=self.image_token,
+            video_token=self.video_token,
             audio_token=self.audio_token,
         )
@@ -495,8 +502,12 @@ def generate_embedding_convs(
             sep2=conv_template.sep2,
             stop_str=conv_template.stop_str,
             image_data=[],
+            video_data=[],
+            audio_data=[],
             modalities=[],
             image_token=conv_template.image_token,
+            video_token=conv_template.video_token,
+            audio_token=conv_template.audio_token,
         )
         real_content = ""
@@ -557,10 +568,12 @@ def generate_chat_conv(
         sep2=conv.sep2,
         stop_str=conv.stop_str,
         image_data=[],
+        video_data=[],
         audio_data=[],
         modalities=[],
         image_token=conv.image_token,
         audio_token=conv.audio_token,
+        video_token=conv.video_token,
     )
     if isinstance(request.messages, str):
@@ -602,6 +615,7 @@ def generate_chat_conv(
                     image_token = ""
                 audio_token = conv.audio_token
+                video_token = conv.video_token
                 for content in message.content:
                     if content.type == "text":
                         if num_image_url > 16:
@@ -614,6 +628,9 @@ def generate_chat_conv(
                         else:
                             real_content += image_token
                         conv.append_image(content.image_url.url)
+                    elif content.type == "video_url":
+                        real_content += video_token
+                        conv.append_video(content.video_url.url)
                     elif content.type == "audio_url":
                         real_content += audio_token
                         conv.append_audio(content.audio_url.url)
@@ -810,6 +827,7 @@ register_conv_template(
         sep_style=SeparatorStyle.ADD_NEW_LINE_SINGLE,
         stop_str=["<|im_end|>"],
         image_token="<|vision_start|><|image_pad|><|vision_end|>",
+        video_token="<|vision_start|><|video_pad|><|vision_end|>",
     )
 )
@@ -870,6 +888,7 @@ register_conv_template(
         sep_style=SeparatorStyle.ADD_NEW_LINE_SINGLE,
         stop_str=("<|im_end|>", "<|endoftext|>"),
         image_token="(<image>./</image>)",
+        video_token="(<video>./</video>)",
     )
 )
@@ -921,6 +940,19 @@ register_conv_template(
     )
 )
+register_conv_template(
+    Conversation(
+        name="mimo-vl",
+        system_message="You are MiMo, an AI assistant developed by Xiaomi.",
+        system_template="<|im_start|>system\n{system_message}",
+        roles=("<|im_start|>user", "<|im_start|>assistant"),
+        sep="<|im_end|>\n",
+        sep_style=SeparatorStyle.ADD_NEW_LINE_SINGLE,
+        stop_str=["<|im_end|>"],
+        image_token="<|vision_start|><|image_pad|><|vision_end|>",
+    )
+)
 register_conv_template(
     Conversation(
@@ -935,6 +967,19 @@ register_conv_template(
     )
 )
+register_conv_template(
+    Conversation(
+        name="llama_4_vision",
+        system_message="You are a helpful language and vision assistant. You are able to understand the visual content that the user provides, and assist the user with a variety of tasks using natural language.",
+        system_template="<|header_start|>system<|header_end|>\n\n{system_message}<|eot|>",
+        roles=("user", "assistant"),
+        sep_style=SeparatorStyle.LLAMA4,
+        sep="",
+        stop_str="<|eot|>",
+        image_token="<|image|>",
+    )
+)
 @register_conv_template_matching_function
 def match_internvl(model_path: str):
@@ -943,9 +988,11 @@ def match_internvl(model_path: str):
 @register_conv_template_matching_function
-def match_llama_3_vision(model_path: str):
+def match_llama_vision(model_path: str):
     if re.search(r"llama.*3\.2.*vision", model_path, re.IGNORECASE):
         return "llama_3_vision"
+    if re.search(r"llama.*4.*", model_path, re.IGNORECASE):
+        return "llama_4_vision"
 @register_conv_template_matching_function
@@ -1034,3 +1081,9 @@ def match_phi_4_mm(model_path: str):
 def match_vila(model_path: str):
     if re.search(r"vila", model_path, re.IGNORECASE):
         return "chatml"
+@register_conv_template_matching_function
+def match_mimo_vl(model_path: str):
+    if re.search(r"mimo.*vl", model_path, re.IGNORECASE):
+        return "mimo-vl"

sglang/srt/disaggregation/ascend/__init__.py ADDED Viewed

@@ -0,0 +1,6 @@
+from sglang.srt.disaggregation.ascend.conn import (
+    AscendKVBootstrapServer,
+    AscendKVManager,
+    AscendKVReceiver,
+    AscendKVSender,
+)

sglang/srt/disaggregation/ascend/conn.py ADDED Viewed

@@ -0,0 +1,44 @@
+import logging
+from sglang.srt.disaggregation.ascend.transfer_engine import AscendTransferEngine
+from sglang.srt.disaggregation.mooncake.conn import (
+    MooncakeKVBootstrapServer,
+    MooncakeKVManager,
+    MooncakeKVReceiver,
+    MooncakeKVSender,
+)
+from sglang.srt.utils import get_local_ip_by_remote
+logger = logging.getLogger(__name__)
+class AscendKVManager(MooncakeKVManager):
+    def init_engine(self):
+        # TransferEngine initialized on ascend.
+        local_ip = get_local_ip_by_remote()
+        self.engine = AscendTransferEngine(
+            hostname=local_ip,
+            npu_id=self.kv_args.gpu_id,
+            disaggregation_mode=self.disaggregation_mode,
+        )
+    def register_buffer_to_engine(self):
+        self.engine.register(
+            self.kv_args.kv_data_ptrs[0], sum(self.kv_args.kv_data_lens)
+        )
+        # The Ascend backend optimize batch registration for small memory blocks.
+        self.engine.batch_register(
+            self.kv_args.aux_data_ptrs, self.kv_args.aux_data_lens
+        )
+class AscendKVSender(MooncakeKVSender):
+    pass
+class AscendKVReceiver(MooncakeKVReceiver):
+    pass
+class AscendKVBootstrapServer(MooncakeKVBootstrapServer):
+    pass

sglang/srt/disaggregation/ascend/transfer_engine.py ADDED Viewed

@@ -0,0 +1,58 @@
+import logging
+import os
+from typing import List, Optional
+from sglang.srt.disaggregation.mooncake.transfer_engine import MooncakeTransferEngine
+from sglang.srt.disaggregation.utils import DisaggregationMode
+logger = logging.getLogger(__name__)
+class AscendTransferEngine(MooncakeTransferEngine):
+    def __init__(
+        self, hostname: str, npu_id: int, disaggregation_mode: DisaggregationMode
+    ):
+        try:
+            from mf_adapter import TransferEngine
+        except ImportError as e:
+            raise ImportError(
+                "Please install mf_adapter, for details, see docs/backend/pd_disaggregation.md"
+            ) from e
+        self.engine = TransferEngine()
+        self.hostname = hostname
+        self.npu_id = npu_id
+        # Centralized storage address of the AscendTransferEngine
+        self.store_url = os.getenv("ASCEND_MF_STORE_URL")
+        if disaggregation_mode == DisaggregationMode.PREFILL:
+            self.role = "Prefill"
+        elif disaggregation_mode == DisaggregationMode.DECODE:
+            self.role = "Decode"
+        else:
+            logger.error(f"Unsupported DisaggregationMode: {disaggregation_mode}")
+            raise ValueError(f"Unsupported DisaggregationMode: {disaggregation_mode}")
+        self.session_id = f"{self.hostname}:{self.engine.get_rpc_port()}"
+        self.initialize()
+    def initialize(self) -> None:
+        """Initialize the ascend transfer instance."""
+        ret_value = self.engine.initialize(
+            self.store_url,
+            self.session_id,
+            self.role,
+            self.npu_id,
+        )
+        if ret_value != 0:
+            logger.error("Ascend Transfer Engine initialization failed.")
+            raise RuntimeError("Ascend Transfer Engine initialization failed.")
+    def batch_register(self, ptrs: List[int], lengths: List[int]):
+        try:
+            ret_value = self.engine.batch_register_memory(ptrs, lengths)
+        except Exception:
+            # Mark register as failed
+            ret_value = -1
+        if ret_value != 0:
+            logger.debug(f"Ascend memory registration for ptr {ptrs} failed.")

sglang/srt/disaggregation/mooncake/conn.py CHANGED Viewed

@@ -132,13 +132,9 @@ class MooncakeKVManager(BaseKVManager):
     ):
         self.kv_args = args
         self.local_ip = get_local_ip_auto()
-        self.engine = MooncakeTransferEngine(
-            hostname=self.local_ip,
-            gpu_id=self.kv_args.gpu_id,
-            ib_device=self.kv_args.ib_device,
-        )
         self.is_mla_backend = is_mla_backend
         self.disaggregation_mode = disaggregation_mode
+        self.init_engine()
         # for p/d multi node infer
         self.bootstrap_port = server_args.disaggregation_bootstrap_port
         self.dist_init_addr = server_args.dist_init_addr
@@ -185,9 +181,11 @@ class MooncakeKVManager(BaseKVManager):
                 threading.Thread(
                     target=self.transfer_worker, args=(queue, executor), daemon=True
                 ).start()
-            self.bootstrap_time_out = get_int_env_var(
-                "SGLANG_DISAGGREGATION_BOOTSTRAP_TIMEOUT", 120
+            # If a timeout happens on the prefill side, it means prefill instances
+            # fail to receive the KV indices from the decode instance of this request.
+            # These timeout requests should be aborted to release the tree cache.
+            self.bootstrap_timeout = get_int_env_var(
+                "SGLANG_DISAGGREGATION_BOOTSTRAP_TIMEOUT", 300
             )
         elif self.disaggregation_mode == DisaggregationMode.DECODE:
             self.heartbeat_failures = {}
@@ -209,6 +207,12 @@ class MooncakeKVManager(BaseKVManager):
             self.connection_pool: Dict[str, Dict[str, Union[str, int]]] = {}
             self.prefill_tp_size_table: Dict[str, int] = {}
             self.prefill_dp_size_table: Dict[str, int] = {}
+            # If a timeout happens on the decode side, it means decode instances
+            # fail to receive the KV Cache transfer done signal after bootstrapping.
+            # These timeout requests should be aborted to release the tree cache.
+            self.waiting_timeout = get_int_env_var(
+                "SGLANG_DISAGGREGATION_WAITING_TIMEOUT", 300
+            )
         else:
             raise ValueError(
                 f"Unsupported DisaggregationMode: {self.disaggregation_mode}"
@@ -217,6 +221,13 @@ class MooncakeKVManager(BaseKVManager):
         self.failure_records: Dict[int, str] = {}
         self.failure_lock = threading.Lock()
+    def init_engine(self):
+        self.engine = MooncakeTransferEngine(
+            hostname=self.local_ip,
+            gpu_id=self.kv_args.gpu_id,
+            ib_device=self.kv_args.ib_device,
+        )
     def register_buffer_to_engine(self):
         for kv_data_ptr, kv_data_len in zip(
             self.kv_args.kv_data_ptrs, self.kv_args.kv_data_lens
@@ -259,19 +270,17 @@ class MooncakeKVManager(BaseKVManager):
         # Worker function for processing a single layer
         def process_layer(src_ptr: int, dst_ptr: int, item_len: int) -> int:
-            src_addr_list = []
-            dst_addr_list = []
-            length_list = []
             for prefill_index, decode_index in zip(prefill_kv_blocks, dst_kv_blocks):
                 src_addr = src_ptr + int(prefill_index[0]) * item_len
                 dst_addr = dst_ptr + int(decode_index[0]) * item_len
                 length = item_len * len(prefill_index)
-                src_addr_list.append(src_addr)
-                dst_addr_list.append(dst_addr)
-                length_list.append(length)
-            return self.engine.batch_transfer_sync(
-                mooncake_session_id, src_addr_list, dst_addr_list, length_list
-            )
+                status = self.engine.transfer_sync(
+                    mooncake_session_id, src_addr, dst_addr, length
+                )
+                if status != 0:
+                    return status
+            return 0
         futures = [
             executor.submit(
@@ -938,7 +947,12 @@ class MooncakeKVSender(BaseKVSender):
                 if self.init_time is not None:
                     now = time.time()
                     elapsed = now - self.init_time
-                    if elapsed >= self.kv_mgr.bootstrap_time_out:
+                    if elapsed >= self.kv_mgr.bootstrap_timeout:
+                        logger.warning_once(
+                            "Some requests timed out when bootstrapping, "
+                            "which means prefill instances fail to receive the KV indices from the decode instance of this request. "
+                            "If a greater mean TTFT is acceptable, you can 'export SGLANG_DISAGGREGATION_BOOTSTRAP_TIMEOUT=600' (10 minutes) to relax the timeout condition. "
+                        )
                         self.kv_mgr.record_failure(
                             self.bootstrap_room,
                             f"Request {self.bootstrap_room} timed out after {elapsed:.1f}s in KVPoll.Bootstrapping",
@@ -987,6 +1001,7 @@ class MooncakeKVReceiver(BaseKVReceiver):
         self.session_id = self.kv_mgr.get_session_id()
         self.kv_mgr.update_status(self.bootstrap_room, KVPoll.Bootstrapping)
         self.conclude_state = None
+        self.init_time = None
         self.data_parallel_rank = data_parallel_rank
         if self.bootstrap_addr not in self.kv_mgr.prefill_dp_size_table:
@@ -1222,14 +1237,31 @@ class MooncakeKVReceiver(BaseKVReceiver):
                         str(self.required_dst_info_num).encode("ascii"),
                     ]
                 )
+        self.init_time = time.time()
     def poll(self) -> KVPoll:
         if self.conclude_state is None:
             status = self.kv_mgr.check_status(self.bootstrap_room)
             if status in (KVPoll.Success, KVPoll.Failed):
                 self.conclude_state = status
+            elif status == KVPoll.WaitingForInput:
+                if self.init_time is not None:
+                    now = time.time()
+                    elapsed = now - self.init_time
+                    if elapsed >= self.kv_mgr.waiting_timeout:
+                        logger.warning_once(
+                            "Some requests fail to receive KV Cache transfer done signal after bootstrapping. "
+                            "If a greater mean TTFT is acceptable, you can 'export SGLANG_DISAGGREGATION_WAITING_TIMEOUT=600' (10 minutes) to relax the timeout condition. "
+                        )
+                        self.kv_mgr.record_failure(
+                            self.bootstrap_room,
+                            f"Request {self.bootstrap_room} timed out after {elapsed:.1f}s in KVPoll.WaitingForInput",
+                        )
+                        self.conclude_state = KVPoll.Failed
+                        return KVPoll.Failed
             return status
         else:
             return self.conclude_state

sglang/srt/disaggregation/mooncake/transfer_engine.py CHANGED Viewed

@@ -1,8 +1,8 @@
-import json
 import logging
-from dataclasses import dataclass
 from typing import List, Optional
+from sglang.srt.utils import get_bool_env_var, get_free_port
 logger = logging.getLogger(__name__)
@@ -55,12 +55,21 @@ class MooncakeTransferEngine:
         device_name: Optional[str],
     ) -> None:
         """Initialize the mooncake instance."""
-        ret_value = self.engine.initialize(
-            hostname,
-            "P2PHANDSHAKE",
-            "rdma",
-            device_name if device_name is not None else "",
-        )
+        if get_bool_env_var("ENABLE_ASCEND_TRANSFER_WITH_MOONCAKE", "false"):
+            hostname += f":{get_free_port()}:npu_{self.gpu_id}"
+            ret_value = self.engine.initialize(
+                hostname,
+                "P2PHANDSHAKE",
+                "ascend",
+                device_name if device_name is not None else "",
+            )
+        else:
+            ret_value = self.engine.initialize(
+                hostname,
+                "P2PHANDSHAKE",
+                "rdma",
+                device_name if device_name is not None else "",
+            )
         if ret_value != 0:
             logger.error("Mooncake Transfer Engine initialization failed.")
             raise RuntimeError("Mooncake Transfer Engine initialization failed.")

sglang/srt/disaggregation/utils.py CHANGED Viewed

@@ -15,7 +15,7 @@ import requests
 import torch
 import torch.distributed as dist
-from sglang.srt.utils import get_ip
+from sglang.srt.utils import get_ip, is_npu
 if TYPE_CHECKING:
     from sglang.srt.managers.schedule_batch import Req
@@ -94,8 +94,12 @@ class MetadataBuffers:
         custom_mem_pool: torch.cuda.MemPool = None,
     ):
         self.custom_mem_pool = custom_mem_pool
-        device = "cuda" if self.custom_mem_pool else "cpu"
+        device = "cpu"
+        if is_npu():
+            # For ascend backend, output tokens are placed in the NPU and will be transferred by D2D channel.
+            device = "npu"
+        elif self.custom_mem_pool:
+            device = "cuda"
         with (
             torch.cuda.use_mem_pool(self.custom_mem_pool)
             if self.custom_mem_pool
@@ -200,6 +204,7 @@ class MetadataBuffers:
 class TransferBackend(Enum):
     MOONCAKE = "mooncake"
     NIXL = "nixl"
+    ASCEND = "ascend"
     FAKE = "fake"
@@ -231,6 +236,23 @@ def get_kv_class(transfer_backend: TransferBackend, class_type: KVClassType):
             KVClassType.BOOTSTRAP_SERVER: MooncakeKVBootstrapServer,
         }
         return class_mapping.get(class_type)
+    elif transfer_backend == TransferBackend.ASCEND:
+        from sglang.srt.disaggregation.ascend import (
+            AscendKVBootstrapServer,
+            AscendKVManager,
+            AscendKVReceiver,
+            AscendKVSender,
+        )
+        from sglang.srt.disaggregation.base import KVArgs
+        class_mapping = {
+            KVClassType.KVARGS: KVArgs,
+            KVClassType.MANAGER: AscendKVManager,
+            KVClassType.SENDER: AscendKVSender,
+            KVClassType.RECEIVER: (AscendKVReceiver),
+            KVClassType.BOOTSTRAP_SERVER: AscendKVBootstrapServer,
+        }
+        return class_mapping.get(class_type)
     elif transfer_backend == TransferBackend.NIXL:
         from sglang.srt.disaggregation.base import KVArgs
         from sglang.srt.disaggregation.nixl import (

sglang/srt/entrypoints/engine.py CHANGED Viewed

@@ -650,7 +650,7 @@ def _set_envs_and_config(server_args: ServerArgs):
     if _is_cuda:
         assert_pkg_version(
             "sgl-kernel",
-            "0.2.4",
+            "0.2.5",
             "Please reinstall the latest version with `pip install sgl-kernel --force-reinstall`",
         )

sglang/srt/entrypoints/http_server.py CHANGED Viewed

@@ -418,6 +418,7 @@ async def start_profile_async(obj: Optional[ProfileReqInput] = None):
     await _global_state.tokenizer_manager.start_profile(
         output_dir=obj.output_dir,
+        start_step=obj.start_step,
         num_steps=obj.num_steps,
         activities=obj.activities,
         with_stack=obj.with_stack,

sglang/srt/entrypoints/http_server_engine.py CHANGED Viewed

@@ -1,4 +1,3 @@
-import base64
 import copy
 import dataclasses
 import multiprocessing
@@ -7,6 +6,7 @@ import threading
 import time
 from typing import Any, Dict, List, Optional, Tuple, Union
+import pybase64
 import requests
 import torch
 import torch.distributed as dist

sglang/srt/entrypoints/openai/protocol.py CHANGED Viewed

@@ -267,6 +267,10 @@ class ChatCompletionMessageContentImageURL(BaseModel):
     detail: Optional[Literal["auto", "low", "high"]] = "auto"
+class ChatCompletionMessageContentVideoURL(BaseModel):
+    url: str
 class ChatCompletionMessageContentAudioURL(BaseModel):
     url: str
@@ -277,6 +281,11 @@ class ChatCompletionMessageContentImagePart(BaseModel):
     modalities: Optional[Literal["image", "multi-images", "video"]] = "image"
+class ChatCompletionMessageContentVideoPart(BaseModel):
+    type: Literal["video_url"]
+    video_url: ChatCompletionMessageContentVideoURL
 class ChatCompletionMessageContentAudioPart(BaseModel):
     type: Literal["audio_url"]
     audio_url: ChatCompletionMessageContentAudioURL
@@ -285,6 +294,7 @@ class ChatCompletionMessageContentAudioPart(BaseModel):
 ChatCompletionMessageContentPart = Union[
     ChatCompletionMessageContentTextPart,
     ChatCompletionMessageContentImagePart,
+    ChatCompletionMessageContentVideoPart,
     ChatCompletionMessageContentAudioPart,
 ]
@@ -629,6 +639,7 @@ class MessageProcessingResult:
     prompt_ids: Union[str, List[int]]
     image_data: Optional[Any]
     audio_data: Optional[Any]
+    video_data: Optional[Any]
     modalities: List[str]
     stop: List[str]
     tool_call_constraint: Optional[Any] = None

sglang 0.4.9__py3-none-any.whl → 0.4.9.post2__py3-none-any.whl

sglang 0.4.9py3-none-any.whl → 0.4.9.post2py3-none-any.whl