PyPI - sglang - Versions diffs - 0.4.9__tar.gz → 0.4.9.post2__tar.gz - Mend

sglang 0.4.9tar.gz → 0.4.9.post2tar.gz

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Files changed (751) hide show

{sglang-0.4.9/sglang.egg-info → sglang-0.4.9.post2}/PKG-INFO RENAMED Viewed

@@ -1,6 +1,6 @@
 Metadata-Version: 2.4
 Name: sglang
-Version: 0.4.9
+Version: 0.4.9.post2
 Summary: SGLang is yet another fast serving framework for large language models and vision language models.
 License:                                  Apache License
                                    Version 2.0, January 2004
@@ -239,8 +239,10 @@ Requires-Dist: prometheus-client>=0.20.0; extra == "runtime-common"
 Requires-Dist: psutil; extra == "runtime-common"
 Requires-Dist: pydantic; extra == "runtime-common"
 Requires-Dist: pynvml; extra == "runtime-common"
+Requires-Dist: pybase64; extra == "runtime-common"
 Requires-Dist: python-multipart; extra == "runtime-common"
 Requires-Dist: pyzmq>=25.1.2; extra == "runtime-common"
+Requires-Dist: sentencepiece; extra == "runtime-common"
 Requires-Dist: soundfile==0.13.1; extra == "runtime-common"
 Requires-Dist: scipy; extra == "runtime-common"
 Requires-Dist: torchao==0.9.0; extra == "runtime-common"
@@ -248,10 +250,10 @@ Requires-Dist: transformers==4.53.0; extra == "runtime-common"
 Requires-Dist: timm==1.0.16; extra == "runtime-common"
 Requires-Dist: uvicorn; extra == "runtime-common"
 Requires-Dist: uvloop; extra == "runtime-common"
-Requires-Dist: xgrammar==0.1.19; extra == "runtime-common"
+Requires-Dist: xgrammar==0.1.21; extra == "runtime-common"
 Provides-Extra: srt
 Requires-Dist: sglang[runtime_common]; extra == "srt"
-Requires-Dist: sgl-kernel==0.2.4; extra == "srt"
+Requires-Dist: sgl-kernel==0.2.5; extra == "srt"
 Requires-Dist: torch==2.7.1; extra == "srt"
 Requires-Dist: torchaudio==2.7.1; extra == "srt"
 Requires-Dist: torchvision==0.22.1; extra == "srt"
@@ -419,7 +421,7 @@ Learn more in the release blogs: [v0.2 blog](https://lmsys.org/blog/2024-07-25-s
 [Development Roadmap (2025 H1)](https://github.com/sgl-project/sglang/issues/4042)
 ## Adoption and Sponsorship
-SGLang has been deployed at large scale, generating trillions of tokens in production every day. It is trusted and adopted by a broad range of leading enterprises and institutions, including xAI, NVIDIA, AMD, Google Cloud, Oracle Cloud, LinkedIn, Cursor, Voltage Park, Atlas Cloud, DataCrunch, Baseten, Nebius, Novita, InnoMatrix, RunPod, Stanford, UC Berkeley, UCLA, ETCHED, Jam & Tea Studios, Hyperbolic, as well as major technology organizations across North America and Asia. As an open-source LLM inference engine, SGLang has become the de facto standard in the industry, with production deployments running on over 100,000 GPUs worldwide.
+SGLang has been deployed at large scale, generating trillions of tokens in production each day. It is trusted and adopted by a wide range of leading enterprises and institutions, including xAI, AMD, NVIDIA, Intel, LinkedIn, Cursor, Oracle Cloud, Google Cloud, Microsoft Azure, AWS, Atlas Cloud, Voltage Park, Nebius, DataCrunch, Novita, InnoMatrix, MIT, UCLA, the University of Washington, Stanford, UC Berkeley, Tsinghua University, Jam & Tea Studios, Baseten, and other major technology organizations across North America and Asia. As an open-source LLM inference engine, SGLang has become the de facto industry standard, with deployments running on over 1,000,000 GPUs worldwide.
 <img src="https://raw.githubusercontent.com/sgl-project/sgl-learning-materials/refs/heads/main/slides/adoption.png" alt="logo" width="800" margin="10px"></img>

{sglang-0.4.9 → sglang-0.4.9.post2}/README.md RENAMED Viewed

@@ -65,7 +65,7 @@ Learn more in the release blogs: [v0.2 blog](https://lmsys.org/blog/2024-07-25-s
 [Development Roadmap (2025 H1)](https://github.com/sgl-project/sglang/issues/4042)
 ## Adoption and Sponsorship
-SGLang has been deployed at large scale, generating trillions of tokens in production every day. It is trusted and adopted by a broad range of leading enterprises and institutions, including xAI, NVIDIA, AMD, Google Cloud, Oracle Cloud, LinkedIn, Cursor, Voltage Park, Atlas Cloud, DataCrunch, Baseten, Nebius, Novita, InnoMatrix, RunPod, Stanford, UC Berkeley, UCLA, ETCHED, Jam & Tea Studios, Hyperbolic, as well as major technology organizations across North America and Asia. As an open-source LLM inference engine, SGLang has become the de facto standard in the industry, with production deployments running on over 100,000 GPUs worldwide.
+SGLang has been deployed at large scale, generating trillions of tokens in production each day. It is trusted and adopted by a wide range of leading enterprises and institutions, including xAI, AMD, NVIDIA, Intel, LinkedIn, Cursor, Oracle Cloud, Google Cloud, Microsoft Azure, AWS, Atlas Cloud, Voltage Park, Nebius, DataCrunch, Novita, InnoMatrix, MIT, UCLA, the University of Washington, Stanford, UC Berkeley, Tsinghua University, Jam & Tea Studios, Baseten, and other major technology organizations across North America and Asia. As an open-source LLM inference engine, SGLang has become the de facto industry standard, with deployments running on over 1,000,000 GPUs worldwide.
 <img src="https://raw.githubusercontent.com/sgl-project/sgl-learning-materials/refs/heads/main/slides/adoption.png" alt="logo" width="800" margin="10px"></img>

{sglang-0.4.9 → sglang-0.4.9.post2}/pyproject.toml RENAMED Viewed

@@ -4,7 +4,7 @@ build-backend = "setuptools.build_meta"
 [project]
 name = "sglang"
-version = "0.4.9"
+version = "0.4.9.post2"
 description = "SGLang is yet another fast serving framework for large language models and vision language models."
 readme = "README.md"
 requires-python = ">=3.8"
@@ -38,8 +38,10 @@ runtime_common = [
     "psutil",
     "pydantic",
     "pynvml",
+    "pybase64",
     "python-multipart",
     "pyzmq>=25.1.2",
+    "sentencepiece",
     "soundfile==0.13.1",
     "scipy",
     "torchao==0.9.0",
@@ -47,12 +49,12 @@ runtime_common = [
     "timm==1.0.16",
     "uvicorn",
     "uvloop",
-    "xgrammar==0.1.19",
+    "xgrammar==0.1.21",
 ]
 srt = [
     "sglang[runtime_common]",
-    "sgl-kernel==0.2.4",
+    "sgl-kernel==0.2.5",
     "torch==2.7.1",
     "torchaudio==2.7.1",
     "torchvision==0.22.1",

{sglang-0.4.9 → sglang-0.4.9.post2}/sglang/bench_serving.py RENAMED Viewed

@@ -814,9 +814,9 @@ def sample_mmmu_requests(
         List of tuples (prompt, prompt_token_len, output_token_len).
     """
     try:
-        import base64
         import io
+        import pybase64
         from datasets import load_dataset
     except ImportError:
         raise ImportError("Please install datasets: pip install datasets")
@@ -867,7 +867,7 @@ def sample_mmmu_requests(
                     # Encode image to base64
                     buffered = io.BytesIO()
                     image.save(buffered, format="JPEG")
-                    img_str = base64.b64encode(buffered.getvalue()).decode("utf-8")
+                    img_str = pybase64.b64encode(buffered.getvalue()).decode("utf-8")
                     image_data = f"data:image/jpeg;base64,{img_str}"
                 else:
                     continue

{sglang-0.4.9 → sglang-0.4.9.post2}/sglang/srt/configs/model_config.py RENAMED Viewed

@@ -25,6 +25,7 @@ from transformers import PretrainedConfig
 from sglang.srt.hf_transformers_utils import (
     get_config,
     get_context_length,
+    get_generation_config,
     get_hf_text_config,
 )
 from sglang.srt.layers.quantization import QUANTIZATION_METHODS
@@ -83,6 +84,13 @@ class ModelConfig:
             **kwargs,
         )
+        self.hf_generation_config = get_generation_config(
+            self.model_path,
+            trust_remote_code=trust_remote_code,
+            revision=revision,
+            **kwargs,
+        )
         self.hf_text_config = get_hf_text_config(self.hf_config)
         self.attention_chunk_size = getattr(
             self.hf_text_config, "attention_chunk_size", None
@@ -359,7 +367,17 @@ class ModelConfig:
                 if hf_api.file_exists(self.model_path, "hf_quant_config.json"):
                     quant_cfg = modelopt_quant_config
             elif os.path.exists(os.path.join(self.model_path, "hf_quant_config.json")):
-                quant_cfg = modelopt_quant_config
+                quant_config_file = os.path.join(
+                    self.model_path, "hf_quant_config.json"
+                )
+                with open(quant_config_file) as f:
+                    quant_config_dict = json.load(f)
+                json_quant_configs = quant_config_dict["quantization"]
+                quant_algo = json_quant_configs.get("quant_algo", None)
+                if quant_algo == "MIXED_PRECISION":
+                    quant_cfg = {"quant_method": "w4afp8"}
+                else:
+                    quant_cfg = modelopt_quant_config
         return quant_cfg
     # adapted from https://github.com/vllm-project/vllm/blob/v0.6.4.post1/vllm/config.py
@@ -389,6 +407,7 @@ class ModelConfig:
             "w8a8_fp8",
             "moe_wna16",
             "qoq",
+            "w4afp8",
         ]
         compatible_quantization_methods = {
             "modelopt_fp4": ["modelopt"],
@@ -402,7 +421,9 @@ class ModelConfig:
         quant_cfg = self._parse_quant_hf_config()
         if quant_cfg is not None:
-            quant_method = quant_cfg.get("quant_method", "").lower()
+            quant_method = quant_cfg.get(
+                "quant_method", "" if not self.quantization else self.quantization
+            ).lower()
             # Detect which checkpoint is it
             for _, method in QUANTIZATION_METHODS.items():
@@ -454,6 +475,19 @@ class ModelConfig:
         if eos_ids:
             # it can be either int or list of int
             eos_ids = {eos_ids} if isinstance(eos_ids, int) else set(eos_ids)
+        if eos_ids is None:
+            eos_ids = set()
+        if self.hf_generation_config:
+            generation_eos_ids = getattr(
+                self.hf_generation_config, "eos_token_id", None
+            )
+            if generation_eos_ids:
+                generation_eos_ids = (
+                    {generation_eos_ids}
+                    if isinstance(generation_eos_ids, int)
+                    else set(generation_eos_ids)
+                )
+                eos_ids = eos_ids | generation_eos_ids
         return eos_ids
     def maybe_pull_model_tokenizer_from_remote(self) -> None:

{sglang-0.4.9 → sglang-0.4.9.post2}/sglang/srt/conversation.py RENAMED Viewed

@@ -88,9 +88,11 @@ class Conversation:
     stop_str: Union[str, List[str]] = None
     # The string that represents an image token in the prompt
     image_token: str = "<image>"
+    video_token: str = "<video>"
     audio_token: str = "<audio>"
     image_data: Optional[List[str]] = None
+    video_data: Optional[List[str]] = None
     modalities: Optional[List[str]] = None
     stop_token_ids: Optional[int] = None
@@ -380,11 +382,15 @@ class Conversation:
         self.messages.append([role, message])
     def append_image(self, image: str):
-        """Append a new message."""
+        """Append a new image."""
         self.image_data.append(image)
+    def append_video(self, video: str):
+        """Append a new video."""
+        self.video_data.append(video)
     def append_audio(self, audio: str):
-        """Append a new message."""
+        """Append a new audio."""
         self.audio_data.append(audio)
     def update_last_message(self, message: str):
@@ -433,6 +439,7 @@ class Conversation:
             sep2=self.sep2,
             stop_str=self.stop_str,
             image_token=self.image_token,
+            video_token=self.video_token,
             audio_token=self.audio_token,
         )
@@ -495,8 +502,12 @@ def generate_embedding_convs(
             sep2=conv_template.sep2,
             stop_str=conv_template.stop_str,
             image_data=[],
+            video_data=[],
+            audio_data=[],
             modalities=[],
             image_token=conv_template.image_token,
+            video_token=conv_template.video_token,
+            audio_token=conv_template.audio_token,
         )
         real_content = ""
@@ -557,10 +568,12 @@ def generate_chat_conv(
         sep2=conv.sep2,
         stop_str=conv.stop_str,
         image_data=[],
+        video_data=[],
         audio_data=[],
         modalities=[],
         image_token=conv.image_token,
         audio_token=conv.audio_token,
+        video_token=conv.video_token,
     )
     if isinstance(request.messages, str):
@@ -602,6 +615,7 @@ def generate_chat_conv(
                     image_token = ""
                 audio_token = conv.audio_token
+                video_token = conv.video_token
                 for content in message.content:
                     if content.type == "text":
                         if num_image_url > 16:
@@ -614,6 +628,9 @@ def generate_chat_conv(
                         else:
                             real_content += image_token
                         conv.append_image(content.image_url.url)
+                    elif content.type == "video_url":
+                        real_content += video_token
+                        conv.append_video(content.video_url.url)
                     elif content.type == "audio_url":
                         real_content += audio_token
                         conv.append_audio(content.audio_url.url)
@@ -810,6 +827,7 @@ register_conv_template(
         sep_style=SeparatorStyle.ADD_NEW_LINE_SINGLE,
         stop_str=["<|im_end|>"],
         image_token="<|vision_start|><|image_pad|><|vision_end|>",
+        video_token="<|vision_start|><|video_pad|><|vision_end|>",
     )
 )
@@ -870,6 +888,7 @@ register_conv_template(
         sep_style=SeparatorStyle.ADD_NEW_LINE_SINGLE,
         stop_str=("<|im_end|>", "<|endoftext|>"),
         image_token="(<image>./</image>)",
+        video_token="(<video>./</video>)",
     )
 )
@@ -921,6 +940,19 @@ register_conv_template(
     )
 )
+register_conv_template(
+    Conversation(
+        name="mimo-vl",
+        system_message="You are MiMo, an AI assistant developed by Xiaomi.",
+        system_template="<|im_start|>system\n{system_message}",
+        roles=("<|im_start|>user", "<|im_start|>assistant"),
+        sep="<|im_end|>\n",
+        sep_style=SeparatorStyle.ADD_NEW_LINE_SINGLE,
+        stop_str=["<|im_end|>"],
+        image_token="<|vision_start|><|image_pad|><|vision_end|>",
+    )
+)
 register_conv_template(
     Conversation(
@@ -935,6 +967,19 @@ register_conv_template(
     )
 )
+register_conv_template(
+    Conversation(
+        name="llama_4_vision",
+        system_message="You are a helpful language and vision assistant. You are able to understand the visual content that the user provides, and assist the user with a variety of tasks using natural language.",
+        system_template="<|header_start|>system<|header_end|>\n\n{system_message}<|eot|>",
+        roles=("user", "assistant"),
+        sep_style=SeparatorStyle.LLAMA4,
+        sep="",
+        stop_str="<|eot|>",
+        image_token="<|image|>",
+    )
+)
 @register_conv_template_matching_function
 def match_internvl(model_path: str):
@@ -943,9 +988,11 @@ def match_internvl(model_path: str):
 @register_conv_template_matching_function
-def match_llama_3_vision(model_path: str):
+def match_llama_vision(model_path: str):
     if re.search(r"llama.*3\.2.*vision", model_path, re.IGNORECASE):
         return "llama_3_vision"
+    if re.search(r"llama.*4.*", model_path, re.IGNORECASE):
+        return "llama_4_vision"
 @register_conv_template_matching_function
@@ -1034,3 +1081,9 @@ def match_phi_4_mm(model_path: str):
 def match_vila(model_path: str):
     if re.search(r"vila", model_path, re.IGNORECASE):
         return "chatml"
+@register_conv_template_matching_function
+def match_mimo_vl(model_path: str):
+    if re.search(r"mimo.*vl", model_path, re.IGNORECASE):
+        return "mimo-vl"

sglang-0.4.9.post2/sglang/srt/disaggregation/ascend/__init__.py ADDED Viewed

@@ -0,0 +1,6 @@
+from sglang.srt.disaggregation.ascend.conn import (
+    AscendKVBootstrapServer,
+    AscendKVManager,
+    AscendKVReceiver,
+    AscendKVSender,
+)

sglang-0.4.9.post2/sglang/srt/disaggregation/ascend/conn.py ADDED Viewed

@@ -0,0 +1,44 @@
+import logging
+from sglang.srt.disaggregation.ascend.transfer_engine import AscendTransferEngine
+from sglang.srt.disaggregation.mooncake.conn import (
+    MooncakeKVBootstrapServer,
+    MooncakeKVManager,
+    MooncakeKVReceiver,
+    MooncakeKVSender,
+)
+from sglang.srt.utils import get_local_ip_by_remote
+logger = logging.getLogger(__name__)
+class AscendKVManager(MooncakeKVManager):
+    def init_engine(self):
+        # TransferEngine initialized on ascend.
+        local_ip = get_local_ip_by_remote()
+        self.engine = AscendTransferEngine(
+            hostname=local_ip,
+            npu_id=self.kv_args.gpu_id,
+            disaggregation_mode=self.disaggregation_mode,
+        )
+    def register_buffer_to_engine(self):
+        self.engine.register(
+            self.kv_args.kv_data_ptrs[0], sum(self.kv_args.kv_data_lens)
+        )
+        # The Ascend backend optimize batch registration for small memory blocks.
+        self.engine.batch_register(
+            self.kv_args.aux_data_ptrs, self.kv_args.aux_data_lens
+        )
+class AscendKVSender(MooncakeKVSender):
+    pass
+class AscendKVReceiver(MooncakeKVReceiver):
+    pass
+class AscendKVBootstrapServer(MooncakeKVBootstrapServer):
+    pass

sglang-0.4.9.post2/sglang/srt/disaggregation/ascend/transfer_engine.py ADDED Viewed

@@ -0,0 +1,58 @@
+import logging
+import os
+from typing import List, Optional
+from sglang.srt.disaggregation.mooncake.transfer_engine import MooncakeTransferEngine
+from sglang.srt.disaggregation.utils import DisaggregationMode
+logger = logging.getLogger(__name__)
+class AscendTransferEngine(MooncakeTransferEngine):
+    def __init__(
+        self, hostname: str, npu_id: int, disaggregation_mode: DisaggregationMode
+    ):
+        try:
+            from mf_adapter import TransferEngine
+        except ImportError as e:
+            raise ImportError(
+                "Please install mf_adapter, for details, see docs/backend/pd_disaggregation.md"
+            ) from e
+        self.engine = TransferEngine()
+        self.hostname = hostname
+        self.npu_id = npu_id
+        # Centralized storage address of the AscendTransferEngine
+        self.store_url = os.getenv("ASCEND_MF_STORE_URL")
+        if disaggregation_mode == DisaggregationMode.PREFILL:
+            self.role = "Prefill"
+        elif disaggregation_mode == DisaggregationMode.DECODE:
+            self.role = "Decode"
+        else:
+            logger.error(f"Unsupported DisaggregationMode: {disaggregation_mode}")
+            raise ValueError(f"Unsupported DisaggregationMode: {disaggregation_mode}")
+        self.session_id = f"{self.hostname}:{self.engine.get_rpc_port()}"
+        self.initialize()
+    def initialize(self) -> None:
+        """Initialize the ascend transfer instance."""
+        ret_value = self.engine.initialize(
+            self.store_url,
+            self.session_id,
+            self.role,
+            self.npu_id,
+        )
+        if ret_value != 0:
+            logger.error("Ascend Transfer Engine initialization failed.")
+            raise RuntimeError("Ascend Transfer Engine initialization failed.")
+    def batch_register(self, ptrs: List[int], lengths: List[int]):
+        try:
+            ret_value = self.engine.batch_register_memory(ptrs, lengths)
+        except Exception:
+            # Mark register as failed
+            ret_value = -1
+        if ret_value != 0:
+            logger.debug(f"Ascend memory registration for ptr {ptrs} failed.")

{sglang-0.4.9 → sglang-0.4.9.post2}/sglang/srt/disaggregation/mooncake/conn.py RENAMED Viewed

@@ -132,13 +132,9 @@ class MooncakeKVManager(BaseKVManager):
     ):
         self.kv_args = args
         self.local_ip = get_local_ip_auto()
-        self.engine = MooncakeTransferEngine(
-            hostname=self.local_ip,
-            gpu_id=self.kv_args.gpu_id,
-            ib_device=self.kv_args.ib_device,
-        )
         self.is_mla_backend = is_mla_backend
         self.disaggregation_mode = disaggregation_mode
+        self.init_engine()
         # for p/d multi node infer
         self.bootstrap_port = server_args.disaggregation_bootstrap_port
         self.dist_init_addr = server_args.dist_init_addr
@@ -185,9 +181,11 @@ class MooncakeKVManager(BaseKVManager):
                 threading.Thread(
                     target=self.transfer_worker, args=(queue, executor), daemon=True
                 ).start()
-            self.bootstrap_time_out = get_int_env_var(
-                "SGLANG_DISAGGREGATION_BOOTSTRAP_TIMEOUT", 120
+            # If a timeout happens on the prefill side, it means prefill instances
+            # fail to receive the KV indices from the decode instance of this request.
+            # These timeout requests should be aborted to release the tree cache.
+            self.bootstrap_timeout = get_int_env_var(
+                "SGLANG_DISAGGREGATION_BOOTSTRAP_TIMEOUT", 300
             )
         elif self.disaggregation_mode == DisaggregationMode.DECODE:
             self.heartbeat_failures = {}
@@ -209,6 +207,12 @@ class MooncakeKVManager(BaseKVManager):
             self.connection_pool: Dict[str, Dict[str, Union[str, int]]] = {}
             self.prefill_tp_size_table: Dict[str, int] = {}
             self.prefill_dp_size_table: Dict[str, int] = {}
+            # If a timeout happens on the decode side, it means decode instances
+            # fail to receive the KV Cache transfer done signal after bootstrapping.
+            # These timeout requests should be aborted to release the tree cache.
+            self.waiting_timeout = get_int_env_var(
+                "SGLANG_DISAGGREGATION_WAITING_TIMEOUT", 300
+            )
         else:
             raise ValueError(
                 f"Unsupported DisaggregationMode: {self.disaggregation_mode}"
@@ -217,6 +221,13 @@ class MooncakeKVManager(BaseKVManager):
         self.failure_records: Dict[int, str] = {}
         self.failure_lock = threading.Lock()
+    def init_engine(self):
+        self.engine = MooncakeTransferEngine(
+            hostname=self.local_ip,
+            gpu_id=self.kv_args.gpu_id,
+            ib_device=self.kv_args.ib_device,
+        )
     def register_buffer_to_engine(self):
         for kv_data_ptr, kv_data_len in zip(
             self.kv_args.kv_data_ptrs, self.kv_args.kv_data_lens
@@ -259,19 +270,17 @@ class MooncakeKVManager(BaseKVManager):
         # Worker function for processing a single layer
         def process_layer(src_ptr: int, dst_ptr: int, item_len: int) -> int:
-            src_addr_list = []
-            dst_addr_list = []
-            length_list = []
             for prefill_index, decode_index in zip(prefill_kv_blocks, dst_kv_blocks):
                 src_addr = src_ptr + int(prefill_index[0]) * item_len
                 dst_addr = dst_ptr + int(decode_index[0]) * item_len
                 length = item_len * len(prefill_index)
-                src_addr_list.append(src_addr)
-                dst_addr_list.append(dst_addr)
-                length_list.append(length)
-            return self.engine.batch_transfer_sync(
-                mooncake_session_id, src_addr_list, dst_addr_list, length_list
-            )
+                status = self.engine.transfer_sync(
+                    mooncake_session_id, src_addr, dst_addr, length
+                )
+                if status != 0:
+                    return status
+            return 0
         futures = [
             executor.submit(
@@ -938,7 +947,12 @@ class MooncakeKVSender(BaseKVSender):
                 if self.init_time is not None:
                     now = time.time()
                     elapsed = now - self.init_time
-                    if elapsed >= self.kv_mgr.bootstrap_time_out:
+                    if elapsed >= self.kv_mgr.bootstrap_timeout:
+                        logger.warning_once(
+                            "Some requests timed out when bootstrapping, "
+                            "which means prefill instances fail to receive the KV indices from the decode instance of this request. "
+                            "If a greater mean TTFT is acceptable, you can 'export SGLANG_DISAGGREGATION_BOOTSTRAP_TIMEOUT=600' (10 minutes) to relax the timeout condition. "
+                        )
                         self.kv_mgr.record_failure(
                             self.bootstrap_room,
                             f"Request {self.bootstrap_room} timed out after {elapsed:.1f}s in KVPoll.Bootstrapping",
@@ -987,6 +1001,7 @@ class MooncakeKVReceiver(BaseKVReceiver):
         self.session_id = self.kv_mgr.get_session_id()
         self.kv_mgr.update_status(self.bootstrap_room, KVPoll.Bootstrapping)
         self.conclude_state = None
+        self.init_time = None
         self.data_parallel_rank = data_parallel_rank
         if self.bootstrap_addr not in self.kv_mgr.prefill_dp_size_table:
@@ -1222,14 +1237,31 @@ class MooncakeKVReceiver(BaseKVReceiver):
                         str(self.required_dst_info_num).encode("ascii"),
                     ]
                 )
+        self.init_time = time.time()
     def poll(self) -> KVPoll:
         if self.conclude_state is None:
             status = self.kv_mgr.check_status(self.bootstrap_room)
             if status in (KVPoll.Success, KVPoll.Failed):
                 self.conclude_state = status
+            elif status == KVPoll.WaitingForInput:
+                if self.init_time is not None:
+                    now = time.time()
+                    elapsed = now - self.init_time
+                    if elapsed >= self.kv_mgr.waiting_timeout:
+                        logger.warning_once(
+                            "Some requests fail to receive KV Cache transfer done signal after bootstrapping. "
+                            "If a greater mean TTFT is acceptable, you can 'export SGLANG_DISAGGREGATION_WAITING_TIMEOUT=600' (10 minutes) to relax the timeout condition. "
+                        )
+                        self.kv_mgr.record_failure(
+                            self.bootstrap_room,
+                            f"Request {self.bootstrap_room} timed out after {elapsed:.1f}s in KVPoll.WaitingForInput",
+                        )
+                        self.conclude_state = KVPoll.Failed
+                        return KVPoll.Failed
             return status
         else:
             return self.conclude_state

{sglang-0.4.9 → sglang-0.4.9.post2}/sglang/srt/disaggregation/mooncake/transfer_engine.py RENAMED Viewed

@@ -1,8 +1,8 @@
-import json
 import logging
-from dataclasses import dataclass
 from typing import List, Optional
+from sglang.srt.utils import get_bool_env_var, get_free_port
 logger = logging.getLogger(__name__)
@@ -55,12 +55,21 @@ class MooncakeTransferEngine:
         device_name: Optional[str],
     ) -> None:
         """Initialize the mooncake instance."""
-        ret_value = self.engine.initialize(
-            hostname,
-            "P2PHANDSHAKE",
-            "rdma",
-            device_name if device_name is not None else "",
-        )
+        if get_bool_env_var("ENABLE_ASCEND_TRANSFER_WITH_MOONCAKE", "false"):
+            hostname += f":{get_free_port()}:npu_{self.gpu_id}"
+            ret_value = self.engine.initialize(
+                hostname,
+                "P2PHANDSHAKE",
+                "ascend",
+                device_name if device_name is not None else "",
+            )
+        else:
+            ret_value = self.engine.initialize(
+                hostname,
+                "P2PHANDSHAKE",
+                "rdma",
+                device_name if device_name is not None else "",
+            )
         if ret_value != 0:
             logger.error("Mooncake Transfer Engine initialization failed.")
             raise RuntimeError("Mooncake Transfer Engine initialization failed.")

sglang 0.4.9__tar.gz → 0.4.9.post2__tar.gz

sglang 0.4.9tar.gz → 0.4.9.post2tar.gz