PyPI - sglang - Versions diffs - 0.4.6.post5__py3-none-any.whl → 0.4.7.post1__py3-none-any.whl - Mend

sglang 0.4.6.post5__py3-none-any.whl → 0.4.7.post1__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Files changed (359) hide show

sglang/srt/conversation.py CHANGED Viewed

@@ -21,6 +21,7 @@ from enum import IntEnum, auto
 from typing import Callable, Dict, List, Optional, Tuple, Union
 from sglang.srt.openai_api.protocol import ChatCompletionRequest
+from sglang.srt.utils import read_system_prompt_from_file
 class SeparatorStyle(IntEnum):
@@ -561,14 +562,11 @@ def generate_chat_conv(
                     if content.type == "image_url":
                         num_image_url += 1
                         conv.modalities.append(content.modalities)
-                if num_image_url > 1:
-                    image_token = conv.image_token
-                else:
-                    image_token = (
-                        conv.image_token + "\n"
-                        if conv.name != "qwen2-vl"
-                        else conv.image_token
-                    )
+                image_token = (
+                    conv.image_token + "\n"
+                    if conv.name != "qwen2-vl"
+                    else conv.image_token
+                )
                 add_token_as_needed: bool = (
                     conv.name in _MODELS_REQUIRING_MODALITY_SUPPLEMENT
                 )
@@ -648,6 +646,20 @@ register_conv_template(
     )
 )
+register_conv_template(
+    Conversation(
+        name="devstral",
+        system_template="[SYSTEM_PROMPT]\n{system_message}\n[/SYSTEM_PROMPT]\n\n",
+        system_message=read_system_prompt_from_file("mistralai/Devstral-Small-2505"),
+        roles=("[INST]", "[/INST]"),
+        sep_style=SeparatorStyle.LLAMA2,
+        sep=" ",
+        sep2=" </s><s>",
+        stop_str=["[INST]", "[/INST]", "[SYSTEM_PROMPT]", "[/SYSTEM_PROMPT]"],
+        image_token="[IMG]",
+    )
+)
 # reference: https://huggingface.co/meta-llama/Llama-4-Scout-17B-16E-Instruct/blob/main/chat_template.json
 register_conv_template(
     Conversation(
@@ -661,6 +673,20 @@ register_conv_template(
     )
 )
+# TODO (lifuhuang): Refactor BaseMultimodalProcessor to support the default image token "<|image_{index}|>" in the future.
+register_conv_template(
+    Conversation(
+        name="phi-4-mm",
+        system_message="",
+        system_template="{system_message}",
+        roles=("<|user|>", "<|assistant|>"),
+        sep_style=SeparatorStyle.NO_COLON_SINGLE,
+        sep="<|end|>",
+        stop_str="<|end|>",
+        image_token="<|endoftext10|>",
+    )
+)
 register_conv_template(
     Conversation(
         name="chatml",
@@ -945,3 +971,21 @@ def match_openbmb_minicpm(model_path: str):
 def match_moonshot_kimivl(model_path: str):
     if re.search(r"kimi.*vl", model_path, re.IGNORECASE):
         return "kimi-vl"
+@register_conv_template_matching_function
+def match_devstral(model_path: str):
+    if re.search(r"devstral", model_path, re.IGNORECASE):
+        return "devstral"
+@register_conv_template_matching_function
+def match_phi_4_mm(model_path: str):
+    if "phi-4-multimodal" in model_path.lower():
+        return "phi-4-mm"
+@register_conv_template_matching_function
+def match_vila(model_path: str):
+    if re.search(r"vila", model_path, re.IGNORECASE):
+        return "chatml"

sglang/srt/custom_op.py CHANGED Viewed

@@ -1,6 +1,3 @@
-from typing import Optional
-import torch
 from torch import nn
 from sglang.srt.utils import is_cuda, is_hip
@@ -14,6 +11,44 @@ class CustomOp(nn.Module):
         super().__init__()
         self._forward_method = self.dispatch_forward()
+        # States for torch.compile
+        self._original_forward_method = None
+        self.is_torch_compile = False
+    def enter_torch_compile(self, num_tokens: int):
+        # Skip if Op is already entered compile mode.
+        # NOTE(alcanderian): Some Ops(for example RotaryEmbedding) will be reused
+        # among layers and `enter_torch_compile` will be called many times.
+        # We should prevent `self._original_forward_method` from being overridden when
+        # it is not the first time `enter_torch_compile` called.
+        if self.is_torch_compile:
+            return
+        self._original_forward_method = self._forward_method
+        # NOTE: Temporarily workaround MoE
+        if "FusedMoE" in self.__class__.__name__:
+            if num_tokens == 1:
+                from sglang.srt.layers.moe.fused_moe_native import (
+                    fused_moe_forward_native,
+                )
+                # The performance of torch.compile on this layer is not always good when bs > 1,
+                # so we decide to only use torch.compile when bs =1
+                self._forward_method = fused_moe_forward_native
+        else:
+            self._forward_method = self.forward_native
+        self.is_torch_compile = True
+    def leave_torch_compile(self):
+        # Skip if Op is already exited compile mode.
+        if not self.is_torch_compile:
+            return
+        self._forward_method = self._original_forward_method
+        self._original_forward_method = None
+        self.is_torch_compile = False
+    # Please do not override this method, because `self._forward_method` can change when in torch compile mode
     def forward(self, *args, **kwargs):
         return self._forward_method(*args, **kwargs)

sglang/srt/debug_utils.py ADDED Viewed

@@ -0,0 +1,74 @@
+import os
+import time
+from pathlib import Path
+import torch
+from sglang.srt.utils import get_bool_env_var
+class _Dumper:
+    """Utility to dump tensors, which can be useful when comparison checking models.
+    Example usage:
+    debug_utils.dumper.dump("layer_start_hidden_states", hidden_states, layer_id=self.layer_id)
+    """
+    def __init__(self):
+        self._enable = get_bool_env_var("SGLANG_DUMPER_ENABLE", "true")
+        self._base_dir = Path(os.environ.get("SGLANG_DUMPER_DIR", "/tmp"))
+        self._enable_write_file = get_bool_env_var("SGLANG_DUMPER_WRITE_FILE", "1")
+        self._partial_name = str(time.time())
+        self.forward_pass_id = None
+    def dump(self, name, value, **kwargs):
+        if not self._enable:
+            return
+        from sglang.srt.distributed import get_tensor_model_parallel_rank
+        rank = get_tensor_model_parallel_rank()
+        full_kwargs = dict(
+            forward_pass_id=self.forward_pass_id,
+            name=name,
+            **kwargs,
+        )
+        full_filename = "___".join(f"{k}={v}" for k, v in full_kwargs.items()) + ".pt"
+        path = (
+            self._base_dir / f"sglang_dump_{self._partial_name}_{rank}" / full_filename
+        )
+        sample_value = self._get_sample_value(name, value)
+        print(
+            f"[{rank}, {time.time()}] {path} "
+            f"type={type(value)} "
+            f"shape={value.shape if isinstance(value, torch.Tensor) else None} "
+            f"dtype={value.dtype if isinstance(value, torch.Tensor) else None} "
+            f"sample_value={sample_value}"
+        )
+        if self._enable_write_file:
+            path.parent.mkdir(parents=True, exist_ok=True)
+            torch.save(value, str(path))
+    def _get_sample_value(self, name, value):
+        if value is None:
+            return None
+        if isinstance(value, tuple):
+            return [self._get_sample_value(name, x) for x in value]
+        if not isinstance(value, torch.Tensor):
+            return None
+        if value.numel() < 200:
+            return value
+        slices = [
+            slice(0, 5) if dim_size > 200 else slice(None) for dim_size in value.shape
+        ]
+        return value[tuple(slices)]
+dumper = _Dumper()

sglang/srt/disaggregation/base/__init__.py CHANGED Viewed

@@ -1,4 +1,4 @@
-from .conn import (
+from sglang.srt.disaggregation.base.conn import (
     BaseKVBootstrapServer,
     BaseKVManager,
     BaseKVReceiver,

sglang/srt/disaggregation/base/conn.py CHANGED Viewed

@@ -1,23 +1,32 @@
+from __future__ import annotations
 from abc import ABC, abstractmethod
-from typing import Optional
+from typing import TYPE_CHECKING, List, Optional
 import numpy as np
 import numpy.typing as npt
-from sglang.srt.disaggregation.utils import DisaggregationMode
 from sglang.srt.server_args import ServerArgs
+if TYPE_CHECKING:
+    from sglang.srt.disaggregation.utils import DisaggregationMode
 class KVArgs:
     engine_rank: int
-    kv_data_ptrs: list[int]
-    kv_data_lens: list[int]
-    kv_item_lens: list[int]
-    aux_data_ptrs: list[int]
-    aux_data_lens: list[int]
-    aux_item_lens: list[int]
+    kv_data_ptrs: List[int]
+    kv_data_lens: List[int]
+    kv_item_lens: List[int]
+    aux_data_ptrs: List[int]
+    aux_data_lens: List[int]
+    aux_item_lens: List[int]
     ib_device: str
+    ib_traffic_class: str
     gpu_id: int
+    # for different tp
+    decode_tp_size: int
+    # for pp prefill
+    prefill_pp_size: int
 class KVPoll:
@@ -45,7 +54,12 @@ class BaseKVSender(ABC):
     @abstractmethod
     def __init__(
-        self, mgr: BaseKVManager, bootstrap_addr: str, bootstrap_room: int
+        self,
+        mgr: BaseKVManager,
+        bootstrap_addr: str,
+        bootstrap_room: int,
+        dest_tp_ranks: List[int],
+        pp_rank: int,
     ): ...
     @abstractmethod
@@ -56,7 +70,7 @@ class BaseKVSender(ABC):
         ...
     @abstractmethod
-    def send(self, kv_indices: npt.NDArray[np.int64]):
+    def send(self, kv_indices: npt.NDArray[np.int32]):
         """
         Send the kv cache at the given kv indices to the decoder server
         """
@@ -88,7 +102,7 @@ class BaseKVReceiver(ABC):
     ): ...
     @abstractmethod
-    def init(self, kv_indices: npt.NDArray[np.int64], aux_index: Optional[int] = None):
+    def init(self, kv_indices: npt.NDArray[np.int32], aux_index: Optional[int] = None):
         """
         Notify the prefill server about the kv indices and aux index
         """

sglang/srt/disaggregation/common/__init__.py ADDED Viewed

@@ -0,0 +1,5 @@
+from sglang.srt.disaggregation.common.conn import (
+    CommonKVBootstrapServer,
+    CommonKVManager,
+    CommonKVReceiver,
+)

sglang 0.4.6.post5__py3-none-any.whl → 0.4.7.post1__py3-none-any.whl

sglang 0.4.6.post5__py3-none-any.whl → 0.4.7.post1__py3-none-any.whl