sglang 0.4.3.post4__py3-none-any.whl → 0.4.4.post1__py3-none-any.whl
This diff shows the changes between publicly released versions of this package as they appear in their respective public registries. It is provided for informational purposes only.
- sglang/bench_serving.py +1 -1
- sglang/lang/chat_template.py +29 -0
- sglang/srt/_custom_ops.py +19 -17
- sglang/srt/configs/__init__.py +2 -0
- sglang/srt/configs/janus_pro.py +629 -0
- sglang/srt/configs/model_config.py +24 -14
- sglang/srt/conversation.py +80 -2
- sglang/srt/custom_op.py +64 -3
- sglang/srt/distributed/device_communicators/custom_all_reduce.py +18 -17
- sglang/srt/distributed/parallel_state.py +10 -1
- sglang/srt/entrypoints/engine.py +5 -3
- sglang/srt/entrypoints/http_server.py +1 -1
- sglang/srt/function_call_parser.py +33 -2
- sglang/srt/hf_transformers_utils.py +16 -1
- sglang/srt/layers/attention/flashinfer_backend.py +1 -1
- sglang/srt/layers/attention/flashinfer_mla_backend.py +317 -57
- sglang/srt/layers/attention/triton_backend.py +1 -3
- sglang/srt/layers/attention/triton_ops/decode_attention.py +6 -6
- sglang/srt/layers/attention/triton_ops/double_sparsity_attention.py +3 -3
- sglang/srt/layers/attention/triton_ops/extend_attention.py +4 -4
- sglang/srt/layers/attention/triton_ops/rocm_mla_decode_rope.py +3 -3
- sglang/srt/layers/attention/vision.py +43 -62
- sglang/srt/layers/dp_attention.py +30 -2
- sglang/srt/layers/elementwise.py +411 -0
- sglang/srt/layers/linear.py +1 -1
- sglang/srt/layers/logits_processor.py +1 -0
- sglang/srt/layers/moe/ep_moe/kernels.py +2 -1
- sglang/srt/layers/moe/ep_moe/layer.py +25 -9
- sglang/srt/layers/moe/fused_moe_triton/configs/E=160,N=192,device_name=NVIDIA_A800-SXM4-80GB.json +146 -0
- sglang/srt/layers/moe/fused_moe_triton/configs/E=256,N=128,device_name=NVIDIA_A100-SXM4-80GB,dtype=int8_w8a8.json +146 -0
- sglang/srt/layers/moe/fused_moe_triton/configs/E=256,N=128,device_name=NVIDIA_A800-SXM4-80GB,dtype=int8_w8a8,block_shape=[128, 128].json +146 -0
- sglang/srt/layers/moe/fused_moe_triton/configs/E=256,N=128,device_name=NVIDIA_A800-SXM4-80GB,dtype=int8_w8a8.json +146 -0
- sglang/srt/layers/moe/fused_moe_triton/configs/E=256,N=256,device_name=NVIDIA_H20,dtype=int8_w8a8,block_shape=[128, 128].json +146 -0
- sglang/srt/layers/moe/fused_moe_triton/configs/E=256,N=64,device_name=NVIDIA_A800-SXM4-80GB.json +146 -0
- sglang/srt/layers/moe/fused_moe_triton/fused_moe.py +63 -23
- sglang/srt/layers/moe/fused_moe_triton/layer.py +16 -4
- sglang/srt/layers/moe/router.py +342 -0
- sglang/srt/layers/parameter.py +10 -0
- sglang/srt/layers/quantization/__init__.py +90 -68
- sglang/srt/layers/quantization/blockwise_int8.py +1 -2
- sglang/srt/layers/quantization/configs/N=1536,K=1536,device_name=NVIDIA_A800-SXM4-80GB,dtype=int8_w8a8,block_shape=[128, 128].json +146 -0
- sglang/srt/layers/quantization/configs/N=1536,K=7168,device_name=NVIDIA_A800-SXM4-80GB,dtype=int8_w8a8,block_shape=[128, 128].json +146 -0
- sglang/srt/layers/quantization/configs/N=2048,K=512,device_name=NVIDIA_A800-SXM4-80GB,dtype=int8_w8a8,block_shape=[128, 128].json +146 -0
- sglang/srt/layers/quantization/configs/N=2304,K=7168,device_name=NVIDIA_A800-SXM4-80GB,dtype=int8_w8a8,block_shape=[128, 128].json +146 -0
- sglang/srt/layers/quantization/configs/N=24576,K=7168,device_name=NVIDIA_A800-SXM4-80GB,dtype=int8_w8a8,block_shape=[128, 128].json +146 -0
- sglang/srt/layers/quantization/configs/N=24576,K=7168,device_name=NVIDIA_H20,dtype=int8_w8a8,block_shape=[128, 128].json +146 -0
- sglang/srt/layers/quantization/configs/N=256,K=7168,device_name=NVIDIA_A800-SXM4-80GB,dtype=int8_w8a8,block_shape=[128, 128].json +146 -0
- sglang/srt/layers/quantization/configs/N=3072,K=1536,device_name=NVIDIA_H20,dtype=int8_w8a8,block_shape=[128, 128].json +146 -0
- sglang/srt/layers/quantization/configs/N=3072,K=7168,device_name=NVIDIA_H20,dtype=int8_w8a8,block_shape=[128, 128].json +146 -0
- sglang/srt/layers/quantization/configs/N=32768,K=512,device_name=NVIDIA_A800-SXM4-80GB,dtype=int8_w8a8,block_shape=[128, 128].json +146 -0
- sglang/srt/layers/quantization/configs/N=32768,K=512,device_name=NVIDIA_H20,dtype=int8_w8a8,block_shape=[128, 128].json +146 -0
- sglang/srt/layers/quantization/configs/N=4096,K=512,device_name=NVIDIA_H20,dtype=int8_w8a8,block_shape=[128, 128].json +146 -0
- sglang/srt/layers/quantization/configs/N=4608,K=7168,device_name=NVIDIA_H20,dtype=int8_w8a8,block_shape=[128, 128].json +146 -0
- sglang/srt/layers/quantization/configs/N=512,K=7168,device_name=NVIDIA_H20,dtype=int8_w8a8,block_shape=[128, 128].json +146 -0
- sglang/srt/layers/quantization/configs/N=576,K=7168,device_name=NVIDIA_A800-SXM4-80GB,dtype=int8_w8a8,block_shape=[128, 128].json +146 -0
- sglang/srt/layers/quantization/configs/N=576,K=7168,device_name=NVIDIA_H20,dtype=int8_w8a8,block_shape=[128, 128].json +146 -0
- sglang/srt/layers/quantization/configs/N=7168,K=1024,device_name=NVIDIA_A800-SXM4-80GB,dtype=int8_w8a8,block_shape=[128, 128].json +146 -0
- sglang/srt/layers/quantization/configs/N=7168,K=1152,device_name=NVIDIA_A800-SXM4-80GB,dtype=int8_w8a8,block_shape=[128, 128].json +146 -0
- sglang/srt/layers/quantization/configs/N=7168,K=128,device_name=NVIDIA_A800-SXM4-80GB,dtype=int8_w8a8,block_shape=[128, 128].json +146 -0
- sglang/srt/layers/quantization/configs/N=7168,K=16384,device_name=NVIDIA_A800-SXM4-80GB,dtype=int8_w8a8,block_shape=[128, 128].json +146 -0
- sglang/srt/layers/quantization/configs/N=7168,K=16384,device_name=NVIDIA_H20,dtype=int8_w8a8,block_shape=[128, 128].json +146 -0
- sglang/srt/layers/quantization/configs/N=7168,K=18432,device_name=NVIDIA_A800-SXM4-80GB,dtype=int8_w8a8,block_shape=[128, 128].json +146 -0
- sglang/srt/layers/quantization/configs/N=7168,K=18432,device_name=NVIDIA_H20,dtype=int8_w8a8,block_shape=[128, 128].json +146 -0
- sglang/srt/layers/quantization/configs/N=7168,K=2048,device_name=NVIDIA_H20,dtype=int8_w8a8,block_shape=[128, 128].json +146 -0
- sglang/srt/layers/quantization/configs/N=7168,K=2304,device_name=NVIDIA_H20,dtype=int8_w8a8,block_shape=[128, 128].json +146 -0
- sglang/srt/layers/quantization/configs/N=7168,K=256,device_name=NVIDIA_H20,dtype=int8_w8a8,block_shape=[128, 128].json +146 -0
- sglang/srt/layers/quantization/fp8.py +174 -106
- sglang/srt/layers/quantization/fp8_kernel.py +210 -38
- sglang/srt/layers/quantization/fp8_utils.py +156 -15
- sglang/srt/layers/quantization/modelopt_quant.py +5 -1
- sglang/srt/layers/quantization/w8a8_fp8.py +128 -0
- sglang/srt/layers/quantization/w8a8_int8.py +152 -3
- sglang/srt/layers/rotary_embedding.py +5 -3
- sglang/srt/layers/sampler.py +29 -35
- sglang/srt/layers/vocab_parallel_embedding.py +0 -1
- sglang/srt/lora/backend/__init__.py +9 -12
- sglang/srt/managers/cache_controller.py +74 -8
- sglang/srt/managers/data_parallel_controller.py +1 -1
- sglang/srt/managers/image_processor.py +37 -631
- sglang/srt/managers/image_processors/base_image_processor.py +219 -0
- sglang/srt/managers/image_processors/janus_pro.py +79 -0
- sglang/srt/managers/image_processors/llava.py +152 -0
- sglang/srt/managers/image_processors/minicpmv.py +86 -0
- sglang/srt/managers/image_processors/mlama.py +60 -0
- sglang/srt/managers/image_processors/qwen_vl.py +161 -0
- sglang/srt/managers/io_struct.py +32 -15
- sglang/srt/managers/multi_modality_padding.py +134 -0
- sglang/srt/managers/schedule_batch.py +213 -118
- sglang/srt/managers/schedule_policy.py +40 -8
- sglang/srt/managers/scheduler.py +176 -683
- sglang/srt/managers/scheduler_output_processor_mixin.py +614 -0
- sglang/srt/managers/tokenizer_manager.py +6 -6
- sglang/srt/managers/tp_worker_overlap_thread.py +4 -1
- sglang/srt/mem_cache/base_prefix_cache.py +6 -8
- sglang/srt/mem_cache/chunk_cache.py +12 -44
- sglang/srt/mem_cache/hiradix_cache.py +71 -34
- sglang/srt/mem_cache/memory_pool.py +81 -17
- sglang/srt/mem_cache/paged_allocator.py +283 -0
- sglang/srt/mem_cache/radix_cache.py +117 -36
- sglang/srt/model_executor/cuda_graph_runner.py +68 -20
- sglang/srt/model_executor/forward_batch_info.py +23 -10
- sglang/srt/model_executor/model_runner.py +63 -63
- sglang/srt/model_loader/loader.py +2 -1
- sglang/srt/model_loader/weight_utils.py +1 -1
- sglang/srt/models/deepseek_janus_pro.py +2127 -0
- sglang/srt/models/deepseek_nextn.py +23 -3
- sglang/srt/models/deepseek_v2.py +200 -191
- sglang/srt/models/grok.py +374 -119
- sglang/srt/models/minicpmv.py +28 -89
- sglang/srt/models/mllama.py +1 -1
- sglang/srt/models/qwen2.py +0 -1
- sglang/srt/models/qwen2_5_vl.py +25 -50
- sglang/srt/models/qwen2_vl.py +33 -49
- sglang/srt/openai_api/adapter.py +59 -35
- sglang/srt/openai_api/protocol.py +8 -1
- sglang/srt/sampling/penaltylib/frequency_penalty.py +0 -1
- sglang/srt/sampling/penaltylib/presence_penalty.py +0 -1
- sglang/srt/server_args.py +24 -16
- sglang/srt/speculative/eagle_worker.py +75 -39
- sglang/srt/utils.py +104 -9
- sglang/test/runners.py +104 -10
- sglang/test/test_block_fp8.py +106 -16
- sglang/test/test_custom_ops.py +88 -0
- sglang/test/test_utils.py +20 -4
- sglang/utils.py +0 -4
- sglang/version.py +1 -1
- {sglang-0.4.3.post4.dist-info → sglang-0.4.4.post1.dist-info}/METADATA +9 -10
- {sglang-0.4.3.post4.dist-info → sglang-0.4.4.post1.dist-info}/RECORD +131 -84
- {sglang-0.4.3.post4.dist-info → sglang-0.4.4.post1.dist-info}/WHEEL +1 -1
- {sglang-0.4.3.post4.dist-info → sglang-0.4.4.post1.dist-info}/LICENSE +0 -0
- {sglang-0.4.3.post4.dist-info → sglang-0.4.4.post1.dist-info}/top_level.txt +0 -0
sglang/srt/conversation.py
CHANGED
@@ -44,6 +44,7 @@ class SeparatorStyle(IntEnum):
     CHATGLM3 = auto()
     DEEPSEEK_CHAT = auto()
     METAMATH = auto()
+    QWEN2_VL_EMBED = auto()
 
 
 @dataclasses.dataclass
@@ -110,6 +111,15 @@ class Conversation:
                 else:
                     ret += role + "\n"
             return ret
+        elif self.sep_style == SeparatorStyle.QWEN2_VL_EMBED:
+            ret = "" if system_prompt == "" else system_prompt + self.sep
+            for role, message in self.messages:
+                if message:
+                    ret += role + "\n" + message + self.sep
+                else:
+                    ret += role + "\n"
+            ret += self.stop_str
+            return ret
         elif self.sep_style == SeparatorStyle.NO_COLON_SINGLE:
             ret = system_prompt
             for role, message in self.messages:
@@ -181,7 +191,7 @@ class Conversation:
 
             for i, (role, message) in enumerate(self.messages):
                 if i % 2 == 0:
-                    ret += f"[Round {i//2 + round_add_n}]{self.sep}"
+                    ret += f"[Round {i // 2 + round_add_n}]{self.sep}"
 
                 if message:
                     ret += f"{role}:{message}{self.sep}"
@@ -366,6 +376,46 @@ def chat_template_exists(template_name: str) -> bool:
     return template_name in chat_templates
 
 
+def generate_embedding_convs(
+    texts: List[str], images: List[str], template_name: str
+) -> List[Conversation]:
+    conv_template = chat_templates[template_name].copy()
+    convs = []
+    for text, image in zip(texts, images):
+        conv = Conversation(
+            name=conv_template.name,
+            system_template=conv_template.system_template,
+            system_message=conv_template.system_message,
+            roles=conv_template.roles,
+            messages=list(conv_template.messages),  # prevent in-place modification
+            offset=conv_template.offset,
+            sep_style=SeparatorStyle(conv_template.sep_style),
+            sep=conv_template.sep,
+            sep2=conv_template.sep2,
+            stop_str=conv_template.stop_str,
+            image_data=[],
+            modalities=[],
+            image_token=conv_template.image_token,
+        )
+        real_content = ""
+
+        if image is not None:
+            image_token = (
+                conv.image_token + "\n"
+                if conv.name != "gme-qwen2-vl"
+                else conv.image_token
+            )
+            real_content += image_token
+        if text is not None:
+            real_content += text
+        conv.append_message(conv.roles[0], real_content)
+        # Add a blank message for the assistant.
+        conv.append_message(conv.roles[1], None)
+        convs.append(conv)
+
+    return convs
+
+
 def generate_chat_conv(
     request: ChatCompletionRequest, template_name: str
 ) -> Conversation:
@@ -403,7 +453,6 @@ def generate_chat_conv(
             conv.system_message = getattr(message.content[0], "text", "")
         elif msg_role == "user":
             # Handle the various types of Chat Request content types here.
-            role = conv.roles[0]
             if isinstance(message.content, str):
                 conv.append_message(conv.roles[0], message.content)
             else:
@@ -555,6 +604,20 @@ register_conv_template(
     )
 )
 
+# Reference: https://huggingface.co/Alibaba-NLP/gme-Qwen2-VL-2B-Instruct#usage
+register_conv_template(
+    Conversation(
+        name="gme-qwen2-vl",
+        system_message="You are a helpful assistant.",
+        system_template="<|im_start|>system\n{system_message}",
+        roles=("<|im_start|>user", "<|im_start|>assistant"),
+        sep="<|im_end|>\n",
+        sep_style=SeparatorStyle.QWEN2_VL_EMBED,
+        stop_str="<|endoftext|>",
+        image_token="<|vision_start|><|image_pad|><|vision_end|>",
+    )
+)
+
 # Reference: https://huggingface.co/openbmb/MiniCPM-V-2_6#usage
 register_conv_template(
     Conversation(
@@ -568,3 +631,18 @@ register_conv_template(
         image_token="(<image>./</image>)",
     )
 )
+
+# Reference: https://github.com/deepseek-ai/Janus?tab=readme-ov-file#janus-pro
+register_conv_template(
+    Conversation(
+        name="janus-pro",
+        system_message="You are a helpful language and vision assistant. You are able to understand the visual content that the user provides, and assist the user with a variety of tasks using natural language",
+        system_template="{system_message}.",
+        roles=("User", "Assistant"),
+        sep="\n\n",
+        sep2="<|end▁of▁sentence|>",
+        sep_style=SeparatorStyle.ADD_COLON_TWO,
+        stop_str=["<|User|>", "<|end▁of▁sentence|>"],
+        image_token="<image_placeholder>",
+    )
+)
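A minimal usage sketch (not part of the diff) of the generate_embedding_convs helper and the gme-qwen2-vl template added above; the image filename is a placeholder, and a text-only item passes None for its image:

from sglang.srt.conversation import generate_embedding_convs

convs = generate_embedding_convs(
    texts=["a photo of a cat", "find similar images"],
    images=["cat.jpg", None],  # placeholder path; None means a text-only item
    template_name="gme-qwen2-vl",
)
for conv in convs:
    # With the new QWEN2_VL_EMBED separator style, the rendered prompt
    # ends with the template's stop_str ("<|endoftext|>").
    print(conv.get_prompt())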
sglang/srt/custom_op.py
CHANGED
@@ -1,8 +1,12 @@
+from typing import Optional
+
 import torch
 from torch import nn
 
-
-
+from sglang.srt.utils import is_cuda, is_hip
+
+_is_cuda = is_cuda()
+_is_hip = is_hip()
 
 
 class CustomOp(nn.Module):
@@ -34,7 +38,64 @@ class CustomOp(nn.Module):
     def dispatch_forward(self):
         if _is_cuda:
             return self.forward_cuda
-        elif
+        elif _is_hip:
             return self.forward_hip
         else:
             return self.forward_native
+
+
+if _is_cuda:
+    from sgl_kernel import sgl_per_tensor_quant_fp8, sgl_per_token_quant_fp8
+
+    def scaled_fp8_quant(
+        input: torch.Tensor,
+        scale: Optional[torch.Tensor] = None,
+        use_per_token_if_dynamic: bool = False,
+    ) -> tuple[torch.Tensor, torch.Tensor]:
+        """
+        Quantize input tensor to FP8 (8-bit floating point) format.
+
+        Args:
+            input (torch.Tensor): Input tensor to be quantized
+            scale (Optional[torch.Tensor]): Pre-computed scaling factor for static quantization.
+                If None, scales will be computed dynamically.
+            use_per_token_if_dynamic (bool): When using dynamic scaling (scale=None),
+                determines the quantization granularity:
+                - True: compute scale per token
+                - False: compute single scale per tensor
+
+        Returns:
+            Tuple[torch.Tensor, torch.Tensor]: A tuple containing:
+                - quantized_tensor: The FP8 quantized version of input
+                - scale_tensor: The scaling factors used for quantization
+
+        Raises:
+            AssertionError: If input is not 2D or if static scale's numel != 1
+        """
+        assert input.ndim == 2, f"Expected 2D input tensor, got {input.ndim}D"
+        shape = input.shape
+        out_dtype = torch.float8_e4m3fnuz if _is_hip else torch.float8_e4m3fn
+        output = torch.empty(shape, device=input.device, dtype=out_dtype)
+
+        if scale is None:
+            # Dynamic scaling
+            if use_per_token_if_dynamic:
+                scale = torch.empty(
+                    (shape[0], 1), device=input.device, dtype=torch.float32
+                )
+                sgl_per_token_quant_fp8(input, output, scale)
+            else:
+                scale = torch.zeros(1, device=input.device, dtype=torch.float32)
+                sgl_per_tensor_quant_fp8(
+                    input, output, scale, is_static=False
+                )  # False for dynamic
+        else:
+            # Static scaling
+            assert (
+                scale.numel() == 1
+            ), f"Expected scalar scale, got numel={scale.numel()}"
+            sgl_per_tensor_quant_fp8(
+                input, output, scale, is_static=True
+            )  # True for static
+
+        return output, scale
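A minimal usage sketch (not part of the diff, assuming a CUDA build with the sgl_kernel package installed) of the scaled_fp8_quant helper added above, using dynamic per-token scaling:

import torch

from sglang.srt.custom_op import scaled_fp8_quant

x = torch.randn(4, 128, device="cuda", dtype=torch.float16)
x_fp8, scales = scaled_fp8_quant(x, scale=None, use_per_token_if_dynamic=True)
print(x_fp8.dtype)   # torch.float8_e4m3fn on NVIDIA GPUs
print(scales.shape)  # torch.Size([4, 1]): one scale per row (token)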
sglang/srt/distributed/device_communicators/custom_all_reduce.py
CHANGED
@@ -22,15 +22,16 @@ from sglang.srt.utils import cuda_device_count_stateless, is_cuda, is_hip
 
 logger = logging.getLogger(__name__)
 
-
+_is_cuda = is_cuda()
+_is_hip = is_hip()
 
-if
+if _is_cuda:
     try:
         import pynvml
     except ImportError as e:
         logger.warning("Failed to import pynvml with %r", e)
 
-if
+if _is_hip:
     try:
         from amdsmi import (
             AmdSmiException,
@@ -43,7 +44,7 @@ if is_hip_:
         logger.warning("Failed to import amdsmi with %r", e)
 
 try:
-    if ops.use_vllm_custom_allreduce and not
+    if ops.use_vllm_custom_allreduce and not _is_hip:
         # Use vLLM custom allreduce
         ops.meta_size()
     else:
@@ -63,7 +64,7 @@ _R = TypeVar("_R")
 def with_nvml_context(fn: Callable[_P, _R]) -> Callable[_P, _R]:
     @wraps(fn)
     def wrapper(*args: _P.args, **kwargs: _P.kwargs) -> _R:
-        if
+        if _is_hip:
             try:
                 amdsmi_init()
                 return fn(*args, **kwargs)
@@ -81,7 +82,7 @@ def with_nvml_context(fn: Callable[_P, _R]) -> Callable[_P, _R]:
 
 @with_nvml_context
 def is_full_nvlink(physical_device_ids: List[int], world_size: int) -> bool:
-    if
+    if _is_hip:
         """
         query if the set of gpus are fully connected by xgmi (1 hop)
         """
@@ -145,7 +146,7 @@ def is_weak_contiguous(inp: torch.Tensor):
 class CustomAllreduce:
     _SUPPORTED_WORLD_SIZES = [2, 4, 6, 8]
     _MAX_CAR_SIZE = 8192 * 1024
-    if
+    if _is_hip:
         # crossover is at 16MB buffer size for ROCm
         _MAX_CAR_SIZE = 2 * 8192 * 1024
 
@@ -229,7 +230,7 @@ class CustomAllreduce:
         # test nvlink first, this will filter out most of the cases
         # where custom allreduce is not supported
        # this checks hardware and driver support for NVLink
-        if
+        if _is_cuda or _is_hip:
            full_nvlink = is_full_nvlink(physical_device_ids, world_size)
 
        if world_size > 2 and not full_nvlink:
@@ -243,7 +244,7 @@ class CustomAllreduce:
        # this is expensive to compute at the first time
        # then we cache the result
        # On AMD GPU, p2p is always enabled between XGMI connected GPUs
-        if not
+        if not _is_hip and not _can_p2p(rank, world_size):
            logger.warning(
                "Custom allreduce is disabled because your platform lacks "
                "GPU P2P capability or P2P test failed. To silence this "
@@ -256,7 +257,7 @@ class CustomAllreduce:
        self.world_size = world_size
        self.full_nvlink = full_nvlink
 
-        if ops.use_vllm_custom_allreduce and not
+        if ops.use_vllm_custom_allreduce and not _is_hip:
            # Buffers memory are owned by this Python class and passed to C++.
            # Meta data composes of two parts: meta data for synchronization and a
            # temporary buffer for storing intermediate allreduce results.
@@ -279,7 +280,7 @@ class CustomAllreduce:
            )
            ops.register_buffer(self._ptr, self.buffer_ptrs)
        else:
-            if
+            if _is_hip:
                # meta data buffers need to be "uncached" for signal on MI200
                self.meta = ops.allocate_meta_buffer(ops.meta_size() + max_size)
                self.buffer = torch.empty(
@@ -418,7 +419,7 @@ class CustomAllreduce:
            ops.register_buffer(self._ptr, inp, handles, offsets)
 
    def register_graph_buffers(self):
-        if
+        if _is_hip:
            handle, offset = ops.get_graph_buffer_ipc_meta(self._ptr)
            handles, offsets = self._gather_ipc_meta((bytes(handle), offset))
            logger.info("Registering %d cuda graph addresses", len(offset))
@@ -454,12 +455,12 @@ class CustomAllreduce:
            return False
        # for 4 or more non NVLink-capable GPUs, custom allreduce provides
        # little performance improvement over NCCL.
-        if ops.use_vllm_custom_allreduce and not
+        if ops.use_vllm_custom_allreduce and not _is_hip:
            if self.world_size == 2 or self.full_nvlink:
                return inp_size < self.max_size
            return False
 
-        if
+        if _is_hip:
            if self.full_nvlink:
                if self.world_size == 8:
                    if self.MSCCL:
@@ -532,7 +533,7 @@ class CustomAllreduce:
            return None
        if self._IS_CAPTURING:
            if torch.cuda.is_current_stream_capturing():
-                if
+                if _is_hip:
                    return self.all_reduce_reg(input)
                else:
                    return self.all_reduce(input, registered=True)
@@ -541,7 +542,7 @@ class CustomAllreduce:
                # allreduce is out-of-place.
                return torch.empty_like(input)
        else:
-            if
+            if _is_hip:
                # note: outside of cuda graph context,
                # custom allreduce incurs a cost of cudaMemcpy, which should
                # be small(<=1% of overall latency) compared to the performance
@@ -556,7 +557,7 @@ class CustomAllreduce:
        if ops.use_vllm_custom_allreduce:
            self.free_shared_buffer(self.meta_ptrs)
            self.free_shared_buffer(self.buffer_ptrs)
-        elif
+        elif _is_cuda:
            self.free_shared_buffer(self.buffer_ptrs)
            self.free_shared_buffer(self.tmp_result_buffer_ptrs)
            self.free_shared_buffer(self.barrier_in_ptrs)
sglang/srt/distributed/parallel_state.py
CHANGED
@@ -1228,7 +1228,16 @@ def cleanup_dist_env_and_memory(shutdown_ray: bool = False):
         ray.shutdown()
     gc.collect()
     if not current_platform.is_cpu():
-        torch.cuda.
+        if hasattr(torch, "cuda") and torch.cuda.is_available():
+            torch.cuda.empty_cache()
+            if hasattr(torch._C, "_host_emptyCache"):
+                torch._C._host_emptyCache()
+            else:
+                logger.warning(
+                    "torch._C._host_emptyCache() only available in Pytorch >=2.5"
+                )
+        elif hasattr(torch, "xpu") and torch.xpu.is_available():
+            torch.xpu.empty_cache()
 
 
 def in_the_same_node_as(pg: ProcessGroup, source_rank: int = 0) -> List[bool]:
sglang/srt/entrypoints/engine.py
CHANGED
@@ -106,6 +106,8 @@ class Engine:
         tokenizer_manager, scheduler_info = _launch_subprocesses(
             server_args=server_args
         )
+
+        self.server_args = server_args
         self.tokenizer_manager = tokenizer_manager
         self.scheduler_info = scheduler_info
 
@@ -214,13 +216,13 @@ class Engine:
     def encode(
         self,
         prompt: Union[str, List[str], List[Dict], List[List[Dict]]],
+        image_data: Optional[Union[List[str], str]] = None,
     ) -> Dict:
         """
         The arguments of this function is the same as `sglang/srt/managers/io_struct.py::EmbeddingReqInput`.
         Please refer to `EmbeddingReqInput` for the documentation.
         """
-
-        obj = EmbeddingReqInput(text=prompt)
+        obj = EmbeddingReqInput(text=prompt, image_data=image_data)
         loop = asyncio.get_event_loop()
         generator = self.tokenizer_manager.generate_request(obj, None)
         ret = loop.run_until_complete(generator.__anext__())
@@ -374,7 +376,7 @@ def _set_envs_and_config(server_args: ServerArgs):
     if server_args.attention_backend == "flashinfer":
         assert_pkg_version(
             "flashinfer_python",
-            "0.2.
+            "0.2.3",
             "Please uninstall the old version and "
             "reinstall the latest version by following the instructions "
             "at https://docs.flashinfer.ai/installation.html.",
sglang/srt/entrypoints/http_server.py
CHANGED
@@ -614,7 +614,7 @@ def launch_server(
 
     Note:
     1. The HTTP server, Engine, and TokenizerManager both run in the main process.
-    2. Inter-process communication is done through
+    2. Inter-process communication is done through IPC (each process uses a different port) via the ZMQ library.
     """
     tokenizer_manager, scheduler_info = _launch_subprocesses(server_args=server_args)
     set_global_state(
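A minimal usage sketch (not part of the diff) of the new image_data argument on Engine.encode shown in the engine.py hunks above; the model path and image file are placeholders, and the engine must be launched in embedding mode:

import sglang as sgl

llm = sgl.Engine(
    model_path="Alibaba-NLP/gme-Qwen2-VL-2B-Instruct",  # placeholder embedding model
    is_embedding=True,
)
out = llm.encode(
    prompt="a photo of a cat",
    image_data="cat.jpg",  # placeholder; a list of paths/URLs is also accepted
)
print(len(out["embedding"]))  # embedding vector for the text+image pair
llm.shutdown()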
sglang/srt/function_call_parser.py
CHANGED
@@ -318,6 +318,10 @@ class Qwen25Detector(BaseFormatDetector):
         self.bot_token = "<tool_call>"
         self.eot_token = "</tool_call>"
 
+    def has_tool_call(self, text: str) -> bool:
+        """Check if the text contains a Qwen 2.5 format tool call."""
+        return self.bot_token in text
+
     def detect_and_parse(self, text: str, tools: List[Function]) -> List[ToolCallItem]:
         """
         One-time parsing: Detects and parses tool calls in the provided text.
@@ -352,6 +356,10 @@ class MistralDetector(BaseFormatDetector):
         self.bot_token = "[TOOL_CALLS] ["
         self.tool_call_regex = re.compile(r"\[{.*}\]", re.DOTALL)
 
+    def has_tool_call(self, text: str) -> bool:
+        """Check if the text contains a Mistral format tool call."""
+        return self.bot_token in text
+
     def _clean_text(self, text: str) -> str:
         """
         clean text to only leave ''[TOOL_CALLS] [{"name": xxx, "arguments": {xxx}}]'
@@ -397,12 +405,21 @@ class Llama32Detector(BaseFormatDetector):
         super().__init__()
         self.bot_token = "<|python_tag|>"
 
+    def has_tool_call(self, text: str) -> bool:
+        """Check if the text contains a Llama 3.2 format tool call."""
+        # depending on the prompt format the Llama model may or may not
+        # prefix the output with the <|python_tag|> token
+        return "<|python_tag|>" in text or text.startswith("{")
+
     def detect_and_parse(self, text: str, tools: List[Function]) -> List[ToolCallItem]:
         """Parse function calls from text, handling multiple JSON objects."""
-        if "<|python_tag|>" not in text:
+        if "<|python_tag|>" not in text and not text.startswith("{"):
             return []
 
-
+        if "<|python_tag|>" in text:
+            _, action_text = text.split("<|python_tag|>")
+        else:
+            action_text = text
 
         # Split by semicolon and process each part
         json_parts = [part.strip() for part in action_text.split(";") if part.strip()]
@@ -501,6 +518,20 @@ class FunctionCallParser:
         self.multi_format_parser = MultiFormatParser(detectors)
         self.tools = tools
 
+    def has_tool_call(self, text: str) -> bool:
+        """
+        Check if the given text contains a tool call in the format supported by this parser.
+        This delegates to the detector's implementation.
+
+        :param text: The text to check for tool calls
+        :return: True if the text contains a tool call, False otherwise
+        """
+        # Check all detectors in the multi_format_parser
+        for detector in self.multi_format_parser.detectors:
+            if detector.has_tool_call(text):
+                return True
+        return False
+
     def parse_non_stream(self, full_text: str):
         """
         Non-streaming call: one-time parsing
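A minimal usage sketch (not part of the diff) of the new has_tool_call entry point on FunctionCallParser; it assumes "qwen25" is the registered parser key for Qwen25Detector, and the empty tools list is enough here because only the detectors' token checks run:

from sglang.srt.function_call_parser import FunctionCallParser

parser = FunctionCallParser(tools=[], tool_call_parser="qwen25")
print(parser.has_tool_call('<tool_call>{"name": "get_weather"}</tool_call>'))  # True
print(parser.has_tool_call("It is sunny in Paris today."))  # False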
sglang/srt/hf_transformers_utils.py
CHANGED
@@ -30,13 +30,20 @@ from transformers import (
 )
 from transformers.models.auto.modeling_auto import MODEL_FOR_CAUSAL_LM_MAPPING_NAMES
 
-from sglang.srt.configs import
+from sglang.srt.configs import (
+    ChatGLMConfig,
+    DbrxConfig,
+    ExaoneConfig,
+    MultiModalityConfig,
+    Qwen2_5_VLConfig,
+)
 
 _CONFIG_REGISTRY: Dict[str, Type[PretrainedConfig]] = {
     ChatGLMConfig.model_type: ChatGLMConfig,
     DbrxConfig.model_type: DbrxConfig,
     ExaoneConfig.model_type: ExaoneConfig,
     Qwen2_5_VLConfig.model_type: Qwen2_5_VLConfig,
+    MultiModalityConfig.model_type: MultiModalityConfig,
 }
 
 for name, cls in _CONFIG_REGISTRY.items():
@@ -66,6 +73,14 @@ def get_config(
     config = AutoConfig.from_pretrained(
         model, trust_remote_code=trust_remote_code, revision=revision, **kwargs
     )
+
+    # FIXME: Pour contents of janus-pro's langauge_config to first-level
+    if isinstance(model, str) and model.lower().startswith("deepseek-ai/janus-pro"):
+        assert hasattr(config, "language_config")
+        for key, val in config.language_config.__dict__.items():
+            setattr(config, key, val)
+        setattr(config, "architectures", ["MultiModalityCausalLM"])
+
     if config.model_type in _CONFIG_REGISTRY:
         config_class = _CONFIG_REGISTRY[config.model_type]
         config = config_class.from_pretrained(model, revision=revision)
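A minimal sketch (not part of the diff) of what the Janus-Pro special case in get_config above does: fields from the checkpoint's language_config are promoted to the top level and architectures is overridden, so downstream loading sees a flat config. The model id is the public Janus-Pro checkpoint this branch targets:

from sglang.srt.hf_transformers_utils import get_config

config = get_config("deepseek-ai/Janus-Pro-7B", trust_remote_code=True)
print(config.architectures)  # ["MultiModalityCausalLM"], set by the override above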
@@ -22,7 +22,7 @@ from sglang.srt.layers.attention.utils import create_flashinfer_kv_indices_triton
 from sglang.srt.layers.dp_attention import get_attention_tp_size
 from sglang.srt.model_executor.forward_batch_info import ForwardBatch, ForwardMode
 from sglang.srt.speculative.eagle_utils import EagleDraftInput, EagleVerifyInput
-from sglang.srt.utils import is_flashinfer_available
+from sglang.srt.utils import get_bool_env_var, is_flashinfer_available
 
 if TYPE_CHECKING:
     from sglang.srt.layers.radix_attention import RadixAttention