PyPI - sglang - Versions diffs - 0.4.6.post4__py3-none-any.whl → 0.4.6.post5__py3-none-any.whl - Mend

sglang 0.4.6.post4py3-none-any.whl → 0.4.6.post5py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Files changed (130) hide show

sglang/bench_offline_throughput.py +6 -6
sglang/bench_one_batch.py +5 -4
sglang/bench_one_batch_server.py +23 -15
sglang/bench_serving.py +133 -57
sglang/compile_deep_gemm.py +4 -4
sglang/srt/configs/model_config.py +39 -28
sglang/srt/conversation.py +1 -1
sglang/srt/disaggregation/decode.py +122 -133
sglang/srt/disaggregation/decode_schedule_batch_mixin.py +142 -0
sglang/srt/disaggregation/fake/conn.py +3 -13
sglang/srt/disaggregation/kv_events.py +357 -0
sglang/srt/disaggregation/mini_lb.py +57 -24
sglang/srt/disaggregation/mooncake/conn.py +11 -2
sglang/srt/disaggregation/mooncake/transfer_engine.py +2 -1
sglang/srt/disaggregation/nixl/conn.py +9 -19
sglang/srt/disaggregation/prefill.py +126 -44
sglang/srt/disaggregation/utils.py +116 -5
sglang/srt/distributed/utils.py +3 -3
sglang/srt/entrypoints/EngineBase.py +5 -0
sglang/srt/entrypoints/engine.py +28 -8
sglang/srt/entrypoints/http_server.py +6 -4
sglang/srt/entrypoints/http_server_engine.py +5 -2
sglang/srt/function_call/base_format_detector.py +250 -0
sglang/srt/function_call/core_types.py +34 -0
sglang/srt/function_call/deepseekv3_detector.py +157 -0
sglang/srt/function_call/ebnf_composer.py +234 -0
sglang/srt/function_call/function_call_parser.py +175 -0
sglang/srt/function_call/llama32_detector.py +74 -0
sglang/srt/function_call/mistral_detector.py +84 -0
sglang/srt/function_call/pythonic_detector.py +163 -0
sglang/srt/function_call/qwen25_detector.py +67 -0
sglang/srt/function_call/utils.py +35 -0
sglang/srt/hf_transformers_utils.py +46 -7
sglang/srt/layers/attention/aiter_backend.py +513 -0
sglang/srt/layers/attention/flashattention_backend.py +63 -17
sglang/srt/layers/attention/flashinfer_mla_backend.py +8 -4
sglang/srt/layers/attention/flashmla_backend.py +340 -78
sglang/srt/layers/attention/triton_backend.py +3 -0
sglang/srt/layers/attention/utils.py +2 -2
sglang/srt/layers/attention/vision.py +1 -1
sglang/srt/layers/communicator.py +451 -0
sglang/srt/layers/dp_attention.py +0 -10
sglang/srt/layers/moe/cutlass_moe.py +207 -0
sglang/srt/layers/moe/ep_moe/kernels.py +33 -11
sglang/srt/layers/moe/ep_moe/layer.py +104 -50
sglang/srt/layers/moe/ep_moe/token_dispatcher.py +82 -7
sglang/srt/layers/moe/fused_moe_triton/layer.py +14 -0
sglang/srt/layers/moe/topk.py +66 -9
sglang/srt/layers/multimodal.py +70 -0
sglang/srt/layers/quantization/__init__.py +7 -2
sglang/srt/layers/quantization/deep_gemm.py +5 -3
sglang/srt/layers/quantization/fp8.py +90 -0
sglang/srt/layers/quantization/fp8_utils.py +6 -0
sglang/srt/layers/quantization/gptq.py +298 -6
sglang/srt/layers/quantization/int8_kernel.py +18 -5
sglang/srt/layers/quantization/qoq.py +244 -0
sglang/srt/lora/lora_manager.py +1 -3
sglang/srt/managers/deepseek_eplb.py +278 -0
sglang/srt/managers/eplb_manager.py +55 -0
sglang/srt/managers/expert_distribution.py +704 -56
sglang/srt/managers/expert_location.py +394 -0
sglang/srt/managers/expert_location_dispatch.py +91 -0
sglang/srt/managers/io_struct.py +16 -3
sglang/srt/managers/mm_utils.py +293 -139
sglang/srt/managers/multimodal_processors/base_processor.py +127 -42
sglang/srt/managers/multimodal_processors/deepseek_vl_v2.py +6 -1
sglang/srt/managers/multimodal_processors/gemma3.py +31 -6
sglang/srt/managers/multimodal_processors/internvl.py +14 -5
sglang/srt/managers/multimodal_processors/janus_pro.py +7 -1
sglang/srt/managers/multimodal_processors/kimi_vl.py +7 -6
sglang/srt/managers/multimodal_processors/llava.py +3 -3
sglang/srt/managers/multimodal_processors/minicpm.py +25 -31
sglang/srt/managers/multimodal_processors/mllama4.py +6 -0
sglang/srt/managers/multimodal_processors/pixtral.py +9 -9
sglang/srt/managers/multimodal_processors/qwen_vl.py +58 -16
sglang/srt/managers/schedule_batch.py +49 -21
sglang/srt/managers/schedule_policy.py +4 -5
sglang/srt/managers/scheduler.py +92 -50
sglang/srt/managers/session_controller.py +1 -1
sglang/srt/managers/tokenizer_manager.py +99 -24
sglang/srt/mem_cache/base_prefix_cache.py +3 -0
sglang/srt/mem_cache/chunk_cache.py +3 -1
sglang/srt/mem_cache/hiradix_cache.py +4 -4
sglang/srt/mem_cache/memory_pool.py +74 -52
sglang/srt/mem_cache/multimodal_cache.py +45 -0
sglang/srt/mem_cache/radix_cache.py +58 -5
sglang/srt/metrics/collector.py +2 -2
sglang/srt/mm_utils.py +10 -0
sglang/srt/model_executor/cuda_graph_runner.py +20 -9
sglang/srt/model_executor/expert_location_updater.py +422 -0
sglang/srt/model_executor/forward_batch_info.py +4 -0
sglang/srt/model_executor/model_runner.py +144 -54
sglang/srt/model_loader/loader.py +10 -6
sglang/srt/models/clip.py +5 -1
sglang/srt/models/deepseek_v2.py +297 -343
sglang/srt/models/exaone.py +8 -3
sglang/srt/models/gemma3_mm.py +70 -33
sglang/srt/models/llama4.py +10 -2
sglang/srt/models/llava.py +26 -18
sglang/srt/models/mimo_mtp.py +220 -0
sglang/srt/models/minicpmo.py +5 -12
sglang/srt/models/mistral.py +71 -1
sglang/srt/models/mllama.py +3 -3
sglang/srt/models/qwen2.py +95 -26
sglang/srt/models/qwen2_5_vl.py +8 -0
sglang/srt/models/qwen2_moe.py +330 -60
sglang/srt/models/qwen2_vl.py +6 -0
sglang/srt/models/qwen3.py +52 -10
sglang/srt/models/qwen3_moe.py +411 -48
sglang/srt/models/siglip.py +294 -0
sglang/srt/openai_api/adapter.py +28 -16
sglang/srt/openai_api/protocol.py +6 -0
sglang/srt/operations.py +154 -0
sglang/srt/operations_strategy.py +31 -0
sglang/srt/server_args.py +134 -24
sglang/srt/speculative/eagle_utils.py +131 -0
sglang/srt/speculative/eagle_worker.py +47 -2
sglang/srt/utils.py +68 -12
sglang/test/test_cutlass_moe.py +278 -0
sglang/test/test_utils.py +2 -36
sglang/utils.py +2 -2
sglang/version.py +1 -1
{sglang-0.4.6.post4.dist-info → sglang-0.4.6.post5.dist-info}/METADATA +20 -11
{sglang-0.4.6.post4.dist-info → sglang-0.4.6.post5.dist-info}/RECORD +128 -102
{sglang-0.4.6.post4.dist-info → sglang-0.4.6.post5.dist-info}/WHEEL +1 -1
sglang/srt/function_call_parser.py +0 -858
sglang/srt/platforms/interface.py +0 -371
/sglang/srt/models/{xiaomi_mimo.py → mimo.py} +0 -0
{sglang-0.4.6.post4.dist-info → sglang-0.4.6.post5.dist-info}/licenses/LICENSE +0 -0
{sglang-0.4.6.post4.dist-info → sglang-0.4.6.post5.dist-info}/top_level.txt +0 -0

sglang/srt/function_call/pythonic_detector.py ADDED Viewed

@@ -0,0 +1,163 @@
+import ast
+import json
+import logging
+import re
+from typing import List, Optional
+from sglang.srt.function_call.base_format_detector import BaseFormatDetector
+from sglang.srt.function_call.core_types import (
+    StreamingParseResult,
+    StructureInfo,
+    ToolCallItem,
+    _GetInfoFunc,
+)
+from sglang.srt.function_call.ebnf_composer import EBNFComposer
+from sglang.srt.openai_api.protocol import Tool
+logger = logging.getLogger(__name__)
+class PythonicDetector(BaseFormatDetector):
+    """
+    Detector for Llama-3.2 and Llama-4 models with pythonic tool call format.
+    Assumes function call format:
+      [tool1(arg1=val1, arg2=val2), tool2(arg1=val3)]
+    Arguments are Python literals (not JSON).
+    """
+    def __init__(self):
+        super().__init__()
+        self.tool_call_regex = re.compile(
+            r"\[([a-zA-Z]+\w*\(([a-zA-Z]+\w*=.*,\s*)*([a-zA-Z]+\w*=.*\s)?\),\s*)*([a-zA-Z]+\w*\(([a-zA-Z]+\w*=.*,\s*)*([a-zA-Z]+\w*=.*\s*)?\)\s*)+\]",
+            re.DOTALL,
+        )
+    def has_tool_call(self, text: str) -> bool:
+        return bool(self.tool_call_regex.match(text.strip()))
+    def detect_and_parse(self, text: str, tools: List[Tool]) -> StreamingParseResult:
+        # Try parsing the text as a Python list of function calls
+        text = text.strip()
+        if not (text.startswith("[") and text.endswith("]")):
+            # Not a pythonic tool call format
+            return StreamingParseResult(normal_text=text, calls=[])
+        try:
+            module = ast.parse(text)
+            parsed = getattr(module.body[0], "value", None)
+            if not (
+                isinstance(parsed, ast.List)
+                and all(isinstance(e, ast.Call) for e in parsed.elts)
+            ):
+                return StreamingParseResult(normal_text=text, calls=[])
+            calls = []
+            tool_indices = {
+                tool.function.name: i
+                for i, tool in enumerate(tools)
+                if tool.function.name
+            }
+            for call in parsed.elts:
+                if not isinstance(call.func, ast.Name):
+                    continue
+                function_name = call.func.id
+                arguments = {}
+                for keyword in call.keywords:
+                    arguments[keyword.arg] = self._get_parameter_value(keyword.value)
+                calls.append(
+                    ToolCallItem(
+                        tool_index=tool_indices.get(function_name, -1),
+                        name=function_name,
+                        parameters=json.dumps(arguments, ensure_ascii=False),
+                    )
+                )
+            return StreamingParseResult(normal_text="", calls=calls)
+        except Exception:
+            logger.exception("Error in pythonic tool call parsing.")
+            return StreamingParseResult(normal_text=text, calls=[])
+    def _find_matching_bracket(self, buffer: str, start: int) -> int:
+        """
+        Find the matching closing bracket for the opening bracket at start position.
+        Properly handles nested brackets.
+        Args:
+            buffer: The text buffer to search in
+            start: Position of the opening bracket '['
+        Returns:
+            Position of the matching closing bracket ']', or -1 if not found
+        """
+        bracket_count = 0
+        for i in range(start, len(buffer)):
+            if buffer[i] == "[":
+                bracket_count += 1
+            elif buffer[i] == "]":
+                bracket_count -= 1
+                if bracket_count == 0:
+                    return i
+        return -1  # No matching bracket found
+    def parse_streaming_increment(
+        self, new_text: str, tools: List[Tool]
+    ) -> StreamingParseResult:
+        """
+        Streaming incremental parsing for pythonic tool calls.
+        Buffers input until a complete pythonic tool call (from [ to ]) is found,
+        then parses and emits any detected calls.
+        """
+        self._buffer += new_text
+        start = self._buffer.find("[")
+        if start == -1:
+            normal_text = self._buffer
+            self._buffer = ""
+            return StreamingParseResult(normal_text=normal_text)
+        normal_text = self._buffer[:start] if start > 0 else ""
+        end = self._find_matching_bracket(self._buffer, start)
+        if end != -1:
+            call_text = self._buffer[start : end + 1]
+            result = self.detect_and_parse(call_text, tools)
+            self._buffer = self._buffer[end + 1 :]
+            # If we had normal text before the tool call, add it to the result
+            if normal_text:
+                result.normal_text = normal_text + (result.normal_text or "")
+            return result
+        # We have an opening bracket but no closing bracket yet
+        if normal_text:
+            self._buffer = self._buffer[start:]
+            return StreamingParseResult(normal_text=normal_text)
+        # Otherwise, we're still accumulating a potential tool call
+        return StreamingParseResult(normal_text="")
+    def _get_parameter_value(self, val):
+        if isinstance(val, ast.Constant):
+            return val.value
+        elif isinstance(val, ast.Dict):
+            return {
+                k.value: self._get_parameter_value(v)
+                for k, v in zip(val.keys, val.values)
+            }
+        elif isinstance(val, ast.List):
+            return [self._get_parameter_value(v) for v in val.elts]
+        else:
+            raise ValueError("Tool call arguments must be literals")
+    def structure_info(self) -> _GetInfoFunc:
+        def info(name: str):
+            return StructureInfo(begin=f"[{name}(", end=")]", trigger=f"[{name}(")
+        return info
+    def build_ebnf(self, tools: List[Tool]) -> Optional[str]:
+        return EBNFComposer.build_ebnf(
+            tools,
+            bot_token="[",
+            eot_token="]",
+            tool_call_separator=",",
+            function_format="pythonic",
+        )

sglang/srt/function_call/qwen25_detector.py ADDED Viewed

@@ -0,0 +1,67 @@
+import json
+import re
+from typing import List
+from sglang.srt.function_call.base_format_detector import BaseFormatDetector
+from sglang.srt.function_call.core_types import (
+    StreamingParseResult,
+    StructureInfo,
+    _GetInfoFunc,
+)
+from sglang.srt.function_call.ebnf_composer import EBNFComposer
+from sglang.srt.openai_api.protocol import Tool
+class Qwen25Detector(BaseFormatDetector):
+    """
+    Detector for Qwen 2.5 models.
+    Assumes function call format:
+      <tool_call>{"name":"xxx", "arguments":{...}}</tool_call>
+    """
+    def __init__(self):
+        """
+        Initializes the detector with necessary state variables.
+        """
+        super().__init__()
+        self.bot_token = "<tool_call>"
+        self.eot_token = "</tool_call>"
+    def has_tool_call(self, text: str) -> bool:
+        """Check if the text contains a Qwen 2.5 format tool call."""
+        return self.bot_token in text
+    def detect_and_parse(self, text: str, tools: List[Tool]) -> StreamingParseResult:
+        """
+        One-time parsing: Detects and parses tool calls in the provided text.
+        :param text: The complete text to parse.
+        :param tools: List of available tools.
+        :return: ParseResult indicating success or failure, consumed text, leftover text, and parsed calls.
+        """
+        idx = text.find(self.bot_token)
+        normal_text = text[:idx].strip() if idx != -1 else text
+        if self.bot_token not in text:
+            return StreamingParseResult(normal_text=normal_text, calls=[])
+        pattern = rf"{self.bot_token}(.*?){self.eot_token}"
+        match_result_list = re.findall(pattern, text, re.DOTALL)
+        calls = []
+        for match_result in match_result_list:
+            match_result = json.loads(match_result)
+            calls.extend(self.parse_base_json(match_result, tools))
+        return StreamingParseResult(normal_text=normal_text, calls=calls)
+    def structure_info(self) -> _GetInfoFunc:
+        return lambda name: StructureInfo(
+            begin='<tool_call>{"name":"' + name + '", "arguments":',
+            end="}</tool_call>",
+            trigger="<tool_call>",
+        )
+    def build_ebnf(self, tools: List[Tool]):
+        return EBNFComposer.build_ebnf(
+            tools,
+            bot_token=self.bot_token,
+            eot_token=self.eot_token,
+            function_format="json",
+        )

sglang/srt/function_call/utils.py ADDED Viewed

@@ -0,0 +1,35 @@
+import json
+from json import JSONDecodeError, JSONDecoder
+from typing import Any, Tuple
+import partial_json_parser
+from partial_json_parser.core.options import Allow
+def _find_common_prefix(s1: str, s2: str) -> str:
+    prefix = ""
+    min_length = min(len(s1), len(s2))
+    for i in range(0, min_length):
+        if s1[i] == s2[i]:
+            prefix += s1[i]
+        else:
+            break
+    return prefix
+def _partial_json_loads(input_str: str, flags: Allow) -> Tuple[Any, int]:
+    try:
+        return (partial_json_parser.loads(input_str, flags), len(input_str))
+    except JSONDecodeError as e:
+        if "Extra data" in e.msg:
+            dec = JSONDecoder()
+            return dec.raw_decode(input_str)
+        raise
+def _is_complete_json(input_str: str) -> bool:
+    try:
+        json.loads(input_str)
+        return True
+    except JSONDecodeError:
+        return False

sglang/srt/hf_transformers_utils.py CHANGED Viewed

@@ -19,7 +19,7 @@ import warnings
 from pathlib import Path
 from typing import Dict, Optional, Type, Union
-import transformers
+import torch
 from huggingface_hub import snapshot_download
 from transformers import (
     AutoConfig,
@@ -66,6 +66,43 @@ def download_from_hf(model_path: str):
     return snapshot_download(model_path, allow_patterns=["*.json", "*.bin", "*.model"])
+def get_hf_text_config(config: PretrainedConfig):
+    """Get the "sub" config relevant to llm for multi modal models.
+    No op for pure text models.
+    """
+    if config.architectures is not None:
+        class_name = config.architectures[0]
+        if class_name.startswith("Llava") and class_name.endswith("ForCausalLM"):
+            # We support non-hf version of llava models, so we do not want to
+            # read the wrong values from the unused default text_config.
+            # NOTE(HandH1998): We set `torch_dtype` of config to `torch.float16` for the weights, as
+            # `torch.float16` is default used for image features in `python/sglang/srt/models/llava.py`.
+            setattr(config, "torch_dtype", torch.float16)
+            return config
+    if hasattr(config, "text_config"):
+        # The code operates under the assumption that text_config should have
+        # `num_attention_heads` (among others). Assert here to fail early
+        # if transformers config doesn't align with this assumption.
+        assert hasattr(config.text_config, "num_attention_heads")
+        return config.text_config
+    if hasattr(config, "language_config"):
+        return config.language_config
+    if hasattr(config, "thinker_config"):
+        # qwen2.5 omni
+        thinker_config = config.thinker_config
+        if hasattr(thinker_config, "text_config"):
+            setattr(
+                thinker_config.text_config,
+                "torch_dtype",
+                getattr(thinker_config, "torch_dtype", None),
+            )
+            return thinker_config.text_config
+        return thinker_config
+    else:
+        return config
 def get_config(
     model: str,
     trust_remote_code: bool,
@@ -81,13 +118,12 @@ def get_config(
     config = AutoConfig.from_pretrained(
         model, trust_remote_code=trust_remote_code, revision=revision, **kwargs
     )
+    text_config = get_hf_text_config(config=config)
-    # FIXME: Pour contents of janus-pro's langauge_config to first-level
-    if isinstance(model, str) and model.lower().startswith("deepseek-ai/janus-pro"):
-        assert hasattr(config, "language_config")
-        for key, val in config.language_config.__dict__.items():
-            setattr(config, key, val)
-        setattr(config, "architectures", ["MultiModalityCausalLM"])
+    if isinstance(model, str) and text_config is not None:
+        for key, val in text_config.__dict__.items():
+            if not hasattr(config, key) and getattr(text_config, key, None) is not None:
+                setattr(config, key, val)
     if config.model_type in _CONFIG_REGISTRY:
         config_class = _CONFIG_REGISTRY[config.model_type]
@@ -100,6 +136,9 @@ def get_config(
             if not hasattr(config, key):
                 setattr(config, key, val)
+    if config.model_type == "multi_modality":
+        config.update({"architectures": ["MultiModalityCausalLM"]})
     if model_override_args:
         config.update(model_override_args)

sglang 0.4.6.post4__py3-none-any.whl → 0.4.6.post5__py3-none-any.whl

sglang 0.4.6.post4py3-none-any.whl → 0.4.6.post5py3-none-any.whl