PyPI - sglang - Versions diffs - 0.4.9.post6__py3-none-any.whl → 0.4.10.post1__py3-none-any.whl - Mend

sglang 0.4.9.post6__py3-none-any.whl → 0.4.10.post1__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Files changed (83)

sglang/bench_offline_throughput.py +20 -0
sglang/bench_one_batch.py +3 -0
sglang/srt/configs/__init__.py +8 -0
sglang/srt/configs/model_config.py +4 -0
sglang/srt/configs/step3_vl.py +172 -0
sglang/srt/conversation.py +23 -0
sglang/srt/disaggregation/decode.py +2 -8
sglang/srt/disaggregation/launch_lb.py +5 -20
sglang/srt/disaggregation/mooncake/conn.py +33 -15
sglang/srt/disaggregation/prefill.py +2 -6
sglang/srt/distributed/parallel_state.py +86 -1
sglang/srt/entrypoints/engine.py +14 -18
sglang/srt/entrypoints/http_server.py +10 -2
sglang/srt/entrypoints/openai/serving_chat.py +2 -21
sglang/srt/eplb/expert_distribution.py +5 -0
sglang/srt/eplb/expert_location.py +17 -6
sglang/srt/eplb/expert_location_dispatch.py +1 -0
sglang/srt/eplb/expert_location_updater.py +2 -0
sglang/srt/function_call/function_call_parser.py +2 -0
sglang/srt/function_call/step3_detector.py +436 -0
sglang/srt/hf_transformers_utils.py +2 -0
sglang/srt/jinja_template_utils.py +4 -1
sglang/srt/layers/attention/trtllm_mla_backend.py +372 -0
sglang/srt/layers/attention/utils.py +6 -1
sglang/srt/layers/moe/cutlass_moe.py +2 -1
sglang/srt/layers/moe/ep_moe/layer.py +39 -674
sglang/srt/layers/moe/fused_moe_triton/fused_moe.py +26 -13
sglang/srt/layers/moe/fused_moe_triton/layer.py +152 -39
sglang/srt/layers/quantization/fp8.py +52 -18
sglang/srt/layers/quantization/unquant.py +0 -8
sglang/srt/layers/quantization/w4afp8.py +1 -0
sglang/srt/layers/quantization/w8a8_int8.py +4 -1
sglang/srt/managers/cache_controller.py +165 -67
sglang/srt/managers/data_parallel_controller.py +2 -0
sglang/srt/managers/io_struct.py +0 -2
sglang/srt/managers/scheduler.py +90 -671
sglang/srt/managers/scheduler_metrics_mixin.py +229 -0
sglang/srt/managers/scheduler_profiler_mixin.py +279 -0
sglang/srt/managers/scheduler_update_weights_mixin.py +142 -0
sglang/srt/managers/template_manager.py +62 -19
sglang/srt/managers/tokenizer_manager.py +123 -74
sglang/srt/managers/tp_worker.py +4 -0
sglang/srt/managers/tp_worker_overlap_thread.py +2 -1
sglang/srt/mem_cache/hicache_storage.py +60 -17
sglang/srt/mem_cache/hiradix_cache.py +36 -8
sglang/srt/mem_cache/memory_pool.py +15 -118
sglang/srt/mem_cache/memory_pool_host.py +418 -29
sglang/srt/mem_cache/mooncake_store/mooncake_store.py +264 -0
sglang/srt/mem_cache/mooncake_store/unit_test.py +40 -0
sglang/srt/mem_cache/nixl/hicache_nixl.py +163 -0
sglang/srt/mem_cache/nixl/nixl_utils.py +238 -0
sglang/srt/mem_cache/nixl/test_hicache_nixl_storage.py +216 -0
sglang/srt/mem_cache/storage/hf3fs/client_hf3fs.py +183 -0
sglang/srt/mem_cache/storage/hf3fs/storage_hf3fs.py +278 -0
sglang/srt/mem_cache/storage/hf3fs/test_hf3fs_utils.py +43 -0
sglang/srt/model_executor/cuda_graph_runner.py +25 -1
sglang/srt/model_executor/model_runner.py +13 -1
sglang/srt/model_loader/weight_utils.py +2 -0
sglang/srt/models/arcee.py +532 -0
sglang/srt/models/deepseek_v2.py +7 -6
sglang/srt/models/glm4_moe.py +6 -4
sglang/srt/models/granitemoe.py +3 -0
sglang/srt/models/grok.py +3 -0
sglang/srt/models/hunyuan.py +1 -0
sglang/srt/models/llama4.py +3 -0
sglang/srt/models/mixtral.py +3 -0
sglang/srt/models/olmoe.py +3 -0
sglang/srt/models/phimoe.py +1 -0
sglang/srt/models/step3_vl.py +991 -0
sglang/srt/multimodal/processors/base_processor.py +15 -16
sglang/srt/multimodal/processors/step3_vl.py +515 -0
sglang/srt/reasoning_parser.py +2 -1
sglang/srt/server_args.py +49 -18
sglang/srt/speculative/eagle_worker.py +2 -0
sglang/srt/utils.py +1 -0
sglang/test/attention/test_trtllm_mla_backend.py +945 -0
sglang/utils.py +0 -11
sglang/version.py +1 -1
{sglang-0.4.9.post6.dist-info → sglang-0.4.10.post1.dist-info}/METADATA +3 -4
{sglang-0.4.9.post6.dist-info → sglang-0.4.10.post1.dist-info}/RECORD +83 -65
{sglang-0.4.9.post6.dist-info → sglang-0.4.10.post1.dist-info}/WHEEL +0 -0
{sglang-0.4.9.post6.dist-info → sglang-0.4.10.post1.dist-info}/licenses/LICENSE +0 -0
{sglang-0.4.9.post6.dist-info → sglang-0.4.10.post1.dist-info}/top_level.txt +0 -0

sglang/srt/entrypoints/http_server.py CHANGED Viewed

@@ -238,6 +238,9 @@ async def health() -> Response:
 @app.get("/health_generate")
 async def health_generate(request: Request) -> Response:
     """Check the health of the inference server by generating one token."""
+    if _global_state.tokenizer_manager.gracefully_exit:
+        logger.info("Health check request received during shutdown. Returning 503.")
+        return Response(status_code=503)
     sampling_params = {"max_new_tokens": 1, "temperature": 0.0}
     rid = f"HEALTH_CHECK_{time.time()}"
@@ -260,9 +263,14 @@ async def health_generate(request: Request) -> Response:
         async for _ in _global_state.tokenizer_manager.generate_request(gri, request):
             break
-    tic = time.perf_counter()
+    # This request is a special request.
+    # If the server already has something running, this request will be ignored, so it creates zero overhead.
+    # If the server is not running, this request will be run, so we know whether the server is healthy.
     task = asyncio.create_task(gen())
-    while time.perf_counter() < tic + HEALTH_CHECK_TIMEOUT:
+    # As long as we receive any response from the detokenizer/scheduler, we consider the server is healthy.
+    tic = time.time()
+    while time.time() < tic + HEALTH_CHECK_TIMEOUT:
         await asyncio.sleep(1)
         if _global_state.tokenizer_manager.last_receive_tstamp > tic:
             task.cancel()

sglang/srt/entrypoints/openai/serving_chat.py CHANGED Viewed

@@ -127,12 +127,12 @@ class OpenAIServingChat(OpenAIServingBase):
             request.skip_special_tokens = False
             if not isinstance(request.tool_choice, str):
                 tools = [
-                    item.model_dump()
+                    item.function.model_dump()
                     for item in request.tools
                     if item.function.name == request.tool_choice.function.name
                 ]
             else:
-                tools = [item.model_dump() for item in request.tools]
+                tools = [item.function.model_dump() for item in request.tools]
             tool_call_parser = self.tokenizer_manager.server_args.tool_call_parser
             parser = FunctionCallParser(request.tools, tool_call_parser)
@@ -178,25 +178,6 @@ class OpenAIServingChat(OpenAIServingBase):
                 audio_data,
                 modalities,
             )
-            if "tool_calls" in processed_msg and isinstance(
-                processed_msg.get("tool_calls"), list
-            ):
-                for call in processed_msg["tool_calls"]:
-                    try:
-                        if "arguments" in call["function"] and isinstance(
-                            call["function"]["arguments"], str
-                        ):
-                            call["function"]["arguments"] = json.loads(
-                                call["function"]["arguments"]
-                            )
-                    except json.JSONDecodeError as e:
-                        # Log a warning or error if JSON parsing fails for arguments
-                        logger.warning(
-                            f"Failed to parse tool call arguments as JSON: {e}"
-                        )
-                        # Decide whether to continue or raise the exception based on desired behavior
-                        continue  # Or raise e if strict parsing is required
             openai_compatible_messages.append(processed_msg)
         # Handle assistant prefix for continue_final_message

sglang/srt/eplb/expert_distribution.py CHANGED Viewed

@@ -47,6 +47,11 @@ class ExpertDistributionRecorder(ABC):
         rank: int,
     ):
         if server_args.expert_distribution_recorder_mode is not None:
+            assert (
+                expert_location_metadata is not None
+            ), "ExpertLocationMetadata is required for expert distribution recording. One possible"
+            "reason is that you are using a model that does not support expert distribution"
+            "recording. Try setting `get_model_config_for_expert_location` in your model."
             return _ExpertDistributionRecorderReal(
                 server_args, expert_location_metadata, rank
             )

sglang/srt/eplb/expert_location.py CHANGED Viewed

@@ -82,6 +82,10 @@ class ExpertLocationMetadata:
     def init_trivial(server_args: ServerArgs, model_config: ModelConfig):
         """Trivial location - logical expert i corresponds to physical expert i"""
         common = ExpertLocationMetadata._init_common(server_args, model_config)
+        if common is None:
+            return None
         num_physical_experts = common["num_physical_experts"]
         model_config_for_expert_location = common["model_config_for_expert_location"]
         num_layers = model_config_for_expert_location.num_layers
@@ -109,6 +113,10 @@ class ExpertLocationMetadata:
         physical_to_logical_map = physical_to_logical_map.to(server_args.device)
         common = ExpertLocationMetadata._init_common(server_args, model_config)
+        if common is None:
+            return None
         model_config_for_expert_location = common["model_config_for_expert_location"]
         logical_to_all_physical_map = _compute_logical_to_all_physical_map(
             physical_to_logical_map,
@@ -133,6 +141,10 @@ class ExpertLocationMetadata:
         logical_count = logical_count.to(server_args.device)
         common = ExpertLocationMetadata._init_common(server_args, model_config)
+        if common is None:
+            return None
         model_config_for_expert_location = common["model_config_for_expert_location"]
         num_physical_experts = common["num_physical_experts"]
         num_groups = model_config_for_expert_location.num_groups
@@ -168,6 +180,9 @@ class ExpertLocationMetadata:
             ModelConfigForExpertLocation.from_model_config(model_config)
         )
+        if model_config_for_expert_location is None:
+            return None
         num_physical_experts = (
             model_config_for_expert_location.num_logical_experts
             + server_args.ep_num_redundant_experts
@@ -398,10 +413,6 @@ class ModelConfigForExpertLocation:
     num_logical_experts: int
     num_groups: Optional[int] = None
-    @staticmethod
-    def init_dummy():
-        return ModelConfigForExpertLocation(num_layers=1, num_logical_experts=1)
     @staticmethod
     def from_model_config(model_config: ModelConfig):
         model_class, _ = get_model_architecture(model_config)
@@ -410,12 +421,12 @@ class ModelConfigForExpertLocation:
                 model_config.hf_config
             )
         else:
-            return ModelConfigForExpertLocation.init_dummy()
+            return None
 def compute_initial_expert_location_metadata(
     server_args: ServerArgs, model_config: ModelConfig
-) -> ExpertLocationMetadata:
+) -> Optional[ExpertLocationMetadata]:
     data = server_args.init_expert_location
     if data == "trivial":
         return ExpertLocationMetadata.init_trivial(server_args, model_config)

sglang/srt/eplb/expert_location_dispatch.py CHANGED Viewed

@@ -36,6 +36,7 @@ class ExpertLocationDispatchInfo:
     def init_new(cls, layer_id: int):
         ep_dispatch_algorithm = global_server_args_dict["ep_dispatch_algorithm"]
         expert_location_metadata = get_global_expert_location_metadata()
+        assert expert_location_metadata is not None
         if ep_dispatch_algorithm is None:
             return None

sglang/srt/eplb/expert_location_updater.py CHANGED Viewed

@@ -50,6 +50,8 @@ class ExpertLocationUpdater:
             torch.cuda.empty_cache()
         old_expert_location_metadata = get_global_expert_location_metadata()
+        assert old_expert_location_metadata is not None
         _update_expert_weights(
             routed_experts_weights_of_layer=routed_experts_weights_of_layer,
             old_expert_location_metadata=old_expert_location_metadata,

sglang/srt/function_call/function_call_parser.py CHANGED Viewed

@@ -17,6 +17,7 @@ from sglang.srt.function_call.mistral_detector import MistralDetector
 from sglang.srt.function_call.pythonic_detector import PythonicDetector
 from sglang.srt.function_call.qwen3_coder_detector import Qwen3CoderDetector
 from sglang.srt.function_call.qwen25_detector import Qwen25Detector
+from sglang.srt.function_call.step3_detector import Step3Detector
 logger = logging.getLogger(__name__)
@@ -39,6 +40,7 @@ class FunctionCallParser:
         "kimi_k2": KimiK2Detector,
         "qwen3_coder": Qwen3CoderDetector,
         "glm45": Glm4MoeDetector,
+        "step3": Step3Detector,
     }
     def __init__(self, tools: List[Tool], tool_call_parser: str):

sglang/srt/function_call/step3_detector.py ADDED Viewed

@@ -0,0 +1,436 @@
+import ast
+import json
+import logging
+import re
+from typing import Any, Dict, List
+from sglang.srt.entrypoints.openai.protocol import Tool
+from sglang.srt.function_call.base_format_detector import BaseFormatDetector
+from sglang.srt.function_call.core_types import (
+    StreamingParseResult,
+    ToolCallItem,
+    _GetInfoFunc,
+)
+from sglang.srt.function_call.ebnf_composer import EBNFComposer
+logger = logging.getLogger(__name__)
+def get_argument_type(func_name: str, arg_key: str, defined_tools: List[Tool]) -> str:
+    """Get the expected type for a function argument from tool schema."""
+    name2tool = {tool.function.name: tool for tool in defined_tools}
+    if func_name not in name2tool:
+        return None
+    tool = name2tool[func_name]
+    parameters = tool.function.parameters or {}
+    properties = parameters.get("properties", {})
+    if arg_key not in properties:
+        return None
+    return properties[arg_key].get("type", None)
+def parse_arguments(value: str) -> tuple[Any, bool]:
+    """Parse a string value to appropriate type. Returns (parsed_value, success)."""
+    try:
+        try:
+            parsed_value = json.loads(value)
+        except:
+            parsed_value = ast.literal_eval(value)
+        return parsed_value, True
+    except:
+        return value, False
+class Step3Detector(BaseFormatDetector):
+    """
+    Detector for Step3 model function call format.
+    The Step3 format uses special Unicode tokens to delimit function calls
+    with steptml XML format for invocations.
+    Format Structure:
+    ```
+    <｜tool_calls_begin｜>
+    <｜tool_call_begin｜>function<｜tool_sep｜><steptml:invoke name="function_name">
+    <steptml:parameter name="param1">value1</steptml:parameter>
+    <steptml:parameter name="param2">value2</steptml:parameter>
+    </steptml:invoke><｜tool_call_end｜>
+    <｜tool_calls_end｜>
+    ```
+    """
+    def __init__(self):
+        super().__init__()
+        self.bot_token = "<｜tool_calls_begin｜>"
+        self.eot_token = "<｜tool_calls_end｜>"
+        self.tool_call_begin = "<｜tool_call_begin｜>"
+        self.tool_call_end = "<｜tool_call_end｜>"
+        self.tool_sep = "<｜tool_sep｜>"
+        # Regex for parsing steptml invocations
+        self.invoke_regex = re.compile(
+            r'<steptml:invoke name="([^"]+)">(.+?)</steptml:invoke>', re.DOTALL
+        )
+        self.param_regex = re.compile(
+            r'<steptml:parameter name="([^"]+)">([^<]*)</steptml:parameter>', re.DOTALL
+        )
+        # Streaming state variables
+        self._in_tool_block: bool = False
+        self._tool_block_finished: bool = False
+        self._current_function_name: str = ""
+        self._current_parameters: Dict[str, Any] = {}
+        self._in_tool_call: bool = False
+        self._function_name_sent: bool = False
+    def has_tool_call(self, text: str) -> bool:
+        """Check if the text contains a Step3 format tool call."""
+        return self.bot_token in text
+    def _parse_steptml_invoke(
+        self, text: str, tools: List[Tool] = None
+    ) -> tuple[str, dict]:
+        """Parse steptml invoke format to extract function name and parameters."""
+        invoke_match = self.invoke_regex.search(text)
+        if not invoke_match:
+            return None, {}
+        func_name = invoke_match.group(1)
+        params_text = invoke_match.group(2)
+        params = {}
+        for param_match in self.param_regex.finditer(params_text):
+            param_name = param_match.group(1)
+            param_value = param_match.group(2).strip()
+            # If tools provided, use schema-aware parsing
+            if tools:
+                arg_type = get_argument_type(func_name, param_name, tools)
+                if arg_type and arg_type != "string":
+                    parsed_value, _ = parse_arguments(param_value)
+                    params[param_name] = parsed_value
+                else:
+                    params[param_name] = param_value
+            else:
+                # Fallback to generic parsing if no tools provided
+                parsed_value, _ = parse_arguments(param_value)
+                params[param_name] = parsed_value
+        return func_name, params
+    def detect_and_parse(self, text: str, tools: List[Tool]) -> StreamingParseResult:
+        """
+        One-time parsing: Detects and parses tool calls in the provided text.
+        """
+        if self.bot_token not in text:
+            return StreamingParseResult(normal_text=text, calls=[])
+        try:
+            pre_text, rest = text.split(self.bot_token, 1)
+            # If no end token, return everything as normal text
+            if self.eot_token not in rest:
+                return StreamingParseResult(normal_text=text, calls=[])
+            tool_section, post_text = rest.split(self.eot_token, 1)
+            # Find all individual tool calls using regex
+            calls = []
+            tool_call_pattern = (
+                f"{re.escape(self.tool_call_begin)}(.*?){re.escape(self.tool_call_end)}"
+            )
+            for match in re.finditer(tool_call_pattern, tool_section, re.DOTALL):
+                call_content = match.group(1)
+                # Check if it's a function call
+                if self.tool_sep not in call_content:
+                    continue
+                type_part, invoke_part = call_content.split(self.tool_sep, 1)
+                if type_part.strip() != "function":
+                    continue
+                func_name, params = self._parse_steptml_invoke(invoke_part, tools)
+                if func_name:
+                    # Use parse_base_json to create the ToolCallItem
+                    action = {"name": func_name, "arguments": params}
+                    calls.extend(self.parse_base_json(action, tools))
+            # Combine pre and post text
+            normal_text = pre_text + post_text
+            return StreamingParseResult(normal_text=normal_text, calls=calls)
+        except Exception as e:
+            logger.error(f"Error in detect_and_parse: {e}")
+            # Return the original text if parsing fails
+            return StreamingParseResult(normal_text=text)
+    def parse_streaming_increment(
+        self, new_text: str, tools: List[Tool]
+    ) -> StreamingParseResult:
+        """
+        Streaming incremental parsing for Step3 format.
+        """
+        self._buffer += new_text
+        # Build tool indices for validation
+        if not hasattr(self, "_tool_indices"):
+            self._tool_indices = self._get_tool_indices(tools)
+        # If we've finished the tool block, everything is normal text
+        if self._tool_block_finished:
+            normal_text = self._buffer
+            self._buffer = ""
+            return StreamingParseResult(normal_text=normal_text)
+        # Check if tool block hasn't started yet
+        if not self._in_tool_block:
+            if self.bot_token in self._buffer:
+                idx = self._buffer.find(self.bot_token)
+                normal_text = self._buffer[:idx]
+                self._buffer = self._buffer[idx + len(self.bot_token) :]
+                self._in_tool_block = True
+                return StreamingParseResult(normal_text=normal_text)
+            else:
+                # Check if we might have a partial bot_token
+                partial_len = self._ends_with_partial_token(
+                    self._buffer, self.bot_token
+                )
+                if partial_len:
+                    return StreamingParseResult()  # Wait for more text
+                else:
+                    normal_text = self._buffer
+                    self._buffer = ""
+                    return StreamingParseResult(normal_text=normal_text)
+        # We're inside the tool block
+        calls: List[ToolCallItem] = []
+        # Check if tool block is ending
+        if self.eot_token in self._buffer:
+            idx = self._buffer.find(self.eot_token)
+            # If we're in the middle of a tool call, we need to handle it
+            if self._in_tool_call:
+                # The buffer before eot_token might contain the end of the current tool call
+                before_eot = self._buffer[:idx]
+                if self.tool_call_end in before_eot:
+                    # Parse this final tool call
+                    result = self._parse_partial_tool_call(tools)
+                    calls.extend(result.calls)
+                else:
+                    # Incomplete tool call - log warning
+                    logger.warning("Tool block ended with incomplete tool call")
+            remaining = self._buffer[idx + len(self.eot_token) :]
+            self._buffer = ""
+            self._tool_block_finished = True
+            # Reset any partial tool call state
+            self._reset_streaming_state()
+            return StreamingParseResult(normal_text=remaining, calls=calls)
+        # Check if we're in a tool call or need to start one
+        if not self._in_tool_call:
+            if self.tool_call_begin in self._buffer:
+                idx = self._buffer.find(self.tool_call_begin)
+                # Remove any content before tool call begin (shouldn't happen but be safe)
+                self._buffer = self._buffer[idx + len(self.tool_call_begin) :]
+                self._in_tool_call = True
+                self._function_name_sent = False
+                self._current_function_name = ""
+                self._current_parameters = {}
+                # Fall through to parse the partial tool call
+            else:
+                # Wait for tool call to begin
+                return StreamingParseResult()
+        # Parse partial tool call
+        if self._in_tool_call:
+            return self._parse_partial_tool_call(tools)
+        return StreamingParseResult()
+    def _parse_partial_tool_call(self, tools: List[Tool]) -> StreamingParseResult:
+        """Parse partial tool call for streaming scenarios."""
+        calls = []
+        # Check if we have tool_sep (means we're past the type declaration)
+        if self.tool_sep not in self._buffer:
+            return StreamingParseResult(calls=calls)  # Wait for more text
+        type_part, invoke_part = self._buffer.split(self.tool_sep, 1)
+        if type_part.strip() != "function":
+            # Invalid tool type, skip this tool call
+            self._reset_streaming_state()
+            return StreamingParseResult(calls=calls)
+        # Try to extract function name if not sent yet
+        if not self._function_name_sent:
+            name_match = re.search(r'<steptml:invoke name="([^"]+)">', invoke_part)
+            if name_match:
+                func_name = name_match.group(1)
+                # Validate function name
+                if func_name in self._tool_indices:
+                    self._current_function_name = func_name
+                    self._function_name_sent = True
+                    # Initialize tool tracking
+                    if self.current_tool_id == -1:
+                        self.current_tool_id = 0
+                    # Ensure tracking arrays are large enough
+                    while len(self.prev_tool_call_arr) <= self.current_tool_id:
+                        self.prev_tool_call_arr.append({})
+                    while len(self.streamed_args_for_tool) <= self.current_tool_id:
+                        self.streamed_args_for_tool.append("")
+                    # Store tool call info
+                    self.prev_tool_call_arr[self.current_tool_id] = {
+                        "name": func_name,
+                        "arguments": {},
+                    }
+                    # Send tool name with empty parameters
+                    calls.append(
+                        ToolCallItem(
+                            tool_index=self.current_tool_id,
+                            name=func_name,
+                            parameters="",
+                        )
+                    )
+                else:
+                    # Invalid function name
+                    logger.warning(f"Invalid function name: {func_name}")
+                    self._reset_streaming_state()
+                    return StreamingParseResult(calls=calls)
+            else:
+                # Function name not complete yet
+                return StreamingParseResult(calls=calls)
+        # Parse parameters incrementally
+        if self._function_name_sent:
+            # Extract all complete parameters
+            new_params = {}
+            for param_match in self.param_regex.finditer(invoke_part):
+                param_name = param_match.group(1)
+                param_value = param_match.group(2).strip()
+                # Use schema-aware parsing
+                arg_type = get_argument_type(
+                    self._current_function_name, param_name, tools
+                )
+                if arg_type and arg_type != "string":
+                    parsed_value, _ = parse_arguments(param_value)
+                    new_params[param_name] = parsed_value
+                else:
+                    new_params[param_name] = param_value
+            # Check if we have new parameters to stream
+            if new_params != self._current_parameters:
+                # Build the JSON content without the closing brace for streaming
+                if not self._current_parameters:
+                    # First parameters - send opening brace and content
+                    params_content = json.dumps(new_params, ensure_ascii=False)
+                    if len(params_content) > 2:  # More than just "{}"
+                        # Send everything except the closing brace
+                        diff = params_content[:-1]
+                    else:
+                        diff = "{"
+                else:
+                    # Subsequent parameters - calculate the incremental diff
+                    old_json = json.dumps(self._current_parameters, ensure_ascii=False)
+                    new_json = json.dumps(new_params, ensure_ascii=False)
+                    # Remove closing braces for comparison
+                    old_without_brace = old_json[:-1]
+                    new_without_brace = new_json[:-1]
+                    # The new content should extend the old content
+                    if new_without_brace.startswith(old_without_brace):
+                        diff = new_without_brace[len(old_without_brace) :]
+                    else:
+                        # Parameters changed in unexpected way - shouldn't happen in normal streaming
+                        diff = ""
+                if diff:
+                    calls.append(
+                        ToolCallItem(
+                            tool_index=self.current_tool_id,
+                            parameters=diff,
+                        )
+                    )
+                    self.streamed_args_for_tool[self.current_tool_id] += diff
+                # Update current state
+                self._current_parameters = new_params
+                self.prev_tool_call_arr[self.current_tool_id]["arguments"] = new_params
+            # Check if tool call is complete
+            if self.tool_call_end in self._buffer:
+                # Send closing brace if we've sent any parameters
+                if self.streamed_args_for_tool[self.current_tool_id]:
+                    calls.append(
+                        ToolCallItem(
+                            tool_index=self.current_tool_id,
+                            parameters="}",
+                        )
+                    )
+                    self.streamed_args_for_tool[self.current_tool_id] += "}"
+                # Find the end position
+                end_idx = self._buffer.find(self.tool_call_end)
+                # Remove the processed tool call from buffer
+                self._buffer = self._buffer[end_idx + len(self.tool_call_end) :]
+                # Reset state for next tool call
+                self._reset_streaming_state()
+                self.current_tool_id += 1
+        return StreamingParseResult(calls=calls)
+    def _reset_streaming_state(self):
+        """Reset streaming state for the next tool call"""
+        self._in_tool_call = False
+        self._function_name_sent = False
+        self._current_function_name = ""
+        self._current_parameters = {}
+    def supports_structural_tag(self) -> bool:
+        """Return True if this detector supports structural tag format."""
+        return False
+    def structure_info(self) -> _GetInfoFunc:
+        raise NotImplementedError()
+    def build_ebnf(self, tools: List[Tool]) -> str:
+        """
+        Build EBNF grammar for Step3 tool call format.
+        """
+        # Custom call rule for steptml format
+        call_rule_fmt = (
+            '"function" "<｜tool_sep｜>" "<steptml:invoke name=\\"{name}\\">" '
+            '{arguments_rule} "</steptml:invoke>"'
+        )
+        # Custom key-value rule for steptml parameters
+        key_value_rule_fmt = (
+            '"<steptml:parameter name=\\"{key}\\">" {valrule} "</steptml:parameter>"'
+        )
+        return EBNFComposer.build_ebnf(
+            tools,
+            sequence_start_token=self.bot_token,
+            sequence_end_token=self.eot_token,
+            individual_call_start_token=self.tool_call_begin,
+            individual_call_end_token=self.tool_call_end,
+            tool_call_separator="",
+            function_format="xml",
+            call_rule_fmt=call_rule_fmt,
+            key_value_rule_fmt=key_value_rule_fmt,
+            key_value_separator="",
+        )

sglang/srt/hf_transformers_utils.py CHANGED Viewed

@@ -41,6 +41,7 @@ from sglang.srt.configs import (
     ExaoneConfig,
     KimiVLConfig,
     MultiModalityConfig,
+    Step3VLConfig,
 )
 from sglang.srt.configs.internvl import InternVLChatConfig
 from sglang.srt.connector import create_remote_connector
@@ -54,6 +55,7 @@ _CONFIG_REGISTRY: Dict[str, Type[PretrainedConfig]] = {
     MultiModalityConfig.model_type: MultiModalityConfig,
     KimiVLConfig.model_type: KimiVLConfig,
     InternVLChatConfig.model_type: InternVLChatConfig,
+    Step3VLConfig.model_type: Step3VLConfig,
 }
 for name, cls in _CONFIG_REGISTRY.items():

sglang/srt/jinja_template_utils.py CHANGED Viewed

@@ -165,7 +165,7 @@ def process_content_for_template_format(
         new_msg["content"] = processed_content_parts
         return new_msg
-    else:  # content_format == "string"
+    elif content_format == "string":
         # String format: flatten to text only (for templates like DeepSeek)
         text_parts = []
         for chunk in msg_dict["content"]:
@@ -179,3 +179,6 @@ def process_content_for_template_format(
         new_msg["content"] = " ".join(text_parts) if text_parts else ""
         new_msg = {k: v for k, v in new_msg.items() if v is not None}
         return new_msg
+    else:
+        raise ValueError(f"Invalid content format: {content_format}")

sglang 0.4.9.post6__py3-none-any.whl → 0.4.10.post1__py3-none-any.whl

sglang 0.4.9.post6__py3-none-any.whl → 0.4.10.post1__py3-none-any.whl