sglang 0.4.9.post4__py3-none-any.whl → 0.4.9.post6__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (98)
  1. sglang/lang/chat_template.py +21 -0
  2. sglang/srt/configs/internvl.py +3 -0
  3. sglang/srt/configs/model_config.py +7 -0
  4. sglang/srt/constrained/base_grammar_backend.py +10 -2
  5. sglang/srt/constrained/xgrammar_backend.py +7 -5
  6. sglang/srt/conversation.py +16 -1
  7. sglang/srt/debug_utils/__init__.py +0 -0
  8. sglang/srt/debug_utils/dump_comparator.py +131 -0
  9. sglang/srt/debug_utils/dumper.py +108 -0
  10. sglang/srt/debug_utils/text_comparator.py +172 -0
  11. sglang/srt/disaggregation/decode_schedule_batch_mixin.py +13 -1
  12. sglang/srt/disaggregation/mooncake/conn.py +16 -0
  13. sglang/srt/disaggregation/prefill.py +13 -1
  14. sglang/srt/entrypoints/engine.py +4 -2
  15. sglang/srt/entrypoints/http_server.py +13 -1
  16. sglang/srt/entrypoints/openai/protocol.py +3 -1
  17. sglang/srt/entrypoints/openai/serving_base.py +5 -2
  18. sglang/srt/entrypoints/openai/serving_chat.py +132 -79
  19. sglang/srt/function_call/ebnf_composer.py +10 -3
  20. sglang/srt/function_call/function_call_parser.py +2 -0
  21. sglang/srt/function_call/glm4_moe_detector.py +164 -0
  22. sglang/srt/function_call/qwen3_coder_detector.py +1 -0
  23. sglang/srt/layers/attention/hybrid_attn_backend.py +100 -0
  24. sglang/srt/layers/attention/vision.py +56 -8
  25. sglang/srt/layers/layernorm.py +26 -1
  26. sglang/srt/layers/logits_processor.py +14 -3
  27. sglang/srt/layers/moe/ep_moe/layer.py +323 -242
  28. sglang/srt/layers/moe/ep_moe/token_dispatcher.py +83 -118
  29. sglang/srt/layers/moe/fused_moe_triton/configs/triton_3_2_0/E=160,N=320,device_name=NVIDIA_A800-SXM4-80GB,dtype=int8_w8a8.json +146 -0
  30. sglang/srt/layers/moe/fused_moe_triton/configs/triton_3_3_1/E=160,N=192,device_name=NVIDIA_H200,dtype=fp8_w8a8.json +146 -0
  31. sglang/srt/layers/moe/fused_moe_triton/configs/triton_3_3_1/E=160,N=384,device_name=NVIDIA_H200,dtype=fp8_w8a8.json +146 -0
  32. sglang/srt/layers/moe/fused_moe_triton/layer.py +38 -48
  33. sglang/srt/layers/moe/fused_moe_triton/triton_kernels_moe.py +11 -8
  34. sglang/srt/layers/moe/token_dispatcher/__init__.py +0 -0
  35. sglang/srt/layers/moe/token_dispatcher/base_dispatcher.py +48 -0
  36. sglang/srt/layers/moe/token_dispatcher/standard.py +19 -0
  37. sglang/srt/layers/moe/topk.py +90 -24
  38. sglang/srt/layers/multimodal.py +11 -8
  39. sglang/srt/layers/quantization/fp8.py +25 -247
  40. sglang/srt/layers/quantization/fp8_kernel.py +78 -48
  41. sglang/srt/layers/quantization/modelopt_quant.py +27 -10
  42. sglang/srt/layers/quantization/unquant.py +24 -76
  43. sglang/srt/layers/quantization/w4afp8.py +68 -17
  44. sglang/srt/lora/lora_registry.py +93 -29
  45. sglang/srt/managers/cache_controller.py +9 -7
  46. sglang/srt/managers/data_parallel_controller.py +4 -0
  47. sglang/srt/managers/io_struct.py +12 -0
  48. sglang/srt/managers/mm_utils.py +154 -35
  49. sglang/srt/managers/multimodal_processor.py +3 -14
  50. sglang/srt/managers/schedule_batch.py +14 -8
  51. sglang/srt/managers/scheduler.py +64 -1
  52. sglang/srt/managers/scheduler_input_blocker.py +106 -0
  53. sglang/srt/managers/tokenizer_manager.py +80 -15
  54. sglang/srt/managers/tp_worker.py +8 -0
  55. sglang/srt/mem_cache/hiradix_cache.py +5 -2
  56. sglang/srt/model_executor/model_runner.py +83 -27
  57. sglang/srt/models/deepseek_v2.py +75 -84
  58. sglang/srt/models/glm4_moe.py +1035 -0
  59. sglang/srt/models/glm4_moe_nextn.py +167 -0
  60. sglang/srt/models/interns1.py +328 -0
  61. sglang/srt/models/internvl.py +143 -47
  62. sglang/srt/models/llava.py +9 -5
  63. sglang/srt/models/minicpmo.py +4 -1
  64. sglang/srt/models/qwen2_moe.py +2 -2
  65. sglang/srt/models/qwen3_moe.py +17 -71
  66. sglang/srt/multimodal/processors/base_processor.py +20 -6
  67. sglang/srt/multimodal/processors/clip.py +2 -2
  68. sglang/srt/multimodal/processors/deepseek_vl_v2.py +2 -2
  69. sglang/srt/multimodal/processors/gemma3.py +2 -2
  70. sglang/srt/multimodal/processors/gemma3n.py +2 -2
  71. sglang/srt/multimodal/processors/internvl.py +21 -8
  72. sglang/srt/multimodal/processors/janus_pro.py +2 -2
  73. sglang/srt/multimodal/processors/kimi_vl.py +2 -2
  74. sglang/srt/multimodal/processors/llava.py +4 -4
  75. sglang/srt/multimodal/processors/minicpm.py +2 -3
  76. sglang/srt/multimodal/processors/mlama.py +2 -2
  77. sglang/srt/multimodal/processors/mllama4.py +18 -111
  78. sglang/srt/multimodal/processors/phi4mm.py +2 -2
  79. sglang/srt/multimodal/processors/pixtral.py +2 -2
  80. sglang/srt/multimodal/processors/qwen_audio.py +2 -2
  81. sglang/srt/multimodal/processors/qwen_vl.py +2 -2
  82. sglang/srt/multimodal/processors/vila.py +3 -1
  83. sglang/srt/poll_based_barrier.py +31 -0
  84. sglang/srt/reasoning_parser.py +2 -1
  85. sglang/srt/server_args.py +65 -6
  86. sglang/srt/two_batch_overlap.py +8 -3
  87. sglang/srt/utils.py +96 -1
  88. sglang/srt/weight_sync/utils.py +119 -0
  89. sglang/test/runners.py +4 -0
  90. sglang/test/test_utils.py +118 -5
  91. sglang/utils.py +19 -0
  92. sglang/version.py +1 -1
  93. {sglang-0.4.9.post4.dist-info → sglang-0.4.9.post6.dist-info}/METADATA +5 -4
  94. {sglang-0.4.9.post4.dist-info → sglang-0.4.9.post6.dist-info}/RECORD +97 -80
  95. sglang/srt/debug_utils.py +0 -74
  96. {sglang-0.4.9.post4.dist-info → sglang-0.4.9.post6.dist-info}/WHEEL +0 -0
  97. {sglang-0.4.9.post4.dist-info → sglang-0.4.9.post6.dist-info}/licenses/LICENSE +0 -0
  98. {sglang-0.4.9.post4.dist-info → sglang-0.4.9.post6.dist-info}/top_level.txt +0 -0
sglang/srt/entrypoints/engine.py
@@ -640,7 +640,7 @@ def _set_envs_and_config(server_args: ServerArgs):
     if server_args.attention_backend == "flashinfer":
         assert_pkg_version(
             "flashinfer_python",
-            "0.2.9rc1",
+            "0.2.9rc2",
             "Please uninstall the old version and "
             "reinstall the latest version by following the instructions "
             "at https://docs.flashinfer.ai/installation.html.",
@@ -765,7 +765,9 @@ def _launch_subprocesses(
         # When using `Engine` as a Python API, we don't want to block here.
         return None, None, None

-    launch_dummy_health_check_server(server_args.host, server_args.port)
+    launch_dummy_health_check_server(
+        server_args.host, server_args.port, server_args.enable_metrics
+    )

     for proc in scheduler_procs:
         proc.join()
sglang/srt/entrypoints/http_server.py
@@ -38,7 +38,7 @@ import orjson
 import requests
 import uvicorn
 import uvloop
-from fastapi import Depends, FastAPI, Request, UploadFile
+from fastapi import Depends, FastAPI, HTTPException, Request, UploadFile
 from fastapi.exceptions import RequestValidationError
 from fastapi.middleware.cors import CORSMiddleware
 from fastapi.responses import ORJSONResponse, Response, StreamingResponse
@@ -174,6 +174,18 @@ app.add_middleware(
 )


+@app.exception_handler(HTTPException)
+async def validation_exception_handler(request: Request, exc: HTTPException):
+    """Enrich HTTP exception with status code and other details"""
+    error = ErrorResponse(
+        object="error",
+        message=exc.detail,
+        type=str(exc.status_code),
+        code=exc.status_code,
+    )
+    return ORJSONResponse(content=error.model_dump(), status_code=exc.status_code)
+
+
 # Custom exception handlers to change validation error status codes
 @app.exception_handler(RequestValidationError)
 async def validation_exception_handler(request: Request, exc: RequestValidationError):
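With this handler registered, an HTTPException raised anywhere in the app comes back as an OpenAI-style error object instead of FastAPI's default {"detail": ...} body. A minimal client-side sketch of the effect (the endpoint, payload, and error text are illustrative):

    import requests

    resp = requests.post(
        "http://localhost:30000/v1/chat/completions",  # illustrative host/port
        json={"model": "some-model", "messages": [{"role": "user", "content": "hi"}]},
    )
    if resp.status_code != 200:
        # For HTTPException(status_code=404, detail="Model not found"), the body
        # now looks like {"object": "error", "message": "Model not found",
        # "type": "404", "code": 404, ...} (other ErrorResponse fields omitted).
        print(resp.json())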
sglang/srt/entrypoints/openai/protocol.py
@@ -317,7 +317,9 @@ class ToolCall(BaseModel):

 class ChatCompletionMessageGenericParam(BaseModel):
     role: Literal["system", "assistant", "tool"]
-    content: Union[str, List[ChatCompletionMessageContentTextPart], None]
+    content: Union[str, List[ChatCompletionMessageContentTextPart], None] = Field(
+        default=None
+    )
     tool_call_id: Optional[str] = None
     name: Optional[str] = None
     reasoning_content: Optional[str] = None
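The explicit Field(default=None) matters: pydantic treats a field whose annotation merely permits None as required unless a default is supplied, so an assistant message that omits content (for example, one carrying only tool calls) would previously fail validation. A standalone sketch with a simplified model:

    from typing import Literal, Optional, Union

    from pydantic import BaseModel, Field

    class Msg(BaseModel):  # simplified stand-in for ChatCompletionMessageGenericParam
        role: Literal["system", "assistant", "tool"]
        content: Union[str, None] = Field(default=None)
        tool_call_id: Optional[str] = None

    print(Msg(role="assistant").content)  # None, instead of a validation error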
sglang/srt/entrypoints/openai/serving_base.py
@@ -4,7 +4,7 @@ import uuid
 from abc import ABC, abstractmethod
 from typing import Any, Optional, Union

-from fastapi import Request
+from fastapi import HTTPException, Request
 from fastapi.responses import ORJSONResponse, StreamingResponse

 from sglang.srt.entrypoints.openai.protocol import ErrorResponse, OpenAIServingRequest
@@ -45,7 +45,10 @@ class OpenAIServingBase(ABC):
             return await self._handle_non_streaming_request(
                 adapted_request, processed_request, raw_request
             )
-
+        except HTTPException as e:
+            return self.create_error_response(
+                message=e.detail, err_type=str(e.status_code), status_code=e.status_code
+            )
         except Exception as e:
             logger.exception(f"Error in request: {e}")
             return self.create_error_response(
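Together with the app-level handler above, this lets any serving implementation reject a request by raising HTTPException and have it surface as a structured error with the correct status code rather than a generic 500. A hypothetical sketch (the helper name and arguments are illustrative, not sglang APIs):

    from fastapi import HTTPException

    def require_loaded_adapter(name: str, loaded: set) -> None:
        # Hypothetical validation step inside a request handler.
        if name not in loaded:
            raise HTTPException(status_code=400, detail=f"LoRA adapter not loaded: {name}")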
sglang/srt/entrypoints/openai/serving_chat.py
@@ -412,6 +412,8 @@ class OpenAIServingChat(OpenAIServingBase):
         is_firsts = {}
         stream_buffers = {}
         n_prev_tokens = {}
+        has_tool_calls = {}
+        finish_reasons = {}

         # Usage tracking
         prompt_tokens = {}
@@ -443,6 +445,10 @@
                 finish_reason = content["meta_info"]["finish_reason"]
                 finish_reason_type = finish_reason["type"] if finish_reason else None

+                # Track finish_reason for each index
+                if finish_reason_type:
+                    finish_reasons[index] = finish_reason
+
                 # First chunk with role
                 if is_firsts.get(index, True):
                     is_firsts[index] = False
@@ -450,13 +456,8 @@ class OpenAIServingChat(OpenAIServingBase):
                     choice_data = ChatCompletionResponseStreamChoice(
                         index=index,
                         delta=delta,
-                        finish_reason=finish_reason_type,
-                        matched_stop=(
-                            finish_reason["matched"]
-                            if finish_reason and "matched" in finish_reason
-                            else None
-                        ),
-                        logprobs=choice_logprobs,
+                        finish_reason=None,
+                        logprobs=None,
                     )
                     chunk = ChatCompletionStreamResponse(
                         id=content["meta_info"]["id"],
@@ -483,7 +484,7 @@
                     choice_data = ChatCompletionResponseStreamChoice(
                         index=index,
                         delta=DeltaMessage(reasoning_content=reasoning_text),
-                        finish_reason=finish_reason_type,
+                        finish_reason=None,
                     )
                     chunk = ChatCompletionStreamResponse(
                         id=content["meta_info"]["id"],
@@ -493,45 +494,36 @@
                     )
                     yield f"data: {chunk.model_dump_json()}\n\n"

-                if not delta:
-                    continue
-
                 # Handle tool calls
                 if request.tool_choice != "none" and request.tools:
-                    async for (
-                        chunk,
-                        tool_call_finish_reason_type,
-                    ) in self._process_tool_call_stream(
+                    async for chunk in self._process_tool_call_stream(
                         index,
                         delta,
                         parser_dict,
                         content,
                         request,
-                        finish_reason_type,
+                        has_tool_calls,
                     ):
                         if chunk:
                             yield chunk
-                        finish_reason_type = tool_call_finish_reason_type
+
+                    # Send any remaining tool call arguments when generation finishes
+                    if finish_reason_type is not None and index in parser_dict:
+                        parser = parser_dict[index]
+                        remaining_chunk = self._check_for_unstreamed_tool_args(
+                            parser, content, request, index
+                        )
+                        if remaining_chunk:
+                            yield remaining_chunk

                 else:
                     # Regular content
-                    if delta or not (
-                        request.stream_options and request.stream_options.include_usage
-                    ):
+                    if delta:
                         choice_data = ChatCompletionResponseStreamChoice(
                             index=index,
                             delta=DeltaMessage(content=delta if delta else None),
-                            finish_reason=(
-                                None
-                                if request.stream_options
-                                and request.stream_options.include_usage
-                                else finish_reason_type
-                            ),
-                            matched_stop=(
-                                finish_reason["matched"]
-                                if finish_reason and "matched" in finish_reason
-                                else None
-                            ),
+                            finish_reason=None,
+                            matched_stop=None,
                             logprobs=choice_logprobs,
                         )
                         chunk = ChatCompletionStreamResponse(
@@ -542,26 +534,36 @@
                         )
                         yield f"data: {chunk.model_dump_json()}\n\n"

-            # Final chunk with finish_reason
-            finish_reason_chunk = ChatCompletionStreamResponse(
-                id=content["meta_info"]["id"],
-                created=int(time.time()),
-                choices=[
-                    ChatCompletionResponseStreamChoice(
-                        index=index,
-                        delta=DeltaMessage(),
-                        finish_reason=finish_reason_type,
-                        matched_stop=(
-                            finish_reason["matched"]
-                            if finish_reason and "matched" in finish_reason
-                            else None
-                        ),
-                    )
-                ],
-                model=request.model,
-                usage=None,
-            )
-            yield f"data: {finish_reason_chunk.model_dump_json()}\n\n"
+            # Send finish_reason chunks for each index that completed
+            for idx, finish_reason_data in finish_reasons.items():
+                finish_reason_type = finish_reason_data["type"]
+
+                # Change finish_reason to "tool_calls" if we had tool calls and stopped naturally
+                final_finish_reason = finish_reason_type
+                if has_tool_calls.get(idx, False) and finish_reason_type == "stop":
+                    final_finish_reason = "tool_calls"
+
+                finish_reason_chunk = ChatCompletionStreamResponse(
+                    id=content["meta_info"][
+                        "id"
+                    ],  # NOTE: openai uses the same chatcmpl-id for all indices
+                    created=int(time.time()),
+                    choices=[
+                        ChatCompletionResponseStreamChoice(
+                            index=idx,
+                            delta=DeltaMessage(),
+                            finish_reason=final_finish_reason,
+                            matched_stop=(
+                                finish_reason_data["matched"]
+                                if "matched" in finish_reason_data
+                                else None
+                            ),
+                        )
+                    ],
+                    model=request.model,
+                    usage=None,
+                )
+                yield f"data: {finish_reason_chunk.model_dump_json()}\n\n"

             # Send hidden states if requested
             if request.return_hidden_states and hidden_states:
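The net effect on the wire: content deltas no longer carry finish_reason, and each choice index now gets exactly one terminal chunk. For a request with n=2 where choice 0 emitted tool calls and stopped naturally, the tail of the stream would look roughly like this (ids abbreviated, most fields omitted):

    data: {"id":"chatcmpl-xyz","choices":[{"index":0,"delta":{},"finish_reason":"tool_calls"}],...}
    data: {"id":"chatcmpl-xyz","choices":[{"index":1,"delta":{},"finish_reason":"stop"}],...}
    data: [DONE]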
@@ -581,7 +583,7 @@
                         delta=DeltaMessage(
                             hidden_states=last_token_hidden_states
                         ),
-                        finish_reason=finish_reason_type,
+                        finish_reason=None,  # Hidden states don't need finish_reason
                     )
                 ],
                 model=request.model,
@@ -860,7 +862,7 @@
         parser_dict: Dict[int, FunctionCallParser],
         content: Dict[str, Any],
         request: ChatCompletionRequest,
-        finish_reason_type: Optional[str],
+        has_tool_calls: Dict[int, bool],
     ):
         """Process tool calls in streaming response"""
         if index not in parser_dict:
@@ -877,7 +879,7 @@
             choice_data = ChatCompletionResponseStreamChoice(
                 index=index,
                 delta=DeltaMessage(content=normal_text),
-                finish_reason=finish_reason_type,
+                finish_reason=None,
             )
             chunk = ChatCompletionStreamResponse(
                 id=content["meta_info"]["id"],
@@ -885,10 +887,13 @@
                 choices=[choice_data],
                 model=request.model,
             )
-            yield f"data: {chunk.model_dump_json()}\n\n", finish_reason_type
+            yield f"data: {chunk.model_dump_json()}\n\n"

         # Yield tool calls
         for call_item in calls:
+            # Mark that this choice has tool calls
+            has_tool_calls[index] = True
+
             # Tool call ID should be generated only once per tool call
             if call_item.name:
                 # First chunk: include ID and function name
@@ -899,23 +904,6 @@
                 tool_call_id = None
                 function_name = None

-            if finish_reason_type == "stop":
-                # Handle remaining arguments
-                latest_delta_len = 0
-                if isinstance(call_item.parameters, str):
-                    latest_delta_len = len(call_item.parameters)
-
-                expected_call = json.dumps(
-                    parser.detector.prev_tool_call_arr[index].get("arguments", {}),
-                    ensure_ascii=False,
-                )
-                actual_call = parser.detector.streamed_args_for_tool[index]
-                if latest_delta_len > 0:
-                    actual_call = actual_call[:-latest_delta_len]
-                remaining_call = expected_call.replace(actual_call, "", 1)
-                call_item.parameters = remaining_call
-                finish_reason_type = "tool_calls"
-
             tool_call = ToolCall(
                 id=tool_call_id,
                 index=call_item.tool_index,
@@ -928,19 +916,84 @@
             choice_data = ChatCompletionResponseStreamChoice(
                 index=index,
                 delta=DeltaMessage(tool_calls=[tool_call]),
-                finish_reason=(
-                    None
-                    if request.stream_options and request.stream_options.include_usage
-                    else finish_reason_type
+                finish_reason=None,
+            )
+            chunk = ChatCompletionStreamResponse(
+                id=content["meta_info"]["id"],
+                created=int(time.time()),
+                choices=[choice_data],
+                model=request.model,
+            )
+            yield f"data: {chunk.model_dump_json()}\n\n"
+
+    def _check_for_unstreamed_tool_args(
+        self,
+        parser: FunctionCallParser,
+        content: Dict[str, Any],
+        request: ChatCompletionRequest,
+        index: int,
+    ) -> Optional[str]:
+        """
+        Check for any remaining tool call arguments that need to be streamed
+        when generation finishes. This ensures tool calls are properly completed
+        even if the model generates the final arguments in the last chunk.
+        """
+        # Only check if we have tool calls and the parser has tracked data
+        if (
+            not hasattr(parser.detector, "prev_tool_call_arr")
+            or not parser.detector.prev_tool_call_arr
+        ):
+            return None
+
+        if (
+            not hasattr(parser.detector, "streamed_args_for_tool")
+            or not parser.detector.streamed_args_for_tool
+        ):
+            return None
+
+        # Get the last tool call that was being processed
+        tool_index = len(parser.detector.prev_tool_call_arr) - 1
+        if tool_index < 0 or tool_index >= len(parser.detector.streamed_args_for_tool):
+            return None
+
+        # Get expected vs actual arguments
+        expected_args = parser.detector.prev_tool_call_arr[tool_index].get(
+            "arguments", {}
+        )
+        expected_call = json.dumps(expected_args, ensure_ascii=False)
+        actual_call = parser.detector.streamed_args_for_tool[tool_index]
+
+        # Check if there are remaining arguments to send
+        remaining_call = (
+            expected_call.replace(actual_call, "", 1)
+            if actual_call in expected_call
+            else ""
+        )
+
+        if remaining_call:
+            # Create tool call chunk with remaining arguments
+            tool_call = ToolCall(
+                id=None,  # No ID for argument deltas
+                index=tool_index,
+                function=FunctionResponse(
+                    name=None,  # No name for argument deltas
+                    arguments=remaining_call,
                 ),
             )
+
+            choice_data = ChatCompletionResponseStreamChoice(
+                index=index,
+                delta=DeltaMessage(tool_calls=[tool_call]),
+                finish_reason=None,  # Don't send finish_reason with this chunk
+            )
+
             chunk = ChatCompletionStreamResponse(
                 id=content["meta_info"]["id"],
                 created=int(time.time()),
                 choices=[choice_data],
                 model=request.model,
             )
-            yield f"data: {chunk.model_dump_json()}\n\n", finish_reason_type

-        if finish_reason_type == "stop":
-            yield None, "tool_calls"
+            return f"data: {chunk.model_dump_json()}\n\n"
+
+        return None
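The remainder computation can be sanity-checked in isolation; with made-up values, the returned tail is exactly the suffix of the serialized arguments that never reached the client:

    import json

    expected_call = json.dumps({"city": "Beijing", "date": "2024-06-27"}, ensure_ascii=False)
    actual_call = '{"city": "Beijing", "date": "2024-06'  # streamed so far (made up)
    remaining = (
        expected_call.replace(actual_call, "", 1) if actual_call in expected_call else ""
    )
    print(remaining)  # -27"}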
sglang/srt/function_call/ebnf_composer.py
@@ -165,6 +165,7 @@ class EBNFComposer:
         tool_call_separator: Optional[str] = None,
         call_rule_fmt: Optional[str] = None,
         key_value_rule_fmt: Optional[str] = None,
+        key_value_separator: str = ",",
     ):
         """
         Generalized EBNF builder for all detectors.
@@ -279,7 +280,11 @@

             # Add required properties joined by commas
             if required:
-                rule_parts.append(' "," '.join(prop_kv_pairs[k] for k in required))
+                rule_parts.append(
+                    f' "{key_value_separator}" '.join(
+                        prop_kv_pairs[k] for k in required
+                    )
+                )

             # Add optional properties with flexible ordering
             if optional:
@@ -292,13 +297,15 @@
                     if j == i:
                         opt_parts.append(prop_kv_pairs[optional[j]])
                     else:
-                        opt_parts.append(f' ( "," {prop_kv_pairs[optional[j]]} )?')
+                        opt_parts.append(
+                            f' ( "{key_value_separator}" {prop_kv_pairs[optional[j]]} )?'
+                        )
                 opt_alternatives.append("".join(opt_parts))

             # Wrap with appropriate comma handling based on whether we have required properties
             if required:
                 # Required properties exist, so optional group needs outer comma
-                rule_parts.append(' ( "," ( ')
+                rule_parts.append(f' ( "{key_value_separator}" ( ')
                 rule_parts.append(" | ".join(opt_alternatives))
                 rule_parts.append(" ) )?")
             else:
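The effect of the new parameter is easiest to see on the required-properties join: with the separator "\n" used by the GLM-4.5 detector, the generated fragment switches from comma-delimited to newline-delimited key-value pairs (rule names below are illustrative):

    prop_kv_pairs = {"city": "city_kv", "date": "date_kv"}  # illustrative rule names
    required = ["city", "date"]

    for key_value_separator in (",", "\\n"):
        print(f' "{key_value_separator}" '.join(prop_kv_pairs[k] for k in required))
    # city_kv "," date_kv
    # city_kv "\n" date_kv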
sglang/srt/function_call/function_call_parser.py
@@ -10,6 +10,7 @@ from sglang.srt.entrypoints.openai.protocol import (
 from sglang.srt.function_call.base_format_detector import BaseFormatDetector
 from sglang.srt.function_call.core_types import ToolCallItem
 from sglang.srt.function_call.deepseekv3_detector import DeepSeekV3Detector
+from sglang.srt.function_call.glm4_moe_detector import Glm4MoeDetector
 from sglang.srt.function_call.kimik2_detector import KimiK2Detector
 from sglang.srt.function_call.llama32_detector import Llama32Detector
 from sglang.srt.function_call.mistral_detector import MistralDetector
@@ -37,6 +38,7 @@ class FunctionCallParser:
         "pythonic": PythonicDetector,
         "kimi_k2": KimiK2Detector,
         "qwen3_coder": Qwen3CoderDetector,
+        "glm45": Glm4MoeDetector,
     }

     def __init__(self, tools: List[Tool], tool_call_parser: str):
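With this registration, the new detector is selectable by name through the constructor shown above (tools being the request's List[Tool]):

    from sglang.srt.function_call.function_call_parser import FunctionCallParser

    parser = FunctionCallParser(tools=tools, tool_call_parser="glm45")  # tools defined elsewhere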
sglang/srt/function_call/glm4_moe_detector.py (new file)
@@ -0,0 +1,164 @@
+import ast
+import json
+import logging
+import re
+from typing import List
+
+from sglang.srt.entrypoints.openai.protocol import Tool
+from sglang.srt.function_call.base_format_detector import BaseFormatDetector
+from sglang.srt.function_call.core_types import (
+    StreamingParseResult,
+    StructureInfo,
+    _GetInfoFunc,
+)
+from sglang.srt.function_call.ebnf_composer import EBNFComposer
+
+logger = logging.getLogger(__name__)
+
+
+def get_argument_type(func_name: str, arg_key: str, defined_tools: list):
+    name2tool = {tool.function.name: tool for tool in defined_tools}
+    if func_name not in name2tool:
+        return None
+    tool = name2tool[func_name]
+    if arg_key not in tool.function.parameters["properties"]:
+        return None
+    return tool.function.parameters["properties"][arg_key].get("type", None)
+
+
+def parse_arguments(json_value):
+    try:
+        try:
+            parsed_value = json.loads(json_value)
+        except:
+            parsed_value = ast.literal_eval(json_value)
+        return parsed_value, True
+    except:
+        return json_value, False
+
+
+class Glm4MoeDetector(BaseFormatDetector):
+    """
+    Detector for GLM-4.5 models.
+    Assumes function call format:
+    <tool_call>get_weather\n<arg_key>city</arg_key>\n<arg_value>北京</arg_value>\n<arg_key>date</arg_key>\n<arg_value>2024-06-27</arg_value>\n</tool_call>\n<tool_call>get_weather\n<arg_key>city</arg_key>\n<arg_value>上海</arg_value>\n<arg_key>date</arg_key>\n<arg_value>2024-06-27</arg_value>\n</tool_call>
+    """
+
+    def __init__(self):
+        super().__init__()
+        self.bot_token = "<tool_call>"
+        self.eot_token = "</tool_call>"
+        self.func_call_regex = r"<tool_call>.*?</tool_call>"
+        self.func_detail_regex = r"<tool_call>([^\n]*)\n(.*)</tool_call>"
+        self.func_arg_regex = r"<arg_key>(.*?)</arg_key>\s*<arg_value>(.*?)</arg_value>"
+
+    def has_tool_call(self, text: str) -> bool:
+        """Check if the text contains a glm-4.5 format tool call."""
+        return self.bot_token in text
+
+    def detect_and_parse(self, text: str, tools: List[Tool]) -> StreamingParseResult:
+        """
+        One-time parsing: Detects and parses tool calls in the provided text.
+
+        :param text: The complete text to parse.
+        :param tools: List of available tools.
+        :return: ParseResult indicating success or failure, consumed text, leftover text, and parsed calls.
+        """
+        idx = text.find(self.bot_token)
+        normal_text = text[:idx].strip() if idx != -1 else text
+        if self.bot_token not in text:
+            return StreamingParseResult(normal_text=normal_text, calls=[])
+        match_result_list = re.findall(self.func_call_regex, text, re.DOTALL)
+        calls = []
+        try:
+            for match_result in match_result_list:
+                # Get function name
+                func_detail = re.search(self.func_detail_regex, match_result, re.DOTALL)
+                func_name = func_detail.group(1)
+                func_args = func_detail.group(2)
+                pairs = re.findall(
+                    r"<arg_key>(.*?)</arg_key>\s*<arg_value>(.*?)</arg_value>",
+                    func_args,
+                    re.DOTALL,
+                )
+                arguments = {}
+                for arg_key, arg_value in pairs:
+                    arg_key = arg_key.strip()
+                    arg_value = arg_value.strip()
+                    arg_type = get_argument_type(func_name, arg_key, tools)
+                    if arg_type != "string":
+                        arg_value, is_good_json = parse_arguments(arg_value)
+                    arguments[arg_key] = arg_value
+                # construct match_result for parse_base_json
+                match_result = {"name": func_name, "parameters": arguments}
+                calls.extend(self.parse_base_json(match_result, tools))
+            return StreamingParseResult(normal_text=normal_text, calls=calls)
+        except Exception as e:
+            logger.error(f"Error in detect_and_parse: {e}")
+            # return the normal text if parsing fails
+            return StreamingParseResult(normal_text=text)
+
+    def parse_streaming_increment(
+        self, new_text: str, tools: List[Tool]
+    ) -> StreamingParseResult:
+        """
+        Streaming incremental parsing tool calls for GLM-4.5 format.
+        """
+        self._buffer += new_text
+        current_text = self._buffer
+
+        start = current_text.find(self.bot_token)
+        if start == -1:
+            self._buffer = ""
+            if self.current_tool_id > 0:
+                current_text = ""
+            return StreamingParseResult(normal_text=current_text)
+        # find ensures we find the first self.eot_token so there will be at most one tool_call in current_text[:end+len(self.eot_token)
+        end = current_text.find(self.eot_token)
+        if end != -1:
+            # Initialize state if this is the first tool call
+            if self.current_tool_id == -1:
+                self.current_tool_id = 0
+                self.prev_tool_call_arr = []
+                self.streamed_args_for_tool = [""]
+            # Ensure we have enough entries in our tracking arrays
+            while len(self.prev_tool_call_arr) <= self.current_tool_id:
+                self.prev_tool_call_arr.append({})
+            while len(self.streamed_args_for_tool) <= self.current_tool_id:
+                self.streamed_args_for_tool.append("")
+            result = self.detect_and_parse(
+                current_text[: end + len(self.eot_token)], tools=tools
+            )
+            if result.calls:
+                self.prev_tool_call_arr[self.current_tool_id] = {
+                    "name": result.calls[0].name,
+                    "arguments": json.loads(result.calls[0].parameters),
+                }
+                self.streamed_args_for_tool[self.current_tool_id] = result.calls[
+                    0
+                ].parameters
+                result.calls[0].tool_index = self.current_tool_id
+                self.current_tool_id += 1
+            self._buffer = current_text[end + len(self.eot_token) :]
+            return result
+        normal_text = current_text[:start]
+        self._buffer = current_text[start:]
+        return StreamingParseResult(normal_text=normal_text)
+
+    def supports_structural_tag(self) -> bool:
+        return False
+
+    def structure_info(self) -> _GetInfoFunc:
+        raise NotImplementedError()
+
+    def build_ebnf(self, tools: List[Tool]):
+        return EBNFComposer.build_ebnf(
+            tools,
+            individual_call_start_token=self.bot_token,
+            individual_call_end_token=self.eot_token,
+            tool_call_separator="\\n",
+            function_format="xml",
+            call_rule_fmt='"{name}" "\\n" {arguments_rule} "\\n"',
+            key_value_rule_fmt='"<arg_key>{key}</arg_key>" "\\n" "<arg_value>" {valrule} "</arg_value>"',
+            key_value_separator="\\n",
+        )
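A quick usage sketch of the detector on a complete single-call output; the sample text follows the format documented in the class docstring:

    from sglang.srt.function_call.glm4_moe_detector import Glm4MoeDetector

    detector = Glm4MoeDetector()
    text = (
        "<tool_call>get_weather\n"
        "<arg_key>city</arg_key>\n<arg_value>Beijing</arg_value>\n"
        "</tool_call>"
    )
    print(detector.has_tool_call(text))  # True
    # detect_and_parse(text, tools) would additionally return the parsed call,
    # typing each argument against the supplied Tool definitions.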
sglang/srt/function_call/qwen3_coder_detector.py
@@ -148,4 +148,5 @@ class Qwen3CoderDetector(BaseFormatDetector):
             function_format="xml",
             call_rule_fmt='"<function={name}>\\n" {arguments_rule} "\\n</function>"',
             key_value_rule_fmt='"<parameter={key}>\\n" {valrule} "\\n</parameter>"',
+            key_value_separator="\\n",
         )