sglang 0.5.0rc0__py3-none-any.whl → 0.5.0rc1__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- sglang/__init__.py +8 -3
- sglang/bench_one_batch.py +6 -0
- sglang/lang/chat_template.py +18 -0
- sglang/srt/bench_utils.py +137 -0
- sglang/srt/configs/model_config.py +7 -7
- sglang/srt/disaggregation/decode.py +8 -3
- sglang/srt/disaggregation/mooncake/conn.py +43 -25
- sglang/srt/disaggregation/mooncake/transfer_engine.py +29 -0
- sglang/srt/distributed/parallel_state.py +4 -2
- sglang/srt/entrypoints/context.py +3 -20
- sglang/srt/entrypoints/engine.py +13 -8
- sglang/srt/entrypoints/harmony_utils.py +2 -0
- sglang/srt/entrypoints/http_server.py +4 -5
- sglang/srt/entrypoints/openai/protocol.py +0 -9
- sglang/srt/entrypoints/openai/serving_chat.py +59 -265
- sglang/srt/entrypoints/openai/tool_server.py +4 -3
- sglang/srt/function_call/ebnf_composer.py +1 -0
- sglang/srt/function_call/function_call_parser.py +2 -0
- sglang/srt/function_call/glm4_moe_detector.py +1 -1
- sglang/srt/function_call/gpt_oss_detector.py +331 -0
- sglang/srt/function_call/kimik2_detector.py +3 -3
- sglang/srt/function_call/qwen3_coder_detector.py +219 -9
- sglang/srt/jinja_template_utils.py +6 -0
- sglang/srt/layers/attention/aiter_backend.py +370 -107
- sglang/srt/layers/attention/ascend_backend.py +3 -0
- sglang/srt/layers/attention/dual_chunk_flashattention_backend.py +1 -1
- sglang/srt/layers/attention/flashattention_backend.py +18 -0
- sglang/srt/layers/attention/flashinfer_backend.py +52 -13
- sglang/srt/layers/attention/hybrid_attn_backend.py +1 -1
- sglang/srt/layers/attention/trtllm_mla_backend.py +119 -22
- sglang/srt/layers/attention/vision.py +9 -1
- sglang/srt/layers/attention/wave_backend.py +627 -0
- sglang/srt/layers/attention/wave_ops/decode_attention.py +186 -0
- sglang/srt/layers/attention/wave_ops/extend_attention.py +149 -0
- sglang/srt/layers/attention/wave_ops/prefill_attention.py +79 -0
- sglang/srt/layers/communicator.py +8 -10
- sglang/srt/layers/flashinfer_comm_fusion.py +4 -4
- sglang/srt/layers/linear.py +1 -0
- sglang/srt/layers/moe/cutlass_moe.py +11 -16
- sglang/srt/layers/moe/cutlass_w4a8_moe.py +4 -5
- sglang/srt/layers/moe/ep_moe/kernels.py +43 -0
- sglang/srt/layers/moe/ep_moe/layer.py +60 -2
- sglang/srt/layers/moe/fused_moe_triton/configs/triton_3_4_0/E=128,N=768,device_name=NVIDIA_B200,dtype=fp8_w8a8,block_shape=[128, 128].json +146 -0
- sglang/srt/layers/moe/fused_moe_triton/configs/triton_3_4_0/E=384,N=256,device_name=NVIDIA_B200,dtype=fp8_w8a8,block_shape=[128, 128].json +146 -0
- sglang/srt/layers/moe/fused_moe_triton/layer.py +7 -9
- sglang/srt/layers/moe/token_dispatcher/deepep.py +61 -24
- sglang/srt/layers/moe/topk.py +4 -1
- sglang/srt/layers/quantization/__init__.py +5 -3
- sglang/srt/layers/quantization/fp8_kernel.py +277 -0
- sglang/srt/layers/quantization/fp8_utils.py +22 -10
- sglang/srt/layers/quantization/modelopt_quant.py +6 -11
- sglang/srt/layers/quantization/mxfp4.py +4 -1
- sglang/srt/layers/quantization/w4afp8.py +20 -11
- sglang/srt/layers/quantization/w8a8_int8.py +48 -34
- sglang/srt/layers/rotary_embedding.py +281 -2
- sglang/srt/lora/backend/base_backend.py +3 -23
- sglang/srt/lora/layers.py +60 -114
- sglang/srt/lora/lora.py +17 -62
- sglang/srt/lora/lora_manager.py +12 -48
- sglang/srt/lora/lora_registry.py +20 -9
- sglang/srt/lora/mem_pool.py +20 -63
- sglang/srt/lora/triton_ops/qkv_lora_b.py +1 -1
- sglang/srt/lora/utils.py +25 -58
- sglang/srt/managers/cache_controller.py +21 -29
- sglang/srt/managers/detokenizer_manager.py +1 -1
- sglang/srt/managers/io_struct.py +6 -6
- sglang/srt/managers/mm_utils.py +1 -2
- sglang/srt/managers/multimodal_processor.py +1 -1
- sglang/srt/managers/schedule_batch.py +35 -20
- sglang/srt/managers/schedule_policy.py +6 -6
- sglang/srt/managers/scheduler.py +15 -7
- sglang/srt/managers/scheduler_profiler_mixin.py +28 -8
- sglang/srt/managers/tokenizer_manager.py +25 -26
- sglang/srt/mem_cache/allocator.py +61 -87
- sglang/srt/mem_cache/hicache_storage.py +1 -1
- sglang/srt/mem_cache/hiradix_cache.py +34 -24
- sglang/srt/mem_cache/lora_radix_cache.py +421 -0
- sglang/srt/mem_cache/memory_pool_host.py +33 -35
- sglang/srt/mem_cache/radix_cache.py +2 -5
- sglang/srt/mem_cache/storage/hf3fs/mini_3fs_metadata_server.py +443 -0
- sglang/srt/mem_cache/storage/hf3fs/storage_hf3fs.py +139 -67
- sglang/srt/mem_cache/storage/mooncake_store/mooncake_store.py +6 -9
- sglang/srt/model_executor/cuda_graph_runner.py +22 -3
- sglang/srt/model_executor/forward_batch_info.py +26 -5
- sglang/srt/model_executor/model_runner.py +129 -35
- sglang/srt/model_loader/loader.py +18 -6
- sglang/srt/models/deepseek_v2.py +74 -35
- sglang/srt/models/gemma2.py +0 -34
- sglang/srt/models/gemma3n_mm.py +8 -9
- sglang/srt/models/glm4.py +6 -0
- sglang/srt/models/glm4_moe.py +9 -9
- sglang/srt/models/glm4v.py +589 -0
- sglang/srt/models/glm4v_moe.py +400 -0
- sglang/srt/models/gpt_oss.py +136 -19
- sglang/srt/models/granite.py +0 -25
- sglang/srt/models/llama.py +0 -25
- sglang/srt/models/llama4.py +1 -1
- sglang/srt/models/qwen2_5_vl.py +7 -3
- sglang/srt/models/qwen2_audio.py +10 -9
- sglang/srt/models/qwen3.py +0 -24
- sglang/srt/models/registry.py +1 -1
- sglang/srt/models/torch_native_llama.py +0 -24
- sglang/srt/multimodal/processors/base_processor.py +23 -13
- sglang/srt/multimodal/processors/glm4v.py +132 -0
- sglang/srt/multimodal/processors/qwen_audio.py +4 -2
- sglang/srt/reasoning_parser.py +316 -0
- sglang/srt/server_args.py +115 -139
- sglang/srt/speculative/eagle_worker.py +16 -0
- sglang/srt/two_batch_overlap.py +12 -4
- sglang/srt/utils.py +3 -3
- sglang/srt/weight_sync/tensor_bucket.py +106 -0
- sglang/test/attention/test_trtllm_mla_backend.py +186 -36
- sglang/test/doc_patch.py +59 -0
- sglang/test/few_shot_gsm8k.py +1 -1
- sglang/test/few_shot_gsm8k_engine.py +1 -1
- sglang/test/run_eval.py +4 -1
- sglang/test/simple_eval_common.py +6 -0
- sglang/test/simple_eval_gpqa.py +2 -0
- sglang/test/test_fp4_moe.py +118 -36
- sglang/utils.py +1 -1
- sglang/version.py +1 -1
- {sglang-0.5.0rc0.dist-info → sglang-0.5.0rc1.dist-info}/METADATA +26 -30
- {sglang-0.5.0rc0.dist-info → sglang-0.5.0rc1.dist-info}/RECORD +127 -115
- sglang/lang/backend/__init__.py +0 -0
- sglang/srt/function_call/harmony_tool_parser.py +0 -130
- sglang/srt/lora/backend/flashinfer_backend.py +0 -131
- /sglang/{api.py → lang/api.py} +0 -0
- {sglang-0.5.0rc0.dist-info → sglang-0.5.0rc1.dist-info}/WHEEL +0 -0
- {sglang-0.5.0rc0.dist-info → sglang-0.5.0rc1.dist-info}/licenses/LICENSE +0 -0
- {sglang-0.5.0rc0.dist-info → sglang-0.5.0rc1.dist-info}/top_level.txt +0 -0

sglang/srt/entrypoints/openai/serving_chat.py

@@ -7,18 +7,8 @@ from typing import Any, AsyncGenerator, Dict, List, Optional, Union
 
 from fastapi import Request
 from fastapi.responses import ORJSONResponse, StreamingResponse
-from openai_harmony import Message as OpenAIMessage
 
 from sglang.srt.conversation import generate_chat_conv
-from sglang.srt.entrypoints.harmony_utils import (
-    get_developer_message,
-    get_stop_tokens_for_assistant_actions,
-    get_streamable_parser_for_assistant,
-    get_system_message,
-    parse_chat_input,
-    parse_output_into_messages,
-    render_for_completion,
-)
 from sglang.srt.entrypoints.openai.protocol import (
     ChatCompletionRequest,
     ChatCompletionResponse,
@@ -57,30 +47,12 @@ class OpenAIServingChat(OpenAIServingBase):
     """Handler for /v1/chat/completions requests"""
 
     def __init__(
-        self, tokenizer_manager: TokenizerManager, template_manager: TemplateManager
+        self,
+        tokenizer_manager: TokenizerManager,
+        template_manager: TemplateManager,
     ):
         super().__init__(tokenizer_manager)
         self.template_manager = template_manager
-        self.use_harmony = (
-            self.tokenizer_manager.model_config.hf_config.model_type == "gpt_oss"
-        )
-
-        if self.use_harmony:
-            from sglang.srt.function_call.harmony_tool_parser import (
-                HarmonyToolCallParser,
-            )
-
-            self.harmony_tool_parser = HarmonyToolCallParser()
-
-        # NOTE While OpenAI's chat completion API supports browsing
-        # for some models, currently vLLM doesn't support it. Please use the
-        # Responses API instead.
-        self.supports_browsing = False
-        self.browser_tool = None
-        # NOTE: Chat completion API does not support code interpreter.
-        # Please use the Responses API instead.
-        self.supports_code_interpreter = False
-        self.python_tool = None
 
     def _request_id_prefix(self) -> str:
         return "chatcmpl-"
@@ -97,6 +69,18 @@ class OpenAIServingChat(OpenAIServingBase):
         ):
             return "Tools cannot be empty if tool choice is set to required."
 
+        max_output_tokens = request.max_completion_tokens or request.max_tokens
+        server_context_length = self.tokenizer_manager.server_args.context_length
+        if (
+            max_output_tokens
+            and server_context_length
+            and max_output_tokens > server_context_length
+        ):
+            return (
+                f"max_completion_tokens is too large: {max_output_tokens}."
+                f"This model supports at most {server_context_length} completion tokens."
+            )
+
         return None
 
     def _convert_to_internal_request(
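
The added check rejects requests whose completion budget (max_completion_tokens, falling back to max_tokens) exceeds the server's configured context length. A rough client-side sketch of how the rejection surfaces, assuming a local sglang server exposing the OpenAI-compatible API on port 30000 and the openai Python SDK; the base URL, API key, and model name are placeholders, not part of this diff.

# Hypothetical client illustration: a deliberately oversized completion budget
# should now be rejected with the validation message added in the hunk above.
from openai import OpenAI

client = OpenAI(base_url="http://localhost:30000/v1", api_key="EMPTY")

try:
    client.chat.completions.create(
        model="default",
        messages=[{"role": "user", "content": "Hello"}],
        max_completion_tokens=10_000_000,  # far larger than any context window
    )
except Exception as err:  # the server answers with a validation error
    print(err)
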
@@ -107,66 +91,43 @@ class OpenAIServingChat(OpenAIServingBase):
         is_multimodal = self.tokenizer_manager.model_config.is_multimodal
 
         # Process messages and apply chat template
-
-            processed_messages = self._process_messages(request, is_multimodal)
-
-            # Build sampling parameters
-            sampling_params = self._build_sampling_params(
-                request,
-                processed_messages.stop,
-                processed_messages.tool_call_constraint,
-            )
+        processed_messages = self._process_messages(request, is_multimodal)
 
-
-
-
-
-
-
-
-
-
-
-                **prompt_kwargs,
-                image_data=processed_messages.image_data,
-                video_data=processed_messages.video_data,
-                audio_data=processed_messages.audio_data,
-                sampling_params=sampling_params,
-                return_logprob=request.logprobs,
-                logprob_start_len=-1,
-                top_logprobs_num=request.top_logprobs or 0,
-                stream=request.stream,
-                return_text_in_logprobs=True,
-                modalities=processed_messages.modalities,
-                lora_path=request.lora_path,
-                bootstrap_host=request.bootstrap_host,
-                bootstrap_port=request.bootstrap_port,
-                bootstrap_room=request.bootstrap_room,
-                return_hidden_states=request.return_hidden_states,
-                rid=request.rid,
-            )
+        # Build sampling parameters
+        sampling_params = self._build_sampling_params(
+            request,
+            processed_messages.stop,
+            processed_messages.tool_call_constraint,
+        )
+
+        # Handle single vs multiple requests
+        if is_multimodal:
+            prompt_kwargs = {"text": processed_messages.prompt}
         else:
-            processed_messages,
-
-
-            input_ids
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
+            if isinstance(processed_messages.prompt_ids, str):
+                prompt_kwargs = {"text": processed_messages.prompt_ids}
+            else:
+                prompt_kwargs = {"input_ids": processed_messages.prompt_ids}
+
+        adapted_request = GenerateReqInput(
+            **prompt_kwargs,
+            image_data=processed_messages.image_data,
+            video_data=processed_messages.video_data,
+            audio_data=processed_messages.audio_data,
+            sampling_params=sampling_params,
+            return_logprob=request.logprobs,
+            logprob_start_len=-1,
+            top_logprobs_num=request.top_logprobs or 0,
+            stream=request.stream,
+            return_text_in_logprobs=True,
+            modalities=processed_messages.modalities,
+            lora_path=request.lora_path,
+            bootstrap_host=request.bootstrap_host,
+            bootstrap_port=request.bootstrap_port,
+            bootstrap_room=request.bootstrap_room,
+            return_hidden_states=request.return_hidden_states,
+            rid=request.rid,
+        )
 
         return adapted_request, request
 
@@ -251,14 +212,15 @@ class OpenAIServingChat(OpenAIServingBase):
                 tokenize=True,
                 add_generation_prompt=True,
                 tools=tools,
+                reasoning_effort=request.reasoning_effort,
                 **(
                     request.chat_template_kwargs if request.chat_template_kwargs else {}
                 ),
            )
        except Exception:
-            #
-            #
-            #
+            # This except branch will be triggered when the chosen model
+            # has a different tools input format that is not compatible
+            # with openAI's apply_chat_template tool_call format, like Mistral.
            tools = (
                [t if "function" in t else {"function": t} for t in tools]
                if tools
@@ -269,6 +231,7 @@ class OpenAIServingChat(OpenAIServingBase):
                 tokenize=True,
                 add_generation_prompt=True,
                 tools=tools,
+                reasoning_effort=request.reasoning_effort,
                 **(
                     request.chat_template_kwargs if request.chat_template_kwargs else {}
                 ),
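
Both apply_chat_template call sites now forward reasoning_effort from the request, so chat templates that read the field (such as gpt-oss-style templates) can adjust the rendered prompt. A hedged client-side sketch using the openai SDK; the field is sent via extra_body so it does not depend on the SDK version, and the endpoint and model name are placeholders.

from openai import OpenAI

client = OpenAI(base_url="http://localhost:30000/v1", api_key="EMPTY")

resp = client.chat.completions.create(
    model="default",
    messages=[{"role": "user", "content": "Summarize the plot of Hamlet."}],
    # Forwarded by the server into tokenizer.apply_chat_template(..., reasoning_effort=...)
    extra_body={"reasoning_effort": "low"},
)
print(resp.choices[0].message.content)
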
@@ -459,12 +422,6 @@ class OpenAIServingChat(OpenAIServingBase):
         cached_tokens = {}
         hidden_states = {}
 
-        # Harmony tracking
-        if self.use_harmony:
-            harmony_parsers = [
-                get_streamable_parser_for_assistant() for _ in range(request.n)
-            ]
-
         try:
             async for content in self.tokenizer_manager.generate_request(
                 adapted_request, raw_request
@@ -511,58 +468,14 @@ class OpenAIServingChat(OpenAIServingBase):
                     )
                     yield f"data: {chunk.model_dump_json()}\n\n"
 
-
-
-
-
-                    new_token_ids = content["output_ids"]
-                    for token_id in new_token_ids:
-                        harmony_parser.process(token_id)
-
-                    is_final = harmony_parser.current_channel == "final"
-                    is_analysis = harmony_parser.current_channel == "analysis"
-                    delta = harmony_parser.last_content_delta or ""
-
-                    if is_analysis:
-                        choice_data = ChatCompletionResponseStreamChoice(
-                            index=index,
-                            delta=DeltaMessage(reasoning_content=delta),
-                            finish_reason=None,
-                        )
-                        chunk = ChatCompletionStreamResponse(
-                            id=content["meta_info"]["id"],
-                            created=int(time.time()),
-                            choices=[choice_data],
-                            model=request.model,
-                        )
-                        yield f"data: {chunk.model_dump_json()}\n\n"
-                        continue
-
-                    choice_data = ChatCompletionResponseStreamChoice(
-                        index=index,
-                        delta=DeltaMessage(content=delta if delta else None),
-                        finish_reason=None,
-                        matched_stop=None,
-                        logprobs=choice_logprobs,
-                    )
-                    chunk = ChatCompletionStreamResponse(
-                        id=content["meta_info"]["id"],
-                        created=int(time.time()),
-                        choices=[choice_data],
-                        model=request.model,
-                    )
-                    yield f"data: {chunk.model_dump_json()}\n\n"
-                    continue
-                else:
-                    stream_buffer = stream_buffers.get(index, "")
-                    delta = content["text"][len(stream_buffer) :]
-                    stream_buffers[index] = stream_buffer + delta
+                stream_buffer = stream_buffers.get(index, "")
+                delta = content["text"][len(stream_buffer) :]
+                stream_buffers[index] = stream_buffer + delta
 
                 # Handle reasoning content
                 if (
                     self.tokenizer_manager.server_args.reasoning_parser
                     and request.separate_reasoning
-                    and not self.use_harmony
                 ):
                     reasoning_text, delta = self._process_reasoning_stream(
                         index, delta, reasoning_parser_dict, content, request
@@ -581,27 +494,8 @@ class OpenAIServingChat(OpenAIServingBase):
                         )
                         yield f"data: {chunk.model_dump_json()}\n\n"
 
-                if self.use_harmony and not is_final:
-                    choice_data = ChatCompletionResponseStreamChoice(
-                        index=index,
-                        delta=DeltaMessage(reasoning_content=delta),
-                        finish_reason=None,
-                    )
-                    chunk = ChatCompletionStreamResponse(
-                        id=content["meta_info"]["id"],
-                        created=int(time.time()),
-                        choices=[choice_data],
-                        model=request.model,
-                    )
-                    yield f"data: {chunk.model_dump_json()}\n\n"
-
                 # Handle tool calls
-
-                if (
-                    request.tool_choice != "none"
-                    and request.tools
-                    and not self.use_harmony
-                ):
+                if request.tool_choice != "none" and request.tools:
                     async for chunk in self._process_tool_call_stream(
                         index,
                         delta,
@@ -765,76 +659,6 @@ class OpenAIServingChat(OpenAIServingBase):
 
             finish_reason = ret_item["meta_info"]["finish_reason"]
             text = ret_item["text"]
-            output_ids = ret_item["output_ids"]
-
-            if self.use_harmony:
-                parser = parse_output_into_messages(output_ids)
-                output_msgs = parser.messages
-                if len(output_msgs) == 0:
-                    # The generation has stopped during reasoning.
-                    is_tool_call = False
-                    reasoning_content = parser.current_content
-                    final_content = None
-                elif len(output_msgs) == 1:
-                    # The generation has stopped during final message.
-                    is_tool_call = False
-                    reasoning_content = output_msgs[0].content[0].text
-                    final_content = parser.current_content
-                else:
-                    if len(output_msgs) != 2:
-                        raise ValueError(
-                            "Expected 2 output messages (reasoning and final), "
-                            f"but got {len(output_msgs)}."
-                        )
-                    reasoning_msg, final_msg = output_msgs
-                    reasoning_content = reasoning_msg.content[0].text
-                    final_content = final_msg.content[0].text
-                    is_tool_call = final_msg.recipient is not None
-
-                if is_tool_call:
-                    # Extract tool call information from final message
-                    tool_call = (
-                        self.harmony_tool_parser.extract_tool_calls_from_message(
-                            final_msg
-                        )
-                    )
-                    tool_calls = [tool_call] if tool_call else []
-
-                    message = ChatMessage(
-                        role="assistant",
-                        reasoning_content=reasoning_content,
-                        content=None,  # Tool calls don't have regular content
-                        tool_calls=tool_calls,
-                    )
-                else:
-                    # Normal message
-                    message = ChatMessage(
-                        role="assistant",
-                        reasoning_content=reasoning_content,
-                        content=final_content,
-                    )
-
-                if is_tool_call:
-                    finish_reason_type = "tool_calls"
-                elif finish_reason:
-                    finish_reason_type = (
-                        finish_reason["type"] if finish_reason else "stop"
-                    )
-                else:
-                    finish_reason_type = "stop"
-                choice_data = ChatCompletionResponseChoice(
-                    index=idx,
-                    message=message,
-                    logprobs=choice_logprobs,
-                    finish_reason=finish_reason_type,
-                    matched_stop=(
-                        finish_reason["matched"]
-                        if finish_reason and "matched" in finish_reason
-                        else None
-                    ),
-                )
-                choices.append(choice_data)
-                continue
 
             # Handle reasoning content
             reasoning_text = None
@@ -1184,33 +1008,3 @@ class OpenAIServingChat(OpenAIServingBase):
             return f"data: {chunk.model_dump_json()}\n\n"
 
         return None
-
-    def _make_request_with_harmony(
-        self,
-        request: ChatCompletionRequest,
-    ):
-        messages: list[OpenAIMessage] = []
-
-        # Add system message.
-        # In Chat Completion API, browsing is enabled by default if the model
-        # supports it.
-        assert not self.supports_browsing
-        assert not self.supports_code_interpreter
-        sys_msg = get_system_message(
-            reasoning_effort=request.reasoning_effort,
-            browser_description=None,
-            python_description=None,
-        )
-        messages.append(sys_msg)
-
-        # Add developer message.
-        dev_msg = get_developer_message()
-        messages.append(dev_msg)
-
-        # Add user message.
-        for chat_msg in request.messages:
-            messages.append(parse_chat_input(chat_msg))
-
-        # Render prompt token ids.
-        prompt_token_ids = render_for_completion(messages)
-        return messages, prompt_token_ids

sglang/srt/entrypoints/openai/tool_server.py

@@ -5,16 +5,17 @@ from abc import ABC, abstractmethod
 from contextlib import AbstractAsyncContextManager, asynccontextmanager
 from typing import Any
 
-logger = logging.getLogger(__name__)
 try:
     from mcp import ClientSession
     from mcp.client.sse import sse_client
     from mcp.types import ListToolsResult
-except ImportError:
-
+except ImportError as e:
+    ClientSession = sse_client = ListToolsResult = e
 
 from openai_harmony import ToolDescription, ToolNamespaceConfig
 
+logger = logging.getLogger(__name__)
+
 
 async def list_server_and_tools(server_url: str):
 
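
The rewritten except block binds the missing names to the caught exception, so an absent mcp package only fails when ClientSession, sse_client, or ListToolsResult is actually used, not at import time. A standalone sketch of the same deferred-failure pattern; the module and names below are illustrative placeholders, not sglang code.

try:
    from some_optional_dep import Client  # placeholder for an optional dependency such as mcp
except ImportError as e:
    Client = e  # defer the failure until the optional feature is exercised


def connect():
    # At the point of use, surface the original import error with context.
    if isinstance(Client, ImportError):
        raise RuntimeError("optional dependency is not installed") from Client
    return Client()
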

sglang/srt/function_call/ebnf_composer.py

@@ -316,6 +316,7 @@ class EBNFComposer:
 
         combined_args = "".join(rule_parts)
         arguments_rule = args_template.format(arg_rules=combined_args)
+        arguments_rule = arguments_rule or '""'
 
         # Add the function call rule and its arguments rule
         ebnf_lines.append(

sglang/srt/function_call/function_call_parser.py

@@ -11,6 +11,7 @@ from sglang.srt.function_call.base_format_detector import BaseFormatDetector
 from sglang.srt.function_call.core_types import ToolCallItem
 from sglang.srt.function_call.deepseekv3_detector import DeepSeekV3Detector
 from sglang.srt.function_call.glm4_moe_detector import Glm4MoeDetector
+from sglang.srt.function_call.gpt_oss_detector import GptOssDetector
 from sglang.srt.function_call.kimik2_detector import KimiK2Detector
 from sglang.srt.function_call.llama32_detector import Llama32Detector
 from sglang.srt.function_call.mistral_detector import MistralDetector
@@ -41,6 +42,7 @@ class FunctionCallParser:
         "qwen3_coder": Qwen3CoderDetector,
         "glm45": Glm4MoeDetector,
         "step3": Step3Detector,
+        "gpt-oss": GptOssDetector,
     }
 
     def __init__(self, tools: List[Tool], tool_call_parser: str):
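
With GptOssDetector registered under the "gpt-oss" key, the detector can be selected by the tool_call_parser name. A minimal sketch; it assumes the Tool and Function schema classes live in sglang.srt.entrypoints.openai.protocol (treat that import path, and the example tool itself, as assumptions rather than part of this diff).

from sglang.srt.entrypoints.openai.protocol import Function, Tool
from sglang.srt.function_call.function_call_parser import FunctionCallParser

tools = [
    Tool(
        type="function",
        function=Function(
            name="get_weather",
            description="Look up the current weather for a city.",
            parameters={
                "type": "object",
                "properties": {"city": {"type": "string"}},
                "required": ["city"],
            },
        ),
    )
]

# "gpt-oss" resolves to the newly registered GptOssDetector in the mapping shown above.
parser = FunctionCallParser(tools=tools, tool_call_parser="gpt-oss")
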

sglang/srt/function_call/glm4_moe_detector.py

@@ -158,7 +158,7 @@ class Glm4MoeDetector(BaseFormatDetector):
             individual_call_end_token=self.eot_token,
             tool_call_separator="\\n",
             function_format="xml",
-            call_rule_fmt='"{name}" "\\n" {arguments_rule} "\\n"',
+            call_rule_fmt='"{name}" "\\n" ( {arguments_rule} "\\n" )?',
             key_value_rule_fmt='"<arg_key>{key}</arg_key>" "\\n" "<arg_value>" {valrule} "</arg_value>"',
             key_value_separator="\\n",
         )