sglang 0.4.10.post1__py3-none-any.whl → 0.5.0rc0__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- sglang/bench_one_batch.py +113 -17
- sglang/compile_deep_gemm.py +8 -1
- sglang/global_config.py +5 -1
- sglang/srt/configs/model_config.py +35 -0
- sglang/srt/conversation.py +9 -117
- sglang/srt/disaggregation/base/conn.py +5 -2
- sglang/srt/disaggregation/decode.py +6 -1
- sglang/srt/disaggregation/decode_schedule_batch_mixin.py +4 -0
- sglang/srt/disaggregation/mooncake/conn.py +243 -135
- sglang/srt/disaggregation/prefill.py +3 -0
- sglang/srt/distributed/device_communicators/pynccl.py +7 -0
- sglang/srt/distributed/device_communicators/pynccl_allocator.py +133 -0
- sglang/srt/distributed/device_communicators/pynccl_wrapper.py +42 -3
- sglang/srt/distributed/parallel_state.py +22 -9
- sglang/srt/entrypoints/context.py +244 -0
- sglang/srt/entrypoints/engine.py +8 -5
- sglang/srt/entrypoints/harmony_utils.py +370 -0
- sglang/srt/entrypoints/http_server.py +106 -15
- sglang/srt/entrypoints/openai/protocol.py +227 -1
- sglang/srt/entrypoints/openai/serving_chat.py +278 -42
- sglang/srt/entrypoints/openai/serving_responses.py +1273 -0
- sglang/srt/entrypoints/openai/tool_server.py +174 -0
- sglang/srt/entrypoints/tool.py +87 -0
- sglang/srt/eplb/expert_distribution.py +4 -2
- sglang/srt/eplb/expert_location.py +5 -1
- sglang/srt/function_call/harmony_tool_parser.py +130 -0
- sglang/srt/hf_transformers_utils.py +55 -13
- sglang/srt/jinja_template_utils.py +8 -1
- sglang/srt/layers/attention/aiter_backend.py +5 -8
- sglang/srt/layers/attention/cutlass_mla_backend.py +3 -3
- sglang/srt/layers/attention/dual_chunk_flashattention_backend.py +1700 -0
- sglang/srt/layers/attention/flashattention_backend.py +7 -11
- sglang/srt/layers/attention/triton_backend.py +85 -14
- sglang/srt/layers/attention/triton_ops/decode_attention.py +17 -0
- sglang/srt/layers/attention/triton_ops/extend_attention.py +143 -98
- sglang/srt/layers/attention/trtllm_mha_backend.py +332 -0
- sglang/srt/layers/attention/trtllm_mla_backend.py +6 -6
- sglang/srt/layers/attention/vision.py +40 -15
- sglang/srt/layers/communicator.py +35 -8
- sglang/srt/layers/dp_attention.py +12 -0
- sglang/srt/layers/linear.py +9 -8
- sglang/srt/layers/logits_processor.py +9 -1
- sglang/srt/layers/moe/cutlass_moe.py +20 -6
- sglang/srt/layers/moe/ep_moe/layer.py +87 -107
- sglang/srt/layers/moe/fused_moe_triton/configs/triton_3_3_1/E=128,N=352,device_name=NVIDIA_RTX_6000_Ada_Generation,dtype=fp8_w8a8.json +146 -0
- sglang/srt/layers/moe/fused_moe_triton/fused_moe.py +101 -12
- sglang/srt/layers/moe/fused_moe_triton/layer.py +442 -58
- sglang/srt/layers/moe/fused_moe_triton/triton_kernels_moe.py +169 -15
- sglang/srt/layers/moe/token_dispatcher/__init__.py +23 -0
- sglang/srt/layers/moe/token_dispatcher/base_dispatcher.py +12 -1
- sglang/srt/layers/moe/{ep_moe/token_dispatcher.py → token_dispatcher/deepep.py} +8 -15
- sglang/srt/layers/moe/topk.py +12 -3
- sglang/srt/layers/moe/utils.py +59 -0
- sglang/srt/layers/quantization/__init__.py +22 -0
- sglang/srt/layers/quantization/compressed_tensors/compressed_tensors_moe.py +3 -2
- sglang/srt/layers/quantization/deep_gemm_wrapper/compile_utils.py +1 -1
- sglang/srt/layers/quantization/fp4.py +557 -0
- sglang/srt/layers/quantization/fp8.py +8 -7
- sglang/srt/layers/quantization/fp8_kernel.py +0 -4
- sglang/srt/layers/quantization/fp8_utils.py +29 -0
- sglang/srt/layers/quantization/modelopt_quant.py +259 -64
- sglang/srt/layers/quantization/mxfp4.py +651 -0
- sglang/srt/layers/quantization/mxfp4_tensor.py +133 -0
- sglang/srt/layers/quantization/quark/__init__.py +0 -0
- sglang/srt/layers/quantization/quark/schemes/__init__.py +6 -0
- sglang/srt/layers/quantization/quark/schemes/quark_scheme.py +55 -0
- sglang/srt/layers/quantization/quark/schemes/quark_w4a4_mxfp4.py +118 -0
- sglang/srt/layers/quantization/quark/utils.py +107 -0
- sglang/srt/layers/quantization/unquant.py +60 -6
- sglang/srt/layers/quantization/w4afp8.py +1 -1
- sglang/srt/layers/rotary_embedding.py +225 -1
- sglang/srt/layers/utils.py +9 -0
- sglang/srt/layers/vocab_parallel_embedding.py +15 -4
- sglang/srt/lora/lora_manager.py +70 -14
- sglang/srt/lora/lora_registry.py +10 -2
- sglang/srt/lora/mem_pool.py +43 -5
- sglang/srt/managers/cache_controller.py +61 -32
- sglang/srt/managers/data_parallel_controller.py +52 -2
- sglang/srt/managers/detokenizer_manager.py +1 -1
- sglang/srt/managers/io_struct.py +21 -4
- sglang/srt/managers/mm_utils.py +5 -11
- sglang/srt/managers/schedule_batch.py +30 -8
- sglang/srt/managers/schedule_policy.py +3 -1
- sglang/srt/managers/scheduler.py +170 -18
- sglang/srt/managers/scheduler_output_processor_mixin.py +1 -2
- sglang/srt/managers/scheduler_recv_skipper.py +37 -0
- sglang/srt/managers/scheduler_update_weights_mixin.py +6 -0
- sglang/srt/managers/template_manager.py +59 -22
- sglang/srt/managers/tokenizer_manager.py +137 -67
- sglang/srt/managers/tp_worker.py +3 -0
- sglang/srt/managers/tp_worker_overlap_thread.py +3 -0
- sglang/srt/managers/utils.py +45 -1
- sglang/srt/mem_cache/cpp_radix_tree/radix_tree.py +182 -0
- sglang/srt/mem_cache/hicache_storage.py +13 -21
- sglang/srt/mem_cache/hiradix_cache.py +53 -5
- sglang/srt/mem_cache/memory_pool_host.py +1 -1
- sglang/srt/mem_cache/multimodal_cache.py +33 -13
- sglang/srt/mem_cache/radix_cache_cpp.py +229 -0
- sglang/srt/mem_cache/storage/hf3fs/client_hf3fs.py +2 -2
- sglang/srt/mem_cache/storage/hf3fs/hf3fs_utils.cpp +35 -0
- sglang/srt/model_executor/cuda_graph_runner.py +24 -9
- sglang/srt/model_executor/forward_batch_info.py +48 -17
- sglang/srt/model_executor/model_runner.py +24 -2
- sglang/srt/model_loader/weight_utils.py +10 -0
- sglang/srt/models/bailing_moe.py +425 -0
- sglang/srt/models/deepseek_v2.py +95 -50
- sglang/srt/models/ernie4.py +426 -0
- sglang/srt/models/ernie4_eagle.py +203 -0
- sglang/srt/models/gemma3n_mm.py +39 -0
- sglang/srt/models/glm4_moe.py +102 -27
- sglang/srt/models/gpt_oss.py +1134 -0
- sglang/srt/models/grok.py +3 -3
- sglang/srt/models/llama4.py +13 -2
- sglang/srt/models/mixtral.py +3 -3
- sglang/srt/models/mllama4.py +428 -19
- sglang/srt/models/qwen2.py +6 -0
- sglang/srt/models/qwen2_moe.py +7 -4
- sglang/srt/models/qwen3_moe.py +39 -14
- sglang/srt/models/step3_vl.py +10 -1
- sglang/srt/models/transformers.py +2 -5
- sglang/srt/multimodal/processors/base_processor.py +4 -3
- sglang/srt/multimodal/processors/gemma3n.py +0 -7
- sglang/srt/multimodal/processors/step3_vl.py +3 -1
- sglang/srt/operations_strategy.py +1 -1
- sglang/srt/reasoning_parser.py +18 -39
- sglang/srt/server_args.py +218 -23
- sglang/srt/speculative/eagle_draft_extend_cuda_graph_runner.py +18 -0
- sglang/srt/two_batch_overlap.py +163 -9
- sglang/srt/utils.py +41 -26
- sglang/srt/weight_sync/utils.py +1 -1
- sglang/test/runners.py +4 -4
- sglang/test/test_utils.py +4 -4
- sglang/version.py +1 -1
- {sglang-0.4.10.post1.dist-info → sglang-0.5.0rc0.dist-info}/METADATA +18 -15
- {sglang-0.4.10.post1.dist-info → sglang-0.5.0rc0.dist-info}/RECORD +143 -116
- /sglang/srt/mem_cache/{mooncake_store → storage/mooncake_store}/mooncake_store.py +0 -0
- /sglang/srt/mem_cache/{mooncake_store → storage/mooncake_store}/unit_test.py +0 -0
- /sglang/srt/mem_cache/{nixl → storage/nixl}/hicache_nixl.py +0 -0
- /sglang/srt/mem_cache/{nixl → storage/nixl}/nixl_utils.py +0 -0
- /sglang/srt/mem_cache/{nixl → storage/nixl}/test_hicache_nixl_storage.py +0 -0
- {sglang-0.4.10.post1.dist-info → sglang-0.5.0rc0.dist-info}/WHEEL +0 -0
- {sglang-0.4.10.post1.dist-info → sglang-0.5.0rc0.dist-info}/licenses/LICENSE +0 -0
- {sglang-0.4.10.post1.dist-info → sglang-0.5.0rc0.dist-info}/top_level.txt +0 -0
sglang/srt/entrypoints/openai/protocol.py

@@ -14,9 +14,18 @@
 """Pydantic models for OpenAI API protocol"""

 import time
+import uuid
 from dataclasses import dataclass
-from typing import Any, Dict, List, Optional, Union
+from typing import Any, Dict, List, Optional, TypeAlias, Union

+from openai.types.responses import (
+    ResponseFunctionToolCall,
+    ResponseInputItemParam,
+    ResponseOutputItem,
+    ResponseReasoningItem,
+)
+from openai.types.responses.response import ToolChoice
+from openai.types.responses.tool import Tool
 from pydantic import (
     BaseModel,
     Field,
@@ -84,6 +93,7 @@ class UsageInfo(BaseModel):
     completion_tokens: Optional[int] = 0
     # only used to return cached tokens when --enable-cache-report is set
     prompt_tokens_details: Optional[Dict[str, int]] = None
+    reasoning_tokens: Optional[int] = 0


 class StreamOptions(BaseModel):
@@ -428,6 +438,13 @@ class ChatCompletionRequest(BaseModel):
         default="auto", examples=["none"]
     ) # noqa
     return_hidden_states: bool = False
+    reasoning_effort: Optional[Literal["low", "medium", "high"]] = Field(
+        default="medium",
+        description="Constrains effort on reasoning for reasoning models. "
+        "'low' is the least effort, 'high' is the most effort. Reducing reasoning effort can "
+        "result in faster responses and fewer tokens used on reasoning in a response. "
+        "Currently only supported for OpenAI models.",
+    )

     @model_validator(mode="before")
     @classmethod
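The hunk above adds a `reasoning_effort` field to `ChatCompletionRequest`. As a minimal client-side sketch (not taken from this diff), the field can be passed through the standard `openai` client's `extra_body`; the server URL and model name below are illustrative assumptions:

```python
# Sketch only: send the new reasoning_effort field to /v1/chat/completions.
# Assumes an sglang 0.5.0rc0 server is running at localhost:30000 serving a
# gpt-oss checkpoint; both the port and the model name are placeholders.
from openai import OpenAI

client = OpenAI(base_url="http://localhost:30000/v1", api_key="EMPTY")
resp = client.chat.completions.create(
    model="openai/gpt-oss-20b",
    messages=[{"role": "user", "content": "Why is the sky blue?"}],
    extra_body={"reasoning_effort": "low"},  # one of "low" | "medium" | "high"
)
print(resp.choices[0].message.content)
```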
@@ -619,6 +636,196 @@ OpenAIServingRequest = Union[
 ]


+# Response API protocol definitions
+class ResponseReasoningParam(BaseModel):
+    """Reasoning parameters for responses."""
+
+    effort: Optional[Literal["low", "medium", "high"]] = Field(
+        default="medium",
+        description="Constrains effort on reasoning for reasoning models.",
+    )
+
+
+class ResponseTool(BaseModel):
+    """Tool definition for responses."""
+
+    type: Literal["web_search_preview", "code_interpreter"] = Field(
+        description="Type of tool to enable"
+    )
+
+
+ResponseInputOutputItem: TypeAlias = Union[
+    ResponseInputItemParam,
+    "ResponseReasoningItem",
+    ResponseFunctionToolCall,
+]
+
+
+class ResponsesRequest(BaseModel):
+    """Request body for v1/responses endpoint."""
+
+    # Core OpenAI API fields (ordered by official documentation)
+    background: Optional[bool] = False
+    include: Optional[
+        List[
+            Literal[
+                "code_interpreter_call.outputs",
+                "computer_call_output.output.image_url",
+                "file_search_call.results",
+                "message.input_image.image_url",
+                "message.output_text.logprobs",
+                "reasoning.encrypted_content",
+            ]
+        ]
+    ] = None
+    input: Union[str, List[ResponseInputOutputItem]]
+    instructions: Optional[str] = None
+    max_output_tokens: Optional[int] = None
+    max_tool_calls: Optional[int] = None
+    metadata: Optional[Dict[str, Any]] = None
+    model: Optional[str] = None # Made optional to match vLLM
+    parallel_tool_calls: Optional[bool] = True
+    previous_response_id: Optional[str] = None
+    reasoning: Optional[ResponseReasoningParam] = None
+    service_tier: Literal["auto", "default", "flex", "scale", "priority"] = "auto"
+    store: Optional[bool] = True
+    stream: Optional[bool] = False
+    temperature: Optional[float] = None
+    tool_choice: Literal["auto", "required", "none"] = "auto"
+    tools: List[ResponseTool] = Field(default_factory=list)
+    top_logprobs: Optional[int] = 0
+    top_p: Optional[float] = None
+    truncation: Optional[Literal["auto", "disabled"]] = "disabled"
+    user: Optional[str] = None
+
+    # Extra SGLang parameters
+    request_id: str = Field(
+        default_factory=lambda: f"resp_{uuid.uuid4().hex}",
+        description="The request_id related to this request. If the caller does not set it, a random uuid will be generated.",
+    )
+    priority: int = Field(default=0, description="Request priority")
+
+    # SGLang-specific sampling parameters
+    frequency_penalty: float = 0.0
+    presence_penalty: float = 0.0
+    stop: Optional[Union[str, List[str]]] = None
+    top_k: int = -1
+    min_p: float = 0.0
+    repetition_penalty: float = 1.0
+
+    # Default sampling parameters
+    _DEFAULT_SAMPLING_PARAMS = {
+        "temperature": 0.7,
+        "top_p": 1.0,
+        "top_k": -1,
+        "min_p": 0.0,
+        "repetition_penalty": 1.0,
+    }
+
+    def to_sampling_params(
+        self, default_max_tokens: int, default_params: Optional[Dict] = None
+    ) -> Dict[str, Any]:
+        """Convert to sampling parameters for generation."""
+        if default_params is None:
+            default_params = {}
+
+        # Use max_output_tokens if available, otherwise use max_tokens for backwards compatibility
+        if self.max_output_tokens is not None:
+            max_tokens = min(self.max_output_tokens, default_max_tokens)
+        else:
+            max_tokens = default_max_tokens
+
+        # Avoid exceed the context length by minus 1 token
+        max_tokens -= 1
+
+        # Get parameters with defaults
+        temperature = self.temperature
+        if temperature is None:
+            temperature = default_params.get(
+                "temperature", self._DEFAULT_SAMPLING_PARAMS["temperature"]
+            )
+
+        top_p = self.top_p
+        if top_p is None:
+            top_p = default_params.get("top_p", self._DEFAULT_SAMPLING_PARAMS["top_p"])
+
+        params = {
+            "max_new_tokens": max_tokens,
+            "temperature": temperature,
+            "top_p": top_p,
+            "frequency_penalty": self.frequency_penalty,
+            "presence_penalty": self.presence_penalty,
+            "stop": self.stop,
+            "top_k": self.top_k,
+            "min_p": self.min_p,
+            "repetition_penalty": self.repetition_penalty,
+        }
+
+        # Apply any additional default parameters
+        for key, value in default_params.items():
+            if key not in params or params[key] is None:
+                params[key] = value
+
+        return params
+
+
+class PromptTokenUsageInfo(BaseModel):
+    """Prompt token usage details."""
+
+    cached_tokens: int = 0
+
+
+class ResponsesResponse(BaseModel):
+    """Response body for v1/responses endpoint."""
+
+    id: str = Field(default_factory=lambda: f"resp_{time.time()}")
+    object: Literal["response"] = "response"
+    created_at: int = Field(default_factory=lambda: int(time.time()))
+    model: str
+
+    output: List[
+        Union[ResponseOutputItem, ResponseReasoningItem, ResponseFunctionToolCall]
+    ] = Field(default_factory=list)
+    status: Literal["queued", "in_progress", "completed", "failed", "cancelled"]
+    usage: Optional[UsageInfo] = None
+    parallel_tool_calls: bool = True
+    tool_choice: str = "auto"
+    tools: List[ResponseTool] = Field(default_factory=list)
+
+    @classmethod
+    def from_request(
+        cls,
+        request: ResponsesRequest,
+        sampling_params: Any,
+        model_name: str,
+        created_time: int,
+        output: List[
+            Union[ResponseOutputItem, ResponseReasoningItem, ResponseFunctionToolCall]
+        ],
+        status: str,
+        usage: Optional[UsageInfo],
+    ) -> "ResponsesResponse":
+        """Create a response from a request."""
+        return cls(
+            id=request.request_id,
+            created_at=created_time,
+            model=model_name,
+            output=output,
+            status=status,
+            usage=usage,
+            parallel_tool_calls=request.parallel_tool_calls or True,
+            tool_choice=request.tool_choice,
+            tools=request.tools,
+        )
+
+
+class RequestResponseMetadata(BaseModel):
+    """Metadata for request/response tracking."""
+
+    request_id: str
+    final_usage_info: Optional[UsageInfo] = None
+
+
 @dataclass
 class MessageProcessingResult:
     """Result of processing chat messages and applying templates.
@@ -645,3 +852,22 @@ class MessageProcessingResult:
     modalities: List[str]
     stop: List[str]
     tool_call_constraint: Optional[Any] = None
+
+
+class ResponseReasoningTextContent(BaseModel):
+    text: str
+    type: Literal["reasoning_text"] = "reasoning_text"
+
+
+class ResponseReasoningItem(BaseModel):
+    id: str
+    content: list[ResponseReasoningTextContent] = Field(default_factory=list)
+    summary: list = Field(default_factory=list)
+    type: Literal["reasoning"] = "reasoning"
+    encrypted_content: Optional[str] = None
+    status: Optional[Literal["in_progress", "completed", "incomplete"]]
+
+
+ResponseInputOutputItem: TypeAlias = Union[
+    ResponseInputItemParam, "ResponseReasoningItem", ResponseFunctionToolCall
+]
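To make the new Responses protocol models above concrete, here is a small sketch (not part of the diff) that constructs a `ResponsesRequest` and converts it with `to_sampling_params`. It assumes the 0.5.0rc0 wheel and its `openai` dependency are installed so the import resolves; the model name and the `default_max_tokens` value are placeholders:

```python
# Sketch: exercising the ResponsesRequest model added in the hunks above.
from sglang.srt.entrypoints.openai.protocol import ResponsesRequest

req = ResponsesRequest(
    input="Summarize the release notes in one sentence.",
    model="openai/gpt-oss-20b",       # hypothetical model name
    max_output_tokens=256,
    reasoning={"effort": "low"},      # coerced into ResponseReasoningParam
)

# default_max_tokens would normally come from the server's context length;
# 4096 is an arbitrary illustration value.
sampling_params = req.to_sampling_params(default_max_tokens=4096)
print(sampling_params["max_new_tokens"])  # min(256, 4096) - 1 == 255
print(sampling_params["temperature"])     # falls back to the 0.7 class default
```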
sglang/srt/entrypoints/openai/serving_chat.py

@@ -7,8 +7,18 @@ from typing import Any, AsyncGenerator, Dict, List, Optional, Union

 from fastapi import Request
 from fastapi.responses import ORJSONResponse, StreamingResponse
+from openai_harmony import Message as OpenAIMessage

 from sglang.srt.conversation import generate_chat_conv
+from sglang.srt.entrypoints.harmony_utils import (
+    get_developer_message,
+    get_stop_tokens_for_assistant_actions,
+    get_streamable_parser_for_assistant,
+    get_system_message,
+    parse_chat_input,
+    parse_output_into_messages,
+    render_for_completion,
+)
 from sglang.srt.entrypoints.openai.protocol import (
     ChatCompletionRequest,
     ChatCompletionResponse,
@@ -51,6 +61,26 @@ class OpenAIServingChat(OpenAIServingBase):
     ):
         super().__init__(tokenizer_manager)
         self.template_manager = template_manager
+        self.use_harmony = (
+            self.tokenizer_manager.model_config.hf_config.model_type == "gpt_oss"
+        )
+
+        if self.use_harmony:
+            from sglang.srt.function_call.harmony_tool_parser import (
+                HarmonyToolCallParser,
+            )
+
+            self.harmony_tool_parser = HarmonyToolCallParser()
+
+            # NOTE While OpenAI's chat completion API supports browsing
+            # for some models, currently vLLM doesn't support it. Please use the
+            # Responses API instead.
+            self.supports_browsing = False
+            self.browser_tool = None
+            # NOTE: Chat completion API does not support code interpreter.
+            # Please use the Responses API instead.
+            self.supports_code_interpreter = False
+            self.python_tool = None

     def _request_id_prefix(self) -> str:
         return "chatcmpl-"
@@ -77,41 +107,66 @@ class OpenAIServingChat(OpenAIServingBase):
         is_multimodal = self.tokenizer_manager.model_config.is_multimodal

         # Process messages and apply chat template
-
-
-
-
-
-
+        if not self.use_harmony:
+            processed_messages = self._process_messages(request, is_multimodal)
+
+            # Build sampling parameters
+            sampling_params = self._build_sampling_params(
+                request,
+                processed_messages.stop,
+                processed_messages.tool_call_constraint,
+            )

-
-
-
-        else:
-            if isinstance(processed_messages.prompt_ids, str):
-                prompt_kwargs = {"text": processed_messages.prompt_ids}
+            # Handle single vs multiple requests
+            if is_multimodal:
+                prompt_kwargs = {"text": processed_messages.prompt}
             else:
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
+                if isinstance(processed_messages.prompt_ids, str):
+                    prompt_kwargs = {"text": processed_messages.prompt_ids}
+                else:
+                    prompt_kwargs = {"input_ids": processed_messages.prompt_ids}
+
+            adapted_request = GenerateReqInput(
+                **prompt_kwargs,
+                image_data=processed_messages.image_data,
+                video_data=processed_messages.video_data,
+                audio_data=processed_messages.audio_data,
+                sampling_params=sampling_params,
+                return_logprob=request.logprobs,
+                logprob_start_len=-1,
+                top_logprobs_num=request.top_logprobs or 0,
+                stream=request.stream,
+                return_text_in_logprobs=True,
+                modalities=processed_messages.modalities,
+                lora_path=request.lora_path,
+                bootstrap_host=request.bootstrap_host,
+                bootstrap_port=request.bootstrap_port,
+                bootstrap_room=request.bootstrap_room,
+                return_hidden_states=request.return_hidden_states,
+                rid=request.rid,
+            )
+        else:
+            processed_messages, prompt_ids = self._make_request_with_harmony(request)
+
+            adapted_request = GenerateReqInput(
+                input_ids=prompt_ids,
+                sampling_params=self._build_sampling_params(
+                    request,
+                    request.stop,
+                    tool_call_constraint=None,
+                ),
+                stream=request.stream,
+                return_logprob=request.logprobs,
+                logprob_start_len=-1,
+                top_logprobs_num=request.top_logprobs or 0,
+                return_text_in_logprobs=True,
+                lora_path=request.lora_path,
+                bootstrap_host=request.bootstrap_host,
+                bootstrap_port=request.bootstrap_port,
+                bootstrap_room=request.bootstrap_room,
+                return_hidden_states=request.return_hidden_states,
+                rid=request.rid,
+            )

         return adapted_request, request

@@ -277,6 +332,8 @@ class OpenAIServingChat(OpenAIServingBase):
             prompt = prompt[: -len(conv.sep2)]
         else:
             prompt = conv.get_prompt()
+        if self._get_enable_thinking_from_request(request):
+            prompt += "<think>" # Note(Xinyuan): hard code thinking token

         image_data = conv.image_data if conv.image_data else None
         video_data = conv.video_data if conv.video_data else None
@@ -402,6 +459,12 @@ class OpenAIServingChat(OpenAIServingBase):
         cached_tokens = {}
         hidden_states = {}

+        # Harmony tracking
+        if self.use_harmony:
+            harmony_parsers = [
+                get_streamable_parser_for_assistant() for _ in range(request.n)
+            ]
+
         try:
             async for content in self.tokenizer_manager.generate_request(
                 adapted_request, raw_request
@@ -449,14 +512,57 @@ class OpenAIServingChat(OpenAIServingBase):
                     yield f"data: {chunk.model_dump_json()}\n\n"

                 # Process content delta
-
-
-
+                if self.use_harmony:
+                    harmony_parser = harmony_parsers[index]
+
+                    new_token_ids = content["output_ids"]
+                    for token_id in new_token_ids:
+                        harmony_parser.process(token_id)
+
+                    is_final = harmony_parser.current_channel == "final"
+                    is_analysis = harmony_parser.current_channel == "analysis"
+                    delta = harmony_parser.last_content_delta or ""
+
+                    if is_analysis:
+                        choice_data = ChatCompletionResponseStreamChoice(
+                            index=index,
+                            delta=DeltaMessage(reasoning_content=delta),
+                            finish_reason=None,
+                        )
+                        chunk = ChatCompletionStreamResponse(
+                            id=content["meta_info"]["id"],
+                            created=int(time.time()),
+                            choices=[choice_data],
+                            model=request.model,
+                        )
+                        yield f"data: {chunk.model_dump_json()}\n\n"
+                        continue
+
+                    choice_data = ChatCompletionResponseStreamChoice(
+                        index=index,
+                        delta=DeltaMessage(content=delta if delta else None),
+                        finish_reason=None,
+                        matched_stop=None,
+                        logprobs=choice_logprobs,
+                    )
+                    chunk = ChatCompletionStreamResponse(
+                        id=content["meta_info"]["id"],
+                        created=int(time.time()),
+                        choices=[choice_data],
+                        model=request.model,
+                    )
+                    yield f"data: {chunk.model_dump_json()}\n\n"
+                    continue
+                else:
+                    stream_buffer = stream_buffers.get(index, "")
+                    delta = content["text"][len(stream_buffer) :]
+                    stream_buffers[index] = stream_buffer + delta

                 # Handle reasoning content
                 if (
                     self.tokenizer_manager.server_args.reasoning_parser
                     and request.separate_reasoning
+                    and not self.use_harmony
                 ):
                     reasoning_text, delta = self._process_reasoning_stream(
                         index, delta, reasoning_parser_dict, content, request
@@ -475,8 +581,27 @@ class OpenAIServingChat(OpenAIServingBase):
                     )
                     yield f"data: {chunk.model_dump_json()}\n\n"

+                if self.use_harmony and not is_final:
+                    choice_data = ChatCompletionResponseStreamChoice(
+                        index=index,
+                        delta=DeltaMessage(reasoning_content=delta),
+                        finish_reason=None,
+                    )
+                    chunk = ChatCompletionStreamResponse(
+                        id=content["meta_info"]["id"],
+                        created=int(time.time()),
+                        choices=[choice_data],
+                        model=request.model,
+                    )
+                    yield f"data: {chunk.model_dump_json()}\n\n"
+
                 # Handle tool calls
-
+                # TODO: support tool call parsing for harmony
+                if (
+                    request.tool_choice != "none"
+                    and request.tools
+                    and not self.use_harmony
+                ):
                     async for chunk in self._process_tool_call_stream(
                         index,
                         delta,
@@ -502,7 +627,7 @@ class OpenAIServingChat(OpenAIServingBase):
                 if delta:
                     choice_data = ChatCompletionResponseStreamChoice(
                         index=index,
-                        delta=DeltaMessage(content=delta
+                        delta=DeltaMessage(content=delta),
                         finish_reason=None,
                         matched_stop=None,
                         logprobs=choice_logprobs,
@@ -640,14 +765,90 @@ class OpenAIServingChat(OpenAIServingBase):

             finish_reason = ret_item["meta_info"]["finish_reason"]
             text = ret_item["text"]
+            output_ids = ret_item["output_ids"]
+
+            if self.use_harmony:
+                parser = parse_output_into_messages(output_ids)
+                output_msgs = parser.messages
+                if len(output_msgs) == 0:
+                    # The generation has stopped during reasoning.
+                    is_tool_call = False
+                    reasoning_content = parser.current_content
+                    final_content = None
+                elif len(output_msgs) == 1:
+                    # The generation has stopped during final message.
+                    is_tool_call = False
+                    reasoning_content = output_msgs[0].content[0].text
+                    final_content = parser.current_content
+                else:
+                    if len(output_msgs) != 2:
+                        raise ValueError(
+                            "Expected 2 output messages (reasoning and final), "
+                            f"but got {len(output_msgs)}."
+                        )
+                    reasoning_msg, final_msg = output_msgs
+                    reasoning_content = reasoning_msg.content[0].text
+                    final_content = final_msg.content[0].text
+                    is_tool_call = final_msg.recipient is not None
+
+                if is_tool_call:
+                    # Extract tool call information from final message
+                    tool_call = (
+                        self.harmony_tool_parser.extract_tool_calls_from_message(
+                            final_msg
+                        )
+                    )
+                    tool_calls = [tool_call] if tool_call else []
+
+                    message = ChatMessage(
+                        role="assistant",
+                        reasoning_content=reasoning_content,
+                        content=None, # Tool calls don't have regular content
+                        tool_calls=tool_calls,
+                    )
+                else:
+                    # Normal message
+                    message = ChatMessage(
+                        role="assistant",
+                        reasoning_content=reasoning_content,
+                        content=final_content,
+                    )
+
+                if is_tool_call:
+                    finish_reason_type = "tool_calls"
+                elif finish_reason:
+                    finish_reason_type = (
+                        finish_reason["type"] if finish_reason else "stop"
+                    )
+                else:
+                    finish_reason_type = "stop"
+                choice_data = ChatCompletionResponseChoice(
+                    index=idx,
+                    message=message,
+                    logprobs=choice_logprobs,
+                    finish_reason=finish_reason_type,
+                    matched_stop=(
+                        finish_reason["matched"]
+                        if finish_reason and "matched" in finish_reason
+                        else None
+                    ),
+                )
+                choices.append(choice_data)
+                continue

             # Handle reasoning content
             reasoning_text = None
             reasoning_parser = self.tokenizer_manager.server_args.reasoning_parser
             if reasoning_parser and request.separate_reasoning:
+                is_force_reasoning = (
+                    self.template_manager.force_reasoning
+                    or self._get_enable_thinking_from_request(request)
+                )
                 try:
                     parser = ReasoningParser(
-                        model_type=reasoning_parser,
+                        model_type=reasoning_parser,
+                        stream_reasoning=False,
+                        force_reasoning=is_force_reasoning,
                     )
                     reasoning_text, text = parser.parse_non_stream(text)
                 except Exception as e:
@@ -810,14 +1011,19 @@ class OpenAIServingChat(OpenAIServingBase):
     ) -> tuple[Optional[str], str]:
         """Process reasoning content in streaming response"""
         if index not in reasoning_parser_dict:
+            is_force_reasoning = (
+                self.template_manager.force_reasoning
+                or self._get_enable_thinking_from_request(request)
+            )
             reasoning_parser_dict[index] = ReasoningParser(
                 self.tokenizer_manager.server_args.reasoning_parser,
                 request.stream_reasoning,
+                is_force_reasoning,
             )
         reasoning_parser = reasoning_parser_dict[index]
         return reasoning_parser.parse_stream_chunk(delta)

-    def _get_enable_thinking_from_request(request: ChatCompletionRequest) -> bool:
+    def _get_enable_thinking_from_request(self, request: ChatCompletionRequest) -> bool:
         """Extracts the 'enable_thinking' flag from request chat_template_kwargs.

         NOTE: This parameter is only useful for models that support enable_thinking
@@ -826,7 +1032,7 @@ class OpenAIServingChat(OpenAIServingBase):
         Args:
             request_obj: The request object (or an item from a list of requests).
         Returns:
-            The boolean value of 'enable_thinking' if found
+            The boolean value of 'enable_thinking' if found, otherwise False.
         """
         if (
             hasattr(request, "chat_template_kwargs")
@@ -834,7 +1040,7 @@ class OpenAIServingChat(OpenAIServingBase):
             and request.chat_template_kwargs.get("enable_thinking") is not None
         ):
             return request.chat_template_kwargs.get("enable_thinking")
-        return
+        return False

     async def _process_tool_call_stream(
         self,
@@ -978,3 +1184,33 @@ class OpenAIServingChat(OpenAIServingBase):
                 return f"data: {chunk.model_dump_json()}\n\n"

         return None
+
+    def _make_request_with_harmony(
+        self,
+        request: ChatCompletionRequest,
+    ):
+        messages: list[OpenAIMessage] = []
+
+        # Add system message.
+        # In Chat Completion API, browsing is enabled by default if the model
+        # supports it.
+        assert not self.supports_browsing
+        assert not self.supports_code_interpreter
+        sys_msg = get_system_message(
+            reasoning_effort=request.reasoning_effort,
+            browser_description=None,
+            python_description=None,
+        )
+        messages.append(sys_msg)
+
+        # Add developer message.
+        dev_msg = get_developer_message()
+        messages.append(dev_msg)
+
+        # Add user message.
+        for chat_msg in request.messages:
+            messages.append(parse_chat_input(chat_msg))
+
+        # Render prompt token ids.
+        prompt_token_ids = render_for_completion(messages)
+        return messages, prompt_token_ids
|