sglang 0.4.10.post2__py3-none-any.whl → 0.5.0rc1__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (175)
  1. sglang/__init__.py +8 -3
  2. sglang/bench_one_batch.py +119 -17
  3. sglang/lang/chat_template.py +18 -0
  4. sglang/srt/bench_utils.py +137 -0
  5. sglang/srt/configs/model_config.py +42 -7
  6. sglang/srt/conversation.py +9 -5
  7. sglang/srt/disaggregation/base/conn.py +5 -2
  8. sglang/srt/disaggregation/decode.py +14 -4
  9. sglang/srt/disaggregation/decode_schedule_batch_mixin.py +3 -0
  10. sglang/srt/disaggregation/mooncake/conn.py +286 -160
  11. sglang/srt/disaggregation/mooncake/transfer_engine.py +29 -0
  12. sglang/srt/disaggregation/prefill.py +2 -0
  13. sglang/srt/distributed/parallel_state.py +15 -11
  14. sglang/srt/entrypoints/context.py +227 -0
  15. sglang/srt/entrypoints/engine.py +15 -9
  16. sglang/srt/entrypoints/harmony_utils.py +372 -0
  17. sglang/srt/entrypoints/http_server.py +74 -4
  18. sglang/srt/entrypoints/openai/protocol.py +218 -1
  19. sglang/srt/entrypoints/openai/serving_chat.py +41 -11
  20. sglang/srt/entrypoints/openai/serving_responses.py +1273 -0
  21. sglang/srt/entrypoints/openai/tool_server.py +175 -0
  22. sglang/srt/entrypoints/tool.py +87 -0
  23. sglang/srt/eplb/expert_location.py +5 -1
  24. sglang/srt/function_call/ebnf_composer.py +1 -0
  25. sglang/srt/function_call/function_call_parser.py +2 -0
  26. sglang/srt/function_call/glm4_moe_detector.py +1 -1
  27. sglang/srt/function_call/gpt_oss_detector.py +331 -0
  28. sglang/srt/function_call/kimik2_detector.py +3 -3
  29. sglang/srt/function_call/qwen3_coder_detector.py +219 -9
  30. sglang/srt/hf_transformers_utils.py +30 -3
  31. sglang/srt/jinja_template_utils.py +14 -1
  32. sglang/srt/layers/attention/aiter_backend.py +375 -115
  33. sglang/srt/layers/attention/ascend_backend.py +3 -0
  34. sglang/srt/layers/attention/dual_chunk_flashattention_backend.py +1700 -0
  35. sglang/srt/layers/attention/flashattention_backend.py +18 -0
  36. sglang/srt/layers/attention/flashinfer_backend.py +52 -13
  37. sglang/srt/layers/attention/hybrid_attn_backend.py +1 -1
  38. sglang/srt/layers/attention/triton_backend.py +85 -14
  39. sglang/srt/layers/attention/triton_ops/decode_attention.py +17 -0
  40. sglang/srt/layers/attention/triton_ops/extend_attention.py +143 -98
  41. sglang/srt/layers/attention/trtllm_mha_backend.py +332 -0
  42. sglang/srt/layers/attention/trtllm_mla_backend.py +119 -22
  43. sglang/srt/layers/attention/vision.py +22 -6
  44. sglang/srt/layers/attention/wave_backend.py +627 -0
  45. sglang/srt/layers/attention/wave_ops/decode_attention.py +186 -0
  46. sglang/srt/layers/attention/wave_ops/extend_attention.py +149 -0
  47. sglang/srt/layers/attention/wave_ops/prefill_attention.py +79 -0
  48. sglang/srt/layers/communicator.py +29 -14
  49. sglang/srt/layers/dp_attention.py +12 -0
  50. sglang/srt/layers/flashinfer_comm_fusion.py +4 -4
  51. sglang/srt/layers/linear.py +3 -7
  52. sglang/srt/layers/moe/cutlass_moe.py +12 -3
  53. sglang/srt/layers/moe/cutlass_w4a8_moe.py +4 -5
  54. sglang/srt/layers/moe/ep_moe/kernels.py +43 -0
  55. sglang/srt/layers/moe/ep_moe/layer.py +135 -73
  56. sglang/srt/layers/moe/fused_moe_triton/configs/triton_3_4_0/E=128,N=768,device_name=NVIDIA_B200,dtype=fp8_w8a8,block_shape=[128, 128].json +146 -0
  57. sglang/srt/layers/moe/fused_moe_triton/configs/triton_3_4_0/E=384,N=256,device_name=NVIDIA_B200,dtype=fp8_w8a8,block_shape=[128, 128].json +146 -0
  58. sglang/srt/layers/moe/fused_moe_triton/fused_moe.py +101 -12
  59. sglang/srt/layers/moe/fused_moe_triton/layer.py +412 -33
  60. sglang/srt/layers/moe/fused_moe_triton/triton_kernels_moe.py +188 -3
  61. sglang/srt/layers/moe/token_dispatcher/deepep.py +61 -24
  62. sglang/srt/layers/moe/topk.py +16 -4
  63. sglang/srt/layers/moe/utils.py +16 -0
  64. sglang/srt/layers/quantization/__init__.py +27 -3
  65. sglang/srt/layers/quantization/fp4.py +557 -0
  66. sglang/srt/layers/quantization/fp8.py +3 -6
  67. sglang/srt/layers/quantization/fp8_kernel.py +277 -0
  68. sglang/srt/layers/quantization/fp8_utils.py +51 -10
  69. sglang/srt/layers/quantization/modelopt_quant.py +258 -68
  70. sglang/srt/layers/quantization/mxfp4.py +654 -0
  71. sglang/srt/layers/quantization/mxfp4_tensor.py +133 -0
  72. sglang/srt/layers/quantization/quark/schemes/__init__.py +6 -0
  73. sglang/srt/layers/quantization/quark/schemes/quark_scheme.py +55 -0
  74. sglang/srt/layers/quantization/quark/schemes/quark_w4a4_mxfp4.py +118 -0
  75. sglang/srt/layers/quantization/quark/utils.py +107 -0
  76. sglang/srt/layers/quantization/unquant.py +60 -6
  77. sglang/srt/layers/quantization/w4afp8.py +21 -12
  78. sglang/srt/layers/quantization/w8a8_int8.py +48 -34
  79. sglang/srt/layers/rotary_embedding.py +506 -3
  80. sglang/srt/layers/utils.py +9 -0
  81. sglang/srt/layers/vocab_parallel_embedding.py +8 -3
  82. sglang/srt/lora/backend/base_backend.py +3 -23
  83. sglang/srt/lora/layers.py +60 -114
  84. sglang/srt/lora/lora.py +17 -62
  85. sglang/srt/lora/lora_manager.py +82 -62
  86. sglang/srt/lora/lora_registry.py +23 -11
  87. sglang/srt/lora/mem_pool.py +63 -68
  88. sglang/srt/lora/triton_ops/qkv_lora_b.py +1 -1
  89. sglang/srt/lora/utils.py +25 -58
  90. sglang/srt/managers/cache_controller.py +75 -58
  91. sglang/srt/managers/detokenizer_manager.py +1 -1
  92. sglang/srt/managers/io_struct.py +20 -8
  93. sglang/srt/managers/mm_utils.py +6 -13
  94. sglang/srt/managers/multimodal_processor.py +1 -1
  95. sglang/srt/managers/schedule_batch.py +61 -25
  96. sglang/srt/managers/schedule_policy.py +6 -6
  97. sglang/srt/managers/scheduler.py +41 -19
  98. sglang/srt/managers/scheduler_output_processor_mixin.py +1 -2
  99. sglang/srt/managers/scheduler_profiler_mixin.py +28 -8
  100. sglang/srt/managers/scheduler_recv_skipper.py +37 -0
  101. sglang/srt/managers/scheduler_update_weights_mixin.py +6 -0
  102. sglang/srt/managers/template_manager.py +35 -1
  103. sglang/srt/managers/tokenizer_manager.py +47 -30
  104. sglang/srt/managers/tp_worker.py +3 -0
  105. sglang/srt/managers/tp_worker_overlap_thread.py +3 -0
  106. sglang/srt/mem_cache/allocator.py +61 -87
  107. sglang/srt/mem_cache/hicache_storage.py +1 -1
  108. sglang/srt/mem_cache/hiradix_cache.py +80 -22
  109. sglang/srt/mem_cache/lora_radix_cache.py +421 -0
  110. sglang/srt/mem_cache/memory_pool_host.py +34 -36
  111. sglang/srt/mem_cache/multimodal_cache.py +33 -13
  112. sglang/srt/mem_cache/radix_cache.py +2 -5
  113. sglang/srt/mem_cache/storage/hf3fs/client_hf3fs.py +2 -2
  114. sglang/srt/mem_cache/storage/hf3fs/mini_3fs_metadata_server.py +443 -0
  115. sglang/srt/mem_cache/storage/hf3fs/storage_hf3fs.py +139 -67
  116. sglang/srt/mem_cache/storage/mooncake_store/mooncake_store.py +6 -9
  117. sglang/srt/model_executor/cuda_graph_runner.py +29 -9
  118. sglang/srt/model_executor/forward_batch_info.py +61 -19
  119. sglang/srt/model_executor/model_runner.py +148 -37
  120. sglang/srt/model_loader/loader.py +18 -6
  121. sglang/srt/model_loader/weight_utils.py +10 -0
  122. sglang/srt/models/bailing_moe.py +425 -0
  123. sglang/srt/models/deepseek_v2.py +137 -59
  124. sglang/srt/models/ernie4.py +426 -0
  125. sglang/srt/models/ernie4_eagle.py +203 -0
  126. sglang/srt/models/gemma2.py +0 -34
  127. sglang/srt/models/gemma3n_mm.py +38 -0
  128. sglang/srt/models/glm4.py +6 -0
  129. sglang/srt/models/glm4_moe.py +28 -16
  130. sglang/srt/models/glm4v.py +589 -0
  131. sglang/srt/models/glm4v_moe.py +400 -0
  132. sglang/srt/models/gpt_oss.py +1251 -0
  133. sglang/srt/models/granite.py +0 -25
  134. sglang/srt/models/llama.py +0 -25
  135. sglang/srt/models/llama4.py +1 -1
  136. sglang/srt/models/qwen2.py +6 -0
  137. sglang/srt/models/qwen2_5_vl.py +7 -3
  138. sglang/srt/models/qwen2_audio.py +10 -9
  139. sglang/srt/models/qwen2_moe.py +6 -0
  140. sglang/srt/models/qwen3.py +0 -24
  141. sglang/srt/models/qwen3_moe.py +32 -6
  142. sglang/srt/models/registry.py +1 -1
  143. sglang/srt/models/step3_vl.py +9 -0
  144. sglang/srt/models/torch_native_llama.py +0 -24
  145. sglang/srt/models/transformers.py +2 -5
  146. sglang/srt/multimodal/processors/base_processor.py +23 -13
  147. sglang/srt/multimodal/processors/glm4v.py +132 -0
  148. sglang/srt/multimodal/processors/qwen_audio.py +4 -2
  149. sglang/srt/multimodal/processors/step3_vl.py +3 -1
  150. sglang/srt/reasoning_parser.py +332 -37
  151. sglang/srt/server_args.py +186 -75
  152. sglang/srt/speculative/eagle_worker.py +16 -0
  153. sglang/srt/two_batch_overlap.py +169 -9
  154. sglang/srt/utils.py +41 -5
  155. sglang/srt/weight_sync/tensor_bucket.py +106 -0
  156. sglang/test/attention/test_trtllm_mla_backend.py +186 -36
  157. sglang/test/doc_patch.py +59 -0
  158. sglang/test/few_shot_gsm8k.py +1 -1
  159. sglang/test/few_shot_gsm8k_engine.py +1 -1
  160. sglang/test/run_eval.py +4 -1
  161. sglang/test/runners.py +2 -2
  162. sglang/test/simple_eval_common.py +6 -0
  163. sglang/test/simple_eval_gpqa.py +2 -0
  164. sglang/test/test_fp4_moe.py +118 -36
  165. sglang/test/test_utils.py +1 -1
  166. sglang/utils.py +1 -1
  167. sglang/version.py +1 -1
  168. {sglang-0.4.10.post2.dist-info → sglang-0.5.0rc1.dist-info}/METADATA +36 -38
  169. {sglang-0.4.10.post2.dist-info → sglang-0.5.0rc1.dist-info}/RECORD +174 -141
  170. sglang/srt/lora/backend/flashinfer_backend.py +0 -131
  171. /sglang/{api.py → lang/api.py} +0 -0
  172. /sglang/{lang/backend → srt/layers/quantization/quark}/__init__.py +0 -0
  173. {sglang-0.4.10.post2.dist-info → sglang-0.5.0rc1.dist-info}/WHEEL +0 -0
  174. {sglang-0.4.10.post2.dist-info → sglang-0.5.0rc1.dist-info}/licenses/LICENSE +0 -0
  175. {sglang-0.4.10.post2.dist-info → sglang-0.5.0rc1.dist-info}/top_level.txt +0 -0
sglang/srt/entrypoints/openai/protocol.py

@@ -14,9 +14,18 @@
 """Pydantic models for OpenAI API protocol"""

 import time
+import uuid
 from dataclasses import dataclass
-from typing import Any, Dict, List, Optional, Union
+from typing import Any, Dict, List, Optional, TypeAlias, Union

+from openai.types.responses import (
+    ResponseFunctionToolCall,
+    ResponseInputItemParam,
+    ResponseOutputItem,
+    ResponseReasoningItem,
+)
+from openai.types.responses.response import ToolChoice
+from openai.types.responses.tool import Tool
 from pydantic import (
     BaseModel,
     Field,
@@ -84,6 +93,7 @@ class UsageInfo(BaseModel):
     completion_tokens: Optional[int] = 0
     # only used to return cached tokens when --enable-cache-report is set
     prompt_tokens_details: Optional[Dict[str, int]] = None
+    reasoning_tokens: Optional[int] = 0


 class StreamOptions(BaseModel):
@@ -428,6 +438,13 @@ class ChatCompletionRequest(BaseModel):
         default="auto", examples=["none"]
     )  # noqa
     return_hidden_states: bool = False
+    reasoning_effort: Optional[Literal["low", "medium", "high"]] = Field(
+        default="medium",
+        description="Constrains effort on reasoning for reasoning models. "
+        "'low' is the least effort, 'high' is the most effort. Reducing reasoning effort can "
+        "result in faster responses and fewer tokens used on reasoning in a response. "
+        "Currently only supported for OpenAI models.",
+    )

     @model_validator(mode="before")
     @classmethod
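The new reasoning_effort request field above is forwarded to the chat template (see the serving_chat.py hunks below). As a rough client-side illustration only, with a placeholder endpoint URL and model name that are not taken from this diff, a caller could set it through the OpenAI SDK's extra_body:

    # Illustrative sketch; http://localhost:30000 and the model name are placeholders.
    from openai import OpenAI

    client = OpenAI(base_url="http://localhost:30000/v1", api_key="EMPTY")
    resp = client.chat.completions.create(
        model="my-reasoning-model",  # placeholder
        messages=[{"role": "user", "content": "Explain KV-cache reuse briefly."}],
        extra_body={"reasoning_effort": "low"},  # new field; defaults to "medium"
    )
    print(resp.choices[0].message.content)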
@@ -619,6 +636,196 @@ OpenAIServingRequest = Union[
 ]


+# Response API protocol definitions
+class ResponseReasoningParam(BaseModel):
+    """Reasoning parameters for responses."""
+
+    effort: Optional[Literal["low", "medium", "high"]] = Field(
+        default="medium",
+        description="Constrains effort on reasoning for reasoning models.",
+    )
+
+
+class ResponseTool(BaseModel):
+    """Tool definition for responses."""
+
+    type: Literal["web_search_preview", "code_interpreter"] = Field(
+        description="Type of tool to enable"
+    )
+
+
+ResponseInputOutputItem: TypeAlias = Union[
+    ResponseInputItemParam,
+    "ResponseReasoningItem",
+    ResponseFunctionToolCall,
+]
+
+
+class ResponsesRequest(BaseModel):
+    """Request body for v1/responses endpoint."""
+
+    # Core OpenAI API fields (ordered by official documentation)
+    background: Optional[bool] = False
+    include: Optional[
+        List[
+            Literal[
+                "code_interpreter_call.outputs",
+                "computer_call_output.output.image_url",
+                "file_search_call.results",
+                "message.input_image.image_url",
+                "message.output_text.logprobs",
+                "reasoning.encrypted_content",
+            ]
+        ]
+    ] = None
+    input: Union[str, List[ResponseInputOutputItem]]
+    instructions: Optional[str] = None
+    max_output_tokens: Optional[int] = None
+    max_tool_calls: Optional[int] = None
+    metadata: Optional[Dict[str, Any]] = None
+    model: Optional[str] = None  # Made optional to match vLLM
+    parallel_tool_calls: Optional[bool] = True
+    previous_response_id: Optional[str] = None
+    reasoning: Optional[ResponseReasoningParam] = None
+    service_tier: Literal["auto", "default", "flex", "scale", "priority"] = "auto"
+    store: Optional[bool] = True
+    stream: Optional[bool] = False
+    temperature: Optional[float] = None
+    tool_choice: Literal["auto", "required", "none"] = "auto"
+    tools: List[ResponseTool] = Field(default_factory=list)
+    top_logprobs: Optional[int] = 0
+    top_p: Optional[float] = None
+    truncation: Optional[Literal["auto", "disabled"]] = "disabled"
+    user: Optional[str] = None
+
+    # Extra SGLang parameters
+    request_id: str = Field(
+        default_factory=lambda: f"resp_{uuid.uuid4().hex}",
+        description="The request_id related to this request. If the caller does not set it, a random uuid will be generated.",
+    )
+    priority: int = Field(default=0, description="Request priority")
+
+    # SGLang-specific sampling parameters
+    frequency_penalty: float = 0.0
+    presence_penalty: float = 0.0
+    stop: Optional[Union[str, List[str]]] = None
+    top_k: int = -1
+    min_p: float = 0.0
+    repetition_penalty: float = 1.0
+
+    # Default sampling parameters
+    _DEFAULT_SAMPLING_PARAMS = {
+        "temperature": 0.7,
+        "top_p": 1.0,
+        "top_k": -1,
+        "min_p": 0.0,
+        "repetition_penalty": 1.0,
+    }
+
+    def to_sampling_params(
+        self, default_max_tokens: int, default_params: Optional[Dict] = None
+    ) -> Dict[str, Any]:
+        """Convert to sampling parameters for generation."""
+        if default_params is None:
+            default_params = {}
+
+        # Use max_output_tokens if available, otherwise use max_tokens for backwards compatibility
+        if self.max_output_tokens is not None:
+            max_tokens = min(self.max_output_tokens, default_max_tokens)
+        else:
+            max_tokens = default_max_tokens
+
+        # Avoid exceed the context length by minus 1 token
+        max_tokens -= 1
+
+        # Get parameters with defaults
+        temperature = self.temperature
+        if temperature is None:
+            temperature = default_params.get(
+                "temperature", self._DEFAULT_SAMPLING_PARAMS["temperature"]
+            )
+
+        top_p = self.top_p
+        if top_p is None:
+            top_p = default_params.get("top_p", self._DEFAULT_SAMPLING_PARAMS["top_p"])
+
+        params = {
+            "max_new_tokens": max_tokens,
+            "temperature": temperature,
+            "top_p": top_p,
+            "frequency_penalty": self.frequency_penalty,
+            "presence_penalty": self.presence_penalty,
+            "stop": self.stop,
+            "top_k": self.top_k,
+            "min_p": self.min_p,
+            "repetition_penalty": self.repetition_penalty,
+        }
+
+        # Apply any additional default parameters
+        for key, value in default_params.items():
+            if key not in params or params[key] is None:
+                params[key] = value
+
+        return params
+
+
+class PromptTokenUsageInfo(BaseModel):
+    """Prompt token usage details."""
+
+    cached_tokens: int = 0
+
+
+class ResponsesResponse(BaseModel):
+    """Response body for v1/responses endpoint."""
+
+    id: str = Field(default_factory=lambda: f"resp_{time.time()}")
+    object: Literal["response"] = "response"
+    created_at: int = Field(default_factory=lambda: int(time.time()))
+    model: str
+
+    output: List[
+        Union[ResponseOutputItem, ResponseReasoningItem, ResponseFunctionToolCall]
+    ] = Field(default_factory=list)
+    status: Literal["queued", "in_progress", "completed", "failed", "cancelled"]
+    usage: Optional[UsageInfo] = None
+    parallel_tool_calls: bool = True
+    tool_choice: str = "auto"
+    tools: List[ResponseTool] = Field(default_factory=list)
+
+    @classmethod
+    def from_request(
+        cls,
+        request: ResponsesRequest,
+        sampling_params: Any,
+        model_name: str,
+        created_time: int,
+        output: List[
+            Union[ResponseOutputItem, ResponseReasoningItem, ResponseFunctionToolCall]
+        ],
+        status: str,
+        usage: Optional[UsageInfo],
+    ) -> "ResponsesResponse":
+        """Create a response from a request."""
+        return cls(
+            id=request.request_id,
+            created_at=created_time,
+            model=model_name,
+            output=output,
+            status=status,
+            usage=usage,
+            parallel_tool_calls=request.parallel_tool_calls or True,
+            tool_choice=request.tool_choice,
+            tools=request.tools,
+        )
+
+
+class RequestResponseMetadata(BaseModel):
+    """Metadata for request/response tracking."""
+
+    request_id: str
+    final_usage_info: Optional[UsageInfo] = None
+
+
 @dataclass
 class MessageProcessingResult:
     """Result of processing chat messages and applying templates.
@@ -645,3 +852,13 @@ class MessageProcessingResult:
     modalities: List[str]
     stop: List[str]
     tool_call_constraint: Optional[Any] = None
+
+
+class ResponseReasoningTextContent(BaseModel):
+    text: str
+    type: Literal["reasoning_text"] = "reasoning_text"
+
+
+ResponseInputOutputItem: TypeAlias = Union[
+    ResponseInputItemParam, "ResponseReasoningItem", ResponseFunctionToolCall
+]
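As a quick illustration of the to_sampling_params helper defined in the hunk above (values are invented for the example and are not part of the diff):

    # Uses only the ResponsesRequest model added above; numbers are made up.
    req = ResponsesRequest(input="Summarize this paragraph.", max_output_tokens=512)
    params = req.to_sampling_params(
        default_max_tokens=4096, default_params={"temperature": 0.3}
    )
    # max_new_tokens = min(512, 4096) - 1 = 511
    # temperature    = 0.3  (unset on the request, so the caller-supplied default wins)
    # top_p          = 1.0  (falls back to _DEFAULT_SAMPLING_PARAMS)
    assert params["max_new_tokens"] == 511 and params["temperature"] == 0.3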
sglang/srt/entrypoints/openai/serving_chat.py

@@ -47,7 +47,9 @@ class OpenAIServingChat(OpenAIServingBase):
     """Handler for /v1/chat/completions requests"""

     def __init__(
-        self, tokenizer_manager: TokenizerManager, template_manager: TemplateManager
+        self,
+        tokenizer_manager: TokenizerManager,
+        template_manager: TemplateManager,
     ):
         super().__init__(tokenizer_manager)
         self.template_manager = template_manager
@@ -67,6 +69,18 @@ class OpenAIServingChat(OpenAIServingBase):
         ):
             return "Tools cannot be empty if tool choice is set to required."

+        max_output_tokens = request.max_completion_tokens or request.max_tokens
+        server_context_length = self.tokenizer_manager.server_args.context_length
+        if (
+            max_output_tokens
+            and server_context_length
+            and max_output_tokens > server_context_length
+        ):
+            return (
+                f"max_completion_tokens is too large: {max_output_tokens}."
+                f"This model supports at most {server_context_length} completion tokens."
+            )
+
         return None

     def _convert_to_internal_request(
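The added validation rejects requests whose completion budget exceeds the server's configured context window. A standalone sketch of the same check, with simplified names and invented numbers:

    # Not the actual method; a self-contained rendering of the added logic.
    from typing import Optional

    def validate_output_budget(
        max_completion_tokens: Optional[int],
        max_tokens: Optional[int],
        server_context_length: Optional[int],
    ) -> Optional[str]:
        max_output_tokens = max_completion_tokens or max_tokens
        if (
            max_output_tokens
            and server_context_length
            and max_output_tokens > server_context_length
        ):
            return (
                f"max_completion_tokens is too large: {max_output_tokens}."
                f"This model supports at most {server_context_length} completion tokens."
            )
        return None

    assert validate_output_budget(20000, None, 8192) is not None  # rejected
    assert validate_output_budget(1024, None, 8192) is None       # accepted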
@@ -81,7 +95,9 @@ class OpenAIServingChat(OpenAIServingBase):

         # Build sampling parameters
         sampling_params = self._build_sampling_params(
-            request, processed_messages.stop, processed_messages.tool_call_constraint
+            request,
+            processed_messages.stop,
+            processed_messages.tool_call_constraint,
         )

         # Handle single vs multiple requests
@@ -196,14 +212,15 @@ class OpenAIServingChat(OpenAIServingBase):
                 tokenize=True,
                 add_generation_prompt=True,
                 tools=tools,
+                reasoning_effort=request.reasoning_effort,
                 **(
                     request.chat_template_kwargs if request.chat_template_kwargs else {}
                 ),
             )
         except Exception:
-            # This except branch will be triggered when the chosen model
-            # has a different tools input format that is not compatible
-            # with openAI's apply_chat_template tool_call format, like Mistral.
+            # This except branch will be triggered when the chosen model
+            # has a different tools input format that is not compatible
+            # with openAI's apply_chat_template tool_call format, like Mistral.
             tools = (
                 [t if "function" in t else {"function": t} for t in tools]
                 if tools
@@ -214,6 +231,7 @@ class OpenAIServingChat(OpenAIServingBase):
                 tokenize=True,
                 add_generation_prompt=True,
                 tools=tools,
+                reasoning_effort=request.reasoning_effort,
                 **(
                     request.chat_template_kwargs if request.chat_template_kwargs else {}
                 ),
@@ -277,6 +295,8 @@ class OpenAIServingChat(OpenAIServingBase):
             prompt = prompt[: -len(conv.sep2)]
         else:
             prompt = conv.get_prompt()
+        if self._get_enable_thinking_from_request(request):
+            prompt += "<think>"  # Note(Xinyuan): hard code thinking token

         image_data = conv.image_data if conv.image_data else None
         video_data = conv.video_data if conv.video_data else None
@@ -448,7 +468,6 @@ class OpenAIServingChat(OpenAIServingBase):
                     )
                     yield f"data: {chunk.model_dump_json()}\n\n"

-                # Process content delta
                 stream_buffer = stream_buffers.get(index, "")
                 delta = content["text"][len(stream_buffer) :]
                 stream_buffers[index] = stream_buffer + delta
@@ -502,7 +521,7 @@ class OpenAIServingChat(OpenAIServingBase):
                 if delta:
                     choice_data = ChatCompletionResponseStreamChoice(
                         index=index,
-                        delta=DeltaMessage(content=delta if delta else None),
+                        delta=DeltaMessage(content=delta),
                         finish_reason=None,
                         matched_stop=None,
                         logprobs=choice_logprobs,
@@ -645,9 +664,15 @@ class OpenAIServingChat(OpenAIServingBase):
         reasoning_text = None
         reasoning_parser = self.tokenizer_manager.server_args.reasoning_parser
         if reasoning_parser and request.separate_reasoning:
+            is_force_reasoning = (
+                self.template_manager.force_reasoning
+                or self._get_enable_thinking_from_request(request)
+            )
             try:
                 parser = ReasoningParser(
-                    model_type=reasoning_parser, stream_reasoning=False
+                    model_type=reasoning_parser,
+                    stream_reasoning=False,
+                    force_reasoning=is_force_reasoning,
                 )
                 reasoning_text, text = parser.parse_non_stream(text)
             except Exception as e:
@@ -810,14 +835,19 @@ class OpenAIServingChat(OpenAIServingBase):
     ) -> tuple[Optional[str], str]:
         """Process reasoning content in streaming response"""
         if index not in reasoning_parser_dict:
+            is_force_reasoning = (
+                self.template_manager.force_reasoning
+                or self._get_enable_thinking_from_request(request)
+            )
             reasoning_parser_dict[index] = ReasoningParser(
                 self.tokenizer_manager.server_args.reasoning_parser,
                 request.stream_reasoning,
+                is_force_reasoning,
             )
         reasoning_parser = reasoning_parser_dict[index]
         return reasoning_parser.parse_stream_chunk(delta)

-    def _get_enable_thinking_from_request(request: ChatCompletionRequest) -> bool:
+    def _get_enable_thinking_from_request(self, request: ChatCompletionRequest) -> bool:
         """Extracts the 'enable_thinking' flag from request chat_template_kwargs.

         NOTE: This parameter is only useful for models that support enable_thinking
@@ -826,7 +856,7 @@ class OpenAIServingChat(OpenAIServingBase):
         Args:
             request_obj: The request object (or an item from a list of requests).
         Returns:
-            The boolean value of 'enable_thinking' if found and not True, otherwise True.
+            The boolean value of 'enable_thinking' if found, otherwise False.
         """
         if (
             hasattr(request, "chat_template_kwargs")
@@ -834,7 +864,7 @@ class OpenAIServingChat(OpenAIServingBase):
             and request.chat_template_kwargs.get("enable_thinking") is not None
         ):
             return request.chat_template_kwargs.get("enable_thinking")
-        return True
+        return False

     async def _process_tool_call_stream(
         self,
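Behavioural note from the last two hunks: _get_enable_thinking_from_request is now an instance method and returns False when the flag is absent, so thinking must be requested explicitly through chat_template_kwargs. A hedged client sketch (the server URL and model name are placeholders; enable_thinking and separate_reasoning are fields referenced in this diff):

    # Placeholder endpoint and model; fields taken from the request schema in this diff.
    import requests

    payload = {
        "model": "my-thinking-model",  # placeholder
        "messages": [{"role": "user", "content": "Prove that sqrt(2) is irrational."}],
        "chat_template_kwargs": {"enable_thinking": True},
        "separate_reasoning": True,
    }
    r = requests.post("http://localhost:30000/v1/chat/completions", json=payload)
    print(r.json()["choices"][0]["message"].get("reasoning_content"))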