sglang 0.4.10.post1__py3-none-any.whl → 0.5.0rc0__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (143)
  1. sglang/bench_one_batch.py +113 -17
  2. sglang/compile_deep_gemm.py +8 -1
  3. sglang/global_config.py +5 -1
  4. sglang/srt/configs/model_config.py +35 -0
  5. sglang/srt/conversation.py +9 -117
  6. sglang/srt/disaggregation/base/conn.py +5 -2
  7. sglang/srt/disaggregation/decode.py +6 -1
  8. sglang/srt/disaggregation/decode_schedule_batch_mixin.py +4 -0
  9. sglang/srt/disaggregation/mooncake/conn.py +243 -135
  10. sglang/srt/disaggregation/prefill.py +3 -0
  11. sglang/srt/distributed/device_communicators/pynccl.py +7 -0
  12. sglang/srt/distributed/device_communicators/pynccl_allocator.py +133 -0
  13. sglang/srt/distributed/device_communicators/pynccl_wrapper.py +42 -3
  14. sglang/srt/distributed/parallel_state.py +22 -9
  15. sglang/srt/entrypoints/context.py +244 -0
  16. sglang/srt/entrypoints/engine.py +8 -5
  17. sglang/srt/entrypoints/harmony_utils.py +370 -0
  18. sglang/srt/entrypoints/http_server.py +106 -15
  19. sglang/srt/entrypoints/openai/protocol.py +227 -1
  20. sglang/srt/entrypoints/openai/serving_chat.py +278 -42
  21. sglang/srt/entrypoints/openai/serving_responses.py +1273 -0
  22. sglang/srt/entrypoints/openai/tool_server.py +174 -0
  23. sglang/srt/entrypoints/tool.py +87 -0
  24. sglang/srt/eplb/expert_distribution.py +4 -2
  25. sglang/srt/eplb/expert_location.py +5 -1
  26. sglang/srt/function_call/harmony_tool_parser.py +130 -0
  27. sglang/srt/hf_transformers_utils.py +55 -13
  28. sglang/srt/jinja_template_utils.py +8 -1
  29. sglang/srt/layers/attention/aiter_backend.py +5 -8
  30. sglang/srt/layers/attention/cutlass_mla_backend.py +3 -3
  31. sglang/srt/layers/attention/dual_chunk_flashattention_backend.py +1700 -0
  32. sglang/srt/layers/attention/flashattention_backend.py +7 -11
  33. sglang/srt/layers/attention/triton_backend.py +85 -14
  34. sglang/srt/layers/attention/triton_ops/decode_attention.py +17 -0
  35. sglang/srt/layers/attention/triton_ops/extend_attention.py +143 -98
  36. sglang/srt/layers/attention/trtllm_mha_backend.py +332 -0
  37. sglang/srt/layers/attention/trtllm_mla_backend.py +6 -6
  38. sglang/srt/layers/attention/vision.py +40 -15
  39. sglang/srt/layers/communicator.py +35 -8
  40. sglang/srt/layers/dp_attention.py +12 -0
  41. sglang/srt/layers/linear.py +9 -8
  42. sglang/srt/layers/logits_processor.py +9 -1
  43. sglang/srt/layers/moe/cutlass_moe.py +20 -6
  44. sglang/srt/layers/moe/ep_moe/layer.py +87 -107
  45. sglang/srt/layers/moe/fused_moe_triton/configs/triton_3_3_1/E=128,N=352,device_name=NVIDIA_RTX_6000_Ada_Generation,dtype=fp8_w8a8.json +146 -0
  46. sglang/srt/layers/moe/fused_moe_triton/fused_moe.py +101 -12
  47. sglang/srt/layers/moe/fused_moe_triton/layer.py +442 -58
  48. sglang/srt/layers/moe/fused_moe_triton/triton_kernels_moe.py +169 -15
  49. sglang/srt/layers/moe/token_dispatcher/__init__.py +23 -0
  50. sglang/srt/layers/moe/token_dispatcher/base_dispatcher.py +12 -1
  51. sglang/srt/layers/moe/{ep_moe/token_dispatcher.py → token_dispatcher/deepep.py} +8 -15
  52. sglang/srt/layers/moe/topk.py +12 -3
  53. sglang/srt/layers/moe/utils.py +59 -0
  54. sglang/srt/layers/quantization/__init__.py +22 -0
  55. sglang/srt/layers/quantization/compressed_tensors/compressed_tensors_moe.py +3 -2
  56. sglang/srt/layers/quantization/deep_gemm_wrapper/compile_utils.py +1 -1
  57. sglang/srt/layers/quantization/fp4.py +557 -0
  58. sglang/srt/layers/quantization/fp8.py +8 -7
  59. sglang/srt/layers/quantization/fp8_kernel.py +0 -4
  60. sglang/srt/layers/quantization/fp8_utils.py +29 -0
  61. sglang/srt/layers/quantization/modelopt_quant.py +259 -64
  62. sglang/srt/layers/quantization/mxfp4.py +651 -0
  63. sglang/srt/layers/quantization/mxfp4_tensor.py +133 -0
  64. sglang/srt/layers/quantization/quark/__init__.py +0 -0
  65. sglang/srt/layers/quantization/quark/schemes/__init__.py +6 -0
  66. sglang/srt/layers/quantization/quark/schemes/quark_scheme.py +55 -0
  67. sglang/srt/layers/quantization/quark/schemes/quark_w4a4_mxfp4.py +118 -0
  68. sglang/srt/layers/quantization/quark/utils.py +107 -0
  69. sglang/srt/layers/quantization/unquant.py +60 -6
  70. sglang/srt/layers/quantization/w4afp8.py +1 -1
  71. sglang/srt/layers/rotary_embedding.py +225 -1
  72. sglang/srt/layers/utils.py +9 -0
  73. sglang/srt/layers/vocab_parallel_embedding.py +15 -4
  74. sglang/srt/lora/lora_manager.py +70 -14
  75. sglang/srt/lora/lora_registry.py +10 -2
  76. sglang/srt/lora/mem_pool.py +43 -5
  77. sglang/srt/managers/cache_controller.py +61 -32
  78. sglang/srt/managers/data_parallel_controller.py +52 -2
  79. sglang/srt/managers/detokenizer_manager.py +1 -1
  80. sglang/srt/managers/io_struct.py +21 -4
  81. sglang/srt/managers/mm_utils.py +5 -11
  82. sglang/srt/managers/schedule_batch.py +30 -8
  83. sglang/srt/managers/schedule_policy.py +3 -1
  84. sglang/srt/managers/scheduler.py +170 -18
  85. sglang/srt/managers/scheduler_output_processor_mixin.py +1 -2
  86. sglang/srt/managers/scheduler_recv_skipper.py +37 -0
  87. sglang/srt/managers/scheduler_update_weights_mixin.py +6 -0
  88. sglang/srt/managers/template_manager.py +59 -22
  89. sglang/srt/managers/tokenizer_manager.py +137 -67
  90. sglang/srt/managers/tp_worker.py +3 -0
  91. sglang/srt/managers/tp_worker_overlap_thread.py +3 -0
  92. sglang/srt/managers/utils.py +45 -1
  93. sglang/srt/mem_cache/cpp_radix_tree/radix_tree.py +182 -0
  94. sglang/srt/mem_cache/hicache_storage.py +13 -21
  95. sglang/srt/mem_cache/hiradix_cache.py +53 -5
  96. sglang/srt/mem_cache/memory_pool_host.py +1 -1
  97. sglang/srt/mem_cache/multimodal_cache.py +33 -13
  98. sglang/srt/mem_cache/radix_cache_cpp.py +229 -0
  99. sglang/srt/mem_cache/storage/hf3fs/client_hf3fs.py +2 -2
  100. sglang/srt/mem_cache/storage/hf3fs/hf3fs_utils.cpp +35 -0
  101. sglang/srt/model_executor/cuda_graph_runner.py +24 -9
  102. sglang/srt/model_executor/forward_batch_info.py +48 -17
  103. sglang/srt/model_executor/model_runner.py +24 -2
  104. sglang/srt/model_loader/weight_utils.py +10 -0
  105. sglang/srt/models/bailing_moe.py +425 -0
  106. sglang/srt/models/deepseek_v2.py +95 -50
  107. sglang/srt/models/ernie4.py +426 -0
  108. sglang/srt/models/ernie4_eagle.py +203 -0
  109. sglang/srt/models/gemma3n_mm.py +39 -0
  110. sglang/srt/models/glm4_moe.py +102 -27
  111. sglang/srt/models/gpt_oss.py +1134 -0
  112. sglang/srt/models/grok.py +3 -3
  113. sglang/srt/models/llama4.py +13 -2
  114. sglang/srt/models/mixtral.py +3 -3
  115. sglang/srt/models/mllama4.py +428 -19
  116. sglang/srt/models/qwen2.py +6 -0
  117. sglang/srt/models/qwen2_moe.py +7 -4
  118. sglang/srt/models/qwen3_moe.py +39 -14
  119. sglang/srt/models/step3_vl.py +10 -1
  120. sglang/srt/models/transformers.py +2 -5
  121. sglang/srt/multimodal/processors/base_processor.py +4 -3
  122. sglang/srt/multimodal/processors/gemma3n.py +0 -7
  123. sglang/srt/multimodal/processors/step3_vl.py +3 -1
  124. sglang/srt/operations_strategy.py +1 -1
  125. sglang/srt/reasoning_parser.py +18 -39
  126. sglang/srt/server_args.py +218 -23
  127. sglang/srt/speculative/eagle_draft_extend_cuda_graph_runner.py +18 -0
  128. sglang/srt/two_batch_overlap.py +163 -9
  129. sglang/srt/utils.py +41 -26
  130. sglang/srt/weight_sync/utils.py +1 -1
  131. sglang/test/runners.py +4 -4
  132. sglang/test/test_utils.py +4 -4
  133. sglang/version.py +1 -1
  134. {sglang-0.4.10.post1.dist-info → sglang-0.5.0rc0.dist-info}/METADATA +18 -15
  135. {sglang-0.4.10.post1.dist-info → sglang-0.5.0rc0.dist-info}/RECORD +143 -116
  136. /sglang/srt/mem_cache/{mooncake_store → storage/mooncake_store}/mooncake_store.py +0 -0
  137. /sglang/srt/mem_cache/{mooncake_store → storage/mooncake_store}/unit_test.py +0 -0
  138. /sglang/srt/mem_cache/{nixl → storage/nixl}/hicache_nixl.py +0 -0
  139. /sglang/srt/mem_cache/{nixl → storage/nixl}/nixl_utils.py +0 -0
  140. /sglang/srt/mem_cache/{nixl → storage/nixl}/test_hicache_nixl_storage.py +0 -0
  141. {sglang-0.4.10.post1.dist-info → sglang-0.5.0rc0.dist-info}/WHEEL +0 -0
  142. {sglang-0.4.10.post1.dist-info → sglang-0.5.0rc0.dist-info}/licenses/LICENSE +0 -0
  143. {sglang-0.4.10.post1.dist-info → sglang-0.5.0rc0.dist-info}/top_level.txt +0 -0
sglang/srt/entrypoints/openai/protocol.py
@@ -14,9 +14,18 @@
 """Pydantic models for OpenAI API protocol"""
 
 import time
+import uuid
 from dataclasses import dataclass
-from typing import Any, Dict, List, Optional, Union
+from typing import Any, Dict, List, Optional, TypeAlias, Union
 
+from openai.types.responses import (
+    ResponseFunctionToolCall,
+    ResponseInputItemParam,
+    ResponseOutputItem,
+    ResponseReasoningItem,
+)
+from openai.types.responses.response import ToolChoice
+from openai.types.responses.tool import Tool
 from pydantic import (
     BaseModel,
     Field,
@@ -84,6 +93,7 @@ class UsageInfo(BaseModel):
     completion_tokens: Optional[int] = 0
     # only used to return cached tokens when --enable-cache-report is set
     prompt_tokens_details: Optional[Dict[str, int]] = None
+    reasoning_tokens: Optional[int] = 0
 
 
 class StreamOptions(BaseModel):
@@ -428,6 +438,13 @@ class ChatCompletionRequest(BaseModel):
         default="auto", examples=["none"]
     )  # noqa
     return_hidden_states: bool = False
+    reasoning_effort: Optional[Literal["low", "medium", "high"]] = Field(
+        default="medium",
+        description="Constrains effort on reasoning for reasoning models. "
+        "'low' is the least effort, 'high' is the most effort. Reducing reasoning effort can "
+        "result in faster responses and fewer tokens used on reasoning in a response. "
+        "Currently only supported for OpenAI models.",
+    )
 
     @model_validator(mode="before")
     @classmethod
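
For illustration, the new reasoning_effort field can be exercised from any OpenAI-compatible client. A minimal sketch (the server URL and model name are illustrative; the field is sent via extra_body so it reaches the request body regardless of client version):

    # Sketch: passing reasoning_effort to /v1/chat/completions on a local SGLang server.
    from openai import OpenAI

    client = OpenAI(base_url="http://localhost:30000/v1", api_key="EMPTY")
    resp = client.chat.completions.create(
        model="openai/gpt-oss-20b",  # illustrative model name
        messages=[{"role": "user", "content": "Explain KV cache in one sentence."}],
        extra_body={"reasoning_effort": "low"},  # "low" | "medium" | "high"
    )
    print(resp.choices[0].message.content)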
@@ -619,6 +636,196 @@ OpenAIServingRequest = Union[
 ]
 
 
+# Response API protocol definitions
+class ResponseReasoningParam(BaseModel):
+    """Reasoning parameters for responses."""
+
+    effort: Optional[Literal["low", "medium", "high"]] = Field(
+        default="medium",
+        description="Constrains effort on reasoning for reasoning models.",
+    )
+
+
+class ResponseTool(BaseModel):
+    """Tool definition for responses."""
+
+    type: Literal["web_search_preview", "code_interpreter"] = Field(
+        description="Type of tool to enable"
+    )
+
+
+ResponseInputOutputItem: TypeAlias = Union[
+    ResponseInputItemParam,
+    "ResponseReasoningItem",
+    ResponseFunctionToolCall,
+]
+
+
+class ResponsesRequest(BaseModel):
+    """Request body for v1/responses endpoint."""
+
+    # Core OpenAI API fields (ordered by official documentation)
+    background: Optional[bool] = False
+    include: Optional[
+        List[
+            Literal[
+                "code_interpreter_call.outputs",
+                "computer_call_output.output.image_url",
+                "file_search_call.results",
+                "message.input_image.image_url",
+                "message.output_text.logprobs",
+                "reasoning.encrypted_content",
+            ]
+        ]
+    ] = None
+    input: Union[str, List[ResponseInputOutputItem]]
+    instructions: Optional[str] = None
+    max_output_tokens: Optional[int] = None
+    max_tool_calls: Optional[int] = None
+    metadata: Optional[Dict[str, Any]] = None
+    model: Optional[str] = None  # Made optional to match vLLM
+    parallel_tool_calls: Optional[bool] = True
+    previous_response_id: Optional[str] = None
+    reasoning: Optional[ResponseReasoningParam] = None
+    service_tier: Literal["auto", "default", "flex", "scale", "priority"] = "auto"
+    store: Optional[bool] = True
+    stream: Optional[bool] = False
+    temperature: Optional[float] = None
+    tool_choice: Literal["auto", "required", "none"] = "auto"
+    tools: List[ResponseTool] = Field(default_factory=list)
+    top_logprobs: Optional[int] = 0
+    top_p: Optional[float] = None
+    truncation: Optional[Literal["auto", "disabled"]] = "disabled"
+    user: Optional[str] = None
+
+    # Extra SGLang parameters
+    request_id: str = Field(
+        default_factory=lambda: f"resp_{uuid.uuid4().hex}",
+        description="The request_id related to this request. If the caller does not set it, a random uuid will be generated.",
+    )
+    priority: int = Field(default=0, description="Request priority")
+
+    # SGLang-specific sampling parameters
+    frequency_penalty: float = 0.0
+    presence_penalty: float = 0.0
+    stop: Optional[Union[str, List[str]]] = None
+    top_k: int = -1
+    min_p: float = 0.0
+    repetition_penalty: float = 1.0
+
+    # Default sampling parameters
+    _DEFAULT_SAMPLING_PARAMS = {
+        "temperature": 0.7,
+        "top_p": 1.0,
+        "top_k": -1,
+        "min_p": 0.0,
+        "repetition_penalty": 1.0,
+    }
+
+    def to_sampling_params(
+        self, default_max_tokens: int, default_params: Optional[Dict] = None
+    ) -> Dict[str, Any]:
+        """Convert to sampling parameters for generation."""
+        if default_params is None:
+            default_params = {}
+
+        # Use max_output_tokens if available, otherwise use max_tokens for backwards compatibility
+        if self.max_output_tokens is not None:
+            max_tokens = min(self.max_output_tokens, default_max_tokens)
+        else:
+            max_tokens = default_max_tokens
+
+        # Avoid exceed the context length by minus 1 token
+        max_tokens -= 1
+
+        # Get parameters with defaults
+        temperature = self.temperature
+        if temperature is None:
+            temperature = default_params.get(
+                "temperature", self._DEFAULT_SAMPLING_PARAMS["temperature"]
+            )
+
+        top_p = self.top_p
+        if top_p is None:
+            top_p = default_params.get("top_p", self._DEFAULT_SAMPLING_PARAMS["top_p"])
+
+        params = {
+            "max_new_tokens": max_tokens,
+            "temperature": temperature,
+            "top_p": top_p,
+            "frequency_penalty": self.frequency_penalty,
+            "presence_penalty": self.presence_penalty,
+            "stop": self.stop,
+            "top_k": self.top_k,
+            "min_p": self.min_p,
+            "repetition_penalty": self.repetition_penalty,
+        }
+
+        # Apply any additional default parameters
+        for key, value in default_params.items():
+            if key not in params or params[key] is None:
+                params[key] = value
+
+        return params
+
+
+class PromptTokenUsageInfo(BaseModel):
+    """Prompt token usage details."""
+
+    cached_tokens: int = 0
+
+
+class ResponsesResponse(BaseModel):
+    """Response body for v1/responses endpoint."""
+
+    id: str = Field(default_factory=lambda: f"resp_{time.time()}")
+    object: Literal["response"] = "response"
+    created_at: int = Field(default_factory=lambda: int(time.time()))
+    model: str
+
+    output: List[
+        Union[ResponseOutputItem, ResponseReasoningItem, ResponseFunctionToolCall]
+    ] = Field(default_factory=list)
+    status: Literal["queued", "in_progress", "completed", "failed", "cancelled"]
+    usage: Optional[UsageInfo] = None
+    parallel_tool_calls: bool = True
+    tool_choice: str = "auto"
+    tools: List[ResponseTool] = Field(default_factory=list)
+
+    @classmethod
+    def from_request(
+        cls,
+        request: ResponsesRequest,
+        sampling_params: Any,
+        model_name: str,
+        created_time: int,
+        output: List[
+            Union[ResponseOutputItem, ResponseReasoningItem, ResponseFunctionToolCall]
+        ],
+        status: str,
+        usage: Optional[UsageInfo],
+    ) -> "ResponsesResponse":
+        """Create a response from a request."""
+        return cls(
+            id=request.request_id,
+            created_at=created_time,
+            model=model_name,
+            output=output,
+            status=status,
+            usage=usage,
+            parallel_tool_calls=request.parallel_tool_calls or True,
+            tool_choice=request.tool_choice,
+            tools=request.tools,
+        )
+
+
+class RequestResponseMetadata(BaseModel):
+    """Metadata for request/response tracking."""
+
+    request_id: str
+    final_usage_info: Optional[UsageInfo] = None
+
+
 @dataclass
 class MessageProcessingResult:
     """Result of processing chat messages and applying templates.
@@ -645,3 +852,22 @@ class MessageProcessingResult:
     modalities: List[str]
     stop: List[str]
     tool_call_constraint: Optional[Any] = None
+
+
+class ResponseReasoningTextContent(BaseModel):
+    text: str
+    type: Literal["reasoning_text"] = "reasoning_text"
+
+
+class ResponseReasoningItem(BaseModel):
+    id: str
+    content: list[ResponseReasoningTextContent] = Field(default_factory=list)
+    summary: list = Field(default_factory=list)
+    type: Literal["reasoning"] = "reasoning"
+    encrypted_content: Optional[str] = None
+    status: Optional[Literal["in_progress", "completed", "incomplete"]]
+
+
+ResponseInputOutputItem: TypeAlias = Union[
+    ResponseInputItemParam, "ResponseReasoningItem", ResponseFunctionToolCall
+]
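
As a quick illustration of the default resolution in ResponsesRequest.to_sampling_params above (a sketch based only on the code in this hunk; default_max_tokens=512 is an arbitrary value):

    # Sketch: how to_sampling_params resolves unset sampling fields.
    req = ResponsesRequest(input="Hello", max_output_tokens=128)  # temperature/top_p left unset
    params = req.to_sampling_params(default_max_tokens=512)

    assert params["max_new_tokens"] == 127  # min(128, 512) - 1 (context-length safety margin)
    assert params["temperature"] == 0.7     # falls back to _DEFAULT_SAMPLING_PARAMS
    assert params["top_p"] == 1.0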
sglang/srt/entrypoints/openai/serving_chat.py
@@ -7,8 +7,18 @@ from typing import Any, AsyncGenerator, Dict, List, Optional, Union
 
 from fastapi import Request
 from fastapi.responses import ORJSONResponse, StreamingResponse
+from openai_harmony import Message as OpenAIMessage
 
 from sglang.srt.conversation import generate_chat_conv
+from sglang.srt.entrypoints.harmony_utils import (
+    get_developer_message,
+    get_stop_tokens_for_assistant_actions,
+    get_streamable_parser_for_assistant,
+    get_system_message,
+    parse_chat_input,
+    parse_output_into_messages,
+    render_for_completion,
+)
 from sglang.srt.entrypoints.openai.protocol import (
     ChatCompletionRequest,
     ChatCompletionResponse,
@@ -51,6 +61,26 @@ class OpenAIServingChat(OpenAIServingBase):
     ):
         super().__init__(tokenizer_manager)
         self.template_manager = template_manager
+        self.use_harmony = (
+            self.tokenizer_manager.model_config.hf_config.model_type == "gpt_oss"
+        )
+
+        if self.use_harmony:
+            from sglang.srt.function_call.harmony_tool_parser import (
+                HarmonyToolCallParser,
+            )
+
+            self.harmony_tool_parser = HarmonyToolCallParser()
+
+        # NOTE While OpenAI's chat completion API supports browsing
+        # for some models, currently vLLM doesn't support it. Please use the
+        # Responses API instead.
+        self.supports_browsing = False
+        self.browser_tool = None
+        # NOTE: Chat completion API does not support code interpreter.
+        # Please use the Responses API instead.
+        self.supports_code_interpreter = False
+        self.python_tool = None
 
     def _request_id_prefix(self) -> str:
         return "chatcmpl-"
@@ -77,41 +107,66 @@ class OpenAIServingChat(OpenAIServingBase):
         is_multimodal = self.tokenizer_manager.model_config.is_multimodal
 
         # Process messages and apply chat template
-        processed_messages = self._process_messages(request, is_multimodal)
-
-        # Build sampling parameters
-        sampling_params = self._build_sampling_params(
-            request, processed_messages.stop, processed_messages.tool_call_constraint
-        )
+        if not self.use_harmony:
+            processed_messages = self._process_messages(request, is_multimodal)
+
+            # Build sampling parameters
+            sampling_params = self._build_sampling_params(
+                request,
+                processed_messages.stop,
+                processed_messages.tool_call_constraint,
+            )
 
-        # Handle single vs multiple requests
-        if is_multimodal:
-            prompt_kwargs = {"text": processed_messages.prompt}
-        else:
-            if isinstance(processed_messages.prompt_ids, str):
-                prompt_kwargs = {"text": processed_messages.prompt_ids}
+            # Handle single vs multiple requests
+            if is_multimodal:
+                prompt_kwargs = {"text": processed_messages.prompt}
             else:
-                prompt_kwargs = {"input_ids": processed_messages.prompt_ids}
-
-        adapted_request = GenerateReqInput(
-            **prompt_kwargs,
-            image_data=processed_messages.image_data,
-            video_data=processed_messages.video_data,
-            audio_data=processed_messages.audio_data,
-            sampling_params=sampling_params,
-            return_logprob=request.logprobs,
-            logprob_start_len=-1,
-            top_logprobs_num=request.top_logprobs or 0,
-            stream=request.stream,
-            return_text_in_logprobs=True,
-            modalities=processed_messages.modalities,
-            lora_path=request.lora_path,
-            bootstrap_host=request.bootstrap_host,
-            bootstrap_port=request.bootstrap_port,
-            bootstrap_room=request.bootstrap_room,
-            return_hidden_states=request.return_hidden_states,
-            rid=request.rid,
-        )
+                if isinstance(processed_messages.prompt_ids, str):
+                    prompt_kwargs = {"text": processed_messages.prompt_ids}
+                else:
+                    prompt_kwargs = {"input_ids": processed_messages.prompt_ids}
+
+            adapted_request = GenerateReqInput(
+                **prompt_kwargs,
+                image_data=processed_messages.image_data,
+                video_data=processed_messages.video_data,
+                audio_data=processed_messages.audio_data,
+                sampling_params=sampling_params,
+                return_logprob=request.logprobs,
+                logprob_start_len=-1,
+                top_logprobs_num=request.top_logprobs or 0,
+                stream=request.stream,
+                return_text_in_logprobs=True,
+                modalities=processed_messages.modalities,
+                lora_path=request.lora_path,
+                bootstrap_host=request.bootstrap_host,
+                bootstrap_port=request.bootstrap_port,
+                bootstrap_room=request.bootstrap_room,
+                return_hidden_states=request.return_hidden_states,
+                rid=request.rid,
+            )
+        else:
+            processed_messages, prompt_ids = self._make_request_with_harmony(request)
+
+            adapted_request = GenerateReqInput(
+                input_ids=prompt_ids,
+                sampling_params=self._build_sampling_params(
+                    request,
+                    request.stop,
+                    tool_call_constraint=None,
+                ),
+                stream=request.stream,
+                return_logprob=request.logprobs,
+                logprob_start_len=-1,
+                top_logprobs_num=request.top_logprobs or 0,
+                return_text_in_logprobs=True,
+                lora_path=request.lora_path,
+                bootstrap_host=request.bootstrap_host,
+                bootstrap_port=request.bootstrap_port,
+                bootstrap_room=request.bootstrap_room,
+                return_hidden_states=request.return_hidden_states,
+                rid=request.rid,
+            )
 
         return adapted_request, request
 
@@ -277,6 +332,8 @@ class OpenAIServingChat(OpenAIServingBase):
             prompt = prompt[: -len(conv.sep2)]
         else:
            prompt = conv.get_prompt()
+        if self._get_enable_thinking_from_request(request):
+            prompt += "<think>"  # Note(Xinyuan): hard code thinking token
 
         image_data = conv.image_data if conv.image_data else None
         video_data = conv.video_data if conv.video_data else None
@@ -402,6 +459,12 @@
         cached_tokens = {}
         hidden_states = {}
 
+        # Harmony tracking
+        if self.use_harmony:
+            harmony_parsers = [
+                get_streamable_parser_for_assistant() for _ in range(request.n)
+            ]
+
         try:
             async for content in self.tokenizer_manager.generate_request(
                 adapted_request, raw_request
@@ -449,14 +512,57 @@
                     yield f"data: {chunk.model_dump_json()}\n\n"
 
                 # Process content delta
-                stream_buffer = stream_buffers.get(index, "")
-                delta = content["text"][len(stream_buffer) :]
-                stream_buffers[index] = stream_buffer + delta
+                if self.use_harmony:
+                    harmony_parser = harmony_parsers[index]
+
+                    new_token_ids = content["output_ids"]
+                    for token_id in new_token_ids:
+                        harmony_parser.process(token_id)
+
+                    is_final = harmony_parser.current_channel == "final"
+                    is_analysis = harmony_parser.current_channel == "analysis"
+                    delta = harmony_parser.last_content_delta or ""
+
+                    if is_analysis:
+                        choice_data = ChatCompletionResponseStreamChoice(
+                            index=index,
+                            delta=DeltaMessage(reasoning_content=delta),
+                            finish_reason=None,
+                        )
+                        chunk = ChatCompletionStreamResponse(
+                            id=content["meta_info"]["id"],
+                            created=int(time.time()),
+                            choices=[choice_data],
+                            model=request.model,
+                        )
+                        yield f"data: {chunk.model_dump_json()}\n\n"
+                        continue
+
+                    choice_data = ChatCompletionResponseStreamChoice(
+                        index=index,
+                        delta=DeltaMessage(content=delta if delta else None),
+                        finish_reason=None,
+                        matched_stop=None,
+                        logprobs=choice_logprobs,
+                    )
+                    chunk = ChatCompletionStreamResponse(
+                        id=content["meta_info"]["id"],
+                        created=int(time.time()),
+                        choices=[choice_data],
+                        model=request.model,
+                    )
+                    yield f"data: {chunk.model_dump_json()}\n\n"
+                    continue
+                else:
+                    stream_buffer = stream_buffers.get(index, "")
+                    delta = content["text"][len(stream_buffer) :]
+                    stream_buffers[index] = stream_buffer + delta
 
                 # Handle reasoning content
                 if (
                     self.tokenizer_manager.server_args.reasoning_parser
                     and request.separate_reasoning
+                    and not self.use_harmony
                 ):
                     reasoning_text, delta = self._process_reasoning_stream(
                         index, delta, reasoning_parser_dict, content, request
@@ -475,8 +581,27 @@
                     )
                     yield f"data: {chunk.model_dump_json()}\n\n"
 
+                if self.use_harmony and not is_final:
+                    choice_data = ChatCompletionResponseStreamChoice(
+                        index=index,
+                        delta=DeltaMessage(reasoning_content=delta),
+                        finish_reason=None,
+                    )
+                    chunk = ChatCompletionStreamResponse(
+                        id=content["meta_info"]["id"],
+                        created=int(time.time()),
+                        choices=[choice_data],
+                        model=request.model,
+                    )
+                    yield f"data: {chunk.model_dump_json()}\n\n"
+
                 # Handle tool calls
-                if request.tool_choice != "none" and request.tools:
+                # TODO: support tool call parsing for harmony
+                if (
+                    request.tool_choice != "none"
+                    and request.tools
+                    and not self.use_harmony
+                ):
                     async for chunk in self._process_tool_call_stream(
                         index,
                         delta,
@@ -502,7 +627,7 @@
                 if delta:
                     choice_data = ChatCompletionResponseStreamChoice(
                         index=index,
-                        delta=DeltaMessage(content=delta if delta else None),
+                        delta=DeltaMessage(content=delta),
                         finish_reason=None,
                         matched_stop=None,
                         logprobs=choice_logprobs,
@@ -640,14 +765,90 @@
 
             finish_reason = ret_item["meta_info"]["finish_reason"]
             text = ret_item["text"]
+            output_ids = ret_item["output_ids"]
+
+            if self.use_harmony:
+                parser = parse_output_into_messages(output_ids)
+                output_msgs = parser.messages
+                if len(output_msgs) == 0:
+                    # The generation has stopped during reasoning.
+                    is_tool_call = False
+                    reasoning_content = parser.current_content
+                    final_content = None
+                elif len(output_msgs) == 1:
+                    # The generation has stopped during final message.
+                    is_tool_call = False
+                    reasoning_content = output_msgs[0].content[0].text
+                    final_content = parser.current_content
+                else:
+                    if len(output_msgs) != 2:
+                        raise ValueError(
+                            "Expected 2 output messages (reasoning and final), "
+                            f"but got {len(output_msgs)}."
+                        )
+                    reasoning_msg, final_msg = output_msgs
+                    reasoning_content = reasoning_msg.content[0].text
+                    final_content = final_msg.content[0].text
+                    is_tool_call = final_msg.recipient is not None
+
+                if is_tool_call:
+                    # Extract tool call information from final message
+                    tool_call = (
+                        self.harmony_tool_parser.extract_tool_calls_from_message(
+                            final_msg
+                        )
+                    )
+                    tool_calls = [tool_call] if tool_call else []
+
+                    message = ChatMessage(
+                        role="assistant",
+                        reasoning_content=reasoning_content,
+                        content=None,  # Tool calls don't have regular content
+                        tool_calls=tool_calls,
+                    )
+                else:
+                    # Normal message
+                    message = ChatMessage(
+                        role="assistant",
+                        reasoning_content=reasoning_content,
+                        content=final_content,
+                    )
+
+                if is_tool_call:
+                    finish_reason_type = "tool_calls"
+                elif finish_reason:
+                    finish_reason_type = (
+                        finish_reason["type"] if finish_reason else "stop"
+                    )
+                else:
+                    finish_reason_type = "stop"
+                choice_data = ChatCompletionResponseChoice(
+                    index=idx,
+                    message=message,
+                    logprobs=choice_logprobs,
+                    finish_reason=finish_reason_type,
+                    matched_stop=(
+                        finish_reason["matched"]
+                        if finish_reason and "matched" in finish_reason
+                        else None
+                    ),
+                )
+                choices.append(choice_data)
+                continue
 
             # Handle reasoning content
             reasoning_text = None
             reasoning_parser = self.tokenizer_manager.server_args.reasoning_parser
             if reasoning_parser and request.separate_reasoning:
+                is_force_reasoning = (
+                    self.template_manager.force_reasoning
+                    or self._get_enable_thinking_from_request(request)
+                )
                 try:
                     parser = ReasoningParser(
-                        model_type=reasoning_parser, stream_reasoning=False
+                        model_type=reasoning_parser,
+                        stream_reasoning=False,
+                        force_reasoning=is_force_reasoning,
                     )
                     reasoning_text, text = parser.parse_non_stream(text)
                 except Exception as e:
@@ -810,14 +1011,19 @@
     ) -> tuple[Optional[str], str]:
         """Process reasoning content in streaming response"""
         if index not in reasoning_parser_dict:
+            is_force_reasoning = (
+                self.template_manager.force_reasoning
+                or self._get_enable_thinking_from_request(request)
+            )
             reasoning_parser_dict[index] = ReasoningParser(
                 self.tokenizer_manager.server_args.reasoning_parser,
                 request.stream_reasoning,
+                is_force_reasoning,
             )
         reasoning_parser = reasoning_parser_dict[index]
         return reasoning_parser.parse_stream_chunk(delta)
 
-    def _get_enable_thinking_from_request(request: ChatCompletionRequest) -> bool:
+    def _get_enable_thinking_from_request(self, request: ChatCompletionRequest) -> bool:
         """Extracts the 'enable_thinking' flag from request chat_template_kwargs.
 
         NOTE: This parameter is only useful for models that support enable_thinking
@@ -826,7 +1032,7 @@ class OpenAIServingChat(OpenAIServingBase):
        Args:
            request_obj: The request object (or an item from a list of requests).
        Returns:
-           The boolean value of 'enable_thinking' if found and not True, otherwise True.
+           The boolean value of 'enable_thinking' if found, otherwise False.
        """
        if (
            hasattr(request, "chat_template_kwargs")
@@ -834,7 +1040,7 @@ class OpenAIServingChat(OpenAIServingBase):
            and request.chat_template_kwargs.get("enable_thinking") is not None
        ):
            return request.chat_template_kwargs.get("enable_thinking")
-        return True
+        return False
 
     async def _process_tool_call_stream(
         self,
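
The enable_thinking flag handled above is supplied by clients through chat_template_kwargs; with this change it defaults to False when absent. A hedged client-side sketch (server URL and model name are illustrative):

    # Sketch: opting into "thinking" via chat_template_kwargs.
    from openai import OpenAI

    client = OpenAI(base_url="http://localhost:30000/v1", api_key="EMPTY")
    resp = client.chat.completions.create(
        model="Qwen/Qwen3-8B",
        messages=[{"role": "user", "content": "Why is the sky blue?"}],
        extra_body={"chat_template_kwargs": {"enable_thinking": True}},
    )
    # With a reasoning parser configured server-side, the reasoning text is returned
    # separately from the final answer (reasoning_content vs. content).
    print(resp.choices[0].message)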
@@ -978,3 +1184,33 @@ class OpenAIServingChat(OpenAIServingBase):
             return f"data: {chunk.model_dump_json()}\n\n"
 
         return None
+
+    def _make_request_with_harmony(
+        self,
+        request: ChatCompletionRequest,
+    ):
+        messages: list[OpenAIMessage] = []
+
+        # Add system message.
+        # In Chat Completion API, browsing is enabled by default if the model
+        # supports it.
+        assert not self.supports_browsing
+        assert not self.supports_code_interpreter
+        sys_msg = get_system_message(
+            reasoning_effort=request.reasoning_effort,
+            browser_description=None,
+            python_description=None,
+        )
+        messages.append(sys_msg)
+
+        # Add developer message.
+        dev_msg = get_developer_message()
+        messages.append(dev_msg)
+
+        # Add user message.
+        for chat_msg in request.messages:
+            messages.append(parse_chat_input(chat_msg))
+
+        # Render prompt token ids.
+        prompt_token_ids = render_for_completion(messages)
+        return messages, prompt_token_ids
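
For gpt-oss models the request path above bypasses the chat template and renders Harmony-format token ids directly. A condensed sketch of that flow, using only the helper names and arguments visible in these hunks (build_harmony_prompt is a hypothetical wrapper; exact signatures beyond what the diff shows are not guaranteed):

    # Sketch of the Harmony request path: build messages, render to token ids,
    # and pass input_ids straight to GenerateReqInput (as done above).
    from sglang.srt.entrypoints.harmony_utils import (
        get_developer_message,
        get_system_message,
        parse_chat_input,
        render_for_completion,
    )

    def build_harmony_prompt(chat_messages, reasoning_effort="medium"):
        messages = [
            get_system_message(
                reasoning_effort=reasoning_effort,
                browser_description=None,  # browsing disabled for Chat Completions
                python_description=None,   # code interpreter disabled for Chat Completions
            ),
            get_developer_message(),
        ]
        messages.extend(parse_chat_input(m) for m in chat_messages)
        return render_for_completion(messages)  # list of prompt token ids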