sglang 0.4.10.post2__py3-none-any.whl → 0.5.0rc1__py3-none-any.whl

This diff compares the contents of two publicly available package versions released to one of the supported registries. It is provided for informational purposes only and reflects the changes between the versions exactly as they appear in their public registries.
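The same comparison can be reproduced locally by downloading both wheels and diffing their unpacked contents. A minimal sketch (assumes `pip` and a `diff` binary are on PATH; the rc wheel resolves because the pre-release version is pinned explicitly):

    # reproduce_diff.py: download both wheels, unpack them, and diff the trees
    import pathlib
    import subprocess
    import zipfile

    for ver, dest in [("0.4.10.post2", "old"), ("0.5.0rc1", "new")]:
        subprocess.run(
            ["pip", "download", f"sglang=={ver}", "--no-deps", "-d", dest],
            check=True,
        )
        whl = next(pathlib.Path(dest).glob("*.whl"))
        zipfile.ZipFile(whl).extractall(f"{dest}/src")

    subprocess.run(["diff", "-ruN", "old/src", "new/src"])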
Files changed (175)
  1. sglang/__init__.py +8 -3
  2. sglang/bench_one_batch.py +119 -17
  3. sglang/lang/chat_template.py +18 -0
  4. sglang/srt/bench_utils.py +137 -0
  5. sglang/srt/configs/model_config.py +42 -7
  6. sglang/srt/conversation.py +9 -5
  7. sglang/srt/disaggregation/base/conn.py +5 -2
  8. sglang/srt/disaggregation/decode.py +14 -4
  9. sglang/srt/disaggregation/decode_schedule_batch_mixin.py +3 -0
  10. sglang/srt/disaggregation/mooncake/conn.py +286 -160
  11. sglang/srt/disaggregation/mooncake/transfer_engine.py +29 -0
  12. sglang/srt/disaggregation/prefill.py +2 -0
  13. sglang/srt/distributed/parallel_state.py +15 -11
  14. sglang/srt/entrypoints/context.py +227 -0
  15. sglang/srt/entrypoints/engine.py +15 -9
  16. sglang/srt/entrypoints/harmony_utils.py +372 -0
  17. sglang/srt/entrypoints/http_server.py +74 -4
  18. sglang/srt/entrypoints/openai/protocol.py +218 -1
  19. sglang/srt/entrypoints/openai/serving_chat.py +41 -11
  20. sglang/srt/entrypoints/openai/serving_responses.py +1273 -0
  21. sglang/srt/entrypoints/openai/tool_server.py +175 -0
  22. sglang/srt/entrypoints/tool.py +87 -0
  23. sglang/srt/eplb/expert_location.py +5 -1
  24. sglang/srt/function_call/ebnf_composer.py +1 -0
  25. sglang/srt/function_call/function_call_parser.py +2 -0
  26. sglang/srt/function_call/glm4_moe_detector.py +1 -1
  27. sglang/srt/function_call/gpt_oss_detector.py +331 -0
  28. sglang/srt/function_call/kimik2_detector.py +3 -3
  29. sglang/srt/function_call/qwen3_coder_detector.py +219 -9
  30. sglang/srt/hf_transformers_utils.py +30 -3
  31. sglang/srt/jinja_template_utils.py +14 -1
  32. sglang/srt/layers/attention/aiter_backend.py +375 -115
  33. sglang/srt/layers/attention/ascend_backend.py +3 -0
  34. sglang/srt/layers/attention/dual_chunk_flashattention_backend.py +1700 -0
  35. sglang/srt/layers/attention/flashattention_backend.py +18 -0
  36. sglang/srt/layers/attention/flashinfer_backend.py +52 -13
  37. sglang/srt/layers/attention/hybrid_attn_backend.py +1 -1
  38. sglang/srt/layers/attention/triton_backend.py +85 -14
  39. sglang/srt/layers/attention/triton_ops/decode_attention.py +17 -0
  40. sglang/srt/layers/attention/triton_ops/extend_attention.py +143 -98
  41. sglang/srt/layers/attention/trtllm_mha_backend.py +332 -0
  42. sglang/srt/layers/attention/trtllm_mla_backend.py +119 -22
  43. sglang/srt/layers/attention/vision.py +22 -6
  44. sglang/srt/layers/attention/wave_backend.py +627 -0
  45. sglang/srt/layers/attention/wave_ops/decode_attention.py +186 -0
  46. sglang/srt/layers/attention/wave_ops/extend_attention.py +149 -0
  47. sglang/srt/layers/attention/wave_ops/prefill_attention.py +79 -0
  48. sglang/srt/layers/communicator.py +29 -14
  49. sglang/srt/layers/dp_attention.py +12 -0
  50. sglang/srt/layers/flashinfer_comm_fusion.py +4 -4
  51. sglang/srt/layers/linear.py +3 -7
  52. sglang/srt/layers/moe/cutlass_moe.py +12 -3
  53. sglang/srt/layers/moe/cutlass_w4a8_moe.py +4 -5
  54. sglang/srt/layers/moe/ep_moe/kernels.py +43 -0
  55. sglang/srt/layers/moe/ep_moe/layer.py +135 -73
  56. sglang/srt/layers/moe/fused_moe_triton/configs/triton_3_4_0/E=128,N=768,device_name=NVIDIA_B200,dtype=fp8_w8a8,block_shape=[128, 128].json +146 -0
  57. sglang/srt/layers/moe/fused_moe_triton/configs/triton_3_4_0/E=384,N=256,device_name=NVIDIA_B200,dtype=fp8_w8a8,block_shape=[128, 128].json +146 -0
  58. sglang/srt/layers/moe/fused_moe_triton/fused_moe.py +101 -12
  59. sglang/srt/layers/moe/fused_moe_triton/layer.py +412 -33
  60. sglang/srt/layers/moe/fused_moe_triton/triton_kernels_moe.py +188 -3
  61. sglang/srt/layers/moe/token_dispatcher/deepep.py +61 -24
  62. sglang/srt/layers/moe/topk.py +16 -4
  63. sglang/srt/layers/moe/utils.py +16 -0
  64. sglang/srt/layers/quantization/__init__.py +27 -3
  65. sglang/srt/layers/quantization/fp4.py +557 -0
  66. sglang/srt/layers/quantization/fp8.py +3 -6
  67. sglang/srt/layers/quantization/fp8_kernel.py +277 -0
  68. sglang/srt/layers/quantization/fp8_utils.py +51 -10
  69. sglang/srt/layers/quantization/modelopt_quant.py +258 -68
  70. sglang/srt/layers/quantization/mxfp4.py +654 -0
  71. sglang/srt/layers/quantization/mxfp4_tensor.py +133 -0
  72. sglang/srt/layers/quantization/quark/schemes/__init__.py +6 -0
  73. sglang/srt/layers/quantization/quark/schemes/quark_scheme.py +55 -0
  74. sglang/srt/layers/quantization/quark/schemes/quark_w4a4_mxfp4.py +118 -0
  75. sglang/srt/layers/quantization/quark/utils.py +107 -0
  76. sglang/srt/layers/quantization/unquant.py +60 -6
  77. sglang/srt/layers/quantization/w4afp8.py +21 -12
  78. sglang/srt/layers/quantization/w8a8_int8.py +48 -34
  79. sglang/srt/layers/rotary_embedding.py +506 -3
  80. sglang/srt/layers/utils.py +9 -0
  81. sglang/srt/layers/vocab_parallel_embedding.py +8 -3
  82. sglang/srt/lora/backend/base_backend.py +3 -23
  83. sglang/srt/lora/layers.py +60 -114
  84. sglang/srt/lora/lora.py +17 -62
  85. sglang/srt/lora/lora_manager.py +82 -62
  86. sglang/srt/lora/lora_registry.py +23 -11
  87. sglang/srt/lora/mem_pool.py +63 -68
  88. sglang/srt/lora/triton_ops/qkv_lora_b.py +1 -1
  89. sglang/srt/lora/utils.py +25 -58
  90. sglang/srt/managers/cache_controller.py +75 -58
  91. sglang/srt/managers/detokenizer_manager.py +1 -1
  92. sglang/srt/managers/io_struct.py +20 -8
  93. sglang/srt/managers/mm_utils.py +6 -13
  94. sglang/srt/managers/multimodal_processor.py +1 -1
  95. sglang/srt/managers/schedule_batch.py +61 -25
  96. sglang/srt/managers/schedule_policy.py +6 -6
  97. sglang/srt/managers/scheduler.py +41 -19
  98. sglang/srt/managers/scheduler_output_processor_mixin.py +1 -2
  99. sglang/srt/managers/scheduler_profiler_mixin.py +28 -8
  100. sglang/srt/managers/scheduler_recv_skipper.py +37 -0
  101. sglang/srt/managers/scheduler_update_weights_mixin.py +6 -0
  102. sglang/srt/managers/template_manager.py +35 -1
  103. sglang/srt/managers/tokenizer_manager.py +47 -30
  104. sglang/srt/managers/tp_worker.py +3 -0
  105. sglang/srt/managers/tp_worker_overlap_thread.py +3 -0
  106. sglang/srt/mem_cache/allocator.py +61 -87
  107. sglang/srt/mem_cache/hicache_storage.py +1 -1
  108. sglang/srt/mem_cache/hiradix_cache.py +80 -22
  109. sglang/srt/mem_cache/lora_radix_cache.py +421 -0
  110. sglang/srt/mem_cache/memory_pool_host.py +34 -36
  111. sglang/srt/mem_cache/multimodal_cache.py +33 -13
  112. sglang/srt/mem_cache/radix_cache.py +2 -5
  113. sglang/srt/mem_cache/storage/hf3fs/client_hf3fs.py +2 -2
  114. sglang/srt/mem_cache/storage/hf3fs/mini_3fs_metadata_server.py +443 -0
  115. sglang/srt/mem_cache/storage/hf3fs/storage_hf3fs.py +139 -67
  116. sglang/srt/mem_cache/storage/mooncake_store/mooncake_store.py +6 -9
  117. sglang/srt/model_executor/cuda_graph_runner.py +29 -9
  118. sglang/srt/model_executor/forward_batch_info.py +61 -19
  119. sglang/srt/model_executor/model_runner.py +148 -37
  120. sglang/srt/model_loader/loader.py +18 -6
  121. sglang/srt/model_loader/weight_utils.py +10 -0
  122. sglang/srt/models/bailing_moe.py +425 -0
  123. sglang/srt/models/deepseek_v2.py +137 -59
  124. sglang/srt/models/ernie4.py +426 -0
  125. sglang/srt/models/ernie4_eagle.py +203 -0
  126. sglang/srt/models/gemma2.py +0 -34
  127. sglang/srt/models/gemma3n_mm.py +38 -0
  128. sglang/srt/models/glm4.py +6 -0
  129. sglang/srt/models/glm4_moe.py +28 -16
  130. sglang/srt/models/glm4v.py +589 -0
  131. sglang/srt/models/glm4v_moe.py +400 -0
  132. sglang/srt/models/gpt_oss.py +1251 -0
  133. sglang/srt/models/granite.py +0 -25
  134. sglang/srt/models/llama.py +0 -25
  135. sglang/srt/models/llama4.py +1 -1
  136. sglang/srt/models/qwen2.py +6 -0
  137. sglang/srt/models/qwen2_5_vl.py +7 -3
  138. sglang/srt/models/qwen2_audio.py +10 -9
  139. sglang/srt/models/qwen2_moe.py +6 -0
  140. sglang/srt/models/qwen3.py +0 -24
  141. sglang/srt/models/qwen3_moe.py +32 -6
  142. sglang/srt/models/registry.py +1 -1
  143. sglang/srt/models/step3_vl.py +9 -0
  144. sglang/srt/models/torch_native_llama.py +0 -24
  145. sglang/srt/models/transformers.py +2 -5
  146. sglang/srt/multimodal/processors/base_processor.py +23 -13
  147. sglang/srt/multimodal/processors/glm4v.py +132 -0
  148. sglang/srt/multimodal/processors/qwen_audio.py +4 -2
  149. sglang/srt/multimodal/processors/step3_vl.py +3 -1
  150. sglang/srt/reasoning_parser.py +332 -37
  151. sglang/srt/server_args.py +186 -75
  152. sglang/srt/speculative/eagle_worker.py +16 -0
  153. sglang/srt/two_batch_overlap.py +169 -9
  154. sglang/srt/utils.py +41 -5
  155. sglang/srt/weight_sync/tensor_bucket.py +106 -0
  156. sglang/test/attention/test_trtllm_mla_backend.py +186 -36
  157. sglang/test/doc_patch.py +59 -0
  158. sglang/test/few_shot_gsm8k.py +1 -1
  159. sglang/test/few_shot_gsm8k_engine.py +1 -1
  160. sglang/test/run_eval.py +4 -1
  161. sglang/test/runners.py +2 -2
  162. sglang/test/simple_eval_common.py +6 -0
  163. sglang/test/simple_eval_gpqa.py +2 -0
  164. sglang/test/test_fp4_moe.py +118 -36
  165. sglang/test/test_utils.py +1 -1
  166. sglang/utils.py +1 -1
  167. sglang/version.py +1 -1
  168. {sglang-0.4.10.post2.dist-info → sglang-0.5.0rc1.dist-info}/METADATA +36 -38
  169. {sglang-0.4.10.post2.dist-info → sglang-0.5.0rc1.dist-info}/RECORD +174 -141
  170. sglang/srt/lora/backend/flashinfer_backend.py +0 -131
  171. /sglang/{api.py → lang/api.py} +0 -0
  172. /sglang/{lang/backend → srt/layers/quantization/quark}/__init__.py +0 -0
  173. {sglang-0.4.10.post2.dist-info → sglang-0.5.0rc1.dist-info}/WHEEL +0 -0
  174. {sglang-0.4.10.post2.dist-info → sglang-0.5.0rc1.dist-info}/licenses/LICENSE +0 -0
  175. {sglang-0.4.10.post2.dist-info → sglang-0.5.0rc1.dist-info}/top_level.txt +0 -0
sglang/srt/entrypoints/harmony_utils.py
@@ -0,0 +1,372 @@
+ # SPDX-License-Identifier: Apache-2.0
+ # SPDX-FileCopyrightText: Copyright contributors to the vLLM project
+ # Adapted from vLLM: https://github.com/vllm-project/vllm/blob/1b9902806915040ac9b3029f2ab7522ec505afc3/vllm/entrypoints/harmony_utils.py
+ # Slight differences in processing chat messages
+ import datetime
+ import json
+ from collections.abc import Iterable
+ from typing import Literal, Optional, Union
+
+ from openai.types.responses import (
+     ResponseOutputItem,
+     ResponseOutputMessage,
+     ResponseOutputText,
+     ResponseReasoningItem,
+ )
+ from openai.types.responses.response_function_tool_call import ResponseFunctionToolCall
+ from openai.types.responses.response_function_web_search import (
+     ActionFind,
+     ActionOpenPage,
+     ActionSearch,
+     ResponseFunctionWebSearch,
+ )
+ from openai.types.responses.response_reasoning_item import (
+     Content as ResponseReasoningTextContent,
+ )
+ from openai.types.responses.tool import Tool
+ from openai_harmony import (
+     Author,
+     Conversation,
+     DeveloperContent,
+     HarmonyEncodingName,
+     Message,
+     ReasoningEffort,
+     Role,
+     StreamableParser,
+     SystemContent,
+     TextContent,
+     ToolDescription,
+     load_harmony_encoding,
+ )
+
+ from sglang.srt.entrypoints.openai.protocol import ResponseInputOutputItem
+ from sglang.srt.utils import random_uuid
+
+ REASONING_EFFORT = {
+     "high": ReasoningEffort.HIGH,
+     "medium": ReasoningEffort.MEDIUM,
+     "low": ReasoningEffort.LOW,
+ }
+
+ _harmony_encoding = None
+
+
+ def get_encoding():
+     global _harmony_encoding
+     if _harmony_encoding is None:
+         _harmony_encoding = load_harmony_encoding(HarmonyEncodingName.HARMONY_GPT_OSS)
+     return _harmony_encoding
+
+
+ def get_system_message(
+     model_identity: Optional[str] = None,
+     reasoning_effort: Optional[Literal["high", "medium", "low"]] = None,
+     start_date: Optional[str] = None,
+     browser_description: Optional[str] = None,
+     python_description: Optional[str] = None,
+ ) -> Message:
+     sys_msg_content = SystemContent.new()
+     if model_identity is not None:
+         sys_msg_content = sys_msg_content.with_model_identity(model_identity)
+     if reasoning_effort is not None:
+         sys_msg_content = sys_msg_content.with_reasoning_effort(
+             REASONING_EFFORT[reasoning_effort]
+         )
+     if start_date is None:
+         start_date = datetime.datetime.now().strftime("%Y-%m-%d")
+     sys_msg_content = sys_msg_content.with_conversation_start_date(start_date)
+     if browser_description is not None:
+         sys_msg_content = sys_msg_content.with_tools(browser_description)
+     if python_description is not None:
+         sys_msg_content = sys_msg_content.with_tools(python_description)
+     sys_msg = Message.from_role_and_content(Role.SYSTEM, sys_msg_content)
+     return sys_msg
+
+
+ def get_developer_message(
+     instructions: Optional[str] = None, tools: Optional[list[Tool]] = None
+ ) -> Message:
+     dev_msg_content = DeveloperContent.new()
+     if instructions is not None:
+         dev_msg_content = dev_msg_content.with_instructions(instructions)
+     if tools is not None:
+         function_tools = []
+         for tool in tools:
+             if tool.type in ("web_search_preview", "code_interpreter"):
+                 # These are built-in tools that are added to the system message.
+                 pass
+             elif tool.type == "function":
+                 function_tools.append(tool)
+             else:
+                 raise ValueError(f"tool type {tool.type} not supported")
+         if function_tools:
+             function_tool_descriptions = [
+                 ToolDescription.new(
+                     name=tool.name,
+                     description=tool.description,
+                     parameters=tool.parameters,
+                 )
+                 for tool in function_tools
+             ]
+             dev_msg_content = dev_msg_content.with_function_tools(
+                 function_tool_descriptions
+             )
+     dev_msg = Message.from_role_and_content(Role.DEVELOPER, dev_msg_content)
+     return dev_msg
+
+
+ def get_user_message(content: str) -> Message:
+     return Message.from_role_and_content(Role.USER, content)
+
+
+ def parse_response_input(
+     response_msg: ResponseInputOutputItem,
+     prev_responses: list[Union[ResponseOutputItem, ResponseReasoningItem]],
+ ) -> Message:
+     if not isinstance(response_msg, dict):
+         response_msg = response_msg.model_dump()
+     if "type" not in response_msg or response_msg["type"] == "message":
+         role = response_msg["role"]
+         content = response_msg["content"]
+         if role == "system":
+             # User is trying to set a system message. Change it to:
+             # <|start|>developer<|message|># Instructions
+             # {instructions}<|end|>
+             role = "developer"
+             text_prefix = "Instructions:\n"
+         else:
+             text_prefix = ""
+         if isinstance(content, str):
+             msg = Message.from_role_and_content(role, text_prefix + content)
+         else:
+             contents = [TextContent(text=text_prefix + c["text"]) for c in content]
+             msg = Message.from_role_and_contents(role, contents)
+     elif response_msg["type"] == "function_call_output":
+         call_id = response_msg["call_id"]
+         call_response: Optional[ResponseFunctionToolCall] = None
+         for prev_response in reversed(prev_responses):
+             if (
+                 isinstance(prev_response, ResponseFunctionToolCall)
+                 and prev_response.call_id == call_id
+             ):
+                 call_response = prev_response
+                 break
+         if call_response is None:
+             raise ValueError(f"No call message found for {call_id}")
+         msg = Message.from_author_and_content(
+             Author.new(Role.TOOL, f"functions.{call_response.name}"),
+             response_msg["output"],
+         )
+     elif response_msg["type"] == "reasoning":
+         content = response_msg["content"]
+         assert len(content) == 1
+         msg = Message.from_role_and_content(Role.ASSISTANT, content[0]["text"])
+     elif response_msg["type"] == "function_call":
+         msg = Message.from_role_and_content(Role.ASSISTANT, response_msg["arguments"])
+         msg = msg.with_channel("commentary")
+         msg = msg.with_recipient(f"functions.{response_msg['name']}")
+         msg = msg.with_content_type("json")
+     else:
+         raise ValueError(f"Unknown input type: {response_msg['type']}")
+     return msg
+
+
+ def parse_response_output(output: ResponseOutputItem) -> Message:
+     if isinstance(output, ResponseOutputMessage):
+         role = output.role
+         contents = [TextContent(text=c.text) for c in output.content]
+         msg = Message.from_role_and_contents(role, contents)
+         return msg
+     elif isinstance(output, ResponseFunctionToolCall):
+         msg = Message.from_role_and_content(Role.ASSISTANT, output.arguments)
+         msg = msg.with_channel("commentary")
+         msg = msg.with_recipient(output.name)
+         msg = msg.with_content_type("json")
+         return msg
+     else:
+         raise ValueError(f"Unknown output type: {type(output)}")
+
+
+ def parse_chat_input(chat_msg) -> Message:
+     role = chat_msg.role
+     content = chat_msg.content
+     if isinstance(content, str):
+         contents = [TextContent(text=content)]
+     else:
+         # TODO: Support refusal.
+         contents = [TextContent(text=c.text) for c in content]
+     msg = Message.from_role_and_contents(role, contents)
+     return msg
+
+
+ def render_for_completion(messages: list[Message]) -> list[int]:
+     conversation = Conversation.from_messages(messages)
+     token_ids = get_encoding().render_conversation_for_completion(
+         conversation, Role.ASSISTANT
+     )
+     return token_ids
+
+
+ def get_stop_tokens_for_assistant_actions() -> list[int]:
+     return get_encoding().stop_tokens_for_assistant_actions()
+
+
+ def get_streamable_parser_for_assistant() -> StreamableParser:
+     return StreamableParser(get_encoding(), role=Role.ASSISTANT)
+
+
+ def parse_output_message(message: Message):
+     if message.author.role != "assistant":
+         # This is a message from a tool to the assistant (e.g., search result).
+         # Don't include it in the final output for now. This aligns with
+         # OpenAI's behavior on models like o4-mini.
+         return []
+
+     output_items = []
+     recipient = message.recipient
+     if recipient is not None and recipient.startswith("browser."):
+         if len(message.content) != 1:
+             raise ValueError("Invalid number of contents in browser message")
+         content = message.content[0]
+         browser_call = json.loads(content.text)
+         # TODO: translate to url properly!
+         if recipient == "browser.search":
+             action = ActionSearch(
+                 query=f"cursor:{browser_call.get('query', '')}", type="search"
+             )
+         elif recipient == "browser.open":
+             action = ActionOpenPage(
+                 url=f"cursor:{browser_call.get('url', '')}", type="open_page"
+             )
+         elif recipient == "browser.find":
+             action = ActionFind(
+                 pattern=browser_call["pattern"],
+                 url=f"cursor:{browser_call.get('url', '')}",
+                 type="find",
+             )
+         else:
+             raise ValueError(f"Unknown browser action: {recipient}")
+         web_search_item = ResponseFunctionWebSearch(
+             id=f"ws_{random_uuid()}",
+             action=action,
+             status="completed",
+             type="web_search_call",
+         )
+         output_items.append(web_search_item)
+     elif message.channel == "analysis":
+         for content in message.content:
+             reasoning_item = ResponseReasoningItem(
+                 id=f"rs_{random_uuid()}",
+                 type="reasoning",
+                 summary=[],
+                 content=[
+                     ResponseReasoningTextContent(
+                         text=content.text, type="reasoning_text"
+                     )
+                 ],
+                 status=None,
+             )
+             output_items.append(reasoning_item)
+     elif message.channel == "commentary":
+         if message.recipient.startswith("functions."):
+             function_name = message.recipient.split(".")[-1]
+             for content in message.content:
+                 random_id = random_uuid()
+                 response_item = ResponseFunctionToolCall(
+                     arguments=content.text,
+                     call_id=f"call_{random_id}",
+                     type="function_call",
+                     name=function_name,
+                     id=f"ft_{random_id}",
+                 )
+                 output_items.append(response_item)
+         elif message.recipient.startswith("python") or message.recipient.startswith(
+             "browser"
+         ):
+             for content in message.content:
+                 reasoning_item = ResponseReasoningItem(
+                     id=f"rs_{random_uuid()}",
+                     type="reasoning",
+                     summary=[],
+                     content=[
+                         ResponseReasoningTextContent(
+                             text=content.text, type="reasoning_text"
+                         )
+                     ],
+                     status=None,
+                 )
+                 output_items.append(reasoning_item)
+         else:
+             raise ValueError(f"Unknown recipient: {message.recipient}")
+     elif message.channel == "final":
+         contents = []
+         for content in message.content:
+             output_text = ResponseOutputText(
+                 text=content.text,
+                 annotations=[],  # TODO
+                 type="output_text",
+                 logprobs=None,  # TODO
+             )
+             contents.append(output_text)
+         text_item = ResponseOutputMessage(
+             id=f"msg_{random_uuid()}",
+             content=contents,
+             role=message.author.role,
+             status="completed",
+             type="message",
+         )
+         output_items.append(text_item)
+     else:
+         raise ValueError(f"Unknown channel: {message.channel}")
+     return output_items
+
+
+ def parse_remaining_state(parser: StreamableParser):
+     if not parser.current_content:
+         return []
+     if parser.current_role != Role.ASSISTANT:
+         return []
+     current_recipient = parser.current_recipient
+     if current_recipient is not None and current_recipient.startswith("browser."):
+         return []
+
+     if parser.current_channel == "analysis":
+         reasoning_item = ResponseReasoningItem(
+             id=f"rs_{random_uuid()}",
+             type="reasoning",
+             summary=[],
+             content=[
+                 ResponseReasoningTextContent(
+                     text=parser.current_content, type="reasoning_text"
+                 )
+             ],
+             status=None,
+         )
+         return [reasoning_item]
+     elif parser.current_channel == "final":
+         output_text = ResponseOutputText(
+             content=[
+                 ResponseReasoningTextContent(
+                     text=parser.current_content, type="reasoning_text"
+                 )
+             ],
+             annotations=[],  # TODO
+             type="output_text",
+             logprobs=None,  # TODO
+         )
+         text_item = ResponseOutputMessage(
+             id=f"msg_{random_uuid()}",
+             content=[output_text],
+             role="assistant",
+             status="completed",
+             type="message",
+         )
+         return [text_item]
+     return []
+
+
+ def parse_output_into_messages(token_ids: Iterable[int]):
+     parser = get_streamable_parser_for_assistant()
+     for token_id in token_ids:
+         parser.process(token_id)
+     return parser
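The helpers above form a round trip: build Harmony messages, render them to token ids for the engine, then parse generated tokens back into Responses-API output items. A usage sketch under that reading (the `generate_token_ids` engine call is hypothetical; everything else comes from the file above):

    from sglang.srt.entrypoints.harmony_utils import (
        get_developer_message,
        get_system_message,
        get_user_message,
        parse_output_into_messages,
        parse_output_message,
        render_for_completion,
    )

    # Build the conversation: system -> developer -> user.
    messages = [
        get_system_message(reasoning_effort="low"),
        get_developer_message(instructions="Answer briefly."),
        get_user_message("What is 2 + 2?"),
    ]

    # Render to Harmony token ids for prompting the engine.
    prompt_ids = render_for_completion(messages)

    # `generate_token_ids` is a stand-in for whatever engine call
    # produces completion token ids from prompt_ids.
    output_ids = generate_token_ids(prompt_ids)

    # Replay output tokens through the streaming parser, then map each
    # parsed message to Responses-API items (reasoning, tool calls, text).
    parser = parse_output_into_messages(output_ids)
    output_items = []
    for message in parser.messages:
        output_items.extend(parse_output_message(message))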
sglang/srt/entrypoints/http_server.py
@@ -26,12 +26,13 @@ import os
  import threading
  import time
  from http import HTTPStatus
- from typing import AsyncIterator, Callable, Dict, Optional
+ from typing import Any, AsyncIterator, Callable, Dict, List, Optional

  # Fix a bug of Python threading
  setattr(threading, "_register_atexit", lambda *args, **kwargs: None)

  from contextlib import asynccontextmanager
+ from typing import AsyncGenerator

  import numpy as np
  import orjson
@@ -56,6 +57,7 @@ from sglang.srt.entrypoints.openai.protocol import (
      ErrorResponse,
      ModelCard,
      ModelList,
+     ResponsesRequest,
      ScoringRequest,
      V1RerankReqInput,
  )
@@ -147,6 +149,36 @@ async def lifespan(fast_api_app: FastAPI):
      )

      server_args: ServerArgs = fast_api_app.server_args
+
+     tool_server = None
+     if server_args.tool_server == "demo":
+         from sglang.srt.entrypoints.openai.tool_server import DemoToolServer
+
+         tool_server = DemoToolServer()
+     elif server_args.tool_server:
+         from sglang.srt.entrypoints.openai.tool_server import MCPToolServer
+
+         tool_server = MCPToolServer()
+         await tool_server.add_tool_server(server_args.tool_server)
+
+     try:
+         from sglang.srt.entrypoints.openai.serving_responses import (
+             OpenAIServingResponses,
+         )
+
+         fast_api_app.state.openai_serving_responses = OpenAIServingResponses(
+             _global_state.tokenizer_manager,
+             _global_state.template_manager,
+             enable_prompt_tokens_details=True,
+             enable_force_include_usage=True,
+             tool_server=tool_server,
+         )
+     except Exception as e:
+         import traceback
+
+         traceback.print_exc()
+         logger.warning(f"Can not initialize OpenAIServingResponses, error: {e}")
+
      if server_args.warmups is not None:
          await execute_warmups(
              server_args.disaggregation_mode,
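The lifespan hook reads `server_args.tool_server`: the literal value "demo" selects the bundled DemoToolServer, while any other non-empty value is passed to `MCPToolServer.add_tool_server` as a tool-server address. Assuming the matching CLI flag is `--tool-server` (inferred from the attribute name, not verified against this release), a launch might look like:

    # Hypothetical launch; check `python -m sglang.launch_server --help`
    # in this release to confirm the flag name.
    import subprocess

    subprocess.run([
        "python", "-m", "sglang.launch_server",
        "--model-path", "openai/gpt-oss-20b",
        "--tool-server", "demo",  # or an MCP tool-server address
    ])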
@@ -244,7 +276,7 @@ async def health_generate(request: Request) -> Response:
          logger.info("Health check request received during shutdown. Returning 503.")
          return Response(status_code=503)

-     if not _global_state.tokenizer_manager.server_status.is_healthy():
+     if _global_state.tokenizer_manager.server_status == ServerStatus.Starting:
          return Response(status_code=503)

      sampling_params = {"max_new_tokens": 1, "temperature": 0.0}
@@ -284,7 +316,7 @@ async def health_generate(request: Request) -> Response:
          if _global_state.tokenizer_manager.last_receive_tstamp > tic:
              task.cancel()
              _global_state.tokenizer_manager.rid_to_state.pop(rid, None)
-             _global_state.tokenizer_manager.health_check_failed = False
+             _global_state.tokenizer_manager.server_status = ServerStatus.Up
              return Response(status_code=200)

      task.cancel()
@@ -298,7 +330,7 @@ async def health_generate(request: Request) -> Response:
          f"last_heartbeat time: {last_receive_time}"
      )
      _global_state.tokenizer_manager.rid_to_state.pop(rid, None)
-     _global_state.tokenizer_manager.health_check_failed = True
+     _global_state.tokenizer_manager.server_status = ServerStatus.UnHealthy
      return Response(status_code=503)

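These three hunks replace the old boolean `health_check_failed` flag with a `server_status` field compared against enum members. The enum itself is defined elsewhere in the package and not shown in this diff; a minimal sketch consistent with the states used here:

    # Assumed shape of ServerStatus, reconstructed from the comparisons above.
    from enum import Enum

    class ServerStatus(Enum):
        Starting = "Starting"    # booted but not yet warmed up: /health_generate -> 503
        Up = "Up"                # warmup done or a health generation succeeded -> 200
        UnHealthy = "UnHealthy"  # a health-check generation timed out -> 503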
@@ -843,6 +875,42 @@ async def v1_score_request(request: ScoringRequest, raw_request: Request):
      )


+ @app.post("/v1/responses", dependencies=[Depends(validate_json_request)])
+ async def v1_responses_request(request: dict, raw_request: Request):
+     """Endpoint for the responses API with reasoning support."""
+
+     request_obj = ResponsesRequest(**request)
+     result = await raw_request.app.state.openai_serving_responses.create_responses(
+         request_obj, raw_request
+     )
+
+     # Handle streaming responses
+     if isinstance(result, AsyncGenerator):
+         return StreamingResponse(
+             result,
+             media_type="text/event-stream",
+             headers={"Cache-Control": "no-cache", "Connection": "keep-alive"},
+         )
+
+     return result
+
+
+ @app.get("/v1/responses/{response_id}")
+ async def v1_retrieve_responses(response_id: str, raw_request: Request):
+     """Retrieve a response by ID."""
+     return await raw_request.app.state.openai_serving_responses.retrieve_responses(
+         response_id
+     )
+
+
+ @app.post("/v1/responses/{response_id}/cancel")
+ async def v1_cancel_responses(response_id: str, raw_request: Request):
+     """Cancel a background response."""
+     return await raw_request.app.state.openai_serving_responses.cancel_responses(
+         response_id
+     )
+
+
  @app.api_route(
      "/v1/rerank", methods=["POST", "PUT"], dependencies=[Depends(validate_json_request)]
  )
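Together these routes expose create, retrieve, and cancel for OpenAI-style Responses API calls. A client sketch against a locally launched server (port 30000 is sglang's default; the payload fields shown are illustrative, with the accepted schema defined by `ResponsesRequest` in the protocol module):

    import requests

    base = "http://localhost:30000"

    # Create a response (non-streaming).
    created = requests.post(
        f"{base}/v1/responses",
        json={"input": "Summarize the Harmony format in one sentence."},
    ).json()

    # Retrieve it by id, or cancel it if it is running in the background.
    requests.get(f"{base}/v1/responses/{created['id']}")
    requests.post(f"{base}/v1/responses/{created['id']}/cancel")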
@@ -1103,6 +1171,8 @@ def _wait_and_warmup(
          pipe_finish_writer,
      ):
          return
+     else:
+         _global_state.tokenizer_manager.server_status = ServerStatus.Up

      logger.info("The server is fired up and ready to roll!")