sglang 0.4.10.post1__py3-none-any.whl → 0.5.0rc0__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (143)
  1. sglang/bench_one_batch.py +113 -17
  2. sglang/compile_deep_gemm.py +8 -1
  3. sglang/global_config.py +5 -1
  4. sglang/srt/configs/model_config.py +35 -0
  5. sglang/srt/conversation.py +9 -117
  6. sglang/srt/disaggregation/base/conn.py +5 -2
  7. sglang/srt/disaggregation/decode.py +6 -1
  8. sglang/srt/disaggregation/decode_schedule_batch_mixin.py +4 -0
  9. sglang/srt/disaggregation/mooncake/conn.py +243 -135
  10. sglang/srt/disaggregation/prefill.py +3 -0
  11. sglang/srt/distributed/device_communicators/pynccl.py +7 -0
  12. sglang/srt/distributed/device_communicators/pynccl_allocator.py +133 -0
  13. sglang/srt/distributed/device_communicators/pynccl_wrapper.py +42 -3
  14. sglang/srt/distributed/parallel_state.py +22 -9
  15. sglang/srt/entrypoints/context.py +244 -0
  16. sglang/srt/entrypoints/engine.py +8 -5
  17. sglang/srt/entrypoints/harmony_utils.py +370 -0
  18. sglang/srt/entrypoints/http_server.py +106 -15
  19. sglang/srt/entrypoints/openai/protocol.py +227 -1
  20. sglang/srt/entrypoints/openai/serving_chat.py +278 -42
  21. sglang/srt/entrypoints/openai/serving_responses.py +1273 -0
  22. sglang/srt/entrypoints/openai/tool_server.py +174 -0
  23. sglang/srt/entrypoints/tool.py +87 -0
  24. sglang/srt/eplb/expert_distribution.py +4 -2
  25. sglang/srt/eplb/expert_location.py +5 -1
  26. sglang/srt/function_call/harmony_tool_parser.py +130 -0
  27. sglang/srt/hf_transformers_utils.py +55 -13
  28. sglang/srt/jinja_template_utils.py +8 -1
  29. sglang/srt/layers/attention/aiter_backend.py +5 -8
  30. sglang/srt/layers/attention/cutlass_mla_backend.py +3 -3
  31. sglang/srt/layers/attention/dual_chunk_flashattention_backend.py +1700 -0
  32. sglang/srt/layers/attention/flashattention_backend.py +7 -11
  33. sglang/srt/layers/attention/triton_backend.py +85 -14
  34. sglang/srt/layers/attention/triton_ops/decode_attention.py +17 -0
  35. sglang/srt/layers/attention/triton_ops/extend_attention.py +143 -98
  36. sglang/srt/layers/attention/trtllm_mha_backend.py +332 -0
  37. sglang/srt/layers/attention/trtllm_mla_backend.py +6 -6
  38. sglang/srt/layers/attention/vision.py +40 -15
  39. sglang/srt/layers/communicator.py +35 -8
  40. sglang/srt/layers/dp_attention.py +12 -0
  41. sglang/srt/layers/linear.py +9 -8
  42. sglang/srt/layers/logits_processor.py +9 -1
  43. sglang/srt/layers/moe/cutlass_moe.py +20 -6
  44. sglang/srt/layers/moe/ep_moe/layer.py +87 -107
  45. sglang/srt/layers/moe/fused_moe_triton/configs/triton_3_3_1/E=128,N=352,device_name=NVIDIA_RTX_6000_Ada_Generation,dtype=fp8_w8a8.json +146 -0
  46. sglang/srt/layers/moe/fused_moe_triton/fused_moe.py +101 -12
  47. sglang/srt/layers/moe/fused_moe_triton/layer.py +442 -58
  48. sglang/srt/layers/moe/fused_moe_triton/triton_kernels_moe.py +169 -15
  49. sglang/srt/layers/moe/token_dispatcher/__init__.py +23 -0
  50. sglang/srt/layers/moe/token_dispatcher/base_dispatcher.py +12 -1
  51. sglang/srt/layers/moe/{ep_moe/token_dispatcher.py → token_dispatcher/deepep.py} +8 -15
  52. sglang/srt/layers/moe/topk.py +12 -3
  53. sglang/srt/layers/moe/utils.py +59 -0
  54. sglang/srt/layers/quantization/__init__.py +22 -0
  55. sglang/srt/layers/quantization/compressed_tensors/compressed_tensors_moe.py +3 -2
  56. sglang/srt/layers/quantization/deep_gemm_wrapper/compile_utils.py +1 -1
  57. sglang/srt/layers/quantization/fp4.py +557 -0
  58. sglang/srt/layers/quantization/fp8.py +8 -7
  59. sglang/srt/layers/quantization/fp8_kernel.py +0 -4
  60. sglang/srt/layers/quantization/fp8_utils.py +29 -0
  61. sglang/srt/layers/quantization/modelopt_quant.py +259 -64
  62. sglang/srt/layers/quantization/mxfp4.py +651 -0
  63. sglang/srt/layers/quantization/mxfp4_tensor.py +133 -0
  64. sglang/srt/layers/quantization/quark/__init__.py +0 -0
  65. sglang/srt/layers/quantization/quark/schemes/__init__.py +6 -0
  66. sglang/srt/layers/quantization/quark/schemes/quark_scheme.py +55 -0
  67. sglang/srt/layers/quantization/quark/schemes/quark_w4a4_mxfp4.py +118 -0
  68. sglang/srt/layers/quantization/quark/utils.py +107 -0
  69. sglang/srt/layers/quantization/unquant.py +60 -6
  70. sglang/srt/layers/quantization/w4afp8.py +1 -1
  71. sglang/srt/layers/rotary_embedding.py +225 -1
  72. sglang/srt/layers/utils.py +9 -0
  73. sglang/srt/layers/vocab_parallel_embedding.py +15 -4
  74. sglang/srt/lora/lora_manager.py +70 -14
  75. sglang/srt/lora/lora_registry.py +10 -2
  76. sglang/srt/lora/mem_pool.py +43 -5
  77. sglang/srt/managers/cache_controller.py +61 -32
  78. sglang/srt/managers/data_parallel_controller.py +52 -2
  79. sglang/srt/managers/detokenizer_manager.py +1 -1
  80. sglang/srt/managers/io_struct.py +21 -4
  81. sglang/srt/managers/mm_utils.py +5 -11
  82. sglang/srt/managers/schedule_batch.py +30 -8
  83. sglang/srt/managers/schedule_policy.py +3 -1
  84. sglang/srt/managers/scheduler.py +170 -18
  85. sglang/srt/managers/scheduler_output_processor_mixin.py +1 -2
  86. sglang/srt/managers/scheduler_recv_skipper.py +37 -0
  87. sglang/srt/managers/scheduler_update_weights_mixin.py +6 -0
  88. sglang/srt/managers/template_manager.py +59 -22
  89. sglang/srt/managers/tokenizer_manager.py +137 -67
  90. sglang/srt/managers/tp_worker.py +3 -0
  91. sglang/srt/managers/tp_worker_overlap_thread.py +3 -0
  92. sglang/srt/managers/utils.py +45 -1
  93. sglang/srt/mem_cache/cpp_radix_tree/radix_tree.py +182 -0
  94. sglang/srt/mem_cache/hicache_storage.py +13 -21
  95. sglang/srt/mem_cache/hiradix_cache.py +53 -5
  96. sglang/srt/mem_cache/memory_pool_host.py +1 -1
  97. sglang/srt/mem_cache/multimodal_cache.py +33 -13
  98. sglang/srt/mem_cache/radix_cache_cpp.py +229 -0
  99. sglang/srt/mem_cache/storage/hf3fs/client_hf3fs.py +2 -2
  100. sglang/srt/mem_cache/storage/hf3fs/hf3fs_utils.cpp +35 -0
  101. sglang/srt/model_executor/cuda_graph_runner.py +24 -9
  102. sglang/srt/model_executor/forward_batch_info.py +48 -17
  103. sglang/srt/model_executor/model_runner.py +24 -2
  104. sglang/srt/model_loader/weight_utils.py +10 -0
  105. sglang/srt/models/bailing_moe.py +425 -0
  106. sglang/srt/models/deepseek_v2.py +95 -50
  107. sglang/srt/models/ernie4.py +426 -0
  108. sglang/srt/models/ernie4_eagle.py +203 -0
  109. sglang/srt/models/gemma3n_mm.py +39 -0
  110. sglang/srt/models/glm4_moe.py +102 -27
  111. sglang/srt/models/gpt_oss.py +1134 -0
  112. sglang/srt/models/grok.py +3 -3
  113. sglang/srt/models/llama4.py +13 -2
  114. sglang/srt/models/mixtral.py +3 -3
  115. sglang/srt/models/mllama4.py +428 -19
  116. sglang/srt/models/qwen2.py +6 -0
  117. sglang/srt/models/qwen2_moe.py +7 -4
  118. sglang/srt/models/qwen3_moe.py +39 -14
  119. sglang/srt/models/step3_vl.py +10 -1
  120. sglang/srt/models/transformers.py +2 -5
  121. sglang/srt/multimodal/processors/base_processor.py +4 -3
  122. sglang/srt/multimodal/processors/gemma3n.py +0 -7
  123. sglang/srt/multimodal/processors/step3_vl.py +3 -1
  124. sglang/srt/operations_strategy.py +1 -1
  125. sglang/srt/reasoning_parser.py +18 -39
  126. sglang/srt/server_args.py +218 -23
  127. sglang/srt/speculative/eagle_draft_extend_cuda_graph_runner.py +18 -0
  128. sglang/srt/two_batch_overlap.py +163 -9
  129. sglang/srt/utils.py +41 -26
  130. sglang/srt/weight_sync/utils.py +1 -1
  131. sglang/test/runners.py +4 -4
  132. sglang/test/test_utils.py +4 -4
  133. sglang/version.py +1 -1
  134. {sglang-0.4.10.post1.dist-info → sglang-0.5.0rc0.dist-info}/METADATA +18 -15
  135. {sglang-0.4.10.post1.dist-info → sglang-0.5.0rc0.dist-info}/RECORD +143 -116
  136. /sglang/srt/mem_cache/{mooncake_store → storage/mooncake_store}/mooncake_store.py +0 -0
  137. /sglang/srt/mem_cache/{mooncake_store → storage/mooncake_store}/unit_test.py +0 -0
  138. /sglang/srt/mem_cache/{nixl → storage/nixl}/hicache_nixl.py +0 -0
  139. /sglang/srt/mem_cache/{nixl → storage/nixl}/nixl_utils.py +0 -0
  140. /sglang/srt/mem_cache/{nixl → storage/nixl}/test_hicache_nixl_storage.py +0 -0
  141. {sglang-0.4.10.post1.dist-info → sglang-0.5.0rc0.dist-info}/WHEEL +0 -0
  142. {sglang-0.4.10.post1.dist-info → sglang-0.5.0rc0.dist-info}/licenses/LICENSE +0 -0
  143. {sglang-0.4.10.post1.dist-info → sglang-0.5.0rc0.dist-info}/top_level.txt +0 -0
sglang/srt/entrypoints/harmony_utils.py (new file)
@@ -0,0 +1,370 @@
+# SPDX-License-Identifier: Apache-2.0
+# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
+import datetime
+import json
+from collections.abc import Iterable
+from typing import Literal, Optional, Union
+
+from openai.types.responses import (
+    ResponseOutputItem,
+    ResponseOutputMessage,
+    ResponseOutputText,
+    ResponseReasoningItem,
+)
+from openai.types.responses.response_function_tool_call import ResponseFunctionToolCall
+from openai.types.responses.response_function_web_search import (
+    ActionFind,
+    ActionOpenPage,
+    ActionSearch,
+    ResponseFunctionWebSearch,
+)
+from openai.types.responses.response_reasoning_item import (
+    Content as ResponseReasoningTextContent,
+)
+from openai.types.responses.tool import Tool
+from openai_harmony import (
+    Author,
+    Conversation,
+    DeveloperContent,
+    HarmonyEncodingName,
+    Message,
+    ReasoningEffort,
+    Role,
+    StreamableParser,
+    SystemContent,
+    TextContent,
+    ToolDescription,
+    load_harmony_encoding,
+)
+
+from sglang.srt.entrypoints.openai.protocol import ResponseInputOutputItem
+from sglang.srt.utils import random_uuid
+
+REASONING_EFFORT = {
+    "high": ReasoningEffort.HIGH,
+    "medium": ReasoningEffort.MEDIUM,
+    "low": ReasoningEffort.LOW,
+}
+
+_harmony_encoding = None
+
+
+def get_encoding():
+    global _harmony_encoding
+    if _harmony_encoding is None:
+        _harmony_encoding = load_harmony_encoding(HarmonyEncodingName.HARMONY_GPT_OSS)
+    return _harmony_encoding
+
+
+def get_system_message(
+    model_identity: Optional[str] = None,
+    reasoning_effort: Optional[Literal["high", "medium", "low"]] = None,
+    start_date: Optional[str] = None,
+    browser_description: Optional[str] = None,
+    python_description: Optional[str] = None,
+) -> Message:
+    sys_msg_content = SystemContent.new()
+    if model_identity is not None:
+        sys_msg_content = sys_msg_content.with_model_identity(model_identity)
+    if reasoning_effort is not None:
+        sys_msg_content = sys_msg_content.with_reasoning_effort(
+            REASONING_EFFORT[reasoning_effort]
+        )
+    if start_date is None:
+        start_date = datetime.datetime.now().strftime("%Y-%m-%d")
+    sys_msg_content = sys_msg_content.with_conversation_start_date(start_date)
+    if browser_description is not None:
+        sys_msg_content = sys_msg_content.with_tools(browser_description)
+    if python_description is not None:
+        sys_msg_content = sys_msg_content.with_tools(python_description)
+    sys_msg = Message.from_role_and_content(Role.SYSTEM, sys_msg_content)
+    return sys_msg
+
+
+def get_developer_message(
+    instructions: Optional[str] = None, tools: Optional[list[Tool]] = None
+) -> Message:
+    dev_msg_content = DeveloperContent.new()
+    if instructions is not None:
+        dev_msg_content = dev_msg_content.with_instructions(instructions)
+    if tools is not None:
+        function_tools = []
+        for tool in tools:
+            if tool.type in ("web_search_preview", "code_interpreter"):
+                # These are built-in tools that are added to the system message.
+                pass
+            elif tool.type == "function":
+                function_tools.append(tool)
+            else:
+                raise ValueError(f"tool type {tool.type} not supported")
+        if function_tools:
+            function_tool_descriptions = [
+                ToolDescription.new(
+                    name=tool.name,
+                    description=tool.description,
+                    parameters=tool.parameters,
+                )
+                for tool in function_tools
+            ]
+            dev_msg_content = dev_msg_content.with_function_tools(
+                function_tool_descriptions
+            )
+    dev_msg = Message.from_role_and_content(Role.DEVELOPER, dev_msg_content)
+    return dev_msg
+
+
+def get_user_message(content: str) -> Message:
+    return Message.from_role_and_content(Role.USER, content)
+
+
+def parse_response_input(
+    response_msg: ResponseInputOutputItem,
+    prev_responses: list[Union[ResponseOutputItem, ResponseReasoningItem]],
+) -> Message:
+    if not isinstance(response_msg, dict):
+        response_msg = response_msg.model_dump()
+    if "type" not in response_msg or response_msg["type"] == "message":
+        role = response_msg["role"]
+        content = response_msg["content"]
+        if role == "system":
+            # User is trying to set a system message. Change it to:
+            # <|start|>developer<|message|># Instructions
+            # {instructions}<|end|>
+            role = "developer"
+            text_prefix = "Instructions:\n"
+        else:
+            text_prefix = ""
+        if isinstance(content, str):
+            msg = Message.from_role_and_content(role, text_prefix + content)
+        else:
+            contents = [TextContent(text=text_prefix + c["text"]) for c in content]
+            msg = Message.from_role_and_contents(role, contents)
+    elif response_msg["type"] == "function_call_output":
+        call_id = response_msg["call_id"]
+        call_response: Optional[ResponseFunctionToolCall] = None
+        for prev_response in reversed(prev_responses):
+            if (
+                isinstance(prev_response, ResponseFunctionToolCall)
+                and prev_response.call_id == call_id
+            ):
+                call_response = prev_response
+                break
+        if call_response is None:
+            raise ValueError(f"No call message found for {call_id}")
+        msg = Message.from_author_and_content(
+            Author.new(Role.TOOL, f"functions.{call_response.name}"),
+            response_msg["output"],
+        )
+    elif response_msg["type"] == "reasoning":
+        content = response_msg["content"]
+        assert len(content) == 1
+        msg = Message.from_role_and_content(Role.ASSISTANT, content[0]["text"])
+    elif response_msg["type"] == "function_call":
+        msg = Message.from_role_and_content(Role.ASSISTANT, response_msg["arguments"])
+        msg = msg.with_channel("commentary")
+        msg = msg.with_recipient(f"functions.{response_msg['name']}")
+        msg = msg.with_content_type("json")
+    else:
+        raise ValueError(f"Unknown input type: {response_msg['type']}")
+    return msg
+
+
+def parse_response_output(output: ResponseOutputItem) -> Message:
+    if isinstance(output, ResponseOutputMessage):
+        role = output.role
+        contents = [TextContent(text=c.text) for c in output.content]
+        msg = Message.from_role_and_contents(role, contents)
+        return msg
+    elif isinstance(output, ResponseFunctionToolCall):
+        msg = Message.from_role_and_content(Role.ASSISTANT, output.arguments)
+        msg = msg.with_channel("commentary")
+        msg = msg.with_recipient(output.name)
+        msg = msg.with_content_type("json")
+        return msg
+    else:
+        raise ValueError(f"Unknown output type: {type(output)}")
+
+
+def parse_chat_input(chat_msg) -> Message:
+    role = chat_msg.role
+    content = chat_msg.content
+    if isinstance(content, str):
+        contents = [TextContent(text=content)]
+    else:
+        # TODO: Support refusal.
+        contents = [TextContent(text=c.text) for c in content]
+    msg = Message.from_role_and_contents(role, contents)
+    return msg
+
+
+def render_for_completion(messages: list[Message]) -> list[int]:
+    conversation = Conversation.from_messages(messages)
+    token_ids = get_encoding().render_conversation_for_completion(
+        conversation, Role.ASSISTANT
+    )
+    return token_ids
+
+
+def get_stop_tokens_for_assistant_actions() -> list[int]:
+    return get_encoding().stop_tokens_for_assistant_actions()
+
+
+def get_streamable_parser_for_assistant() -> StreamableParser:
+    return StreamableParser(get_encoding(), role=Role.ASSISTANT)
+
+
+def parse_output_message(message: Message):
+    if message.author.role != "assistant":
+        # This is a message from a tool to the assistant (e.g., search result).
+        # Don't include it in the final output for now. This aligns with
+        # OpenAI's behavior on models like o4-mini.
+        return []
+
+    output_items = []
+    recipient = message.recipient
+    if recipient is not None and recipient.startswith("browser."):
+        if len(message.content) != 1:
+            raise ValueError("Invalid number of contents in browser message")
+        content = message.content[0]
+        browser_call = json.loads(content.text)
+        # TODO: translate to url properly!
+        if recipient == "browser.search":
+            action = ActionSearch(
+                query=f"cursor:{browser_call.get('query', '')}", type="search"
+            )
+        elif recipient == "browser.open":
+            action = ActionOpenPage(
+                url=f"cursor:{browser_call.get('url', '')}", type="open_page"
+            )
+        elif recipient == "browser.find":
+            action = ActionFind(
+                pattern=browser_call["pattern"],
+                url=f"cursor:{browser_call.get('url', '')}",
+                type="find",
+            )
+        else:
+            raise ValueError(f"Unknown browser action: {recipient}")
+        web_search_item = ResponseFunctionWebSearch(
+            id=f"ws_{random_uuid()}",
+            action=action,
+            status="completed",
+            type="web_search_call",
+        )
+        output_items.append(web_search_item)
+    elif message.channel == "analysis":
+        for content in message.content:
+            reasoning_item = ResponseReasoningItem(
+                id=f"rs_{random_uuid()}",
+                type="reasoning",
+                summary=[],
+                content=[
+                    ResponseReasoningTextContent(
+                        text=content.text, type="reasoning_text"
+                    )
+                ],
+                status=None,
+            )
+            output_items.append(reasoning_item)
+    elif message.channel == "commentary":
+        if message.recipient.startswith("functions."):
+            function_name = message.recipient.split(".")[-1]
+            for content in message.content:
+                random_id = random_uuid()
+                response_item = ResponseFunctionToolCall(
+                    arguments=content.text,
+                    call_id=f"call_{random_id}",
+                    type="function_call",
+                    name=function_name,
+                    id=f"ft_{random_id}",
+                )
+                output_items.append(response_item)
+        elif message.recipient.startswith("python") or message.recipient.startswith(
+            "browser"
+        ):
+            for content in message.content:
+                reasoning_item = ResponseReasoningItem(
+                    id=f"rs_{random_uuid()}",
+                    type="reasoning",
+                    summary=[],
+                    content=[
+                        ResponseReasoningTextContent(
+                            text=content.text, type="reasoning_text"
+                        )
+                    ],
+                    status=None,
+                )
+                output_items.append(reasoning_item)
+        else:
+            raise ValueError(f"Unknown recipient: {message.recipient}")
+    elif message.channel == "final":
+        contents = []
+        for content in message.content:
+            output_text = ResponseOutputText(
+                text=content.text,
+                annotations=[],  # TODO
+                type="output_text",
+                logprobs=None,  # TODO
+            )
+            contents.append(output_text)
+        text_item = ResponseOutputMessage(
+            id=f"msg_{random_uuid()}",
+            content=contents,
+            role=message.author.role,
+            status="completed",
+            type="message",
+        )
+        output_items.append(text_item)
+    else:
+        raise ValueError(f"Unknown channel: {message.channel}")
+    return output_items


+def parse_remaining_state(parser: StreamableParser):
+    if not parser.current_content:
+        return []
+    if parser.current_role != Role.ASSISTANT:
+        return []
+    current_recipient = parser.current_recipient
+    if current_recipient is not None and current_recipient.startswith("browser."):
+        return []
+
+    if parser.current_channel == "analysis":
+        reasoning_item = ResponseReasoningItem(
+            id=f"rs_{random_uuid()}",
+            type="reasoning",
+            summary=[],
+            content=[
+                ResponseReasoningTextContent(
+                    text=parser.current_content, type="reasoning_text"
+                )
+            ],
+            status=None,
+        )
+        return [reasoning_item]
+    elif parser.current_channel == "final":
+        output_text = ResponseOutputText(
+            content=[
+                ResponseReasoningTextContent(
+                    text=parser.current_content, type="reasoning_text"
+                )
+            ],
+            annotations=[],  # TODO
+            type="output_text",
+            logprobs=None,  # TODO
+        )
+        text_item = ResponseOutputMessage(
+            id=f"msg_{random_uuid()}",
+            content=[output_text],
+            role="assistant",
+            status="completed",
+            type="message",
+        )
+        return [text_item]
+    return []
+
+
+def parse_output_into_messages(token_ids: Iterable[int]):
+    parser = get_streamable_parser_for_assistant()
+    for token_id in token_ids:
+        parser.process(token_id)
+    return parser
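The new harmony_utils.py module provides helpers for building Harmony-format prompts and for converting Harmony output messages into OpenAI Responses-API items. As a rough orientation only, the sketch below shows one way these helpers could be chained together. It is not part of the diff: the actual engine call is omitted, completion_token_ids is a placeholder for whatever the generation backend returns, and it assumes openai_harmony's StreamableParser exposes parsed messages via .messages.

# Hypothetical end-to-end sketch using the helpers added above (not from the diff).
from sglang.srt.entrypoints.harmony_utils import (
    get_developer_message,
    get_stop_tokens_for_assistant_actions,
    get_system_message,
    get_user_message,
    parse_output_into_messages,
    parse_output_message,
    render_for_completion,
)

# Build the conversation: system -> developer -> user.
messages = [
    get_system_message(reasoning_effort="low"),
    get_developer_message(instructions="Answer concisely."),
    get_user_message("What is the capital of France?"),
]

# Render prompt token ids and the stop tokens the engine should honor.
prompt_token_ids = render_for_completion(messages)
stop_token_ids = get_stop_tokens_for_assistant_actions()

# Placeholder for the token ids produced by the engine's generate call.
completion_token_ids: list[int] = []

# Feed the completion back through the streaming parser and convert each
# finished Harmony message into Responses-API output items.
parser = parse_output_into_messages(completion_token_ids)
output_items = []
for harmony_msg in parser.messages:
    output_items.extend(parse_output_message(harmony_msg))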
sglang/srt/entrypoints/http_server.py
@@ -32,6 +32,7 @@ from typing import AsyncIterator, Callable, Dict, Optional
 setattr(threading, "_register_atexit", lambda *args, **kwargs: None)
 
 from contextlib import asynccontextmanager
+from typing import AsyncGenerator
 
 import numpy as np
 import orjson
@@ -45,6 +46,7 @@ from fastapi.responses import ORJSONResponse, Response, StreamingResponse
 
 from sglang.srt.disaggregation.utils import (
     FAKE_BOOTSTRAP_HOST,
+    DisaggregationMode,
     register_disaggregation_server,
 )
 from sglang.srt.entrypoints.engine import _launch_subprocesses
@@ -55,6 +57,7 @@ from sglang.srt.entrypoints.openai.protocol import (
     ErrorResponse,
     ModelCard,
     ModelList,
+    ResponsesRequest,
     ScoringRequest,
     V1RerankReqInput,
 )
@@ -88,7 +91,7 @@ from sglang.srt.managers.io_struct import (
     VertexGenerateReqInput,
 )
 from sglang.srt.managers.template_manager import TemplateManager
-from sglang.srt.managers.tokenizer_manager import TokenizerManager
+from sglang.srt.managers.tokenizer_manager import ServerStatus, TokenizerManager
 from sglang.srt.metrics.func_timer import enable_func_timer
 from sglang.srt.reasoning_parser import ReasoningParser
 from sglang.srt.server_args import ServerArgs
@@ -146,6 +149,37 @@ async def lifespan(fast_api_app: FastAPI):
     )
 
     server_args: ServerArgs = fast_api_app.server_args
+
+    tool_server = None
+    if server_args.tool_server == "demo":
+        from sglang.srt.entrypoints.openai.tool_server import DemoToolServer
+
+        tool_server = DemoToolServer()
+    elif server_args.tool_server:
+        from sglang.srt.entrypoints.openai.tool_server import MCPToolServer
+
+        tool_server = MCPToolServer()
+        await tool_server.add_tool_server(server_args.tool_server)
+
+    try:
+        from sglang.srt.entrypoints.openai.serving_responses import (
+            OpenAIServingResponses,
+        )
+
+        fast_api_app.state.openai_serving_responses = OpenAIServingResponses(
+            _global_state.tokenizer_manager,
+            _global_state.template_manager,
+            enable_prompt_tokens_details=True,
+            enable_force_include_usage=True,
+            tool_server=tool_server,
+        )
+    except Exception as e:
+        # print stack trace
+        import traceback
+
+        traceback.print_exc()
+        logger.warning(f"Can not initialize OpenAIServingResponses, error: {e}")
+
     if server_args.warmups is not None:
         await execute_warmups(
             server_args.disaggregation_mode,
@@ -230,23 +264,28 @@ async def validate_json_request(raw_request: Request):
 
 
 @app.get("/health")
-async def health() -> Response:
-    """Check the health of the http server."""
-    return Response(status_code=200)
-
-
 @app.get("/health_generate")
 async def health_generate(request: Request) -> Response:
-    """Check the health of the inference server by generating one token."""
+    """
+    Check the health of the inference server by sending a special request to generate one token.
+
+    If the server is running something, this request will be ignored, so it creates zero overhead.
+    If the server is not running anything, this request will be run, so we know whether the server is healthy.
+    """
+
     if _global_state.tokenizer_manager.gracefully_exit:
         logger.info("Health check request received during shutdown. Returning 503.")
         return Response(status_code=503)
 
+    if not _global_state.tokenizer_manager.server_status.is_healthy():
+        return Response(status_code=503)
+
     sampling_params = {"max_new_tokens": 1, "temperature": 0.0}
     rid = f"HEALTH_CHECK_{time.time()}"
 
     if _global_state.tokenizer_manager.is_image_gen:
-        raise NotImplementedError()
+        # Keep this branch for some internal use cases.
+        raise NotImplementedError("Image generation is not supported yet.")
     elif _global_state.tokenizer_manager.is_generation:
         gri = GenerateReqInput(
             rid=rid,
@@ -254,6 +293,12 @@ async def health_generate(request: Request) -> Response:
             sampling_params=sampling_params,
             log_metrics=False,
         )
+        if (
+            _global_state.tokenizer_manager.server_args.disaggregation_mode
+            != DisaggregationMode.NULL
+        ):
+            gri.bootstrap_host = FAKE_BOOTSTRAP_HOST
+            gri.bootstrap_room = 0
     else:
         gri = EmbeddingReqInput(
             rid=rid, input_ids=[0], sampling_params=sampling_params, log_metrics=False
@@ -263,9 +308,6 @@ async def health_generate(request: Request) -> Response:
         async for _ in _global_state.tokenizer_manager.generate_request(gri, request):
             break
 
-    # This request is a special request.
-    # If the server already has something running, this request will be ignored, so it creates zero overhead.
-    # If the server is not running, this request will be run, so we know whether the server is healthy.
     task = asyncio.create_task(gen())
 
     # As long as we receive any response from the detokenizer/scheduler, we consider the server is healthy.
@@ -834,6 +876,42 @@ async def v1_score_request(request: ScoringRequest, raw_request: Request):
     )
 
 
+@app.post("/v1/responses", dependencies=[Depends(validate_json_request)])
+async def v1_responses_request(request: dict, raw_request: Request):
+    """Endpoint for the responses API with reasoning support."""
+
+    request_obj = ResponsesRequest(**request)
+    result = await raw_request.app.state.openai_serving_responses.create_responses(
+        request_obj, raw_request
+    )
+
+    # Handle streaming responses
+    if isinstance(result, AsyncGenerator):
+        return StreamingResponse(
+            result,
+            media_type="text/event-stream",
+            headers={"Cache-Control": "no-cache", "Connection": "keep-alive"},
+        )
+
+    return result
+
+
+@app.get("/v1/responses/{response_id}")
+async def v1_retrieve_responses(response_id: str, raw_request: Request):
+    """Retrieve a response by ID."""
+    return await raw_request.app.state.openai_serving_responses.retrieve_responses(
+        response_id
+    )
+
+
+@app.post("/v1/responses/{response_id}/cancel")
+async def v1_cancel_responses(response_id: str, raw_request: Request):
+    """Cancel a background response."""
+    return await raw_request.app.state.openai_serving_responses.cancel_responses(
+        response_id
+    )
+
+
 @app.api_route(
     "/v1/rerank", methods=["POST", "PUT"], dependencies=[Depends(validate_json_request)]
 )
@@ -1032,8 +1110,10 @@ def _execute_server_warmup(
                 timeout=600,
             )
             assert res.status_code == 200, f"{res}"
+            _global_state.tokenizer_manager.server_status = ServerStatus.Up
+
         else:
-            logger.info(f"Start of prefill warmup ...")
+            logger.info(f"Start of pd disaggregation warmup ...")
             json_data = {
                 "sampling_params": {
                     "temperature": 0.0,
@@ -1055,9 +1135,18 @@ def _execute_server_warmup(
                 headers=headers,
                 timeout=1800,  # because of deep gemm precache is very long if not precache.
             )
-            logger.info(
-                f"End of prefill warmup with status {res.status_code}, resp: {res.json()}"
-            )
+            if res.status_code == 200:
+                logger.info(
+                    f"End of prefill disaggregation mode warmup with status {res.status_code}, resp: {res.json()}"
+                )
+                _global_state.tokenizer_manager.server_status = ServerStatus.Up
+            else:
+                logger.info(
+                    "Prefill disaggregation mode warm Up Failed, status code: {}".format(
+                        res.status_code
+                    )
+                )
+                _global_state.tokenizer_manager.server_status = ServerStatus.UnHealthy
 
     except Exception:
         last_traceback = get_exception_traceback()
@@ -1083,6 +1172,8 @@ def _wait_and_warmup(
             pipe_finish_writer,
         ):
            return
+    else:
+        _global_state.tokenizer_manager.server_status = ServerStatus.Up
 
     logger.info("The server is fired up and ready to roll!")
1179