sglang 0.4.10.post2__py3-none-any.whl → 0.5.0rc1__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- sglang/__init__.py +8 -3
- sglang/bench_one_batch.py +119 -17
- sglang/lang/chat_template.py +18 -0
- sglang/srt/bench_utils.py +137 -0
- sglang/srt/configs/model_config.py +42 -7
- sglang/srt/conversation.py +9 -5
- sglang/srt/disaggregation/base/conn.py +5 -2
- sglang/srt/disaggregation/decode.py +14 -4
- sglang/srt/disaggregation/decode_schedule_batch_mixin.py +3 -0
- sglang/srt/disaggregation/mooncake/conn.py +286 -160
- sglang/srt/disaggregation/mooncake/transfer_engine.py +29 -0
- sglang/srt/disaggregation/prefill.py +2 -0
- sglang/srt/distributed/parallel_state.py +15 -11
- sglang/srt/entrypoints/context.py +227 -0
- sglang/srt/entrypoints/engine.py +15 -9
- sglang/srt/entrypoints/harmony_utils.py +372 -0
- sglang/srt/entrypoints/http_server.py +74 -4
- sglang/srt/entrypoints/openai/protocol.py +218 -1
- sglang/srt/entrypoints/openai/serving_chat.py +41 -11
- sglang/srt/entrypoints/openai/serving_responses.py +1273 -0
- sglang/srt/entrypoints/openai/tool_server.py +175 -0
- sglang/srt/entrypoints/tool.py +87 -0
- sglang/srt/eplb/expert_location.py +5 -1
- sglang/srt/function_call/ebnf_composer.py +1 -0
- sglang/srt/function_call/function_call_parser.py +2 -0
- sglang/srt/function_call/glm4_moe_detector.py +1 -1
- sglang/srt/function_call/gpt_oss_detector.py +331 -0
- sglang/srt/function_call/kimik2_detector.py +3 -3
- sglang/srt/function_call/qwen3_coder_detector.py +219 -9
- sglang/srt/hf_transformers_utils.py +30 -3
- sglang/srt/jinja_template_utils.py +14 -1
- sglang/srt/layers/attention/aiter_backend.py +375 -115
- sglang/srt/layers/attention/ascend_backend.py +3 -0
- sglang/srt/layers/attention/dual_chunk_flashattention_backend.py +1700 -0
- sglang/srt/layers/attention/flashattention_backend.py +18 -0
- sglang/srt/layers/attention/flashinfer_backend.py +52 -13
- sglang/srt/layers/attention/hybrid_attn_backend.py +1 -1
- sglang/srt/layers/attention/triton_backend.py +85 -14
- sglang/srt/layers/attention/triton_ops/decode_attention.py +17 -0
- sglang/srt/layers/attention/triton_ops/extend_attention.py +143 -98
- sglang/srt/layers/attention/trtllm_mha_backend.py +332 -0
- sglang/srt/layers/attention/trtllm_mla_backend.py +119 -22
- sglang/srt/layers/attention/vision.py +22 -6
- sglang/srt/layers/attention/wave_backend.py +627 -0
- sglang/srt/layers/attention/wave_ops/decode_attention.py +186 -0
- sglang/srt/layers/attention/wave_ops/extend_attention.py +149 -0
- sglang/srt/layers/attention/wave_ops/prefill_attention.py +79 -0
- sglang/srt/layers/communicator.py +29 -14
- sglang/srt/layers/dp_attention.py +12 -0
- sglang/srt/layers/flashinfer_comm_fusion.py +4 -4
- sglang/srt/layers/linear.py +3 -7
- sglang/srt/layers/moe/cutlass_moe.py +12 -3
- sglang/srt/layers/moe/cutlass_w4a8_moe.py +4 -5
- sglang/srt/layers/moe/ep_moe/kernels.py +43 -0
- sglang/srt/layers/moe/ep_moe/layer.py +135 -73
- sglang/srt/layers/moe/fused_moe_triton/configs/triton_3_4_0/E=128,N=768,device_name=NVIDIA_B200,dtype=fp8_w8a8,block_shape=[128, 128].json +146 -0
- sglang/srt/layers/moe/fused_moe_triton/configs/triton_3_4_0/E=384,N=256,device_name=NVIDIA_B200,dtype=fp8_w8a8,block_shape=[128, 128].json +146 -0
- sglang/srt/layers/moe/fused_moe_triton/fused_moe.py +101 -12
- sglang/srt/layers/moe/fused_moe_triton/layer.py +412 -33
- sglang/srt/layers/moe/fused_moe_triton/triton_kernels_moe.py +188 -3
- sglang/srt/layers/moe/token_dispatcher/deepep.py +61 -24
- sglang/srt/layers/moe/topk.py +16 -4
- sglang/srt/layers/moe/utils.py +16 -0
- sglang/srt/layers/quantization/__init__.py +27 -3
- sglang/srt/layers/quantization/fp4.py +557 -0
- sglang/srt/layers/quantization/fp8.py +3 -6
- sglang/srt/layers/quantization/fp8_kernel.py +277 -0
- sglang/srt/layers/quantization/fp8_utils.py +51 -10
- sglang/srt/layers/quantization/modelopt_quant.py +258 -68
- sglang/srt/layers/quantization/mxfp4.py +654 -0
- sglang/srt/layers/quantization/mxfp4_tensor.py +133 -0
- sglang/srt/layers/quantization/quark/schemes/__init__.py +6 -0
- sglang/srt/layers/quantization/quark/schemes/quark_scheme.py +55 -0
- sglang/srt/layers/quantization/quark/schemes/quark_w4a4_mxfp4.py +118 -0
- sglang/srt/layers/quantization/quark/utils.py +107 -0
- sglang/srt/layers/quantization/unquant.py +60 -6
- sglang/srt/layers/quantization/w4afp8.py +21 -12
- sglang/srt/layers/quantization/w8a8_int8.py +48 -34
- sglang/srt/layers/rotary_embedding.py +506 -3
- sglang/srt/layers/utils.py +9 -0
- sglang/srt/layers/vocab_parallel_embedding.py +8 -3
- sglang/srt/lora/backend/base_backend.py +3 -23
- sglang/srt/lora/layers.py +60 -114
- sglang/srt/lora/lora.py +17 -62
- sglang/srt/lora/lora_manager.py +82 -62
- sglang/srt/lora/lora_registry.py +23 -11
- sglang/srt/lora/mem_pool.py +63 -68
- sglang/srt/lora/triton_ops/qkv_lora_b.py +1 -1
- sglang/srt/lora/utils.py +25 -58
- sglang/srt/managers/cache_controller.py +75 -58
- sglang/srt/managers/detokenizer_manager.py +1 -1
- sglang/srt/managers/io_struct.py +20 -8
- sglang/srt/managers/mm_utils.py +6 -13
- sglang/srt/managers/multimodal_processor.py +1 -1
- sglang/srt/managers/schedule_batch.py +61 -25
- sglang/srt/managers/schedule_policy.py +6 -6
- sglang/srt/managers/scheduler.py +41 -19
- sglang/srt/managers/scheduler_output_processor_mixin.py +1 -2
- sglang/srt/managers/scheduler_profiler_mixin.py +28 -8
- sglang/srt/managers/scheduler_recv_skipper.py +37 -0
- sglang/srt/managers/scheduler_update_weights_mixin.py +6 -0
- sglang/srt/managers/template_manager.py +35 -1
- sglang/srt/managers/tokenizer_manager.py +47 -30
- sglang/srt/managers/tp_worker.py +3 -0
- sglang/srt/managers/tp_worker_overlap_thread.py +3 -0
- sglang/srt/mem_cache/allocator.py +61 -87
- sglang/srt/mem_cache/hicache_storage.py +1 -1
- sglang/srt/mem_cache/hiradix_cache.py +80 -22
- sglang/srt/mem_cache/lora_radix_cache.py +421 -0
- sglang/srt/mem_cache/memory_pool_host.py +34 -36
- sglang/srt/mem_cache/multimodal_cache.py +33 -13
- sglang/srt/mem_cache/radix_cache.py +2 -5
- sglang/srt/mem_cache/storage/hf3fs/client_hf3fs.py +2 -2
- sglang/srt/mem_cache/storage/hf3fs/mini_3fs_metadata_server.py +443 -0
- sglang/srt/mem_cache/storage/hf3fs/storage_hf3fs.py +139 -67
- sglang/srt/mem_cache/storage/mooncake_store/mooncake_store.py +6 -9
- sglang/srt/model_executor/cuda_graph_runner.py +29 -9
- sglang/srt/model_executor/forward_batch_info.py +61 -19
- sglang/srt/model_executor/model_runner.py +148 -37
- sglang/srt/model_loader/loader.py +18 -6
- sglang/srt/model_loader/weight_utils.py +10 -0
- sglang/srt/models/bailing_moe.py +425 -0
- sglang/srt/models/deepseek_v2.py +137 -59
- sglang/srt/models/ernie4.py +426 -0
- sglang/srt/models/ernie4_eagle.py +203 -0
- sglang/srt/models/gemma2.py +0 -34
- sglang/srt/models/gemma3n_mm.py +38 -0
- sglang/srt/models/glm4.py +6 -0
- sglang/srt/models/glm4_moe.py +28 -16
- sglang/srt/models/glm4v.py +589 -0
- sglang/srt/models/glm4v_moe.py +400 -0
- sglang/srt/models/gpt_oss.py +1251 -0
- sglang/srt/models/granite.py +0 -25
- sglang/srt/models/llama.py +0 -25
- sglang/srt/models/llama4.py +1 -1
- sglang/srt/models/qwen2.py +6 -0
- sglang/srt/models/qwen2_5_vl.py +7 -3
- sglang/srt/models/qwen2_audio.py +10 -9
- sglang/srt/models/qwen2_moe.py +6 -0
- sglang/srt/models/qwen3.py +0 -24
- sglang/srt/models/qwen3_moe.py +32 -6
- sglang/srt/models/registry.py +1 -1
- sglang/srt/models/step3_vl.py +9 -0
- sglang/srt/models/torch_native_llama.py +0 -24
- sglang/srt/models/transformers.py +2 -5
- sglang/srt/multimodal/processors/base_processor.py +23 -13
- sglang/srt/multimodal/processors/glm4v.py +132 -0
- sglang/srt/multimodal/processors/qwen_audio.py +4 -2
- sglang/srt/multimodal/processors/step3_vl.py +3 -1
- sglang/srt/reasoning_parser.py +332 -37
- sglang/srt/server_args.py +186 -75
- sglang/srt/speculative/eagle_worker.py +16 -0
- sglang/srt/two_batch_overlap.py +169 -9
- sglang/srt/utils.py +41 -5
- sglang/srt/weight_sync/tensor_bucket.py +106 -0
- sglang/test/attention/test_trtllm_mla_backend.py +186 -36
- sglang/test/doc_patch.py +59 -0
- sglang/test/few_shot_gsm8k.py +1 -1
- sglang/test/few_shot_gsm8k_engine.py +1 -1
- sglang/test/run_eval.py +4 -1
- sglang/test/runners.py +2 -2
- sglang/test/simple_eval_common.py +6 -0
- sglang/test/simple_eval_gpqa.py +2 -0
- sglang/test/test_fp4_moe.py +118 -36
- sglang/test/test_utils.py +1 -1
- sglang/utils.py +1 -1
- sglang/version.py +1 -1
- {sglang-0.4.10.post2.dist-info → sglang-0.5.0rc1.dist-info}/METADATA +36 -38
- {sglang-0.4.10.post2.dist-info → sglang-0.5.0rc1.dist-info}/RECORD +174 -141
- sglang/srt/lora/backend/flashinfer_backend.py +0 -131
- /sglang/{api.py → lang/api.py} +0 -0
- /sglang/{lang/backend → srt/layers/quantization/quark}/__init__.py +0 -0
- {sglang-0.4.10.post2.dist-info → sglang-0.5.0rc1.dist-info}/WHEEL +0 -0
- {sglang-0.4.10.post2.dist-info → sglang-0.5.0rc1.dist-info}/licenses/LICENSE +0 -0
- {sglang-0.4.10.post2.dist-info → sglang-0.5.0rc1.dist-info}/top_level.txt +0 -0
sglang/srt/entrypoints/harmony_utils.py
@@ -0,0 +1,372 @@
+# SPDX-License-Identifier: Apache-2.0
+# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
+# Adapted from vLLM: https://github.com/vllm-project/vllm/blob/1b9902806915040ac9b3029f2ab7522ec505afc3/vllm/entrypoints/harmony_utils.py
+# Slight differences in processing chat messages
+import datetime
+import json
+from collections.abc import Iterable
+from typing import Literal, Optional, Union
+
+from openai.types.responses import (
+    ResponseOutputItem,
+    ResponseOutputMessage,
+    ResponseOutputText,
+    ResponseReasoningItem,
+)
+from openai.types.responses.response_function_tool_call import ResponseFunctionToolCall
+from openai.types.responses.response_function_web_search import (
+    ActionFind,
+    ActionOpenPage,
+    ActionSearch,
+    ResponseFunctionWebSearch,
+)
+from openai.types.responses.response_reasoning_item import (
+    Content as ResponseReasoningTextContent,
+)
+from openai.types.responses.tool import Tool
+from openai_harmony import (
+    Author,
+    Conversation,
+    DeveloperContent,
+    HarmonyEncodingName,
+    Message,
+    ReasoningEffort,
+    Role,
+    StreamableParser,
+    SystemContent,
+    TextContent,
+    ToolDescription,
+    load_harmony_encoding,
+)
+
+from sglang.srt.entrypoints.openai.protocol import ResponseInputOutputItem
+from sglang.srt.utils import random_uuid
+
+REASONING_EFFORT = {
+    "high": ReasoningEffort.HIGH,
+    "medium": ReasoningEffort.MEDIUM,
+    "low": ReasoningEffort.LOW,
+}
+
+_harmony_encoding = None
+
+
+def get_encoding():
+    global _harmony_encoding
+    if _harmony_encoding is None:
+        _harmony_encoding = load_harmony_encoding(HarmonyEncodingName.HARMONY_GPT_OSS)
+    return _harmony_encoding
+
+
+def get_system_message(
+    model_identity: Optional[str] = None,
+    reasoning_effort: Optional[Literal["high", "medium", "low"]] = None,
+    start_date: Optional[str] = None,
+    browser_description: Optional[str] = None,
+    python_description: Optional[str] = None,
+) -> Message:
+    sys_msg_content = SystemContent.new()
+    if model_identity is not None:
+        sys_msg_content = sys_msg_content.with_model_identity(model_identity)
+    if reasoning_effort is not None:
+        sys_msg_content = sys_msg_content.with_reasoning_effort(
+            REASONING_EFFORT[reasoning_effort]
+        )
+    if start_date is None:
+        start_date = datetime.datetime.now().strftime("%Y-%m-%d")
+    sys_msg_content = sys_msg_content.with_conversation_start_date(start_date)
+    if browser_description is not None:
+        sys_msg_content = sys_msg_content.with_tools(browser_description)
+    if python_description is not None:
+        sys_msg_content = sys_msg_content.with_tools(python_description)
+    sys_msg = Message.from_role_and_content(Role.SYSTEM, sys_msg_content)
+    return sys_msg
+
+
+def get_developer_message(
+    instructions: Optional[str] = None, tools: Optional[list[Tool]] = None
+) -> Message:
+    dev_msg_content = DeveloperContent.new()
+    if instructions is not None:
+        dev_msg_content = dev_msg_content.with_instructions(instructions)
+    if tools is not None:
+        function_tools = []
+        for tool in tools:
+            if tool.type in ("web_search_preview", "code_interpreter"):
+                # These are built-in tools that are added to the system message.
+                pass
+            elif tool.type == "function":
+                function_tools.append(tool)
+            else:
+                raise ValueError(f"tool type {tool.type} not supported")
+        if function_tools:
+            function_tool_descriptions = [
+                ToolDescription.new(
+                    name=tool.name,
+                    description=tool.description,
+                    parameters=tool.parameters,
+                )
+                for tool in function_tools
+            ]
+            dev_msg_content = dev_msg_content.with_function_tools(
+                function_tool_descriptions
+            )
+    dev_msg = Message.from_role_and_content(Role.DEVELOPER, dev_msg_content)
+    return dev_msg
+
+
+def get_user_message(content: str) -> Message:
+    return Message.from_role_and_content(Role.USER, content)
+
+
+def parse_response_input(
+    response_msg: ResponseInputOutputItem,
+    prev_responses: list[Union[ResponseOutputItem, ResponseReasoningItem]],
+) -> Message:
+    if not isinstance(response_msg, dict):
+        response_msg = response_msg.model_dump()
+    if "type" not in response_msg or response_msg["type"] == "message":
+        role = response_msg["role"]
+        content = response_msg["content"]
+        if role == "system":
+            # User is trying to set a system message. Change it to:
+            # <|start|>developer<|message|># Instructions
+            # {instructions}<|end|>
+            role = "developer"
+            text_prefix = "Instructions:\n"
+        else:
+            text_prefix = ""
+        if isinstance(content, str):
+            msg = Message.from_role_and_content(role, text_prefix + content)
+        else:
+            contents = [TextContent(text=text_prefix + c["text"]) for c in content]
+            msg = Message.from_role_and_contents(role, contents)
+    elif response_msg["type"] == "function_call_output":
+        call_id = response_msg["call_id"]
+        call_response: Optional[ResponseFunctionToolCall] = None
+        for prev_response in reversed(prev_responses):
+            if (
+                isinstance(prev_response, ResponseFunctionToolCall)
+                and prev_response.call_id == call_id
+            ):
+                call_response = prev_response
+                break
+        if call_response is None:
+            raise ValueError(f"No call message found for {call_id}")
+        msg = Message.from_author_and_content(
+            Author.new(Role.TOOL, f"functions.{call_response.name}"),
+            response_msg["output"],
+        )
+    elif response_msg["type"] == "reasoning":
+        content = response_msg["content"]
+        assert len(content) == 1
+        msg = Message.from_role_and_content(Role.ASSISTANT, content[0]["text"])
+    elif response_msg["type"] == "function_call":
+        msg = Message.from_role_and_content(Role.ASSISTANT, response_msg["arguments"])
+        msg = msg.with_channel("commentary")
+        msg = msg.with_recipient(f"functions.{response_msg['name']}")
+        msg = msg.with_content_type("json")
+    else:
+        raise ValueError(f"Unknown input type: {response_msg['type']}")
+    return msg
+
+
+def parse_response_output(output: ResponseOutputItem) -> Message:
+    if isinstance(output, ResponseOutputMessage):
+        role = output.role
+        contents = [TextContent(text=c.text) for c in output.content]
+        msg = Message.from_role_and_contents(role, contents)
+        return msg
+    elif isinstance(output, ResponseFunctionToolCall):
+        msg = Message.from_role_and_content(Role.ASSISTANT, output.arguments)
+        msg = msg.with_channel("commentary")
+        msg = msg.with_recipient(output.name)
+        msg = msg.with_content_type("json")
+        return msg
+    else:
+        raise ValueError(f"Unknown output type: {type(output)}")
+
+
+def parse_chat_input(chat_msg) -> Message:
+    role = chat_msg.role
+    content = chat_msg.content
+    if isinstance(content, str):
+        contents = [TextContent(text=content)]
+    else:
+        # TODO: Support refusal.
+        contents = [TextContent(text=c.text) for c in content]
+    msg = Message.from_role_and_contents(role, contents)
+    return msg
+
+
+def render_for_completion(messages: list[Message]) -> list[int]:
+    conversation = Conversation.from_messages(messages)
+    token_ids = get_encoding().render_conversation_for_completion(
+        conversation, Role.ASSISTANT
+    )
+    return token_ids
+
+
+def get_stop_tokens_for_assistant_actions() -> list[int]:
+    return get_encoding().stop_tokens_for_assistant_actions()
+
+
+def get_streamable_parser_for_assistant() -> StreamableParser:
+    return StreamableParser(get_encoding(), role=Role.ASSISTANT)
+
+
+def parse_output_message(message: Message):
+    if message.author.role != "assistant":
+        # This is a message from a tool to the assistant (e.g., search result).
+        # Don't include it in the final output for now. This aligns with
+        # OpenAI's behavior on models like o4-mini.
+        return []
+
+    output_items = []
+    recipient = message.recipient
+    if recipient is not None and recipient.startswith("browser."):
+        if len(message.content) != 1:
+            raise ValueError("Invalid number of contents in browser message")
+        content = message.content[0]
+        browser_call = json.loads(content.text)
+        # TODO: translate to url properly!
+        if recipient == "browser.search":
+            action = ActionSearch(
+                query=f"cursor:{browser_call.get('query', '')}", type="search"
+            )
+        elif recipient == "browser.open":
+            action = ActionOpenPage(
+                url=f"cursor:{browser_call.get('url', '')}", type="open_page"
+            )
+        elif recipient == "browser.find":
+            action = ActionFind(
+                pattern=browser_call["pattern"],
+                url=f"cursor:{browser_call.get('url', '')}",
+                type="find",
+            )
+        else:
+            raise ValueError(f"Unknown browser action: {recipient}")
+        web_search_item = ResponseFunctionWebSearch(
+            id=f"ws_{random_uuid()}",
+            action=action,
+            status="completed",
+            type="web_search_call",
+        )
+        output_items.append(web_search_item)
+    elif message.channel == "analysis":
+        for content in message.content:
+            reasoning_item = ResponseReasoningItem(
+                id=f"rs_{random_uuid()}",
+                type="reasoning",
+                summary=[],
+                content=[
+                    ResponseReasoningTextContent(
+                        text=content.text, type="reasoning_text"
+                    )
+                ],
+                status=None,
+            )
+            output_items.append(reasoning_item)
+    elif message.channel == "commentary":
+        if message.recipient.startswith("functions."):
+            function_name = message.recipient.split(".")[-1]
+            for content in message.content:
+                random_id = random_uuid()
+                response_item = ResponseFunctionToolCall(
+                    arguments=content.text,
+                    call_id=f"call_{random_id}",
+                    type="function_call",
+                    name=function_name,
+                    id=f"ft_{random_id}",
+                )
+                output_items.append(response_item)
+        elif message.recipient.startswith("python") or message.recipient.startswith(
+            "browser"
+        ):
+            for content in message.content:
+                reasoning_item = ResponseReasoningItem(
+                    id=f"rs_{random_uuid()}",
+                    type="reasoning",
+                    summary=[],
+                    content=[
+                        ResponseReasoningTextContent(
+                            text=content.text, type="reasoning_text"
+                        )
+                    ],
+                    status=None,
+                )
+                output_items.append(reasoning_item)
+        else:
+            raise ValueError(f"Unknown recipient: {message.recipient}")
+    elif message.channel == "final":
+        contents = []
+        for content in message.content:
+            output_text = ResponseOutputText(
+                text=content.text,
+                annotations=[],  # TODO
+                type="output_text",
+                logprobs=None,  # TODO
+            )
+            contents.append(output_text)
+        text_item = ResponseOutputMessage(
+            id=f"msg_{random_uuid()}",
+            content=contents,
+            role=message.author.role,
+            status="completed",
+            type="message",
+        )
+        output_items.append(text_item)
+    else:
+        raise ValueError(f"Unknown channel: {message.channel}")
+    return output_items
+
+
+def parse_remaining_state(parser: StreamableParser):
+    if not parser.current_content:
+        return []
+    if parser.current_role != Role.ASSISTANT:
+        return []
+    current_recipient = parser.current_recipient
+    if current_recipient is not None and current_recipient.startswith("browser."):
+        return []
+
+    if parser.current_channel == "analysis":
+        reasoning_item = ResponseReasoningItem(
+            id=f"rs_{random_uuid()}",
+            type="reasoning",
+            summary=[],
+            content=[
+                ResponseReasoningTextContent(
+                    text=parser.current_content, type="reasoning_text"
+                )
+            ],
+            status=None,
+        )
+        return [reasoning_item]
+    elif parser.current_channel == "final":
+        output_text = ResponseOutputText(
+            content=[
+                ResponseReasoningTextContent(
+                    text=parser.current_content, type="reasoning_text"
+                )
+            ],
+            annotations=[],  # TODO
+            type="output_text",
+            logprobs=None,  # TODO
+        )
+        text_item = ResponseOutputMessage(
+            id=f"msg_{random_uuid()}",
+            content=[output_text],
+            role="assistant",
+            status="completed",
+            type="message",
+        )
+        return [text_item]
+    return []
+
+
+def parse_output_into_messages(token_ids: Iterable[int]):
+    parser = get_streamable_parser_for_assistant()
+    for token_id in token_ids:
+        parser.process(token_id)
+    return parser
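The new module above packages the Harmony-format plumbing used for gpt-oss style models: building system/developer/user messages, rendering a conversation to input token ids, and re-parsing generated tokens into OpenAI Responses-API output items. The sketch below is illustrative only and not part of the diff: the `generate_fn` callable standing in for the inference engine is hypothetical, and reading the completed messages via `parser.messages` is an assumption about the `openai_harmony` StreamableParser.

```python
# Illustrative usage sketch (not part of the diff) for the new harmony_utils helpers.
# Assumes openai_harmony is installed; `generate_fn` is a placeholder for whatever
# produces completion token ids.
from sglang.srt.entrypoints.harmony_utils import (
    get_developer_message,
    get_stop_tokens_for_assistant_actions,
    get_system_message,
    get_user_message,
    parse_output_into_messages,
    parse_output_message,
    render_for_completion,
)


def run_once(generate_fn, prompt: str):
    # 1) Build the Harmony conversation: system -> developer -> user.
    messages = [
        get_system_message(reasoning_effort="low"),
        get_developer_message(instructions="Answer concisely."),
        get_user_message(prompt),
    ]
    # 2) Render to input token ids and fetch the assistant stop tokens.
    input_ids = render_for_completion(messages)
    stop_ids = get_stop_tokens_for_assistant_actions()
    # 3) Generate with an engine of your choice (hypothetical call).
    output_ids = generate_fn(input_ids, stop_token_ids=stop_ids)
    # 4) Re-parse the generated tokens into Harmony messages and convert each one
    #    into Responses-API output items (reasoning, tool calls, final text).
    #    `parser.messages` is assumed to hold the completed messages.
    parser = parse_output_into_messages(output_ids)
    items = []
    for message in parser.messages:
        items.extend(parse_output_message(message))
    return items
```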
sglang/srt/entrypoints/http_server.py
@@ -26,12 +26,13 @@ import os
 import threading
 import time
 from http import HTTPStatus
-from typing import AsyncIterator, Callable, Dict, Optional
+from typing import Any, AsyncIterator, Callable, Dict, List, Optional

 # Fix a bug of Python threading
 setattr(threading, "_register_atexit", lambda *args, **kwargs: None)

 from contextlib import asynccontextmanager
+from typing import AsyncGenerator

 import numpy as np
 import orjson
@@ -56,6 +57,7 @@ from sglang.srt.entrypoints.openai.protocol import (
     ErrorResponse,
     ModelCard,
     ModelList,
+    ResponsesRequest,
     ScoringRequest,
     V1RerankReqInput,
 )
@@ -147,6 +149,36 @@ async def lifespan(fast_api_app: FastAPI):
     )

     server_args: ServerArgs = fast_api_app.server_args
+
+    tool_server = None
+    if server_args.tool_server == "demo":
+        from sglang.srt.entrypoints.openai.tool_server import DemoToolServer
+
+        tool_server = DemoToolServer()
+    elif server_args.tool_server:
+        from sglang.srt.entrypoints.openai.tool_server import MCPToolServer
+
+        tool_server = MCPToolServer()
+        await tool_server.add_tool_server(server_args.tool_server)
+
+    try:
+        from sglang.srt.entrypoints.openai.serving_responses import (
+            OpenAIServingResponses,
+        )
+
+        fast_api_app.state.openai_serving_responses = OpenAIServingResponses(
+            _global_state.tokenizer_manager,
+            _global_state.template_manager,
+            enable_prompt_tokens_details=True,
+            enable_force_include_usage=True,
+            tool_server=tool_server,
+        )
+    except Exception as e:
+        import traceback
+
+        traceback.print_exc()
+        logger.warning(f"Can not initialize OpenAIServingResponses, error: {e}")
+
     if server_args.warmups is not None:
         await execute_warmups(
             server_args.disaggregation_mode,
@@ -244,7 +276,7 @@ async def health_generate(request: Request) -> Response:
         logger.info("Health check request received during shutdown. Returning 503.")
         return Response(status_code=503)

-    if
+    if _global_state.tokenizer_manager.server_status == ServerStatus.Starting:
         return Response(status_code=503)

     sampling_params = {"max_new_tokens": 1, "temperature": 0.0}
@@ -284,7 +316,7 @@ async def health_generate(request: Request) -> Response:
         if _global_state.tokenizer_manager.last_receive_tstamp > tic:
             task.cancel()
             _global_state.tokenizer_manager.rid_to_state.pop(rid, None)
-            _global_state.tokenizer_manager.
+            _global_state.tokenizer_manager.server_status = ServerStatus.Up
             return Response(status_code=200)

     task.cancel()
@@ -298,7 +330,7 @@ async def health_generate(request: Request) -> Response:
         f"last_heartbeat time: {last_receive_time}"
     )
     _global_state.tokenizer_manager.rid_to_state.pop(rid, None)
-    _global_state.tokenizer_manager.
+    _global_state.tokenizer_manager.server_status = ServerStatus.UnHealthy
     return Response(status_code=503)


@@ -843,6 +875,42 @@ async def v1_score_request(request: ScoringRequest, raw_request: Request):
     )


+@app.post("/v1/responses", dependencies=[Depends(validate_json_request)])
+async def v1_responses_request(request: dict, raw_request: Request):
+    """Endpoint for the responses API with reasoning support."""
+
+    request_obj = ResponsesRequest(**request)
+    result = await raw_request.app.state.openai_serving_responses.create_responses(
+        request_obj, raw_request
+    )
+
+    # Handle streaming responses
+    if isinstance(result, AsyncGenerator):
+        return StreamingResponse(
+            result,
+            media_type="text/event-stream",
+            headers={"Cache-Control": "no-cache", "Connection": "keep-alive"},
+        )
+
+    return result
+
+
+@app.get("/v1/responses/{response_id}")
+async def v1_retrieve_responses(response_id: str, raw_request: Request):
+    """Retrieve a response by ID."""
+    return await raw_request.app.state.openai_serving_responses.retrieve_responses(
+        response_id
+    )
+
+
+@app.post("/v1/responses/{response_id}/cancel")
+async def v1_cancel_responses(response_id: str, raw_request: Request):
+    """Cancel a background response."""
+    return await raw_request.app.state.openai_serving_responses.cancel_responses(
+        response_id
+    )
+
+
 @app.api_route(
     "/v1/rerank", methods=["POST", "PUT"], dependencies=[Depends(validate_json_request)]
 )
@@ -1103,6 +1171,8 @@ def _wait_and_warmup(
         pipe_finish_writer,
     ):
         return
+    else:
+        _global_state.tokenizer_manager.server_status = ServerStatus.Up

     logger.info("The server is fired up and ready to roll!")

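Together with the new serving_responses.py and tool_server.py modules, these routes expose an OpenAI-compatible Responses API. Below is a minimal client sketch, illustrative only and not part of the diff: it assumes a server listening on localhost:30000 and OpenAI Responses-style request fields such as `model` and `input`; the authoritative request schema is `ResponsesRequest` in sglang/srt/entrypoints/openai/protocol.py.

```python
# Minimal sketch of exercising the new /v1/responses routes over plain HTTP.
# Assumptions: server at localhost:30000; ResponsesRequest accepts OpenAI
# Responses-API style fields ("model", "input"); the returned object carries
# "id" and "status" fields as in the OpenAI Responses API. Verify against
# protocol.py and serving_responses.py before relying on these names.
import requests

BASE_URL = "http://localhost:30000"

# Create a response (non-streaming).
created = requests.post(
    f"{BASE_URL}/v1/responses",
    json={"model": "default", "input": "What is the capital of France?"},
).json()
response_id = created["id"]

# Retrieve the same response by id.
fetched = requests.get(f"{BASE_URL}/v1/responses/{response_id}").json()
print(fetched.get("status"))

# Cancel it (only meaningful for background responses).
requests.post(f"{BASE_URL}/v1/responses/{response_id}/cancel")
```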