sglang 0.5.0rc0__py3-none-any.whl → 0.5.0rc1__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- sglang/__init__.py +8 -3
- sglang/bench_one_batch.py +6 -0
- sglang/lang/chat_template.py +18 -0
- sglang/srt/bench_utils.py +137 -0
- sglang/srt/configs/model_config.py +7 -7
- sglang/srt/disaggregation/decode.py +8 -3
- sglang/srt/disaggregation/mooncake/conn.py +43 -25
- sglang/srt/disaggregation/mooncake/transfer_engine.py +29 -0
- sglang/srt/distributed/parallel_state.py +4 -2
- sglang/srt/entrypoints/context.py +3 -20
- sglang/srt/entrypoints/engine.py +13 -8
- sglang/srt/entrypoints/harmony_utils.py +2 -0
- sglang/srt/entrypoints/http_server.py +4 -5
- sglang/srt/entrypoints/openai/protocol.py +0 -9
- sglang/srt/entrypoints/openai/serving_chat.py +59 -265
- sglang/srt/entrypoints/openai/tool_server.py +4 -3
- sglang/srt/function_call/ebnf_composer.py +1 -0
- sglang/srt/function_call/function_call_parser.py +2 -0
- sglang/srt/function_call/glm4_moe_detector.py +1 -1
- sglang/srt/function_call/gpt_oss_detector.py +331 -0
- sglang/srt/function_call/kimik2_detector.py +3 -3
- sglang/srt/function_call/qwen3_coder_detector.py +219 -9
- sglang/srt/jinja_template_utils.py +6 -0
- sglang/srt/layers/attention/aiter_backend.py +370 -107
- sglang/srt/layers/attention/ascend_backend.py +3 -0
- sglang/srt/layers/attention/dual_chunk_flashattention_backend.py +1 -1
- sglang/srt/layers/attention/flashattention_backend.py +18 -0
- sglang/srt/layers/attention/flashinfer_backend.py +52 -13
- sglang/srt/layers/attention/hybrid_attn_backend.py +1 -1
- sglang/srt/layers/attention/trtllm_mla_backend.py +119 -22
- sglang/srt/layers/attention/vision.py +9 -1
- sglang/srt/layers/attention/wave_backend.py +627 -0
- sglang/srt/layers/attention/wave_ops/decode_attention.py +186 -0
- sglang/srt/layers/attention/wave_ops/extend_attention.py +149 -0
- sglang/srt/layers/attention/wave_ops/prefill_attention.py +79 -0
- sglang/srt/layers/communicator.py +8 -10
- sglang/srt/layers/flashinfer_comm_fusion.py +4 -4
- sglang/srt/layers/linear.py +1 -0
- sglang/srt/layers/moe/cutlass_moe.py +11 -16
- sglang/srt/layers/moe/cutlass_w4a8_moe.py +4 -5
- sglang/srt/layers/moe/ep_moe/kernels.py +43 -0
- sglang/srt/layers/moe/ep_moe/layer.py +60 -2
- sglang/srt/layers/moe/fused_moe_triton/configs/triton_3_4_0/E=128,N=768,device_name=NVIDIA_B200,dtype=fp8_w8a8,block_shape=[128, 128].json +146 -0
- sglang/srt/layers/moe/fused_moe_triton/configs/triton_3_4_0/E=384,N=256,device_name=NVIDIA_B200,dtype=fp8_w8a8,block_shape=[128, 128].json +146 -0
- sglang/srt/layers/moe/fused_moe_triton/layer.py +7 -9
- sglang/srt/layers/moe/token_dispatcher/deepep.py +61 -24
- sglang/srt/layers/moe/topk.py +4 -1
- sglang/srt/layers/quantization/__init__.py +5 -3
- sglang/srt/layers/quantization/fp8_kernel.py +277 -0
- sglang/srt/layers/quantization/fp8_utils.py +22 -10
- sglang/srt/layers/quantization/modelopt_quant.py +6 -11
- sglang/srt/layers/quantization/mxfp4.py +4 -1
- sglang/srt/layers/quantization/w4afp8.py +20 -11
- sglang/srt/layers/quantization/w8a8_int8.py +48 -34
- sglang/srt/layers/rotary_embedding.py +281 -2
- sglang/srt/lora/backend/base_backend.py +3 -23
- sglang/srt/lora/layers.py +60 -114
- sglang/srt/lora/lora.py +17 -62
- sglang/srt/lora/lora_manager.py +12 -48
- sglang/srt/lora/lora_registry.py +20 -9
- sglang/srt/lora/mem_pool.py +20 -63
- sglang/srt/lora/triton_ops/qkv_lora_b.py +1 -1
- sglang/srt/lora/utils.py +25 -58
- sglang/srt/managers/cache_controller.py +21 -29
- sglang/srt/managers/detokenizer_manager.py +1 -1
- sglang/srt/managers/io_struct.py +6 -6
- sglang/srt/managers/mm_utils.py +1 -2
- sglang/srt/managers/multimodal_processor.py +1 -1
- sglang/srt/managers/schedule_batch.py +35 -20
- sglang/srt/managers/schedule_policy.py +6 -6
- sglang/srt/managers/scheduler.py +15 -7
- sglang/srt/managers/scheduler_profiler_mixin.py +28 -8
- sglang/srt/managers/tokenizer_manager.py +25 -26
- sglang/srt/mem_cache/allocator.py +61 -87
- sglang/srt/mem_cache/hicache_storage.py +1 -1
- sglang/srt/mem_cache/hiradix_cache.py +34 -24
- sglang/srt/mem_cache/lora_radix_cache.py +421 -0
- sglang/srt/mem_cache/memory_pool_host.py +33 -35
- sglang/srt/mem_cache/radix_cache.py +2 -5
- sglang/srt/mem_cache/storage/hf3fs/mini_3fs_metadata_server.py +443 -0
- sglang/srt/mem_cache/storage/hf3fs/storage_hf3fs.py +139 -67
- sglang/srt/mem_cache/storage/mooncake_store/mooncake_store.py +6 -9
- sglang/srt/model_executor/cuda_graph_runner.py +22 -3
- sglang/srt/model_executor/forward_batch_info.py +26 -5
- sglang/srt/model_executor/model_runner.py +129 -35
- sglang/srt/model_loader/loader.py +18 -6
- sglang/srt/models/deepseek_v2.py +74 -35
- sglang/srt/models/gemma2.py +0 -34
- sglang/srt/models/gemma3n_mm.py +8 -9
- sglang/srt/models/glm4.py +6 -0
- sglang/srt/models/glm4_moe.py +9 -9
- sglang/srt/models/glm4v.py +589 -0
- sglang/srt/models/glm4v_moe.py +400 -0
- sglang/srt/models/gpt_oss.py +136 -19
- sglang/srt/models/granite.py +0 -25
- sglang/srt/models/llama.py +0 -25
- sglang/srt/models/llama4.py +1 -1
- sglang/srt/models/qwen2_5_vl.py +7 -3
- sglang/srt/models/qwen2_audio.py +10 -9
- sglang/srt/models/qwen3.py +0 -24
- sglang/srt/models/registry.py +1 -1
- sglang/srt/models/torch_native_llama.py +0 -24
- sglang/srt/multimodal/processors/base_processor.py +23 -13
- sglang/srt/multimodal/processors/glm4v.py +132 -0
- sglang/srt/multimodal/processors/qwen_audio.py +4 -2
- sglang/srt/reasoning_parser.py +316 -0
- sglang/srt/server_args.py +115 -139
- sglang/srt/speculative/eagle_worker.py +16 -0
- sglang/srt/two_batch_overlap.py +12 -4
- sglang/srt/utils.py +3 -3
- sglang/srt/weight_sync/tensor_bucket.py +106 -0
- sglang/test/attention/test_trtllm_mla_backend.py +186 -36
- sglang/test/doc_patch.py +59 -0
- sglang/test/few_shot_gsm8k.py +1 -1
- sglang/test/few_shot_gsm8k_engine.py +1 -1
- sglang/test/run_eval.py +4 -1
- sglang/test/simple_eval_common.py +6 -0
- sglang/test/simple_eval_gpqa.py +2 -0
- sglang/test/test_fp4_moe.py +118 -36
- sglang/utils.py +1 -1
- sglang/version.py +1 -1
- {sglang-0.5.0rc0.dist-info → sglang-0.5.0rc1.dist-info}/METADATA +26 -30
- {sglang-0.5.0rc0.dist-info → sglang-0.5.0rc1.dist-info}/RECORD +127 -115
- sglang/lang/backend/__init__.py +0 -0
- sglang/srt/function_call/harmony_tool_parser.py +0 -130
- sglang/srt/lora/backend/flashinfer_backend.py +0 -131
- /sglang/{api.py → lang/api.py} +0 -0
- {sglang-0.5.0rc0.dist-info → sglang-0.5.0rc1.dist-info}/WHEEL +0 -0
- {sglang-0.5.0rc0.dist-info → sglang-0.5.0rc1.dist-info}/licenses/LICENSE +0 -0
- {sglang-0.5.0rc0.dist-info → sglang-0.5.0rc1.dist-info}/top_level.txt +0 -0

sglang/srt/entrypoints/openai/serving_chat.py

@@ -7,18 +7,8 @@ from typing import Any, AsyncGenerator, Dict, List, Optional, Union
 
 from fastapi import Request
 from fastapi.responses import ORJSONResponse, StreamingResponse
-from openai_harmony import Message as OpenAIMessage
 
 from sglang.srt.conversation import generate_chat_conv
-from sglang.srt.entrypoints.harmony_utils import (
-    get_developer_message,
-    get_stop_tokens_for_assistant_actions,
-    get_streamable_parser_for_assistant,
-    get_system_message,
-    parse_chat_input,
-    parse_output_into_messages,
-    render_for_completion,
-)
 from sglang.srt.entrypoints.openai.protocol import (
     ChatCompletionRequest,
     ChatCompletionResponse,
@@ -57,30 +47,12 @@ class OpenAIServingChat(OpenAIServingBase):
     """Handler for /v1/chat/completions requests"""
 
     def __init__(
-        self, tokenizer_manager: TokenizerManager, template_manager: TemplateManager
+        self,
+        tokenizer_manager: TokenizerManager,
+        template_manager: TemplateManager,
     ):
         super().__init__(tokenizer_manager)
         self.template_manager = template_manager
-        self.use_harmony = (
-            self.tokenizer_manager.model_config.hf_config.model_type == "gpt_oss"
-        )
-
-        if self.use_harmony:
-            from sglang.srt.function_call.harmony_tool_parser import (
-                HarmonyToolCallParser,
-            )
-
-            self.harmony_tool_parser = HarmonyToolCallParser()
-
-        # NOTE While OpenAI's chat completion API supports browsing
-        # for some models, currently vLLM doesn't support it. Please use the
-        # Responses API instead.
-        self.supports_browsing = False
-        self.browser_tool = None
-        # NOTE: Chat completion API does not support code interpreter.
-        # Please use the Responses API instead.
-        self.supports_code_interpreter = False
-        self.python_tool = None
 
     def _request_id_prefix(self) -> str:
         return "chatcmpl-"
@@ -97,6 +69,18 @@ class OpenAIServingChat(OpenAIServingBase):
         ):
             return "Tools cannot be empty if tool choice is set to required."
 
+        max_output_tokens = request.max_completion_tokens or request.max_tokens
+        server_context_length = self.tokenizer_manager.server_args.context_length
+        if (
+            max_output_tokens
+            and server_context_length
+            and max_output_tokens > server_context_length
+        ):
+            return (
+                f"max_completion_tokens is too large: {max_output_tokens}."
+                f"This model supports at most {server_context_length} completion tokens."
+            )
+
         return None
 
     def _convert_to_internal_request(
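
The added check rejects requests whose completion budget (max_completion_tokens, falling back to max_tokens) exceeds the server's configured context length. A rough client-side sketch of how the rejection surfaces, assuming a local sglang server exposing the OpenAI-compatible API on port 30000 and the openai Python SDK; the base URL, API key, and model name are placeholders, not part of this diff.

# Hypothetical client illustration: a deliberately oversized completion budget
# should now be rejected with the validation message added in the hunk above.
from openai import OpenAI

client = OpenAI(base_url="http://localhost:30000/v1", api_key="EMPTY")

try:
    client.chat.completions.create(
        model="default",
        messages=[{"role": "user", "content": "Hello"}],
        max_completion_tokens=10_000_000,  # far larger than any context window
    )
except Exception as err:  # the server answers with a validation error
    print(err)
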
@@ -107,66 +91,43 @@ class OpenAIServingChat(OpenAIServingBase):
         is_multimodal = self.tokenizer_manager.model_config.is_multimodal
 
         # Process messages and apply chat template
-
-            processed_messages = self._process_messages(request, is_multimodal)
-
-            # Build sampling parameters
-            sampling_params = self._build_sampling_params(
-                request,
-                processed_messages.stop,
-                processed_messages.tool_call_constraint,
-            )
+        processed_messages = self._process_messages(request, is_multimodal)
 
-
-
-
-
-
-
-
-
-
-
-                **prompt_kwargs,
-                image_data=processed_messages.image_data,
-                video_data=processed_messages.video_data,
-                audio_data=processed_messages.audio_data,
-                sampling_params=sampling_params,
-                return_logprob=request.logprobs,
-                logprob_start_len=-1,
-                top_logprobs_num=request.top_logprobs or 0,
-                stream=request.stream,
-                return_text_in_logprobs=True,
-                modalities=processed_messages.modalities,
-                lora_path=request.lora_path,
-                bootstrap_host=request.bootstrap_host,
-                bootstrap_port=request.bootstrap_port,
-                bootstrap_room=request.bootstrap_room,
-                return_hidden_states=request.return_hidden_states,
-                rid=request.rid,
-            )
+        # Build sampling parameters
+        sampling_params = self._build_sampling_params(
+            request,
+            processed_messages.stop,
+            processed_messages.tool_call_constraint,
+        )
+
+        # Handle single vs multiple requests
+        if is_multimodal:
+            prompt_kwargs = {"text": processed_messages.prompt}
         else:
-            processed_messages,
-
-
-            input_ids
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
+            if isinstance(processed_messages.prompt_ids, str):
+                prompt_kwargs = {"text": processed_messages.prompt_ids}
+            else:
+                prompt_kwargs = {"input_ids": processed_messages.prompt_ids}
+
+        adapted_request = GenerateReqInput(
+            **prompt_kwargs,
+            image_data=processed_messages.image_data,
+            video_data=processed_messages.video_data,
+            audio_data=processed_messages.audio_data,
+            sampling_params=sampling_params,
+            return_logprob=request.logprobs,
+            logprob_start_len=-1,
+            top_logprobs_num=request.top_logprobs or 0,
+            stream=request.stream,
+            return_text_in_logprobs=True,
+            modalities=processed_messages.modalities,
+            lora_path=request.lora_path,
+            bootstrap_host=request.bootstrap_host,
+            bootstrap_port=request.bootstrap_port,
+            bootstrap_room=request.bootstrap_room,
+            return_hidden_states=request.return_hidden_states,
+            rid=request.rid,
+        )
 
         return adapted_request, request
 
@@ -251,14 +212,15 @@ class OpenAIServingChat(OpenAIServingBase):
                 tokenize=True,
                 add_generation_prompt=True,
                 tools=tools,
+                reasoning_effort=request.reasoning_effort,
                 **(
                     request.chat_template_kwargs if request.chat_template_kwargs else {}
                 ),
            )
        except Exception:
-            #
-            #
-            #
+            # This except branch will be triggered when the chosen model
+            # has a different tools input format that is not compatible
+            # with openAI's apply_chat_template tool_call format, like Mistral.
            tools = (
                [t if "function" in t else {"function": t} for t in tools]
                if tools
@@ -269,6 +231,7 @@ class OpenAIServingChat(OpenAIServingBase):
                 tokenize=True,
                 add_generation_prompt=True,
                 tools=tools,
+                reasoning_effort=request.reasoning_effort,
                 **(
                     request.chat_template_kwargs if request.chat_template_kwargs else {}
                 ),
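
Both apply_chat_template call sites now forward reasoning_effort from the request, so chat templates that read the field (such as gpt-oss-style templates) can adjust the rendered prompt. A hedged client-side sketch using the openai SDK; the field is sent via extra_body so it does not depend on the SDK version, and the endpoint and model name are placeholders.

from openai import OpenAI

client = OpenAI(base_url="http://localhost:30000/v1", api_key="EMPTY")

resp = client.chat.completions.create(
    model="default",
    messages=[{"role": "user", "content": "Summarize the plot of Hamlet."}],
    # Forwarded by the server into tokenizer.apply_chat_template(..., reasoning_effort=...)
    extra_body={"reasoning_effort": "low"},
)
print(resp.choices[0].message.content)
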
@@ -459,12 +422,6 @@ class OpenAIServingChat(OpenAIServingBase):
         cached_tokens = {}
         hidden_states = {}
 
-        # Harmony tracking
-        if self.use_harmony:
-            harmony_parsers = [
-                get_streamable_parser_for_assistant() for _ in range(request.n)
-            ]
-
         try:
             async for content in self.tokenizer_manager.generate_request(
                 adapted_request, raw_request
@@ -511,58 +468,14 @@ class OpenAIServingChat(OpenAIServingBase):
                     )
                     yield f"data: {chunk.model_dump_json()}\n\n"
 
-
-
-
-
-                    new_token_ids = content["output_ids"]
-                    for token_id in new_token_ids:
-                        harmony_parser.process(token_id)
-
-                    is_final = harmony_parser.current_channel == "final"
-                    is_analysis = harmony_parser.current_channel == "analysis"
-                    delta = harmony_parser.last_content_delta or ""
-
-                    if is_analysis:
-                        choice_data = ChatCompletionResponseStreamChoice(
-                            index=index,
-                            delta=DeltaMessage(reasoning_content=delta),
-                            finish_reason=None,
-                        )
-                        chunk = ChatCompletionStreamResponse(
-                            id=content["meta_info"]["id"],
-                            created=int(time.time()),
-                            choices=[choice_data],
-                            model=request.model,
-                        )
-                        yield f"data: {chunk.model_dump_json()}\n\n"
-                        continue
-
-                    choice_data = ChatCompletionResponseStreamChoice(
-                        index=index,
-                        delta=DeltaMessage(content=delta if delta else None),
-                        finish_reason=None,
-                        matched_stop=None,
-                        logprobs=choice_logprobs,
-                    )
-                    chunk = ChatCompletionStreamResponse(
-                        id=content["meta_info"]["id"],
-                        created=int(time.time()),
-                        choices=[choice_data],
-                        model=request.model,
-                    )
-                    yield f"data: {chunk.model_dump_json()}\n\n"
-                    continue
-                else:
-                    stream_buffer = stream_buffers.get(index, "")
-                    delta = content["text"][len(stream_buffer) :]
-                    stream_buffers[index] = stream_buffer + delta
+                stream_buffer = stream_buffers.get(index, "")
+                delta = content["text"][len(stream_buffer) :]
+                stream_buffers[index] = stream_buffer + delta
 
                 # Handle reasoning content
                 if (
                     self.tokenizer_manager.server_args.reasoning_parser
                     and request.separate_reasoning
-                    and not self.use_harmony
                 ):
                     reasoning_text, delta = self._process_reasoning_stream(
                         index, delta, reasoning_parser_dict, content, request
@@ -581,27 +494,8 @@ class OpenAIServingChat(OpenAIServingBase):
                         )
                         yield f"data: {chunk.model_dump_json()}\n\n"
 
-                if self.use_harmony and not is_final:
-                    choice_data = ChatCompletionResponseStreamChoice(
-                        index=index,
-                        delta=DeltaMessage(reasoning_content=delta),
-                        finish_reason=None,
-                    )
-                    chunk = ChatCompletionStreamResponse(
-                        id=content["meta_info"]["id"],
-                        created=int(time.time()),
-                        choices=[choice_data],
-                        model=request.model,
-                    )
-                    yield f"data: {chunk.model_dump_json()}\n\n"
-
                 # Handle tool calls
-
-                if (
-                    request.tool_choice != "none"
-                    and request.tools
-                    and not self.use_harmony
-                ):
+                if request.tool_choice != "none" and request.tools:
                     async for chunk in self._process_tool_call_stream(
                         index,
                         delta,
@@ -765,76 +659,6 @@ class OpenAIServingChat(OpenAIServingBase):
 
             finish_reason = ret_item["meta_info"]["finish_reason"]
             text = ret_item["text"]
-            output_ids = ret_item["output_ids"]
-
-            if self.use_harmony:
-                parser = parse_output_into_messages(output_ids)
-                output_msgs = parser.messages
-                if len(output_msgs) == 0:
-                    # The generation has stopped during reasoning.
-                    is_tool_call = False
-                    reasoning_content = parser.current_content
-                    final_content = None
-                elif len(output_msgs) == 1:
-                    # The generation has stopped during final message.
-                    is_tool_call = False
-                    reasoning_content = output_msgs[0].content[0].text
-                    final_content = parser.current_content
-                else:
-                    if len(output_msgs) != 2:
-                        raise ValueError(
-                            "Expected 2 output messages (reasoning and final), "
-                            f"but got {len(output_msgs)}."
-                        )
-                    reasoning_msg, final_msg = output_msgs
-                    reasoning_content = reasoning_msg.content[0].text
-                    final_content = final_msg.content[0].text
-                    is_tool_call = final_msg.recipient is not None
-
-                if is_tool_call:
-                    # Extract tool call information from final message
-                    tool_call = (
-                        self.harmony_tool_parser.extract_tool_calls_from_message(
-                            final_msg
-                        )
-                    )
-                    tool_calls = [tool_call] if tool_call else []
-
-                    message = ChatMessage(
-                        role="assistant",
-                        reasoning_content=reasoning_content,
-                        content=None,  # Tool calls don't have regular content
-                        tool_calls=tool_calls,
-                    )
-                else:
-                    # Normal message
-                    message = ChatMessage(
-                        role="assistant",
-                        reasoning_content=reasoning_content,
-                        content=final_content,
-                    )
-
-                if is_tool_call:
-                    finish_reason_type = "tool_calls"
-                elif finish_reason:
-                    finish_reason_type = (
-                        finish_reason["type"] if finish_reason else "stop"
-                    )
-                else:
-                    finish_reason_type = "stop"
-                choice_data = ChatCompletionResponseChoice(
-                    index=idx,
-                    message=message,
-                    logprobs=choice_logprobs,
-                    finish_reason=finish_reason_type,
-                    matched_stop=(
-                        finish_reason["matched"]
-                        if finish_reason and "matched" in finish_reason
-                        else None
-                    ),
-                )
-                choices.append(choice_data)
-                continue
 
             # Handle reasoning content
             reasoning_text = None
@@ -1184,33 +1008,3 @@ class OpenAIServingChat(OpenAIServingBase):
             return f"data: {chunk.model_dump_json()}\n\n"
 
         return None
-
-    def _make_request_with_harmony(
-        self,
-        request: ChatCompletionRequest,
-    ):
-        messages: list[OpenAIMessage] = []
-
-        # Add system message.
-        # In Chat Completion API, browsing is enabled by default if the model
-        # supports it.
-        assert not self.supports_browsing
-        assert not self.supports_code_interpreter
-        sys_msg = get_system_message(
-            reasoning_effort=request.reasoning_effort,
-            browser_description=None,
-            python_description=None,
-        )
-        messages.append(sys_msg)
-
-        # Add developer message.
-        dev_msg = get_developer_message()
-        messages.append(dev_msg)
-
-        # Add user message.
-        for chat_msg in request.messages:
-            messages.append(parse_chat_input(chat_msg))
-
-        # Render prompt token ids.
-        prompt_token_ids = render_for_completion(messages)
-        return messages, prompt_token_ids

sglang/srt/entrypoints/openai/tool_server.py

@@ -5,16 +5,17 @@ from abc import ABC, abstractmethod
 from contextlib import AbstractAsyncContextManager, asynccontextmanager
 from typing import Any
 
-logger = logging.getLogger(__name__)
 try:
     from mcp import ClientSession
     from mcp.client.sse import sse_client
     from mcp.types import ListToolsResult
-except ImportError:
-
+except ImportError as e:
+    ClientSession = sse_client = ListToolsResult = e
 
 from openai_harmony import ToolDescription, ToolNamespaceConfig
 
+logger = logging.getLogger(__name__)
+
 
 async def list_server_and_tools(server_url: str):
 
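
The rewritten except block binds the missing names to the caught exception, so an absent mcp package only fails when ClientSession, sse_client, or ListToolsResult is actually used, not at import time. A standalone sketch of the same deferred-failure pattern; the module and names below are illustrative placeholders, not sglang code.

try:
    from some_optional_dep import Client  # placeholder for an optional dependency such as mcp
except ImportError as e:
    Client = e  # defer the failure until the optional feature is exercised


def connect():
    # At the point of use, surface the original import error with context.
    if isinstance(Client, ImportError):
        raise RuntimeError("optional dependency is not installed") from Client
    return Client()
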

sglang/srt/function_call/ebnf_composer.py

@@ -316,6 +316,7 @@ class EBNFComposer:
 
         combined_args = "".join(rule_parts)
         arguments_rule = args_template.format(arg_rules=combined_args)
+        arguments_rule = arguments_rule or '""'
 
         # Add the function call rule and its arguments rule
         ebnf_lines.append(

sglang/srt/function_call/function_call_parser.py

@@ -11,6 +11,7 @@ from sglang.srt.function_call.base_format_detector import BaseFormatDetector
 from sglang.srt.function_call.core_types import ToolCallItem
 from sglang.srt.function_call.deepseekv3_detector import DeepSeekV3Detector
 from sglang.srt.function_call.glm4_moe_detector import Glm4MoeDetector
+from sglang.srt.function_call.gpt_oss_detector import GptOssDetector
 from sglang.srt.function_call.kimik2_detector import KimiK2Detector
 from sglang.srt.function_call.llama32_detector import Llama32Detector
 from sglang.srt.function_call.mistral_detector import MistralDetector
@@ -41,6 +42,7 @@ class FunctionCallParser:
         "qwen3_coder": Qwen3CoderDetector,
         "glm45": Glm4MoeDetector,
         "step3": Step3Detector,
+        "gpt-oss": GptOssDetector,
     }
 
     def __init__(self, tools: List[Tool], tool_call_parser: str):
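
With GptOssDetector registered under the "gpt-oss" key, the detector can be selected by the tool_call_parser name. A minimal sketch; it assumes the Tool and Function schema classes live in sglang.srt.entrypoints.openai.protocol (treat that import path, and the example tool itself, as assumptions rather than part of this diff).

from sglang.srt.entrypoints.openai.protocol import Function, Tool
from sglang.srt.function_call.function_call_parser import FunctionCallParser

tools = [
    Tool(
        type="function",
        function=Function(
            name="get_weather",
            description="Look up the current weather for a city.",
            parameters={
                "type": "object",
                "properties": {"city": {"type": "string"}},
                "required": ["city"],
            },
        ),
    )
]

# "gpt-oss" resolves to the newly registered GptOssDetector in the mapping shown above.
parser = FunctionCallParser(tools=tools, tool_call_parser="gpt-oss")
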

sglang/srt/function_call/glm4_moe_detector.py

@@ -158,7 +158,7 @@ class Glm4MoeDetector(BaseFormatDetector):
             individual_call_end_token=self.eot_token,
             tool_call_separator="\\n",
             function_format="xml",
-            call_rule_fmt='"{name}" "\\n" {arguments_rule} "\\n"',
+            call_rule_fmt='"{name}" "\\n" ( {arguments_rule} "\\n" )?',
             key_value_rule_fmt='"<arg_key>{key}</arg_key>" "\\n" "<arg_value>" {valrule} "</arg_value>"',
             key_value_separator="\\n",
         )