sglang 0.4.10.post2__py3-none-any.whl → 0.5.0rc0__py3-none-any.whl
This diff compares the contents of two publicly released versions of the package as published to their public registry, and is provided for informational purposes only.
- sglang/bench_one_batch.py +113 -17
- sglang/srt/configs/model_config.py +35 -0
- sglang/srt/conversation.py +9 -5
- sglang/srt/disaggregation/base/conn.py +5 -2
- sglang/srt/disaggregation/decode.py +6 -1
- sglang/srt/disaggregation/decode_schedule_batch_mixin.py +3 -0
- sglang/srt/disaggregation/mooncake/conn.py +243 -135
- sglang/srt/disaggregation/prefill.py +2 -0
- sglang/srt/distributed/parallel_state.py +11 -9
- sglang/srt/entrypoints/context.py +244 -0
- sglang/srt/entrypoints/engine.py +4 -3
- sglang/srt/entrypoints/harmony_utils.py +370 -0
- sglang/srt/entrypoints/http_server.py +71 -0
- sglang/srt/entrypoints/openai/protocol.py +227 -1
- sglang/srt/entrypoints/openai/serving_chat.py +278 -42
- sglang/srt/entrypoints/openai/serving_responses.py +1273 -0
- sglang/srt/entrypoints/openai/tool_server.py +174 -0
- sglang/srt/entrypoints/tool.py +87 -0
- sglang/srt/eplb/expert_location.py +5 -1
- sglang/srt/function_call/harmony_tool_parser.py +130 -0
- sglang/srt/hf_transformers_utils.py +30 -3
- sglang/srt/jinja_template_utils.py +8 -1
- sglang/srt/layers/attention/aiter_backend.py +5 -8
- sglang/srt/layers/attention/dual_chunk_flashattention_backend.py +1700 -0
- sglang/srt/layers/attention/triton_backend.py +85 -14
- sglang/srt/layers/attention/triton_ops/decode_attention.py +17 -0
- sglang/srt/layers/attention/triton_ops/extend_attention.py +143 -98
- sglang/srt/layers/attention/trtllm_mha_backend.py +332 -0
- sglang/srt/layers/attention/vision.py +13 -5
- sglang/srt/layers/communicator.py +21 -4
- sglang/srt/layers/dp_attention.py +12 -0
- sglang/srt/layers/linear.py +2 -7
- sglang/srt/layers/moe/cutlass_moe.py +20 -6
- sglang/srt/layers/moe/ep_moe/layer.py +77 -73
- sglang/srt/layers/moe/fused_moe_triton/fused_moe.py +101 -12
- sglang/srt/layers/moe/fused_moe_triton/layer.py +416 -35
- sglang/srt/layers/moe/fused_moe_triton/triton_kernels_moe.py +188 -3
- sglang/srt/layers/moe/topk.py +12 -3
- sglang/srt/layers/moe/utils.py +16 -0
- sglang/srt/layers/quantization/__init__.py +22 -0
- sglang/srt/layers/quantization/fp4.py +557 -0
- sglang/srt/layers/quantization/fp8.py +3 -6
- sglang/srt/layers/quantization/fp8_utils.py +29 -0
- sglang/srt/layers/quantization/modelopt_quant.py +259 -64
- sglang/srt/layers/quantization/mxfp4.py +651 -0
- sglang/srt/layers/quantization/mxfp4_tensor.py +133 -0
- sglang/srt/layers/quantization/quark/__init__.py +0 -0
- sglang/srt/layers/quantization/quark/schemes/__init__.py +6 -0
- sglang/srt/layers/quantization/quark/schemes/quark_scheme.py +55 -0
- sglang/srt/layers/quantization/quark/schemes/quark_w4a4_mxfp4.py +118 -0
- sglang/srt/layers/quantization/quark/utils.py +107 -0
- sglang/srt/layers/quantization/unquant.py +60 -6
- sglang/srt/layers/quantization/w4afp8.py +1 -1
- sglang/srt/layers/rotary_embedding.py +225 -1
- sglang/srt/layers/utils.py +9 -0
- sglang/srt/layers/vocab_parallel_embedding.py +8 -3
- sglang/srt/lora/lora_manager.py +70 -14
- sglang/srt/lora/lora_registry.py +3 -2
- sglang/srt/lora/mem_pool.py +43 -5
- sglang/srt/managers/cache_controller.py +55 -30
- sglang/srt/managers/detokenizer_manager.py +1 -1
- sglang/srt/managers/io_struct.py +15 -3
- sglang/srt/managers/mm_utils.py +5 -11
- sglang/srt/managers/schedule_batch.py +28 -7
- sglang/srt/managers/scheduler.py +26 -12
- sglang/srt/managers/scheduler_output_processor_mixin.py +1 -2
- sglang/srt/managers/scheduler_recv_skipper.py +37 -0
- sglang/srt/managers/scheduler_update_weights_mixin.py +6 -0
- sglang/srt/managers/template_manager.py +35 -1
- sglang/srt/managers/tokenizer_manager.py +24 -6
- sglang/srt/managers/tp_worker.py +3 -0
- sglang/srt/managers/tp_worker_overlap_thread.py +3 -0
- sglang/srt/mem_cache/hiradix_cache.py +53 -5
- sglang/srt/mem_cache/memory_pool_host.py +1 -1
- sglang/srt/mem_cache/multimodal_cache.py +33 -13
- sglang/srt/mem_cache/storage/hf3fs/client_hf3fs.py +2 -2
- sglang/srt/model_executor/cuda_graph_runner.py +7 -6
- sglang/srt/model_executor/forward_batch_info.py +35 -14
- sglang/srt/model_executor/model_runner.py +19 -2
- sglang/srt/model_loader/weight_utils.py +10 -0
- sglang/srt/models/bailing_moe.py +425 -0
- sglang/srt/models/deepseek_v2.py +72 -33
- sglang/srt/models/ernie4.py +426 -0
- sglang/srt/models/ernie4_eagle.py +203 -0
- sglang/srt/models/gemma3n_mm.py +39 -0
- sglang/srt/models/glm4_moe.py +24 -12
- sglang/srt/models/gpt_oss.py +1134 -0
- sglang/srt/models/qwen2.py +6 -0
- sglang/srt/models/qwen2_moe.py +6 -0
- sglang/srt/models/qwen3_moe.py +32 -6
- sglang/srt/models/step3_vl.py +9 -0
- sglang/srt/models/transformers.py +2 -5
- sglang/srt/multimodal/processors/step3_vl.py +3 -1
- sglang/srt/reasoning_parser.py +18 -39
- sglang/srt/server_args.py +142 -7
- sglang/srt/two_batch_overlap.py +157 -5
- sglang/srt/utils.py +38 -2
- sglang/test/runners.py +2 -2
- sglang/test/test_utils.py +1 -1
- sglang/version.py +1 -1
- {sglang-0.4.10.post2.dist-info → sglang-0.5.0rc0.dist-info}/METADATA +16 -14
- {sglang-0.4.10.post2.dist-info → sglang-0.5.0rc0.dist-info}/RECORD +105 -84
- {sglang-0.4.10.post2.dist-info → sglang-0.5.0rc0.dist-info}/WHEEL +0 -0
- {sglang-0.4.10.post2.dist-info → sglang-0.5.0rc0.dist-info}/licenses/LICENSE +0 -0
- {sglang-0.4.10.post2.dist-info → sglang-0.5.0rc0.dist-info}/top_level.txt +0 -0
sglang/srt/entrypoints/openai/serving_chat.py

@@ -7,8 +7,18 @@ from typing import Any, AsyncGenerator, Dict, List, Optional, Union
 
 from fastapi import Request
 from fastapi.responses import ORJSONResponse, StreamingResponse
+from openai_harmony import Message as OpenAIMessage
 
 from sglang.srt.conversation import generate_chat_conv
+from sglang.srt.entrypoints.harmony_utils import (
+    get_developer_message,
+    get_stop_tokens_for_assistant_actions,
+    get_streamable_parser_for_assistant,
+    get_system_message,
+    parse_chat_input,
+    parse_output_into_messages,
+    render_for_completion,
+)
 from sglang.srt.entrypoints.openai.protocol import (
     ChatCompletionRequest,
     ChatCompletionResponse,
@@ -51,6 +61,26 @@ class OpenAIServingChat(OpenAIServingBase):
     ):
         super().__init__(tokenizer_manager)
         self.template_manager = template_manager
+        self.use_harmony = (
+            self.tokenizer_manager.model_config.hf_config.model_type == "gpt_oss"
+        )
+
+        if self.use_harmony:
+            from sglang.srt.function_call.harmony_tool_parser import (
+                HarmonyToolCallParser,
+            )
+
+            self.harmony_tool_parser = HarmonyToolCallParser()
+
+            # NOTE While OpenAI's chat completion API supports browsing
+            # for some models, currently vLLM doesn't support it. Please use the
+            # Responses API instead.
+            self.supports_browsing = False
+            self.browser_tool = None
+            # NOTE: Chat completion API does not support code interpreter.
+            # Please use the Responses API instead.
+            self.supports_code_interpreter = False
+            self.python_tool = None

     def _request_id_prefix(self) -> str:
         return "chatcmpl-"
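The Harmony path is gated purely on the Hugging Face config's `model_type`. A quick way to check whether a given checkpoint will take it (the model id below is an assumption for illustration, not part of this diff):

```python
# Illustrative check; the model id is an assumption.
from transformers import AutoConfig

config = AutoConfig.from_pretrained("openai/gpt-oss-20b")
# True -> OpenAIServingChat builds prompts and parses output via Harmony.
print(config.model_type == "gpt_oss")
```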
@@ -77,41 +107,66 @@ class OpenAIServingChat(OpenAIServingBase):
         is_multimodal = self.tokenizer_manager.model_config.is_multimodal

         # Process messages and apply chat template
+        if not self.use_harmony:
+            processed_messages = self._process_messages(request, is_multimodal)
+
+            # Build sampling parameters
+            sampling_params = self._build_sampling_params(
+                request,
+                processed_messages.stop,
+                processed_messages.tool_call_constraint,
+            )

-        else:
-            if isinstance(processed_messages.prompt_ids, str):
-                prompt_kwargs = {"text": processed_messages.prompt_ids}
+            # Handle single vs multiple requests
+            if is_multimodal:
+                prompt_kwargs = {"text": processed_messages.prompt}
             else:
+                if isinstance(processed_messages.prompt_ids, str):
+                    prompt_kwargs = {"text": processed_messages.prompt_ids}
+                else:
+                    prompt_kwargs = {"input_ids": processed_messages.prompt_ids}
+
+            adapted_request = GenerateReqInput(
+                **prompt_kwargs,
+                image_data=processed_messages.image_data,
+                video_data=processed_messages.video_data,
+                audio_data=processed_messages.audio_data,
+                sampling_params=sampling_params,
+                return_logprob=request.logprobs,
+                logprob_start_len=-1,
+                top_logprobs_num=request.top_logprobs or 0,
+                stream=request.stream,
+                return_text_in_logprobs=True,
+                modalities=processed_messages.modalities,
+                lora_path=request.lora_path,
+                bootstrap_host=request.bootstrap_host,
+                bootstrap_port=request.bootstrap_port,
+                bootstrap_room=request.bootstrap_room,
+                return_hidden_states=request.return_hidden_states,
+                rid=request.rid,
+            )
+        else:
+            processed_messages, prompt_ids = self._make_request_with_harmony(request)
+
+            adapted_request = GenerateReqInput(
+                input_ids=prompt_ids,
+                sampling_params=self._build_sampling_params(
+                    request,
+                    request.stop,
+                    tool_call_constraint=None,
+                ),
+                stream=request.stream,
+                return_logprob=request.logprobs,
+                logprob_start_len=-1,
+                top_logprobs_num=request.top_logprobs or 0,
+                return_text_in_logprobs=True,
+                lora_path=request.lora_path,
+                bootstrap_host=request.bootstrap_host,
+                bootstrap_port=request.bootstrap_port,
+                bootstrap_room=request.bootstrap_room,
+                return_hidden_states=request.return_hidden_states,
+                rid=request.rid,
+            )

         return adapted_request, request

@@ -277,6 +332,8 @@ class OpenAIServingChat(OpenAIServingBase):
                 prompt = prompt[: -len(conv.sep2)]
         else:
             prompt = conv.get_prompt()
+        if self._get_enable_thinking_from_request(request):
+            prompt += "<think>"  # Note(Xinyuan): hard code thinking token

         image_data = conv.image_data if conv.image_data else None
         video_data = conv.video_data if conv.video_data else None
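The `enable_thinking` flag used here (and in the `_get_enable_thinking_from_request` hunks further down) is read from `chat_template_kwargs` on the request. A minimal sketch of how a client might opt in, assuming a local sglang server on port 30000 and a thinking-capable model (both assumptions, not part of this diff):

```python
# Illustrative request only; URL, port, and model name are assumptions.
import requests

resp = requests.post(
    "http://localhost:30000/v1/chat/completions",
    json={
        "model": "Qwen/Qwen3-8B",
        "messages": [{"role": "user", "content": "Why is the sky blue?"}],
        # Picked up by _get_enable_thinking_from_request(); when set, the
        # conversation-template path appends "<think>" to the rendered prompt
        # and the reasoning parser runs with force_reasoning=True.
        "chat_template_kwargs": {"enable_thinking": True},
        "separate_reasoning": True,
    },
)
body = resp.json()
print(body["choices"][0]["message"].get("reasoning_content"))
print(body["choices"][0]["message"]["content"])
```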
@@ -402,6 +459,12 @@ class OpenAIServingChat(OpenAIServingBase):
         cached_tokens = {}
         hidden_states = {}

+        # Harmony tracking
+        if self.use_harmony:
+            harmony_parsers = [
+                get_streamable_parser_for_assistant() for _ in range(request.n)
+            ]
+
         try:
             async for content in self.tokenizer_manager.generate_request(
                 adapted_request, raw_request
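`get_streamable_parser_for_assistant()` is defined in the new `sglang/srt/entrypoints/harmony_utils.py` (+370 lines, not shown in this excerpt). A rough sketch of what such a helper could look like, assuming it wraps `openai_harmony`'s `StreamableParser`; the attributes used here (`process`, `current_channel`, `last_content_delta`) match how the parser is driven in the streaming hunk below:

```python
# Hypothetical sketch; the real helper lives in sglang.srt.entrypoints.harmony_utils.
from openai_harmony import (
    HarmonyEncodingName,
    Role,
    StreamableParser,
    load_harmony_encoding,
)


def get_streamable_parser_for_assistant() -> StreamableParser:
    # Incrementally decodes assistant tokens into Harmony channels:
    # "analysis" (reasoning) and "final" (the user-visible answer).
    encoding = load_harmony_encoding(HarmonyEncodingName.HARMONY_GPT_OSS)
    return StreamableParser(encoding, role=Role.ASSISTANT)


def split_channels(token_ids: list[int]) -> tuple[str, str]:
    # Feed generated token ids one by one and collect per-channel text.
    parser = get_streamable_parser_for_assistant()
    reasoning, final = [], []
    for token_id in token_ids:
        parser.process(token_id)
        delta = parser.last_content_delta or ""
        if parser.current_channel == "analysis":
            reasoning.append(delta)
        elif parser.current_channel == "final":
            final.append(delta)
    return "".join(reasoning), "".join(final)
```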
@@ -449,14 +512,57 @@ class OpenAIServingChat(OpenAIServingBase):
                     yield f"data: {chunk.model_dump_json()}\n\n"

                 # Process content delta
+                if self.use_harmony:
+                    harmony_parser = harmony_parsers[index]
+
+                    new_token_ids = content["output_ids"]
+                    for token_id in new_token_ids:
+                        harmony_parser.process(token_id)
+
+                    is_final = harmony_parser.current_channel == "final"
+                    is_analysis = harmony_parser.current_channel == "analysis"
+                    delta = harmony_parser.last_content_delta or ""
+
+                    if is_analysis:
+                        choice_data = ChatCompletionResponseStreamChoice(
+                            index=index,
+                            delta=DeltaMessage(reasoning_content=delta),
+                            finish_reason=None,
+                        )
+                        chunk = ChatCompletionStreamResponse(
+                            id=content["meta_info"]["id"],
+                            created=int(time.time()),
+                            choices=[choice_data],
+                            model=request.model,
+                        )
+                        yield f"data: {chunk.model_dump_json()}\n\n"
+                        continue
+
+                    choice_data = ChatCompletionResponseStreamChoice(
+                        index=index,
+                        delta=DeltaMessage(content=delta if delta else None),
+                        finish_reason=None,
+                        matched_stop=None,
+                        logprobs=choice_logprobs,
+                    )
+                    chunk = ChatCompletionStreamResponse(
+                        id=content["meta_info"]["id"],
+                        created=int(time.time()),
+                        choices=[choice_data],
+                        model=request.model,
+                    )
+                    yield f"data: {chunk.model_dump_json()}\n\n"
+                    continue
+                else:
+                    stream_buffer = stream_buffers.get(index, "")
+                    delta = content["text"][len(stream_buffer) :]
+                    stream_buffers[index] = stream_buffer + delta

                 # Handle reasoning content
                 if (
                     self.tokenizer_manager.server_args.reasoning_parser
                     and request.separate_reasoning
+                    and not self.use_harmony
                 ):
                     reasoning_text, delta = self._process_reasoning_stream(
                         index, delta, reasoning_parser_dict, content, request
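On the wire, this means Harmony reasoning arrives as `delta.reasoning_content` chunks and the answer as `delta.content` chunks in the standard OpenAI SSE format. A small client-side sketch of consuming such a stream (the URL and model name are assumptions, not part of this diff):

```python
# Illustrative SSE consumer; endpoint and model are assumptions.
import json

import requests

with requests.post(
    "http://localhost:30000/v1/chat/completions",
    json={
        "model": "openai/gpt-oss-20b",
        "messages": [{"role": "user", "content": "Summarize the Harmony format."}],
        "stream": True,
    },
    stream=True,
) as resp:
    for line in resp.iter_lines():
        if not line or not line.startswith(b"data: "):
            continue
        payload = line[len(b"data: "):]
        if payload == b"[DONE]":
            break
        delta = json.loads(payload)["choices"][0]["delta"]
        if delta.get("reasoning_content"):
            print("[reasoning]", delta["reasoning_content"])
        if delta.get("content"):
            print("[answer]", delta["content"])
```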
@@ -475,8 +581,27 @@ class OpenAIServingChat(OpenAIServingBase):
                         )
                         yield f"data: {chunk.model_dump_json()}\n\n"

+                if self.use_harmony and not is_final:
+                    choice_data = ChatCompletionResponseStreamChoice(
+                        index=index,
+                        delta=DeltaMessage(reasoning_content=delta),
+                        finish_reason=None,
+                    )
+                    chunk = ChatCompletionStreamResponse(
+                        id=content["meta_info"]["id"],
+                        created=int(time.time()),
+                        choices=[choice_data],
+                        model=request.model,
+                    )
+                    yield f"data: {chunk.model_dump_json()}\n\n"
+
                 # Handle tool calls
+                # TODO: support tool call parsing for harmony
+                if (
+                    request.tool_choice != "none"
+                    and request.tools
+                    and not self.use_harmony
+                ):
                     async for chunk in self._process_tool_call_stream(
                         index,
                         delta,
@@ -502,7 +627,7 @@ class OpenAIServingChat(OpenAIServingBase):
                 if delta:
                     choice_data = ChatCompletionResponseStreamChoice(
                         index=index,
-                        delta=DeltaMessage(content=delta
+                        delta=DeltaMessage(content=delta),
                         finish_reason=None,
                         matched_stop=None,
                         logprobs=choice_logprobs,
@@ -640,14 +765,90 @@ class OpenAIServingChat(OpenAIServingBase):

             finish_reason = ret_item["meta_info"]["finish_reason"]
             text = ret_item["text"]
+            output_ids = ret_item["output_ids"]
+
+            if self.use_harmony:
+                parser = parse_output_into_messages(output_ids)
+                output_msgs = parser.messages
+                if len(output_msgs) == 0:
+                    # The generation has stopped during reasoning.
+                    is_tool_call = False
+                    reasoning_content = parser.current_content
+                    final_content = None
+                elif len(output_msgs) == 1:
+                    # The generation has stopped during final message.
+                    is_tool_call = False
+                    reasoning_content = output_msgs[0].content[0].text
+                    final_content = parser.current_content
+                else:
+                    if len(output_msgs) != 2:
+                        raise ValueError(
+                            "Expected 2 output messages (reasoning and final), "
+                            f"but got {len(output_msgs)}."
+                        )
+                    reasoning_msg, final_msg = output_msgs
+                    reasoning_content = reasoning_msg.content[0].text
+                    final_content = final_msg.content[0].text
+                    is_tool_call = final_msg.recipient is not None
+
+                if is_tool_call:
+                    # Extract tool call information from final message
+                    tool_call = (
+                        self.harmony_tool_parser.extract_tool_calls_from_message(
+                            final_msg
+                        )
+                    )
+                    tool_calls = [tool_call] if tool_call else []
+
+                    message = ChatMessage(
+                        role="assistant",
+                        reasoning_content=reasoning_content,
+                        content=None,  # Tool calls don't have regular content
+                        tool_calls=tool_calls,
+                    )
+                else:
+                    # Normal message
+                    message = ChatMessage(
+                        role="assistant",
+                        reasoning_content=reasoning_content,
+                        content=final_content,
+                    )
+
+                if is_tool_call:
+                    finish_reason_type = "tool_calls"
+                elif finish_reason:
+                    finish_reason_type = (
+                        finish_reason["type"] if finish_reason else "stop"
+                    )
+                else:
+                    finish_reason_type = "stop"
+                choice_data = ChatCompletionResponseChoice(
+                    index=idx,
+                    message=message,
+                    logprobs=choice_logprobs,
+                    finish_reason=finish_reason_type,
+                    matched_stop=(
+                        finish_reason["matched"]
+                        if finish_reason and "matched" in finish_reason
+                        else None
+                    ),
+                )
+                choices.append(choice_data)
+                continue

             # Handle reasoning content
             reasoning_text = None
             reasoning_parser = self.tokenizer_manager.server_args.reasoning_parser
             if reasoning_parser and request.separate_reasoning:
+                is_force_reasoning = (
+                    self.template_manager.force_reasoning
+                    or self._get_enable_thinking_from_request(request)
+                )
                 try:
                     parser = ReasoningParser(
-                        model_type=reasoning_parser,
+                        model_type=reasoning_parser,
+                        stream_reasoning=False,
+                        force_reasoning=is_force_reasoning,
                     )
                     reasoning_text, text = parser.parse_non_stream(text)
                 except Exception as e:
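`parse_output_into_messages` and `extract_tool_calls_from_message` come from the new `harmony_utils.py` and `harmony_tool_parser.py` files, which are not included in this excerpt. A hypothetical sketch of the tool-call extraction step, assuming sglang's existing `ToolCall`/`FunctionResponse` protocol models and Harmony's `functions.<name>` recipient convention:

```python
# Hypothetical sketch; the real parser is in the new
# sglang/srt/function_call/harmony_tool_parser.py (+130 lines, not shown here).
import uuid
from typing import Optional

from openai_harmony import Message as OpenAIMessage

from sglang.srt.entrypoints.openai.protocol import FunctionResponse, ToolCall


def extract_tool_calls_from_message(msg: OpenAIMessage) -> Optional[ToolCall]:
    # Harmony encodes a function call as a message addressed to
    # "functions.<name>" whose content is the JSON-encoded arguments.
    if msg.recipient is None or not msg.recipient.startswith("functions."):
        return None
    name = msg.recipient.split(".", 1)[1]
    arguments = msg.content[0].text if msg.content else "{}"
    return ToolCall(
        id=f"call_{uuid.uuid4().hex[:24]}",
        function=FunctionResponse(name=name, arguments=arguments),
    )
```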
@@ -810,14 +1011,19 @@ class OpenAIServingChat(OpenAIServingBase):
     ) -> tuple[Optional[str], str]:
         """Process reasoning content in streaming response"""
         if index not in reasoning_parser_dict:
+            is_force_reasoning = (
+                self.template_manager.force_reasoning
+                or self._get_enable_thinking_from_request(request)
+            )
             reasoning_parser_dict[index] = ReasoningParser(
                 self.tokenizer_manager.server_args.reasoning_parser,
                 request.stream_reasoning,
+                is_force_reasoning,
             )
         reasoning_parser = reasoning_parser_dict[index]
         return reasoning_parser.parse_stream_chunk(delta)

-    def _get_enable_thinking_from_request(request: ChatCompletionRequest) -> bool:
+    def _get_enable_thinking_from_request(self, request: ChatCompletionRequest) -> bool:
         """Extracts the 'enable_thinking' flag from request chat_template_kwargs.

         NOTE: This parameter is only useful for models that support enable_thinking
@@ -826,7 +1032,7 @@ class OpenAIServingChat(OpenAIServingBase):
         Args:
             request_obj: The request object (or an item from a list of requests).
         Returns:
-            The boolean value of 'enable_thinking' if found
+            The boolean value of 'enable_thinking' if found, otherwise False.
         """
         if (
             hasattr(request, "chat_template_kwargs")
@@ -834,7 +1040,7 @@ class OpenAIServingChat(OpenAIServingBase):
             and request.chat_template_kwargs.get("enable_thinking") is not None
         ):
             return request.chat_template_kwargs.get("enable_thinking")
-        return
+        return False

     async def _process_tool_call_stream(
         self,
@@ -978,3 +1184,33 @@ class OpenAIServingChat(OpenAIServingBase):
             return f"data: {chunk.model_dump_json()}\n\n"

         return None
+
+    def _make_request_with_harmony(
+        self,
+        request: ChatCompletionRequest,
+    ):
+        messages: list[OpenAIMessage] = []
+
+        # Add system message.
+        # In Chat Completion API, browsing is enabled by default if the model
+        # supports it.
+        assert not self.supports_browsing
+        assert not self.supports_code_interpreter
+        sys_msg = get_system_message(
+            reasoning_effort=request.reasoning_effort,
+            browser_description=None,
+            python_description=None,
+        )
+        messages.append(sys_msg)
+
+        # Add developer message.
+        dev_msg = get_developer_message()
+        messages.append(dev_msg)
+
+        # Add user message.
+        for chat_msg in request.messages:
+            messages.append(parse_chat_input(chat_msg))
+
+        # Render prompt token ids.
+        prompt_token_ids = render_for_completion(messages)
+        return messages, prompt_token_ids
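The helpers used above (`get_system_message`, `get_developer_message`, `parse_chat_input`, `render_for_completion`) are defined in the new `harmony_utils.py`, which is not part of this excerpt. A rough sketch of the final rendering step, assuming it is built on `openai_harmony`'s `Conversation` and encoding APIs:

```python
# Hypothetical sketch; the real implementation is sglang.srt.entrypoints.harmony_utils.
from openai_harmony import (
    Conversation,
    HarmonyEncodingName,
    Message,
    Role,
    SystemContent,
    load_harmony_encoding,
)

_encoding = load_harmony_encoding(HarmonyEncodingName.HARMONY_GPT_OSS)


def render_for_completion(messages: list[Message]) -> list[int]:
    # Serialize the system/developer/user messages into the prompt token ids
    # that prime the model to answer as the assistant.
    conversation = Conversation.from_messages(messages)
    return _encoding.render_conversation_for_completion(conversation, Role.ASSISTANT)


# Example: a minimal conversation without browsing or python tools.
example = [
    Message.from_role_and_content(Role.SYSTEM, SystemContent.new()),
    Message.from_role_and_content(Role.USER, "What is 2 + 2?"),
]
prompt_token_ids = render_for_completion(example)
```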