sglang 0.4.10.post2__py3-none-any.whl → 0.5.0rc0__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (105)
  1. sglang/bench_one_batch.py +113 -17
  2. sglang/srt/configs/model_config.py +35 -0
  3. sglang/srt/conversation.py +9 -5
  4. sglang/srt/disaggregation/base/conn.py +5 -2
  5. sglang/srt/disaggregation/decode.py +6 -1
  6. sglang/srt/disaggregation/decode_schedule_batch_mixin.py +3 -0
  7. sglang/srt/disaggregation/mooncake/conn.py +243 -135
  8. sglang/srt/disaggregation/prefill.py +2 -0
  9. sglang/srt/distributed/parallel_state.py +11 -9
  10. sglang/srt/entrypoints/context.py +244 -0
  11. sglang/srt/entrypoints/engine.py +4 -3
  12. sglang/srt/entrypoints/harmony_utils.py +370 -0
  13. sglang/srt/entrypoints/http_server.py +71 -0
  14. sglang/srt/entrypoints/openai/protocol.py +227 -1
  15. sglang/srt/entrypoints/openai/serving_chat.py +278 -42
  16. sglang/srt/entrypoints/openai/serving_responses.py +1273 -0
  17. sglang/srt/entrypoints/openai/tool_server.py +174 -0
  18. sglang/srt/entrypoints/tool.py +87 -0
  19. sglang/srt/eplb/expert_location.py +5 -1
  20. sglang/srt/function_call/harmony_tool_parser.py +130 -0
  21. sglang/srt/hf_transformers_utils.py +30 -3
  22. sglang/srt/jinja_template_utils.py +8 -1
  23. sglang/srt/layers/attention/aiter_backend.py +5 -8
  24. sglang/srt/layers/attention/dual_chunk_flashattention_backend.py +1700 -0
  25. sglang/srt/layers/attention/triton_backend.py +85 -14
  26. sglang/srt/layers/attention/triton_ops/decode_attention.py +17 -0
  27. sglang/srt/layers/attention/triton_ops/extend_attention.py +143 -98
  28. sglang/srt/layers/attention/trtllm_mha_backend.py +332 -0
  29. sglang/srt/layers/attention/vision.py +13 -5
  30. sglang/srt/layers/communicator.py +21 -4
  31. sglang/srt/layers/dp_attention.py +12 -0
  32. sglang/srt/layers/linear.py +2 -7
  33. sglang/srt/layers/moe/cutlass_moe.py +20 -6
  34. sglang/srt/layers/moe/ep_moe/layer.py +77 -73
  35. sglang/srt/layers/moe/fused_moe_triton/fused_moe.py +101 -12
  36. sglang/srt/layers/moe/fused_moe_triton/layer.py +416 -35
  37. sglang/srt/layers/moe/fused_moe_triton/triton_kernels_moe.py +188 -3
  38. sglang/srt/layers/moe/topk.py +12 -3
  39. sglang/srt/layers/moe/utils.py +16 -0
  40. sglang/srt/layers/quantization/__init__.py +22 -0
  41. sglang/srt/layers/quantization/fp4.py +557 -0
  42. sglang/srt/layers/quantization/fp8.py +3 -6
  43. sglang/srt/layers/quantization/fp8_utils.py +29 -0
  44. sglang/srt/layers/quantization/modelopt_quant.py +259 -64
  45. sglang/srt/layers/quantization/mxfp4.py +651 -0
  46. sglang/srt/layers/quantization/mxfp4_tensor.py +133 -0
  47. sglang/srt/layers/quantization/quark/__init__.py +0 -0
  48. sglang/srt/layers/quantization/quark/schemes/__init__.py +6 -0
  49. sglang/srt/layers/quantization/quark/schemes/quark_scheme.py +55 -0
  50. sglang/srt/layers/quantization/quark/schemes/quark_w4a4_mxfp4.py +118 -0
  51. sglang/srt/layers/quantization/quark/utils.py +107 -0
  52. sglang/srt/layers/quantization/unquant.py +60 -6
  53. sglang/srt/layers/quantization/w4afp8.py +1 -1
  54. sglang/srt/layers/rotary_embedding.py +225 -1
  55. sglang/srt/layers/utils.py +9 -0
  56. sglang/srt/layers/vocab_parallel_embedding.py +8 -3
  57. sglang/srt/lora/lora_manager.py +70 -14
  58. sglang/srt/lora/lora_registry.py +3 -2
  59. sglang/srt/lora/mem_pool.py +43 -5
  60. sglang/srt/managers/cache_controller.py +55 -30
  61. sglang/srt/managers/detokenizer_manager.py +1 -1
  62. sglang/srt/managers/io_struct.py +15 -3
  63. sglang/srt/managers/mm_utils.py +5 -11
  64. sglang/srt/managers/schedule_batch.py +28 -7
  65. sglang/srt/managers/scheduler.py +26 -12
  66. sglang/srt/managers/scheduler_output_processor_mixin.py +1 -2
  67. sglang/srt/managers/scheduler_recv_skipper.py +37 -0
  68. sglang/srt/managers/scheduler_update_weights_mixin.py +6 -0
  69. sglang/srt/managers/template_manager.py +35 -1
  70. sglang/srt/managers/tokenizer_manager.py +24 -6
  71. sglang/srt/managers/tp_worker.py +3 -0
  72. sglang/srt/managers/tp_worker_overlap_thread.py +3 -0
  73. sglang/srt/mem_cache/hiradix_cache.py +53 -5
  74. sglang/srt/mem_cache/memory_pool_host.py +1 -1
  75. sglang/srt/mem_cache/multimodal_cache.py +33 -13
  76. sglang/srt/mem_cache/storage/hf3fs/client_hf3fs.py +2 -2
  77. sglang/srt/model_executor/cuda_graph_runner.py +7 -6
  78. sglang/srt/model_executor/forward_batch_info.py +35 -14
  79. sglang/srt/model_executor/model_runner.py +19 -2
  80. sglang/srt/model_loader/weight_utils.py +10 -0
  81. sglang/srt/models/bailing_moe.py +425 -0
  82. sglang/srt/models/deepseek_v2.py +72 -33
  83. sglang/srt/models/ernie4.py +426 -0
  84. sglang/srt/models/ernie4_eagle.py +203 -0
  85. sglang/srt/models/gemma3n_mm.py +39 -0
  86. sglang/srt/models/glm4_moe.py +24 -12
  87. sglang/srt/models/gpt_oss.py +1134 -0
  88. sglang/srt/models/qwen2.py +6 -0
  89. sglang/srt/models/qwen2_moe.py +6 -0
  90. sglang/srt/models/qwen3_moe.py +32 -6
  91. sglang/srt/models/step3_vl.py +9 -0
  92. sglang/srt/models/transformers.py +2 -5
  93. sglang/srt/multimodal/processors/step3_vl.py +3 -1
  94. sglang/srt/reasoning_parser.py +18 -39
  95. sglang/srt/server_args.py +142 -7
  96. sglang/srt/two_batch_overlap.py +157 -5
  97. sglang/srt/utils.py +38 -2
  98. sglang/test/runners.py +2 -2
  99. sglang/test/test_utils.py +1 -1
  100. sglang/version.py +1 -1
  101. {sglang-0.4.10.post2.dist-info → sglang-0.5.0rc0.dist-info}/METADATA +16 -14
  102. {sglang-0.4.10.post2.dist-info → sglang-0.5.0rc0.dist-info}/RECORD +105 -84
  103. {sglang-0.4.10.post2.dist-info → sglang-0.5.0rc0.dist-info}/WHEEL +0 -0
  104. {sglang-0.4.10.post2.dist-info → sglang-0.5.0rc0.dist-info}/licenses/LICENSE +0 -0
  105. {sglang-0.4.10.post2.dist-info → sglang-0.5.0rc0.dist-info}/top_level.txt +0 -0
sglang/srt/entrypoints/openai/serving_chat.py
@@ -7,8 +7,18 @@ from typing import Any, AsyncGenerator, Dict, List, Optional, Union
 
 from fastapi import Request
 from fastapi.responses import ORJSONResponse, StreamingResponse
+from openai_harmony import Message as OpenAIMessage
 
 from sglang.srt.conversation import generate_chat_conv
+from sglang.srt.entrypoints.harmony_utils import (
+    get_developer_message,
+    get_stop_tokens_for_assistant_actions,
+    get_streamable_parser_for_assistant,
+    get_system_message,
+    parse_chat_input,
+    parse_output_into_messages,
+    render_for_completion,
+)
 from sglang.srt.entrypoints.openai.protocol import (
     ChatCompletionRequest,
     ChatCompletionResponse,
@@ -51,6 +61,26 @@ class OpenAIServingChat(OpenAIServingBase):
     ):
         super().__init__(tokenizer_manager)
         self.template_manager = template_manager
+        self.use_harmony = (
+            self.tokenizer_manager.model_config.hf_config.model_type == "gpt_oss"
+        )
+
+        if self.use_harmony:
+            from sglang.srt.function_call.harmony_tool_parser import (
+                HarmonyToolCallParser,
+            )
+
+            self.harmony_tool_parser = HarmonyToolCallParser()
+
+        # NOTE While OpenAI's chat completion API supports browsing
+        # for some models, currently vLLM doesn't support it. Please use the
+        # Responses API instead.
+        self.supports_browsing = False
+        self.browser_tool = None
+        # NOTE: Chat completion API does not support code interpreter.
+        # Please use the Responses API instead.
+        self.supports_code_interpreter = False
+        self.python_tool = None
 
     def _request_id_prefix(self) -> str:
         return "chatcmpl-"
@@ -77,41 +107,66 @@ class OpenAIServingChat(OpenAIServingBase):
         is_multimodal = self.tokenizer_manager.model_config.is_multimodal
 
         # Process messages and apply chat template
-        processed_messages = self._process_messages(request, is_multimodal)
-
-        # Build sampling parameters
-        sampling_params = self._build_sampling_params(
-            request, processed_messages.stop, processed_messages.tool_call_constraint
-        )
+        if not self.use_harmony:
+            processed_messages = self._process_messages(request, is_multimodal)
+
+            # Build sampling parameters
+            sampling_params = self._build_sampling_params(
+                request,
+                processed_messages.stop,
+                processed_messages.tool_call_constraint,
+            )
 
-        # Handle single vs multiple requests
-        if is_multimodal:
-            prompt_kwargs = {"text": processed_messages.prompt}
-        else:
-            if isinstance(processed_messages.prompt_ids, str):
-                prompt_kwargs = {"text": processed_messages.prompt_ids}
+            # Handle single vs multiple requests
+            if is_multimodal:
+                prompt_kwargs = {"text": processed_messages.prompt}
             else:
-                prompt_kwargs = {"input_ids": processed_messages.prompt_ids}
-
-        adapted_request = GenerateReqInput(
-            **prompt_kwargs,
-            image_data=processed_messages.image_data,
-            video_data=processed_messages.video_data,
-            audio_data=processed_messages.audio_data,
-            sampling_params=sampling_params,
-            return_logprob=request.logprobs,
-            logprob_start_len=-1,
-            top_logprobs_num=request.top_logprobs or 0,
-            stream=request.stream,
-            return_text_in_logprobs=True,
-            modalities=processed_messages.modalities,
-            lora_path=request.lora_path,
-            bootstrap_host=request.bootstrap_host,
-            bootstrap_port=request.bootstrap_port,
-            bootstrap_room=request.bootstrap_room,
-            return_hidden_states=request.return_hidden_states,
-            rid=request.rid,
-        )
+                if isinstance(processed_messages.prompt_ids, str):
+                    prompt_kwargs = {"text": processed_messages.prompt_ids}
+                else:
+                    prompt_kwargs = {"input_ids": processed_messages.prompt_ids}
+
+            adapted_request = GenerateReqInput(
+                **prompt_kwargs,
+                image_data=processed_messages.image_data,
+                video_data=processed_messages.video_data,
+                audio_data=processed_messages.audio_data,
+                sampling_params=sampling_params,
+                return_logprob=request.logprobs,
+                logprob_start_len=-1,
+                top_logprobs_num=request.top_logprobs or 0,
+                stream=request.stream,
+                return_text_in_logprobs=True,
+                modalities=processed_messages.modalities,
+                lora_path=request.lora_path,
+                bootstrap_host=request.bootstrap_host,
+                bootstrap_port=request.bootstrap_port,
+                bootstrap_room=request.bootstrap_room,
+                return_hidden_states=request.return_hidden_states,
+                rid=request.rid,
+            )
+        else:
+            processed_messages, prompt_ids = self._make_request_with_harmony(request)
+
+            adapted_request = GenerateReqInput(
+                input_ids=prompt_ids,
+                sampling_params=self._build_sampling_params(
+                    request,
+                    request.stop,
+                    tool_call_constraint=None,
+                ),
+                stream=request.stream,
+                return_logprob=request.logprobs,
+                logprob_start_len=-1,
+                top_logprobs_num=request.top_logprobs or 0,
+                return_text_in_logprobs=True,
+                lora_path=request.lora_path,
+                bootstrap_host=request.bootstrap_host,
+                bootstrap_port=request.bootstrap_port,
+                bootstrap_room=request.bootstrap_room,
+                return_hidden_states=request.return_hidden_states,
+                rid=request.rid,
+            )
 
         return adapted_request, request
 
@@ -277,6 +332,8 @@ class OpenAIServingChat(OpenAIServingBase):
                 prompt = prompt[: -len(conv.sep2)]
         else:
             prompt = conv.get_prompt()
+            if self._get_enable_thinking_from_request(request):
+                prompt += "<think>"  # Note(Xinyuan): hard code thinking token
 
         image_data = conv.image_data if conv.image_data else None
         video_data = conv.video_data if conv.video_data else None
@@ -402,6 +459,12 @@ class OpenAIServingChat(OpenAIServingBase):
         cached_tokens = {}
         hidden_states = {}
 
+        # Harmony tracking
+        if self.use_harmony:
+            harmony_parsers = [
+                get_streamable_parser_for_assistant() for _ in range(request.n)
+            ]
+
         try:
             async for content in self.tokenizer_manager.generate_request(
                 adapted_request, raw_request
@@ -449,14 +512,57 @@ class OpenAIServingChat(OpenAIServingBase):
                     yield f"data: {chunk.model_dump_json()}\n\n"
 
                 # Process content delta
-                stream_buffer = stream_buffers.get(index, "")
-                delta = content["text"][len(stream_buffer) :]
-                stream_buffers[index] = stream_buffer + delta
+                if self.use_harmony:
+                    harmony_parser = harmony_parsers[index]
+
+                    new_token_ids = content["output_ids"]
+                    for token_id in new_token_ids:
+                        harmony_parser.process(token_id)
+
+                    is_final = harmony_parser.current_channel == "final"
+                    is_analysis = harmony_parser.current_channel == "analysis"
+                    delta = harmony_parser.last_content_delta or ""
+
+                    if is_analysis:
+                        choice_data = ChatCompletionResponseStreamChoice(
+                            index=index,
+                            delta=DeltaMessage(reasoning_content=delta),
+                            finish_reason=None,
+                        )
+                        chunk = ChatCompletionStreamResponse(
+                            id=content["meta_info"]["id"],
+                            created=int(time.time()),
+                            choices=[choice_data],
+                            model=request.model,
+                        )
+                        yield f"data: {chunk.model_dump_json()}\n\n"
+                        continue
+
+                    choice_data = ChatCompletionResponseStreamChoice(
+                        index=index,
+                        delta=DeltaMessage(content=delta if delta else None),
+                        finish_reason=None,
+                        matched_stop=None,
+                        logprobs=choice_logprobs,
+                    )
+                    chunk = ChatCompletionStreamResponse(
+                        id=content["meta_info"]["id"],
+                        created=int(time.time()),
+                        choices=[choice_data],
+                        model=request.model,
+                    )
+                    yield f"data: {chunk.model_dump_json()}\n\n"
+                    continue
+                else:
+                    stream_buffer = stream_buffers.get(index, "")
+                    delta = content["text"][len(stream_buffer) :]
+                    stream_buffers[index] = stream_buffer + delta
 
                 # Handle reasoning content
                 if (
                     self.tokenizer_manager.server_args.reasoning_parser
                     and request.separate_reasoning
+                    and not self.use_harmony
                 ):
                     reasoning_text, delta = self._process_reasoning_stream(
                         index, delta, reasoning_parser_dict, content, request
@@ -475,8 +581,27 @@ class OpenAIServingChat(OpenAIServingBase):
                     )
                     yield f"data: {chunk.model_dump_json()}\n\n"
 
+                if self.use_harmony and not is_final:
+                    choice_data = ChatCompletionResponseStreamChoice(
+                        index=index,
+                        delta=DeltaMessage(reasoning_content=delta),
+                        finish_reason=None,
+                    )
+                    chunk = ChatCompletionStreamResponse(
+                        id=content["meta_info"]["id"],
+                        created=int(time.time()),
+                        choices=[choice_data],
+                        model=request.model,
+                    )
+                    yield f"data: {chunk.model_dump_json()}\n\n"
+
                 # Handle tool calls
-                if request.tool_choice != "none" and request.tools:
+                # TODO: support tool call parsing for harmony
+                if (
+                    request.tool_choice != "none"
+                    and request.tools
+                    and not self.use_harmony
+                ):
                     async for chunk in self._process_tool_call_stream(
                         index,
                         delta,
@@ -502,7 +627,7 @@ class OpenAIServingChat(OpenAIServingBase):
                 if delta:
                     choice_data = ChatCompletionResponseStreamChoice(
                         index=index,
-                        delta=DeltaMessage(content=delta if delta else None),
+                        delta=DeltaMessage(content=delta),
                         finish_reason=None,
                         matched_stop=None,
                         logprobs=choice_logprobs,
@@ -640,14 +765,90 @@ class OpenAIServingChat(OpenAIServingBase):
 
             finish_reason = ret_item["meta_info"]["finish_reason"]
             text = ret_item["text"]
+            output_ids = ret_item["output_ids"]
+
+            if self.use_harmony:
+                parser = parse_output_into_messages(output_ids)
+                output_msgs = parser.messages
+                if len(output_msgs) == 0:
+                    # The generation has stopped during reasoning.
+                    is_tool_call = False
+                    reasoning_content = parser.current_content
+                    final_content = None
+                elif len(output_msgs) == 1:
+                    # The generation has stopped during final message.
+                    is_tool_call = False
+                    reasoning_content = output_msgs[0].content[0].text
+                    final_content = parser.current_content
+                else:
+                    if len(output_msgs) != 2:
+                        raise ValueError(
+                            "Expected 2 output messages (reasoning and final), "
+                            f"but got {len(output_msgs)}."
+                        )
+                    reasoning_msg, final_msg = output_msgs
+                    reasoning_content = reasoning_msg.content[0].text
+                    final_content = final_msg.content[0].text
+                    is_tool_call = final_msg.recipient is not None
+
+                if is_tool_call:
+                    # Extract tool call information from final message
+                    tool_call = (
+                        self.harmony_tool_parser.extract_tool_calls_from_message(
+                            final_msg
+                        )
+                    )
+                    tool_calls = [tool_call] if tool_call else []
+
+                    message = ChatMessage(
+                        role="assistant",
+                        reasoning_content=reasoning_content,
+                        content=None,  # Tool calls don't have regular content
+                        tool_calls=tool_calls,
+                    )
+                else:
+                    # Normal message
+                    message = ChatMessage(
+                        role="assistant",
+                        reasoning_content=reasoning_content,
+                        content=final_content,
+                    )
+
+                if is_tool_call:
+                    finish_reason_type = "tool_calls"
+                elif finish_reason:
+                    finish_reason_type = (
+                        finish_reason["type"] if finish_reason else "stop"
+                    )
+                else:
+                    finish_reason_type = "stop"
+                choice_data = ChatCompletionResponseChoice(
+                    index=idx,
+                    message=message,
+                    logprobs=choice_logprobs,
+                    finish_reason=finish_reason_type,
+                    matched_stop=(
+                        finish_reason["matched"]
+                        if finish_reason and "matched" in finish_reason
+                        else None
+                    ),
+                )
+                choices.append(choice_data)
+                continue
 
             # Handle reasoning content
             reasoning_text = None
             reasoning_parser = self.tokenizer_manager.server_args.reasoning_parser
             if reasoning_parser and request.separate_reasoning:
+                is_force_reasoning = (
+                    self.template_manager.force_reasoning
+                    or self._get_enable_thinking_from_request(request)
+                )
                 try:
                     parser = ReasoningParser(
-                        model_type=reasoning_parser, stream_reasoning=False
+                        model_type=reasoning_parser,
+                        stream_reasoning=False,
+                        force_reasoning=is_force_reasoning,
                     )
                     reasoning_text, text = parser.parse_non_stream(text)
                 except Exception as e:
@@ -810,14 +1011,19 @@ class OpenAIServingChat(OpenAIServingBase):
     ) -> tuple[Optional[str], str]:
         """Process reasoning content in streaming response"""
         if index not in reasoning_parser_dict:
+            is_force_reasoning = (
+                self.template_manager.force_reasoning
+                or self._get_enable_thinking_from_request(request)
+            )
             reasoning_parser_dict[index] = ReasoningParser(
                 self.tokenizer_manager.server_args.reasoning_parser,
                 request.stream_reasoning,
+                is_force_reasoning,
            )
         reasoning_parser = reasoning_parser_dict[index]
         return reasoning_parser.parse_stream_chunk(delta)
 
-    def _get_enable_thinking_from_request(request: ChatCompletionRequest) -> bool:
+    def _get_enable_thinking_from_request(self, request: ChatCompletionRequest) -> bool:
         """Extracts the 'enable_thinking' flag from request chat_template_kwargs.
 
         NOTE: This parameter is only useful for models that support enable_thinking
@@ -826,7 +1032,7 @@ class OpenAIServingChat(OpenAIServingBase):
         Args:
             request_obj: The request object (or an item from a list of requests).
         Returns:
-            The boolean value of 'enable_thinking' if found and not True, otherwise True.
+            The boolean value of 'enable_thinking' if found, otherwise False.
         """
         if (
             hasattr(request, "chat_template_kwargs")
@@ -834,7 +1040,7 @@ class OpenAIServingChat(OpenAIServingBase):
             and request.chat_template_kwargs.get("enable_thinking") is not None
         ):
            return request.chat_template_kwargs.get("enable_thinking")
-        return True
+        return False
 
     async def _process_tool_call_stream(
         self,
@@ -978,3 +1184,33 @@ class OpenAIServingChat(OpenAIServingBase):
             return f"data: {chunk.model_dump_json()}\n\n"
 
         return None
+
+    def _make_request_with_harmony(
+        self,
+        request: ChatCompletionRequest,
+    ):
+        messages: list[OpenAIMessage] = []
+
+        # Add system message.
+        # In Chat Completion API, browsing is enabled by default if the model
+        # supports it.
+        assert not self.supports_browsing
+        assert not self.supports_code_interpreter
+        sys_msg = get_system_message(
+            reasoning_effort=request.reasoning_effort,
+            browser_description=None,
+            python_description=None,
+        )
+        messages.append(sys_msg)
+
+        # Add developer message.
+        dev_msg = get_developer_message()
+        messages.append(dev_msg)
+
+        # Add user message.
+        for chat_msg in request.messages:
+            messages.append(parse_chat_input(chat_msg))
+
+        # Render prompt token ids.
+        prompt_token_ids = render_for_completion(messages)
+        return messages, prompt_token_ids
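
For readers who want to exercise the new gpt-oss / Harmony chat-completion path shown in this diff end to end, a minimal client-side sketch follows. The server URL, port, and model name are placeholders chosen for illustration; the `reasoning_effort` request field and the `reasoning_content` response field correspond to the extended protocol and serving code above, but the exact launch flags for a given checkpoint may differ.

# Minimal sketch (assumptions: an SGLang 0.5.0rc0 server is already running locally
# with a gpt-oss checkpoint, e.g. started with
#   python -m sglang.launch_server --model-path <gpt-oss checkpoint> --port 30000
# The URL, port, and model name below are placeholders, not part of the diff.)
import requests

resp = requests.post(
    "http://localhost:30000/v1/chat/completions",
    json={
        "model": "gpt-oss",
        "messages": [
            {"role": "user", "content": "Explain KV cache reuse in one paragraph."}
        ],
        # reasoning_effort is forwarded into the Harmony system message
        # (see get_system_message in _make_request_with_harmony above).
        "reasoning_effort": "low",
        "stream": False,
    },
    timeout=120,
)
resp.raise_for_status()
choice = resp.json()["choices"][0]

# With use_harmony enabled, the analysis channel is returned separately as
# reasoning_content and the final channel as content.
print(choice["message"].get("reasoning_content"))
print(choice["message"]["content"])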