sglang 0.4.7__py3-none-any.whl → 0.4.8__py3-none-any.whl

This diff shows the changes between two publicly released versions of the package, as published to their public registry. It is provided for informational purposes only.
Files changed (152)
  1. sglang/__init__.py +2 -0
  2. sglang/api.py +7 -0
  3. sglang/bench_one_batch.py +8 -6
  4. sglang/bench_serving.py +1 -1
  5. sglang/lang/interpreter.py +40 -1
  6. sglang/lang/ir.py +27 -0
  7. sglang/math_utils.py +8 -0
  8. sglang/srt/_custom_ops.py +2 -2
  9. sglang/srt/code_completion_parser.py +2 -44
  10. sglang/srt/configs/model_config.py +6 -0
  11. sglang/srt/constants.py +3 -0
  12. sglang/srt/conversation.py +19 -3
  13. sglang/srt/custom_op.py +5 -1
  14. sglang/srt/disaggregation/base/__init__.py +1 -1
  15. sglang/srt/disaggregation/base/conn.py +25 -11
  16. sglang/srt/disaggregation/common/__init__.py +5 -1
  17. sglang/srt/disaggregation/common/utils.py +42 -0
  18. sglang/srt/disaggregation/decode.py +211 -72
  19. sglang/srt/disaggregation/decode_schedule_batch_mixin.py +4 -3
  20. sglang/srt/disaggregation/fake/__init__.py +1 -1
  21. sglang/srt/disaggregation/fake/conn.py +15 -9
  22. sglang/srt/disaggregation/mini_lb.py +34 -4
  23. sglang/srt/disaggregation/mooncake/__init__.py +1 -1
  24. sglang/srt/disaggregation/mooncake/conn.py +30 -29
  25. sglang/srt/disaggregation/nixl/__init__.py +6 -1
  26. sglang/srt/disaggregation/nixl/conn.py +17 -12
  27. sglang/srt/disaggregation/prefill.py +144 -55
  28. sglang/srt/disaggregation/utils.py +155 -123
  29. sglang/srt/distributed/parallel_state.py +12 -4
  30. sglang/srt/entrypoints/engine.py +37 -29
  31. sglang/srt/entrypoints/http_server.py +153 -72
  32. sglang/srt/entrypoints/http_server_engine.py +0 -3
  33. sglang/srt/entrypoints/openai/__init__.py +0 -0
  34. sglang/srt/{openai_api → entrypoints/openai}/protocol.py +84 -10
  35. sglang/srt/entrypoints/openai/serving_base.py +149 -0
  36. sglang/srt/entrypoints/openai/serving_chat.py +921 -0
  37. sglang/srt/entrypoints/openai/serving_completions.py +424 -0
  38. sglang/srt/entrypoints/openai/serving_embedding.py +169 -0
  39. sglang/srt/entrypoints/openai/serving_rerank.py +102 -0
  40. sglang/srt/entrypoints/openai/serving_score.py +61 -0
  41. sglang/srt/entrypoints/openai/usage_processor.py +81 -0
  42. sglang/srt/entrypoints/openai/utils.py +72 -0
  43. sglang/srt/eplb_simulator/__init__.py +1 -0
  44. sglang/srt/eplb_simulator/reader.py +51 -0
  45. sglang/srt/function_call/base_format_detector.py +7 -4
  46. sglang/srt/function_call/deepseekv3_detector.py +1 -1
  47. sglang/srt/function_call/ebnf_composer.py +64 -10
  48. sglang/srt/function_call/function_call_parser.py +6 -6
  49. sglang/srt/function_call/llama32_detector.py +1 -1
  50. sglang/srt/function_call/mistral_detector.py +1 -1
  51. sglang/srt/function_call/pythonic_detector.py +1 -1
  52. sglang/srt/function_call/qwen25_detector.py +1 -1
  53. sglang/srt/{openai_api/utils.py → jinja_template_utils.py} +6 -5
  54. sglang/srt/layers/activation.py +40 -3
  55. sglang/srt/layers/attention/aiter_backend.py +20 -4
  56. sglang/srt/layers/attention/base_attn_backend.py +1 -1
  57. sglang/srt/layers/attention/cutlass_mla_backend.py +39 -15
  58. sglang/srt/layers/attention/flashattention_backend.py +71 -72
  59. sglang/srt/layers/attention/flashinfer_backend.py +10 -8
  60. sglang/srt/layers/attention/flashinfer_mla_backend.py +29 -28
  61. sglang/srt/layers/attention/flashmla_backend.py +7 -12
  62. sglang/srt/layers/attention/tbo_backend.py +3 -3
  63. sglang/srt/layers/attention/triton_backend.py +138 -130
  64. sglang/srt/layers/attention/triton_ops/decode_attention.py +2 -7
  65. sglang/srt/layers/attention/vision.py +51 -24
  66. sglang/srt/layers/communicator.py +28 -10
  67. sglang/srt/layers/dp_attention.py +11 -2
  68. sglang/srt/layers/layernorm.py +29 -2
  69. sglang/srt/layers/linear.py +0 -4
  70. sglang/srt/layers/logits_processor.py +2 -14
  71. sglang/srt/layers/moe/ep_moe/kernels.py +165 -7
  72. sglang/srt/layers/moe/ep_moe/layer.py +249 -33
  73. sglang/srt/layers/moe/ep_moe/token_dispatcher.py +11 -37
  74. sglang/srt/layers/moe/fused_moe_triton/configs/triton_3_3_1/E=128,N=384,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8,block_shape=[128, 128].json +146 -0
  75. sglang/srt/layers/moe/fused_moe_triton/fused_moe.py +7 -4
  76. sglang/srt/layers/moe/fused_moe_triton/layer.py +75 -12
  77. sglang/srt/layers/moe/topk.py +107 -12
  78. sglang/srt/layers/pooler.py +56 -0
  79. sglang/srt/layers/quantization/compressed_tensors/compressed_tensors_moe.py +6 -2
  80. sglang/srt/layers/quantization/deep_gemm_wrapper/__init__.py +1 -0
  81. sglang/srt/layers/quantization/{deep_gemm.py → deep_gemm_wrapper/compile_utils.py} +23 -80
  82. sglang/srt/layers/quantization/deep_gemm_wrapper/configurer.py +32 -0
  83. sglang/srt/layers/quantization/deep_gemm_wrapper/entrypoint.py +110 -0
  84. sglang/srt/layers/quantization/fp8.py +25 -17
  85. sglang/srt/layers/quantization/fp8_kernel.py +44 -15
  86. sglang/srt/layers/quantization/fp8_utils.py +87 -22
  87. sglang/srt/layers/quantization/modelopt_quant.py +62 -8
  88. sglang/srt/layers/quantization/utils.py +5 -2
  89. sglang/srt/layers/radix_attention.py +2 -3
  90. sglang/srt/layers/rotary_embedding.py +42 -2
  91. sglang/srt/layers/sampler.py +1 -1
  92. sglang/srt/lora/lora_manager.py +249 -105
  93. sglang/srt/lora/mem_pool.py +53 -50
  94. sglang/srt/lora/utils.py +1 -1
  95. sglang/srt/managers/cache_controller.py +33 -14
  96. sglang/srt/managers/io_struct.py +31 -10
  97. sglang/srt/managers/multimodal_processors/base_processor.py +2 -2
  98. sglang/srt/managers/multimodal_processors/vila.py +85 -0
  99. sglang/srt/managers/schedule_batch.py +79 -37
  100. sglang/srt/managers/schedule_policy.py +70 -56
  101. sglang/srt/managers/scheduler.py +220 -79
  102. sglang/srt/managers/template_manager.py +226 -0
  103. sglang/srt/managers/tokenizer_manager.py +40 -10
  104. sglang/srt/managers/tp_worker.py +12 -2
  105. sglang/srt/managers/tp_worker_overlap_thread.py +11 -0
  106. sglang/srt/mem_cache/{paged_allocator.py → allocator.py} +125 -34
  107. sglang/srt/mem_cache/base_prefix_cache.py +52 -8
  108. sglang/srt/mem_cache/chunk_cache.py +11 -15
  109. sglang/srt/mem_cache/hiradix_cache.py +38 -25
  110. sglang/srt/mem_cache/memory_pool.py +213 -505
  111. sglang/srt/mem_cache/memory_pool_host.py +380 -0
  112. sglang/srt/mem_cache/radix_cache.py +56 -28
  113. sglang/srt/model_executor/cuda_graph_runner.py +198 -100
  114. sglang/srt/model_executor/forward_batch_info.py +32 -10
  115. sglang/srt/model_executor/model_runner.py +28 -12
  116. sglang/srt/model_loader/loader.py +16 -2
  117. sglang/srt/model_loader/weight_utils.py +11 -2
  118. sglang/srt/models/bert.py +113 -13
  119. sglang/srt/models/deepseek_nextn.py +29 -27
  120. sglang/srt/models/deepseek_v2.py +213 -173
  121. sglang/srt/models/glm4.py +312 -0
  122. sglang/srt/models/internvl.py +46 -102
  123. sglang/srt/models/mimo_mtp.py +2 -18
  124. sglang/srt/models/roberta.py +117 -9
  125. sglang/srt/models/vila.py +305 -0
  126. sglang/srt/reasoning_parser.py +21 -11
  127. sglang/srt/sampling/sampling_batch_info.py +24 -0
  128. sglang/srt/sampling/sampling_params.py +2 -0
  129. sglang/srt/server_args.py +351 -238
  130. sglang/srt/speculative/build_eagle_tree.py +1 -1
  131. sglang/srt/speculative/eagle_draft_cuda_graph_runner.py +131 -9
  132. sglang/srt/speculative/eagle_draft_extend_cuda_graph_runner.py +130 -14
  133. sglang/srt/speculative/eagle_utils.py +468 -116
  134. sglang/srt/speculative/eagle_worker.py +258 -84
  135. sglang/srt/torch_memory_saver_adapter.py +19 -15
  136. sglang/srt/two_batch_overlap.py +4 -2
  137. sglang/srt/utils.py +235 -11
  138. sglang/test/attention/test_prefix_chunk_info.py +2 -0
  139. sglang/test/runners.py +38 -3
  140. sglang/test/test_block_fp8.py +1 -0
  141. sglang/test/test_block_fp8_deep_gemm_blackwell.py +252 -0
  142. sglang/test/test_block_fp8_ep.py +2 -0
  143. sglang/test/test_utils.py +4 -1
  144. sglang/utils.py +9 -0
  145. sglang/version.py +1 -1
  146. {sglang-0.4.7.dist-info → sglang-0.4.8.dist-info}/METADATA +8 -14
  147. {sglang-0.4.7.dist-info → sglang-0.4.8.dist-info}/RECORD +150 -128
  148. sglang/srt/entrypoints/verl_engine.py +0 -179
  149. sglang/srt/openai_api/adapter.py +0 -1990
  150. {sglang-0.4.7.dist-info → sglang-0.4.8.dist-info}/WHEEL +0 -0
  151. {sglang-0.4.7.dist-info → sglang-0.4.8.dist-info}/licenses/LICENSE +0 -0
  152. {sglang-0.4.7.dist-info → sglang-0.4.8.dist-info}/top_level.txt +0 -0
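The hunk below is the new file sglang/srt/entrypoints/openai/serving_chat.py (entry 36 above), part of the refactor that moves the OpenAI-compatible layer from sglang/srt/openai_api (the removed adapter.py, entry 149) into sglang/srt/entrypoints/openai. The class it defines handles /v1/chat/completions. As a client-side sketch only, assuming a locally running sglang server on http://localhost:30000 and the official openai Python package (the URL, port, API key, and model name are illustrative, not part of this diff):

```python
# Hypothetical client usage of the OpenAI-compatible chat endpoint backed by
# serving_chat.py. Assumes a local sglang server (e.g. started with
# `python -m sglang.launch_server --model-path <model>`) on port 30000.
from openai import OpenAI

client = OpenAI(base_url="http://localhost:30000/v1", api_key="EMPTY")

# Non-streaming request: served by _handle_non_streaming_request below.
resp = client.chat.completions.create(
    model="default",
    messages=[{"role": "user", "content": "Say hello in one sentence."}],
    temperature=0.7,
    max_tokens=64,
)
print(resp.choices[0].message.content)

# Streaming request: served by _handle_streaming_request, which emits
# server-sent events and terminates the stream with "data: [DONE]".
stream = client.chat.completions.create(
    model="default",
    messages=[{"role": "user", "content": "Count to five."}],
    stream=True,
)
for chunk in stream:
    # Guard against chunks with an empty choices list (e.g. the usage chunk).
    delta = chunk.choices[0].delta.content if chunk.choices else None
    if delta:
        print(delta, end="", flush=True)
```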
@@ -0,0 +1,921 @@
+import copy
+import json
+import logging
+import time
+import uuid
+from typing import Any, AsyncGenerator, Dict, List, Optional, Union
+
+from fastapi import Request
+from fastapi.responses import ORJSONResponse, StreamingResponse
+
+from sglang.srt.conversation import generate_chat_conv
+from sglang.srt.entrypoints.openai.protocol import (
+    ChatCompletionRequest,
+    ChatCompletionResponse,
+    ChatCompletionResponseChoice,
+    ChatCompletionResponseStreamChoice,
+    ChatCompletionStreamResponse,
+    ChatCompletionTokenLogprob,
+    ChatMessage,
+    ChoiceLogprobs,
+    DeltaMessage,
+    ErrorResponse,
+    FunctionResponse,
+    LogProbs,
+    ToolCall,
+    TopLogprob,
+)
+from sglang.srt.entrypoints.openai.serving_base import OpenAIServingBase
+from sglang.srt.entrypoints.openai.usage_processor import UsageProcessor
+from sglang.srt.entrypoints.openai.utils import (
+    process_hidden_states_from_ret,
+    to_openai_style_logprobs,
+)
+from sglang.srt.function_call.function_call_parser import FunctionCallParser
+from sglang.srt.jinja_template_utils import process_content_for_template_format
+from sglang.srt.managers.io_struct import GenerateReqInput
+from sglang.srt.managers.template_manager import TemplateManager
+from sglang.srt.managers.tokenizer_manager import TokenizerManager
+from sglang.srt.reasoning_parser import ReasoningParser
+from sglang.utils import convert_json_schema_to_str
+
+logger = logging.getLogger(__name__)
+
+
+class OpenAIServingChat(OpenAIServingBase):
+    """Handler for /v1/chat/completions requests"""
+
+    def __init__(
+        self, tokenizer_manager: TokenizerManager, template_manager: TemplateManager
+    ):
+        super().__init__(tokenizer_manager)
+        self.template_manager = template_manager
+
+    def _request_id_prefix(self) -> str:
+        return "chatcmpl-"
+
+    def _convert_to_internal_request(
+        self,
+        request: ChatCompletionRequest,
+    ) -> tuple[GenerateReqInput, ChatCompletionRequest]:
+        """Convert OpenAI chat completion request to internal format"""
+        is_multimodal = self.tokenizer_manager.model_config.is_multimodal
+
+        # Process messages and apply chat template
+        (
+            prompt,
+            prompt_ids,
+            image_data,
+            audio_data,
+            modalities,
+            stop,
+            tool_call_constraint,
+        ) = self._process_messages(request, is_multimodal)
+
+        # Build sampling parameters
+        sampling_params = self._build_sampling_params(
+            request, stop, tool_call_constraint
+        )
+
+        # Handle single vs multiple requests
+        if is_multimodal:
+            prompt_kwargs = {"text": prompt}
+        else:
+            if isinstance(prompt_ids, str):
+                prompt_kwargs = {"text": prompt_ids}
+            else:
+                prompt_kwargs = {"input_ids": prompt_ids}
+
+        adapted_request = GenerateReqInput(
+            **prompt_kwargs,
+            image_data=image_data,
+            audio_data=audio_data,
+            sampling_params=sampling_params,
+            return_logprob=request.logprobs,
+            logprob_start_len=-1,
+            top_logprobs_num=request.top_logprobs or 0,
+            stream=request.stream,
+            return_text_in_logprobs=True,
+            modalities=modalities,
+            lora_path=request.lora_path,
+            bootstrap_host=request.bootstrap_host,
+            bootstrap_port=request.bootstrap_port,
+            bootstrap_room=request.bootstrap_room,
+            return_hidden_states=request.return_hidden_states,
+        )
+
+        return adapted_request, request
+
+    def _process_messages(
+        self, request: ChatCompletionRequest, is_multimodal: bool
+    ) -> tuple[
+        str,
+        Union[str, List[int]],
+        Optional[Any],
+        Optional[Any],
+        List[str],
+        List[str],
+        Optional[Any],
+    ]:
+        """Process chat messages and apply chat template"""
+        tool_call_constraint = None
+        prompt = ""
+        prompt_ids = []
+
+        if not isinstance(request.messages, str):
+            # Apply chat template and its stop strings
+            tools = None
+            if request.tools and request.tool_choice != "none":
+                request.skip_special_tokens = False
+                if not isinstance(request.tool_choice, str):
+                    tools = [
+                        item.function.model_dump()
+                        for item in request.tools
+                        if item.function.name == request.tool_choice.function.name
+                    ]
+                else:
+                    tools = [item.function.model_dump() for item in request.tools]
+
+                tool_call_parser = self.tokenizer_manager.server_args.tool_call_parser
+                parser = FunctionCallParser(request.tools, tool_call_parser)
+                tool_call_constraint = parser.get_structure_constraint(
+                    request.tool_choice
+                )
+
+            # Use chat template
+            if self.template_manager.chat_template_name is None:
+                prompt, prompt_ids, image_data, audio_data, modalities, stop = (
+                    self._apply_jinja_template(request, tools, is_multimodal)
+                )
+            else:
+                prompt, prompt_ids, image_data, audio_data, modalities, stop = (
+                    self._apply_conversation_template(request, is_multimodal)
+                )
+        else:
+            # Use raw prompt
+            prompt_ids = request.messages
+            stop = request.stop or []
+            image_data = None
+            audio_data = None
+            modalities = []
+            prompt = request.messages
+
+        return (
+            prompt,
+            prompt_ids,
+            image_data,
+            audio_data,
+            modalities,
+            stop,
+            tool_call_constraint,
+        )
+
+    def _apply_jinja_template(
+        self,
+        request: ChatCompletionRequest,
+        tools: Optional[List[Dict]],
+        is_multimodal: bool,
+    ) -> tuple[str, List[int], Optional[Any], Optional[Any], List[str], List[str]]:
+        """Apply Jinja chat template"""
+        prompt = ""
+        prompt_ids = []
+        openai_compatible_messages = []
+        image_data = []
+        audio_data = []
+        modalities = []
+
+        template_content_format = self.template_manager.jinja_template_content_format
+
+        for message in request.messages:
+            if message.content is None:
+                message.content = ""
+            msg_dict = message.model_dump()
+
+            # Process content based on detected template format
+            processed_msg = process_content_for_template_format(
+                msg_dict,
+                template_content_format,
+                image_data,
+                audio_data,
+                modalities,
+            )
+            openai_compatible_messages.append(processed_msg)
+
+        # Handle assistant prefix for continue_final_message
+        assistant_prefix = None
+        if (
+            openai_compatible_messages
+            and openai_compatible_messages[-1]["role"] == "assistant"
+        ):
+            if request.continue_final_message:
+                assistant_prefix = openai_compatible_messages[-1]["content"]
+                openai_compatible_messages = openai_compatible_messages[:-1]
+
+        try:
+            prompt_ids = self.tokenizer_manager.tokenizer.apply_chat_template(
+                openai_compatible_messages,
+                tokenize=True,
+                add_generation_prompt=True,
+                tools=tools,
+                **(
+                    request.chat_template_kwargs if request.chat_template_kwargs else {}
+                ),
+            )
+        except Exception:
+            # This except branch will be triggered when the chosen model
+            # has a different tools input format that is not compatible
+            # with openAI's apply_chat_template tool_call format, like Mistral.
+            tools = (
+                [t if "function" in t else {"function": t} for t in tools]
+                if tools
+                else None
+            )
+            prompt_ids = self.tokenizer_manager.tokenizer.apply_chat_template(
+                openai_compatible_messages,
+                tokenize=True,
+                add_generation_prompt=True,
+                tools=tools,
+                **(
+                    request.chat_template_kwargs if request.chat_template_kwargs else {}
+                ),
+            )
+
+        if assistant_prefix:
+            encoded = self.tokenizer_manager.tokenizer.encode(assistant_prefix)
+            if encoded and encoded[0] == self.tokenizer_manager.tokenizer.bos_token_id:
+                encoded = encoded[1:]
+            prompt_ids += encoded
+
+        if is_multimodal:
+            prompt = self.tokenizer_manager.tokenizer.decode(prompt_ids)
+
+        stop = request.stop
+        image_data = image_data if image_data else None
+        audio_data = audio_data if audio_data else None
+        modalities = modalities if modalities else []
+        return prompt, prompt_ids, image_data, audio_data, modalities, stop
+
+    def _apply_conversation_template(
+        self,
+        request: ChatCompletionRequest,
+        is_multimodal: bool,
+    ) -> tuple[str, Optional[Any], Optional[Any], List[str], List[str], List[str]]:
+        """Apply conversation template"""
+        prompt = ""
+        prompt_ids = []
+        conv = generate_chat_conv(request, self.template_manager.chat_template_name)
+
+        # If we should continue the final assistant message, adjust the conversation.
+        if (
+            request.continue_final_message
+            and request.messages
+            and request.messages[-1].role == "assistant"
+        ):
+            # Remove the auto-added blank assistant turn, if present.
+            if conv.messages and conv.messages[-1][1] is None:
+                conv.messages.pop()
+            # Rebuild the prompt from the conversation.
+            prompt = conv.get_prompt()
+            # Strip trailing stop tokens or separators that indicate end-of-assistant.
+            if isinstance(conv.stop_str, list):
+                for stop_token in conv.stop_str:
+                    if prompt.endswith(stop_token):
+                        prompt = prompt[: -len(stop_token)]
+            elif isinstance(conv.stop_str, str) and prompt.endswith(conv.stop_str):
+                prompt = prompt[: -len(conv.stop_str)]
+            if conv.sep and prompt.endswith(conv.sep):
+                prompt = prompt[: -len(conv.sep)]
+            if getattr(conv, "sep2", None) and prompt.endswith(conv.sep2):
+                prompt = prompt[: -len(conv.sep2)]
+        else:
+            prompt = conv.get_prompt()
+
+        image_data = conv.image_data if conv.image_data else None
+        audio_data = conv.audio_data if conv.audio_data else None
+        modalities = conv.modalities if conv.modalities else []
+        stop = copy.copy(conv.stop_str or [] if not request.ignore_eos else [])
+
+        if request.stop:
+            if isinstance(request.stop, str):
+                stop.append(request.stop)
+            else:
+                stop.extend(request.stop)
+
+        if not is_multimodal:
+            prompt_ids = self.tokenizer_manager.tokenizer.encode(prompt)
+
+        return prompt, prompt_ids, image_data, audio_data, modalities, stop
+
+    def _build_sampling_params(
+        self,
+        request: ChatCompletionRequest,
+        stop: List[str],
+        tool_call_constraint: Optional[Any],
+    ) -> Dict[str, Any]:
+        """Build sampling parameters for the request"""
+
+        sampling_params = {
+            "temperature": request.temperature,
+            "max_new_tokens": request.max_tokens or request.max_completion_tokens,
+            "min_new_tokens": request.min_tokens,
+            "stop": stop,
+            "stop_token_ids": request.stop_token_ids,
+            "top_p": request.top_p,
+            "top_k": request.top_k,
+            "min_p": request.min_p,
+            "presence_penalty": request.presence_penalty,
+            "frequency_penalty": request.frequency_penalty,
+            "repetition_penalty": request.repetition_penalty,
+            "regex": request.regex,
+            "ebnf": request.ebnf,
+            "n": request.n,
+            "no_stop_trim": request.no_stop_trim,
+            "ignore_eos": request.ignore_eos,
+            "skip_special_tokens": request.skip_special_tokens,
+            "logit_bias": request.logit_bias,
+        }
+
+        if request.response_format and request.response_format.type == "json_schema":
+            sampling_params["json_schema"] = convert_json_schema_to_str(
+                request.response_format.json_schema.schema_
+            )
+        elif request.response_format and request.response_format.type == "json_object":
+            sampling_params["json_schema"] = '{"type": "object"}'
+        elif (
+            request.response_format and request.response_format.type == "structural_tag"
+        ):
+            sampling_params["structural_tag"] = convert_json_schema_to_str(
+                request.response_format.model_dump(by_alias=True)
+            )
+
+        # Check if there are already existing output constraints
+        has_existing_constraints = (
+            sampling_params.get("regex")
+            or sampling_params.get("ebnf")
+            or sampling_params.get("structural_tag")
+            or sampling_params.get("json_schema")
+        )
+
+        if tool_call_constraint and has_existing_constraints:
+            logger.warning("Constrained decoding is not compatible with tool calls.")
+        elif tool_call_constraint:
+            constraint_type, constraint_value = tool_call_constraint
+            if constraint_type == "structural_tag":
+                sampling_params[constraint_type] = convert_json_schema_to_str(
+                    constraint_value.model_dump(by_alias=True)
+                )
+            else:
+                sampling_params[constraint_type] = constraint_value
+        return sampling_params
+
+    async def _handle_streaming_request(
+        self,
+        adapted_request: GenerateReqInput,
+        request: ChatCompletionRequest,
+        raw_request: Request,
+    ) -> StreamingResponse:
+        """Handle streaming chat completion request"""
+        return StreamingResponse(
+            self._generate_chat_stream(adapted_request, request, raw_request),
+            media_type="text/event-stream",
+            background=self.tokenizer_manager.create_abort_task(adapted_request),
+        )
+
+    async def _generate_chat_stream(
+        self,
+        adapted_request: GenerateReqInput,
+        request: ChatCompletionRequest,
+        raw_request: Request,
+    ) -> AsyncGenerator[str, None]:
+        """Generate streaming chat completion response"""
+        # Parsers for tool calls and reasoning
+        parser_dict = {}
+        reasoning_parser_dict = {}
+
+        # State tracking for streaming
+        is_firsts = {}
+        stream_buffers = {}
+        n_prev_tokens = {}
+
+        # Usage tracking
+        prompt_tokens = {}
+        completion_tokens = {}
+        cached_tokens = {}
+        hidden_states = {}
+
+        try:
+            async for content in self.tokenizer_manager.generate_request(
+                adapted_request, raw_request
+            ):
+                index = content.get("index", 0)
+
+                prompt_tokens[index] = content["meta_info"]["prompt_tokens"]
+                completion_tokens[index] = content["meta_info"]["completion_tokens"]
+                cached_tokens[index] = content["meta_info"].get("cached_tokens", 0)
+                hidden_states[index] = content["meta_info"].get("hidden_states", None)
+
+                # Handle logprobs
+                choice_logprobs = None
+                if request.logprobs:
+                    choice_logprobs = self._process_streaming_logprobs(
+                        content, n_prev_tokens.get(index, 0)
+                    )
+                    n_prev_tokens[index] = len(
+                        content["meta_info"]["output_token_logprobs"]
+                    )
+
+                finish_reason = content["meta_info"]["finish_reason"]
+                finish_reason_type = finish_reason["type"] if finish_reason else None
+
+                # First chunk with role
+                if is_firsts.get(index, True):
+                    is_firsts[index] = False
+                    delta = DeltaMessage(role="assistant", content="")
+                    choice_data = ChatCompletionResponseStreamChoice(
+                        index=index,
+                        delta=delta,
+                        finish_reason=finish_reason_type,
+                        matched_stop=(
+                            finish_reason["matched"]
+                            if finish_reason and "matched" in finish_reason
+                            else None
+                        ),
+                        logprobs=choice_logprobs,
+                    )
+                    chunk = ChatCompletionStreamResponse(
+                        id=content["meta_info"]["id"],
+                        created=int(time.time()),
+                        choices=[choice_data],
+                        model=request.model,
+                    )
+                    yield f"data: {chunk.model_dump_json()}\n\n"
+
+                # Process content delta
+                stream_buffer = stream_buffers.get(index, "")
+                delta = content["text"][len(stream_buffer) :]
+                stream_buffers[index] = stream_buffer + delta
+
+                # Handle reasoning content
+                if (
+                    self.tokenizer_manager.server_args.reasoning_parser
+                    and request.separate_reasoning
+                ):
+                    reasoning_text, delta = self._process_reasoning_stream(
+                        index, delta, reasoning_parser_dict, content, request
+                    )
+                    if reasoning_text:
+                        choice_data = ChatCompletionResponseStreamChoice(
+                            index=index,
+                            delta=DeltaMessage(reasoning_content=reasoning_text),
+                            finish_reason=finish_reason_type,
+                        )
+                        chunk = ChatCompletionStreamResponse(
+                            id=content["meta_info"]["id"],
+                            created=int(time.time()),
+                            choices=[choice_data],
+                            model=request.model,
+                        )
+                        yield f"data: {chunk.model_dump_json()}\n\n"
+
+                if not delta:
+                    continue
+
+                # Handle tool calls
+                if request.tool_choice != "none" and request.tools:
+                    async for chunk in self._process_tool_call_stream(
+                        index,
+                        delta,
+                        parser_dict,
+                        content,
+                        request,
+                        finish_reason_type,
+                    ):
+                        yield chunk
+                else:
+                    # Regular content
+                    if delta or not (
+                        request.stream_options and request.stream_options.include_usage
+                    ):
+                        choice_data = ChatCompletionResponseStreamChoice(
+                            index=index,
+                            delta=DeltaMessage(content=delta if delta else None),
+                            finish_reason=(
+                                None
+                                if request.stream_options
+                                and request.stream_options.include_usage
+                                else finish_reason_type
+                            ),
+                            matched_stop=(
+                                finish_reason["matched"]
+                                if finish_reason and "matched" in finish_reason
+                                else None
+                            ),
+                            logprobs=choice_logprobs,
+                        )
+                        chunk = ChatCompletionStreamResponse(
+                            id=content["meta_info"]["id"],
+                            created=int(time.time()),
+                            choices=[choice_data],
+                            model=request.model,
+                        )
+                        yield f"data: {chunk.model_dump_json()}\n\n"
+
+            # Final chunk with finish_reason
+            finish_reason_chunk = ChatCompletionStreamResponse(
+                id=content["meta_info"]["id"],
+                created=int(time.time()),
+                choices=[
+                    ChatCompletionResponseStreamChoice(
+                        index=index,
+                        delta=DeltaMessage(),
+                        finish_reason=finish_reason_type,
+                        matched_stop=(
+                            finish_reason["matched"]
+                            if finish_reason and "matched" in finish_reason
+                            else None
+                        ),
+                    )
+                ],
+                model=request.model,
+                usage=None,
+            )
+            yield f"data: {finish_reason_chunk.model_dump_json()}\n\n"
+
+            # Send hidden states if requested
+            if request.return_hidden_states and hidden_states:
+                for index, choice_hidden_states in hidden_states.items():
+                    if choice_hidden_states:
+                        last_token_hidden_states = (
+                            choice_hidden_states[-1]
+                            if len(choice_hidden_states) > 1
+                            else []
+                        )
+                        hidden_states_chunk = ChatCompletionStreamResponse(
+                            id=content["meta_info"]["id"],
+                            created=int(time.time()),
+                            choices=[
+                                ChatCompletionResponseStreamChoice(
+                                    index=index,
+                                    delta=DeltaMessage(
+                                        hidden_states=last_token_hidden_states
+                                    ),
+                                    finish_reason=finish_reason_type,
+                                )
+                            ],
+                            model=request.model,
+                        )
+                        yield f"data: {hidden_states_chunk.model_dump_json()}\n\n"
+
+            # Additional usage chunk
+            if request.stream_options and request.stream_options.include_usage:
+                usage = UsageProcessor.calculate_streaming_usage(
+                    prompt_tokens,
+                    completion_tokens,
+                    cached_tokens,
+                    n_choices=request.n,
+                    enable_cache_report=self.tokenizer_manager.server_args.enable_cache_report,
+                )
+                usage_chunk = ChatCompletionStreamResponse(
+                    id=content["meta_info"]["id"],
+                    created=int(time.time()),
+                    choices=[],  # Empty choices array as per OpenAI spec
+                    model=request.model,
+                    usage=usage,
+                )
+                yield f"data: {usage_chunk.model_dump_json()}\n\n"
+
+        except ValueError as e:
+            error = self.create_streaming_error_response(str(e))
+            yield f"data: {error}\n\n"
+
+        yield "data: [DONE]\n\n"
+
+    async def _handle_non_streaming_request(
+        self,
+        adapted_request: GenerateReqInput,
+        request: ChatCompletionRequest,
+        raw_request: Request,
+    ) -> Union[ChatCompletionResponse, ErrorResponse, ORJSONResponse]:
+        """Handle non-streaming chat completion request"""
+        try:
+            ret = await self.tokenizer_manager.generate_request(
+                adapted_request, raw_request
+            ).__anext__()
+        except ValueError as e:
+            return self.create_error_response(str(e))
+
+        if not isinstance(ret, list):
+            ret = [ret]
+
+        response = self._build_chat_response(
+            request,
+            ret,
+            int(time.time()),
+        )
+
+        return response
+
+    def _build_chat_response(
+        self,
+        request: ChatCompletionRequest,
+        ret: List[Dict[str, Any]],
+        created: int,
+    ) -> Union[ChatCompletionResponse, ORJSONResponse]:
+        """Build chat completion response from generation results"""
+        choices = []
+
+        for idx, ret_item in enumerate(ret):
+            # Process logprobs
+            choice_logprobs = None
+            if request.logprobs:
+                choice_logprobs = self._process_response_logprobs(ret_item)
+
+            # Handle hidden states
+            hidden_states = process_hidden_states_from_ret(ret_item, request)
+
+            finish_reason = ret_item["meta_info"]["finish_reason"]
+            text = ret_item["text"]
+
+            # Handle reasoning content
+            reasoning_text = None
+            reasoning_parser = self.tokenizer_manager.server_args.reasoning_parser
+            if reasoning_parser and request.separate_reasoning:
+                try:
+                    parser = ReasoningParser(
+                        model_type=reasoning_parser, stream_reasoning=False
+                    )
+                    reasoning_text, text = parser.parse_non_stream(text)
+                except Exception as e:
+                    logger.error(f"Reasoning parsing error: {e}")
+                    return self.create_error_response(
+                        "Failed to parse reasoning content",
+                        err_type="InternalServerError",
+                        status_code=500,
+                    )
+
+            # Handle tool calls
+            tool_calls = None
+            if request.tool_choice != "none" and request.tools:
+                tool_call_parser = self.tokenizer_manager.server_args.tool_call_parser
+                tool_calls, text, finish_reason = self._process_tool_calls(
+                    text, request.tools, tool_call_parser, finish_reason
+                )
+
+            choice_data = ChatCompletionResponseChoice(
+                index=idx,
+                message=ChatMessage(
+                    role="assistant",
+                    content=text if text else None,
+                    tool_calls=tool_calls,
+                    reasoning_content=reasoning_text if reasoning_text else None,
+                ),
+                logprobs=choice_logprobs,
+                finish_reason=finish_reason["type"] if finish_reason else None,
+                matched_stop=(
+                    finish_reason["matched"]
+                    if finish_reason and "matched" in finish_reason
+                    else None
+                ),
+                hidden_states=hidden_states,
+            )
+            choices.append(choice_data)
+
+        # Calculate usage
+        usage = UsageProcessor.calculate_response_usage(
+            ret,
+            n_choices=request.n,
+            enable_cache_report=self.tokenizer_manager.server_args.enable_cache_report,
+        )
+
+        return ChatCompletionResponse(
+            id=ret[0]["meta_info"]["id"],
+            created=created,
+            model=request.model,
+            choices=choices,
+            usage=usage,
+        )
+
+    def _process_logprobs_tokens(
+        self, logprobs: LogProbs, use_token_index: bool = False
+    ) -> List[ChatCompletionTokenLogprob]:
+        """Common helper to process logprobs tokens for both streaming and non-streaming
+
+        Args:
+            logprobs: LogProbs data from model
+            use_token_index: True for non-streaming (use token_idx), False for streaming (use index 0)
+        """
+        token_logprobs = []
+
+        for token_idx, (token, logprob) in enumerate(
+            zip(logprobs.tokens, logprobs.token_logprobs)
+        ):
+            token_bytes = list(token.encode("utf-8"))
+            top_logprobs = []
+            if logprobs.top_logprobs:
+                # - Non-streaming (use_token_index=True): uses token_idx for full data
+                # - Streaming (use_token_index=False): uses index 0 for pre-sliced data
+                top_logprobs_idx = token_idx if use_token_index else 0
+                for top_token, top_logprob in logprobs.top_logprobs[
+                    top_logprobs_idx
+                ].items():
+                    top_token_bytes = list(top_token.encode("utf-8"))
+                    top_logprobs.append(
+                        TopLogprob(
+                            token=top_token,
+                            bytes=top_token_bytes,
+                            logprob=top_logprob,
+                        )
+                    )
+            token_logprobs.append(
+                ChatCompletionTokenLogprob(
+                    token=token,
+                    bytes=token_bytes,
+                    logprob=logprob,
+                    top_logprobs=top_logprobs,
+                )
+            )
+
+        return token_logprobs
+
+    def _process_response_logprobs(self, ret_item: Dict[str, Any]) -> ChoiceLogprobs:
+        """Process logprobs for non-streaming response"""
+        logprobs = to_openai_style_logprobs(
+            output_token_logprobs=ret_item["meta_info"]["output_token_logprobs"],
+            output_top_logprobs=ret_item["meta_info"].get("output_top_logprobs", None),
+        )
+
+        token_logprobs = self._process_logprobs_tokens(logprobs, use_token_index=True)
+        return ChoiceLogprobs(content=token_logprobs)
+
+    def _process_tool_calls(
+        self,
+        text: str,
+        tools: List[Any],
+        tool_call_parser: Optional[str],
+        finish_reason: Dict[str, Any],
+    ) -> tuple[Optional[List[ToolCall]], str, Dict[str, Any]]:
+        """Process tool calls in the response"""
+        parser = FunctionCallParser(tools, tool_call_parser)
+        if parser.has_tool_call(text):
+            if finish_reason["type"] == "stop":
+                finish_reason["type"] = "tool_calls"
+                finish_reason["matched"] = None
+            try:
+                text, call_info_list = parser.parse_non_stream(text)
+                tool_calls = [
+                    ToolCall(
+                        id=f"call_{uuid.uuid4().hex[:24]}",
+                        function=FunctionResponse(
+                            name=call_info.name, arguments=call_info.parameters
+                        ),
+                    )
+                    for call_info in call_info_list
+                ]
+                return tool_calls, text, finish_reason
+            except Exception as e:
+                logger.error(f"Tool call parsing error: {e}")
+                # Return error but don't fail the whole request
+                return None, text, finish_reason
+
+        return None, text, finish_reason
+
+    def _process_streaming_logprobs(
+        self, content: Dict[str, Any], n_prev_token: int
+    ) -> ChoiceLogprobs:
+        """Process logprobs for streaming response"""
+        logprobs = to_openai_style_logprobs(
+            output_token_logprobs=content["meta_info"]["output_token_logprobs"][
+                n_prev_token:
+            ],
+            output_top_logprobs=content["meta_info"].get("output_top_logprobs", [])[
+                n_prev_token:
+            ],
+        )
+
+        token_logprobs = self._process_logprobs_tokens(logprobs, use_token_index=False)
+        return ChoiceLogprobs(content=token_logprobs)
+
+    def _process_reasoning_stream(
+        self,
+        index: int,
+        delta: str,
+        reasoning_parser_dict: Dict[int, ReasoningParser],
+        content: Dict[str, Any],
+        request: ChatCompletionRequest,
+    ) -> tuple[Optional[str], str]:
+        """Process reasoning content in streaming response"""
+        if index not in reasoning_parser_dict:
+            reasoning_parser_dict[index] = ReasoningParser(
+                self.tokenizer_manager.server_args.reasoning_parser,
+                request.stream_reasoning,
+            )
+        reasoning_parser = reasoning_parser_dict[index]
+        return reasoning_parser.parse_stream_chunk(delta)
+
+    def _get_enable_thinking_from_request(request: ChatCompletionRequest) -> bool:
+        """Extracts the 'enable_thinking' flag from request chat_template_kwargs.
+
+        NOTE: This parameter is only useful for models that support the
+        enable_thinking flag, such as Qwen3.
+
+        Args:
+            request: The request object (or an item from a list of requests).
+        Returns:
+            The boolean value of 'enable_thinking' if found, otherwise True.
+        """
+        if (
+            hasattr(request, "chat_template_kwargs")
+            and request.chat_template_kwargs
+            and request.chat_template_kwargs.get("enable_thinking") is not None
+        ):
+            return request.chat_template_kwargs.get("enable_thinking")
+        return True
+
+    async def _process_tool_call_stream(
+        self,
+        index: int,
+        delta: str,
+        parser_dict: Dict[int, FunctionCallParser],
+        content: Dict[str, Any],
+        request: ChatCompletionRequest,
+        finish_reason_type: Optional[str],
+    ):
+        """Process tool calls in streaming response"""
+        if index not in parser_dict:
+            parser_dict[index] = FunctionCallParser(
+                tools=request.tools,
+                tool_call_parser=self.tokenizer_manager.server_args.tool_call_parser,
+            )
+        parser = parser_dict[index]
+
+        normal_text, calls = parser.parse_stream_chunk(delta)
+
+        # Yield normal text
+        if normal_text:
+            choice_data = ChatCompletionResponseStreamChoice(
+                index=index,
+                delta=DeltaMessage(content=normal_text),
+                finish_reason=finish_reason_type,
+            )
+            chunk = ChatCompletionStreamResponse(
+                id=content["meta_info"]["id"],
+                created=int(time.time()),
+                choices=[choice_data],
+                model=request.model,
+            )
+            yield f"data: {chunk.model_dump_json()}\n\n"
+
+        # Yield tool calls
+        for call_item in calls:
+            # Tool call ID should be generated only once per tool call
+            if call_item.name:
+                # First chunk: include ID and function name
+                tool_call_id = f"call_{uuid.uuid4().hex[:24]}"
+                function_name = call_item.name
+            else:
+                # Subsequent chunks: null ID and name for argument deltas
+                tool_call_id = None
+                function_name = None
+
+            if finish_reason_type == "stop":
+                # Handle remaining arguments
+                latest_delta_len = 0
+                if isinstance(call_item.parameters, str):
+                    latest_delta_len = len(call_item.parameters)
+
+                expected_call = json.dumps(
+                    parser.detector.prev_tool_call_arr[index].get("arguments", {}),
+                    ensure_ascii=False,
+                )
+                actual_call = parser.detector.streamed_args_for_tool[index]
+                if latest_delta_len > 0:
+                    actual_call = actual_call[:-latest_delta_len]
+                remaining_call = expected_call.replace(actual_call, "", 1)
+                call_item.parameters = remaining_call
+                finish_reason_type = "tool_calls"
+
+            tool_call = ToolCall(
+                id=tool_call_id,
+                index=call_item.tool_index,
+                function=FunctionResponse(
+                    name=function_name,
+                    arguments=call_item.parameters,
+                ),
+            )
+
+            choice_data = ChatCompletionResponseStreamChoice(
+                index=index,
+                delta=DeltaMessage(tool_calls=[tool_call]),
+                finish_reason=(
+                    None
+                    if request.stream_options and request.stream_options.include_usage
+                    else finish_reason_type
+                ),
+            )
+            chunk = ChatCompletionStreamResponse(
+                id=content["meta_info"]["id"],
+                created=int(time.time()),
+                choices=[choice_data],
+                model=request.model,
+            )
+            yield f"data: {chunk.model_dump_json()}\n\n"
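The streaming path above formats every event as a `data: {json}` line followed by a blank line and ends the stream with `data: [DONE]`, so it can also be consumed without the openai client. A minimal sketch, assuming the same hypothetical local server as in the earlier example (the requests package, URL, and payload values are assumptions, not part of this diff):

```python
# Hypothetical raw SSE consumer for the output of _generate_chat_stream.
import json

import requests

payload = {
    "model": "default",
    "messages": [{"role": "user", "content": "Name three colors."}],
    "stream": True,
    "stream_options": {"include_usage": True},  # requests the final usage chunk
}

with requests.post(
    "http://localhost:30000/v1/chat/completions", json=payload, stream=True
) as resp:
    for line in resp.iter_lines(decode_unicode=True):
        # Each event arrives as a line of the form "data: {...}".
        if not line or not line.startswith("data: "):
            continue
        data = line[len("data: "):]
        if data == "[DONE]":
            break
        event = json.loads(data)
        if event.get("choices"):
            delta = event["choices"][0]["delta"].get("content")
            if delta:
                print(delta, end="", flush=True)
        elif event.get("usage"):
            # The usage-only chunk has an empty choices array, per the code above.
            print("\nusage:", event["usage"])
```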