sglang 0.4.10.post2__py3-none-any.whl → 0.5.0rc0__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (105)
  1. sglang/bench_one_batch.py +113 -17
  2. sglang/srt/configs/model_config.py +35 -0
  3. sglang/srt/conversation.py +9 -5
  4. sglang/srt/disaggregation/base/conn.py +5 -2
  5. sglang/srt/disaggregation/decode.py +6 -1
  6. sglang/srt/disaggregation/decode_schedule_batch_mixin.py +3 -0
  7. sglang/srt/disaggregation/mooncake/conn.py +243 -135
  8. sglang/srt/disaggregation/prefill.py +2 -0
  9. sglang/srt/distributed/parallel_state.py +11 -9
  10. sglang/srt/entrypoints/context.py +244 -0
  11. sglang/srt/entrypoints/engine.py +4 -3
  12. sglang/srt/entrypoints/harmony_utils.py +370 -0
  13. sglang/srt/entrypoints/http_server.py +71 -0
  14. sglang/srt/entrypoints/openai/protocol.py +227 -1
  15. sglang/srt/entrypoints/openai/serving_chat.py +278 -42
  16. sglang/srt/entrypoints/openai/serving_responses.py +1273 -0
  17. sglang/srt/entrypoints/openai/tool_server.py +174 -0
  18. sglang/srt/entrypoints/tool.py +87 -0
  19. sglang/srt/eplb/expert_location.py +5 -1
  20. sglang/srt/function_call/harmony_tool_parser.py +130 -0
  21. sglang/srt/hf_transformers_utils.py +30 -3
  22. sglang/srt/jinja_template_utils.py +8 -1
  23. sglang/srt/layers/attention/aiter_backend.py +5 -8
  24. sglang/srt/layers/attention/dual_chunk_flashattention_backend.py +1700 -0
  25. sglang/srt/layers/attention/triton_backend.py +85 -14
  26. sglang/srt/layers/attention/triton_ops/decode_attention.py +17 -0
  27. sglang/srt/layers/attention/triton_ops/extend_attention.py +143 -98
  28. sglang/srt/layers/attention/trtllm_mha_backend.py +332 -0
  29. sglang/srt/layers/attention/vision.py +13 -5
  30. sglang/srt/layers/communicator.py +21 -4
  31. sglang/srt/layers/dp_attention.py +12 -0
  32. sglang/srt/layers/linear.py +2 -7
  33. sglang/srt/layers/moe/cutlass_moe.py +20 -6
  34. sglang/srt/layers/moe/ep_moe/layer.py +77 -73
  35. sglang/srt/layers/moe/fused_moe_triton/fused_moe.py +101 -12
  36. sglang/srt/layers/moe/fused_moe_triton/layer.py +416 -35
  37. sglang/srt/layers/moe/fused_moe_triton/triton_kernels_moe.py +188 -3
  38. sglang/srt/layers/moe/topk.py +12 -3
  39. sglang/srt/layers/moe/utils.py +16 -0
  40. sglang/srt/layers/quantization/__init__.py +22 -0
  41. sglang/srt/layers/quantization/fp4.py +557 -0
  42. sglang/srt/layers/quantization/fp8.py +3 -6
  43. sglang/srt/layers/quantization/fp8_utils.py +29 -0
  44. sglang/srt/layers/quantization/modelopt_quant.py +259 -64
  45. sglang/srt/layers/quantization/mxfp4.py +651 -0
  46. sglang/srt/layers/quantization/mxfp4_tensor.py +133 -0
  47. sglang/srt/layers/quantization/quark/__init__.py +0 -0
  48. sglang/srt/layers/quantization/quark/schemes/__init__.py +6 -0
  49. sglang/srt/layers/quantization/quark/schemes/quark_scheme.py +55 -0
  50. sglang/srt/layers/quantization/quark/schemes/quark_w4a4_mxfp4.py +118 -0
  51. sglang/srt/layers/quantization/quark/utils.py +107 -0
  52. sglang/srt/layers/quantization/unquant.py +60 -6
  53. sglang/srt/layers/quantization/w4afp8.py +1 -1
  54. sglang/srt/layers/rotary_embedding.py +225 -1
  55. sglang/srt/layers/utils.py +9 -0
  56. sglang/srt/layers/vocab_parallel_embedding.py +8 -3
  57. sglang/srt/lora/lora_manager.py +70 -14
  58. sglang/srt/lora/lora_registry.py +3 -2
  59. sglang/srt/lora/mem_pool.py +43 -5
  60. sglang/srt/managers/cache_controller.py +55 -30
  61. sglang/srt/managers/detokenizer_manager.py +1 -1
  62. sglang/srt/managers/io_struct.py +15 -3
  63. sglang/srt/managers/mm_utils.py +5 -11
  64. sglang/srt/managers/schedule_batch.py +28 -7
  65. sglang/srt/managers/scheduler.py +26 -12
  66. sglang/srt/managers/scheduler_output_processor_mixin.py +1 -2
  67. sglang/srt/managers/scheduler_recv_skipper.py +37 -0
  68. sglang/srt/managers/scheduler_update_weights_mixin.py +6 -0
  69. sglang/srt/managers/template_manager.py +35 -1
  70. sglang/srt/managers/tokenizer_manager.py +24 -6
  71. sglang/srt/managers/tp_worker.py +3 -0
  72. sglang/srt/managers/tp_worker_overlap_thread.py +3 -0
  73. sglang/srt/mem_cache/hiradix_cache.py +53 -5
  74. sglang/srt/mem_cache/memory_pool_host.py +1 -1
  75. sglang/srt/mem_cache/multimodal_cache.py +33 -13
  76. sglang/srt/mem_cache/storage/hf3fs/client_hf3fs.py +2 -2
  77. sglang/srt/model_executor/cuda_graph_runner.py +7 -6
  78. sglang/srt/model_executor/forward_batch_info.py +35 -14
  79. sglang/srt/model_executor/model_runner.py +19 -2
  80. sglang/srt/model_loader/weight_utils.py +10 -0
  81. sglang/srt/models/bailing_moe.py +425 -0
  82. sglang/srt/models/deepseek_v2.py +72 -33
  83. sglang/srt/models/ernie4.py +426 -0
  84. sglang/srt/models/ernie4_eagle.py +203 -0
  85. sglang/srt/models/gemma3n_mm.py +39 -0
  86. sglang/srt/models/glm4_moe.py +24 -12
  87. sglang/srt/models/gpt_oss.py +1134 -0
  88. sglang/srt/models/qwen2.py +6 -0
  89. sglang/srt/models/qwen2_moe.py +6 -0
  90. sglang/srt/models/qwen3_moe.py +32 -6
  91. sglang/srt/models/step3_vl.py +9 -0
  92. sglang/srt/models/transformers.py +2 -5
  93. sglang/srt/multimodal/processors/step3_vl.py +3 -1
  94. sglang/srt/reasoning_parser.py +18 -39
  95. sglang/srt/server_args.py +142 -7
  96. sglang/srt/two_batch_overlap.py +157 -5
  97. sglang/srt/utils.py +38 -2
  98. sglang/test/runners.py +2 -2
  99. sglang/test/test_utils.py +1 -1
  100. sglang/version.py +1 -1
  101. {sglang-0.4.10.post2.dist-info → sglang-0.5.0rc0.dist-info}/METADATA +16 -14
  102. {sglang-0.4.10.post2.dist-info → sglang-0.5.0rc0.dist-info}/RECORD +105 -84
  103. {sglang-0.4.10.post2.dist-info → sglang-0.5.0rc0.dist-info}/WHEEL +0 -0
  104. {sglang-0.4.10.post2.dist-info → sglang-0.5.0rc0.dist-info}/licenses/LICENSE +0 -0
  105. {sglang-0.4.10.post2.dist-info → sglang-0.5.0rc0.dist-info}/top_level.txt +0 -0
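The largest addition is the new OpenAI-compatible Responses API handler, sglang/srt/entrypoints/openai/serving_responses.py (file 16 above), reproduced in full below. As a minimal sketch of what that handler serves, assuming a local sglang server exposing the OpenAI-compatible HTTP endpoint, with a placeholder base URL/port, dummy API key, and placeholder model name (none of these values come from the diff), a client call could look like:

from openai import OpenAI

# Placeholder endpoint and model name; adjust to the actual deployment.
client = OpenAI(base_url="http://localhost:30000/v1", api_key="EMPTY")

# POST /v1/responses, handled by OpenAIServingResponses.create_responses below.
resp = client.responses.create(
    model="openai/gpt-oss-20b",
    input="Briefly explain what a KV cache is.",
)
print(resp.output_text)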
sglang/srt/entrypoints/openai/serving_responses.py (new file)
@@ -0,0 +1,1273 @@
1
+ # SPDX-License-Identifier: Apache-2.0
2
+ # Adapted from vLLM's OpenAIServingResponses
3
+ """Handler for /v1/responses requests"""
4
+
5
+ import asyncio
6
+ import copy
7
+ import json
8
+ import logging
9
+ import time
10
+ from contextlib import AsyncExitStack
11
+ from http import HTTPStatus
12
+ from typing import Any, AsyncGenerator, AsyncIterator, Optional, Union
13
+
14
+ import jinja2
15
+ import openai.types.responses as openai_responses_types
16
+ from fastapi import Request
17
+ from fastapi.responses import ORJSONResponse
18
+ from openai.types.responses import (
19
+ ResponseOutputMessage,
20
+ ResponseOutputText,
21
+ ResponseReasoningItem,
22
+ )
23
+ from openai.types.responses.response_function_tool_call import ResponseFunctionToolCall
24
+ from openai.types.responses.response_reasoning_item import (
25
+ Content as ResponseReasoningTextContent,
26
+ )
27
+ from openai_harmony import Message as OpenAIMessage
28
+
29
+ from sglang.srt.entrypoints.context import (
30
+ ConversationContext,
31
+ HarmonyContext,
32
+ SimpleContext,
33
+ StreamingHarmonyContext,
34
+ )
35
+ from sglang.srt.entrypoints.harmony_utils import (
36
+ get_developer_message,
37
+ get_stop_tokens_for_assistant_actions,
38
+ get_system_message,
39
+ get_user_message,
40
+ parse_output_message,
41
+ parse_remaining_state,
42
+ parse_response_input,
43
+ render_for_completion,
44
+ )
45
+ from sglang.srt.entrypoints.openai.protocol import (
46
+ ChatCompletionMessageParam,
47
+ ChatCompletionRequest,
48
+ PromptTokenUsageInfo,
49
+ RequestResponseMetadata,
50
+ ResponsesRequest,
51
+ ResponsesResponse,
52
+ UsageInfo,
53
+ )
54
+ from sglang.srt.entrypoints.openai.serving_chat import OpenAIServingChat
55
+ from sglang.srt.entrypoints.openai.tool_server import MCPToolServer, ToolServer
56
+ from sglang.srt.managers.io_struct import GenerateReqInput
57
+ from sglang.srt.managers.template_manager import TemplateManager
58
+ from sglang.srt.managers.tokenizer_manager import TokenizerManager
59
+ from sglang.srt.reasoning_parser import ReasoningParser
60
+ from sglang.srt.utils import random_uuid
61
+
62
+ logger = logging.getLogger(__name__)
63
+
64
+
65
+ class OpenAIServingResponses(OpenAIServingChat):
66
+ """Handler for /v1/responses requests"""
67
+
68
+ def __init__(
69
+ self,
70
+ tokenizer_manager: TokenizerManager,
71
+ template_manager: TemplateManager,
72
+ *,
73
+ enable_prompt_tokens_details: bool = False,
74
+ enable_force_include_usage: bool = False,
75
+ tool_server: Optional[ToolServer] = None,
76
+ ) -> None:
77
+ super().__init__(tokenizer_manager, template_manager)
78
+
79
+ # template_manager is already set by parent class
80
+ self.reasoning_parser = self.tokenizer_manager.server_args.reasoning_parser
81
+ self.enable_prompt_tokens_details = enable_prompt_tokens_details
82
+ self.enable_force_include_usage = enable_force_include_usage
83
+
84
+ # Get default sampling params from model config if available
85
+ self.default_sampling_params = {}
86
+
87
+ self.supports_browsing = (
88
+ tool_server.has_tool("browser") if tool_server else False
89
+ )
90
+ self.supports_code_interpreter = (
91
+ tool_server.has_tool("python") if tool_server else False
92
+ )
93
+ self.tool_server = tool_server
94
+ # Get from model config
95
+ self.use_harmony = (
96
+ self.tokenizer_manager.model_config.hf_config.model_type == "gpt_oss"
97
+ )
98
+
99
+ if self.use_harmony:
100
+ # OpenAI models have two EOS-like tokens: <|return|> and <|call|>.
101
+ # We need to add them to the stop token ids.
102
+ if "stop_token_ids" not in self.default_sampling_params:
103
+ self.default_sampling_params["stop_token_ids"] = []
104
+ self.default_sampling_params["stop_token_ids"].extend(
105
+ get_stop_tokens_for_assistant_actions()
106
+ )
107
+
108
+ # Response storage for background and retrieval operations
109
+ # Note: In production, this should use a proper storage backend (Redis, database)
110
+ # with TTL/expiration to prevent memory leaks
111
+ self.response_store: dict[str, ResponsesResponse] = {}
112
+ self.response_store_lock = asyncio.Lock()
113
+
114
+ # Message storage for conversation continuity
115
+ # Note: In production, this should use a proper storage backend (Redis, database)
116
+ # with TTL/expiration to prevent memory leaks
117
+ self.msg_store: dict[
118
+ str, Union[list[ChatCompletionMessageParam], list["OpenAIMessage"]]
119
+ ] = {}
120
+
121
+ self.background_tasks: dict[str, asyncio.Task] = {}
122
+
123
+ def _request_id_prefix(self) -> str:
124
+ return "resp_"
125
+
126
+ async def create_responses(
127
+ self,
128
+ request: ResponsesRequest,
129
+ raw_request: Optional[Request] = None,
130
+ ) -> Union[AsyncGenerator[str, None], ResponsesResponse, ORJSONResponse]:
131
+ # Validate model
132
+ if not self.tokenizer_manager:
133
+ return self.create_error_response("Model not loaded")
134
+
135
+ # FIXME: If the engine is dead, raise an error
136
+ # This is required for the streaming case
137
+
138
+ # Handle the previous response ID
139
+ prev_response_id = request.previous_response_id
140
+ if prev_response_id is not None:
141
+ if not prev_response_id.startswith("resp_"):
142
+ return self._make_invalid_id_error(prev_response_id)
143
+ async with self.response_store_lock:
144
+ prev_response = self.response_store.get(prev_response_id)
145
+ if prev_response is None:
146
+ return self._make_not_found_error(prev_response_id)
147
+ else:
148
+ prev_response = None
149
+
150
+ try:
151
+ model_name = request.model
152
+ tokenizer = self.tokenizer_manager.tokenizer
153
+
154
+ if self.use_harmony:
155
+ messages, request_prompts, engine_prompts = (
156
+ self._make_request_with_harmony(request, prev_response)
157
+ )
158
+ else:
159
+ messages, request_prompts, engine_prompts = await self._make_request(
160
+ request, prev_response, tokenizer
161
+ )
162
+
163
+ except (ValueError, TypeError, RuntimeError, jinja2.TemplateError) as e:
164
+ logger.exception("Error in preprocessing prompt inputs")
165
+ return self.create_error_response(f"{e} {e.__cause__}")
166
+
167
+ request_metadata = RequestResponseMetadata(request_id=request.request_id)
168
+ if raw_request:
169
+ raw_request.state.request_metadata = request_metadata
170
+
171
+ if (
172
+ self.tool_server is not None
173
+ and isinstance(self.tool_server, MCPToolServer)
174
+ and (request.background or request.stream)
175
+ and request.tools
176
+ and any(
177
+ tool.type in ["web_search_preview", "code_interpreter"]
178
+ for tool in request.tools
179
+ )
180
+ ):
181
+ return self.create_error_response(
182
+ "MCP tool server is not supported in background mode and "
183
+ "streaming mode"
184
+ )
185
+
186
+ # Schedule the request and get the result generator
187
+ generators: list[AsyncGenerator[Any, None]] = []
188
+ tool_list = []
189
+ if self.use_harmony:
190
+ if self.supports_browsing:
191
+ tool_list.append("browser")
192
+ if self.supports_code_interpreter:
193
+ tool_list.append("python")
194
+ async with AsyncExitStack() as exit_stack:
195
+ try:
196
+ if self.tool_server is not None:
197
+ tool_session_ctxs: dict[str, Any] = {
198
+ tool_name: exit_stack.enter_async_context(
199
+ self.tool_server.get_tool_session(tool_name)
200
+ )
201
+ for tool_name in tool_list
202
+ }
203
+ tool_sessions = {}
204
+ for tool_name in tool_list:
205
+ tool_sessions[tool_name] = await tool_session_ctxs[tool_name]
206
+ else:
207
+ assert len(tool_list) == 0
208
+ tool_sessions = {}
209
+ for i, engine_prompt in enumerate(engine_prompts):
210
+ # Calculate default max tokens from context length minus prompt length
211
+ if hasattr(engine_prompt, "__len__"):
212
+ prompt_length = len(engine_prompt)
213
+ elif isinstance(engine_prompt, list):
214
+ prompt_length = len(engine_prompt)
215
+ else:
216
+ prompt_length = 0
217
+
218
+ context_len = (
219
+ self.tokenizer_manager.model_config.context_len
220
+ if hasattr(self.tokenizer_manager.model_config, "context_len")
221
+ else 4096
222
+ )
223
+ default_max_tokens = max(
224
+ context_len - prompt_length, 512
225
+ ) # Ensure minimum 512 tokens
226
+ sampling_params = request.to_sampling_params(
227
+ default_max_tokens, self.default_sampling_params
228
+ )
229
+
230
+ context: ConversationContext
231
+ if self.use_harmony:
232
+ if request.stream:
233
+ context = StreamingHarmonyContext(messages, tool_sessions)
234
+ else:
235
+ context = HarmonyContext(messages, tool_sessions)
236
+ else:
237
+ context = SimpleContext()
238
+
239
+ # Create GenerateReqInput for SGLang
240
+ adapted_request = GenerateReqInput(
241
+ input_ids=engine_prompt,
242
+ sampling_params=sampling_params,
243
+ stream=request.stream,
244
+ rid=request.request_id,
245
+ background=request.background,
246
+ )
247
+
248
+ generator = self._generate_with_builtin_tools(
249
+ request.request_id,
250
+ request_prompts[i],
251
+ adapted_request,
252
+ sampling_params,
253
+ context,
254
+ raw_request=raw_request,
255
+ priority=request.priority,
256
+ )
257
+ generators.append(generator)
258
+ except ValueError as e:
259
+ return self.create_error_response(str(e))
260
+
261
+ assert len(generators) == 1
262
+ (result_generator,) = generators
263
+
264
+ # Store the input messages
265
+ if request.store:
266
+ self.msg_store[request.request_id] = messages
267
+
268
+ if request.background:
269
+ created_time = int(time.time())
270
+ response = ResponsesResponse.from_request(
271
+ request,
272
+ sampling_params,
273
+ model_name=model_name,
274
+ created_time=created_time,
275
+ output=[],
276
+ status="queued",
277
+ usage=None,
278
+ )
279
+ async with self.response_store_lock:
280
+ self.response_store[response.id] = response
281
+
282
+ # Run the request in the background
283
+ task = asyncio.create_task(
284
+ self._run_background_request(
285
+ request,
286
+ sampling_params,
287
+ result_generator,
288
+ context,
289
+ model_name,
290
+ tokenizer,
291
+ request_metadata,
292
+ created_time,
293
+ ),
294
+ name=f"create_{response.id}",
295
+ )
296
+
297
+ # For cleanup
298
+ self.background_tasks[response.id] = task
299
+ task.add_done_callback(
300
+ lambda _: self.background_tasks.pop(response.id, None)
301
+ )
302
+ return response
303
+
304
+ if request.stream:
305
+ return self.responses_stream_generator(
306
+ request,
307
+ sampling_params,
308
+ result_generator,
309
+ context,
310
+ model_name,
311
+ tokenizer,
312
+ request_metadata,
313
+ )
314
+ try:
315
+ result: Union[ORJSONResponse, ResponsesResponse] = (
316
+ await self.responses_full_generator(
317
+ request,
318
+ sampling_params,
319
+ result_generator,
320
+ context,
321
+ model_name,
322
+ tokenizer,
323
+ request_metadata,
324
+ )
325
+ )
326
+ return result
327
+ except Exception as e:
328
+ return self.create_error_response(str(e))
329
+ return self.create_error_response("Unknown error")
330
+
331
+ async def _make_request(
332
+ self,
333
+ request: ResponsesRequest,
334
+ prev_response: Optional[ResponsesResponse],
335
+ tokenizer: Any,
336
+ ):
337
+ # Construct the input messages
338
+ messages = self._construct_input_messages(request, prev_response)
339
+
340
+ # Follow SGLang's pattern: create a ChatCompletionRequest and process messages
341
+ try:
342
+ # Convert ResponsesRequest to ChatCompletionRequest for processing
343
+ chat_request = ChatCompletionRequest(
344
+ model=request.model,
345
+ messages=messages,
346
+ stream=request.stream,
347
+ )
348
+
349
+ # Follow SGLang's _process_messages pattern
350
+ is_multimodal = self.tokenizer_manager.model_config.is_multimodal
351
+ processed_messages = self._process_messages(chat_request, is_multimodal)
352
+
353
+ # Extract the results
354
+ if is_multimodal:
355
+ request_prompts = [processed_messages.prompt]
356
+ engine_prompts = [processed_messages.prompt]
357
+ else:
358
+ request_prompts = [processed_messages.prompt_ids]
359
+ engine_prompts = [processed_messages.prompt_ids]
360
+
361
+ except Exception as e:
362
+ logger.warning(f"Chat processing failed, using fallback: {e}")
363
+ # Fallback to simple encoding
364
+ prompt_text = ""
365
+ for msg in messages:
366
+ role = msg.get("role", "user")
367
+ content = msg.get("content", "")
368
+ prompt_text += f"{role}: {content}\n"
369
+ prompt_ids = tokenizer.encode(prompt_text)
370
+ request_prompts = [prompt_ids]
371
+ engine_prompts = [prompt_ids]
372
+
373
+ return messages, request_prompts, engine_prompts
374
+
375
+ def _make_request_with_harmony(
376
+ self,
377
+ request: ResponsesRequest,
378
+ prev_response: Optional[ResponsesResponse],
379
+ ):
380
+ if request.tool_choice != "auto":
381
+ raise NotImplementedError(
382
+ "Only 'auto' tool_choice is supported in " "response API"
383
+ )
384
+ messages = self._construct_input_messages_with_harmony(request, prev_response)
385
+ prompt_token_ids = render_for_completion(messages)
386
+ engine_prompt = prompt_token_ids
387
+ return messages, [prompt_token_ids], [engine_prompt]
388
+
389
+ async def responses_full_generator(
390
+ self,
391
+ request: ResponsesRequest,
392
+ sampling_params: Any,
393
+ result_generator: AsyncIterator[Any],
394
+ context: ConversationContext,
395
+ model_name: str,
396
+ tokenizer: Any,
397
+ request_metadata: RequestResponseMetadata,
398
+ created_time: Optional[int] = None,
399
+ ) -> Union[ResponsesResponse, ORJSONResponse]:
400
+ if created_time is None:
401
+ created_time = int(time.time())
402
+
403
+ try:
404
+ async for _ in result_generator:
405
+ pass
406
+ except asyncio.CancelledError:
407
+ return self.create_error_response("Client disconnected")
408
+ except ValueError as e:
409
+ return self.create_error_response(str(e))
410
+
411
+ if self.use_harmony:
412
+ assert isinstance(context, HarmonyContext)
413
+ output = self._make_response_output_items_with_harmony(context)
414
+ # TODO: these are all 0 for now!
415
+ num_prompt_tokens = context.num_prompt_tokens
416
+ num_generated_tokens = context.num_output_tokens
417
+ num_cached_tokens = context.num_cached_tokens
418
+ num_reasoning_tokens = context.num_reasoning_tokens
419
+ else:
420
+ assert isinstance(context, SimpleContext)
421
+ final_res = context.last_output
422
+ assert final_res is not None
423
+
424
+ output = self._make_response_output_items(
425
+ request, final_res["text"], tokenizer
426
+ )
427
+
428
+ # Calculate usage from actual output
429
+ if hasattr(final_res, "meta_info"):
430
+ num_prompt_tokens = final_res.meta_info.get("prompt_tokens", 0)
431
+ num_generated_tokens = final_res.meta_info.get("completion_tokens", 0)
432
+ num_cached_tokens = final_res.meta_info.get("cached_tokens", 0)
433
+ elif hasattr(final_res, "prompt_token_ids") and hasattr(
434
+ final_res, "outputs"
435
+ ):
436
+ # Fallback calculation if meta_info not available
437
+ num_prompt_tokens = (
438
+ len(final_res.prompt_token_ids) if final_res.prompt_token_ids else 0
439
+ )
440
+ num_generated_tokens = (
441
+ len(final_res.outputs[0].token_ids)
442
+ if final_res.outputs and final_res.outputs[0].token_ids
443
+ else 0
444
+ )
445
+ num_cached_tokens = getattr(final_res, "num_cached_tokens", 0)
446
+ num_reasoning_tokens = 0
447
+ else:
448
+ # Final fallback
449
+ num_prompt_tokens = 0
450
+ num_generated_tokens = 0
451
+ num_cached_tokens = 0
452
+ num_reasoning_tokens = 0
453
+
454
+ usage = UsageInfo(
455
+ prompt_tokens=num_prompt_tokens,
456
+ completion_tokens=num_generated_tokens,
457
+ total_tokens=num_prompt_tokens + num_generated_tokens,
458
+ reasoning_tokens=num_reasoning_tokens,
459
+ )
460
+ if self.enable_prompt_tokens_details and num_cached_tokens:
461
+ usage.prompt_tokens_details = PromptTokenUsageInfo(
462
+ cached_tokens=num_cached_tokens
463
+ )
464
+ request_metadata.final_usage_info = usage
465
+
466
+ response = ResponsesResponse.from_request(
467
+ request,
468
+ sampling_params,
469
+ model_name=model_name,
470
+ created_time=created_time,
471
+ output=output,
472
+ status="completed",
473
+ usage=usage,
474
+ )
475
+
476
+ if request.store:
477
+ async with self.response_store_lock:
478
+ stored_response = self.response_store.get(response.id)
479
+ # If the response is already cancelled, don't update it
480
+ if stored_response is None or stored_response.status != "cancelled":
481
+ self.response_store[response.id] = response
482
+
483
+ return response
484
+
485
+ def _make_response_output_items(
486
+ self,
487
+ request: ResponsesRequest,
488
+ final_output: Any,
489
+ tokenizer: Any,
490
+ ):
491
+ # Handle reasoning parsing if enabled
492
+ if self.reasoning_parser:
493
+ # Use standard reasoning parser (openai maps to T4Detector internally)
494
+ reasoning_parser = ReasoningParser(
495
+ model_type=self.reasoning_parser, stream_reasoning=False
496
+ )
497
+ reasoning_content, content = reasoning_parser.parse_non_stream(final_output)
498
+ else:
499
+ reasoning_content = None
500
+ content = final_output
501
+
502
+ output_items = []
503
+ if reasoning_content:
504
+ reasoning_item = ResponseReasoningItem(
505
+ id=f"rs_{random_uuid()}",
506
+ type="reasoning",
507
+ summary=[],
508
+ content=[
509
+ ResponseReasoningTextContent(
510
+ type="reasoning_text", text=reasoning_content
511
+ ),
512
+ ],
513
+ status=None,
514
+ )
515
+ output_items.append(reasoning_item)
516
+ if content:
517
+ output_text = ResponseOutputText(
518
+ text=content,
519
+ annotations=[], # TODO
520
+ type="output_text",
521
+ logprobs=None, # TODO
522
+ )
523
+ message = ResponseOutputMessage(
524
+ id=f"msg_{random_uuid()}",
525
+ content=[output_text],
526
+ role="assistant",
527
+ status="completed",
528
+ type="message",
529
+ )
530
+ output_items.append(message)
531
+ return output_items
532
+
533
+ def _make_response_output_items_with_harmony(
534
+ self,
535
+ context: HarmonyContext,
536
+ ):
537
+ output_items = []
538
+ num_init_messages = context.num_init_messages
539
+ for msg in context.messages[num_init_messages:]:
540
+ output_items.extend(parse_output_message(msg))
541
+ # Handle the generation stopped in the middle (if any).
542
+ last_items = parse_remaining_state(context.parser)
543
+ if last_items:
544
+ output_items.extend(last_items)
545
+ return output_items
546
+
547
+ def _construct_input_messages(
548
+ self,
549
+ request: ResponsesRequest,
550
+ prev_response: Optional[ResponsesResponse] = None,
551
+ ) -> list[ChatCompletionMessageParam]:
552
+ messages: list[ChatCompletionMessageParam] = []
553
+ if request.instructions:
554
+ messages.append(
555
+ {
556
+ "role": "system",
557
+ "content": request.instructions,
558
+ }
559
+ )
560
+
561
+ # Prepend the conversation history
562
+ if prev_response is not None:
563
+ # Add the previous messages
564
+ prev_msg = self.msg_store[prev_response.id]
565
+ messages.extend(prev_msg)
566
+
567
+ # Add the previous output
568
+ for output_item in prev_response.output:
569
+ # NOTE: We skip the reasoning output of the previous response
570
+ if isinstance(output_item, ResponseReasoningItem):
571
+ continue
572
+ for content in output_item.content:
573
+ messages.append(
574
+ {
575
+ "role": "system",
576
+ "content": request.instructions,
577
+ }
578
+ )
579
+
580
+ # Append the new input
581
+ # Responses API supports simple text inputs without chat format
582
+ if isinstance(request.input, str):
583
+ messages.append({"role": "user", "content": request.input})
584
+ else:
585
+ messages.extend(request.input) # type: ignore
586
+ return messages
587
+
588
+ def _construct_input_messages_with_harmony(
589
+ self,
590
+ request: ResponsesRequest,
591
+ prev_response: Optional[ResponsesResponse],
592
+ ) -> list["OpenAIMessage"]:
593
+ messages: list["OpenAIMessage"] = []
594
+ if prev_response is None:
595
+ # New conversation.
596
+ reasoning_effort = request.reasoning.effort if request.reasoning else None
597
+ tool_types = [tool.type for tool in request.tools]
598
+ enable_browser = (
599
+ "web_search_preview" in tool_types and self.tool_server is not None
600
+ )
601
+ enable_code_interpreter = (
602
+ "code_interpreter" in tool_types and self.tool_server is not None
603
+ )
604
+ sys_msg = get_system_message(
605
+ reasoning_effort=reasoning_effort,
606
+ browser_description=(
607
+ self.tool_server.get_tool_description("browser")
608
+ if self.tool_server and enable_browser
609
+ else None
610
+ ),
611
+ python_description=(
612
+ self.tool_server.get_tool_description("python")
613
+ if self.tool_server and enable_code_interpreter
614
+ else None
615
+ ),
616
+ )
617
+ messages.append(sys_msg)
618
+ dev_msg = get_developer_message(request.instructions, request.tools)
619
+ messages.append(dev_msg)
620
+ else:
621
+ # Continue the previous conversation.
622
+ # FIXME: Currently, request params like reasoning and
623
+ # instructions are ignored.
624
+ prev_msgs = self.msg_store[prev_response.id]
625
+ # Remove the previous chain-of-thoughts if there is a new "final"
626
+ # message.
627
+ if (
628
+ len(prev_msgs) > 0
629
+ and hasattr(prev_msgs[-1], "channel")
630
+ and prev_msgs[-1].channel == "final"
631
+ ): # type: ignore[union-attr]
632
+ prev_final_msg_idx = -1
633
+ for i in range(len(prev_msgs) - 2, -1, -1):
634
+ if (
635
+ hasattr(prev_msgs[i], "channel")
636
+ and prev_msgs[i].channel == "final"
637
+ ): # type: ignore[union-attr]
638
+ prev_final_msg_idx = i
639
+ break
640
+ recent_turn_msgs = prev_msgs[prev_final_msg_idx + 1 :]
641
+ del prev_msgs[prev_final_msg_idx + 1 :]
642
+ for msg in recent_turn_msgs:
643
+ if (
644
+ hasattr(msg, "channel") and msg.channel != "analysis"
645
+ ): # type: ignore[union-attr]
646
+ prev_msgs.append(msg)
647
+ messages.extend(prev_msgs)
648
+ # Append the new input.
649
+ # Responses API supports simple text inputs without chat format.
650
+ if isinstance(request.input, str):
651
+ messages.append(get_user_message(request.input))
652
+ else:
653
+ if prev_response is not None:
654
+ prev_outputs = copy.copy(prev_response.output)
655
+ else:
656
+ prev_outputs = []
657
+ for response_msg in request.input:
658
+ messages.append(parse_response_input(response_msg, prev_outputs))
659
+ if isinstance(response_msg, ResponseFunctionToolCall):
660
+ prev_outputs.append(response_msg)
661
+ return messages
662
+
663
+ async def _run_background_request(
664
+ self,
665
+ request: ResponsesRequest,
666
+ sampling_params: Any,
667
+ result_generator: AsyncIterator[Any],
668
+ context: ConversationContext,
669
+ model_name: str,
670
+ tokenizer: Any,
671
+ request_metadata: RequestResponseMetadata,
672
+ created_time: Optional[int] = None,
673
+ *args,
674
+ **kwargs,
675
+ ):
676
+ try:
677
+ # Update the status to "in_progress"
678
+ async with self.response_store_lock:
679
+ stored_response = self.response_store.get(request.request_id)
680
+ assert stored_response is not None
681
+ stored_response.status = "in_progress"
682
+
683
+ response = await self.responses_full_generator(
684
+ request,
685
+ sampling_params,
686
+ result_generator,
687
+ context,
688
+ model_name,
689
+ tokenizer,
690
+ request_metadata,
691
+ created_time,
692
+ *args,
693
+ **kwargs,
694
+ )
695
+ except Exception as e:
696
+ logger.exception("Background request failed for %s", request.request_id)
697
+ response = self.create_error_response(str(e))
698
+
699
+ if isinstance(response, ORJSONResponse):
700
+ # If the request has failed, update the status to "failed"
701
+ response_id = request.request_id
702
+ async with self.response_store_lock:
703
+ stored_response = self.response_store.get(response_id)
704
+ assert stored_response is not None
705
+ if stored_response.status not in ("completed", "cancelled"):
706
+ stored_response.status = "failed"
707
+
708
+ async def retrieve_responses(
709
+ self,
710
+ response_id: str,
711
+ ) -> Union[ResponsesResponse, ORJSONResponse]:
712
+ if not response_id.startswith("resp_"):
713
+ return self._make_invalid_id_error(response_id)
714
+
715
+ async with self.response_store_lock:
716
+ response = self.response_store.get(response_id)
717
+
718
+ if response is None:
719
+ return self._make_not_found_error(response_id)
720
+ return response
721
+
722
+ async def cancel_responses(
723
+ self,
724
+ response_id: str,
725
+ ) -> Union[ResponsesResponse, ORJSONResponse]:
726
+ if not response_id.startswith("resp_"):
727
+ return self._make_invalid_id_error(response_id)
728
+
729
+ async with self.response_store_lock:
730
+ response = self.response_store.get(response_id)
731
+ if response is None:
732
+ return self._make_not_found_error(response_id)
733
+
734
+ prev_status = response.status
735
+ if prev_status not in ("queued", "in_progress"):
736
+ return self.create_error_response(
737
+ err_type="invalid_request_error",
738
+ message="Cannot cancel a synchronous response.",
739
+ )
740
+
741
+ # Update the status to "cancelled"
742
+ response.status = "cancelled"
743
+
744
+ # Abort the request
745
+ if task := self.background_tasks.get(response_id):
746
+ task.cancel()
747
+ try:
748
+ await task
749
+ except asyncio.CancelledError:
750
+ logger.exception("Background task for %s was cancelled", response_id)
751
+ return response
752
+
753
+ def _make_invalid_id_error(self, response_id: str):
754
+ return self.create_error_response(
755
+ message=(
756
+ f"Invalid 'response_id': '{response_id}'. "
757
+ "Expected an ID that begins with 'resp'."
758
+ ),
759
+ err_type="invalid_request_error",
760
+ param="response_id",
761
+ )
762
+
763
+ def _make_not_found_error(self, response_id: str):
764
+ return self.create_error_response(
765
+ message=f"Response with id '{response_id}' not found.",
766
+ err_type="invalid_request_error",
767
+ status_code=HTTPStatus.NOT_FOUND,
768
+ param="response_id",
769
+ )
770
+
771
+ async def responses_stream_generator(
772
+ self,
773
+ request: ResponsesRequest,
774
+ sampling_params: Any,
775
+ result_generator: AsyncIterator[StreamingHarmonyContext],
776
+ context: StreamingHarmonyContext,
777
+ model_name: str,
778
+ tokenizer: Any,
779
+ request_metadata: RequestResponseMetadata,
780
+ created_time: Optional[int] = None,
781
+ ) -> AsyncGenerator[str, None]:
782
+ # TODO:
783
+ # 1. Handle disconnect
784
+
785
+ created_time = created_time or int(time.time())
786
+
787
+ sequence_number = 0
788
+
789
+ def _send_event(event):
790
+ nonlocal sequence_number
791
+ # Set sequence_number if the event has this attribute
792
+ if hasattr(event, "sequence_number"):
793
+ event.sequence_number = sequence_number
794
+ sequence_number += 1
795
+ # Get event type from the event's type field if it exists
796
+ event_type = getattr(event, "type", "unknown")
797
+ return (
798
+ f"event: {event_type}\n"
799
+ f"data: {event.model_dump_json(indent=None)}\n\n"
800
+ )
801
+
802
+ current_content_index = 0
803
+ current_output_index = 0
804
+ current_item_id = f"item_{random_uuid()}"
805
+ sent_output_item_added = False
806
+
807
+ initial_response = ResponsesResponse.from_request(
808
+ request,
809
+ sampling_params,
810
+ model_name=model_name,
811
+ created_time=created_time,
812
+ output=[],
813
+ status="in_progress",
814
+ usage=None,
815
+ ).model_dump()
816
+ yield _send_event(
817
+ openai_responses_types.ResponseCreatedEvent(
818
+ type="response.created",
819
+ sequence_number=-1,
820
+ response=initial_response,
821
+ )
822
+ )
823
+ yield _send_event(
824
+ openai_responses_types.ResponseInProgressEvent(
825
+ type="response.in_progress",
826
+ sequence_number=-1,
827
+ response=initial_response,
828
+ )
829
+ )
830
+
831
+ async for ctx in result_generator:
832
+
833
+ if ctx.is_expecting_start():
834
+ current_output_index += 1
835
+ sent_output_item_added = False
836
+
837
+ if len(ctx.parser.messages) > 0:
838
+ previous_item = ctx.parser.messages[-1]
839
+ if previous_item.recipient is not None:
840
+ # Deal with tool call here
841
+ pass
842
+ elif previous_item.channel == "analysis":
843
+ reasoning_item = ResponseReasoningItem(
844
+ id=f"rs_{random_uuid()}",
845
+ type="reasoning",
846
+ summary=[],
847
+ content=[
848
+ ResponseReasoningTextContent(
849
+ text=previous_item.content[0].text,
850
+ type="reasoning_text",
851
+ ),
852
+ ],
853
+ status="completed",
854
+ )
855
+ yield _send_event(
856
+ openai_responses_types.ResponseReasoningTextDoneEvent(
857
+ type="response.reasoning_text.done",
858
+ item_id=current_item_id,
859
+ sequence_number=-1,
860
+ output_index=current_output_index,
861
+ content_index=current_content_index,
862
+ text=previous_item.content[0].text,
863
+ )
864
+ )
865
+ yield _send_event(
866
+ openai_responses_types.ResponseOutputItemDoneEvent(
867
+ type="response.output_item.done",
868
+ sequence_number=-1,
869
+ output_index=current_output_index,
870
+ item=reasoning_item,
871
+ )
872
+ )
873
+ elif previous_item.channel == "final":
874
+ text_content = openai_responses_types.ResponseOutputText(
875
+ type="output_text",
876
+ text=previous_item.content[0].text,
877
+ annotations=[],
878
+ )
879
+ yield _send_event(
880
+ openai_responses_types.ResponseTextDoneEvent(
881
+ type="response.output_text.done",
882
+ sequence_number=-1,
883
+ output_index=current_output_index,
884
+ content_index=current_content_index,
885
+ text=previous_item.content[0].text,
886
+ logprobs=[],
887
+ item_id=current_item_id,
888
+ )
889
+ )
890
+ yield _send_event(
891
+ openai_responses_types.ResponseContentPartDoneEvent(
892
+ type="response.content_part.done",
893
+ sequence_number=-1,
894
+ item_id=current_item_id,
895
+ output_index=current_output_index,
896
+ content_index=current_content_index,
897
+ part=text_content,
898
+ )
899
+ )
900
+ yield _send_event(
901
+ openai_responses_types.ResponseOutputItemDoneEvent(
902
+ type="response.output_item.done",
903
+ sequence_number=-1,
904
+ output_index=current_output_index,
905
+ item=openai_responses_types.ResponseOutputMessage(
906
+ id=current_item_id,
907
+ type="message",
908
+ role="assistant",
909
+ content=[text_content],
910
+ status="completed",
911
+ ),
912
+ )
913
+ )
914
+
915
+ if ctx.parser.last_content_delta:
916
+ if (
917
+ ctx.parser.current_channel == "final"
918
+ and ctx.parser.current_recipient is None
919
+ ):
920
+ if not sent_output_item_added:
921
+ sent_output_item_added = True
922
+ yield _send_event(
923
+ openai_responses_types.ResponseOutputItemAddedEvent(
924
+ type="response.output_item.added",
925
+ sequence_number=-1,
926
+ output_index=current_output_index,
927
+ item=openai_responses_types.ResponseOutputMessage(
928
+ id=current_item_id,
929
+ type="message",
930
+ role="assistant",
931
+ content=[],
932
+ status="in_progress",
933
+ ),
934
+ )
935
+ )
936
+ yield _send_event(
937
+ openai_responses_types.ResponseContentPartAddedEvent(
938
+ type="response.content_part.added",
939
+ sequence_number=-1,
940
+ output_index=current_output_index,
941
+ item_id=current_item_id,
942
+ content_index=current_content_index,
943
+ part=openai_responses_types.ResponseOutputText(
944
+ type="output_text",
945
+ text="",
946
+ annotations=[],
947
+ logprobs=[],
948
+ ),
949
+ )
950
+ )
951
+ yield _send_event(
952
+ openai_responses_types.ResponseTextDeltaEvent(
953
+ type="response.output_text.delta",
954
+ sequence_number=-1,
955
+ content_index=current_content_index,
956
+ output_index=current_output_index,
957
+ item_id=current_item_id,
958
+ delta=ctx.parser.last_content_delta,
959
+ # TODO, use logprobs from ctx.last_request_output
960
+ logprobs=[],
961
+ )
962
+ )
963
+ elif (
964
+ ctx.parser.current_channel == "analysis"
965
+ and ctx.parser.current_recipient is None
966
+ ):
967
+ if not sent_output_item_added:
968
+ sent_output_item_added = True
969
+ yield _send_event(
970
+ openai_responses_types.ResponseOutputItemAddedEvent(
971
+ type="response.output_item.added",
972
+ sequence_number=-1,
973
+ output_index=current_output_index,
974
+ item=openai_responses_types.ResponseReasoningItem(
975
+ type="reasoning",
976
+ id=current_item_id,
977
+ summary=[],
978
+ status="in_progress",
979
+ ),
980
+ )
981
+ )
982
+ yield _send_event(
983
+ openai_responses_types.ResponseContentPartAddedEvent(
984
+ type="response.content_part.added",
985
+ sequence_number=-1,
986
+ output_index=current_output_index,
987
+ item_id=current_item_id,
988
+ content_index=current_content_index,
989
+ # TODO: migrate this to
990
+ # ResponseReasoningTextContent for now
991
+ part=openai_responses_types.ResponseOutputText(
992
+ type="output_text",
993
+ text="",
994
+ annotations=[],
995
+ logprobs=[],
996
+ ),
997
+ )
998
+ )
999
+ # TODO: migrate to OpenAI types once updated.
1000
+ yield _send_event(
1001
+ openai_responses_types.ResponseReasoningTextDeltaEvent(
1002
+ type="response.reasoning_text.delta",
1003
+ item_id=current_item_id,
1004
+ output_index=current_output_index,
1005
+ content_index=current_content_index,
1006
+ delta=ctx.parser.last_content_delta,
1007
+ sequence_number=-1,
1008
+ )
1009
+ )
1010
+
1011
+ if ctx.is_assistant_action_turn() and len(ctx.parser.messages) > 0:
1012
+ previous_item = ctx.parser.messages[-1]
1013
+ if (
1014
+ self.supports_browsing
1015
+ and previous_item.recipient is not None
1016
+ and previous_item.recipient.startswith("browser.")
1017
+ ):
1018
+ function_name = previous_item.recipient[len("browser.") :]
1019
+ action = None
1020
+ parsed_args = json.loads(previous_item.content[0].text)
1021
+ if function_name == "search":
1022
+ action = openai_responses_types.response_function_web_search.ActionSearch(
1023
+ type="search",
1024
+ query=parsed_args["query"],
1025
+ )
1026
+ elif function_name == "open":
1027
+ action = openai_responses_types.response_function_web_search.ActionOpenPage(
1028
+ type="open_page",
1029
+ # TODO: translate to url
1030
+ url=f"cursor:{parsed_args.get('cursor', '')}",
1031
+ )
1032
+ elif function_name == "find":
1033
+ action = openai_responses_types.response_function_web_search.ActionFind(
1034
+ type="find",
1035
+ pattern=parsed_args["pattern"],
1036
+ # TODO: translate to url
1037
+ url=f"cursor:{parsed_args.get('cursor', '')}",
1038
+ )
1039
+ else:
1040
+ raise ValueError(f"Unknown function name: {function_name}")
1041
+
1042
+ yield _send_event(
1043
+ openai_responses_types.ResponseOutputItemAddedEvent(
1044
+ type="response.output_item.added",
1045
+ sequence_number=-1,
1046
+ output_index=current_output_index,
1047
+ item=openai_responses_types.response_function_web_search.ResponseFunctionWebSearch(
1048
+ # TODO: generate a unique id for web search call
1049
+ type="web_search_call",
1050
+ id=current_item_id,
1051
+ action=action,
1052
+ status="in_progress",
1053
+ ),
1054
+ )
1055
+ )
1056
+ yield _send_event(
1057
+ openai_responses_types.ResponseWebSearchCallInProgressEvent(
1058
+ type="response.web_search_call.in_progress",
1059
+ sequence_number=-1,
1060
+ output_index=current_output_index,
1061
+ item_id=current_item_id,
1062
+ )
1063
+ )
1064
+ yield _send_event(
1065
+ openai_responses_types.ResponseWebSearchCallSearchingEvent(
1066
+ type="response.web_search_call.searching",
1067
+ sequence_number=-1,
1068
+ output_index=current_output_index,
1069
+ item_id=current_item_id,
1070
+ )
1071
+ )
1072
+
1073
+ # enqueue
1074
+ yield _send_event(
1075
+ openai_responses_types.ResponseWebSearchCallCompletedEvent(
1076
+ type="response.web_search_call.completed",
1077
+ sequence_number=-1,
1078
+ output_index=current_output_index,
1079
+ item_id=current_item_id,
1080
+ )
1081
+ )
1082
+ yield _send_event(
1083
+ openai_responses_types.ResponseOutputItemDoneEvent(
1084
+ type="response.output_item.done",
1085
+ sequence_number=-1,
1086
+ output_index=current_output_index,
1087
+ item=openai_responses_types.ResponseFunctionWebSearch(
1088
+ type="web_search_call",
1089
+ id=current_item_id,
1090
+ action=action,
1091
+ status="completed",
1092
+ ),
1093
+ )
1094
+ )
1095
+
1096
+ if (
1097
+ self.supports_code_interpreter
1098
+ and previous_item.recipient is not None
1099
+ and previous_item.recipient.startswith("python")
1100
+ ):
1101
+ yield _send_event(
1102
+ openai_responses_types.ResponseOutputItemAddedEvent(
1103
+ type="response.output_item.added",
1104
+ sequence_number=-1,
1105
+ output_index=current_output_index,
1106
+ item=openai_responses_types.ResponseCodeInterpreterToolCallParam(
1107
+ type="code_interpreter_call",
1108
+ id=current_item_id,
1109
+ code="",
1110
+ container_id="auto",
1111
+ outputs=[],
1112
+ status="in_progress",
1113
+ ),
1114
+ )
1115
+ )
1116
+ yield _send_event(
1117
+ openai_responses_types.ResponseCodeInterpreterCallInProgressEvent(
1118
+ type="response.code_interpreter_call.in_progress",
1119
+ sequence_number=-1,
1120
+ output_index=current_output_index,
1121
+ item_id=current_item_id,
1122
+ )
1123
+ )
1124
+ # TODO: do we need to add delta event here?
1125
+ yield _send_event(
1126
+ openai_responses_types.ResponseCodeInterpreterCallCodeDoneEvent(
1127
+ type="response.code_interpreter_call_code.done",
1128
+ sequence_number=-1,
1129
+ output_index=current_output_index,
1130
+ item_id=current_item_id,
1131
+ code=previous_item.content[0].text,
1132
+ )
1133
+ )
1134
+ yield _send_event(
1135
+ openai_responses_types.ResponseCodeInterpreterCallInterpretingEvent(
1136
+ type="response.code_interpreter_call.interpreting",
1137
+ sequence_number=-1,
1138
+ output_index=current_output_index,
1139
+ item_id=current_item_id,
1140
+ )
1141
+ )
1142
+ yield _send_event(
1143
+ openai_responses_types.ResponseCodeInterpreterCallCompletedEvent(
1144
+ type="response.code_interpreter_call.completed",
1145
+ sequence_number=-1,
1146
+ output_index=current_output_index,
1147
+ item_id=current_item_id,
1148
+ )
1149
+ )
1150
+ yield _send_event(
1151
+ openai_responses_types.ResponseOutputItemDoneEvent(
1152
+ type="response.output_item.done",
1153
+ sequence_number=-1,
1154
+ output_index=current_output_index,
1155
+ item=openai_responses_types.ResponseCodeInterpreterToolCallParam(
1156
+ type="code_interpreter_call",
1157
+ id=current_item_id,
1158
+ code=previous_item.content[0].text,
1159
+ container_id="auto",
1160
+ # TODO: add outputs here
1161
+ outputs=[],
1162
+ status="completed",
1163
+ ),
1164
+ )
1165
+ )
1166
+
1167
+ async def empty_async_generator():
1168
+ if False:
1169
+ yield
1170
+
1171
+ final_response = await self.responses_full_generator(
1172
+ request,
1173
+ sampling_params,
1174
+ empty_async_generator(),
1175
+ context,
1176
+ model_name,
1177
+ tokenizer,
1178
+ request_metadata,
1179
+ created_time=created_time,
1180
+ )
1181
+ # Convert final_response to the format expected by ResponseCompletedEvent
1182
+ response_dict = final_response.model_dump()
1183
+
1184
+ # Convert UsageInfo to ResponseUsage format
1185
+ if response_dict.get("usage"):
1186
+ usage_info = response_dict["usage"]
1187
+ response_dict["usage"] = {
1188
+ "input_tokens": usage_info.get("prompt_tokens", 0),
1189
+ "input_tokens_details": {
1190
+ "cached_tokens": usage_info.get("cached_tokens", 0)
1191
+ },
1192
+ "output_tokens": usage_info.get("completion_tokens", 0),
1193
+ "output_tokens_details": {
1194
+ "reasoning_tokens": usage_info.get("reasoning_tokens", 0)
1195
+ },
1196
+ "total_tokens": usage_info.get("total_tokens", 0),
1197
+ }
1198
+
1199
+ yield _send_event(
1200
+ openai_responses_types.ResponseCompletedEvent(
1201
+ type="response.completed",
1202
+ sequence_number=-1,
1203
+ response=response_dict,
1204
+ )
1205
+ )
1206
+
1207
+ async def _generate_with_builtin_tools(
1208
+ self,
1209
+ request_id: str,
1210
+ request_prompt: Any,
1211
+ adapted_request: GenerateReqInput,
1212
+ sampling_params: Any,
1213
+ context: ConversationContext,
1214
+ raw_request: Optional[Request] = None,
1215
+ priority: Optional[int] = None,
1216
+ **kwargs,
1217
+ ) -> AsyncGenerator[Any, None]:
1218
+ """Generate with builtin tool support for harmony-based models."""
1219
+ orig_priority = priority or 0
1220
+
1221
+ while True:
1222
+ # Generate using SGLang's tokenizer manager
1223
+ generator = self.tokenizer_manager.generate_request(
1224
+ adapted_request, raw_request
1225
+ )
1226
+
1227
+ async for res in generator:
1228
+ context.append_output(res)
1229
+ # NOTE(woosuk): The stop condition is handled by the engine.
1230
+ yield context
1231
+
1232
+ if not context.need_builtin_tool_call():
1233
+ # The model did not ask for a tool call, so we're done.
1234
+ break
1235
+
1236
+ # Call the tool and update the context with the result.
1237
+ tool_output = await context.call_tool()
1238
+ context.append_output(tool_output)
1239
+
1240
+ # Prepare for the next generation turn
1241
+ # Render the updated conversation for the next completion
1242
+ prompt_token_ids = context.render_for_completion()
1243
+
1244
+ # Update the adapted request with new prompt
1245
+ adapted_request = GenerateReqInput(
1246
+ input_ids=prompt_token_ids,
1247
+ sampling_params=sampling_params,
1248
+ stream=adapted_request.stream,
1249
+ rid=request_id,
1250
+ return_logprob=adapted_request.return_logprob,
1251
+ logprob_start_len=adapted_request.logprob_start_len,
1252
+ top_logprobs_num=adapted_request.top_logprobs_num,
1253
+ return_text_in_logprobs=adapted_request.return_text_in_logprobs,
1254
+ return_hidden_states=adapted_request.return_hidden_states,
1255
+ background=adapted_request.background,
1256
+ )
1257
+
1258
+ # Update sampling params with reduced max_tokens
1259
+ if hasattr(sampling_params, "max_new_tokens") or isinstance(
1260
+ sampling_params, dict
1261
+ ):
1262
+ context_len = getattr(
1263
+ self.tokenizer_manager.model_config, "context_len", 4096
1264
+ )
1265
+ remaining_tokens = context_len - len(prompt_token_ids) - 1
1266
+
1267
+ if isinstance(sampling_params, dict):
1268
+ sampling_params["max_new_tokens"] = max(remaining_tokens, 1)
1269
+ else:
1270
+ sampling_params.max_new_tokens = max(remaining_tokens, 1)
1271
+
1272
+ # Slightly reduce priority for subsequent tool calls
1273
+ priority = orig_priority - 1