openai-agents 0.2.6__py3-none-any.whl → 0.6.8__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (96)
  1. agents/__init__.py +105 -4
  2. agents/_debug.py +15 -4
  3. agents/_run_impl.py +1203 -96
  4. agents/agent.py +294 -21
  5. agents/apply_diff.py +329 -0
  6. agents/editor.py +47 -0
  7. agents/exceptions.py +35 -0
  8. agents/extensions/experimental/__init__.py +6 -0
  9. agents/extensions/experimental/codex/__init__.py +92 -0
  10. agents/extensions/experimental/codex/codex.py +89 -0
  11. agents/extensions/experimental/codex/codex_options.py +35 -0
  12. agents/extensions/experimental/codex/codex_tool.py +1142 -0
  13. agents/extensions/experimental/codex/events.py +162 -0
  14. agents/extensions/experimental/codex/exec.py +263 -0
  15. agents/extensions/experimental/codex/items.py +245 -0
  16. agents/extensions/experimental/codex/output_schema_file.py +50 -0
  17. agents/extensions/experimental/codex/payloads.py +31 -0
  18. agents/extensions/experimental/codex/thread.py +214 -0
  19. agents/extensions/experimental/codex/thread_options.py +54 -0
  20. agents/extensions/experimental/codex/turn_options.py +36 -0
  21. agents/extensions/handoff_filters.py +13 -1
  22. agents/extensions/memory/__init__.py +120 -0
  23. agents/extensions/memory/advanced_sqlite_session.py +1285 -0
  24. agents/extensions/memory/async_sqlite_session.py +239 -0
  25. agents/extensions/memory/dapr_session.py +423 -0
  26. agents/extensions/memory/encrypt_session.py +185 -0
  27. agents/extensions/memory/redis_session.py +261 -0
  28. agents/extensions/memory/sqlalchemy_session.py +334 -0
  29. agents/extensions/models/litellm_model.py +449 -36
  30. agents/extensions/models/litellm_provider.py +3 -1
  31. agents/function_schema.py +47 -5
  32. agents/guardrail.py +16 -2
  33. agents/{handoffs.py → handoffs/__init__.py} +89 -47
  34. agents/handoffs/history.py +268 -0
  35. agents/items.py +238 -13
  36. agents/lifecycle.py +75 -14
  37. agents/mcp/server.py +280 -37
  38. agents/mcp/util.py +24 -3
  39. agents/memory/__init__.py +22 -2
  40. agents/memory/openai_conversations_session.py +91 -0
  41. agents/memory/openai_responses_compaction_session.py +249 -0
  42. agents/memory/session.py +19 -261
  43. agents/memory/sqlite_session.py +275 -0
  44. agents/memory/util.py +20 -0
  45. agents/model_settings.py +18 -3
  46. agents/models/__init__.py +13 -0
  47. agents/models/chatcmpl_converter.py +303 -50
  48. agents/models/chatcmpl_helpers.py +63 -0
  49. agents/models/chatcmpl_stream_handler.py +290 -68
  50. agents/models/default_models.py +58 -0
  51. agents/models/interface.py +4 -0
  52. agents/models/openai_chatcompletions.py +103 -48
  53. agents/models/openai_provider.py +10 -4
  54. agents/models/openai_responses.py +167 -46
  55. agents/realtime/__init__.py +4 -0
  56. agents/realtime/_util.py +14 -3
  57. agents/realtime/agent.py +7 -0
  58. agents/realtime/audio_formats.py +53 -0
  59. agents/realtime/config.py +78 -10
  60. agents/realtime/events.py +18 -0
  61. agents/realtime/handoffs.py +2 -2
  62. agents/realtime/items.py +17 -1
  63. agents/realtime/model.py +13 -0
  64. agents/realtime/model_events.py +12 -0
  65. agents/realtime/model_inputs.py +18 -1
  66. agents/realtime/openai_realtime.py +700 -151
  67. agents/realtime/session.py +309 -32
  68. agents/repl.py +7 -3
  69. agents/result.py +197 -38
  70. agents/run.py +1053 -178
  71. agents/run_context.py +13 -2
  72. agents/stream_events.py +1 -0
  73. agents/strict_schema.py +14 -0
  74. agents/tool.py +413 -15
  75. agents/tool_context.py +22 -1
  76. agents/tool_guardrails.py +279 -0
  77. agents/tracing/__init__.py +2 -0
  78. agents/tracing/config.py +9 -0
  79. agents/tracing/create.py +4 -0
  80. agents/tracing/processor_interface.py +84 -11
  81. agents/tracing/processors.py +65 -54
  82. agents/tracing/provider.py +64 -7
  83. agents/tracing/spans.py +105 -0
  84. agents/tracing/traces.py +116 -16
  85. agents/usage.py +134 -12
  86. agents/util/_json.py +19 -1
  87. agents/util/_transforms.py +12 -2
  88. agents/voice/input.py +5 -4
  89. agents/voice/models/openai_stt.py +17 -9
  90. agents/voice/pipeline.py +2 -0
  91. agents/voice/pipeline_config.py +4 -0
  92. {openai_agents-0.2.6.dist-info → openai_agents-0.6.8.dist-info}/METADATA +44 -19
  93. openai_agents-0.6.8.dist-info/RECORD +134 -0
  94. {openai_agents-0.2.6.dist-info → openai_agents-0.6.8.dist-info}/WHEEL +1 -1
  95. openai_agents-0.2.6.dist-info/RECORD +0 -103
  96. {openai_agents-0.2.6.dist-info → openai_agents-0.6.8.dist-info}/licenses/LICENSE +0 -0
@@ -5,69 +5,101 @@ import base64
  import inspect
  import json
  import os
+ from collections.abc import Mapping
  from datetime import datetime
- from typing import Any, Callable, Literal
+ from typing import Annotated, Any, Callable, Literal, Union, cast

  import pydantic
  import websockets
- from openai.types.beta.realtime.conversation_item import (
+ from openai.types.realtime import realtime_audio_config as _rt_audio_config
+ from openai.types.realtime.conversation_item import (
      ConversationItem,
      ConversationItem as OpenAIConversationItem,
  )
- from openai.types.beta.realtime.conversation_item_content import (
-     ConversationItemContent as OpenAIConversationItemContent,
- )
- from openai.types.beta.realtime.conversation_item_create_event import (
+ from openai.types.realtime.conversation_item_create_event import (
      ConversationItemCreateEvent as OpenAIConversationItemCreateEvent,
  )
- from openai.types.beta.realtime.conversation_item_retrieve_event import (
+ from openai.types.realtime.conversation_item_retrieve_event import (
      ConversationItemRetrieveEvent as OpenAIConversationItemRetrieveEvent,
  )
- from openai.types.beta.realtime.conversation_item_truncate_event import (
+ from openai.types.realtime.conversation_item_truncate_event import (
      ConversationItemTruncateEvent as OpenAIConversationItemTruncateEvent,
  )
- from openai.types.beta.realtime.input_audio_buffer_append_event import (
+ from openai.types.realtime.input_audio_buffer_append_event import (
      InputAudioBufferAppendEvent as OpenAIInputAudioBufferAppendEvent,
  )
- from openai.types.beta.realtime.input_audio_buffer_commit_event import (
+ from openai.types.realtime.input_audio_buffer_commit_event import (
      InputAudioBufferCommitEvent as OpenAIInputAudioBufferCommitEvent,
  )
- from openai.types.beta.realtime.realtime_client_event import (
+ from openai.types.realtime.realtime_audio_formats import (
+     AudioPCM,
+     AudioPCMA,
+     AudioPCMU,
+ )
+ from openai.types.realtime.realtime_client_event import (
      RealtimeClientEvent as OpenAIRealtimeClientEvent,
  )
- from openai.types.beta.realtime.realtime_server_event import (
+ from openai.types.realtime.realtime_conversation_item_assistant_message import (
+     RealtimeConversationItemAssistantMessage,
+ )
+ from openai.types.realtime.realtime_conversation_item_function_call_output import (
+     RealtimeConversationItemFunctionCallOutput,
+ )
+ from openai.types.realtime.realtime_conversation_item_system_message import (
+     RealtimeConversationItemSystemMessage,
+ )
+ from openai.types.realtime.realtime_conversation_item_user_message import (
+     Content,
+     RealtimeConversationItemUserMessage,
+ )
+ from openai.types.realtime.realtime_function_tool import (
+     RealtimeFunctionTool as OpenAISessionFunction,
+ )
+ from openai.types.realtime.realtime_server_event import (
      RealtimeServerEvent as OpenAIRealtimeServerEvent,
  )
- from openai.types.beta.realtime.response_audio_delta_event import ResponseAudioDeltaEvent
- from openai.types.beta.realtime.response_cancel_event import (
+ from openai.types.realtime.realtime_session_create_request import (
+     RealtimeSessionCreateRequest as OpenAISessionCreateRequest,
+ )
+ from openai.types.realtime.realtime_tracing_config import (
+     TracingConfiguration as OpenAITracingConfiguration,
+ )
+ from openai.types.realtime.realtime_transcription_session_create_request import (
+     RealtimeTranscriptionSessionCreateRequest as OpenAIRealtimeTranscriptionSessionCreateRequest,
+ )
+ from openai.types.realtime.response_audio_delta_event import ResponseAudioDeltaEvent
+ from openai.types.realtime.response_cancel_event import (
      ResponseCancelEvent as OpenAIResponseCancelEvent,
  )
- from openai.types.beta.realtime.response_create_event import (
+ from openai.types.realtime.response_create_event import (
      ResponseCreateEvent as OpenAIResponseCreateEvent,
  )
- from openai.types.beta.realtime.session_update_event import (
-     Session as OpenAISessionObject,
-     SessionTool as OpenAISessionTool,
-     SessionTracing as OpenAISessionTracing,
-     SessionTracingTracingConfiguration as OpenAISessionTracingConfiguration,
+ from openai.types.realtime.session_update_event import (
      SessionUpdateEvent as OpenAISessionUpdateEvent,
  )
- from pydantic import TypeAdapter
- from typing_extensions import assert_never
+ from openai.types.responses.response_prompt import ResponsePrompt
+ from pydantic import Field, TypeAdapter
+ from typing_extensions import TypeAlias, assert_never
  from websockets.asyncio.client import ClientConnection

  from agents.handoffs import Handoff
+ from agents.prompts import Prompt
  from agents.realtime._default_tracker import ModelAudioTracker
+ from agents.realtime.audio_formats import to_realtime_audio_format
  from agents.tool import FunctionTool, Tool
  from agents.util._types import MaybeAwaitable

  from ..exceptions import UserError
  from ..logger import logger
+ from ..run_context import RunContextWrapper, TContext
  from ..version import __version__
+ from .agent import RealtimeAgent
  from .config import (
      RealtimeModelTracingConfig,
+     RealtimeRunConfig,
      RealtimeSessionModelSettings,
  )
+ from .handoffs import realtime_handoff
  from .items import RealtimeMessageItem, RealtimeToolCallItem
  from .model import (
      RealtimeModel,
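Note: the entire import surface moves from the beta namespace (`openai.types.beta.realtime`) to the GA namespace (`openai.types.realtime`), and the beta session aliases (`OpenAISessionObject`, `OpenAISessionTool`, `OpenAISessionTracing`) give way to GA types (`RealtimeSessionCreateRequest`, `RealtimeFunctionTool`, `TracingConfiguration`). Downstream code that must import these types across both generations of the `openai` package can guard the import; a minimal sketch (only the two module paths come from this diff, the try/except fallback itself is illustrative):

```python
# Sketch: tolerate both the GA and the pre-GA layout of the openai package.
try:
    # GA namespace (what openai-agents 0.6.x imports)
    from openai.types.realtime.realtime_server_event import RealtimeServerEvent
except ImportError:
    # Beta namespace (what openai-agents 0.2.x imported)
    from openai.types.beta.realtime.realtime_server_event import (  # type: ignore[no-redef]
        RealtimeServerEvent,
    )
```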
@@ -83,6 +115,7 @@ from .model_events import (
      RealtimeModelErrorEvent,
      RealtimeModelEvent,
      RealtimeModelExceptionEvent,
+     RealtimeModelInputAudioTimeoutTriggeredEvent,
      RealtimeModelInputAudioTranscriptionCompletedEvent,
      RealtimeModelItemDeletedEvent,
      RealtimeModelItemUpdatedEvent,
@@ -102,17 +135,33 @@ from .model_inputs import (
      RealtimeModelSendUserInput,
  )

+ FormatInput: TypeAlias = Union[
+     str,
+     AudioPCM,
+     AudioPCMU,
+     AudioPCMA,
+     Mapping[str, Any],
+     None,
+ ]
+
+
+ # Avoid direct imports of non-exported names by referencing via module
+ OpenAIRealtimeAudioConfig = _rt_audio_config.RealtimeAudioConfig
+ OpenAIRealtimeAudioInput = _rt_audio_config.RealtimeAudioConfigInput  # type: ignore[attr-defined]
+ OpenAIRealtimeAudioOutput = _rt_audio_config.RealtimeAudioConfigOutput  # type: ignore[attr-defined]
+
+
  _USER_AGENT = f"Agents/Python {__version__}"

  DEFAULT_MODEL_SETTINGS: RealtimeSessionModelSettings = {
      "voice": "ash",
-     "modalities": ["text", "audio"],
+     "modalities": ["audio"],
      "input_audio_format": "pcm16",
      "output_audio_format": "pcm16",
      "input_audio_transcription": {
          "model": "gpt-4o-mini-transcribe",
      },
-     "turn_detection": {"type": "semantic_vad"},
+     "turn_detection": {"type": "semantic_vad", "interrupt_response": True},
  }


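Two defaults change here: `modalities` drops `"text"` (audio-only output), and semantic VAD now ships with `interrupt_response: True`, so the server cancels the in-flight response when the user barges in. Callers can still override either per session; a sketch using only the `RealtimeSessionModelSettings` keys visible in this diff (values are illustrative):

```python
# Sketch: overriding the new defaults. Keys mirror DEFAULT_MODEL_SETTINGS above.
from agents.realtime.config import RealtimeSessionModelSettings

settings: RealtimeSessionModelSettings = {
    "voice": "alloy",  # instead of the default "ash"
    "modalities": ["audio"],  # the new audio-only default, stated explicitly
    "turn_detection": {
        "type": "semantic_vad",
        "interrupt_response": False,  # opt back out of server-side interruption
    },
}
```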
@@ -128,11 +177,85 @@ async def get_api_key(key: str | Callable[[], MaybeAwaitable[str]] | None) -> str | None:
      return os.getenv("OPENAI_API_KEY")


+ AllRealtimeServerEvents = Annotated[
+     Union[OpenAIRealtimeServerEvent,],
+     Field(discriminator="type"),
+ ]
+
+ ServerEventTypeAdapter: TypeAdapter[AllRealtimeServerEvents] | None = None
+
+
+ def get_server_event_type_adapter() -> TypeAdapter[AllRealtimeServerEvents]:
+     global ServerEventTypeAdapter
+     if not ServerEventTypeAdapter:
+         ServerEventTypeAdapter = TypeAdapter(AllRealtimeServerEvents)
+     return ServerEventTypeAdapter
+
+
+ async def _collect_enabled_handoffs(
+     agent: RealtimeAgent[Any], context_wrapper: RunContextWrapper[Any]
+ ) -> list[Handoff[Any, RealtimeAgent[Any]]]:
+     handoffs: list[Handoff[Any, RealtimeAgent[Any]]] = []
+     for handoff_item in agent.handoffs:
+         if isinstance(handoff_item, Handoff):
+             handoffs.append(handoff_item)
+         elif isinstance(handoff_item, RealtimeAgent):
+             handoffs.append(realtime_handoff(handoff_item))
+
+     async def _check_handoff_enabled(handoff_obj: Handoff[Any, RealtimeAgent[Any]]) -> bool:
+         attr = handoff_obj.is_enabled
+         if isinstance(attr, bool):
+             return attr
+         res = attr(context_wrapper, agent)
+         if inspect.isawaitable(res):
+             return await res
+         return res
+
+     results = await asyncio.gather(*(_check_handoff_enabled(h) for h in handoffs))
+     return [h for h, ok in zip(handoffs, results) if ok]
+
+
+ async def _build_model_settings_from_agent(
+     *,
+     agent: RealtimeAgent[Any],
+     context_wrapper: RunContextWrapper[Any],
+     base_settings: RealtimeSessionModelSettings,
+     starting_settings: RealtimeSessionModelSettings | None,
+     run_config: RealtimeRunConfig | None,
+ ) -> RealtimeSessionModelSettings:
+     updated_settings = base_settings.copy()
+
+     if agent.prompt is not None:
+         updated_settings["prompt"] = agent.prompt
+
+     instructions, tools, handoffs = await asyncio.gather(
+         agent.get_system_prompt(context_wrapper),
+         agent.get_all_tools(context_wrapper),
+         _collect_enabled_handoffs(agent, context_wrapper),
+     )
+     updated_settings["instructions"] = instructions or ""
+     updated_settings["tools"] = tools or []
+     updated_settings["handoffs"] = handoffs or []
+
+     if starting_settings:
+         updated_settings.update(starting_settings)
+
+     if run_config and run_config.get("tracing_disabled", False):
+         updated_settings["tracing"] = None
+
+     return updated_settings
+
+
+ # Note: Avoid a module-level union alias for Python 3.9 compatibility.
+ # Using a union at runtime (e.g., A | B) in a type alias triggers evaluation
+ # during import on 3.9. We instead inline the union in annotations below.
+
+
  class OpenAIRealtimeWebSocketModel(RealtimeModel):
      """A model that uses OpenAI's WebSocket API."""

      def __init__(self) -> None:
-         self.model = "gpt-4o-realtime-preview"  # Default model
+         self.model = "gpt-realtime"  # Default model
          self._websocket: ClientConnection | None = None
          self._websocket_task: asyncio.Task[None] | None = None
          self._listeners: list[RealtimeModelListener] = []
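`get_server_event_type_adapter` builds the pydantic `TypeAdapter` once and reuses it for every inbound event, since constructing an adapter is far more expensive than validating with one. The same memoization pattern in isolation, reduced to a toy event type:

```python
# Sketch of the memoized-TypeAdapter pattern above, on a toy model.
from typing import Optional

from pydantic import BaseModel, TypeAdapter


class ToyEvent(BaseModel):
    type: str


_adapter: Optional[TypeAdapter[ToyEvent]] = None


def get_adapter() -> TypeAdapter[ToyEvent]:
    global _adapter
    if _adapter is None:  # built once; every later call reuses it
        _adapter = TypeAdapter(ToyEvent)
    return _adapter


print(get_adapter().validate_python({"type": "session.created"}))
```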
@@ -141,7 +264,9 @@ class OpenAIRealtimeWebSocketModel(RealtimeModel):
          self._ongoing_response: bool = False
          self._tracing_config: RealtimeModelTracingConfig | Literal["auto"] | None = None
          self._playback_tracker: RealtimePlaybackTracker | None = None
-         self._created_session: OpenAISessionObject | None = None
+         self._created_session: OpenAISessionCreateRequest | None = None
+         self._server_event_type_adapter = get_server_event_type_adapter()
+         self._call_id: str | None = None

      async def connect(self, options: RealtimeModelConfig) -> None:
          """Establish a connection to the model and keep it alive."""
@@ -152,7 +277,19 @@ class OpenAIRealtimeWebSocketModel(RealtimeModel):

          self._playback_tracker = options.get("playback_tracker", None)

-         self.model = model_settings.get("model_name", self.model)
+         call_id = options.get("call_id")
+         model_name = model_settings.get("model_name")
+         if call_id and model_name:
+             error_message = (
+                 "Cannot specify both `call_id` and `model_name` "
+                 "when attaching to an existing realtime call."
+             )
+             raise UserError(error_message)
+
+         if model_name:
+             self.model = model_name
+
+         self._call_id = call_id
          api_key = await get_api_key(options.get("api_key"))

          if "tracing" in model_settings:
@@ -160,17 +297,26 @@ class OpenAIRealtimeWebSocketModel(RealtimeModel):
          else:
              self._tracing_config = "auto"

-         if not api_key:
-             raise UserError("API key is required but was not provided.")
+         if call_id:
+             url = options.get("url", f"wss://api.openai.com/v1/realtime?call_id={call_id}")
+         else:
+             url = options.get("url", f"wss://api.openai.com/v1/realtime?model={self.model}")

-         url = options.get("url", f"wss://api.openai.com/v1/realtime?model={self.model}")
+         headers: dict[str, str] = {}
+         if options.get("headers") is not None:
+             # For customizing request headers
+             headers.update(options["headers"])
+         else:
+             # OpenAI's Realtime API
+             if not api_key:
+                 raise UserError("API key is required but was not provided.")

-         headers = {
-             "Authorization": f"Bearer {api_key}",
-             "OpenAI-Beta": "realtime=v1",
-         }
+             headers.update({"Authorization": f"Bearer {api_key}"})
          self._websocket = await websockets.connect(
-             url, user_agent_header=_USER_AGENT, additional_headers=headers
+             url,
+             user_agent_header=_USER_AGENT,
+             additional_headers=headers,
+             max_size=None,  # Allow any size of message
          )
          self._websocket_task = asyncio.create_task(self._listen_for_messages())
          await self._update_session_config(model_settings)
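`connect()` now has two mutually exclusive modes: pass `model_name` to open a fresh session (the model goes into the URL query string) or `call_id` to attach to an existing realtime call; supplying both raises `UserError`. Custom `headers` also replace the built-in `Authorization` header entirely, in which case no API key is required. A sketch of the two option shapes, using only keys this file reads from the model config (values are placeholders):

```python
# Sketch: the two connection modes enforced by connect().
fresh_session = {
    "api_key": "sk-...",
    "initial_model_settings": {"model_name": "gpt-realtime"},
    # -> wss://api.openai.com/v1/realtime?model=gpt-realtime
}
attach_to_call = {
    "api_key": "sk-...",
    "call_id": "rtc_abc123",
    # -> wss://api.openai.com/v1/realtime?call_id=rtc_abc123
    # adding "model_name" here as well would raise UserError
}
```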
@@ -183,7 +329,11 @@ class OpenAIRealtimeWebSocketModel(RealtimeModel):
          converted_tracing_config = _ConversionHelper.convert_tracing_config(tracing_config)
          await self._send_raw_message(
              OpenAISessionUpdateEvent(
-                 session=OpenAISessionObject(tracing=converted_tracing_config),
+                 session=OpenAISessionCreateRequest(
+                     model=self.model,
+                     type="realtime",
+                     tracing=converted_tracing_config,
+                 ),
                  type="session.update",
              )
          )
@@ -200,7 +350,8 @@ class OpenAIRealtimeWebSocketModel(RealtimeModel):

      async def _emit_event(self, event: RealtimeModelEvent) -> None:
          """Emit an event to the listeners."""
-         for listener in self._listeners:
+         # Copy list to avoid modification during iteration
+         for listener in list(self._listeners):
              await listener.on_event(event)

      async def _listen_for_messages(self):
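Iterating over `list(self._listeners)` snapshots the listener list, so a listener that unsubscribes (or registers another listener) from inside `on_event` cannot disturb the loop. A minimal illustration of the failure mode being avoided:

```python
# Sketch: why _emit_event iterates a copy. Removing an element from a list
# while iterating that same list shifts the remaining items and skips one.
listeners = ["a", "b", "c"]

for listener in list(listeners):  # snapshot: "a", "b" and "c" are all visited
    if listener == "a":
        listeners.remove("a")  # safe: mutates the live list, not the snapshot
    print("dispatched to", listener)
```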
@@ -265,8 +416,8 @@ class OpenAIRealtimeWebSocketModel(RealtimeModel):
      async def _send_raw_message(self, event: OpenAIRealtimeClientEvent) -> None:
          """Send a raw message to the model."""
          assert self._websocket is not None, "Not connected"
-
-         await self._websocket.send(event.model_dump_json(exclude_none=True, exclude_unset=True))
+         payload = event.model_dump_json(exclude_unset=True)
+         await self._websocket.send(payload)

      async def _send_user_input(self, event: RealtimeModelSendUserInput) -> None:
          converted = _ConversionHelper.convert_user_input_to_item_create(event)
@@ -328,6 +479,7 @@ class OpenAIRealtimeWebSocketModel(RealtimeModel):
          current_item_id = playback_state.get("current_item_id")
          current_item_content_index = playback_state.get("current_item_content_index")
          elapsed_ms = playback_state.get("elapsed_ms")
+
          if current_item_id is None or elapsed_ms is None:
              logger.debug(
                  "Skipping interrupt. "
@@ -335,41 +487,47 @@ class OpenAIRealtimeWebSocketModel(RealtimeModel):
                  f"elapsed ms: {elapsed_ms}, "
                  f"content index: {current_item_content_index}"
              )
-             return
-
-         current_item_content_index = current_item_content_index or 0
-         if elapsed_ms > 0:
-             await self._emit_event(
-                 RealtimeModelAudioInterruptedEvent(
-                     item_id=current_item_id,
-                     content_index=current_item_content_index,
-                 )
-             )
-             converted = _ConversionHelper.convert_interrupt(
-                 current_item_id,
-                 current_item_content_index,
-                 int(elapsed_ms),
-             )
-             await self._send_raw_message(converted)
          else:
-             logger.debug(
-                 "Didn't interrupt bc elapsed ms is < 0. "
-                 f"Item id: {current_item_id}, "
-                 f"elapsed ms: {elapsed_ms}, "
-                 f"content index: {current_item_content_index}"
-             )
+             current_item_content_index = current_item_content_index or 0
+             if elapsed_ms > 0:
+                 await self._emit_event(
+                     RealtimeModelAudioInterruptedEvent(
+                         item_id=current_item_id,
+                         content_index=current_item_content_index,
+                     )
+                 )
+                 converted = _ConversionHelper.convert_interrupt(
+                     current_item_id,
+                     current_item_content_index,
+                     int(elapsed_ms),
+                 )
+                 await self._send_raw_message(converted)
+             else:
+                 logger.debug(
+                     "Didn't interrupt bc elapsed ms is < 0. "
+                     f"Item id: {current_item_id}, "
+                     f"elapsed ms: {elapsed_ms}, "
+                     f"content index: {current_item_content_index}"
+                 )

+         session = self._created_session
          automatic_response_cancellation_enabled = (
-             self._created_session
-             and self._created_session.turn_detection
-             and self._created_session.turn_detection.interrupt_response
+             session
+             and session.audio is not None
+             and session.audio.input is not None
+             and session.audio.input.turn_detection is not None
+             and session.audio.input.turn_detection.interrupt_response is True
+         )
+         should_cancel_response = event.force_response_cancel or (
+             not automatic_response_cancellation_enabled
          )
-         if not automatic_response_cancellation_enabled:
+         if should_cancel_response:
              await self._cancel_response()

-         self._audio_state_tracker.on_interrupted()
-         if self._playback_tracker:
-             self._playback_tracker.on_interrupted()
+         if current_item_id is not None and elapsed_ms is not None:
+             self._audio_state_tracker.on_interrupted()
+             if self._playback_tracker:
+                 self._playback_tracker.on_interrupted()

      async def _send_session_update(self, event: RealtimeModelSendSessionUpdate) -> None:
          """Send a session update to the model."""
@@ -447,6 +605,10 @@ class OpenAIRealtimeWebSocketModel(RealtimeModel):
          self._websocket = None
          if self._websocket_task:
              self._websocket_task.cancel()
+             try:
+                 await self._websocket_task
+             except asyncio.CancelledError:
+                 pass
              self._websocket_task = None

      async def _cancel_response(self) -> None:
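`close()` previously dropped the listener task right after `cancel()`; it now awaits the task so the websocket reader fully unwinds (and runs its cleanup) before the reference is cleared. The shutdown pattern in isolation:

```python
# Sketch of the cancel-then-await shutdown added above.
import asyncio


async def main() -> None:
    async def listen_forever() -> None:
        await asyncio.sleep(3600)

    task = asyncio.create_task(listen_forever())
    task.cancel()
    try:
        await task  # gives the task a chance to actually finish cancelling
    except asyncio.CancelledError:
        pass  # expected: we cancelled it on purpose


asyncio.run(main())
```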
@@ -456,42 +618,121 @@ class OpenAIRealtimeWebSocketModel(RealtimeModel):

      async def _handle_ws_event(self, event: dict[str, Any]):
          await self._emit_event(RealtimeModelRawServerEvent(data=event))
+         # The public interface defined on this Agents SDK side (e.g., RealtimeMessageItem)
+         # must be the same even after the GA migration, so this part does the conversion
+         if isinstance(event, dict) and event.get("type") in (
+             "response.output_item.added",
+             "response.output_item.done",
+         ):
+             item = event.get("item")
+             if isinstance(item, dict) and item.get("type") == "message":
+                 raw_content = item.get("content") or []
+                 converted_content: list[dict[str, Any]] = []
+                 for part in raw_content:
+                     if not isinstance(part, dict):
+                         continue
+                     if part.get("type") == "audio":
+                         converted_content.append(
+                             {
+                                 "type": "audio",
+                                 "audio": part.get("audio"),
+                                 "transcript": part.get("transcript"),
+                             }
+                         )
+                     elif part.get("type") in ("text", "output_text"):
+                         converted_content.append({"type": "text", "text": part.get("text")})
+                 status = item.get("status")
+                 if status not in ("in_progress", "completed", "incomplete"):
+                     is_done = event.get("type") == "response.output_item.done"
+                     status = "completed" if is_done else "in_progress"
+                 # Explicitly type the adapter for mypy
+                 type_adapter: TypeAdapter[RealtimeMessageItem] = TypeAdapter(RealtimeMessageItem)
+                 message_item: RealtimeMessageItem = type_adapter.validate_python(
+                     {
+                         "item_id": item.get("id", ""),
+                         "type": "message",
+                         "role": item.get("role", "assistant"),
+                         "content": converted_content,
+                         "status": status,
+                     }
+                 )
+                 await self._emit_event(RealtimeModelItemUpdatedEvent(item=message_item))
+                 return
+
          try:
              if "previous_item_id" in event and event["previous_item_id"] is None:
                  event["previous_item_id"] = ""  # TODO (rm) remove
-             parsed: OpenAIRealtimeServerEvent = TypeAdapter(
-                 OpenAIRealtimeServerEvent
-             ).validate_python(event)
+             parsed: AllRealtimeServerEvents = self._server_event_type_adapter.validate_python(event)
          except pydantic.ValidationError as e:
              logger.error(f"Failed to validate server event: {event}", exc_info=True)
-             await self._emit_event(
-                 RealtimeModelErrorEvent(
-                     error=e,
-                 )
-             )
+             await self._emit_event(RealtimeModelErrorEvent(error=e))
              return
          except Exception as e:
              event_type = event.get("type", "unknown") if isinstance(event, dict) else "unknown"
              logger.error(f"Failed to validate server event: {event}", exc_info=True)
-             await self._emit_event(
-                 RealtimeModelExceptionEvent(
-                     exception=e,
-                     context=f"Failed to validate server event: {event_type}",
-                 )
+             exception_event = RealtimeModelExceptionEvent(
+                 exception=e,
+                 context=f"Failed to validate server event: {event_type}",
              )
+             await self._emit_event(exception_event)
              return

-         if parsed.type == "response.audio.delta":
+         if parsed.type == "response.output_audio.delta":
              await self._handle_audio_delta(parsed)
-         elif parsed.type == "response.audio.done":
-             await self._emit_event(
-                 RealtimeModelAudioDoneEvent(
-                     item_id=parsed.item_id,
-                     content_index=parsed.content_index,
-                 )
+         elif parsed.type == "response.output_audio.done":
+             audio_done_event = RealtimeModelAudioDoneEvent(
+                 item_id=parsed.item_id,
+                 content_index=parsed.content_index,
              )
+             await self._emit_event(audio_done_event)
          elif parsed.type == "input_audio_buffer.speech_started":
-             await self._send_interrupt(RealtimeModelSendInterrupt())
+             # On VAD speech start, immediately stop local playback so the user can
+             # barge‑in without overlapping assistant audio.
+             last_audio = self._audio_state_tracker.get_last_audio_item()
+             if last_audio is not None:
+                 item_id, content_index = last_audio
+                 playback_state = self._get_playback_state()
+                 playback_item_id = playback_state.get("current_item_id")
+                 playback_content_index = playback_state.get("current_item_content_index") or 0
+                 playback_elapsed_ms = playback_state.get("elapsed_ms")
+                 await self._emit_event(
+                     RealtimeModelAudioInterruptedEvent(item_id=item_id, content_index=content_index)
+                 )
+
+                 elapsed_override = getattr(parsed, "audio_end_ms", None)
+                 if elapsed_override is None or elapsed_override <= 0:
+                     effective_elapsed_ms = playback_elapsed_ms
+                 else:
+                     effective_elapsed_ms = float(elapsed_override)
+
+                 if playback_item_id and effective_elapsed_ms is not None:
+                     truncated_ms = max(int(round(effective_elapsed_ms)), 0)
+                     await self._send_raw_message(
+                         _ConversionHelper.convert_interrupt(
+                             playback_item_id,
+                             playback_content_index,
+                             truncated_ms,
+                         )
+                     )
+
+                 # Reset trackers so subsequent playback state queries don't
+                 # reference audio that has been interrupted client‑side.
+                 self._audio_state_tracker.on_interrupted()
+                 if self._playback_tracker:
+                     self._playback_tracker.on_interrupted()
+
+             # If server isn't configured to auto‑interrupt/cancel, cancel the
+             # response to prevent further audio.
+             session = self._created_session
+             automatic_response_cancellation_enabled = (
+                 session
+                 and session.audio is not None
+                 and session.audio.input is not None
+                 and session.audio.input.turn_detection is not None
+                 and session.audio.input.turn_detection.interrupt_response is True
+             )
+             if not automatic_response_cancellation_enabled:
+                 await self._cancel_response()
          elif parsed.type == "response.created":
              self._ongoing_response = True
              await self._emit_event(RealtimeModelTurnStartedEvent())
@@ -500,15 +741,16 @@ class OpenAIRealtimeWebSocketModel(RealtimeModel):
              await self._emit_event(RealtimeModelTurnEndedEvent())
          elif parsed.type == "session.created":
              await self._send_tracing_config(self._tracing_config)
-             self._update_created_session(parsed.session)  # type: ignore
+             self._update_created_session(parsed.session)
          elif parsed.type == "session.updated":
-             self._update_created_session(parsed.session)  # type: ignore
+             self._update_created_session(parsed.session)
          elif parsed.type == "error":
              await self._emit_event(RealtimeModelErrorEvent(error=parsed.error))
          elif parsed.type == "conversation.item.deleted":
              await self._emit_event(RealtimeModelItemDeletedEvent(item_id=parsed.item_id))
          elif (
-             parsed.type == "conversation.item.created"
+             parsed.type == "conversation.item.added"
+             or parsed.type == "conversation.item.created"
              or parsed.type == "conversation.item.retrieved"
          ):
              previous_item_id = (
@@ -533,7 +775,7 @@ class OpenAIRealtimeWebSocketModel(RealtimeModel):
                      item_id=parsed.item_id, transcript=parsed.transcript
                  )
              )
-         elif parsed.type == "response.audio_transcript.delta":
+         elif parsed.type == "response.output_audio_transcript.delta":
              await self._emit_event(
                  RealtimeModelTranscriptDeltaEvent(
                      item_id=parsed.item_id, delta=parsed.delta, response_id=parsed.response_id
@@ -541,7 +783,7 @@ class OpenAIRealtimeWebSocketModel(RealtimeModel):
              )
          elif (
              parsed.type == "conversation.item.input_audio_transcription.delta"
-             or parsed.type == "response.text.delta"
+             or parsed.type == "response.output_text.delta"
              or parsed.type == "response.function_call_arguments.delta"
          ):
              # No support for partials yet
@@ -551,13 +793,137 @@ class OpenAIRealtimeWebSocketModel(RealtimeModel):
              or parsed.type == "response.output_item.done"
          ):
              await self._handle_output_item(parsed.item)
+         elif parsed.type == "input_audio_buffer.timeout_triggered":
+             await self._emit_event(
+                 RealtimeModelInputAudioTimeoutTriggeredEvent(
+                     item_id=parsed.item_id,
+                     audio_start_ms=parsed.audio_start_ms,
+                     audio_end_ms=parsed.audio_end_ms,
+                 )
+             )

-     def _update_created_session(self, session: OpenAISessionObject) -> None:
-         self._created_session = session
-         if session.output_audio_format:
-             self._audio_state_tracker.set_audio_format(session.output_audio_format)
-             if self._playback_tracker:
-                 self._playback_tracker.set_audio_format(session.output_audio_format)
+     def _update_created_session(
+         self,
+         session: OpenAISessionCreateRequest
+         | OpenAIRealtimeTranscriptionSessionCreateRequest
+         | Mapping[str, object]
+         | pydantic.BaseModel,
+     ) -> None:
+         # Only store/playback-format information for realtime sessions (not transcription-only)
+         normalized_session = self._normalize_session_payload(session)
+         if not normalized_session:
+             return
+
+         self._created_session = normalized_session
+         normalized_format = self._extract_audio_format(normalized_session)
+         if normalized_format is None:
+             return
+
+         self._audio_state_tracker.set_audio_format(normalized_format)
+         if self._playback_tracker:
+             self._playback_tracker.set_audio_format(normalized_format)
+
+     @staticmethod
+     def _normalize_session_payload(
+         session: OpenAISessionCreateRequest
+         | OpenAIRealtimeTranscriptionSessionCreateRequest
+         | Mapping[str, object]
+         | pydantic.BaseModel,
+     ) -> OpenAISessionCreateRequest | None:
+         if isinstance(session, OpenAISessionCreateRequest):
+             return session
+
+         if isinstance(session, OpenAIRealtimeTranscriptionSessionCreateRequest):
+             return None
+
+         session_payload: Mapping[str, object]
+         if isinstance(session, pydantic.BaseModel):
+             session_payload = cast(Mapping[str, object], session.model_dump())
+         elif isinstance(session, Mapping):
+             session_payload = session
+         else:
+             return None
+
+         if OpenAIRealtimeWebSocketModel._is_transcription_session(session_payload):
+             return None
+
+         try:
+             return OpenAISessionCreateRequest.model_validate(session_payload)
+         except pydantic.ValidationError:
+             return None
+
+     @staticmethod
+     def _is_transcription_session(payload: Mapping[str, object]) -> bool:
+         try:
+             OpenAIRealtimeTranscriptionSessionCreateRequest.model_validate(payload)
+         except pydantic.ValidationError:
+             return False
+         else:
+             return True
+
+     @staticmethod
+     def _extract_audio_format(session: OpenAISessionCreateRequest) -> str | None:
+         audio = session.audio
+         if not audio or not audio.output or not audio.output.format:
+             return None
+
+         return OpenAIRealtimeWebSocketModel._normalize_audio_format(audio.output.format)
+
+     @staticmethod
+     def _normalize_audio_format(fmt: object) -> str:
+         if isinstance(fmt, AudioPCM):
+             return "pcm16"
+         if isinstance(fmt, AudioPCMU):
+             return "g711_ulaw"
+         if isinstance(fmt, AudioPCMA):
+             return "g711_alaw"
+
+         fmt_type = OpenAIRealtimeWebSocketModel._read_format_type(fmt)
+         if isinstance(fmt_type, str) and fmt_type:
+             return fmt_type
+
+         return str(fmt)
+
+     @staticmethod
+     def _read_format_type(fmt: object) -> str | None:
+         if isinstance(fmt, str):
+             return fmt
+
+         if isinstance(fmt, Mapping):
+             type_value = fmt.get("type")
+             return type_value if isinstance(type_value, str) else None
+
+         if isinstance(fmt, pydantic.BaseModel):
+             type_value = fmt.model_dump().get("type")
+             return type_value if isinstance(type_value, str) else None
+
+         try:
+             type_value = fmt.type  # type: ignore[attr-defined]
+         except AttributeError:
+             return None
+
+         return type_value if isinstance(type_value, str) else None
+
+     @staticmethod
+     def _normalize_turn_detection_config(config: object) -> object:
+         """Normalize camelCase turn detection keys to snake_case for API compatibility."""
+         if not isinstance(config, Mapping):
+             return config
+
+         normalized = dict(config)
+         key_map = {
+             "createResponse": "create_response",
+             "interruptResponse": "interrupt_response",
+             "prefixPaddingMs": "prefix_padding_ms",
+             "silenceDurationMs": "silence_duration_ms",
+             "idleTimeoutMs": "idle_timeout_ms",
+         }
+         for camel_key, snake_key in key_map.items():
+             if camel_key in normalized and snake_key not in normalized:
+                 normalized[snake_key] = normalized[camel_key]
+                 normalized.pop(camel_key, None)
+
+         return normalized

      async def _update_session_config(self, model_settings: RealtimeSessionModelSettings) -> None:
          session_config = self._get_session_config(model_settings)
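`_normalize_turn_detection_config` accepts camelCase keys (as a JavaScript-shaped config might carry) and rewrites them to the snake_case names the API expects; an existing snake_case key always wins. The mapping, extracted so it can be run standalone:

```python
# Sketch: the key normalization performed by _normalize_turn_detection_config.
key_map = {
    "createResponse": "create_response",
    "interruptResponse": "interrupt_response",
    "prefixPaddingMs": "prefix_padding_ms",
    "silenceDurationMs": "silence_duration_ms",
    "idleTimeoutMs": "idle_timeout_ms",
}

config = {"type": "semantic_vad", "interruptResponse": True, "silenceDurationMs": 500}
normalized = dict(config)
for camel_key, snake_key in key_map.items():
    if camel_key in normalized and snake_key not in normalized:
        normalized[snake_key] = normalized.pop(camel_key)

print(normalized)
# {'type': 'semantic_vad', 'interrupt_response': True, 'silence_duration_ms': 500}
```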
@@ -567,51 +933,138 @@ class OpenAIRealtimeWebSocketModel(RealtimeModel):

      def _get_session_config(
          self, model_settings: RealtimeSessionModelSettings
-     ) -> OpenAISessionObject:
+     ) -> OpenAISessionCreateRequest:
          """Get the session config."""
-         return OpenAISessionObject(
-             instructions=model_settings.get("instructions", None),
-             model=(
-                 model_settings.get("model_name", self.model)  # type: ignore
-                 or DEFAULT_MODEL_SETTINGS.get("model_name")
-             ),
-             voice=model_settings.get("voice", DEFAULT_MODEL_SETTINGS.get("voice")),
-             speed=model_settings.get("speed", None),
-             modalities=model_settings.get("modalities", DEFAULT_MODEL_SETTINGS.get("modalities")),
-             input_audio_format=model_settings.get(
-                 "input_audio_format",
-                 DEFAULT_MODEL_SETTINGS.get("input_audio_format"),  # type: ignore
-             ),
-             output_audio_format=model_settings.get(
-                 "output_audio_format",
-                 DEFAULT_MODEL_SETTINGS.get("output_audio_format"),  # type: ignore
-             ),
-             input_audio_transcription=model_settings.get(
-                 "input_audio_transcription",
-                 DEFAULT_MODEL_SETTINGS.get("input_audio_transcription"),  # type: ignore
-             ),
-             turn_detection=model_settings.get(
-                 "turn_detection",
-                 DEFAULT_MODEL_SETTINGS.get("turn_detection"),  # type: ignore
-             ),
-             tool_choice=model_settings.get(
-                 "tool_choice",
-                 DEFAULT_MODEL_SETTINGS.get("tool_choice"),  # type: ignore
+         audio_input_args: dict[str, Any] = {}
+         audio_output_args: dict[str, Any] = {}
+
+         audio_config = model_settings.get("audio")
+         audio_config_mapping = audio_config if isinstance(audio_config, Mapping) else None
+         input_audio_config: Mapping[str, Any] = (
+             cast(Mapping[str, Any], audio_config_mapping.get("input", {}))
+             if audio_config_mapping
+             else {}
+         )
+         output_audio_config: Mapping[str, Any] = (
+             cast(Mapping[str, Any], audio_config_mapping.get("output", {}))
+             if audio_config_mapping
+             else {}
+         )
+
+         input_format_source: FormatInput = (
+             input_audio_config.get("format") if input_audio_config else None
+         )
+         if input_format_source is None:
+             if self._call_id:
+                 input_format_source = model_settings.get("input_audio_format")
+             else:
+                 input_format_source = model_settings.get(
+                     "input_audio_format", DEFAULT_MODEL_SETTINGS.get("input_audio_format")
+                 )
+         audio_input_args["format"] = to_realtime_audio_format(input_format_source)
+
+         if "noise_reduction" in input_audio_config:
+             audio_input_args["noise_reduction"] = input_audio_config.get("noise_reduction")
+         elif "input_audio_noise_reduction" in model_settings:
+             audio_input_args["noise_reduction"] = model_settings.get("input_audio_noise_reduction")
+
+         if "transcription" in input_audio_config:
+             audio_input_args["transcription"] = input_audio_config.get("transcription")
+         elif "input_audio_transcription" in model_settings:
+             audio_input_args["transcription"] = model_settings.get("input_audio_transcription")
+         else:
+             audio_input_args["transcription"] = DEFAULT_MODEL_SETTINGS.get(
+                 "input_audio_transcription"
+             )
+
+         if "turn_detection" in input_audio_config:
+             audio_input_args["turn_detection"] = self._normalize_turn_detection_config(
+                 input_audio_config.get("turn_detection")
+             )
+         elif "turn_detection" in model_settings:
+             audio_input_args["turn_detection"] = self._normalize_turn_detection_config(
+                 model_settings.get("turn_detection")
+             )
+         else:
+             audio_input_args["turn_detection"] = DEFAULT_MODEL_SETTINGS.get("turn_detection")
+
+         requested_voice = output_audio_config.get("voice") if output_audio_config else None
+         audio_output_args["voice"] = requested_voice or model_settings.get(
+             "voice", DEFAULT_MODEL_SETTINGS.get("voice")
+         )
+
+         output_format_source: FormatInput = (
+             output_audio_config.get("format") if output_audio_config else None
+         )
+         if output_format_source is None:
+             if self._call_id:
+                 output_format_source = model_settings.get("output_audio_format")
+             else:
+                 output_format_source = model_settings.get(
+                     "output_audio_format", DEFAULT_MODEL_SETTINGS.get("output_audio_format")
+                 )
+         audio_output_args["format"] = to_realtime_audio_format(output_format_source)
+
+         if "speed" in output_audio_config:
+             audio_output_args["speed"] = output_audio_config.get("speed")
+         elif "speed" in model_settings:
+             audio_output_args["speed"] = model_settings.get("speed")
+
+         output_modalities = (
+             model_settings.get("output_modalities")
+             or model_settings.get("modalities")
+             or DEFAULT_MODEL_SETTINGS.get("modalities")
+         )
+
+         # Construct full session object. `type` will be excluded at serialization time for updates.
+         session_create_request = OpenAISessionCreateRequest(
+             type="realtime",
+             model=(model_settings.get("model_name") or self.model) or "gpt-realtime",
+             output_modalities=output_modalities,
+             audio=OpenAIRealtimeAudioConfig(
+                 input=OpenAIRealtimeAudioInput(**audio_input_args),
+                 output=OpenAIRealtimeAudioOutput(**audio_output_args),
              ),
-             tools=self._tools_to_session_tools(
-                 tools=model_settings.get("tools", []), handoffs=model_settings.get("handoffs", [])
+             tools=cast(
+                 Any,
+                 self._tools_to_session_tools(
+                     tools=model_settings.get("tools", []),
+                     handoffs=model_settings.get("handoffs", []),
+                 ),
              ),
          )

+         if "instructions" in model_settings:
+             session_create_request.instructions = model_settings.get("instructions")
+
+         if "prompt" in model_settings:
+             _passed_prompt: Prompt = model_settings["prompt"]
+             variables: dict[str, Any] | None = _passed_prompt.get("variables")
+             session_create_request.prompt = ResponsePrompt(
+                 id=_passed_prompt["id"],
+                 variables=variables,
+                 version=_passed_prompt.get("version"),
+             )
+
+         if "max_output_tokens" in model_settings:
+             session_create_request.max_output_tokens = cast(
+                 Any, model_settings.get("max_output_tokens")
+             )
+
+         if "tool_choice" in model_settings:
+             session_create_request.tool_choice = cast(Any, model_settings.get("tool_choice"))
+
+         return session_create_request
+
      def _tools_to_session_tools(
          self, tools: list[Tool], handoffs: list[Handoff]
-     ) -> list[OpenAISessionTool]:
-         converted_tools: list[OpenAISessionTool] = []
+     ) -> list[OpenAISessionFunction]:
+         converted_tools: list[OpenAISessionFunction] = []
          for tool in tools:
              if not isinstance(tool, FunctionTool):
                  raise UserError(f"Tool {tool.name} is unsupported. Must be a function tool.")
              converted_tools.append(
-                 OpenAISessionTool(
+                 OpenAISessionFunction(
                      name=tool.name,
                      description=tool.description,
                      parameters=tool.params_json_schema,
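`_get_session_config` no longer emits the flat beta session object: voice, formats, transcription, and turn detection are regrouped under the GA `audio.input` / `audio.output` structure, with the flat legacy keys still honored as fallbacks. Roughly, the same settings before and after (the dict rendering is illustrative; the PCM format shape follows the GA `AudioPCM` type):

```python
# Sketch: flat beta settings vs. the nested GA session this method now builds.
beta_flat = {
    "voice": "ash",
    "modalities": ["audio"],
    "input_audio_format": "pcm16",
    "output_audio_format": "pcm16",
    "turn_detection": {"type": "semantic_vad", "interrupt_response": True},
}

ga_nested = {
    "type": "realtime",
    "model": "gpt-realtime",
    "output_modalities": ["audio"],
    "audio": {
        "input": {
            "format": {"type": "audio/pcm", "rate": 24000},  # per GA AudioPCM
            "turn_detection": beta_flat["turn_detection"],
        },
        "output": {
            "format": {"type": "audio/pcm", "rate": 24000},
            "voice": beta_flat["voice"],
        },
    },
}
```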
@@ -621,7 +1074,7 @@ class OpenAIRealtimeWebSocketModel(RealtimeModel):
              )

          for handoff in handoffs:
              converted_tools.append(
-                 OpenAISessionTool(
+                 OpenAISessionFunction(
                      name=handoff.tool_name,
                      description=handoff.tool_description,
@@ -632,20 +1085,85 @@ class OpenAIRealtimeWebSocketModel(RealtimeModel):
          return converted_tools


+ class OpenAIRealtimeSIPModel(OpenAIRealtimeWebSocketModel):
+     """Realtime model that attaches to SIP-originated calls using a call ID."""
+
+     @staticmethod
+     async def build_initial_session_payload(
+         agent: RealtimeAgent[Any],
+         *,
+         context: TContext | None = None,
+         model_config: RealtimeModelConfig | None = None,
+         run_config: RealtimeRunConfig | None = None,
+         overrides: RealtimeSessionModelSettings | None = None,
+     ) -> OpenAISessionCreateRequest:
+         """Build a session payload that mirrors what a RealtimeSession would send on connect.
+
+         This helper can be used to accept SIP-originated calls by forwarding the returned payload to
+         the Realtime Calls API without duplicating session setup logic.
+         """
+         run_config_settings = (run_config or {}).get("model_settings") or {}
+         initial_model_settings = (model_config or {}).get("initial_model_settings") or {}
+         base_settings: RealtimeSessionModelSettings = {
+             **run_config_settings,
+             **initial_model_settings,
+         }
+
+         context_wrapper = RunContextWrapper(context)
+         merged_settings = await _build_model_settings_from_agent(
+             agent=agent,
+             context_wrapper=context_wrapper,
+             base_settings=base_settings,
+             starting_settings=initial_model_settings,
+             run_config=run_config,
+         )
+
+         if overrides:
+             merged_settings.update(overrides)
+
+         model = OpenAIRealtimeWebSocketModel()
+         return model._get_session_config(merged_settings)
+
+     async def connect(self, options: RealtimeModelConfig) -> None:
+         call_id = options.get("call_id")
+         if not call_id:
+             raise UserError("OpenAIRealtimeSIPModel requires `call_id` in the model configuration.")
+
+         sip_options = options.copy()
+         await super().connect(sip_options)
+
+
  class _ConversionHelper:
      @classmethod
      def conversation_item_to_realtime_message_item(
          cls, item: ConversationItem, previous_item_id: str | None
      ) -> RealtimeMessageItem:
+         if not isinstance(
+             item,
+             (
+                 RealtimeConversationItemUserMessage,
+                 RealtimeConversationItemAssistantMessage,
+                 RealtimeConversationItemSystemMessage,
+             ),
+         ):
+             raise ValueError("Unsupported conversation item type for message conversion.")
+         content: list[dict[str, Any]] = []
+         for each in item.content:
+             c = each.model_dump()
+             if each.type == "output_text":
+                 # For backward-compatibility of assistant message items
+                 c["type"] = "text"
+             elif each.type == "output_audio":
+                 # For backward-compatibility of assistant message items
+                 c["type"] = "audio"
+             content.append(c)
          return TypeAdapter(RealtimeMessageItem).validate_python(
              {
                  "item_id": item.id or "",
                  "previous_item_id": previous_item_id,
                  "type": item.type,
                  "role": item.role,
-                 "content": (
-                     [content.model_dump() for content in item.content] if item.content else []
-                 ),
+                 "content": content,
                  "status": "in_progress",
              },
          )
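A usage sketch for the new SIP entry points. The webhook wiring and the agent definition are assumptions for illustration; only `build_initial_session_payload` and `connect` come from this diff:

```python
# Sketch: accepting a SIP-originated call. Hypothetical plumbing around two
# calls introduced in this diff.
from typing import Any

from agents.realtime.agent import RealtimeAgent
from agents.realtime.openai_realtime import OpenAIRealtimeSIPModel


async def accept_sip_call(call_id: str) -> None:
    agent: RealtimeAgent[Any] = RealtimeAgent(
        name="support",
        instructions="Help the caller.",
    )
    # Mirrors what a RealtimeSession would send on connect; forward this
    # payload to the Realtime Calls accept endpoint (not shown here).
    payload = await OpenAIRealtimeSIPModel.build_initial_session_payload(agent)
    print(payload.model_dump_json(exclude_unset=True))

    model = OpenAIRealtimeSIPModel()
    await model.connect({"api_key": "sk-...", "call_id": call_id})
```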
@@ -665,12 +1183,12 @@ class _ConversionHelper:
      @classmethod
      def convert_tracing_config(
          cls, tracing_config: RealtimeModelTracingConfig | Literal["auto"] | None
-     ) -> OpenAISessionTracing | None:
+     ) -> OpenAITracingConfiguration | Literal["auto"] | None:
          if tracing_config is None:
              return None
          elif tracing_config == "auto":
              return "auto"
-         return OpenAISessionTracingConfiguration(
+         return OpenAITracingConfiguration(
              group_id=tracing_config.get("group_id"),
              metadata=tracing_config.get("metadata"),
              workflow_name=tracing_config.get("workflow_name"),
@@ -683,22 +1201,53 @@ class _ConversionHelper:
          user_input = event.user_input

          if isinstance(user_input, dict):
-             return OpenAIConversationItem(
+             content: list[Content] = []
+             for item in user_input.get("content", []):
+                 try:
+                     if not isinstance(item, dict):
+                         continue
+                     t = item.get("type")
+                     if t == "input_text":
+                         _txt = item.get("text")
+                         text_val = _txt if isinstance(_txt, str) else None
+                         content.append(Content(type="input_text", text=text_val))
+                     elif t == "input_image":
+                         iu = item.get("image_url")
+                         if isinstance(iu, str) and iu:
+                             d = item.get("detail")
+                             detail_val = cast(
+                                 Literal["auto", "low", "high"] | None,
+                                 d if isinstance(d, str) and d in ("auto", "low", "high") else None,
+                             )
+                             if detail_val is None:
+                                 content.append(
+                                     Content(
+                                         type="input_image",
+                                         image_url=iu,
+                                     )
+                                 )
+                             else:
+                                 content.append(
+                                     Content(
+                                         type="input_image",
+                                         image_url=iu,
+                                         detail=detail_val,
+                                     )
+                                 )
+                     # ignore unknown types for forward-compat
+                 except Exception:
+                     # best-effort; skip malformed parts
+                     continue
+             return RealtimeConversationItemUserMessage(
                  type="message",
                  role="user",
-                 content=[
-                     OpenAIConversationItemContent(
-                         type="input_text",
-                         text=item.get("text"),
-                     )
-                     for item in user_input.get("content", [])
-                 ],
+                 content=content,
              )
          else:
-             return OpenAIConversationItem(
+             return RealtimeConversationItemUserMessage(
                  type="message",
                  role="user",
-                 content=[OpenAIConversationItemContent(type="input_text", text=user_input)],
+                 content=[Content(type="input_text", text=user_input)],
              )

      @classmethod
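`convert_user_input_to_item_create` previously assumed text-only dict content; it now tolerates structured parts, including `input_image`, and silently skips anything it cannot interpret. The payload shapes it accepts after this change:

```python
# Sketch: inputs the converter now handles. A bare string still works; dicts
# may mix text and image parts, and unknown part types are ignored.
plain_input = "What is on this slide?"

structured_input = {
    "type": "message",
    "role": "user",
    "content": [
        {"type": "input_text", "text": "What is on this slide?"},
        {
            "type": "input_image",
            "image_url": "https://example.com/slide.png",
            "detail": "low",  # optional; must be "auto", "low", or "high"
        },
        {"type": "something_new"},  # unknown type: skipped for forward-compat
    ],
}
```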
@@ -724,7 +1273,7 @@ class _ConversionHelper:
      def convert_tool_output(cls, event: RealtimeModelSendToolOutput) -> OpenAIRealtimeClientEvent:
          return OpenAIConversationItemCreateEvent(
              type="conversation.item.create",
-             item=OpenAIConversationItem(
+             item=RealtimeConversationItemFunctionCallOutput(
                  type="function_call_output",
                  output=event.output,
                  call_id=event.tool_call.call_id,