openai-agents 0.2.8__py3-none-any.whl → 0.6.8__py3-none-any.whl
This diff compares the contents of two publicly released versions of the package as published to a supported registry. It is provided for informational purposes only and reflects the changes between the package versions as they appear in their public registry.
- agents/__init__.py +105 -4
- agents/_debug.py +15 -4
- agents/_run_impl.py +1203 -96
- agents/agent.py +164 -19
- agents/apply_diff.py +329 -0
- agents/editor.py +47 -0
- agents/exceptions.py +35 -0
- agents/extensions/experimental/__init__.py +6 -0
- agents/extensions/experimental/codex/__init__.py +92 -0
- agents/extensions/experimental/codex/codex.py +89 -0
- agents/extensions/experimental/codex/codex_options.py +35 -0
- agents/extensions/experimental/codex/codex_tool.py +1142 -0
- agents/extensions/experimental/codex/events.py +162 -0
- agents/extensions/experimental/codex/exec.py +263 -0
- agents/extensions/experimental/codex/items.py +245 -0
- agents/extensions/experimental/codex/output_schema_file.py +50 -0
- agents/extensions/experimental/codex/payloads.py +31 -0
- agents/extensions/experimental/codex/thread.py +214 -0
- agents/extensions/experimental/codex/thread_options.py +54 -0
- agents/extensions/experimental/codex/turn_options.py +36 -0
- agents/extensions/handoff_filters.py +13 -1
- agents/extensions/memory/__init__.py +120 -0
- agents/extensions/memory/advanced_sqlite_session.py +1285 -0
- agents/extensions/memory/async_sqlite_session.py +239 -0
- agents/extensions/memory/dapr_session.py +423 -0
- agents/extensions/memory/encrypt_session.py +185 -0
- agents/extensions/memory/redis_session.py +261 -0
- agents/extensions/memory/sqlalchemy_session.py +334 -0
- agents/extensions/models/litellm_model.py +449 -36
- agents/extensions/models/litellm_provider.py +3 -1
- agents/function_schema.py +47 -5
- agents/guardrail.py +16 -2
- agents/{handoffs.py → handoffs/__init__.py} +89 -47
- agents/handoffs/history.py +268 -0
- agents/items.py +237 -11
- agents/lifecycle.py +75 -14
- agents/mcp/server.py +280 -37
- agents/mcp/util.py +24 -3
- agents/memory/__init__.py +22 -2
- agents/memory/openai_conversations_session.py +91 -0
- agents/memory/openai_responses_compaction_session.py +249 -0
- agents/memory/session.py +19 -261
- agents/memory/sqlite_session.py +275 -0
- agents/memory/util.py +20 -0
- agents/model_settings.py +14 -3
- agents/models/__init__.py +13 -0
- agents/models/chatcmpl_converter.py +303 -50
- agents/models/chatcmpl_helpers.py +63 -0
- agents/models/chatcmpl_stream_handler.py +290 -68
- agents/models/default_models.py +58 -0
- agents/models/interface.py +4 -0
- agents/models/openai_chatcompletions.py +103 -49
- agents/models/openai_provider.py +10 -4
- agents/models/openai_responses.py +162 -46
- agents/realtime/__init__.py +4 -0
- agents/realtime/_util.py +14 -3
- agents/realtime/agent.py +7 -0
- agents/realtime/audio_formats.py +53 -0
- agents/realtime/config.py +78 -10
- agents/realtime/events.py +18 -0
- agents/realtime/handoffs.py +2 -2
- agents/realtime/items.py +17 -1
- agents/realtime/model.py +13 -0
- agents/realtime/model_events.py +12 -0
- agents/realtime/model_inputs.py +18 -1
- agents/realtime/openai_realtime.py +696 -150
- agents/realtime/session.py +243 -23
- agents/repl.py +7 -3
- agents/result.py +197 -38
- agents/run.py +949 -168
- agents/run_context.py +13 -2
- agents/stream_events.py +1 -0
- agents/strict_schema.py +14 -0
- agents/tool.py +413 -15
- agents/tool_context.py +22 -1
- agents/tool_guardrails.py +279 -0
- agents/tracing/__init__.py +2 -0
- agents/tracing/config.py +9 -0
- agents/tracing/create.py +4 -0
- agents/tracing/processor_interface.py +84 -11
- agents/tracing/processors.py +65 -54
- agents/tracing/provider.py +64 -7
- agents/tracing/spans.py +105 -0
- agents/tracing/traces.py +116 -16
- agents/usage.py +134 -12
- agents/util/_json.py +19 -1
- agents/util/_transforms.py +12 -2
- agents/voice/input.py +5 -4
- agents/voice/models/openai_stt.py +17 -9
- agents/voice/pipeline.py +2 -0
- agents/voice/pipeline_config.py +4 -0
- {openai_agents-0.2.8.dist-info → openai_agents-0.6.8.dist-info}/METADATA +44 -19
- openai_agents-0.6.8.dist-info/RECORD +134 -0
- {openai_agents-0.2.8.dist-info → openai_agents-0.6.8.dist-info}/WHEEL +1 -1
- openai_agents-0.2.8.dist-info/RECORD +0 -103
- {openai_agents-0.2.8.dist-info → openai_agents-0.6.8.dist-info}/licenses/LICENSE +0 -0
```diff
--- a/agents/realtime/openai_realtime.py
+++ b/agents/realtime/openai_realtime.py
@@ -5,69 +5,101 @@ import base64
 import inspect
 import json
 import os
+from collections.abc import Mapping
 from datetime import datetime
-from typing import Any, Callable, Literal
+from typing import Annotated, Any, Callable, Literal, Union, cast
 
 import pydantic
 import websockets
-from openai.types.beta.realtime.conversation_item import (
+from openai.types.realtime import realtime_audio_config as _rt_audio_config
+from openai.types.realtime.conversation_item import (
     ConversationItem,
     ConversationItem as OpenAIConversationItem,
 )
-from openai.types.beta.realtime.conversation_item_content import (
-    ConversationItemContent as OpenAIConversationItemContent,
-)
-from openai.types.beta.realtime.conversation_item_create_event import (
+from openai.types.realtime.conversation_item_create_event import (
     ConversationItemCreateEvent as OpenAIConversationItemCreateEvent,
 )
-from openai.types.beta.realtime.conversation_item_retrieve_event import (
+from openai.types.realtime.conversation_item_retrieve_event import (
     ConversationItemRetrieveEvent as OpenAIConversationItemRetrieveEvent,
 )
-from openai.types.beta.realtime.conversation_item_truncate_event import (
+from openai.types.realtime.conversation_item_truncate_event import (
     ConversationItemTruncateEvent as OpenAIConversationItemTruncateEvent,
 )
-from openai.types.beta.realtime.input_audio_buffer_append_event import (
+from openai.types.realtime.input_audio_buffer_append_event import (
     InputAudioBufferAppendEvent as OpenAIInputAudioBufferAppendEvent,
 )
-from openai.types.beta.realtime.input_audio_buffer_commit_event import (
+from openai.types.realtime.input_audio_buffer_commit_event import (
     InputAudioBufferCommitEvent as OpenAIInputAudioBufferCommitEvent,
 )
-from openai.types.beta.realtime.realtime_client_event import (
+from openai.types.realtime.realtime_audio_formats import (
+    AudioPCM,
+    AudioPCMA,
+    AudioPCMU,
+)
+from openai.types.realtime.realtime_client_event import (
     RealtimeClientEvent as OpenAIRealtimeClientEvent,
 )
-from openai.types.beta.realtime.realtime_server_event import (
+from openai.types.realtime.realtime_conversation_item_assistant_message import (
+    RealtimeConversationItemAssistantMessage,
+)
+from openai.types.realtime.realtime_conversation_item_function_call_output import (
+    RealtimeConversationItemFunctionCallOutput,
+)
+from openai.types.realtime.realtime_conversation_item_system_message import (
+    RealtimeConversationItemSystemMessage,
+)
+from openai.types.realtime.realtime_conversation_item_user_message import (
+    Content,
+    RealtimeConversationItemUserMessage,
+)
+from openai.types.realtime.realtime_function_tool import (
+    RealtimeFunctionTool as OpenAISessionFunction,
+)
+from openai.types.realtime.realtime_server_event import (
     RealtimeServerEvent as OpenAIRealtimeServerEvent,
 )
-from openai.types.beta.realtime.response_audio_delta_event import ResponseAudioDeltaEvent
-from openai.types.beta.realtime.response_cancel_event import (
+from openai.types.realtime.realtime_session_create_request import (
+    RealtimeSessionCreateRequest as OpenAISessionCreateRequest,
+)
+from openai.types.realtime.realtime_tracing_config import (
+    TracingConfiguration as OpenAITracingConfiguration,
+)
+from openai.types.realtime.realtime_transcription_session_create_request import (
+    RealtimeTranscriptionSessionCreateRequest as OpenAIRealtimeTranscriptionSessionCreateRequest,
+)
+from openai.types.realtime.response_audio_delta_event import ResponseAudioDeltaEvent
+from openai.types.realtime.response_cancel_event import (
     ResponseCancelEvent as OpenAIResponseCancelEvent,
 )
-from openai.types.beta.realtime.response_create_event import (
+from openai.types.realtime.response_create_event import (
     ResponseCreateEvent as OpenAIResponseCreateEvent,
 )
-from openai.types.beta.realtime.session_update_event import (
-    Session as OpenAISessionObject,
-    SessionTool as OpenAISessionTool,
-    SessionTracing as OpenAISessionTracing,
-    SessionTracingTracingConfiguration as OpenAISessionTracingConfiguration,
+from openai.types.realtime.session_update_event import (
     SessionUpdateEvent as OpenAISessionUpdateEvent,
 )
-from pydantic import TypeAdapter
-from typing_extensions import assert_never
+from openai.types.responses.response_prompt import ResponsePrompt
+from pydantic import Field, TypeAdapter
+from typing_extensions import TypeAlias, assert_never
 from websockets.asyncio.client import ClientConnection
 
 from agents.handoffs import Handoff
+from agents.prompts import Prompt
 from agents.realtime._default_tracker import ModelAudioTracker
+from agents.realtime.audio_formats import to_realtime_audio_format
 from agents.tool import FunctionTool, Tool
 from agents.util._types import MaybeAwaitable
 
 from ..exceptions import UserError
 from ..logger import logger
+from ..run_context import RunContextWrapper, TContext
 from ..version import __version__
+from .agent import RealtimeAgent
 from .config import (
     RealtimeModelTracingConfig,
+    RealtimeRunConfig,
     RealtimeSessionModelSettings,
 )
+from .handoffs import realtime_handoff
 from .items import RealtimeMessageItem, RealtimeToolCallItem
 from .model import (
     RealtimeModel,
@@ -83,6 +115,7 @@ from .model_events import (
     RealtimeModelErrorEvent,
     RealtimeModelEvent,
     RealtimeModelExceptionEvent,
+    RealtimeModelInputAudioTimeoutTriggeredEvent,
     RealtimeModelInputAudioTranscriptionCompletedEvent,
     RealtimeModelItemDeletedEvent,
     RealtimeModelItemUpdatedEvent,
@@ -102,17 +135,33 @@ from .model_inputs import (
     RealtimeModelSendUserInput,
 )
 
+FormatInput: TypeAlias = Union[
+    str,
+    AudioPCM,
+    AudioPCMU,
+    AudioPCMA,
+    Mapping[str, Any],
+    None,
+]
+
+
+# Avoid direct imports of non-exported names by referencing via module
+OpenAIRealtimeAudioConfig = _rt_audio_config.RealtimeAudioConfig
+OpenAIRealtimeAudioInput = _rt_audio_config.RealtimeAudioConfigInput  # type: ignore[attr-defined]
+OpenAIRealtimeAudioOutput = _rt_audio_config.RealtimeAudioConfigOutput  # type: ignore[attr-defined]
+
+
 _USER_AGENT = f"Agents/Python {__version__}"
 
 DEFAULT_MODEL_SETTINGS: RealtimeSessionModelSettings = {
     "voice": "ash",
-    "modalities": ["text", "audio"],
+    "modalities": ["audio"],
     "input_audio_format": "pcm16",
     "output_audio_format": "pcm16",
     "input_audio_transcription": {
         "model": "gpt-4o-mini-transcribe",
     },
-    "turn_detection": {"type": "semantic_vad"},
+    "turn_detection": {"type": "semantic_vad", "interrupt_response": True},
 }
 
 
@@ -128,11 +177,85 @@ async def get_api_key(key: str | Callable[[], MaybeAwaitable[str]] | None) -> st
     return os.getenv("OPENAI_API_KEY")
 
 
+AllRealtimeServerEvents = Annotated[
+    Union[OpenAIRealtimeServerEvent,],
+    Field(discriminator="type"),
+]
+
+ServerEventTypeAdapter: TypeAdapter[AllRealtimeServerEvents] | None = None
+
+
+def get_server_event_type_adapter() -> TypeAdapter[AllRealtimeServerEvents]:
+    global ServerEventTypeAdapter
+    if not ServerEventTypeAdapter:
+        ServerEventTypeAdapter = TypeAdapter(AllRealtimeServerEvents)
+    return ServerEventTypeAdapter
+
+
+async def _collect_enabled_handoffs(
+    agent: RealtimeAgent[Any], context_wrapper: RunContextWrapper[Any]
+) -> list[Handoff[Any, RealtimeAgent[Any]]]:
+    handoffs: list[Handoff[Any, RealtimeAgent[Any]]] = []
+    for handoff_item in agent.handoffs:
+        if isinstance(handoff_item, Handoff):
+            handoffs.append(handoff_item)
+        elif isinstance(handoff_item, RealtimeAgent):
+            handoffs.append(realtime_handoff(handoff_item))
+
+    async def _check_handoff_enabled(handoff_obj: Handoff[Any, RealtimeAgent[Any]]) -> bool:
+        attr = handoff_obj.is_enabled
+        if isinstance(attr, bool):
+            return attr
+        res = attr(context_wrapper, agent)
+        if inspect.isawaitable(res):
+            return await res
+        return res
+
+    results = await asyncio.gather(*(_check_handoff_enabled(h) for h in handoffs))
+    return [h for h, ok in zip(handoffs, results) if ok]
+
+
+async def _build_model_settings_from_agent(
+    *,
+    agent: RealtimeAgent[Any],
+    context_wrapper: RunContextWrapper[Any],
+    base_settings: RealtimeSessionModelSettings,
+    starting_settings: RealtimeSessionModelSettings | None,
+    run_config: RealtimeRunConfig | None,
+) -> RealtimeSessionModelSettings:
+    updated_settings = base_settings.copy()
+
+    if agent.prompt is not None:
+        updated_settings["prompt"] = agent.prompt
+
+    instructions, tools, handoffs = await asyncio.gather(
+        agent.get_system_prompt(context_wrapper),
+        agent.get_all_tools(context_wrapper),
+        _collect_enabled_handoffs(agent, context_wrapper),
+    )
+    updated_settings["instructions"] = instructions or ""
+    updated_settings["tools"] = tools or []
+    updated_settings["handoffs"] = handoffs or []
+
+    if starting_settings:
+        updated_settings.update(starting_settings)
+
+    if run_config and run_config.get("tracing_disabled", False):
+        updated_settings["tracing"] = None
+
+    return updated_settings
+
+
+# Note: Avoid a module-level union alias for Python 3.9 compatibility.
+# Using a union at runtime (e.g., A | B) in a type alias triggers evaluation
+# during import on 3.9. We instead inline the union in annotations below.
+
+
 class OpenAIRealtimeWebSocketModel(RealtimeModel):
     """A model that uses OpenAI's WebSocket API."""
 
     def __init__(self) -> None:
-        self.model = "gpt-4o-realtime-preview"  # Default model
+        self.model = "gpt-realtime"  # Default model
         self._websocket: ClientConnection | None = None
         self._websocket_task: asyncio.Task[None] | None = None
         self._listeners: list[RealtimeModelListener] = []
@@ -141,7 +264,9 @@ class OpenAIRealtimeWebSocketModel(RealtimeModel):
         self._ongoing_response: bool = False
         self._tracing_config: RealtimeModelTracingConfig | Literal["auto"] | None = None
         self._playback_tracker: RealtimePlaybackTracker | None = None
-        self._created_session: OpenAISessionObject | None = None
+        self._created_session: OpenAISessionCreateRequest | None = None
+        self._server_event_type_adapter = get_server_event_type_adapter()
+        self._call_id: str | None = None
 
     async def connect(self, options: RealtimeModelConfig) -> None:
         """Establish a connection to the model and keep it alive."""
@@ -152,7 +277,19 @@ class OpenAIRealtimeWebSocketModel(RealtimeModel):
 
         self._playback_tracker = options.get("playback_tracker", None)
 
-        self.model = model_settings.get("model_name", self.model)
+        call_id = options.get("call_id")
+        model_name = model_settings.get("model_name")
+        if call_id and model_name:
+            error_message = (
+                "Cannot specify both `call_id` and `model_name` "
+                "when attaching to an existing realtime call."
+            )
+            raise UserError(error_message)
+
+        if model_name:
+            self.model = model_name
+
+        self._call_id = call_id
         api_key = await get_api_key(options.get("api_key"))
 
         if "tracing" in model_settings:
@@ -160,15 +297,21 @@ class OpenAIRealtimeWebSocketModel(RealtimeModel):
         else:
             self._tracing_config = "auto"
 
-        if not api_key:
-            raise UserError("API key is required but was not provided.")
+        if call_id:
+            url = options.get("url", f"wss://api.openai.com/v1/realtime?call_id={call_id}")
+        else:
+            url = options.get("url", f"wss://api.openai.com/v1/realtime?model={self.model}")
 
-        url = options.get("url", f"wss://api.openai.com/v1/realtime?model={self.model}")
+        headers: dict[str, str] = {}
+        if options.get("headers") is not None:
+            # For customizing request headers
+            headers.update(options["headers"])
+        else:
+            # OpenAI's Realtime API
+            if not api_key:
+                raise UserError("API key is required but was not provided.")
 
-        headers = {
-            "Authorization": f"Bearer {api_key}",
-            "OpenAI-Beta": "realtime=v1",
-        }
+            headers.update({"Authorization": f"Bearer {api_key}"})
         self._websocket = await websockets.connect(
             url,
             user_agent_header=_USER_AGENT,
@@ -186,7 +329,11 @@ class OpenAIRealtimeWebSocketModel(RealtimeModel):
         converted_tracing_config = _ConversionHelper.convert_tracing_config(tracing_config)
         await self._send_raw_message(
             OpenAISessionUpdateEvent(
-                session=OpenAISessionObject(tracing=converted_tracing_config),
+                session=OpenAISessionCreateRequest(
+                    model=self.model,
+                    type="realtime",
+                    tracing=converted_tracing_config,
+                ),
                 type="session.update",
             )
         )
@@ -203,7 +350,8 @@ class OpenAIRealtimeWebSocketModel(RealtimeModel):
 
     async def _emit_event(self, event: RealtimeModelEvent) -> None:
         """Emit an event to the listeners."""
-        for listener in self._listeners:
+        # Copy list to avoid modification during iteration
+        for listener in list(self._listeners):
             await listener.on_event(event)
 
     async def _listen_for_messages(self):
@@ -268,8 +416,8 @@ class OpenAIRealtimeWebSocketModel(RealtimeModel):
     async def _send_raw_message(self, event: OpenAIRealtimeClientEvent) -> None:
         """Send a raw message to the model."""
         assert self._websocket is not None, "Not connected"
-
-        await self._websocket.send(event.model_dump_json(exclude_none=True, exclude_unset=True))
+        payload = event.model_dump_json(exclude_unset=True)
+        await self._websocket.send(payload)
 
     async def _send_user_input(self, event: RealtimeModelSendUserInput) -> None:
         converted = _ConversionHelper.convert_user_input_to_item_create(event)
@@ -331,6 +479,7 @@ class OpenAIRealtimeWebSocketModel(RealtimeModel):
         current_item_id = playback_state.get("current_item_id")
         current_item_content_index = playback_state.get("current_item_content_index")
         elapsed_ms = playback_state.get("elapsed_ms")
+
         if current_item_id is None or elapsed_ms is None:
             logger.debug(
                 "Skipping interrupt. "
@@ -338,41 +487,47 @@ class OpenAIRealtimeWebSocketModel(RealtimeModel):
                 f"elapsed ms: {elapsed_ms}, "
                 f"content index: {current_item_content_index}"
             )
-            return
-
-        current_item_content_index = current_item_content_index or 0
-        if elapsed_ms > 0:
-            await self._emit_event(
-                RealtimeModelAudioInterruptedEvent(
-                    item_id=current_item_id,
-                    content_index=current_item_content_index,
-                )
-            )
-            converted = _ConversionHelper.convert_interrupt(
-                current_item_id,
-                current_item_content_index,
-                int(elapsed_ms),
-            )
-            await self._send_raw_message(converted)
         else:
-            logger.debug(
-                "Didn't interrupt bc elapsed ms is < 0. "
-                f"Item id: {current_item_id}, "
-                f"elapsed ms: {elapsed_ms}, "
-                f"content index: {current_item_content_index}"
-            )
+            current_item_content_index = current_item_content_index or 0
+            if elapsed_ms > 0:
+                await self._emit_event(
+                    RealtimeModelAudioInterruptedEvent(
+                        item_id=current_item_id,
+                        content_index=current_item_content_index,
+                    )
+                )
+                converted = _ConversionHelper.convert_interrupt(
+                    current_item_id,
+                    current_item_content_index,
+                    int(elapsed_ms),
+                )
+                await self._send_raw_message(converted)
+            else:
+                logger.debug(
+                    "Didn't interrupt bc elapsed ms is < 0. "
+                    f"Item id: {current_item_id}, "
+                    f"elapsed ms: {elapsed_ms}, "
+                    f"content index: {current_item_content_index}"
+                )
 
+        session = self._created_session
         automatic_response_cancellation_enabled = (
-            self._created_session
-            and self._created_session.turn_detection
-            and self._created_session.turn_detection.interrupt_response
+            session
+            and session.audio is not None
+            and session.audio.input is not None
+            and session.audio.input.turn_detection is not None
+            and session.audio.input.turn_detection.interrupt_response is True
+        )
+        should_cancel_response = event.force_response_cancel or (
+            not automatic_response_cancellation_enabled
         )
-        if not automatic_response_cancellation_enabled:
+        if should_cancel_response:
             await self._cancel_response()
 
-        self._audio_state_tracker.on_interrupted()
-        if self._playback_tracker:
-            self._playback_tracker.on_interrupted()
+        if current_item_id is not None and elapsed_ms is not None:
+            self._audio_state_tracker.on_interrupted()
+            if self._playback_tracker:
+                self._playback_tracker.on_interrupted()
 
     async def _send_session_update(self, event: RealtimeModelSendSessionUpdate) -> None:
         """Send a session update to the model."""
@@ -450,6 +605,10 @@ class OpenAIRealtimeWebSocketModel(RealtimeModel):
             self._websocket = None
         if self._websocket_task:
             self._websocket_task.cancel()
+            try:
+                await self._websocket_task
+            except asyncio.CancelledError:
+                pass
             self._websocket_task = None
 
     async def _cancel_response(self) -> None:
@@ -459,42 +618,121 @@ class OpenAIRealtimeWebSocketModel(RealtimeModel):
 
     async def _handle_ws_event(self, event: dict[str, Any]):
         await self._emit_event(RealtimeModelRawServerEvent(data=event))
+        # The public interface definedo on this Agents SDK side (e.g., RealtimeMessageItem)
+        # must be the same even after the GA migration, so this part does the conversion
+        if isinstance(event, dict) and event.get("type") in (
+            "response.output_item.added",
+            "response.output_item.done",
+        ):
+            item = event.get("item")
+            if isinstance(item, dict) and item.get("type") == "message":
+                raw_content = item.get("content") or []
+                converted_content: list[dict[str, Any]] = []
+                for part in raw_content:
+                    if not isinstance(part, dict):
+                        continue
+                    if part.get("type") == "audio":
+                        converted_content.append(
+                            {
+                                "type": "audio",
+                                "audio": part.get("audio"),
+                                "transcript": part.get("transcript"),
+                            }
+                        )
+                    elif part.get("type") in ("text", "output_text"):
+                        converted_content.append({"type": "text", "text": part.get("text")})
+                status = item.get("status")
+                if status not in ("in_progress", "completed", "incomplete"):
+                    is_done = event.get("type") == "response.output_item.done"
+                    status = "completed" if is_done else "in_progress"
+                # Explicitly type the adapter for mypy
+                type_adapter: TypeAdapter[RealtimeMessageItem] = TypeAdapter(RealtimeMessageItem)
+                message_item: RealtimeMessageItem = type_adapter.validate_python(
+                    {
+                        "item_id": item.get("id", ""),
+                        "type": "message",
+                        "role": item.get("role", "assistant"),
+                        "content": converted_content,
+                        "status": status,
+                    }
+                )
+                await self._emit_event(RealtimeModelItemUpdatedEvent(item=message_item))
+                return
+
         try:
             if "previous_item_id" in event and event["previous_item_id"] is None:
                 event["previous_item_id"] = ""  # TODO (rm) remove
-            parsed: OpenAIRealtimeServerEvent = TypeAdapter(
-                OpenAIRealtimeServerEvent
-            ).validate_python(event)
+            parsed: AllRealtimeServerEvents = self._server_event_type_adapter.validate_python(event)
         except pydantic.ValidationError as e:
             logger.error(f"Failed to validate server event: {event}", exc_info=True)
-            await self._emit_event(
-                RealtimeModelErrorEvent(
-                    error=e,
-                )
-            )
+            await self._emit_event(RealtimeModelErrorEvent(error=e))
             return
         except Exception as e:
             event_type = event.get("type", "unknown") if isinstance(event, dict) else "unknown"
             logger.error(f"Failed to validate server event: {event}", exc_info=True)
-            await self._emit_event(
-                RealtimeModelExceptionEvent(
-                    exception=e,
-                    context=f"Failed to validate server event: {event_type}",
-                )
+            exception_event = RealtimeModelExceptionEvent(
+                exception=e,
+                context=f"Failed to validate server event: {event_type}",
             )
+            await self._emit_event(exception_event)
             return
 
-        if parsed.type == "response.audio.delta":
+        if parsed.type == "response.output_audio.delta":
             await self._handle_audio_delta(parsed)
-        elif parsed.type == "response.audio.done":
-            await self._emit_event(
-                RealtimeModelAudioDoneEvent(
-                    item_id=parsed.item_id,
-                    content_index=parsed.content_index,
-                )
+        elif parsed.type == "response.output_audio.done":
+            audio_done_event = RealtimeModelAudioDoneEvent(
+                item_id=parsed.item_id,
+                content_index=parsed.content_index,
             )
+            await self._emit_event(audio_done_event)
         elif parsed.type == "input_audio_buffer.speech_started":
-            await self._send_interrupt(RealtimeModelSendInterrupt())
+            # On VAD speech start, immediately stop local playback so the user can
+            # barge‑in without overlapping assistant audio.
+            last_audio = self._audio_state_tracker.get_last_audio_item()
+            if last_audio is not None:
+                item_id, content_index = last_audio
+                playback_state = self._get_playback_state()
+                playback_item_id = playback_state.get("current_item_id")
+                playback_content_index = playback_state.get("current_item_content_index") or 0
+                playback_elapsed_ms = playback_state.get("elapsed_ms")
+                await self._emit_event(
+                    RealtimeModelAudioInterruptedEvent(item_id=item_id, content_index=content_index)
+                )
+
+                elapsed_override = getattr(parsed, "audio_end_ms", None)
+                if elapsed_override is None or elapsed_override <= 0:
+                    effective_elapsed_ms = playback_elapsed_ms
+                else:
+                    effective_elapsed_ms = float(elapsed_override)
+
+                if playback_item_id and effective_elapsed_ms is not None:
+                    truncated_ms = max(int(round(effective_elapsed_ms)), 0)
+                    await self._send_raw_message(
+                        _ConversionHelper.convert_interrupt(
+                            playback_item_id,
+                            playback_content_index,
+                            truncated_ms,
+                        )
+                    )
+
+                # Reset trackers so subsequent playback state queries don't
+                # reference audio that has been interrupted client‑side.
+                self._audio_state_tracker.on_interrupted()
+                if self._playback_tracker:
+                    self._playback_tracker.on_interrupted()
+
+                # If server isn't configured to auto‑interrupt/cancel, cancel the
+                # response to prevent further audio.
+                session = self._created_session
+                automatic_response_cancellation_enabled = (
+                    session
+                    and session.audio is not None
+                    and session.audio.input is not None
+                    and session.audio.input.turn_detection is not None
+                    and session.audio.input.turn_detection.interrupt_response is True
+                )
+                if not automatic_response_cancellation_enabled:
+                    await self._cancel_response()
         elif parsed.type == "response.created":
             self._ongoing_response = True
             await self._emit_event(RealtimeModelTurnStartedEvent())
@@ -503,15 +741,16 @@ class OpenAIRealtimeWebSocketModel(RealtimeModel):
             await self._emit_event(RealtimeModelTurnEndedEvent())
         elif parsed.type == "session.created":
             await self._send_tracing_config(self._tracing_config)
-            self._update_created_session(parsed.session)
+            self._update_created_session(parsed.session)
         elif parsed.type == "session.updated":
-            self._update_created_session(parsed.session)
+            self._update_created_session(parsed.session)
         elif parsed.type == "error":
             await self._emit_event(RealtimeModelErrorEvent(error=parsed.error))
         elif parsed.type == "conversation.item.deleted":
             await self._emit_event(RealtimeModelItemDeletedEvent(item_id=parsed.item_id))
         elif (
-            parsed.type == "conversation.item.created"
+            parsed.type == "conversation.item.added"
+            or parsed.type == "conversation.item.created"
             or parsed.type == "conversation.item.retrieved"
         ):
             previous_item_id = (
@@ -536,7 +775,7 @@ class OpenAIRealtimeWebSocketModel(RealtimeModel):
                     item_id=parsed.item_id, transcript=parsed.transcript
                 )
             )
-        elif parsed.type == "response.audio_transcript.delta":
+        elif parsed.type == "response.output_audio_transcript.delta":
             await self._emit_event(
                 RealtimeModelTranscriptDeltaEvent(
                     item_id=parsed.item_id, delta=parsed.delta, response_id=parsed.response_id
@@ -544,7 +783,7 @@ class OpenAIRealtimeWebSocketModel(RealtimeModel):
             )
         elif (
             parsed.type == "conversation.item.input_audio_transcription.delta"
-            or parsed.type == "response.text.delta"
+            or parsed.type == "response.output_text.delta"
             or parsed.type == "response.function_call_arguments.delta"
         ):
             # No support for partials yet
@@ -554,13 +793,137 @@ class OpenAIRealtimeWebSocketModel(RealtimeModel):
             or parsed.type == "response.output_item.done"
         ):
             await self._handle_output_item(parsed.item)
+        elif parsed.type == "input_audio_buffer.timeout_triggered":
+            await self._emit_event(
+                RealtimeModelInputAudioTimeoutTriggeredEvent(
+                    item_id=parsed.item_id,
+                    audio_start_ms=parsed.audio_start_ms,
+                    audio_end_ms=parsed.audio_end_ms,
+                )
+            )
 
-    def _update_created_session(self, session: OpenAISessionObject) -> None:
-        self._created_session = session
-        if session.output_audio_format:
-            self._audio_state_tracker.set_audio_format(session.output_audio_format)
-            if self._playback_tracker:
-                self._playback_tracker.set_audio_format(session.output_audio_format)
+    def _update_created_session(
+        self,
+        session: OpenAISessionCreateRequest
+        | OpenAIRealtimeTranscriptionSessionCreateRequest
+        | Mapping[str, object]
+        | pydantic.BaseModel,
+    ) -> None:
+        # Only store/playback-format information for realtime sessions (not transcription-only)
+        normalized_session = self._normalize_session_payload(session)
+        if not normalized_session:
+            return
+
+        self._created_session = normalized_session
+        normalized_format = self._extract_audio_format(normalized_session)
+        if normalized_format is None:
+            return
+
+        self._audio_state_tracker.set_audio_format(normalized_format)
+        if self._playback_tracker:
+            self._playback_tracker.set_audio_format(normalized_format)
+
+    @staticmethod
+    def _normalize_session_payload(
+        session: OpenAISessionCreateRequest
+        | OpenAIRealtimeTranscriptionSessionCreateRequest
+        | Mapping[str, object]
+        | pydantic.BaseModel,
+    ) -> OpenAISessionCreateRequest | None:
+        if isinstance(session, OpenAISessionCreateRequest):
+            return session
+
+        if isinstance(session, OpenAIRealtimeTranscriptionSessionCreateRequest):
+            return None
+
+        session_payload: Mapping[str, object]
+        if isinstance(session, pydantic.BaseModel):
+            session_payload = cast(Mapping[str, object], session.model_dump())
+        elif isinstance(session, Mapping):
+            session_payload = session
+        else:
+            return None
+
+        if OpenAIRealtimeWebSocketModel._is_transcription_session(session_payload):
+            return None
+
+        try:
+            return OpenAISessionCreateRequest.model_validate(session_payload)
+        except pydantic.ValidationError:
+            return None
+
+    @staticmethod
+    def _is_transcription_session(payload: Mapping[str, object]) -> bool:
+        try:
+            OpenAIRealtimeTranscriptionSessionCreateRequest.model_validate(payload)
+        except pydantic.ValidationError:
+            return False
+        else:
+            return True
+
+    @staticmethod
+    def _extract_audio_format(session: OpenAISessionCreateRequest) -> str | None:
+        audio = session.audio
+        if not audio or not audio.output or not audio.output.format:
+            return None
+
+        return OpenAIRealtimeWebSocketModel._normalize_audio_format(audio.output.format)
+
+    @staticmethod
+    def _normalize_audio_format(fmt: object) -> str:
+        if isinstance(fmt, AudioPCM):
+            return "pcm16"
+        if isinstance(fmt, AudioPCMU):
+            return "g711_ulaw"
+        if isinstance(fmt, AudioPCMA):
+            return "g711_alaw"
+
+        fmt_type = OpenAIRealtimeWebSocketModel._read_format_type(fmt)
+        if isinstance(fmt_type, str) and fmt_type:
+            return fmt_type
+
+        return str(fmt)
+
+    @staticmethod
+    def _read_format_type(fmt: object) -> str | None:
+        if isinstance(fmt, str):
+            return fmt
+
+        if isinstance(fmt, Mapping):
+            type_value = fmt.get("type")
+            return type_value if isinstance(type_value, str) else None
+
+        if isinstance(fmt, pydantic.BaseModel):
+            type_value = fmt.model_dump().get("type")
+            return type_value if isinstance(type_value, str) else None
+
+        try:
+            type_value = fmt.type  # type: ignore[attr-defined]
+        except AttributeError:
+            return None
+
+        return type_value if isinstance(type_value, str) else None
+
+    @staticmethod
+    def _normalize_turn_detection_config(config: object) -> object:
+        """Normalize camelCase turn detection keys to snake_case for API compatibility."""
+        if not isinstance(config, Mapping):
+            return config
+
+        normalized = dict(config)
+        key_map = {
+            "createResponse": "create_response",
+            "interruptResponse": "interrupt_response",
+            "prefixPaddingMs": "prefix_padding_ms",
+            "silenceDurationMs": "silence_duration_ms",
+            "idleTimeoutMs": "idle_timeout_ms",
+        }
+        for camel_key, snake_key in key_map.items():
+            if camel_key in normalized and snake_key not in normalized:
+                normalized[snake_key] = normalized[camel_key]
+                normalized.pop(camel_key, None)
+
+        return normalized
 
     async def _update_session_config(self, model_settings: RealtimeSessionModelSettings) -> None:
         session_config = self._get_session_config(model_settings)
@@ -570,51 +933,138 @@ class OpenAIRealtimeWebSocketModel(RealtimeModel):
 
     def _get_session_config(
         self, model_settings: RealtimeSessionModelSettings
-    ) -> OpenAISessionObject:
+    ) -> OpenAISessionCreateRequest:
         """Get the session config."""
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
+        audio_input_args: dict[str, Any] = {}
+        audio_output_args: dict[str, Any] = {}
+
+        audio_config = model_settings.get("audio")
+        audio_config_mapping = audio_config if isinstance(audio_config, Mapping) else None
+        input_audio_config: Mapping[str, Any] = (
+            cast(Mapping[str, Any], audio_config_mapping.get("input", {}))
+            if audio_config_mapping
+            else {}
+        )
+        output_audio_config: Mapping[str, Any] = (
+            cast(Mapping[str, Any], audio_config_mapping.get("output", {}))
+            if audio_config_mapping
+            else {}
+        )
+
+        input_format_source: FormatInput = (
+            input_audio_config.get("format") if input_audio_config else None
+        )
+        if input_format_source is None:
+            if self._call_id:
+                input_format_source = model_settings.get("input_audio_format")
+            else:
+                input_format_source = model_settings.get(
+                    "input_audio_format", DEFAULT_MODEL_SETTINGS.get("input_audio_format")
+                )
+        audio_input_args["format"] = to_realtime_audio_format(input_format_source)
+
+        if "noise_reduction" in input_audio_config:
+            audio_input_args["noise_reduction"] = input_audio_config.get("noise_reduction")
+        elif "input_audio_noise_reduction" in model_settings:
+            audio_input_args["noise_reduction"] = model_settings.get("input_audio_noise_reduction")
+
+        if "transcription" in input_audio_config:
+            audio_input_args["transcription"] = input_audio_config.get("transcription")
+        elif "input_audio_transcription" in model_settings:
+            audio_input_args["transcription"] = model_settings.get("input_audio_transcription")
+        else:
+            audio_input_args["transcription"] = DEFAULT_MODEL_SETTINGS.get(
+                "input_audio_transcription"
+            )
+
+        if "turn_detection" in input_audio_config:
+            audio_input_args["turn_detection"] = self._normalize_turn_detection_config(
+                input_audio_config.get("turn_detection")
+            )
+        elif "turn_detection" in model_settings:
+            audio_input_args["turn_detection"] = self._normalize_turn_detection_config(
+                model_settings.get("turn_detection")
+            )
+        else:
+            audio_input_args["turn_detection"] = DEFAULT_MODEL_SETTINGS.get("turn_detection")
+
+        requested_voice = output_audio_config.get("voice") if output_audio_config else None
+        audio_output_args["voice"] = requested_voice or model_settings.get(
+            "voice", DEFAULT_MODEL_SETTINGS.get("voice")
+        )
+
+        output_format_source: FormatInput = (
+            output_audio_config.get("format") if output_audio_config else None
+        )
+        if output_format_source is None:
+            if self._call_id:
+                output_format_source = model_settings.get("output_audio_format")
+            else:
+                output_format_source = model_settings.get(
+                    "output_audio_format", DEFAULT_MODEL_SETTINGS.get("output_audio_format")
+                )
+        audio_output_args["format"] = to_realtime_audio_format(output_format_source)
+
+        if "speed" in output_audio_config:
+            audio_output_args["speed"] = output_audio_config.get("speed")
+        elif "speed" in model_settings:
+            audio_output_args["speed"] = model_settings.get("speed")
+
+        output_modalities = (
+            model_settings.get("output_modalities")
+            or model_settings.get("modalities")
+            or DEFAULT_MODEL_SETTINGS.get("modalities")
+        )
+
+        # Construct full session object. `type` will be excluded at serialization time for updates.
+        session_create_request = OpenAISessionCreateRequest(
+            type="realtime",
+            model=(model_settings.get("model_name") or self.model) or "gpt-realtime",
+            output_modalities=output_modalities,
+            audio=OpenAIRealtimeAudioConfig(
+                input=OpenAIRealtimeAudioInput(**audio_input_args),
+                output=OpenAIRealtimeAudioOutput(**audio_output_args),
             ),
-            tools=self._tools_to_session_tools(
-                tools=model_settings.get("tools", []), handoffs=model_settings.get("handoffs", [])
+            tools=cast(
+                Any,
+                self._tools_to_session_tools(
+                    tools=model_settings.get("tools", []),
+                    handoffs=model_settings.get("handoffs", []),
+                ),
             ),
         )
 
+        if "instructions" in model_settings:
+            session_create_request.instructions = model_settings.get("instructions")
+
+        if "prompt" in model_settings:
+            _passed_prompt: Prompt = model_settings["prompt"]
+            variables: dict[str, Any] | None = _passed_prompt.get("variables")
+            session_create_request.prompt = ResponsePrompt(
+                id=_passed_prompt["id"],
+                variables=variables,
+                version=_passed_prompt.get("version"),
+            )
+
+        if "max_output_tokens" in model_settings:
+            session_create_request.max_output_tokens = cast(
+                Any, model_settings.get("max_output_tokens")
+            )
+
+        if "tool_choice" in model_settings:
+            session_create_request.tool_choice = cast(Any, model_settings.get("tool_choice"))
+
+        return session_create_request
+
     def _tools_to_session_tools(
         self, tools: list[Tool], handoffs: list[Handoff]
-    ) -> list[OpenAISessionTool]:
-        converted_tools: list[OpenAISessionTool] = []
+    ) -> list[OpenAISessionFunction]:
+        converted_tools: list[OpenAISessionFunction] = []
         for tool in tools:
             if not isinstance(tool, FunctionTool):
                 raise UserError(f"Tool {tool.name} is unsupported. Must be a function tool.")
             converted_tools.append(
-                OpenAISessionTool(
+                OpenAISessionFunction(
                     name=tool.name,
                     description=tool.description,
                     parameters=tool.params_json_schema,
@@ -624,7 +1074,7 @@ class OpenAIRealtimeWebSocketModel(RealtimeModel):
 
         for handoff in handoffs:
             converted_tools.append(
-                OpenAISessionTool(
+                OpenAISessionFunction(
                     name=handoff.tool_name,
                     description=handoff.tool_description,
                     parameters=handoff.input_json_schema,
@@ -635,20 +1085,85 @@ class OpenAIRealtimeWebSocketModel(RealtimeModel):
         return converted_tools
 
 
+class OpenAIRealtimeSIPModel(OpenAIRealtimeWebSocketModel):
+    """Realtime model that attaches to SIP-originated calls using a call ID."""
+
+    @staticmethod
+    async def build_initial_session_payload(
+        agent: RealtimeAgent[Any],
+        *,
+        context: TContext | None = None,
+        model_config: RealtimeModelConfig | None = None,
+        run_config: RealtimeRunConfig | None = None,
+        overrides: RealtimeSessionModelSettings | None = None,
+    ) -> OpenAISessionCreateRequest:
+        """Build a session payload that mirrors what a RealtimeSession would send on connect.
+
+        This helper can be used to accept SIP-originated calls by forwarding the returned payload to
+        the Realtime Calls API without duplicating session setup logic.
+        """
+        run_config_settings = (run_config or {}).get("model_settings") or {}
+        initial_model_settings = (model_config or {}).get("initial_model_settings") or {}
+        base_settings: RealtimeSessionModelSettings = {
+            **run_config_settings,
+            **initial_model_settings,
+        }
+
+        context_wrapper = RunContextWrapper(context)
+        merged_settings = await _build_model_settings_from_agent(
+            agent=agent,
+            context_wrapper=context_wrapper,
+            base_settings=base_settings,
+            starting_settings=initial_model_settings,
+            run_config=run_config,
+        )
+
+        if overrides:
+            merged_settings.update(overrides)
+
+        model = OpenAIRealtimeWebSocketModel()
+        return model._get_session_config(merged_settings)
+
+    async def connect(self, options: RealtimeModelConfig) -> None:
+        call_id = options.get("call_id")
+        if not call_id:
+            raise UserError("OpenAIRealtimeSIPModel requires `call_id` in the model configuration.")
+
+        sip_options = options.copy()
+        await super().connect(sip_options)
+
+
 class _ConversionHelper:
     @classmethod
     def conversation_item_to_realtime_message_item(
         cls, item: ConversationItem, previous_item_id: str | None
     ) -> RealtimeMessageItem:
+        if not isinstance(
+            item,
+            (
+                RealtimeConversationItemUserMessage,
+                RealtimeConversationItemAssistantMessage,
+                RealtimeConversationItemSystemMessage,
+            ),
+        ):
+            raise ValueError("Unsupported conversation item type for message conversion.")
+        content: list[dict[str, Any]] = []
+        for each in item.content:
+            c = each.model_dump()
+            if each.type == "output_text":
+                # For backward-compatibility of assistant message items
+                c["type"] = "text"
+            elif each.type == "output_audio":
+                # For backward-compatibility of assistant message items
+                c["type"] = "audio"
+            content.append(c)
         return TypeAdapter(RealtimeMessageItem).validate_python(
             {
                 "item_id": item.id or "",
                 "previous_item_id": previous_item_id,
                 "type": item.type,
                 "role": item.role,
-                "content": (
-                    [content.model_dump() for content in item.content] if item.content else []
-                ),
+                "content": content,
                 "status": "in_progress",
             },
         )
@@ -668,12 +1183,12 @@ class _ConversionHelper:
     @classmethod
     def convert_tracing_config(
         cls, tracing_config: RealtimeModelTracingConfig | Literal["auto"] | None
-    ) -> OpenAISessionTracing | None:
+    ) -> OpenAITracingConfiguration | Literal["auto"] | None:
        if tracing_config is None:
            return None
        elif tracing_config == "auto":
            return "auto"
-        return OpenAISessionTracingConfiguration(
+        return OpenAITracingConfiguration(
            group_id=tracing_config.get("group_id"),
            metadata=tracing_config.get("metadata"),
            workflow_name=tracing_config.get("workflow_name"),
@@ -686,22 +1201,53 @@ class _ConversionHelper:
         user_input = event.user_input
 
         if isinstance(user_input, dict):
-            return OpenAIConversationItem(
+            content: list[Content] = []
+            for item in user_input.get("content", []):
+                try:
+                    if not isinstance(item, dict):
+                        continue
+                    t = item.get("type")
+                    if t == "input_text":
+                        _txt = item.get("text")
+                        text_val = _txt if isinstance(_txt, str) else None
+                        content.append(Content(type="input_text", text=text_val))
+                    elif t == "input_image":
+                        iu = item.get("image_url")
+                        if isinstance(iu, str) and iu:
+                            d = item.get("detail")
+                            detail_val = cast(
+                                Literal["auto", "low", "high"] | None,
+                                d if isinstance(d, str) and d in ("auto", "low", "high") else None,
+                            )
+                            if detail_val is None:
+                                content.append(
+                                    Content(
+                                        type="input_image",
+                                        image_url=iu,
+                                    )
+                                )
+                            else:
+                                content.append(
+                                    Content(
+                                        type="input_image",
+                                        image_url=iu,
+                                        detail=detail_val,
+                                    )
+                                )
+                    # ignore unknown types for forward-compat
+                except Exception:
+                    # best-effort; skip malformed parts
+                    continue
+            return RealtimeConversationItemUserMessage(
                 type="message",
                 role="user",
-                content=[
-                    OpenAIConversationItemContent(
-                        type="input_text",
-                        text=item.get("text"),
-                    )
-                    for item in user_input.get("content", [])
-                ],
+                content=content,
             )
         else:
-            return OpenAIConversationItem(
+            return RealtimeConversationItemUserMessage(
                 type="message",
                 role="user",
-                content=[OpenAIConversationItemContent(type="input_text", text=user_input)],
+                content=[Content(type="input_text", text=user_input)],
             )
 
     @classmethod
@@ -727,7 +1273,7 @@ class _ConversionHelper:
     def convert_tool_output(cls, event: RealtimeModelSendToolOutput) -> OpenAIRealtimeClientEvent:
         return OpenAIConversationItemCreateEvent(
             type="conversation.item.create",
-            item=OpenAIConversationItem(
+            item=RealtimeConversationItemFunctionCallOutput(
                 type="function_call_output",
                 output=event.output,
                 call_id=event.tool_call.call_id,
```