openai-agents 0.2.10__py3-none-any.whl → 0.3.0__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Potentially problematic release: this version of openai-agents has been flagged as potentially problematic by the registry.
- agents/_debug.py +15 -4
- agents/_run_impl.py +34 -37
- agents/extensions/models/litellm_model.py +20 -5
- agents/memory/__init__.py +2 -0
- agents/memory/openai_conversations_session.py +0 -3
- agents/memory/util.py +20 -0
- agents/models/openai_chatcompletions.py +17 -2
- agents/models/openai_responses.py +17 -4
- agents/realtime/_util.py +1 -1
- agents/realtime/agent.py +7 -0
- agents/realtime/audio_formats.py +29 -0
- agents/realtime/config.py +22 -4
- agents/realtime/items.py +17 -1
- agents/realtime/model.py +6 -0
- agents/realtime/model_inputs.py +15 -1
- agents/realtime/openai_realtime.py +428 -139
- agents/realtime/session.py +167 -14
- agents/run.py +102 -54
- agents/tool.py +2 -2
- agents/util/_json.py +19 -1
- agents/voice/input.py +5 -4
- agents/voice/models/openai_stt.py +6 -4
- {openai_agents-0.2.10.dist-info → openai_agents-0.3.0.dist-info}/METADATA +2 -2
- {openai_agents-0.2.10.dist-info → openai_agents-0.3.0.dist-info}/RECORD +26 -24
- {openai_agents-0.2.10.dist-info → openai_agents-0.3.0.dist-info}/WHEEL +0 -0
- {openai_agents-0.2.10.dist-info → openai_agents-0.3.0.dist-info}/licenses/LICENSE +0 -0
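
The hunks below are from agents/realtime/openai_realtime.py, by far the largest change in this release: imports move from the beta Realtime types (openai.types.beta.realtime.*) to the GA types (openai.types.realtime.*), the default model becomes "gpt-realtime", and session configuration is now built as a RealtimeSessionCreateRequest sent in a session.update event. As orientation before reading the diff, here is a minimal sketch of the new-style session update using only names that appear in the hunks below; the field values are illustrative, not the SDK's defaults:

from openai.types.realtime.realtime_session_create_request import (
    RealtimeSessionCreateRequest,
)
from openai.types.realtime.session_update_event import SessionUpdateEvent

# Illustrative values; the SDK derives the real ones from RealtimeSessionModelSettings.
session = RealtimeSessionCreateRequest(
    type="realtime",
    model="gpt-realtime",
    output_modalities=["audio"],
)
event = SessionUpdateEvent(type="session.update", session=session)
# Client events are serialized with exclude_none/exclude_unset before being sent.
payload = event.model_dump_json(exclude_none=True, exclude_unset=True)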
@@ -5,59 +5,87 @@ import base64
 import inspect
 import json
 import os
+from collections.abc import Mapping
 from datetime import datetime
-from typing import Annotated, Any, Callable, Literal, Union
+from typing import Annotated, Any, Callable, Literal, Union, cast

 import pydantic
 import websockets
-from openai.types.
+from openai.types.realtime import realtime_audio_config as _rt_audio_config
+from openai.types.realtime.conversation_item import (
     ConversationItem,
     ConversationItem as OpenAIConversationItem,
 )
-from openai.types.
-    ConversationItemContent as OpenAIConversationItemContent,
-)
-from openai.types.beta.realtime.conversation_item_create_event import (
+from openai.types.realtime.conversation_item_create_event import (
     ConversationItemCreateEvent as OpenAIConversationItemCreateEvent,
 )
-from openai.types.
+from openai.types.realtime.conversation_item_retrieve_event import (
     ConversationItemRetrieveEvent as OpenAIConversationItemRetrieveEvent,
 )
-from openai.types.
+from openai.types.realtime.conversation_item_truncate_event import (
     ConversationItemTruncateEvent as OpenAIConversationItemTruncateEvent,
 )
-from openai.types.
+from openai.types.realtime.input_audio_buffer_append_event import (
     InputAudioBufferAppendEvent as OpenAIInputAudioBufferAppendEvent,
 )
-from openai.types.
+from openai.types.realtime.input_audio_buffer_commit_event import (
     InputAudioBufferCommitEvent as OpenAIInputAudioBufferCommitEvent,
 )
-from openai.types.
+from openai.types.realtime.realtime_audio_formats import (
+    AudioPCM,
+    AudioPCMA,
+    AudioPCMU,
+)
+from openai.types.realtime.realtime_client_event import (
     RealtimeClientEvent as OpenAIRealtimeClientEvent,
 )
-from openai.types.
+from openai.types.realtime.realtime_conversation_item_assistant_message import (
+    RealtimeConversationItemAssistantMessage,
+)
+from openai.types.realtime.realtime_conversation_item_function_call_output import (
+    RealtimeConversationItemFunctionCallOutput,
+)
+from openai.types.realtime.realtime_conversation_item_system_message import (
+    RealtimeConversationItemSystemMessage,
+)
+from openai.types.realtime.realtime_conversation_item_user_message import (
+    Content,
+    RealtimeConversationItemUserMessage,
+)
+from openai.types.realtime.realtime_function_tool import (
+    RealtimeFunctionTool as OpenAISessionFunction,
+)
+from openai.types.realtime.realtime_server_event import (
     RealtimeServerEvent as OpenAIRealtimeServerEvent,
 )
-from openai.types.
-
+from openai.types.realtime.realtime_session_create_request import (
+    RealtimeSessionCreateRequest as OpenAISessionCreateRequest,
+)
+from openai.types.realtime.realtime_tracing_config import (
+    TracingConfiguration as OpenAITracingConfiguration,
+)
+from openai.types.realtime.realtime_transcription_session_create_request import (
+    RealtimeTranscriptionSessionCreateRequest as OpenAIRealtimeTranscriptionSessionCreateRequest,
+)
+from openai.types.realtime.response_audio_delta_event import ResponseAudioDeltaEvent
+from openai.types.realtime.response_cancel_event import (
     ResponseCancelEvent as OpenAIResponseCancelEvent,
 )
-from openai.types.
+from openai.types.realtime.response_create_event import (
     ResponseCreateEvent as OpenAIResponseCreateEvent,
 )
-from openai.types.
-    Session as OpenAISessionObject,
-    SessionTool as OpenAISessionTool,
-    SessionTracing as OpenAISessionTracing,
-    SessionTracingTracingConfiguration as OpenAISessionTracingConfiguration,
+from openai.types.realtime.session_update_event import (
     SessionUpdateEvent as OpenAISessionUpdateEvent,
 )
-from
+from openai.types.responses.response_prompt import ResponsePrompt
+from pydantic import Field, TypeAdapter
 from typing_extensions import assert_never
 from websockets.asyncio.client import ClientConnection

 from agents.handoffs import Handoff
+from agents.prompts import Prompt
 from agents.realtime._default_tracker import ModelAudioTracker
+from agents.realtime.audio_formats import to_realtime_audio_format
 from agents.tool import FunctionTool, Tool
 from agents.util._types import MaybeAwaitable

@@ -103,17 +131,23 @@ from .model_inputs import (
     RealtimeModelSendUserInput,
 )

+# Avoid direct imports of non-exported names by referencing via module
+OpenAIRealtimeAudioConfig = _rt_audio_config.RealtimeAudioConfig
+OpenAIRealtimeAudioInput = _rt_audio_config.RealtimeAudioConfigInput  # type: ignore[attr-defined]
+OpenAIRealtimeAudioOutput = _rt_audio_config.RealtimeAudioConfigOutput  # type: ignore[attr-defined]
+
+
 _USER_AGENT = f"Agents/Python {__version__}"

 DEFAULT_MODEL_SETTINGS: RealtimeSessionModelSettings = {
     "voice": "ash",
-    "modalities": ["
+    "modalities": ["audio"],
     "input_audio_format": "pcm16",
     "output_audio_format": "pcm16",
     "input_audio_transcription": {
         "model": "gpt-4o-mini-transcribe",
     },
-    "turn_detection": {"type": "semantic_vad"},
+    "turn_detection": {"type": "semantic_vad", "interrupt_response": True},
 }

@@ -129,19 +163,8 @@ async def get_api_key(key: str | Callable[[], MaybeAwaitable[str]] | None) -> st
     return os.getenv("OPENAI_API_KEY")


-class _InputAudioBufferTimeoutTriggeredEvent(BaseModel):
-    type: Literal["input_audio_buffer.timeout_triggered"]
-    event_id: str
-    audio_start_ms: int
-    audio_end_ms: int
-    item_id: str
-
-
 AllRealtimeServerEvents = Annotated[
-    Union[
-        OpenAIRealtimeServerEvent,
-        _InputAudioBufferTimeoutTriggeredEvent,
-    ],
+    Union[OpenAIRealtimeServerEvent,],
     Field(discriminator="type"),
 ]

@@ -155,11 +178,16 @@ def get_server_event_type_adapter() -> TypeAdapter[AllRealtimeServerEvents]:
     return ServerEventTypeAdapter


+# Note: Avoid a module-level union alias for Python 3.9 compatibility.
+# Using a union at runtime (e.g., A | B) in a type alias triggers evaluation
+# during import on 3.9. We instead inline the union in annotations below.
+
+
 class OpenAIRealtimeWebSocketModel(RealtimeModel):
     """A model that uses OpenAI's WebSocket API."""

     def __init__(self) -> None:
-        self.model = "gpt-
+        self.model = "gpt-realtime"  # Default model
         self._websocket: ClientConnection | None = None
         self._websocket_task: asyncio.Task[None] | None = None
         self._listeners: list[RealtimeModelListener] = []
@@ -168,7 +196,7 @@ class OpenAIRealtimeWebSocketModel(RealtimeModel):
         self._ongoing_response: bool = False
         self._tracing_config: RealtimeModelTracingConfig | Literal["auto"] | None = None
         self._playback_tracker: RealtimePlaybackTracker | None = None
-        self._created_session:
+        self._created_session: OpenAISessionCreateRequest | None = None
         self._server_event_type_adapter = get_server_event_type_adapter()

     async def connect(self, options: RealtimeModelConfig) -> None:
@@ -188,15 +216,18 @@ class OpenAIRealtimeWebSocketModel(RealtimeModel):
         else:
             self._tracing_config = "auto"

-        if not api_key:
-            raise UserError("API key is required but was not provided.")
-
         url = options.get("url", f"wss://api.openai.com/v1/realtime?model={self.model}")

-        headers = {
-
-
-
+        headers: dict[str, str] = {}
+        if options.get("headers") is not None:
+            # For customizing request headers
+            headers.update(options["headers"])
+        else:
+            # OpenAI's Realtime API
+            if not api_key:
+                raise UserError("API key is required but was not provided.")
+
+            headers.update({"Authorization": f"Bearer {api_key}"})
         self._websocket = await websockets.connect(
             url,
             user_agent_header=_USER_AGENT,
@@ -214,7 +245,11 @@ class OpenAIRealtimeWebSocketModel(RealtimeModel):
         converted_tracing_config = _ConversionHelper.convert_tracing_config(tracing_config)
         await self._send_raw_message(
             OpenAISessionUpdateEvent(
-                session=
+                session=OpenAISessionCreateRequest(
+                    model=self.model,
+                    type="realtime",
+                    tracing=converted_tracing_config,
+                ),
                 type="session.update",
             )
         )
@@ -296,8 +331,8 @@ class OpenAIRealtimeWebSocketModel(RealtimeModel):
     async def _send_raw_message(self, event: OpenAIRealtimeClientEvent) -> None:
         """Send a raw message to the model."""
         assert self._websocket is not None, "Not connected"
-
-        await self._websocket.send(
+        payload = event.model_dump_json(exclude_none=True, exclude_unset=True)
+        await self._websocket.send(payload)

     async def _send_user_input(self, event: RealtimeModelSendUserInput) -> None:
         converted = _ConversionHelper.convert_user_input_to_item_create(event)
@@ -390,10 +425,13 @@ class OpenAIRealtimeWebSocketModel(RealtimeModel):
             f"content index: {current_item_content_index}"
         )

+        session = self._created_session
         automatic_response_cancellation_enabled = (
-
-            and
-            and
+            session
+            and session.audio is not None
+            and session.audio.input is not None
+            and session.audio.input.turn_detection is not None
+            and session.audio.input.turn_detection.interrupt_response is True,
         )
         if not automatic_response_cancellation_enabled:
             await self._cancel_response()
@@ -487,42 +525,103 @@ class OpenAIRealtimeWebSocketModel(RealtimeModel):

     async def _handle_ws_event(self, event: dict[str, Any]):
         await self._emit_event(RealtimeModelRawServerEvent(data=event))
+        # The public interface definedo on this Agents SDK side (e.g., RealtimeMessageItem)
+        # must be the same even after the GA migration, so this part does the conversion
+        if isinstance(event, dict) and event.get("type") in (
+            "response.output_item.added",
+            "response.output_item.done",
+        ):
+            item = event.get("item")
+            if isinstance(item, dict) and item.get("type") == "message":
+                raw_content = item.get("content") or []
+                converted_content: list[dict[str, Any]] = []
+                for part in raw_content:
+                    if not isinstance(part, dict):
+                        continue
+                    if part.get("type") == "audio":
+                        converted_content.append(
+                            {
+                                "type": "audio",
+                                "audio": part.get("audio"),
+                                "transcript": part.get("transcript"),
+                            }
+                        )
+                    elif part.get("type") == "text":
+                        converted_content.append({"type": "text", "text": part.get("text")})
+                status = item.get("status")
+                if status not in ("in_progress", "completed", "incomplete"):
+                    is_done = event.get("type") == "response.output_item.done"
+                    status = "completed" if is_done else "in_progress"
+                # Explicitly type the adapter for mypy
+                type_adapter: TypeAdapter[RealtimeMessageItem] = TypeAdapter(RealtimeMessageItem)
+                message_item: RealtimeMessageItem = type_adapter.validate_python(
+                    {
+                        "item_id": item.get("id", ""),
+                        "type": "message",
+                        "role": item.get("role", "assistant"),
+                        "content": converted_content,
+                        "status": status,
+                    }
+                )
+                await self._emit_event(RealtimeModelItemUpdatedEvent(item=message_item))
+                return
+
         try:
             if "previous_item_id" in event and event["previous_item_id"] is None:
                 event["previous_item_id"] = ""  # TODO (rm) remove
-            parsed: AllRealtimeServerEvents = self._server_event_type_adapter.validate_python(
-                event
-            )
+            parsed: AllRealtimeServerEvents = self._server_event_type_adapter.validate_python(event)
         except pydantic.ValidationError as e:
             logger.error(f"Failed to validate server event: {event}", exc_info=True)
-            await self._emit_event(
-                RealtimeModelErrorEvent(
-                    error=e,
-                )
-            )
+            await self._emit_event(RealtimeModelErrorEvent(error=e))
             return
         except Exception as e:
             event_type = event.get("type", "unknown") if isinstance(event, dict) else "unknown"
             logger.error(f"Failed to validate server event: {event}", exc_info=True)
-
-
-
-                context=f"Failed to validate server event: {event_type}",
-            )
+            exception_event = RealtimeModelExceptionEvent(
+                exception=e,
+                context=f"Failed to validate server event: {event_type}",
             )
+            await self._emit_event(exception_event)
             return

-        if parsed.type == "response.
+        if parsed.type == "response.output_audio.delta":
             await self._handle_audio_delta(parsed)
-        elif parsed.type == "response.
-
-
-
-                content_index=parsed.content_index,
-            )
+        elif parsed.type == "response.output_audio.done":
+            audio_done_event = RealtimeModelAudioDoneEvent(
+                item_id=parsed.item_id,
+                content_index=parsed.content_index,
             )
+            await self._emit_event(audio_done_event)
         elif parsed.type == "input_audio_buffer.speech_started":
-
+            # On VAD speech start, immediately stop local playback so the user can
+            # barge‑in without overlapping assistant audio.
+            last_audio = self._audio_state_tracker.get_last_audio_item()
+            if last_audio is not None:
+                item_id, content_index = last_audio
+                await self._emit_event(
+                    RealtimeModelAudioInterruptedEvent(item_id=item_id, content_index=content_index)
+                )
+
+                # Reset trackers so subsequent playback state queries don't
+                # reference audio that has been interrupted client‑side.
+                self._audio_state_tracker.on_interrupted()
+                if self._playback_tracker:
+                    self._playback_tracker.on_interrupted()
+
+            # If server isn't configured to auto‑interrupt/cancel, cancel the
+            # response to prevent further audio.
+            session = self._created_session
+            automatic_response_cancellation_enabled = (
+                session
+                and session.audio is not None
+                and session.audio.input is not None
+                and session.audio.input.turn_detection is not None
+                and session.audio.input.turn_detection.interrupt_response is True,
+            )
+            if not automatic_response_cancellation_enabled:
+                await self._cancel_response()
+            # Avoid sending conversation.item.truncate here; when GA is set to
+            # interrupt on VAD start, the server will handle truncation.
         elif parsed.type == "response.created":
             self._ongoing_response = True
             await self._emit_event(RealtimeModelTurnStartedEvent())
@@ -531,15 +630,16 @@ class OpenAIRealtimeWebSocketModel(RealtimeModel):
             await self._emit_event(RealtimeModelTurnEndedEvent())
         elif parsed.type == "session.created":
             await self._send_tracing_config(self._tracing_config)
-            self._update_created_session(parsed.session)
+            self._update_created_session(parsed.session)
         elif parsed.type == "session.updated":
-            self._update_created_session(parsed.session)
+            self._update_created_session(parsed.session)
         elif parsed.type == "error":
             await self._emit_event(RealtimeModelErrorEvent(error=parsed.error))
         elif parsed.type == "conversation.item.deleted":
             await self._emit_event(RealtimeModelItemDeletedEvent(item_id=parsed.item_id))
         elif (
-            parsed.type == "conversation.item.
+            parsed.type == "conversation.item.added"
+            or parsed.type == "conversation.item.created"
             or parsed.type == "conversation.item.retrieved"
         ):
             previous_item_id = (
@@ -564,7 +664,7 @@ class OpenAIRealtimeWebSocketModel(RealtimeModel):
                     item_id=parsed.item_id, transcript=parsed.transcript
                 )
             )
-        elif parsed.type == "response.
+        elif parsed.type == "response.output_audio_transcript.delta":
             await self._emit_event(
                 RealtimeModelTranscriptDeltaEvent(
                     item_id=parsed.item_id, delta=parsed.delta, response_id=parsed.response_id
@@ -572,7 +672,7 @@ class OpenAIRealtimeWebSocketModel(RealtimeModel):
             )
         elif (
             parsed.type == "conversation.item.input_audio_transcription.delta"
-            or parsed.type == "response.
+            or parsed.type == "response.output_text.delta"
             or parsed.type == "response.function_call_arguments.delta"
         ):
             # No support for partials yet
@@ -583,18 +683,115 @@ class OpenAIRealtimeWebSocketModel(RealtimeModel):
         ):
             await self._handle_output_item(parsed.item)
         elif parsed.type == "input_audio_buffer.timeout_triggered":
-            await self._emit_event(
-
-
-
-
+            await self._emit_event(
+                RealtimeModelInputAudioTimeoutTriggeredEvent(
+                    item_id=parsed.item_id,
+                    audio_start_ms=parsed.audio_start_ms,
+                    audio_end_ms=parsed.audio_end_ms,
+                )
+            )
+
+    def _update_created_session(
+        self,
+        session: OpenAISessionCreateRequest
+        | OpenAIRealtimeTranscriptionSessionCreateRequest
+        | Mapping[str, object]
+        | pydantic.BaseModel,
+    ) -> None:
+        # Only store/playback-format information for realtime sessions (not transcription-only)
+        normalized_session = self._normalize_session_payload(session)
+        if not normalized_session:
+            return
+
+        self._created_session = normalized_session
+        normalized_format = self._extract_audio_format(normalized_session)
+        if normalized_format is None:
+            return
+
+        self._audio_state_tracker.set_audio_format(normalized_format)
+        if self._playback_tracker:
+            self._playback_tracker.set_audio_format(normalized_format)
+
+    @staticmethod
+    def _normalize_session_payload(
+        session: OpenAISessionCreateRequest
+        | OpenAIRealtimeTranscriptionSessionCreateRequest
+        | Mapping[str, object]
+        | pydantic.BaseModel,
+    ) -> OpenAISessionCreateRequest | None:
+        if isinstance(session, OpenAISessionCreateRequest):
+            return session
+
+        if isinstance(session, OpenAIRealtimeTranscriptionSessionCreateRequest):
+            return None
+
+        session_payload: Mapping[str, object]
+        if isinstance(session, pydantic.BaseModel):
+            session_payload = cast(Mapping[str, object], session.model_dump())
+        elif isinstance(session, Mapping):
+            session_payload = session
+        else:
+            return None
+
+        if OpenAIRealtimeWebSocketModel._is_transcription_session(session_payload):
+            return None

-
-
-
-
-
-
+        try:
+            return OpenAISessionCreateRequest.model_validate(session_payload)
+        except pydantic.ValidationError:
+            return None
+
+    @staticmethod
+    def _is_transcription_session(payload: Mapping[str, object]) -> bool:
+        try:
+            OpenAIRealtimeTranscriptionSessionCreateRequest.model_validate(payload)
+        except pydantic.ValidationError:
+            return False
+        else:
+            return True
+
+    @staticmethod
+    def _extract_audio_format(session: OpenAISessionCreateRequest) -> str | None:
+        audio = session.audio
+        if not audio or not audio.output or not audio.output.format:
+            return None
+
+        return OpenAIRealtimeWebSocketModel._normalize_audio_format(audio.output.format)
+
+    @staticmethod
+    def _normalize_audio_format(fmt: object) -> str:
+        if isinstance(fmt, AudioPCM):
+            return "pcm16"
+        if isinstance(fmt, AudioPCMU):
+            return "g711_ulaw"
+        if isinstance(fmt, AudioPCMA):
+            return "g711_alaw"
+
+        fmt_type = OpenAIRealtimeWebSocketModel._read_format_type(fmt)
+        if isinstance(fmt_type, str) and fmt_type:
+            return fmt_type
+
+        return str(fmt)
+
+    @staticmethod
+    def _read_format_type(fmt: object) -> str | None:
+        if isinstance(fmt, str):
+            return fmt
+
+        if isinstance(fmt, Mapping):
+            type_value = fmt.get("type")
+            return type_value if isinstance(type_value, str) else None
+
+        if isinstance(fmt, pydantic.BaseModel):
+            type_value = fmt.model_dump().get("type")
+            return type_value if isinstance(type_value, str) else None
+
+        try:
+            type_value = fmt.type  # type: ignore[attr-defined]
+        except AttributeError:
+            return None
+
+        return type_value if isinstance(type_value, str) else None

     async def _update_session_config(self, model_settings: RealtimeSessionModelSettings) -> None:
         session_config = self._get_session_config(model_settings)
@@ -604,51 +801,95 @@ class OpenAIRealtimeWebSocketModel(RealtimeModel):

     def _get_session_config(
         self, model_settings: RealtimeSessionModelSettings
-    ) ->
+    ) -> OpenAISessionCreateRequest:
         """Get the session config."""
-
-
-
-
-
-
-
-
-
-
-
-
-            ),
-
-
-
-            ),
-
-
-
-            ),
-
-
-
-
-
-
-
-
-
-
+        model_name = (model_settings.get("model_name") or self.model) or "gpt-realtime"
+
+        voice = model_settings.get("voice", DEFAULT_MODEL_SETTINGS.get("voice"))
+        speed = model_settings.get("speed")
+        modalities = model_settings.get("modalities", DEFAULT_MODEL_SETTINGS.get("modalities"))
+
+        input_audio_format = model_settings.get(
+            "input_audio_format",
+            DEFAULT_MODEL_SETTINGS.get("input_audio_format"),
+        )
+        input_audio_transcription = model_settings.get(
+            "input_audio_transcription",
+            DEFAULT_MODEL_SETTINGS.get("input_audio_transcription"),
+        )
+        turn_detection = model_settings.get(
+            "turn_detection",
+            DEFAULT_MODEL_SETTINGS.get("turn_detection"),
+        )
+        output_audio_format = model_settings.get(
+            "output_audio_format",
+            DEFAULT_MODEL_SETTINGS.get("output_audio_format"),
+        )
+
+        input_audio_config = None
+        if any(
+            value is not None
+            for value in [input_audio_format, input_audio_transcription, turn_detection]
+        ):
+            input_audio_config = OpenAIRealtimeAudioInput(
+                format=to_realtime_audio_format(input_audio_format),
+                transcription=cast(Any, input_audio_transcription),
+                turn_detection=cast(Any, turn_detection),
+            )
+
+        output_audio_config = None
+        if any(value is not None for value in [output_audio_format, speed, voice]):
+            output_audio_config = OpenAIRealtimeAudioOutput(
+                format=to_realtime_audio_format(output_audio_format),
+                speed=speed,
+                voice=voice,
+            )
+
+        audio_config = None
+        if input_audio_config or output_audio_config:
+            audio_config = OpenAIRealtimeAudioConfig(
+                input=input_audio_config,
+                output=output_audio_config,
+            )
+
+        prompt: ResponsePrompt | None = None
+        if model_settings.get("prompt") is not None:
+            _passed_prompt: Prompt = model_settings["prompt"]
+            variables: dict[str, Any] | None = _passed_prompt.get("variables")
+            prompt = ResponsePrompt(
+                id=_passed_prompt["id"],
+                variables=variables,
+                version=_passed_prompt.get("version"),
+            )
+
+        # Construct full session object. `type` will be excluded at serialization time for updates.
+        return OpenAISessionCreateRequest(
+            model=model_name,
+            type="realtime",
+            instructions=model_settings.get("instructions"),
+            prompt=prompt,
+            output_modalities=modalities,
+            audio=audio_config,
+            max_output_tokens=cast(Any, model_settings.get("max_output_tokens")),
+            tool_choice=cast(Any, model_settings.get("tool_choice")),
+            tools=cast(
+                Any,
+                self._tools_to_session_tools(
+                    tools=model_settings.get("tools", []),
+                    handoffs=model_settings.get("handoffs", []),
+                ),
             ),
         )

     def _tools_to_session_tools(
         self, tools: list[Tool], handoffs: list[Handoff]
-    ) -> list[
-        converted_tools: list[
+    ) -> list[OpenAISessionFunction]:
+        converted_tools: list[OpenAISessionFunction] = []
         for tool in tools:
             if not isinstance(tool, FunctionTool):
                 raise UserError(f"Tool {tool.name} is unsupported. Must be a function tool.")
             converted_tools.append(
-
+                OpenAISessionFunction(
                     name=tool.name,
                     description=tool.description,
                     parameters=tool.params_json_schema,
@@ -658,7 +899,7 @@ class OpenAIRealtimeWebSocketModel(RealtimeModel):

         for handoff in handoffs:
             converted_tools.append(
-
+                OpenAISessionFunction(
                     name=handoff.tool_name,
                     description=handoff.tool_description,
                     parameters=handoff.input_json_schema,
@@ -674,15 +915,32 @@ class _ConversionHelper:
     def conversation_item_to_realtime_message_item(
         cls, item: ConversationItem, previous_item_id: str | None
     ) -> RealtimeMessageItem:
+        if not isinstance(
+            item,
+            (
+                RealtimeConversationItemUserMessage,
+                RealtimeConversationItemAssistantMessage,
+                RealtimeConversationItemSystemMessage,
+            ),
+        ):
+            raise ValueError("Unsupported conversation item type for message conversion.")
+        content: list[dict[str, Any]] = []
+        for each in item.content:
+            c = each.model_dump()
+            if each.type == "output_text":
+                # For backward-compatibility of assistant message items
+                c["type"] = "text"
+            elif each.type == "output_audio":
+                # For backward-compatibility of assistant message items
+                c["type"] = "audio"
+            content.append(c)
         return TypeAdapter(RealtimeMessageItem).validate_python(
             {
                 "item_id": item.id or "",
                 "previous_item_id": previous_item_id,
                 "type": item.type,
                 "role": item.role,
-                "content":
-                    [content.model_dump() for content in item.content] if item.content else []
-                ),
+                "content": content,
                 "status": "in_progress",
             },
         )
@@ -702,12 +960,12 @@ class _ConversionHelper:
     @classmethod
     def convert_tracing_config(
         cls, tracing_config: RealtimeModelTracingConfig | Literal["auto"] | None
-    ) ->
+    ) -> OpenAITracingConfiguration | Literal["auto"] | None:
         if tracing_config is None:
             return None
         elif tracing_config == "auto":
             return "auto"
-        return
+        return OpenAITracingConfiguration(
             group_id=tracing_config.get("group_id"),
             metadata=tracing_config.get("metadata"),
             workflow_name=tracing_config.get("workflow_name"),
@@ -720,22 +978,53 @@ class _ConversionHelper:
         user_input = event.user_input

         if isinstance(user_input, dict):
-
+            content: list[Content] = []
+            for item in user_input.get("content", []):
+                try:
+                    if not isinstance(item, dict):
+                        continue
+                    t = item.get("type")
+                    if t == "input_text":
+                        _txt = item.get("text")
+                        text_val = _txt if isinstance(_txt, str) else None
+                        content.append(Content(type="input_text", text=text_val))
+                    elif t == "input_image":
+                        iu = item.get("image_url")
+                        if isinstance(iu, str) and iu:
+                            d = item.get("detail")
+                            detail_val = cast(
+                                Literal["auto", "low", "high"] | None,
+                                d if isinstance(d, str) and d in ("auto", "low", "high") else None,
+                            )
+                            if detail_val is None:
+                                content.append(
+                                    Content(
+                                        type="input_image",
+                                        image_url=iu,
+                                    )
+                                )
+                            else:
+                                content.append(
+                                    Content(
+                                        type="input_image",
+                                        image_url=iu,
+                                        detail=detail_val,
+                                    )
+                                )
+                    # ignore unknown types for forward-compat
+                except Exception:
+                    # best-effort; skip malformed parts
+                    continue
+            return RealtimeConversationItemUserMessage(
                 type="message",
                 role="user",
-                content=
-                    OpenAIConversationItemContent(
-                        type="input_text",
-                        text=item.get("text"),
-                    )
-                    for item in user_input.get("content", [])
-                ],
+                content=content,
             )
         else:
-            return
+            return RealtimeConversationItemUserMessage(
                 type="message",
                 role="user",
-                content=[
+                content=[Content(type="input_text", text=user_input)],
             )

     @classmethod
@@ -761,7 +1050,7 @@ class _ConversionHelper:
     def convert_tool_output(cls, event: RealtimeModelSendToolOutput) -> OpenAIRealtimeClientEvent:
         return OpenAIConversationItemCreateEvent(
             type="conversation.item.create",
-            item=
+            item=RealtimeConversationItemFunctionCallOutput(
                 type="function_call_output",
                 output=event.output,
                 call_id=event.tool_call.call_id,