openai-agents 0.2.11__py3-none-any.whl → 0.3.1__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Note: this release of openai-agents has been flagged as potentially problematic.

Files changed (40)
  1. agents/_debug.py +15 -4
  2. agents/_run_impl.py +34 -37
  3. agents/agent.py +18 -2
  4. agents/extensions/handoff_filters.py +2 -0
  5. agents/extensions/memory/__init__.py +42 -15
  6. agents/extensions/memory/encrypt_session.py +185 -0
  7. agents/extensions/models/litellm_model.py +62 -10
  8. agents/function_schema.py +45 -3
  9. agents/memory/__init__.py +2 -0
  10. agents/memory/openai_conversations_session.py +0 -3
  11. agents/memory/util.py +20 -0
  12. agents/models/chatcmpl_converter.py +74 -15
  13. agents/models/chatcmpl_helpers.py +6 -0
  14. agents/models/chatcmpl_stream_handler.py +29 -1
  15. agents/models/openai_chatcompletions.py +26 -4
  16. agents/models/openai_responses.py +30 -4
  17. agents/realtime/__init__.py +2 -0
  18. agents/realtime/_util.py +1 -1
  19. agents/realtime/agent.py +7 -0
  20. agents/realtime/audio_formats.py +29 -0
  21. agents/realtime/config.py +32 -4
  22. agents/realtime/items.py +17 -1
  23. agents/realtime/model_events.py +2 -0
  24. agents/realtime/model_inputs.py +15 -1
  25. agents/realtime/openai_realtime.py +421 -130
  26. agents/realtime/session.py +167 -14
  27. agents/result.py +47 -20
  28. agents/run.py +191 -106
  29. agents/tool.py +1 -1
  30. agents/tracing/processor_interface.py +84 -11
  31. agents/tracing/spans.py +88 -0
  32. agents/tracing/traces.py +99 -16
  33. agents/util/_json.py +19 -1
  34. agents/util/_transforms.py +12 -2
  35. agents/voice/input.py +5 -4
  36. agents/voice/models/openai_stt.py +15 -8
  37. {openai_agents-0.2.11.dist-info → openai_agents-0.3.1.dist-info}/METADATA +4 -2
  38. {openai_agents-0.2.11.dist-info → openai_agents-0.3.1.dist-info}/RECORD +40 -37
  39. {openai_agents-0.2.11.dist-info → openai_agents-0.3.1.dist-info}/WHEEL +0 -0
  40. {openai_agents-0.2.11.dist-info → openai_agents-0.3.1.dist-info}/licenses/LICENSE +0 -0
agents/realtime/openai_realtime.py

@@ -5,59 +5,87 @@ import base64
  import inspect
  import json
  import os
+ from collections.abc import Mapping
  from datetime import datetime
- from typing import Annotated, Any, Callable, Literal, Union
+ from typing import Annotated, Any, Callable, Literal, Union, cast

  import pydantic
  import websockets
- from openai.types.beta.realtime.conversation_item import (
+ from openai.types.realtime import realtime_audio_config as _rt_audio_config
+ from openai.types.realtime.conversation_item import (
      ConversationItem,
      ConversationItem as OpenAIConversationItem,
  )
- from openai.types.beta.realtime.conversation_item_content import (
-     ConversationItemContent as OpenAIConversationItemContent,
- )
- from openai.types.beta.realtime.conversation_item_create_event import (
+ from openai.types.realtime.conversation_item_create_event import (
      ConversationItemCreateEvent as OpenAIConversationItemCreateEvent,
  )
- from openai.types.beta.realtime.conversation_item_retrieve_event import (
+ from openai.types.realtime.conversation_item_retrieve_event import (
      ConversationItemRetrieveEvent as OpenAIConversationItemRetrieveEvent,
  )
- from openai.types.beta.realtime.conversation_item_truncate_event import (
+ from openai.types.realtime.conversation_item_truncate_event import (
      ConversationItemTruncateEvent as OpenAIConversationItemTruncateEvent,
  )
- from openai.types.beta.realtime.input_audio_buffer_append_event import (
+ from openai.types.realtime.input_audio_buffer_append_event import (
      InputAudioBufferAppendEvent as OpenAIInputAudioBufferAppendEvent,
  )
- from openai.types.beta.realtime.input_audio_buffer_commit_event import (
+ from openai.types.realtime.input_audio_buffer_commit_event import (
      InputAudioBufferCommitEvent as OpenAIInputAudioBufferCommitEvent,
  )
- from openai.types.beta.realtime.realtime_client_event import (
+ from openai.types.realtime.realtime_audio_formats import (
+     AudioPCM,
+     AudioPCMA,
+     AudioPCMU,
+ )
+ from openai.types.realtime.realtime_client_event import (
      RealtimeClientEvent as OpenAIRealtimeClientEvent,
  )
- from openai.types.beta.realtime.realtime_server_event import (
+ from openai.types.realtime.realtime_conversation_item_assistant_message import (
+     RealtimeConversationItemAssistantMessage,
+ )
+ from openai.types.realtime.realtime_conversation_item_function_call_output import (
+     RealtimeConversationItemFunctionCallOutput,
+ )
+ from openai.types.realtime.realtime_conversation_item_system_message import (
+     RealtimeConversationItemSystemMessage,
+ )
+ from openai.types.realtime.realtime_conversation_item_user_message import (
+     Content,
+     RealtimeConversationItemUserMessage,
+ )
+ from openai.types.realtime.realtime_function_tool import (
+     RealtimeFunctionTool as OpenAISessionFunction,
+ )
+ from openai.types.realtime.realtime_server_event import (
      RealtimeServerEvent as OpenAIRealtimeServerEvent,
  )
- from openai.types.beta.realtime.response_audio_delta_event import ResponseAudioDeltaEvent
- from openai.types.beta.realtime.response_cancel_event import (
+ from openai.types.realtime.realtime_session_create_request import (
+     RealtimeSessionCreateRequest as OpenAISessionCreateRequest,
+ )
+ from openai.types.realtime.realtime_tracing_config import (
+     TracingConfiguration as OpenAITracingConfiguration,
+ )
+ from openai.types.realtime.realtime_transcription_session_create_request import (
+     RealtimeTranscriptionSessionCreateRequest as OpenAIRealtimeTranscriptionSessionCreateRequest,
+ )
+ from openai.types.realtime.response_audio_delta_event import ResponseAudioDeltaEvent
+ from openai.types.realtime.response_cancel_event import (
      ResponseCancelEvent as OpenAIResponseCancelEvent,
  )
- from openai.types.beta.realtime.response_create_event import (
+ from openai.types.realtime.response_create_event import (
      ResponseCreateEvent as OpenAIResponseCreateEvent,
  )
- from openai.types.beta.realtime.session_update_event import (
-     Session as OpenAISessionObject,
-     SessionTool as OpenAISessionTool,
-     SessionTracing as OpenAISessionTracing,
-     SessionTracingTracingConfiguration as OpenAISessionTracingConfiguration,
+ from openai.types.realtime.session_update_event import (
      SessionUpdateEvent as OpenAISessionUpdateEvent,
  )
- from pydantic import BaseModel, Field, TypeAdapter
+ from openai.types.responses.response_prompt import ResponsePrompt
+ from pydantic import Field, TypeAdapter
  from typing_extensions import assert_never
  from websockets.asyncio.client import ClientConnection

  from agents.handoffs import Handoff
+ from agents.prompts import Prompt
  from agents.realtime._default_tracker import ModelAudioTracker
+ from agents.realtime.audio_formats import to_realtime_audio_format
  from agents.tool import FunctionTool, Tool
  from agents.util._types import MaybeAwaitable

@@ -103,17 +131,23 @@ from .model_inputs import (
      RealtimeModelSendUserInput,
  )

+ # Avoid direct imports of non-exported names by referencing via module
+ OpenAIRealtimeAudioConfig = _rt_audio_config.RealtimeAudioConfig
+ OpenAIRealtimeAudioInput = _rt_audio_config.RealtimeAudioConfigInput  # type: ignore[attr-defined]
+ OpenAIRealtimeAudioOutput = _rt_audio_config.RealtimeAudioConfigOutput  # type: ignore[attr-defined]
+
+
  _USER_AGENT = f"Agents/Python {__version__}"

  DEFAULT_MODEL_SETTINGS: RealtimeSessionModelSettings = {
      "voice": "ash",
-     "modalities": ["text", "audio"],
+     "modalities": ["audio"],
      "input_audio_format": "pcm16",
      "output_audio_format": "pcm16",
      "input_audio_transcription": {
          "model": "gpt-4o-mini-transcribe",
      },
-     "turn_detection": {"type": "semantic_vad"},
+     "turn_detection": {"type": "semantic_vad", "interrupt_response": True},
  }

@@ -129,19 +163,8 @@ async def get_api_key(key: str | Callable[[], MaybeAwaitable[str]] | None) -> st
      return os.getenv("OPENAI_API_KEY")


- class _InputAudioBufferTimeoutTriggeredEvent(BaseModel):
-     type: Literal["input_audio_buffer.timeout_triggered"]
-     event_id: str
-     audio_start_ms: int
-     audio_end_ms: int
-     item_id: str
-
-
  AllRealtimeServerEvents = Annotated[
-     Union[
-         OpenAIRealtimeServerEvent,
-         _InputAudioBufferTimeoutTriggeredEvent,
-     ],
+     Union[OpenAIRealtimeServerEvent,],
      Field(discriminator="type"),
  ]

@@ -155,11 +178,16 @@ def get_server_event_type_adapter() -> TypeAdapter[AllRealtimeServerEvents]:
      return ServerEventTypeAdapter


+ # Note: Avoid a module-level union alias for Python 3.9 compatibility.
+ # Using a union at runtime (e.g., A | B) in a type alias triggers evaluation
+ # during import on 3.9. We instead inline the union in annotations below.
+
+
  class OpenAIRealtimeWebSocketModel(RealtimeModel):
      """A model that uses OpenAI's WebSocket API."""

      def __init__(self) -> None:
-         self.model = "gpt-4o-realtime-preview"  # Default model
+         self.model = "gpt-realtime"  # Default model
          self._websocket: ClientConnection | None = None
          self._websocket_task: asyncio.Task[None] | None = None
          self._listeners: list[RealtimeModelListener] = []
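
An aside on the Python 3.9 note in the hunk above: here is a minimal sketch of the failure it avoids, using two stand-in classes (the names are illustrative, not from the SDK):

    from typing import Union

    class RealtimeRequest: ...
    class TranscriptionRequest: ...

    # On Python 3.9, a module-level PEP 604 alias fails at import time, because
    # type.__or__ only exists from Python 3.10 onward:
    #   SessionRequest = RealtimeRequest | TranscriptionRequest  # TypeError on 3.9

    # These forms stay importable on 3.9: typing.Union at module level, or keeping
    # the "X | Y" union inside annotations so it is never evaluated at import time
    # (the latter is the approach this diff takes).
    SessionRequest = Union[RealtimeRequest, TranscriptionRequest]

    def handle(session: "RealtimeRequest | TranscriptionRequest") -> None:
        ...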
@@ -168,7 +196,7 @@ class OpenAIRealtimeWebSocketModel(RealtimeModel):
          self._ongoing_response: bool = False
          self._tracing_config: RealtimeModelTracingConfig | Literal["auto"] | None = None
          self._playback_tracker: RealtimePlaybackTracker | None = None
-         self._created_session: OpenAISessionObject | None = None
+         self._created_session: OpenAISessionCreateRequest | None = None
          self._server_event_type_adapter = get_server_event_type_adapter()

      async def connect(self, options: RealtimeModelConfig) -> None:
@@ -199,12 +227,7 @@ class OpenAIRealtimeWebSocketModel(RealtimeModel):
          if not api_key:
              raise UserError("API key is required but was not provided.")

-         headers.update(
-             {
-                 "Authorization": f"Bearer {api_key}",
-                 "OpenAI-Beta": "realtime=v1",
-             }
-         )
+         headers.update({"Authorization": f"Bearer {api_key}"})
          self._websocket = await websockets.connect(
              url,
              user_agent_header=_USER_AGENT,
@@ -222,7 +245,11 @@ class OpenAIRealtimeWebSocketModel(RealtimeModel):
          converted_tracing_config = _ConversionHelper.convert_tracing_config(tracing_config)
          await self._send_raw_message(
              OpenAISessionUpdateEvent(
-                 session=OpenAISessionObject(tracing=converted_tracing_config),
+                 session=OpenAISessionCreateRequest(
+                     model=self.model,
+                     type="realtime",
+                     tracing=converted_tracing_config,
+                 ),
                  type="session.update",
              )
          )
@@ -304,8 +331,8 @@ class OpenAIRealtimeWebSocketModel(RealtimeModel):
      async def _send_raw_message(self, event: OpenAIRealtimeClientEvent) -> None:
          """Send a raw message to the model."""
          assert self._websocket is not None, "Not connected"
-
-         await self._websocket.send(event.model_dump_json(exclude_none=True, exclude_unset=True))
+         payload = event.model_dump_json(exclude_none=True, exclude_unset=True)
+         await self._websocket.send(payload)

      async def _send_user_input(self, event: RealtimeModelSendUserInput) -> None:
          converted = _ConversionHelper.convert_user_input_to_item_create(event)
@@ -398,10 +425,13 @@ class OpenAIRealtimeWebSocketModel(RealtimeModel):
                  f"content index: {current_item_content_index}"
              )

+         session = self._created_session
          automatic_response_cancellation_enabled = (
-             self._created_session
-             and self._created_session.turn_detection
-             and self._created_session.turn_detection.interrupt_response
+             session
+             and session.audio is not None
+             and session.audio.input is not None
+             and session.audio.input.turn_detection is not None
+             and session.audio.input.turn_detection.interrupt_response is True,
          )
          if not automatic_response_cancellation_enabled:
              await self._cancel_response()
@@ -495,40 +525,103 @@ class OpenAIRealtimeWebSocketModel(RealtimeModel):

      async def _handle_ws_event(self, event: dict[str, Any]):
          await self._emit_event(RealtimeModelRawServerEvent(data=event))
+         # The public interface defined on this Agents SDK side (e.g., RealtimeMessageItem)
+         # must stay the same even after the GA migration, so this part does the conversion.
+         if isinstance(event, dict) and event.get("type") in (
+             "response.output_item.added",
+             "response.output_item.done",
+         ):
+             item = event.get("item")
+             if isinstance(item, dict) and item.get("type") == "message":
+                 raw_content = item.get("content") or []
+                 converted_content: list[dict[str, Any]] = []
+                 for part in raw_content:
+                     if not isinstance(part, dict):
+                         continue
+                     if part.get("type") == "audio":
+                         converted_content.append(
+                             {
+                                 "type": "audio",
+                                 "audio": part.get("audio"),
+                                 "transcript": part.get("transcript"),
+                             }
+                         )
+                     elif part.get("type") == "text":
+                         converted_content.append({"type": "text", "text": part.get("text")})
+                 status = item.get("status")
+                 if status not in ("in_progress", "completed", "incomplete"):
+                     is_done = event.get("type") == "response.output_item.done"
+                     status = "completed" if is_done else "in_progress"
+                 # Explicitly type the adapter for mypy
+                 type_adapter: TypeAdapter[RealtimeMessageItem] = TypeAdapter(RealtimeMessageItem)
+                 message_item: RealtimeMessageItem = type_adapter.validate_python(
+                     {
+                         "item_id": item.get("id", ""),
+                         "type": "message",
+                         "role": item.get("role", "assistant"),
+                         "content": converted_content,
+                         "status": status,
+                     }
+                 )
+                 await self._emit_event(RealtimeModelItemUpdatedEvent(item=message_item))
+                 return
+
          try:
              if "previous_item_id" in event and event["previous_item_id"] is None:
                  event["previous_item_id"] = ""  # TODO (rm) remove
              parsed: AllRealtimeServerEvents = self._server_event_type_adapter.validate_python(event)
          except pydantic.ValidationError as e:
              logger.error(f"Failed to validate server event: {event}", exc_info=True)
-             await self._emit_event(
-                 RealtimeModelErrorEvent(
-                     error=e,
-                 )
-             )
+             await self._emit_event(RealtimeModelErrorEvent(error=e))
              return
          except Exception as e:
              event_type = event.get("type", "unknown") if isinstance(event, dict) else "unknown"
              logger.error(f"Failed to validate server event: {event}", exc_info=True)
-             await self._emit_event(
-                 RealtimeModelExceptionEvent(
-                     exception=e,
-                     context=f"Failed to validate server event: {event_type}",
-                 )
+             exception_event = RealtimeModelExceptionEvent(
+                 exception=e,
+                 context=f"Failed to validate server event: {event_type}",
              )
+             await self._emit_event(exception_event)
              return

-         if parsed.type == "response.audio.delta":
+         if parsed.type == "response.output_audio.delta":
              await self._handle_audio_delta(parsed)
-         elif parsed.type == "response.audio.done":
-             await self._emit_event(
-                 RealtimeModelAudioDoneEvent(
-                     item_id=parsed.item_id,
-                     content_index=parsed.content_index,
-                 )
+         elif parsed.type == "response.output_audio.done":
+             audio_done_event = RealtimeModelAudioDoneEvent(
+                 item_id=parsed.item_id,
+                 content_index=parsed.content_index,
              )
+             await self._emit_event(audio_done_event)
          elif parsed.type == "input_audio_buffer.speech_started":
-             await self._send_interrupt(RealtimeModelSendInterrupt())
+             # On VAD speech start, immediately stop local playback so the user can
+             # barge in without overlapping assistant audio.
+             last_audio = self._audio_state_tracker.get_last_audio_item()
+             if last_audio is not None:
+                 item_id, content_index = last_audio
+                 await self._emit_event(
+                     RealtimeModelAudioInterruptedEvent(item_id=item_id, content_index=content_index)
+                 )
+
+                 # Reset trackers so subsequent playback state queries don't
+                 # reference audio that has been interrupted client-side.
+                 self._audio_state_tracker.on_interrupted()
+                 if self._playback_tracker:
+                     self._playback_tracker.on_interrupted()
+
+             # If the server isn't configured to auto-interrupt/cancel, cancel the
+             # response to prevent further audio.
+             session = self._created_session
+             automatic_response_cancellation_enabled = (
+                 session
+                 and session.audio is not None
+                 and session.audio.input is not None
+                 and session.audio.input.turn_detection is not None
+                 and session.audio.input.turn_detection.interrupt_response is True,
+             )
+             if not automatic_response_cancellation_enabled:
+                 await self._cancel_response()
+             # Avoid sending conversation.item.truncate here; when GA is set to
+             # interrupt on VAD start, the server will handle truncation.
          elif parsed.type == "response.created":
              self._ongoing_response = True
              await self._emit_event(RealtimeModelTurnStartedEvent())
@@ -537,15 +630,16 @@ class OpenAIRealtimeWebSocketModel(RealtimeModel):
              await self._emit_event(RealtimeModelTurnEndedEvent())
          elif parsed.type == "session.created":
              await self._send_tracing_config(self._tracing_config)
-             self._update_created_session(parsed.session)  # type: ignore
+             self._update_created_session(parsed.session)
          elif parsed.type == "session.updated":
-             self._update_created_session(parsed.session)  # type: ignore
+             self._update_created_session(parsed.session)
          elif parsed.type == "error":
              await self._emit_event(RealtimeModelErrorEvent(error=parsed.error))
          elif parsed.type == "conversation.item.deleted":
              await self._emit_event(RealtimeModelItemDeletedEvent(item_id=parsed.item_id))
          elif (
-             parsed.type == "conversation.item.created"
+             parsed.type == "conversation.item.added"
+             or parsed.type == "conversation.item.created"
              or parsed.type == "conversation.item.retrieved"
          ):
              previous_item_id = (
@@ -570,7 +664,7 @@ class OpenAIRealtimeWebSocketModel(RealtimeModel):
                      item_id=parsed.item_id, transcript=parsed.transcript
                  )
              )
-         elif parsed.type == "response.audio_transcript.delta":
+         elif parsed.type == "response.output_audio_transcript.delta":
              await self._emit_event(
                  RealtimeModelTranscriptDeltaEvent(
                      item_id=parsed.item_id, delta=parsed.delta, response_id=parsed.response_id
@@ -578,7 +672,7 @@ class OpenAIRealtimeWebSocketModel(RealtimeModel):
              )
          elif (
              parsed.type == "conversation.item.input_audio_transcription.delta"
-             or parsed.type == "response.text.delta"
+             or parsed.type == "response.output_text.delta"
              or parsed.type == "response.function_call_arguments.delta"
          ):
              # No support for partials yet
@@ -597,12 +691,107 @@ class OpenAIRealtimeWebSocketModel(RealtimeModel):
              )
          )

-     def _update_created_session(self, session: OpenAISessionObject) -> None:
-         self._created_session = session
-         if session.output_audio_format:
-             self._audio_state_tracker.set_audio_format(session.output_audio_format)
-             if self._playback_tracker:
-                 self._playback_tracker.set_audio_format(session.output_audio_format)
+     def _update_created_session(
+         self,
+         session: OpenAISessionCreateRequest
+         | OpenAIRealtimeTranscriptionSessionCreateRequest
+         | Mapping[str, object]
+         | pydantic.BaseModel,
+     ) -> None:
+         # Only store/playback-format information for realtime sessions (not transcription-only)
+         normalized_session = self._normalize_session_payload(session)
+         if not normalized_session:
+             return
+
+         self._created_session = normalized_session
+         normalized_format = self._extract_audio_format(normalized_session)
+         if normalized_format is None:
+             return
+
+         self._audio_state_tracker.set_audio_format(normalized_format)
+         if self._playback_tracker:
+             self._playback_tracker.set_audio_format(normalized_format)
+
+     @staticmethod
+     def _normalize_session_payload(
+         session: OpenAISessionCreateRequest
+         | OpenAIRealtimeTranscriptionSessionCreateRequest
+         | Mapping[str, object]
+         | pydantic.BaseModel,
+     ) -> OpenAISessionCreateRequest | None:
+         if isinstance(session, OpenAISessionCreateRequest):
+             return session
+
+         if isinstance(session, OpenAIRealtimeTranscriptionSessionCreateRequest):
+             return None
+
+         session_payload: Mapping[str, object]
+         if isinstance(session, pydantic.BaseModel):
+             session_payload = cast(Mapping[str, object], session.model_dump())
+         elif isinstance(session, Mapping):
+             session_payload = session
+         else:
+             return None
+
+         if OpenAIRealtimeWebSocketModel._is_transcription_session(session_payload):
+             return None
+
+         try:
+             return OpenAISessionCreateRequest.model_validate(session_payload)
+         except pydantic.ValidationError:
+             return None
+
+     @staticmethod
+     def _is_transcription_session(payload: Mapping[str, object]) -> bool:
+         try:
+             OpenAIRealtimeTranscriptionSessionCreateRequest.model_validate(payload)
+         except pydantic.ValidationError:
+             return False
+         else:
+             return True
+
+     @staticmethod
+     def _extract_audio_format(session: OpenAISessionCreateRequest) -> str | None:
+         audio = session.audio
+         if not audio or not audio.output or not audio.output.format:
+             return None
+
+         return OpenAIRealtimeWebSocketModel._normalize_audio_format(audio.output.format)
+
+     @staticmethod
+     def _normalize_audio_format(fmt: object) -> str:
+         if isinstance(fmt, AudioPCM):
+             return "pcm16"
+         if isinstance(fmt, AudioPCMU):
+             return "g711_ulaw"
+         if isinstance(fmt, AudioPCMA):
+             return "g711_alaw"
+
+         fmt_type = OpenAIRealtimeWebSocketModel._read_format_type(fmt)
+         if isinstance(fmt_type, str) and fmt_type:
+             return fmt_type
+
+         return str(fmt)
+
+     @staticmethod
+     def _read_format_type(fmt: object) -> str | None:
+         if isinstance(fmt, str):
+             return fmt
+
+         if isinstance(fmt, Mapping):
+             type_value = fmt.get("type")
+             return type_value if isinstance(type_value, str) else None
+
+         if isinstance(fmt, pydantic.BaseModel):
+             type_value = fmt.model_dump().get("type")
+             return type_value if isinstance(type_value, str) else None
+
+         try:
+             type_value = fmt.type  # type: ignore[attr-defined]
+         except AttributeError:
+             return None
+
+         return type_value if isinstance(type_value, str) else None

      async def _update_session_config(self, model_settings: RealtimeSessionModelSettings) -> None:
          session_config = self._get_session_config(model_settings)
@@ -612,51 +801,105 @@ class OpenAIRealtimeWebSocketModel(RealtimeModel):

      def _get_session_config(
          self, model_settings: RealtimeSessionModelSettings
-     ) -> OpenAISessionObject:
+     ) -> OpenAISessionCreateRequest:
          """Get the session config."""
-         return OpenAISessionObject(
-             instructions=model_settings.get("instructions", None),
-             model=(
-                 model_settings.get("model_name", self.model)  # type: ignore
-                 or DEFAULT_MODEL_SETTINGS.get("model_name")
-             ),
-             voice=model_settings.get("voice", DEFAULT_MODEL_SETTINGS.get("voice")),
-             speed=model_settings.get("speed", None),
-             modalities=model_settings.get("modalities", DEFAULT_MODEL_SETTINGS.get("modalities")),
-             input_audio_format=model_settings.get(
-                 "input_audio_format",
-                 DEFAULT_MODEL_SETTINGS.get("input_audio_format"),  # type: ignore
-             ),
-             output_audio_format=model_settings.get(
-                 "output_audio_format",
-                 DEFAULT_MODEL_SETTINGS.get("output_audio_format"),  # type: ignore
-             ),
-             input_audio_transcription=model_settings.get(
-                 "input_audio_transcription",
-                 DEFAULT_MODEL_SETTINGS.get("input_audio_transcription"),  # type: ignore
-             ),
-             turn_detection=model_settings.get(
-                 "turn_detection",
-                 DEFAULT_MODEL_SETTINGS.get("turn_detection"),  # type: ignore
-             ),
-             tool_choice=model_settings.get(
-                 "tool_choice",
-                 DEFAULT_MODEL_SETTINGS.get("tool_choice"),  # type: ignore
-             ),
-             tools=self._tools_to_session_tools(
-                 tools=model_settings.get("tools", []), handoffs=model_settings.get("handoffs", [])
+         model_name = (model_settings.get("model_name") or self.model) or "gpt-realtime"
+
+         voice = model_settings.get("voice", DEFAULT_MODEL_SETTINGS.get("voice"))
+         speed = model_settings.get("speed")
+         modalities = model_settings.get("modalities", DEFAULT_MODEL_SETTINGS.get("modalities"))
+
+         input_audio_format = model_settings.get(
+             "input_audio_format",
+             DEFAULT_MODEL_SETTINGS.get("input_audio_format"),
+         )
+         input_audio_transcription = model_settings.get(
+             "input_audio_transcription",
+             DEFAULT_MODEL_SETTINGS.get("input_audio_transcription"),
+         )
+         turn_detection = model_settings.get(
+             "turn_detection",
+             DEFAULT_MODEL_SETTINGS.get("turn_detection"),
+         )
+         output_audio_format = model_settings.get(
+             "output_audio_format",
+             DEFAULT_MODEL_SETTINGS.get("output_audio_format"),
+         )
+         input_audio_noise_reduction = model_settings.get(
+             "input_audio_noise_reduction",
+             DEFAULT_MODEL_SETTINGS.get("input_audio_noise_reduction"),
+         )
+
+         input_audio_config = None
+         if any(
+             value is not None
+             for value in [
+                 input_audio_format,
+                 input_audio_noise_reduction,
+                 input_audio_transcription,
+                 turn_detection,
+             ]
+         ):
+             input_audio_config = OpenAIRealtimeAudioInput(
+                 format=to_realtime_audio_format(input_audio_format),
+                 noise_reduction=cast(Any, input_audio_noise_reduction),
+                 transcription=cast(Any, input_audio_transcription),
+                 turn_detection=cast(Any, turn_detection),
+             )
+
+         output_audio_config = None
+         if any(value is not None for value in [output_audio_format, speed, voice]):
+             output_audio_config = OpenAIRealtimeAudioOutput(
+                 format=to_realtime_audio_format(output_audio_format),
+                 speed=speed,
+                 voice=voice,
+             )
+
+         audio_config = None
+         if input_audio_config or output_audio_config:
+             audio_config = OpenAIRealtimeAudioConfig(
+                 input=input_audio_config,
+                 output=output_audio_config,
+             )
+
+         prompt: ResponsePrompt | None = None
+         if model_settings.get("prompt") is not None:
+             _passed_prompt: Prompt = model_settings["prompt"]
+             variables: dict[str, Any] | None = _passed_prompt.get("variables")
+             prompt = ResponsePrompt(
+                 id=_passed_prompt["id"],
+                 variables=variables,
+                 version=_passed_prompt.get("version"),
+             )
+
+         # Construct full session object. `type` will be excluded at serialization time for updates.
+         return OpenAISessionCreateRequest(
+             model=model_name,
+             type="realtime",
+             instructions=model_settings.get("instructions"),
+             prompt=prompt,
+             output_modalities=modalities,
+             audio=audio_config,
+             max_output_tokens=cast(Any, model_settings.get("max_output_tokens")),
+             tool_choice=cast(Any, model_settings.get("tool_choice")),
+             tools=cast(
+                 Any,
+                 self._tools_to_session_tools(
+                     tools=model_settings.get("tools", []),
+                     handoffs=model_settings.get("handoffs", []),
+                 ),
              ),
          )

      def _tools_to_session_tools(
          self, tools: list[Tool], handoffs: list[Handoff]
-     ) -> list[OpenAISessionTool]:
-         converted_tools: list[OpenAISessionTool] = []
+     ) -> list[OpenAISessionFunction]:
+         converted_tools: list[OpenAISessionFunction] = []
          for tool in tools:
              if not isinstance(tool, FunctionTool):
                  raise UserError(f"Tool {tool.name} is unsupported. Must be a function tool.")
              converted_tools.append(
-                 OpenAISessionTool(
+                 OpenAISessionFunction(
                      name=tool.name,
                      description=tool.description,
                      parameters=tool.params_json_schema,
@@ -666,7 +909,7 @@ class OpenAIRealtimeWebSocketModel(RealtimeModel):

          for handoff in handoffs:
              converted_tools.append(
-                 OpenAISessionTool(
+                 OpenAISessionFunction(
                      name=handoff.tool_name,
                      description=handoff.tool_description,
                      parameters=handoff.input_json_schema,
@@ -682,15 +925,32 @@ class _ConversionHelper:
      def conversation_item_to_realtime_message_item(
          cls, item: ConversationItem, previous_item_id: str | None
      ) -> RealtimeMessageItem:
+         if not isinstance(
+             item,
+             (
+                 RealtimeConversationItemUserMessage,
+                 RealtimeConversationItemAssistantMessage,
+                 RealtimeConversationItemSystemMessage,
+             ),
+         ):
+             raise ValueError("Unsupported conversation item type for message conversion.")
+         content: list[dict[str, Any]] = []
+         for each in item.content:
+             c = each.model_dump()
+             if each.type == "output_text":
+                 # For backward-compatibility of assistant message items
+                 c["type"] = "text"
+             elif each.type == "output_audio":
+                 # For backward-compatibility of assistant message items
+                 c["type"] = "audio"
+             content.append(c)
          return TypeAdapter(RealtimeMessageItem).validate_python(
              {
                  "item_id": item.id or "",
                  "previous_item_id": previous_item_id,
                  "type": item.type,
                  "role": item.role,
-                 "content": (
-                     [content.model_dump() for content in item.content] if item.content else []
-                 ),
+                 "content": content,
                  "status": "in_progress",
              },
          )
@@ -710,12 +970,12 @@ class _ConversionHelper:
      @classmethod
      def convert_tracing_config(
          cls, tracing_config: RealtimeModelTracingConfig | Literal["auto"] | None
-     ) -> OpenAISessionTracing | None:
+     ) -> OpenAITracingConfiguration | Literal["auto"] | None:
          if tracing_config is None:
              return None
          elif tracing_config == "auto":
              return "auto"
-         return OpenAISessionTracingConfiguration(
+         return OpenAITracingConfiguration(
              group_id=tracing_config.get("group_id"),
              metadata=tracing_config.get("metadata"),
              workflow_name=tracing_config.get("workflow_name"),
@@ -728,22 +988,53 @@ class _ConversionHelper:
          user_input = event.user_input

          if isinstance(user_input, dict):
-             return OpenAIConversationItem(
+             content: list[Content] = []
+             for item in user_input.get("content", []):
+                 try:
+                     if not isinstance(item, dict):
+                         continue
+                     t = item.get("type")
+                     if t == "input_text":
+                         _txt = item.get("text")
+                         text_val = _txt if isinstance(_txt, str) else None
+                         content.append(Content(type="input_text", text=text_val))
+                     elif t == "input_image":
+                         iu = item.get("image_url")
+                         if isinstance(iu, str) and iu:
+                             d = item.get("detail")
+                             detail_val = cast(
+                                 Literal["auto", "low", "high"] | None,
+                                 d if isinstance(d, str) and d in ("auto", "low", "high") else None,
+                             )
+                             if detail_val is None:
+                                 content.append(
+                                     Content(
+                                         type="input_image",
+                                         image_url=iu,
+                                     )
+                                 )
+                             else:
+                                 content.append(
+                                     Content(
+                                         type="input_image",
+                                         image_url=iu,
+                                         detail=detail_val,
+                                     )
+                                 )
+                     # ignore unknown types for forward-compat
+                 except Exception:
+                     # best-effort; skip malformed parts
+                     continue
+             return RealtimeConversationItemUserMessage(
                  type="message",
                  role="user",
-                 content=[
-                     OpenAIConversationItemContent(
-                         type="input_text",
-                         text=item.get("text"),
-                     )
-                     for item in user_input.get("content", [])
-                 ],
+                 content=content,
              )
          else:
-             return OpenAIConversationItem(
+             return RealtimeConversationItemUserMessage(
                  type="message",
                  role="user",
-                 content=[OpenAIConversationItemContent(type="input_text", text=user_input)],
+                 content=[Content(type="input_text", text=user_input)],
              )

      @classmethod
@@ -769,7 +1060,7 @@ class _ConversionHelper:
      def convert_tool_output(cls, event: RealtimeModelSendToolOutput) -> OpenAIRealtimeClientEvent:
          return OpenAIConversationItemCreateEvent(
              type="conversation.item.create",
-             item=OpenAIConversationItem(
+             item=RealtimeConversationItemFunctionCallOutput(
                  type="function_call_output",
                  output=event.output,
                  call_id=event.tool_call.call_id,
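
Taken together, the hunks above move this file from the beta openai.types.beta.realtime session types to the GA openai.types.realtime request types and change the default model to gpt-realtime. A minimal sketch of the new-style session.update construction, assembled only from calls that appear in this diff (the instruction string is an illustrative placeholder):

    from openai.types.realtime.realtime_session_create_request import (
        RealtimeSessionCreateRequest,
    )
    from openai.types.realtime.session_update_event import SessionUpdateEvent

    # Build the GA-style session.update event shown in the hunks above.
    event = SessionUpdateEvent(
        type="session.update",
        session=RealtimeSessionCreateRequest(
            type="realtime",
            model="gpt-realtime",
            instructions="You are a helpful voice agent.",  # illustrative value
        ),
    )

    # The sender in this file serializes with exclude_none/exclude_unset, so any
    # fields left unset are omitted from the wire payload.
    payload = event.model_dump_json(exclude_none=True, exclude_unset=True)
    print(payload)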