openai-agents 0.2.11__py3-none-any.whl → 0.3.1__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Files changed (40)
  1. agents/_debug.py +15 -4
  2. agents/_run_impl.py +34 -37
  3. agents/agent.py +18 -2
  4. agents/extensions/handoff_filters.py +2 -0
  5. agents/extensions/memory/__init__.py +42 -15
  6. agents/extensions/memory/encrypt_session.py +185 -0
  7. agents/extensions/models/litellm_model.py +62 -10
  8. agents/function_schema.py +45 -3
  9. agents/memory/__init__.py +2 -0
  10. agents/memory/openai_conversations_session.py +0 -3
  11. agents/memory/util.py +20 -0
  12. agents/models/chatcmpl_converter.py +74 -15
  13. agents/models/chatcmpl_helpers.py +6 -0
  14. agents/models/chatcmpl_stream_handler.py +29 -1
  15. agents/models/openai_chatcompletions.py +26 -4
  16. agents/models/openai_responses.py +30 -4
  17. agents/realtime/__init__.py +2 -0
  18. agents/realtime/_util.py +1 -1
  19. agents/realtime/agent.py +7 -0
  20. agents/realtime/audio_formats.py +29 -0
  21. agents/realtime/config.py +32 -4
  22. agents/realtime/items.py +17 -1
  23. agents/realtime/model_events.py +2 -0
  24. agents/realtime/model_inputs.py +15 -1
  25. agents/realtime/openai_realtime.py +421 -130
  26. agents/realtime/session.py +167 -14
  27. agents/result.py +47 -20
  28. agents/run.py +191 -106
  29. agents/tool.py +1 -1
  30. agents/tracing/processor_interface.py +84 -11
  31. agents/tracing/spans.py +88 -0
  32. agents/tracing/traces.py +99 -16
  33. agents/util/_json.py +19 -1
  34. agents/util/_transforms.py +12 -2
  35. agents/voice/input.py +5 -4
  36. agents/voice/models/openai_stt.py +15 -8
  37. {openai_agents-0.2.11.dist-info → openai_agents-0.3.1.dist-info}/METADATA +4 -2
  38. {openai_agents-0.2.11.dist-info → openai_agents-0.3.1.dist-info}/RECORD +40 -37
  39. {openai_agents-0.2.11.dist-info → openai_agents-0.3.1.dist-info}/WHEEL +0 -0
  40. {openai_agents-0.2.11.dist-info → openai_agents-0.3.1.dist-info}/licenses/LICENSE +0 -0
agents/memory/util.py ADDED
@@ -0,0 +1,20 @@
+ from __future__ import annotations
+
+ from typing import Callable
+
+ from ..items import TResponseInputItem
+ from ..util._types import MaybeAwaitable
+
+ SessionInputCallback = Callable[
+     [list[TResponseInputItem], list[TResponseInputItem]],
+     MaybeAwaitable[list[TResponseInputItem]],
+ ]
+ """A function that combines session history with new input items.
+
+ Args:
+     history_items: The list of items from the session history.
+     new_items: The list of new input items for the current turn.
+
+ Returns:
+     A list of combined items to be used as input for the agent. Can be sync or async.
+ """
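The new `SessionInputCallback` alias only fixes a call signature; any sync or async callable of that shape can be plugged in. A minimal sketch of such a callback (the trimming behaviour and the `trim_history` name are illustrative, not part of the package):

```python
from agents.items import TResponseInputItem


def trim_history(
    history_items: list[TResponseInputItem],
    new_items: list[TResponseInputItem],
) -> list[TResponseInputItem]:
    # Keep only the last 10 history items before appending this turn's input.
    return history_items[-10:] + new_items
```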
agents/models/chatcmpl_converter.py CHANGED
@@ -39,7 +39,7 @@ from openai.types.responses import (
      ResponseReasoningItemParam,
  )
  from openai.types.responses.response_input_param import FunctionCallOutput, ItemReference, Message
- from openai.types.responses.response_reasoning_item import Summary
+ from openai.types.responses.response_reasoning_item import Content, Summary
 
  from ..agent_output import AgentOutputSchemaBase
  from ..exceptions import AgentsException, UserError
@@ -93,16 +93,41 @@ class Converter:
      def message_to_output_items(cls, message: ChatCompletionMessage) -> list[TResponseOutputItem]:
          items: list[TResponseOutputItem] = []
 
-         # Handle reasoning content if available
+         # Check if message is agents.extentions.models.litellm_model.InternalChatCompletionMessage
+         # We can't actually import it here because litellm is an optional dependency
+         # So we use hasattr to check for reasoning_content and thinking_blocks
          if hasattr(message, "reasoning_content") and message.reasoning_content:
-             items.append(
-                 ResponseReasoningItem(
-                     id=FAKE_RESPONSES_ID,
-                     summary=[Summary(text=message.reasoning_content, type="summary_text")],
-                     type="reasoning",
-                 )
+             reasoning_item = ResponseReasoningItem(
+                 id=FAKE_RESPONSES_ID,
+                 summary=[Summary(text=message.reasoning_content, type="summary_text")],
+                 type="reasoning",
              )
 
+             # Store thinking blocks for Anthropic compatibility
+             if hasattr(message, "thinking_blocks") and message.thinking_blocks:
+                 # Store thinking text in content and signature in encrypted_content
+                 reasoning_item.content = []
+                 signature = None
+                 for block in message.thinking_blocks:
+                     if isinstance(block, dict):
+                         thinking_text = block.get("thinking", "")
+                         if thinking_text:
+                             reasoning_item.content.append(
+                                 Content(text=thinking_text, type="reasoning_text")
+                             )
+                         # Store the signature if present
+                         if block.get("signature"):
+                             signature = block.get("signature")
+
+                 # Store only the last signature in encrypted_content
+                 # If there are multiple thinking blocks, this should be a problem.
+                 # In practice, there should only be one signature for the entire reasoning step.
+                 # Tested with: claude-sonnet-4-20250514
+                 if signature:
+                     reasoning_item.encrypted_content = signature
+
+             items.append(reasoning_item)
+
          message_item = ResponseOutputMessage(
              id=FAKE_RESPONSES_ID,
              content=[],
@@ -272,9 +297,7 @@ class Converter:
                          f"Only file_data is supported for input_file {casted_file_param}"
                      )
                  if "filename" not in casted_file_param or not casted_file_param["filename"]:
-                     raise UserError(
-                         f"filename must be provided for input_file {casted_file_param}"
-                     )
+                     raise UserError(f"filename must be provided for input_file {casted_file_param}")
                  out.append(
                      File(
                          type="file",
@@ -292,10 +315,18 @@ class Converter:
      def items_to_messages(
          cls,
          items: str | Iterable[TResponseInputItem],
+         preserve_thinking_blocks: bool = False,
      ) -> list[ChatCompletionMessageParam]:
          """
          Convert a sequence of 'Item' objects into a list of ChatCompletionMessageParam.
 
+         Args:
+             items: A string or iterable of response input items to convert
+             preserve_thinking_blocks: Whether to preserve thinking blocks in tool calls
+                 for reasoning models like Claude 4 Sonnet/Opus which support interleaved
+                 thinking. When True, thinking blocks are reconstructed and included in
+                 assistant messages with tool calls.
+
          Rules:
          - EasyInputMessage or InputMessage (role=user) => ChatCompletionUserMessageParam
          - EasyInputMessage or InputMessage (role=system) => ChatCompletionSystemMessageParam
@@ -316,6 +347,7 @@ class Converter:
 
          result: list[ChatCompletionMessageParam] = []
          current_assistant_msg: ChatCompletionAssistantMessageParam | None = None
+         pending_thinking_blocks: list[dict[str, str]] | None = None
 
          def flush_assistant_message() -> None:
              nonlocal current_assistant_msg
@@ -327,10 +359,11 @@ class Converter:
              current_assistant_msg = None
 
          def ensure_assistant_message() -> ChatCompletionAssistantMessageParam:
-             nonlocal current_assistant_msg
+             nonlocal current_assistant_msg, pending_thinking_blocks
              if current_assistant_msg is None:
                  current_assistant_msg = ChatCompletionAssistantMessageParam(role="assistant")
                  current_assistant_msg["tool_calls"] = []
+
              return current_assistant_msg
 
          for item in items:
@@ -446,6 +479,13 @@ class Converter:
 
              elif func_call := cls.maybe_function_tool_call(item):
                  asst = ensure_assistant_message()
+
+                 # If we have pending thinking blocks, use them as the content
+                 # This is required for Anthropic API tool calls with interleaved thinking
+                 if pending_thinking_blocks:
+                     asst["content"] = pending_thinking_blocks  # type: ignore
+                     pending_thinking_blocks = None  # Clear after using
+
                  tool_calls = list(asst.get("tool_calls", []))
                  arguments = func_call["arguments"] if func_call["arguments"] else "{}"
                  new_tool_call = ChatCompletionMessageFunctionToolCallParam(
@@ -474,9 +514,28 @@ class Converter:
                      f"Encountered an item_reference, which is not supported: {item_ref}"
                  )
 
-             # 7) reasoning message => not handled
-             elif cls.maybe_reasoning_message(item):
-                 pass
+             # 7) reasoning message => extract thinking blocks if present
+             elif reasoning_item := cls.maybe_reasoning_message(item):
+                 # Reconstruct thinking blocks from content (text) and encrypted_content (signature)
+                 content_items = reasoning_item.get("content", [])
+                 signature = reasoning_item.get("encrypted_content")
+
+                 if content_items and preserve_thinking_blocks:
+                     # Reconstruct thinking blocks from content and signature
+                     pending_thinking_blocks = []
+                     for content_item in content_items:
+                         if (
+                             isinstance(content_item, dict)
+                             and content_item.get("type") == "reasoning_text"
+                         ):
+                             thinking_block = {
+                                 "type": "thinking",
+                                 "thinking": content_item.get("text", ""),
+                             }
+                             # Add signature if available
+                             if signature:
+                                 thinking_block["signature"] = signature
+                             pending_thinking_blocks.append(thinking_block)
 
              # 8) If we haven't recognized it => fail or ignore
              else:
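Taken together, these hunks round-trip Anthropic thinking blocks through a Responses-style reasoning item: the thinking text is stored as `reasoning_text` entries in `content` and the signature in `encrypted_content`, and `items_to_messages(..., preserve_thinking_blocks=True)` rebuilds the original block shape. A rough sketch of the two shapes involved (all values are illustrative):

```python
from openai.types.responses.response_reasoning_item import (
    Content,
    ResponseReasoningItem,
    Summary,
)

# What message_to_output_items builds from a litellm message carrying
# reasoning_content plus one Anthropic thinking block (values made up).
reasoning_item = ResponseReasoningItem(
    id="__fake_id__",
    type="reasoning",
    summary=[Summary(text="Checking the forecast...", type="summary_text")],
)
reasoning_item.content = [Content(text="Checking the forecast...", type="reasoning_text")]
reasoning_item.encrypted_content = "sig_abc123"  # the Anthropic signature

# What items_to_messages reconstructs when preserve_thinking_blocks=True,
# attached as the assistant message content ahead of the tool calls.
thinking_block = {
    "type": "thinking",
    "thinking": reasoning_item.content[0].text,
    "signature": reasoning_item.encrypted_content,
}
```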
agents/models/chatcmpl_helpers.py CHANGED
@@ -1,5 +1,7 @@
  from __future__ import annotations
 
+ from contextvars import ContextVar
+
  from openai import AsyncOpenAI
 
  from ..model_settings import ModelSettings
@@ -8,6 +10,10 @@ from ..version import __version__
  _USER_AGENT = f"Agents/Python {__version__}"
  HEADERS = {"User-Agent": _USER_AGENT}
 
+ USER_AGENT_OVERRIDE: ContextVar[str | None] = ContextVar(
+     "openai_chatcompletions_user_agent_override", default=None
+ )
+
 
  class ChatCmplHelpers:
      @classmethod
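Because `USER_AGENT_OVERRIDE` is a plain `ContextVar`, a custom User-Agent can be scoped to a block of code without touching `ModelSettings`. A sketch (the header value is illustrative):

```python
from agents.models.chatcmpl_helpers import USER_AGENT_OVERRIDE

token = USER_AGENT_OVERRIDE.set("my-gateway/1.2.3")  # illustrative value
try:
    ...  # run the agent / issue Chat Completions requests here
finally:
    # Restore the default "Agents/Python <version>" header.
    USER_AGENT_OVERRIDE.reset(token)
```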
agents/models/chatcmpl_stream_handler.py CHANGED
@@ -62,6 +62,9 @@ class StreamingState:
      # Fields for real-time function call streaming
      function_call_streaming: dict[int, bool] = field(default_factory=dict)
      function_call_output_idx: dict[int, int] = field(default_factory=dict)
+     # Store accumulated thinking text and signature for Anthropic compatibility
+     thinking_text: str = ""
+     thinking_signature: str | None = None
 
 
  class SequenceNumber:
@@ -101,6 +104,19 @@ class ChatCmplStreamHandler:
 
              delta = chunk.choices[0].delta
 
+             # Handle thinking blocks from Anthropic (for preserving signatures)
+             if hasattr(delta, "thinking_blocks") and delta.thinking_blocks:
+                 for block in delta.thinking_blocks:
+                     if isinstance(block, dict):
+                         # Accumulate thinking text
+                         thinking_text = block.get("thinking", "")
+                         if thinking_text:
+                             state.thinking_text += thinking_text
+                         # Store signature if present
+                         signature = block.get("signature")
+                         if signature:
+                             state.thinking_signature = signature
+
              # Handle reasoning content for reasoning summaries
              if hasattr(delta, "reasoning_content"):
                  reasoning_content = delta.reasoning_content
@@ -527,7 +543,19 @@ class ChatCmplStreamHandler:
 
          # include Reasoning item if it exists
          if state.reasoning_content_index_and_output:
-             outputs.append(state.reasoning_content_index_and_output[1])
+             reasoning_item = state.reasoning_content_index_and_output[1]
+             # Store thinking text in content and signature in encrypted_content
+             if state.thinking_text:
+                 # Add thinking text as a Content object
+                 if not reasoning_item.content:
+                     reasoning_item.content = []
+                 reasoning_item.content.append(
+                     Content(text=state.thinking_text, type="reasoning_text")
+                 )
+             # Store signature in encrypted_content
+             if state.thinking_signature:
+                 reasoning_item.encrypted_content = state.thinking_signature
+             outputs.append(reasoning_item)
 
          # include text or refusal content if they exist
          if state.text_content_index_and_output or state.refusal_content_index_and_output:
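The handler concatenates `thinking` deltas across chunks and keeps whichever `signature` arrives last; a toy, dependency-free illustration of that accumulation (the delta dicts are made up):

```python
thinking_text = ""
thinking_signature: str | None = None

deltas = [
    {"thinking": "Let me check the "},
    {"thinking": "forecast first.", "signature": "sig_abc123"},  # illustrative
]
for block in deltas:
    if block.get("thinking"):
        thinking_text += block["thinking"]
    if block.get("signature"):
        thinking_signature = block["signature"]

assert thinking_text == "Let me check the forecast first."
assert thinking_signature == "sig_abc123"
```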
agents/models/openai_chatcompletions.py CHANGED
@@ -23,8 +23,9 @@ from ..tracing import generation_span
  from ..tracing.span_data import GenerationSpanData
  from ..tracing.spans import Span
  from ..usage import Usage
+ from ..util._json import _to_dump_compatible
  from .chatcmpl_converter import Converter
- from .chatcmpl_helpers import HEADERS, ChatCmplHelpers
+ from .chatcmpl_helpers import HEADERS, USER_AGENT_OVERRIDE, ChatCmplHelpers
  from .chatcmpl_stream_handler import ChatCmplStreamHandler
  from .fake_id import FAKE_RESPONSES_ID
  from .interface import Model, ModelTracing
@@ -237,6 +238,8 @@ class OpenAIChatCompletionsModel(Model):
                  "role": "system",
              },
          )
+         converted_messages = _to_dump_compatible(converted_messages)
+
          if tracing.include_data():
              span.span_data.input = converted_messages
 
@@ -255,12 +258,24 @@ class OpenAIChatCompletionsModel(Model):
          for handoff in handoffs:
              converted_tools.append(Converter.convert_handoff_tool(handoff))
 
+         converted_tools = _to_dump_compatible(converted_tools)
+
          if _debug.DONT_LOG_MODEL_DATA:
              logger.debug("Calling LLM")
          else:
+             messages_json = json.dumps(
+                 converted_messages,
+                 indent=2,
+                 ensure_ascii=False,
+             )
+             tools_json = json.dumps(
+                 converted_tools,
+                 indent=2,
+                 ensure_ascii=False,
+             )
              logger.debug(
-                 f"{json.dumps(converted_messages, indent=2, ensure_ascii=False)}\n"
-                 f"Tools:\n{json.dumps(converted_tools, indent=2, ensure_ascii=False)}\n"
+                 f"{messages_json}\n"
+                 f"Tools:\n{tools_json}\n"
                  f"Stream: {stream}\n"
                  f"Tool choice: {tool_choice}\n"
                  f"Response format: {response_format}\n"
@@ -291,7 +306,7 @@ class OpenAIChatCompletionsModel(Model):
              reasoning_effort=self._non_null_or_not_given(reasoning_effort),
              verbosity=self._non_null_or_not_given(model_settings.verbosity),
              top_logprobs=self._non_null_or_not_given(model_settings.top_logprobs),
-             extra_headers={**HEADERS, **(model_settings.extra_headers or {})},
+             extra_headers=self._merge_headers(model_settings),
              extra_query=model_settings.extra_query,
              extra_body=model_settings.extra_body,
              metadata=self._non_null_or_not_given(model_settings.metadata),
@@ -334,3 +349,10 @@ class OpenAIChatCompletionsModel(Model):
          if self._client is None:
              self._client = AsyncOpenAI()
          return self._client
+
+     def _merge_headers(self, model_settings: ModelSettings):
+         merged = {**HEADERS, **(model_settings.extra_headers or {})}
+         ua_ctx = USER_AGENT_OVERRIDE.get()
+         if ua_ctx is not None:
+             merged["User-Agent"] = ua_ctx
+         return merged
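`_merge_headers` gives the context-var override the last word: package defaults, then `model_settings.extra_headers`, then `USER_AGENT_OVERRIDE`. A standalone sketch of that precedence (names mirror the diff, values are made up):

```python
from contextvars import ContextVar

HEADERS = {"User-Agent": "Agents/Python 0.3.1"}
USER_AGENT_OVERRIDE: ContextVar[str | None] = ContextVar("ua_override", default=None)


def merge_headers(extra_headers: dict[str, str] | None) -> dict[str, str]:
    # extra_headers wins over the defaults; the ContextVar wins over both.
    merged = {**HEADERS, **(extra_headers or {})}
    ua_ctx = USER_AGENT_OVERRIDE.get()
    if ua_ctx is not None:
        merged["User-Agent"] = ua_ctx
    return merged


USER_AGENT_OVERRIDE.set("custom-ua/1.0")
assert merge_headers({"User-Agent": "ignored"})["User-Agent"] == "custom-ua/1.0"
```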
agents/models/openai_responses.py CHANGED
@@ -2,6 +2,7 @@ from __future__ import annotations
 
  import json
  from collections.abc import AsyncIterator
+ from contextvars import ContextVar
  from dataclasses import dataclass
  from typing import TYPE_CHECKING, Any, Literal, cast, overload
 
@@ -38,6 +39,7 @@ from ..tool import (
  )
  from ..tracing import SpanError, response_span
  from ..usage import Usage
+ from ..util._json import _to_dump_compatible
  from ..version import __version__
  from .interface import Model, ModelTracing
 
@@ -48,6 +50,11 @@ if TYPE_CHECKING:
  _USER_AGENT = f"Agents/Python {__version__}"
  _HEADERS = {"User-Agent": _USER_AGENT}
 
+ # Override for the User-Agent header used by the Responses API.
+ _USER_AGENT_OVERRIDE: ContextVar[str | None] = ContextVar(
+     "openai_responses_user_agent_override", default=None
+ )
+
 
  class OpenAIResponsesModel(Model):
      """
@@ -240,6 +247,7 @@ class OpenAIResponsesModel(Model):
          prompt: ResponsePromptParam | None = None,
      ) -> Response | AsyncStream[ResponseStreamEvent]:
          list_input = ItemHelpers.input_to_new_input_list(input)
+         list_input = _to_dump_compatible(list_input)
 
          parallel_tool_calls = (
              True
@@ -251,6 +259,7 @@ class OpenAIResponsesModel(Model):
 
          tool_choice = Converter.convert_tool_choice(model_settings.tool_choice)
          converted_tools = Converter.convert_tools(tools, handoffs)
+         converted_tools_payload = _to_dump_compatible(converted_tools.tools)
          response_format = Converter.get_response_format(output_schema)
 
          include_set: set[str] = set(converted_tools.includes)
@@ -263,10 +272,20 @@ class OpenAIResponsesModel(Model):
          if _debug.DONT_LOG_MODEL_DATA:
              logger.debug("Calling LLM")
          else:
+             input_json = json.dumps(
+                 list_input,
+                 indent=2,
+                 ensure_ascii=False,
+             )
+             tools_json = json.dumps(
+                 converted_tools_payload,
+                 indent=2,
+                 ensure_ascii=False,
+             )
              logger.debug(
                  f"Calling LLM {self.model} with input:\n"
-                 f"{json.dumps(list_input, indent=2, ensure_ascii=False)}\n"
-                 f"Tools:\n{json.dumps(converted_tools.tools, indent=2, ensure_ascii=False)}\n"
+                 f"{input_json}\n"
+                 f"Tools:\n{tools_json}\n"
                  f"Stream: {stream}\n"
                  f"Tool choice: {tool_choice}\n"
                  f"Response format: {response_format}\n"
@@ -290,7 +309,7 @@ class OpenAIResponsesModel(Model):
              model=self.model,
              input=list_input,
              include=include,
-             tools=converted_tools.tools,
+             tools=converted_tools_payload,
              prompt=self._non_null_or_not_given(prompt),
              temperature=self._non_null_or_not_given(model_settings.temperature),
              top_p=self._non_null_or_not_given(model_settings.top_p),
@@ -299,7 +318,7 @@ class OpenAIResponsesModel(Model):
              tool_choice=tool_choice,
              parallel_tool_calls=parallel_tool_calls,
              stream=stream,
-             extra_headers={**_HEADERS, **(model_settings.extra_headers or {})},
+             extra_headers=self._merge_headers(model_settings),
              extra_query=model_settings.extra_query,
              extra_body=model_settings.extra_body,
              text=response_format,
@@ -314,6 +333,13 @@ class OpenAIResponsesModel(Model):
              self._client = AsyncOpenAI()
          return self._client
 
+     def _merge_headers(self, model_settings: ModelSettings):
+         merged = {**_HEADERS, **(model_settings.extra_headers or {})}
+         ua_ctx = _USER_AGENT_OVERRIDE.get()
+         if ua_ctx is not None:
+             merged["User-Agent"] = ua_ctx
+         return merged
+
 
  @dataclass
  class ConvertedTools:
agents/realtime/__init__.py CHANGED
@@ -3,6 +3,7 @@ from .config import (
      RealtimeAudioFormat,
      RealtimeClientMessage,
      RealtimeGuardrailsSettings,
+     RealtimeInputAudioNoiseReductionConfig,
      RealtimeInputAudioTranscriptionConfig,
      RealtimeModelName,
      RealtimeModelTracingConfig,
@@ -101,6 +102,7 @@ __all__ = [
      "RealtimeAudioFormat",
      "RealtimeClientMessage",
      "RealtimeGuardrailsSettings",
+     "RealtimeInputAudioNoiseReductionConfig",
      "RealtimeInputAudioTranscriptionConfig",
      "RealtimeModelName",
      "RealtimeModelTracingConfig",
agents/realtime/_util.py CHANGED
@@ -4,6 +4,6 @@ from .config import RealtimeAudioFormat
 
 
  def calculate_audio_length_ms(format: RealtimeAudioFormat | None, audio_bytes: bytes) -> float:
-     if format and format.startswith("g711"):
+     if format and isinstance(format, str) and format.startswith("g711"):
          return (len(audio_bytes) / 8000) * 1000
      return (len(audio_bytes) / 24 / 2) * 1000
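The fall-through branch assumes 24 kHz, 16-bit (2 bytes per sample) PCM, so `len(audio_bytes) / 24 / 2` yields milliseconds, while G.711 is 8 kHz with 1 byte per sample. A quick check of both branches:

```python
from agents.realtime._util import calculate_audio_length_ms

# PCM16 at 24 kHz: 24 samples/ms * 2 bytes/sample = 48 bytes per millisecond.
assert calculate_audio_length_ms("pcm16", b"\x00" * 48_000) == 1000.0

# G.711 (u-law/a-law): 8 kHz, 1 byte per sample -> 8 bytes per millisecond.
assert calculate_audio_length_ms("g711_ulaw", b"\x00" * 8_000) == 1000.0
```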
agents/realtime/agent.py CHANGED
@@ -6,6 +6,8 @@ from collections.abc import Awaitable
  from dataclasses import dataclass, field
  from typing import Any, Callable, Generic, cast
 
+ from agents.prompts import Prompt
+
  from ..agent import AgentBase
  from ..guardrail import OutputGuardrail
  from ..handoffs import Handoff
@@ -55,6 +57,11 @@ class RealtimeAgent(AgentBase, Generic[TContext]):
      return a string.
      """
 
+     prompt: Prompt | None = None
+     """A prompt object. Prompts allow you to dynamically configure the instructions, tools
+     and other config for an agent outside of your code. Only usable with OpenAI models.
+     """
+
      handoffs: list[RealtimeAgent[Any] | Handoff[TContext, RealtimeAgent[Any]]] = field(
          default_factory=list
      )
agents/realtime/audio_formats.py ADDED
@@ -0,0 +1,29 @@
+ from __future__ import annotations
+
+ from openai.types.realtime.realtime_audio_formats import (
+     AudioPCM,
+     AudioPCMA,
+     AudioPCMU,
+     RealtimeAudioFormats,
+ )
+
+ from ..logger import logger
+
+
+ def to_realtime_audio_format(
+     input_audio_format: str | RealtimeAudioFormats | None,
+ ) -> RealtimeAudioFormats | None:
+     format: RealtimeAudioFormats | None = None
+     if input_audio_format is not None:
+         if isinstance(input_audio_format, str):
+             if input_audio_format in ["pcm16", "audio/pcm", "pcm"]:
+                 format = AudioPCM(type="audio/pcm", rate=24000)
+             elif input_audio_format in ["g711_ulaw", "audio/pcmu", "pcmu"]:
+                 format = AudioPCMU(type="audio/pcmu")
+             elif input_audio_format in ["g711_alaw", "audio/pcma", "pcma"]:
+                 format = AudioPCMA(type="audio/pcma")
+             else:
+                 logger.debug(f"Unknown input_audio_format: {input_audio_format}")
+         else:
+             format = input_audio_format
+     return format
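A few example mappings for the new helper, based on the branches above (assuming the new module is `agents/realtime/audio_formats.py`, as listed in the files-changed table):

```python
from agents.realtime.audio_formats import to_realtime_audio_format

pcm = to_realtime_audio_format("pcm16")
assert pcm is not None and pcm.type == "audio/pcm"  # 24 kHz PCM

assert to_realtime_audio_format("g711_ulaw").type == "audio/pcmu"
assert to_realtime_audio_format("g711_alaw").type == "audio/pcma"
assert to_realtime_audio_format(None) is None
```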
agents/realtime/config.py CHANGED
@@ -6,8 +6,13 @@ from typing import (
      Union,
  )
 
+ from openai.types.realtime.realtime_audio_formats import (
+     RealtimeAudioFormats as OpenAIRealtimeAudioFormats,
+ )
  from typing_extensions import NotRequired, TypeAlias, TypedDict
 
+ from agents.prompts import Prompt
+
  from ..guardrail import OutputGuardrail
  from ..handoffs import Handoff
  from ..model_settings import ToolChoice
@@ -15,6 +20,8 @@ from ..tool import Tool
 
  RealtimeModelName: TypeAlias = Union[
      Literal[
+         "gpt-realtime",
+         "gpt-realtime-2025-08-28",
          "gpt-4o-realtime-preview",
          "gpt-4o-mini-realtime-preview",
          "gpt-4o-realtime-preview-2025-06-03",
@@ -54,6 +61,13 @@ class RealtimeInputAudioTranscriptionConfig(TypedDict):
      """An optional prompt to guide transcription."""
 
 
+ class RealtimeInputAudioNoiseReductionConfig(TypedDict):
+     """Noise reduction configuration for input audio."""
+
+     type: NotRequired[Literal["near_field", "far_field"]]
+     """Noise reduction mode to apply to input audio."""
+
+
  class RealtimeTurnDetectionConfig(TypedDict):
      """Turn detection config. Allows extra vendor keys if needed."""
 
@@ -91,6 +105,9 @@ class RealtimeSessionModelSettings(TypedDict):
      instructions: NotRequired[str]
      """System instructions for the model."""
 
+     prompt: NotRequired[Prompt]
+     """The prompt to use for the model."""
+
      modalities: NotRequired[list[Literal["text", "audio"]]]
      """The modalities the model should support."""
 
@@ -100,15 +117,18 @@ class RealtimeSessionModelSettings(TypedDict):
      speed: NotRequired[float]
      """The speed of the model's responses."""
 
-     input_audio_format: NotRequired[RealtimeAudioFormat]
+     input_audio_format: NotRequired[RealtimeAudioFormat | OpenAIRealtimeAudioFormats]
      """The format for input audio streams."""
 
-     output_audio_format: NotRequired[RealtimeAudioFormat]
+     output_audio_format: NotRequired[RealtimeAudioFormat | OpenAIRealtimeAudioFormats]
      """The format for output audio streams."""
 
      input_audio_transcription: NotRequired[RealtimeInputAudioTranscriptionConfig]
      """Configuration for transcribing input audio."""
 
+     input_audio_noise_reduction: NotRequired[RealtimeInputAudioNoiseReductionConfig | None]
+     """Noise reduction configuration for input audio."""
+
      turn_detection: NotRequired[RealtimeTurnDetectionConfig]
      """Configuration for detecting conversation turns."""
 
@@ -177,6 +197,14 @@ class RealtimeUserInputText(TypedDict):
      """The text content from the user."""
 
 
+ class RealtimeUserInputImage(TypedDict, total=False):
+     """An image input from the user (Realtime)."""
+
+     type: Literal["input_image"]
+     image_url: str
+     detail: NotRequired[Literal["auto", "low", "high"] | str]
+
+
  class RealtimeUserInputMessage(TypedDict):
      """A message input from the user."""
 
@@ -186,8 +214,8 @@ class RealtimeUserInputMessage(TypedDict):
      role: Literal["user"]
      """The role identifier for user messages."""
 
-     content: list[RealtimeUserInputText]
-     """List of text content items in the message."""
+     content: list[RealtimeUserInputText | RealtimeUserInputImage]
+     """List of content items (text and image) in the message."""
 
 
  RealtimeUserInput: TypeAlias = Union[str, RealtimeUserInputMessage]
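With the new `RealtimeUserInputImage` type, a realtime user message can mix text and image parts. A sketch of such a message (the data URL is a placeholder):

```python
from agents.realtime.config import RealtimeUserInputMessage

message: RealtimeUserInputMessage = {
    "type": "message",
    "role": "user",
    "content": [
        {"type": "input_text", "text": "What is in this picture?"},
        {
            "type": "input_image",
            "image_url": "data:image/png;base64,...",  # placeholder data URL
            "detail": "auto",
        },
    ],
}
```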
agents/realtime/items.py CHANGED
@@ -34,6 +34,22 @@ class InputAudio(BaseModel):
      model_config = ConfigDict(extra="allow")
 
 
+ class InputImage(BaseModel):
+     """Image input content for realtime messages."""
+
+     type: Literal["input_image"] = "input_image"
+     """The type identifier for image input."""
+
+     image_url: str | None = None
+     """Data/remote URL string (data:... or https:...)."""
+
+     detail: str | None = None
+     """Optional detail hint (e.g., 'auto', 'high', 'low')."""
+
+     # Allow extra data (e.g., `detail`)
+     model_config = ConfigDict(extra="allow")
+
+
  class AssistantText(BaseModel):
      """Text content from the assistant in realtime responses."""
 
@@ -100,7 +116,7 @@ class UserMessageItem(BaseModel):
      role: Literal["user"] = "user"
      """The role identifier for user messages."""
 
-     content: list[Annotated[InputText | InputAudio, Field(discriminator="type")]]
+     content: list[Annotated[InputText | InputAudio | InputImage, Field(discriminator="type")]]
      """List of content items, can be text or audio."""
 
      # Allow extra data
agents/realtime/model_events.py CHANGED
@@ -84,6 +84,7 @@ class RealtimeModelInputAudioTranscriptionCompletedEvent:
 
      type: Literal["input_audio_transcription_completed"] = "input_audio_transcription_completed"
 
+
  @dataclass
  class RealtimeModelInputAudioTimeoutTriggeredEvent:
      """Input audio timeout triggered."""
@@ -94,6 +95,7 @@ class RealtimeModelInputAudioTimeoutTriggeredEvent:
 
      type: Literal["input_audio_timeout_triggered"] = "input_audio_timeout_triggered"
 
+
  @dataclass
  class RealtimeModelTranscriptDeltaEvent:
      """Partial transcript update."""
agents/realtime/model_inputs.py CHANGED
@@ -24,12 +24,26 @@ class RealtimeModelInputTextContent(TypedDict):
      text: str
 
 
+ class RealtimeModelInputImageContent(TypedDict, total=False):
+     """An image to be sent to the model.
+
+     The Realtime API expects `image_url` to be a string data/remote URL.
+     """
+
+     type: Literal["input_image"]
+     image_url: str
+     """String URL (data:... or https:...)."""
+
+     detail: NotRequired[str]
+     """Optional detail hint such as 'high', 'low', or 'auto'."""
+
+
  class RealtimeModelUserInputMessage(TypedDict):
      """A message to be sent to the model."""
 
      type: Literal["message"]
      role: Literal["user"]
-     content: list[RealtimeModelInputTextContent]
+     content: list[RealtimeModelInputTextContent | RealtimeModelInputImageContent]
 
 
  RealtimeModelUserInput: TypeAlias = Union[str, RealtimeModelUserInputMessage]