openai-agents 0.2.10__py3-none-any.whl → 0.3.0__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- agents/_debug.py +15 -4
- agents/_run_impl.py +34 -37
- agents/extensions/models/litellm_model.py +20 -5
- agents/memory/__init__.py +2 -0
- agents/memory/openai_conversations_session.py +0 -3
- agents/memory/util.py +20 -0
- agents/models/openai_chatcompletions.py +17 -2
- agents/models/openai_responses.py +17 -4
- agents/realtime/_util.py +1 -1
- agents/realtime/agent.py +7 -0
- agents/realtime/audio_formats.py +29 -0
- agents/realtime/config.py +22 -4
- agents/realtime/items.py +17 -1
- agents/realtime/model.py +6 -0
- agents/realtime/model_inputs.py +15 -1
- agents/realtime/openai_realtime.py +428 -139
- agents/realtime/session.py +167 -14
- agents/run.py +102 -54
- agents/tool.py +2 -2
- agents/util/_json.py +19 -1
- agents/voice/input.py +5 -4
- agents/voice/models/openai_stt.py +6 -4
- {openai_agents-0.2.10.dist-info → openai_agents-0.3.0.dist-info}/METADATA +2 -2
- {openai_agents-0.2.10.dist-info → openai_agents-0.3.0.dist-info}/RECORD +26 -24
- {openai_agents-0.2.10.dist-info → openai_agents-0.3.0.dist-info}/WHEEL +0 -0
- {openai_agents-0.2.10.dist-info → openai_agents-0.3.0.dist-info}/licenses/LICENSE +0 -0
agents/_debug.py
CHANGED
@@ -1,17 +1,28 @@
 import os
 
 
-def _debug_flag_enabled(flag: str) -> bool:
+def _debug_flag_enabled(flag: str, default: bool = False) -> bool:
     flag_value = os.getenv(flag)
-    return flag_value is not None and (flag_value == "1" or flag_value.lower() == "true")
+    if flag_value is None:
+        return default
+    else:
+        return flag_value == "1" or flag_value.lower() == "true"
 
 
-DONT_LOG_MODEL_DATA = _debug_flag_enabled("OPENAI_AGENTS_DONT_LOG_MODEL_DATA")
+def _load_dont_log_model_data() -> bool:
+    return _debug_flag_enabled("OPENAI_AGENTS_DONT_LOG_MODEL_DATA", default=True)
+
+
+def _load_dont_log_tool_data() -> bool:
+    return _debug_flag_enabled("OPENAI_AGENTS_DONT_LOG_TOOL_DATA", default=True)
+
+
+DONT_LOG_MODEL_DATA = _load_dont_log_model_data()
 """By default we don't log LLM inputs/outputs, to prevent exposing sensitive information. Set this
 flag to enable logging them.
 """
 
-DONT_LOG_TOOL_DATA = _debug_flag_enabled("OPENAI_AGENTS_DONT_LOG_TOOL_DATA")
+DONT_LOG_TOOL_DATA = _load_dont_log_tool_data()
 """By default we don't log tool call inputs/outputs, to prevent exposing sensitive information. Set
 this flag to enable logging them.
 """
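The practical effect: both flags now default to True via the new default= parameter, and any value other than "1"/"true" turns them off. A minimal standalone sketch of the new rule (this re-implements _debug_flag_enabled for illustration rather than importing it; note the real flags are read once at import time, so the environment must be set before agents is imported):

    import os

    def flag_enabled(name: str, default: bool = False) -> bool:
        # Mirrors the new _debug_flag_enabled: missing -> default,
        # otherwise true only for "1" or "true" (case-insensitive).
        value = os.getenv(name)
        if value is None:
            return default
        return value == "1" or value.lower() == "true"

    # Setting the variable to "0" disables the DONT_LOG behavior,
    # i.e. enables model-data logging.
    os.environ["OPENAI_AGENTS_DONT_LOG_MODEL_DATA"] = "0"
    assert flag_enabled("OPENAI_AGENTS_DONT_LOG_MODEL_DATA", default=True) is False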
agents/_run_impl.py
CHANGED
@@ -330,43 +330,40 @@ class RunImpl:
             ItemHelpers.extract_last_text(message_items[-1].raw_item) if message_items else None
         )
 
-        # ...
-        ...
-            new_step_items=new_step_items,
-            next_step=NextStepRunAgain(),
-        )
+        # Generate final output only when there are no pending tool calls or approval requests.
+        if not processed_response.has_tools_or_approvals_to_run():
+            if output_schema and not output_schema.is_plain_text() and potential_final_output_text:
+                final_output = output_schema.validate_json(potential_final_output_text)
+                return await cls.execute_final_output(
+                    agent=agent,
+                    original_input=original_input,
+                    new_response=new_response,
+                    pre_step_items=pre_step_items,
+                    new_step_items=new_step_items,
+                    final_output=final_output,
+                    hooks=hooks,
+                    context_wrapper=context_wrapper,
+                )
+            elif not output_schema or output_schema.is_plain_text():
+                return await cls.execute_final_output(
+                    agent=agent,
+                    original_input=original_input,
+                    new_response=new_response,
+                    pre_step_items=pre_step_items,
+                    new_step_items=new_step_items,
+                    final_output=potential_final_output_text or "",
+                    hooks=hooks,
+                    context_wrapper=context_wrapper,
+                )
+
+        # If there's no final output, we can just run again
+        return SingleStepResult(
+            original_input=original_input,
+            model_response=new_response,
+            pre_step_items=pre_step_items,
+            new_step_items=new_step_items,
+            next_step=NextStepRunAgain(),
+        )
 
     @classmethod
     def maybe_reset_tool_choice(
agents/extensions/models/litellm_model.py
CHANGED
@@ -48,6 +48,7 @@ from ...tracing import generation_span
 from ...tracing.span_data import GenerationSpanData
 from ...tracing.spans import Span
 from ...usage import Usage
+from ...util._json import _to_dump_compatible
 
 
 class InternalChatCompletionMessage(ChatCompletionMessage):
@@ -265,6 +266,8 @@ class LitellmModel(Model):
                     "role": "system",
                 },
             )
+        converted_messages = _to_dump_compatible(converted_messages)
+
         if tracing.include_data():
             span.span_data.input = converted_messages
 
@@ -283,13 +286,25 @@ class LitellmModel(Model):
         for handoff in handoffs:
             converted_tools.append(Converter.convert_handoff_tool(handoff))
 
+        converted_tools = _to_dump_compatible(converted_tools)
+
         if _debug.DONT_LOG_MODEL_DATA:
             logger.debug("Calling LLM")
         else:
+            messages_json = json.dumps(
+                converted_messages,
+                indent=2,
+                ensure_ascii=False,
+            )
+            tools_json = json.dumps(
+                converted_tools,
+                indent=2,
+                ensure_ascii=False,
+            )
             logger.debug(
                 f"Calling Litellm model: {self.model}\n"
-                f"{...}\n"
-                f"Tools:\n{...}\n"
+                f"{messages_json}\n"
+                f"Tools:\n{tools_json}\n"
                 f"Stream: {stream}\n"
                 f"Tool choice: {tool_choice}\n"
                 f"Response format: {response_format}\n"
@@ -369,9 +384,9 @@ class LitellmConverter:
         if message.role != "assistant":
             raise ModelBehaviorError(f"Unsupported role: {message.role}")
 
-        tool_calls: list[
-            ChatCompletionMessageFunctionToolCall | ChatCompletionMessageCustomToolCall
-        ] | None = (
+        tool_calls: (
+            list[ChatCompletionMessageFunctionToolCall | ChatCompletionMessageCustomToolCall] | None
+        ) = (
             [LitellmConverter.convert_tool_call_to_openai(tool) for tool in message.tool_calls]
             if message.tool_calls
             else None
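This file and the two OpenAI model classes below now pass converted messages and tools through _to_dump_compatible before tracing and debug logging. The helper's body lives in agents/util/_json.py (+19 -1 in this release) and is not shown in this diff; its call sites imply it converts Pydantic models and similar objects into JSON-serializable primitives. A rough, hypothetical sketch under that assumption (not the package's actual implementation):

    from typing import Any

    from pydantic import BaseModel

    def to_dump_compatible(value: Any) -> Any:
        # Hypothetical stand-in for agents.util._json._to_dump_compatible:
        # recursively turn Pydantic models into plain dicts so that
        # json.dumps(...) on the result cannot raise a TypeError.
        if isinstance(value, BaseModel):
            return value.model_dump()
        if isinstance(value, dict):
            return {k: to_dump_compatible(v) for k, v in value.items()}
        if isinstance(value, (list, tuple)):
            return [to_dump_compatible(v) for v in value]
        return value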
agents/memory/__init__.py
CHANGED
@@ -1,10 +1,12 @@
 from .openai_conversations_session import OpenAIConversationsSession
 from .session import Session, SessionABC
 from .sqlite_session import SQLiteSession
+from .util import SessionInputCallback
 
 __all__ = [
     "Session",
     "SessionABC",
+    "SessionInputCallback",
     "SQLiteSession",
     "OpenAIConversationsSession",
 ]
agents/memory/util.py
ADDED
@@ -0,0 +1,20 @@
+from __future__ import annotations
+
+from typing import Callable
+
+from ..items import TResponseInputItem
+from ..util._types import MaybeAwaitable
+
+SessionInputCallback = Callable[
+    [list[TResponseInputItem], list[TResponseInputItem]],
+    MaybeAwaitable[list[TResponseInputItem]],
+]
+"""A function that combines session history with new input items.
+
+Args:
+    history_items: The list of items from the session history.
+    new_items: The list of new input items for the current turn.
+
+Returns:
+    A list of combined items to be used as input for the agent. Can be sync or async.
+"""
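SessionInputCallback lets callers control how session history is merged with the new turn's input: any sync or async callable with this shape works. A simple example that caps the history window (the 20-item cutoff is arbitrary):

    from agents.items import TResponseInputItem

    def keep_recent_history(
        history_items: list[TResponseInputItem],
        new_items: list[TResponseInputItem],
    ) -> list[TResponseInputItem]:
        # Keep only the 20 most recent history items before this turn's input.
        return history_items[-20:] + new_items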
agents/models/openai_chatcompletions.py
CHANGED
@@ -23,6 +23,7 @@ from ..tracing import generation_span
 from ..tracing.span_data import GenerationSpanData
 from ..tracing.spans import Span
 from ..usage import Usage
+from ..util._json import _to_dump_compatible
 from .chatcmpl_converter import Converter
 from .chatcmpl_helpers import HEADERS, ChatCmplHelpers
 from .chatcmpl_stream_handler import ChatCmplStreamHandler
@@ -237,6 +238,8 @@ class OpenAIChatCompletionsModel(Model):
                     "role": "system",
                 },
             )
+        converted_messages = _to_dump_compatible(converted_messages)
+
         if tracing.include_data():
             span.span_data.input = converted_messages
 
@@ -255,12 +258,24 @@ class OpenAIChatCompletionsModel(Model):
         for handoff in handoffs:
             converted_tools.append(Converter.convert_handoff_tool(handoff))
 
+        converted_tools = _to_dump_compatible(converted_tools)
+
         if _debug.DONT_LOG_MODEL_DATA:
             logger.debug("Calling LLM")
         else:
+            messages_json = json.dumps(
+                converted_messages,
+                indent=2,
+                ensure_ascii=False,
+            )
+            tools_json = json.dumps(
+                converted_tools,
+                indent=2,
+                ensure_ascii=False,
+            )
             logger.debug(
-                f"{...}\n"
-                f"Tools:\n{...}\n"
+                f"{messages_json}\n"
+                f"Tools:\n{tools_json}\n"
                 f"Stream: {stream}\n"
                 f"Tool choice: {tool_choice}\n"
                 f"Response format: {response_format}\n"
agents/models/openai_responses.py
CHANGED
@@ -38,6 +38,7 @@ from ..tool import (
 )
 from ..tracing import SpanError, response_span
 from ..usage import Usage
+from ..util._json import _to_dump_compatible
 from ..version import __version__
 from .interface import Model, ModelTracing
 
@@ -240,6 +241,7 @@ class OpenAIResponsesModel(Model):
         prompt: ResponsePromptParam | None = None,
     ) -> Response | AsyncStream[ResponseStreamEvent]:
         list_input = ItemHelpers.input_to_new_input_list(input)
+        list_input = _to_dump_compatible(list_input)
 
         parallel_tool_calls = (
             True
@@ -251,6 +253,7 @@ class OpenAIResponsesModel(Model):
 
         tool_choice = Converter.convert_tool_choice(model_settings.tool_choice)
         converted_tools = Converter.convert_tools(tools, handoffs)
+        converted_tools_payload = _to_dump_compatible(converted_tools.tools)
         response_format = Converter.get_response_format(output_schema)
 
         include_set: set[str] = set(converted_tools.includes)
@@ -263,10 +266,20 @@ class OpenAIResponsesModel(Model):
         if _debug.DONT_LOG_MODEL_DATA:
             logger.debug("Calling LLM")
         else:
+            input_json = json.dumps(
+                list_input,
+                indent=2,
+                ensure_ascii=False,
+            )
+            tools_json = json.dumps(
+                converted_tools_payload,
+                indent=2,
+                ensure_ascii=False,
+            )
             logger.debug(
                 f"Calling LLM {self.model} with input:\n"
-                f"{...}\n"
-                f"Tools:\n{...}\n"
+                f"{input_json}\n"
+                f"Tools:\n{tools_json}\n"
                 f"Stream: {stream}\n"
                 f"Tool choice: {tool_choice}\n"
                 f"Response format: {response_format}\n"
@@ -290,7 +303,7 @@ class OpenAIResponsesModel(Model):
             model=self.model,
             input=list_input,
             include=include,
-            tools=converted_tools.tools,
+            tools=converted_tools_payload,
             prompt=self._non_null_or_not_given(prompt),
             temperature=self._non_null_or_not_given(model_settings.temperature),
             top_p=self._non_null_or_not_given(model_settings.top_p),
@@ -433,7 +446,7 @@ class Converter:
            converted_tool = {
                "type": "web_search",
                "filters": tool.filters.model_dump() if tool.filters is not None else None,  # type: ignore [typeddict-item]
-               "user_location": tool.user_location,
+               "user_location": tool.user_location,
                "search_context_size": tool.search_context_size,
            }
            includes = None
agents/realtime/_util.py
CHANGED
@@ -4,6 +4,6 @@ from .config import RealtimeAudioFormat
 
 
 def calculate_audio_length_ms(format: RealtimeAudioFormat | None, audio_bytes: bytes) -> float:
-    if format and format.startswith("g711"):
+    if format and isinstance(format, str) and format.startswith("g711"):
         return (len(audio_bytes) / 8000) * 1000
     return (len(audio_bytes) / 24 / 2) * 1000
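The constants here encode byte rates: G.711 (u-law/a-law) is 8000 one-byte samples per second, while the default branch assumes 24 kHz, 16-bit mono PCM, i.e. 24 samples per millisecond at 2 bytes each. A quick check of both branches:

    # 1 second of 24 kHz 16-bit mono PCM = 48,000 bytes -> 1000 ms
    assert (48000 / 24 / 2) == 1000.0
    # 1 second of G.711 = 8,000 bytes -> 1000 ms
    assert (8000 / 8000) * 1000 == 1000.0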
agents/realtime/agent.py
CHANGED
@@ -6,6 +6,8 @@ from collections.abc import Awaitable
 from dataclasses import dataclass, field
 from typing import Any, Callable, Generic, cast
 
+from agents.prompts import Prompt
+
 from ..agent import AgentBase
 from ..guardrail import OutputGuardrail
 from ..handoffs import Handoff
@@ -55,6 +57,11 @@ class RealtimeAgent(AgentBase, Generic[TContext]):
     return a string.
     """
 
+    prompt: Prompt | None = None
+    """A prompt object. Prompts allow you to dynamically configure the instructions, tools
+    and other config for an agent outside of your code. Only usable with OpenAI models.
+    """
+
     handoffs: list[RealtimeAgent[Any] | Handoff[TContext, RealtimeAgent[Any]]] = field(
         default_factory=list
     )
agents/realtime/audio_formats.py
ADDED
@@ -0,0 +1,29 @@
+from __future__ import annotations
+
+from openai.types.realtime.realtime_audio_formats import (
+    AudioPCM,
+    AudioPCMA,
+    AudioPCMU,
+    RealtimeAudioFormats,
+)
+
+from ..logger import logger
+
+
+def to_realtime_audio_format(
+    input_audio_format: str | RealtimeAudioFormats | None,
+) -> RealtimeAudioFormats | None:
+    format: RealtimeAudioFormats | None = None
+    if input_audio_format is not None:
+        if isinstance(input_audio_format, str):
+            if input_audio_format in ["pcm16", "audio/pcm", "pcm"]:
+                format = AudioPCM(type="audio/pcm", rate=24000)
+            elif input_audio_format in ["g711_ulaw", "audio/pcmu", "pcmu"]:
+                format = AudioPCMU(type="audio/pcmu")
+            elif input_audio_format in ["g711_alaw", "audio/pcma", "pcma"]:
+                format = AudioPCMA(type="audio/pcma")
+            else:
+                logger.debug(f"Unknown input_audio_format: {input_audio_format}")
+        else:
+            format = input_audio_format
+    return format
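Usage is straightforward: legacy string names ("pcm16", "g711_ulaw", "g711_alaw") and their MIME-style aliases normalize to the typed OpenAI format objects, already-typed values pass through unchanged, and None stays None. For example (module path as shown in this diff):

    from agents.realtime.audio_formats import to_realtime_audio_format

    fmt = to_realtime_audio_format("pcm16")
    print(type(fmt).__name__, fmt.rate)               # AudioPCM 24000
    print(to_realtime_audio_format("g711_ulaw").type)  # audio/pcmu
    assert to_realtime_audio_format(None) is None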
agents/realtime/config.py
CHANGED
@@ -6,8 +6,13 @@ from typing import (
     Union,
 )
 
+from openai.types.realtime.realtime_audio_formats import (
+    RealtimeAudioFormats as OpenAIRealtimeAudioFormats,
+)
 from typing_extensions import NotRequired, TypeAlias, TypedDict
 
+from agents.prompts import Prompt
+
 from ..guardrail import OutputGuardrail
 from ..handoffs import Handoff
 from ..model_settings import ToolChoice
@@ -15,6 +20,8 @@ from ..tool import Tool
 
 RealtimeModelName: TypeAlias = Union[
     Literal[
+        "gpt-realtime",
+        "gpt-realtime-2025-08-28",
         "gpt-4o-realtime-preview",
         "gpt-4o-mini-realtime-preview",
         "gpt-4o-realtime-preview-2025-06-03",
@@ -91,6 +98,9 @@ class RealtimeSessionModelSettings(TypedDict):
     instructions: NotRequired[str]
     """System instructions for the model."""
 
+    prompt: NotRequired[Prompt]
+    """The prompt to use for the model."""
+
     modalities: NotRequired[list[Literal["text", "audio"]]]
     """The modalities the model should support."""
 
@@ -100,10 +110,10 @@ class RealtimeSessionModelSettings(TypedDict):
     speed: NotRequired[float]
     """The speed of the model's responses."""
 
-    input_audio_format: NotRequired[RealtimeAudioFormat]
+    input_audio_format: NotRequired[RealtimeAudioFormat | OpenAIRealtimeAudioFormats]
     """The format for input audio streams."""
 
-    output_audio_format: NotRequired[RealtimeAudioFormat]
+    output_audio_format: NotRequired[RealtimeAudioFormat | OpenAIRealtimeAudioFormats]
     """The format for output audio streams."""
 
     input_audio_transcription: NotRequired[RealtimeInputAudioTranscriptionConfig]
@@ -177,6 +187,14 @@ class RealtimeUserInputText(TypedDict):
     """The text content from the user."""
 
 
+class RealtimeUserInputImage(TypedDict, total=False):
+    """An image input from the user (Realtime)."""
+
+    type: Literal["input_image"]
+    image_url: str
+    detail: NotRequired[Literal["auto", "low", "high"] | str]
+
+
 class RealtimeUserInputMessage(TypedDict):
     """A message input from the user."""
 
@@ -186,8 +204,8 @@ class RealtimeUserInputMessage(TypedDict):
     role: Literal["user"]
     """The role identifier for user messages."""
 
-    content: list[RealtimeUserInputText]
-    """List of text content items in the message."""
+    content: list[RealtimeUserInputText | RealtimeUserInputImage]
+    """List of content items (text and image) in the message."""
 
 
 RealtimeUserInput: TypeAlias = Union[str, RealtimeUserInputMessage]
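With RealtimeUserInputImage in place, a realtime user message can mix text and image content. A sketch (the "message" and "input_text" literals are assumed from the matching model-input types later in this diff; the URL is a placeholder):

    from agents.realtime.config import RealtimeUserInputMessage

    message: RealtimeUserInputMessage = {
        "type": "message",
        "role": "user",
        "content": [
            {"type": "input_text", "text": "What is in this image?"},
            {
                "type": "input_image",
                "image_url": "https://example.com/photo.jpg",  # data: URLs also work
                "detail": "auto",
            },
        ],
    }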
agents/realtime/items.py
CHANGED
@@ -34,6 +34,22 @@ class InputAudio(BaseModel):
     model_config = ConfigDict(extra="allow")
 
 
+class InputImage(BaseModel):
+    """Image input content for realtime messages."""
+
+    type: Literal["input_image"] = "input_image"
+    """The type identifier for image input."""
+
+    image_url: str | None = None
+    """Data/remote URL string (data:... or https:...)."""
+
+    detail: str | None = None
+    """Optional detail hint (e.g., 'auto', 'high', 'low')."""
+
+    # Allow extra data (e.g., `detail`)
+    model_config = ConfigDict(extra="allow")
+
+
 class AssistantText(BaseModel):
     """Text content from the assistant in realtime responses."""
 
@@ -100,7 +116,7 @@ class UserMessageItem(BaseModel):
     role: Literal["user"] = "user"
     """The role identifier for user messages."""
 
-    content: list[Annotated[InputText | InputAudio, Field(discriminator="type")]]
+    content: list[Annotated[InputText | InputAudio | InputImage, Field(discriminator="type")]]
     """List of content items, can be text or audio."""
 
     # Allow extra data
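Since InputImage joins the type-discriminated union used by UserMessageItem.content, realtime history items can now carry images alongside text and audio:

    from agents.realtime.items import InputImage

    # detail is optional; extra fields are tolerated via model_config.
    img = InputImage(image_url="https://example.com/photo.jpg", detail="high")
    print(img.type)  # "input_image"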
agents/realtime/model.py
CHANGED
@@ -118,6 +118,12 @@ class RealtimeModelConfig(TypedDict):
     the OpenAI Realtime model will use the default OpenAI WebSocket URL.
     """
 
+    headers: NotRequired[dict[str, str]]
+    """The headers to use when connecting. If unset, the model will use a sane default.
+    Note that, when you set this, authorization header won't be set under the hood.
+    e.g., {"api-key": "your api key here"} for Azure OpenAI Realtime WebSocket connections.
+    """
+
     initial_model_settings: NotRequired[RealtimeSessionModelSettings]
     """The initial model settings to use when connecting."""
 
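One consequence of the new headers field: supplying it means the default Authorization header is not added, so credentials must be passed explicitly. A sketch for an Azure-style endpoint (the url key is implied by the surrounding docstring; the endpoint below is a placeholder):

    import os

    from agents.realtime.model import RealtimeModelConfig

    config: RealtimeModelConfig = {
        "url": "wss://your-resource.openai.azure.com/openai/realtime",
        # Because headers is set, no Authorization header is added for us.
        "headers": {"api-key": os.environ["AZURE_OPENAI_API_KEY"]},
    }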
agents/realtime/model_inputs.py
CHANGED
@@ -24,12 +24,26 @@ class RealtimeModelInputTextContent(TypedDict):
     text: str
 
 
+class RealtimeModelInputImageContent(TypedDict, total=False):
+    """An image to be sent to the model.
+
+    The Realtime API expects `image_url` to be a string data/remote URL.
+    """
+
+    type: Literal["input_image"]
+    image_url: str
+    """String URL (data:... or https:...)."""
+
+    detail: NotRequired[str]
+    """Optional detail hint such as 'high', 'low', or 'auto'."""
+
+
 class RealtimeModelUserInputMessage(TypedDict):
     """A message to be sent to the model."""
 
     type: Literal["message"]
     role: Literal["user"]
-    content: list[RealtimeModelInputTextContent]
+    content: list[RealtimeModelInputTextContent | RealtimeModelInputImageContent]
 
 
 RealtimeModelUserInput: TypeAlias = Union[str, RealtimeModelUserInputMessage]