dv-pipecat-ai 0.0.82.dev857__py3-none-any.whl → 0.0.85.dev837__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Potentially problematic release.
This version of dv-pipecat-ai might be problematic; see the registry's advisory page for more details.
- {dv_pipecat_ai-0.0.82.dev857.dist-info → dv_pipecat_ai-0.0.85.dev837.dist-info}/METADATA +98 -130
- {dv_pipecat_ai-0.0.82.dev857.dist-info → dv_pipecat_ai-0.0.85.dev837.dist-info}/RECORD +192 -140
- pipecat/adapters/base_llm_adapter.py +38 -1
- pipecat/adapters/services/anthropic_adapter.py +9 -14
- pipecat/adapters/services/aws_nova_sonic_adapter.py +120 -5
- pipecat/adapters/services/bedrock_adapter.py +236 -13
- pipecat/adapters/services/gemini_adapter.py +12 -8
- pipecat/adapters/services/open_ai_adapter.py +19 -7
- pipecat/adapters/services/open_ai_realtime_adapter.py +5 -0
- pipecat/audio/dtmf/dtmf-0.wav +0 -0
- pipecat/audio/dtmf/dtmf-1.wav +0 -0
- pipecat/audio/dtmf/dtmf-2.wav +0 -0
- pipecat/audio/dtmf/dtmf-3.wav +0 -0
- pipecat/audio/dtmf/dtmf-4.wav +0 -0
- pipecat/audio/dtmf/dtmf-5.wav +0 -0
- pipecat/audio/dtmf/dtmf-6.wav +0 -0
- pipecat/audio/dtmf/dtmf-7.wav +0 -0
- pipecat/audio/dtmf/dtmf-8.wav +0 -0
- pipecat/audio/dtmf/dtmf-9.wav +0 -0
- pipecat/audio/dtmf/dtmf-pound.wav +0 -0
- pipecat/audio/dtmf/dtmf-star.wav +0 -0
- pipecat/audio/filters/krisp_viva_filter.py +193 -0
- pipecat/audio/filters/noisereduce_filter.py +15 -0
- pipecat/audio/turn/base_turn_analyzer.py +9 -1
- pipecat/audio/turn/smart_turn/base_smart_turn.py +14 -8
- pipecat/audio/turn/smart_turn/data/__init__.py +0 -0
- pipecat/audio/turn/smart_turn/data/smart-turn-v3.0.onnx +0 -0
- pipecat/audio/turn/smart_turn/http_smart_turn.py +6 -2
- pipecat/audio/turn/smart_turn/local_smart_turn.py +1 -1
- pipecat/audio/turn/smart_turn/local_smart_turn_v2.py +1 -1
- pipecat/audio/turn/smart_turn/local_smart_turn_v3.py +124 -0
- pipecat/audio/vad/data/README.md +10 -0
- pipecat/audio/vad/data/silero_vad_v2.onnx +0 -0
- pipecat/audio/vad/silero.py +9 -3
- pipecat/audio/vad/vad_analyzer.py +13 -1
- pipecat/extensions/voicemail/voicemail_detector.py +5 -5
- pipecat/frames/frames.py +277 -86
- pipecat/observers/loggers/debug_log_observer.py +3 -3
- pipecat/observers/loggers/llm_log_observer.py +7 -3
- pipecat/observers/loggers/user_bot_latency_log_observer.py +22 -10
- pipecat/pipeline/runner.py +18 -6
- pipecat/pipeline/service_switcher.py +64 -36
- pipecat/pipeline/task.py +125 -79
- pipecat/pipeline/tts_switcher.py +30 -0
- pipecat/processors/aggregators/dtmf_aggregator.py +2 -3
- pipecat/processors/aggregators/{gated_openai_llm_context.py → gated_llm_context.py} +9 -9
- pipecat/processors/aggregators/gated_open_ai_llm_context.py +12 -0
- pipecat/processors/aggregators/llm_context.py +40 -2
- pipecat/processors/aggregators/llm_response.py +32 -15
- pipecat/processors/aggregators/llm_response_universal.py +19 -15
- pipecat/processors/aggregators/user_response.py +6 -6
- pipecat/processors/aggregators/vision_image_frame.py +24 -2
- pipecat/processors/audio/audio_buffer_processor.py +43 -8
- pipecat/processors/dtmf_aggregator.py +174 -77
- pipecat/processors/filters/stt_mute_filter.py +17 -0
- pipecat/processors/frame_processor.py +110 -24
- pipecat/processors/frameworks/langchain.py +8 -2
- pipecat/processors/frameworks/rtvi.py +210 -68
- pipecat/processors/frameworks/strands_agents.py +170 -0
- pipecat/processors/logger.py +2 -2
- pipecat/processors/transcript_processor.py +26 -5
- pipecat/processors/user_idle_processor.py +35 -11
- pipecat/runner/daily.py +59 -20
- pipecat/runner/run.py +395 -93
- pipecat/runner/types.py +6 -4
- pipecat/runner/utils.py +51 -10
- pipecat/serializers/__init__.py +5 -1
- pipecat/serializers/asterisk.py +16 -2
- pipecat/serializers/convox.py +41 -4
- pipecat/serializers/custom.py +257 -0
- pipecat/serializers/exotel.py +5 -5
- pipecat/serializers/livekit.py +20 -0
- pipecat/serializers/plivo.py +5 -5
- pipecat/serializers/protobuf.py +6 -5
- pipecat/serializers/telnyx.py +2 -2
- pipecat/serializers/twilio.py +43 -23
- pipecat/serializers/vi.py +324 -0
- pipecat/services/ai_service.py +2 -6
- pipecat/services/anthropic/llm.py +2 -25
- pipecat/services/assemblyai/models.py +6 -0
- pipecat/services/assemblyai/stt.py +13 -5
- pipecat/services/asyncai/tts.py +5 -3
- pipecat/services/aws/__init__.py +1 -0
- pipecat/services/aws/llm.py +147 -105
- pipecat/services/aws/nova_sonic/__init__.py +0 -0
- pipecat/services/aws/nova_sonic/context.py +436 -0
- pipecat/services/aws/nova_sonic/frames.py +25 -0
- pipecat/services/aws/nova_sonic/llm.py +1265 -0
- pipecat/services/aws/stt.py +3 -3
- pipecat/services/aws_nova_sonic/__init__.py +19 -1
- pipecat/services/aws_nova_sonic/aws.py +11 -1151
- pipecat/services/aws_nova_sonic/context.py +8 -354
- pipecat/services/aws_nova_sonic/frames.py +13 -17
- pipecat/services/azure/llm.py +51 -1
- pipecat/services/azure/realtime/__init__.py +0 -0
- pipecat/services/azure/realtime/llm.py +65 -0
- pipecat/services/azure/stt.py +15 -0
- pipecat/services/cartesia/stt.py +77 -70
- pipecat/services/cartesia/tts.py +80 -13
- pipecat/services/deepgram/__init__.py +1 -0
- pipecat/services/deepgram/flux/__init__.py +0 -0
- pipecat/services/deepgram/flux/stt.py +640 -0
- pipecat/services/elevenlabs/__init__.py +4 -1
- pipecat/services/elevenlabs/stt.py +339 -0
- pipecat/services/elevenlabs/tts.py +87 -46
- pipecat/services/fish/tts.py +5 -2
- pipecat/services/gemini_multimodal_live/events.py +38 -524
- pipecat/services/gemini_multimodal_live/file_api.py +23 -173
- pipecat/services/gemini_multimodal_live/gemini.py +41 -1403
- pipecat/services/gladia/stt.py +56 -72
- pipecat/services/google/__init__.py +1 -0
- pipecat/services/google/gemini_live/__init__.py +3 -0
- pipecat/services/google/gemini_live/file_api.py +189 -0
- pipecat/services/google/gemini_live/llm.py +1582 -0
- pipecat/services/google/gemini_live/llm_vertex.py +184 -0
- pipecat/services/google/llm.py +15 -11
- pipecat/services/google/llm_openai.py +3 -3
- pipecat/services/google/llm_vertex.py +86 -16
- pipecat/services/google/stt.py +4 -0
- pipecat/services/google/tts.py +7 -3
- pipecat/services/heygen/api.py +2 -0
- pipecat/services/heygen/client.py +8 -4
- pipecat/services/heygen/video.py +2 -0
- pipecat/services/hume/__init__.py +5 -0
- pipecat/services/hume/tts.py +220 -0
- pipecat/services/inworld/tts.py +6 -6
- pipecat/services/llm_service.py +15 -5
- pipecat/services/lmnt/tts.py +4 -2
- pipecat/services/mcp_service.py +4 -2
- pipecat/services/mem0/memory.py +6 -5
- pipecat/services/mistral/llm.py +29 -8
- pipecat/services/moondream/vision.py +42 -16
- pipecat/services/neuphonic/tts.py +5 -2
- pipecat/services/openai/__init__.py +1 -0
- pipecat/services/openai/base_llm.py +27 -20
- pipecat/services/openai/realtime/__init__.py +0 -0
- pipecat/services/openai/realtime/context.py +272 -0
- pipecat/services/openai/realtime/events.py +1106 -0
- pipecat/services/openai/realtime/frames.py +37 -0
- pipecat/services/openai/realtime/llm.py +829 -0
- pipecat/services/openai/tts.py +49 -10
- pipecat/services/openai_realtime/__init__.py +27 -0
- pipecat/services/openai_realtime/azure.py +21 -0
- pipecat/services/openai_realtime/context.py +21 -0
- pipecat/services/openai_realtime/events.py +21 -0
- pipecat/services/openai_realtime/frames.py +21 -0
- pipecat/services/openai_realtime_beta/azure.py +16 -0
- pipecat/services/openai_realtime_beta/openai.py +17 -5
- pipecat/services/piper/tts.py +7 -9
- pipecat/services/playht/tts.py +34 -4
- pipecat/services/rime/tts.py +12 -12
- pipecat/services/riva/stt.py +3 -1
- pipecat/services/salesforce/__init__.py +9 -0
- pipecat/services/salesforce/llm.py +700 -0
- pipecat/services/sarvam/__init__.py +7 -0
- pipecat/services/sarvam/stt.py +540 -0
- pipecat/services/sarvam/tts.py +97 -13
- pipecat/services/simli/video.py +2 -2
- pipecat/services/speechmatics/stt.py +22 -10
- pipecat/services/stt_service.py +47 -0
- pipecat/services/tavus/video.py +2 -2
- pipecat/services/tts_service.py +75 -22
- pipecat/services/vision_service.py +7 -6
- pipecat/services/vistaar/llm.py +51 -9
- pipecat/tests/utils.py +4 -4
- pipecat/transcriptions/language.py +41 -1
- pipecat/transports/base_input.py +13 -34
- pipecat/transports/base_output.py +140 -104
- pipecat/transports/daily/transport.py +199 -26
- pipecat/transports/heygen/__init__.py +0 -0
- pipecat/transports/heygen/transport.py +381 -0
- pipecat/transports/livekit/transport.py +228 -63
- pipecat/transports/local/audio.py +6 -1
- pipecat/transports/local/tk.py +11 -2
- pipecat/transports/network/fastapi_websocket.py +1 -1
- pipecat/transports/smallwebrtc/connection.py +103 -19
- pipecat/transports/smallwebrtc/request_handler.py +246 -0
- pipecat/transports/smallwebrtc/transport.py +65 -23
- pipecat/transports/tavus/transport.py +23 -12
- pipecat/transports/websocket/client.py +41 -5
- pipecat/transports/websocket/fastapi.py +21 -11
- pipecat/transports/websocket/server.py +14 -7
- pipecat/transports/whatsapp/api.py +8 -0
- pipecat/transports/whatsapp/client.py +47 -0
- pipecat/utils/base_object.py +54 -22
- pipecat/utils/redis.py +58 -0
- pipecat/utils/string.py +13 -1
- pipecat/utils/tracing/service_decorators.py +21 -21
- pipecat/serializers/genesys.py +0 -95
- pipecat/services/google/test-google-chirp.py +0 -45
- pipecat/services/openai.py +0 -698
- {dv_pipecat_ai-0.0.82.dev857.dist-info → dv_pipecat_ai-0.0.85.dev837.dist-info}/WHEEL +0 -0
- {dv_pipecat_ai-0.0.82.dev857.dist-info → dv_pipecat_ai-0.0.85.dev837.dist-info}/licenses/LICENSE +0 -0
- {dv_pipecat_ai-0.0.82.dev857.dist-info → dv_pipecat_ai-0.0.85.dev837.dist-info}/top_level.txt +0 -0
- /pipecat/services/{aws_nova_sonic → aws/nova_sonic}/ready.wav +0 -0
|
@@ -0,0 +1,1106 @@
|
|
|
1
|
+
#
|
|
2
|
+
# Copyright (c) 2024–2025, Daily
|
|
3
|
+
#
|
|
4
|
+
# SPDX-License-Identifier: BSD 2-Clause License
|
|
5
|
+
#
|
|
6
|
+
|
|
7
|
+
"""Event models and data structures for OpenAI Realtime API communication."""
|
|
8
|
+
|
|
9
|
+
import json
|
|
10
|
+
import uuid
|
|
11
|
+
from typing import Any, Dict, List, Literal, Optional, Union
|
|
12
|
+
|
|
13
|
+
from pydantic import BaseModel, ConfigDict, Field
|
|
14
|
+
|
|
15
|
+
#
|
|
16
|
+
# session properties
|
|
17
|
+
#
|
|
18
|
+
|
|
19
|
+
|
|
20
|
+
class AudioFormat(BaseModel):
    """Base class for audio format configuration.

    Parameters:
        type: MIME-style identifier of the audio format (e.g. "audio/pcm").
    """

    type: str
|
|
24
|
+
|
|
25
|
+
|
|
26
|
+
class PCMAudioFormat(AudioFormat):
    """PCM audio format configuration.

    Parameters:
        type: Audio format type, always "audio/pcm".
        rate: Sample rate in Hz, always 24000 for PCM.
    """

    type: Literal["audio/pcm"] = "audio/pcm"
    rate: Literal[24000] = 24000
|
|
36
|
+
|
|
37
|
+
|
|
38
|
+
class PCMUAudioFormat(AudioFormat):
    """PCMU (G.711 μ-law) audio format configuration.

    Parameters:
        type: Audio format type, always "audio/pcmu".
    """

    type: Literal["audio/pcmu"] = "audio/pcmu"
|
|
46
|
+
|
|
47
|
+
|
|
48
|
+
class PCMAAudioFormat(AudioFormat):
    """PCMA (G.711 A-law) audio format configuration.

    Parameters:
        type: Audio format type, always "audio/pcma".
    """

    type: Literal["audio/pcma"] = "audio/pcma"
|
|
56
|
+
|
|
57
|
+
|
|
58
|
+
class InputAudioTranscription(BaseModel):
    """Configuration for input audio transcription settings.

    Parameters:
        model: Transcription model to use (e.g., "gpt-4o-transcribe", "whisper-1").
        language: Optional language code for transcription.
        prompt: Optional transcription hint text.
    """

    model: str = "gpt-4o-transcribe"
    language: Optional[str]
    prompt: Optional[str]

    def __init__(
        self,
        model: Optional[str] = "gpt-4o-transcribe",
        language: Optional[str] = None,
        prompt: Optional[str] = None,
    ):
        """Initialize the transcription configuration.

        Args:
            model: Transcription model to use (e.g., "gpt-4o-transcribe", "whisper-1").
            language: Optional language code for transcription.
            prompt: Optional transcription hint text.
        """
        # Collect the field values and delegate validation to pydantic.
        values = {"model": model, "language": language, "prompt": prompt}
        super().__init__(**values)
|
|
79
|
+
|
|
80
|
+
|
|
81
|
+
class TurnDetection(BaseModel):
    """Server-side voice activity detection configuration.

    Parameters:
        type: Detection type, must be "server_vad".
        threshold: Voice activity detection threshold (0.0-1.0). Defaults to 0.5.
        prefix_padding_ms: Padding before speech starts in milliseconds. Defaults to 300.
        silence_duration_ms: Silence duration to detect speech end in milliseconds. Defaults to 500.
    """

    # All fields are Optional so a partial update can omit any of them.
    type: Optional[Literal["server_vad"]] = "server_vad"
    threshold: Optional[float] = 0.5
    prefix_padding_ms: Optional[int] = 300
    silence_duration_ms: Optional[int] = 500
|
|
95
|
+
|
|
96
|
+
|
|
97
|
+
class SemanticTurnDetection(BaseModel):
    """Semantic-based turn detection configuration.

    Parameters:
        type: Detection type, must be "semantic_vad".
        eagerness: Turn detection eagerness level. Can be "low", "medium", "high", or "auto".
        create_response: Whether to automatically create responses on turn detection.
        interrupt_response: Whether to interrupt ongoing responses on turn detection.
    """

    type: Optional[Literal["semantic_vad"]] = "semantic_vad"
    # None means "leave unset" so the server applies its own default.
    eagerness: Optional[Literal["low", "medium", "high", "auto"]] = None
    create_response: Optional[bool] = None
    interrupt_response: Optional[bool] = None
|
|
111
|
+
|
|
112
|
+
|
|
113
|
+
class InputAudioNoiseReduction(BaseModel):
    """Input audio noise reduction configuration.

    Parameters:
        type: Noise reduction type for different microphone scenarios
            ("near_field" or "far_field"), or None.
    """

    # NOTE(review): no default, so `type` must be supplied explicitly (but may
    # be None) — confirm this is intentional rather than `= None`.
    type: Optional[Literal["near_field", "far_field"]]
|
|
121
|
+
|
|
122
|
+
|
|
123
|
+
class AudioInput(BaseModel):
    """Audio input configuration.

    Parameters:
        format: The format of the input audio.
        transcription: Configuration for input audio transcription.
        noise_reduction: Configuration for input audio noise reduction.
        turn_detection: Configuration for turn detection, or False to disable.
    """

    format: Optional[Union[PCMAudioFormat, PCMUAudioFormat, PCMAAudioFormat]] = None
    transcription: Optional[InputAudioTranscription] = None
    noise_reduction: Optional[InputAudioNoiseReduction] = None
    # False is rewritten to null at serialization time (see SessionUpdateEvent).
    turn_detection: Optional[Union[TurnDetection, SemanticTurnDetection, bool]] = None
|
|
137
|
+
|
|
138
|
+
|
|
139
|
+
class AudioOutput(BaseModel):
    """Audio output configuration.

    Parameters:
        format: The format of the output audio.
        voice: The voice the model uses to respond.
        speed: The speed of the model's spoken response.
    """

    format: Optional[Union[PCMAudioFormat, PCMUAudioFormat, PCMAAudioFormat]] = None
    voice: Optional[str] = None
    speed: Optional[float] = None
|
|
151
|
+
|
|
152
|
+
|
|
153
|
+
class AudioConfiguration(BaseModel):
    """Audio configuration for input and output.

    Parameters:
        input: Configuration for input audio.
        output: Configuration for output audio.
    """

    input: Optional[AudioInput] = None
    output: Optional[AudioOutput] = None
|
|
163
|
+
|
|
164
|
+
|
|
165
|
+
class SessionProperties(BaseModel):
    """Configuration properties for an OpenAI Realtime session.

    Every field is Optional so the same model can express both a full
    session snapshot (server events) and a partial update (session.update).

    Parameters:
        type: The type of session, always "realtime".
        object: Object type identifier, always "realtime.session".
        id: Unique identifier for the session.
        model: The Realtime model used for this session.
        output_modalities: The set of modalities the model can respond with.
        instructions: System instructions for the assistant.
        audio: Configuration for input and output audio.
        tools: Available function tools for the assistant.
        tool_choice: Tool usage strategy ("auto", "none", or "required").
        max_output_tokens: Maximum tokens in response or "inf" for unlimited.
        tracing: Configuration options for tracing.
        prompt: Reference to a prompt template and its variables.
        expires_at: Session expiration timestamp.
        include: Additional fields to include in server outputs.
    """

    type: Optional[Literal["realtime"]] = "realtime"
    object: Optional[Literal["realtime.session"]] = None
    id: Optional[str] = None
    model: Optional[str] = None
    output_modalities: Optional[List[Literal["text", "audio"]]] = None
    instructions: Optional[str] = None
    audio: Optional[AudioConfiguration] = None
    tools: Optional[List[Dict]] = None
    tool_choice: Optional[Literal["auto", "none", "required"]] = None
    max_output_tokens: Optional[Union[int, Literal["inf"]]] = None
    tracing: Optional[Union[Literal["auto"], Dict]] = None
    prompt: Optional[Dict] = None
    expires_at: Optional[int] = None
    include: Optional[List[str]] = None
|
|
199
|
+
|
|
200
|
+
|
|
201
|
+
#
|
|
202
|
+
# context
|
|
203
|
+
#
|
|
204
|
+
|
|
205
|
+
|
|
206
|
+
class ItemContent(BaseModel):
    """Content within a conversation item.

    Parameters:
        type: Content type (text, audio, input_text, input_audio, output_text, or output_audio).
        text: Text content for text-based items.
        audio: Base64-encoded audio data for audio items.
        transcript: Transcribed text for audio items.
    """

    type: Literal["text", "audio", "input_text", "input_audio", "output_text", "output_audio"]
    text: Optional[str] = None
    audio: Optional[str] = None  # base64-encoded audio
    transcript: Optional[str] = None
|
|
220
|
+
|
|
221
|
+
|
|
222
|
+
class ConversationItem(BaseModel):
    """A conversation item in the realtime session.

    Parameters:
        id: Unique identifier for the item, auto-generated if not provided.
        object: Object type identifier for the realtime API.
        type: Item type (message, function_call, or function_call_output).
        status: Current status of the item.
        role: Speaker role for message items (user, assistant, or system).
        content: Content list for message items.
        call_id: Function call identifier for function_call items.
        name: Function name for function_call items.
        arguments: Function arguments as JSON string for function_call items.
        output: Function output as JSON string for function_call_output items.
    """

    # default_factory gives each item a fresh hex UUID when the caller omits id
    id: str = Field(default_factory=lambda: str(uuid.uuid4().hex))
    object: Optional[Literal["realtime.item"]] = None
    type: Literal["message", "function_call", "function_call_output"]
    status: Optional[Literal["completed", "in_progress", "incomplete"]] = None
    # role and content are present for message items
    role: Optional[Literal["user", "assistant", "system"]] = None
    content: Optional[List[ItemContent]] = None
    # these four fields are present for function_call items
    call_id: Optional[str] = None
    name: Optional[str] = None
    arguments: Optional[str] = None
    output: Optional[str] = None
|
|
250
|
+
|
|
251
|
+
|
|
252
|
+
class RealtimeConversation(BaseModel):
    """A realtime conversation session.

    Parameters:
        id: Unique identifier for the conversation.
        object: Object type identifier, always "realtime.conversation".
    """

    id: str
    object: Literal["realtime.conversation"]
|
|
262
|
+
|
|
263
|
+
|
|
264
|
+
class ResponseProperties(BaseModel):
    """Properties for configuring assistant responses.

    Parameters:
        output_modalities: Output modalities for the response. Must be either
            ["text"] or ["audio"]. Defaults to ["audio"].
        instructions: Specific instructions for this response.
        audio: Audio configuration for this response.
        tools: Available tools for this response.
        tool_choice: Tool usage strategy for this response.
        temperature: Sampling temperature for this response.
        max_output_tokens: Maximum tokens for this response.
    """

    # Mutable default is safe here: pydantic deep-copies field defaults per instance.
    output_modalities: Optional[List[Literal["text", "audio"]]] = ["audio"]
    instructions: Optional[str] = None
    audio: Optional[AudioConfiguration] = None
    tools: Optional[List[Dict]] = None
    tool_choice: Optional[Literal["auto", "none", "required"]] = None
    temperature: Optional[float] = None
    max_output_tokens: Optional[Union[int, Literal["inf"]]] = None
|
|
284
|
+
|
|
285
|
+
|
|
286
|
+
#
|
|
287
|
+
# error class
|
|
288
|
+
#
|
|
289
|
+
class RealtimeError(BaseModel):
    """Error information from the realtime API.

    Parameters:
        type: Error type identifier.
        code: Specific error code.
        message: Human-readable error message.
        param: Parameter name that caused the error, if applicable.
        event_id: Event ID associated with the error, if applicable.
    """

    type: str
    # NOTE(review): default is "" rather than None, unlike the other Optional
    # fields — confirm whether downstream code relies on the empty string.
    code: Optional[str] = ""
    message: str
    param: Optional[str] = None
    event_id: Optional[str] = None
|
|
305
|
+
|
|
306
|
+
|
|
307
|
+
#
|
|
308
|
+
# client events
|
|
309
|
+
#
|
|
310
|
+
|
|
311
|
+
|
|
312
|
+
class ClientEvent(BaseModel):
    """Base class for client events sent to the realtime API.

    Parameters:
        event_id: Unique identifier for the event, auto-generated
            (random UUID4 string) if not provided.
    """

    event_id: str = Field(default_factory=lambda: str(uuid.uuid4()))
|
|
320
|
+
|
|
321
|
+
|
|
322
|
+
class SessionUpdateEvent(ClientEvent):
    """Event to update session properties.

    Parameters:
        type: Event type, always "session.update".
        session: Updated session properties.
    """

    type: Literal["session.update"] = "session.update"
    session: SessionProperties

    def model_dump(self, *args, **kwargs) -> Dict[str, Any]:
        """Serialize the event to a dictionary.

        Handles special serialization for turn_detection where False becomes
        null, since the wire protocol disables turn detection with null.

        Args:
            *args: Positional arguments passed to parent model_dump.
            **kwargs: Keyword arguments passed to parent model_dump.

        Returns:
            Dictionary representation of the event.
        """
        dump = super().model_dump(*args, **kwargs)

        # Walk session.audio.input with .get() so serialization options that
        # omit keys (e.g. include/exclude filters) cannot raise a KeyError.
        audio = (dump.get("session") or {}).get("audio") or {}
        audio_input = audio.get("input") or {}
        # Rewrite an explicit False to None; an absent key is left untouched.
        if audio_input.get("turn_detection") is False:
            audio_input["turn_detection"] = None

        return dump
|
|
355
|
+
|
|
356
|
+
|
|
357
|
+
class InputAudioBufferAppendEvent(ClientEvent):
    """Event to append audio data to the input buffer.

    Parameters:
        type: Event type, always "input_audio_buffer.append".
        audio: Base64-encoded audio data to append.
    """

    type: Literal["input_audio_buffer.append"] = "input_audio_buffer.append"
    audio: str  # base64-encoded audio
|
|
367
|
+
|
|
368
|
+
|
|
369
|
+
class InputAudioBufferCommitEvent(ClientEvent):
    """Event to commit the current input audio buffer.

    Parameters:
        type: Event type, always "input_audio_buffer.commit".
    """

    type: Literal["input_audio_buffer.commit"] = "input_audio_buffer.commit"
|
|
377
|
+
|
|
378
|
+
|
|
379
|
+
class InputAudioBufferClearEvent(ClientEvent):
    """Event to clear the input audio buffer.

    Parameters:
        type: Event type, always "input_audio_buffer.clear".
    """

    type: Literal["input_audio_buffer.clear"] = "input_audio_buffer.clear"
|
|
387
|
+
|
|
388
|
+
|
|
389
|
+
class ConversationItemCreateEvent(ClientEvent):
    """Event to create a new conversation item.

    Parameters:
        type: Event type, always "conversation.item.create".
        previous_item_id: ID of the item to insert after, if any.
        item: The conversation item to create.
    """

    type: Literal["conversation.item.create"] = "conversation.item.create"
    previous_item_id: Optional[str] = None
    item: ConversationItem
|
|
401
|
+
|
|
402
|
+
|
|
403
|
+
class ConversationItemTruncateEvent(ClientEvent):
    """Event to truncate a conversation item's audio content.

    Parameters:
        type: Event type, always "conversation.item.truncate".
        item_id: ID of the item to truncate.
        content_index: Index of the content to truncate within the item.
        audio_end_ms: End time in milliseconds for the truncated audio.
    """

    type: Literal["conversation.item.truncate"] = "conversation.item.truncate"
    item_id: str
    content_index: int
    audio_end_ms: int
|
|
417
|
+
|
|
418
|
+
|
|
419
|
+
class ConversationItemDeleteEvent(ClientEvent):
    """Event to delete a conversation item.

    Parameters:
        type: Event type, always "conversation.item.delete".
        item_id: ID of the item to delete.
    """

    type: Literal["conversation.item.delete"] = "conversation.item.delete"
    item_id: str
|
|
429
|
+
|
|
430
|
+
|
|
431
|
+
class ConversationItemRetrieveEvent(ClientEvent):
    """Event to retrieve a conversation item by ID.

    Parameters:
        type: Event type, always "conversation.item.retrieve".
        item_id: ID of the item to retrieve.
    """

    type: Literal["conversation.item.retrieve"] = "conversation.item.retrieve"
    item_id: str
|
|
441
|
+
|
|
442
|
+
|
|
443
|
+
class ResponseCreateEvent(ClientEvent):
    """Event to create a new assistant response.

    Parameters:
        type: Event type, always "response.create".
        response: Optional response configuration properties.
    """

    type: Literal["response.create"] = "response.create"
    response: Optional[ResponseProperties] = None
|
|
453
|
+
|
|
454
|
+
|
|
455
|
+
class ResponseCancelEvent(ClientEvent):
    """Event to cancel the current assistant response.

    Parameters:
        type: Event type, always "response.cancel".
    """

    type: Literal["response.cancel"] = "response.cancel"
|
|
463
|
+
|
|
464
|
+
|
|
465
|
+
#
|
|
466
|
+
# server events
|
|
467
|
+
#
|
|
468
|
+
|
|
469
|
+
|
|
470
|
+
class ServerEvent(BaseModel):
    """Base class for server events received from the realtime API.

    Parameters:
        event_id: Unique identifier for the event.
        type: Type of the server event.
    """

    # Subclasses may carry non-pydantic field types; allow them.
    model_config = ConfigDict(arbitrary_types_allowed=True)

    event_id: str
    type: str
|
|
482
|
+
|
|
483
|
+
|
|
484
|
+
class SessionCreatedEvent(ServerEvent):
    """Event indicating a session has been created.

    Parameters:
        type: Event type, always "session.created".
        session: The created session properties.
    """

    type: Literal["session.created"]
    session: SessionProperties
|
|
494
|
+
|
|
495
|
+
|
|
496
|
+
class SessionUpdatedEvent(ServerEvent):
    """Event indicating a session has been updated.

    Parameters:
        type: Event type, always "session.updated".
        session: The updated session properties.
    """

    type: Literal["session.updated"]
    session: SessionProperties
|
|
506
|
+
|
|
507
|
+
|
|
508
|
+
class ConversationCreated(ServerEvent):
    """Event indicating a conversation has been created.

    Parameters:
        type: Event type, always "conversation.created".
        conversation: The created conversation.
    """

    type: Literal["conversation.created"]
    conversation: RealtimeConversation
|
|
518
|
+
|
|
519
|
+
|
|
520
|
+
class ConversationItemAdded(ServerEvent):
    """Event indicating a conversation item has been added.

    Parameters:
        type: Event type, always "conversation.item.added".
        previous_item_id: ID of the previous item, if any.
        item: The added conversation item.
    """

    type: Literal["conversation.item.added"]
    previous_item_id: Optional[str] = None
    item: ConversationItem
|
|
532
|
+
|
|
533
|
+
|
|
534
|
+
class ConversationItemDone(ServerEvent):
    """Event indicating a conversation item is done processing.

    Parameters:
        type: Event type, always "conversation.item.done".
        previous_item_id: ID of the previous item, if any.
        item: The completed conversation item.
    """

    type: Literal["conversation.item.done"]
    previous_item_id: Optional[str] = None
    item: ConversationItem
|
|
546
|
+
|
|
547
|
+
|
|
548
|
+
class ConversationItemInputAudioTranscriptionDelta(ServerEvent):
    """Event containing incremental input audio transcription.

    Parameters:
        type: Event type, always "conversation.item.input_audio_transcription.delta".
        item_id: ID of the conversation item being transcribed.
        content_index: Index of the content within the item.
        delta: Incremental transcription text.
    """

    type: Literal["conversation.item.input_audio_transcription.delta"]
    item_id: str
    content_index: int
    delta: str
|
|
562
|
+
|
|
563
|
+
|
|
564
|
+
class ConversationItemInputAudioTranscriptionCompleted(ServerEvent):
    """Event indicating input audio transcription is complete.

    Parameters:
        type: Event type, always "conversation.item.input_audio_transcription.completed".
        item_id: ID of the conversation item that was transcribed.
        content_index: Index of the content within the item.
        transcript: Complete transcription text.
    """

    type: Literal["conversation.item.input_audio_transcription.completed"]
    item_id: str
    content_index: int
    transcript: str  # full final transcript (not a delta)
|
|
578
|
+
|
|
579
|
+
|
|
580
|
+
class ConversationItemInputAudioTranscriptionFailed(ServerEvent):
    """Event indicating input audio transcription failed.

    Parameters:
        type: Event type, always "conversation.item.input_audio_transcription.failed".
        item_id: ID of the conversation item that failed transcription.
        content_index: Index of the content within the item.
        error: Error details for the transcription failure.
    """

    type: Literal["conversation.item.input_audio_transcription.failed"]
    item_id: str
    content_index: int
    error: RealtimeError
|
|
594
|
+
|
|
595
|
+
|
|
596
|
+
class ConversationItemTruncated(ServerEvent):
    """Event indicating a conversation item has been truncated.

    Parameters:
        type: Event type, always "conversation.item.truncated".
        item_id: ID of the truncated conversation item.
        content_index: Index of the content within the item.
        audio_end_ms: End time in milliseconds for the truncated audio.
    """

    type: Literal["conversation.item.truncated"]
    item_id: str
    content_index: int
    audio_end_ms: int  # audio past this offset was discarded
|
|
610
|
+
|
|
611
|
+
|
|
612
|
+
class ConversationItemDeleted(ServerEvent):
    """Event indicating a conversation item has been deleted.

    Parameters:
        type: Event type, always "conversation.item.deleted".
        item_id: ID of the deleted conversation item.
    """

    type: Literal["conversation.item.deleted"]
    item_id: str
|
|
622
|
+
|
|
623
|
+
|
|
624
|
+
class ConversationItemRetrieved(ServerEvent):
    """Event containing a retrieved conversation item.

    Parameters:
        type: Event type, always "conversation.item.retrieved".
        item: The retrieved conversation item.
    """

    type: Literal["conversation.item.retrieved"]
    item: ConversationItem
|
|
634
|
+
|
|
635
|
+
|
|
636
|
+
class ResponseCreated(ServerEvent):
    """Event indicating an assistant response has been created.

    Parameters:
        type: Event type, always "response.created".
        response: The created response object.
    """

    type: Literal["response.created"]
    # Forward reference: the Response model is defined later in this module.
    response: "Response"
|
|
646
|
+
|
|
647
|
+
|
|
648
|
+
class ResponseDone(ServerEvent):
    """Event indicating an assistant response is complete.

    Parameters:
        type: Event type, always "response.done".
        response: The completed response object.
    """

    type: Literal["response.done"]
    # Forward reference: the Response model is defined later in this module.
    response: "Response"
|
|
658
|
+
|
|
659
|
+
|
|
660
|
+
class ResponseOutputItemAdded(ServerEvent):
    """Event indicating an output item has been added to a response.

    Parameters:
        type: Event type, always "response.output_item.added".
        response_id: ID of the response.
        output_index: Index of the output item.
        item: The added conversation item.
    """

    type: Literal["response.output_item.added"]
    response_id: str
    output_index: int
    item: ConversationItem
|
|
674
|
+
|
|
675
|
+
|
|
676
|
+
class ResponseOutputItemDone(ServerEvent):
    """Event indicating an output item is complete.

    Parameters:
        type: Event type, always "response.output_item.done".
        response_id: ID of the response.
        output_index: Index of the output item.
        item: The completed conversation item.
    """

    type: Literal["response.output_item.done"]
    response_id: str
    output_index: int
    item: ConversationItem
|
|
690
|
+
|
|
691
|
+
|
|
692
|
+
class ResponseContentPartAdded(ServerEvent):
    """Event indicating a content part has been added to a response.

    Parameters:
        type: Event type, always "response.content_part.added".
        response_id: ID of the response.
        item_id: ID of the conversation item.
        output_index: Index of the output item.
        content_index: Index of the content part.
        part: The added content part.
    """

    type: Literal["response.content_part.added"]
    response_id: str
    item_id: str
    output_index: int
    content_index: int
    part: ItemContent
|
|
710
|
+
|
|
711
|
+
|
|
712
|
+
class ResponseContentPartDone(ServerEvent):
    """Event indicating a content part is complete.

    Parameters:
        type: Event type, always "response.content_part.done".
        response_id: ID of the response.
        item_id: ID of the conversation item.
        output_index: Index of the output item.
        content_index: Index of the content part.
        part: The completed content part.
    """

    type: Literal["response.content_part.done"]
    response_id: str
    item_id: str
    output_index: int
    content_index: int
    part: ItemContent
|
|
730
|
+
|
|
731
|
+
|
|
732
|
+
class ResponseTextDelta(ServerEvent):
    """Event containing incremental text from a response.

    Parameters:
        type: Event type, always "response.output_text.delta".
        response_id: ID of the response.
        item_id: ID of the conversation item.
        output_index: Index of the output item.
        content_index: Index of the content part.
        delta: Incremental text content.
    """

    type: Literal["response.output_text.delta"]
    response_id: str
    item_id: str
    output_index: int
    content_index: int
    delta: str  # partial text fragment; concatenate deltas to build the text
|
|
750
|
+
|
|
751
|
+
|
|
752
|
+
class ResponseTextDone(ServerEvent):
    """Event indicating text content is complete.

    Parameters:
        type: Event type, always "response.output_text.done".
        response_id: ID of the response.
        item_id: ID of the conversation item.
        output_index: Index of the output item.
        content_index: Index of the content part.
        text: Complete text content.
    """

    type: Literal["response.output_text.done"]
    response_id: str
    item_id: str
    output_index: int
    content_index: int
    text: str  # full final text (not a delta)
|
|
770
|
+
|
|
771
|
+
|
|
772
|
+
class ResponseAudioTranscriptDelta(ServerEvent):
    """Event containing incremental audio transcript from a response.

    Parameters:
        type: Event type, always "response.output_audio_transcript.delta".
        response_id: ID of the response.
        item_id: ID of the conversation item.
        output_index: Index of the output item.
        content_index: Index of the content part.
        delta: Incremental transcript text.
    """

    type: Literal["response.output_audio_transcript.delta"]
    response_id: str
    item_id: str
    output_index: int
    content_index: int
    delta: str  # partial transcript fragment of the model's spoken output
|
|
790
|
+
|
|
791
|
+
|
|
792
|
+
class ResponseAudioTranscriptDone(ServerEvent):
    """Event indicating audio transcript is complete.

    Parameters:
        type: Event type, always "response.output_audio_transcript.done".
        response_id: ID of the response.
        item_id: ID of the conversation item.
        output_index: Index of the output item.
        content_index: Index of the content part.
        transcript: Complete transcript text.
    """

    type: Literal["response.output_audio_transcript.done"]
    response_id: str
    item_id: str
    output_index: int
    content_index: int
    transcript: str  # full final transcript (not a delta)
|
|
810
|
+
|
|
811
|
+
|
|
812
|
+
class ResponseAudioDelta(ServerEvent):
    """Event containing incremental audio data from a response.

    Parameters:
        type: Event type, always "response.output_audio.delta".
        response_id: ID of the response.
        item_id: ID of the conversation item.
        output_index: Index of the output item.
        content_index: Index of the content part.
        delta: Base64-encoded incremental audio data.
    """

    type: Literal["response.output_audio.delta"]
    response_id: str
    item_id: str
    output_index: int
    content_index: int
    delta: str  # base64-encoded audio
|
|
830
|
+
|
|
831
|
+
|
|
832
|
+
class ResponseAudioDone(ServerEvent):
    """Event indicating audio content is complete.

    Parameters:
        type: Event type, always "response.output_audio.done".
        response_id: ID of the response.
        item_id: ID of the conversation item.
        output_index: Index of the output item.
        content_index: Index of the content part.
    """

    # No payload beyond addressing fields: the audio itself arrived via
    # ResponseAudioDelta events.
    type: Literal["response.output_audio.done"]
    response_id: str
    item_id: str
    output_index: int
    content_index: int
|
|
848
|
+
|
|
849
|
+
|
|
850
|
+
class ResponseFunctionCallArgumentsDelta(ServerEvent):
    """Event containing incremental function call arguments.

    Parameters:
        type: Event type, always "response.function_call_arguments.delta".
        response_id: ID of the response.
        item_id: ID of the conversation item.
        output_index: Index of the output item.
        call_id: ID of the function call.
        delta: Incremental function arguments as JSON.
    """

    type: Literal["response.function_call_arguments.delta"]
    response_id: str
    item_id: str
    output_index: int
    call_id: str
    delta: str  # JSON fragment; concatenated deltas form the full arguments object
|
|
868
|
+
|
|
869
|
+
|
|
870
|
+
class ResponseFunctionCallArgumentsDone(ServerEvent):
    """Event indicating function call arguments are complete.

    Parameters:
        type: Event type, always "response.function_call_arguments.done".
        response_id: ID of the response.
        item_id: ID of the conversation item.
        output_index: Index of the output item.
        call_id: ID of the function call.
        arguments: Complete function arguments as JSON string.
    """

    type: Literal["response.function_call_arguments.done"]
    response_id: str
    item_id: str
    output_index: int
    call_id: str
    arguments: str  # complete arguments, still JSON-encoded (caller must parse)
|
|
888
|
+
|
|
889
|
+
|
|
890
|
+
class InputAudioBufferSpeechStarted(ServerEvent):
    """Event indicating speech has started in the input audio buffer.

    Parameters:
        type: Event type, always "input_audio_buffer.speech_started".
        audio_start_ms: Start time of speech in milliseconds.
        item_id: ID of the associated conversation item.
    """

    type: Literal["input_audio_buffer.speech_started"]
    audio_start_ms: int
    item_id: str
|
|
902
|
+
|
|
903
|
+
|
|
904
|
+
class InputAudioBufferSpeechStopped(ServerEvent):
    """Event indicating speech has stopped in the input audio buffer.

    Parameters:
        type: Event type, always "input_audio_buffer.speech_stopped".
        audio_end_ms: End time of speech in milliseconds.
        item_id: ID of the associated conversation item.
    """

    type: Literal["input_audio_buffer.speech_stopped"]
    audio_end_ms: int
    item_id: str
|
|
916
|
+
|
|
917
|
+
|
|
918
|
+
class InputAudioBufferCommitted(ServerEvent):
    """Event indicating the input audio buffer has been committed.

    Parameters:
        type: Event type, always "input_audio_buffer.committed".
        previous_item_id: ID of the previous item, if any.
        item_id: ID of the committed conversation item.
    """

    type: Literal["input_audio_buffer.committed"]
    previous_item_id: Optional[str] = None  # None when this is the first item
    item_id: str
|
|
930
|
+
|
|
931
|
+
|
|
932
|
+
class InputAudioBufferCleared(ServerEvent):
    """Event indicating the input audio buffer has been cleared.

    Parameters:
        type: Event type, always "input_audio_buffer.cleared".
    """

    # Carries no payload beyond the discriminator.
    type: Literal["input_audio_buffer.cleared"]
|
|
940
|
+
|
|
941
|
+
|
|
942
|
+
class ErrorEvent(ServerEvent):
    """Event indicating an error occurred.

    Parameters:
        type: Event type, always "error".
        error: Error details.
    """

    type: Literal["error"]
    error: RealtimeError
|
|
952
|
+
|
|
953
|
+
|
|
954
|
+
class RateLimitsUpdated(ServerEvent):
    """Event indicating rate limits have been updated.

    Parameters:
        type: Event type, always "rate_limits.updated".
        rate_limits: List of rate limit information.
    """

    type: Literal["rate_limits.updated"]
    # Kept as loose dicts rather than a typed model; schema is server-defined.
    rate_limits: List[Dict[str, Any]]
|
|
964
|
+
|
|
965
|
+
|
|
966
|
+
class CachedTokensDetails(BaseModel):
    """Details about cached tokens.

    Parameters:
        text_tokens: Number of cached text tokens.
        audio_tokens: Number of cached audio tokens.
    """

    # Optional with a 0 default so partially-populated server payloads validate.
    text_tokens: Optional[int] = 0
    audio_tokens: Optional[int] = 0
|
|
976
|
+
|
|
977
|
+
|
|
978
|
+
class TokenDetails(BaseModel):
    """Detailed token usage information.

    Parameters:
        cached_tokens: Number of cached tokens used. Defaults to 0.
        text_tokens: Number of text tokens used. Defaults to 0.
        audio_tokens: Number of audio tokens used. Defaults to 0.
        cached_tokens_details: Detailed breakdown of cached tokens.
        image_tokens: Number of image tokens used (for input only).
    """

    cached_tokens: Optional[int] = 0
    text_tokens: Optional[int] = 0
    audio_tokens: Optional[int] = 0
    cached_tokens_details: Optional[CachedTokensDetails] = None
    image_tokens: Optional[int] = 0

    class Config:
        """Pydantic configuration for TokenDetails."""

        # Accept (and retain) unknown fields so new server-side counters
        # don't break validation.
        extra = "allow"
|
|
999
|
+
|
|
1000
|
+
|
|
1001
|
+
class Usage(BaseModel):
    """Token usage statistics for a response.

    Parameters:
        total_tokens: Total number of tokens used.
        input_tokens: Number of input tokens used.
        output_tokens: Number of output tokens used.
        input_token_details: Detailed breakdown of input token usage.
        output_token_details: Detailed breakdown of output token usage.
    """

    total_tokens: int
    input_tokens: int
    output_tokens: int
    input_token_details: TokenDetails
    output_token_details: TokenDetails
|
|
1017
|
+
|
|
1018
|
+
|
|
1019
|
+
class Response(BaseModel):
    """A complete assistant response.

    Parameters:
        id: Unique identifier for the response.
        object: Object type, always "realtime.response".
        status: Current status of the response.
        status_details: Additional status information.
        output: List of conversation items in the response.
        conversation_id: Which conversation the response is added to.
        output_modalities: The set of modalities the model used to respond.
        max_output_tokens: Maximum number of output tokens used.
        audio: Audio configuration for the response.
        usage: Token usage statistics for the response.
        voice: The voice the model used to respond.
        temperature: Sampling temperature used for the response.
        output_audio_format: The format of output audio.
    """

    id: str
    object: Literal["realtime.response"]
    status: Literal["completed", "in_progress", "incomplete", "cancelled", "failed"]
    # Loosely typed: shape varies by status (e.g. error vs. incomplete reason).
    status_details: Any
    output: List[ConversationItem]
    output_modalities: Optional[List[Literal["text", "audio"]]] = None
    # "inf" is the server's sentinel for "no limit".
    max_output_tokens: Optional[Union[int, Literal["inf"]]] = None
    audio: Optional[AudioConfiguration] = None
    usage: Optional[Usage] = None
    voice: Optional[str] = None
    temperature: Optional[float] = None
    output_audio_format: Optional[str] = None
|
|
1050
|
+
|
|
1051
|
+
|
|
1052
|
+
# Registry mapping each server event's wire `type` discriminator to the
# Pydantic model used to deserialize it; consumed by parse_server_event().
_server_event_types = {
    "error": ErrorEvent,
    "session.created": SessionCreatedEvent,
    "session.updated": SessionUpdatedEvent,
    "conversation.created": ConversationCreated,
    "input_audio_buffer.committed": InputAudioBufferCommitted,
    "input_audio_buffer.cleared": InputAudioBufferCleared,
    "input_audio_buffer.speech_started": InputAudioBufferSpeechStarted,
    "input_audio_buffer.speech_stopped": InputAudioBufferSpeechStopped,
    "conversation.item.added": ConversationItemAdded,
    "conversation.item.done": ConversationItemDone,
    "conversation.item.input_audio_transcription.delta": ConversationItemInputAudioTranscriptionDelta,
    "conversation.item.input_audio_transcription.completed": ConversationItemInputAudioTranscriptionCompleted,
    "conversation.item.input_audio_transcription.failed": ConversationItemInputAudioTranscriptionFailed,
    "conversation.item.truncated": ConversationItemTruncated,
    "conversation.item.deleted": ConversationItemDeleted,
    "conversation.item.retrieved": ConversationItemRetrieved,
    "response.created": ResponseCreated,
    "response.done": ResponseDone,
    "response.output_item.added": ResponseOutputItemAdded,
    "response.output_item.done": ResponseOutputItemDone,
    "response.content_part.added": ResponseContentPartAdded,
    "response.content_part.done": ResponseContentPartDone,
    "response.output_text.delta": ResponseTextDelta,
    "response.output_text.done": ResponseTextDone,
    "response.output_audio_transcript.delta": ResponseAudioTranscriptDelta,
    "response.output_audio_transcript.done": ResponseAudioTranscriptDone,
    "response.output_audio.delta": ResponseAudioDelta,
    "response.output_audio.done": ResponseAudioDone,
    "response.function_call_arguments.delta": ResponseFunctionCallArgumentsDelta,
    "response.function_call_arguments.done": ResponseFunctionCallArgumentsDone,
    "rate_limits.updated": RateLimitsUpdated,
}
|
|
1085
|
+
|
|
1086
|
+
|
|
1087
|
+
def parse_server_event(data):
    """Parse a server event from a JSON string.

    Args:
        data: JSON string containing the server event payload.

    Returns:
        Parsed server event object of the appropriate type, as selected by
        the event's "type" field via the _server_event_types registry.

    Raises:
        Exception: If the event type is unimplemented or parsing fails.
    """
    # NOTE: the parameter was previously named `str`, shadowing the builtin;
    # renamed to `data` (positional callers are unaffected).
    try:
        event = json.loads(data)
        event_type = event["type"]
        if event_type not in _server_event_types:
            raise Exception(f"Unimplemented server event type: {event_type}")
        return _server_event_types[event_type].model_validate(event)
    except Exception as e:
        # Chain the original exception (`from e`) so the root cause and its
        # traceback are preserved; message format kept for compatibility.
        raise Exception(f"{e} \n\n{data}") from e
|