dv-pipecat-ai 0.0.82.dev857__py3-none-any.whl → 0.0.85.dev837__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Potentially problematic release: this version of dv-pipecat-ai might be problematic.
- {dv_pipecat_ai-0.0.82.dev857.dist-info → dv_pipecat_ai-0.0.85.dev837.dist-info}/METADATA +98 -130
- {dv_pipecat_ai-0.0.82.dev857.dist-info → dv_pipecat_ai-0.0.85.dev837.dist-info}/RECORD +192 -140
- pipecat/adapters/base_llm_adapter.py +38 -1
- pipecat/adapters/services/anthropic_adapter.py +9 -14
- pipecat/adapters/services/aws_nova_sonic_adapter.py +120 -5
- pipecat/adapters/services/bedrock_adapter.py +236 -13
- pipecat/adapters/services/gemini_adapter.py +12 -8
- pipecat/adapters/services/open_ai_adapter.py +19 -7
- pipecat/adapters/services/open_ai_realtime_adapter.py +5 -0
- pipecat/audio/dtmf/dtmf-0.wav +0 -0
- pipecat/audio/dtmf/dtmf-1.wav +0 -0
- pipecat/audio/dtmf/dtmf-2.wav +0 -0
- pipecat/audio/dtmf/dtmf-3.wav +0 -0
- pipecat/audio/dtmf/dtmf-4.wav +0 -0
- pipecat/audio/dtmf/dtmf-5.wav +0 -0
- pipecat/audio/dtmf/dtmf-6.wav +0 -0
- pipecat/audio/dtmf/dtmf-7.wav +0 -0
- pipecat/audio/dtmf/dtmf-8.wav +0 -0
- pipecat/audio/dtmf/dtmf-9.wav +0 -0
- pipecat/audio/dtmf/dtmf-pound.wav +0 -0
- pipecat/audio/dtmf/dtmf-star.wav +0 -0
- pipecat/audio/filters/krisp_viva_filter.py +193 -0
- pipecat/audio/filters/noisereduce_filter.py +15 -0
- pipecat/audio/turn/base_turn_analyzer.py +9 -1
- pipecat/audio/turn/smart_turn/base_smart_turn.py +14 -8
- pipecat/audio/turn/smart_turn/data/__init__.py +0 -0
- pipecat/audio/turn/smart_turn/data/smart-turn-v3.0.onnx +0 -0
- pipecat/audio/turn/smart_turn/http_smart_turn.py +6 -2
- pipecat/audio/turn/smart_turn/local_smart_turn.py +1 -1
- pipecat/audio/turn/smart_turn/local_smart_turn_v2.py +1 -1
- pipecat/audio/turn/smart_turn/local_smart_turn_v3.py +124 -0
- pipecat/audio/vad/data/README.md +10 -0
- pipecat/audio/vad/data/silero_vad_v2.onnx +0 -0
- pipecat/audio/vad/silero.py +9 -3
- pipecat/audio/vad/vad_analyzer.py +13 -1
- pipecat/extensions/voicemail/voicemail_detector.py +5 -5
- pipecat/frames/frames.py +277 -86
- pipecat/observers/loggers/debug_log_observer.py +3 -3
- pipecat/observers/loggers/llm_log_observer.py +7 -3
- pipecat/observers/loggers/user_bot_latency_log_observer.py +22 -10
- pipecat/pipeline/runner.py +18 -6
- pipecat/pipeline/service_switcher.py +64 -36
- pipecat/pipeline/task.py +125 -79
- pipecat/pipeline/tts_switcher.py +30 -0
- pipecat/processors/aggregators/dtmf_aggregator.py +2 -3
- pipecat/processors/aggregators/{gated_openai_llm_context.py → gated_llm_context.py} +9 -9
- pipecat/processors/aggregators/gated_open_ai_llm_context.py +12 -0
- pipecat/processors/aggregators/llm_context.py +40 -2
- pipecat/processors/aggregators/llm_response.py +32 -15
- pipecat/processors/aggregators/llm_response_universal.py +19 -15
- pipecat/processors/aggregators/user_response.py +6 -6
- pipecat/processors/aggregators/vision_image_frame.py +24 -2
- pipecat/processors/audio/audio_buffer_processor.py +43 -8
- pipecat/processors/dtmf_aggregator.py +174 -77
- pipecat/processors/filters/stt_mute_filter.py +17 -0
- pipecat/processors/frame_processor.py +110 -24
- pipecat/processors/frameworks/langchain.py +8 -2
- pipecat/processors/frameworks/rtvi.py +210 -68
- pipecat/processors/frameworks/strands_agents.py +170 -0
- pipecat/processors/logger.py +2 -2
- pipecat/processors/transcript_processor.py +26 -5
- pipecat/processors/user_idle_processor.py +35 -11
- pipecat/runner/daily.py +59 -20
- pipecat/runner/run.py +395 -93
- pipecat/runner/types.py +6 -4
- pipecat/runner/utils.py +51 -10
- pipecat/serializers/__init__.py +5 -1
- pipecat/serializers/asterisk.py +16 -2
- pipecat/serializers/convox.py +41 -4
- pipecat/serializers/custom.py +257 -0
- pipecat/serializers/exotel.py +5 -5
- pipecat/serializers/livekit.py +20 -0
- pipecat/serializers/plivo.py +5 -5
- pipecat/serializers/protobuf.py +6 -5
- pipecat/serializers/telnyx.py +2 -2
- pipecat/serializers/twilio.py +43 -23
- pipecat/serializers/vi.py +324 -0
- pipecat/services/ai_service.py +2 -6
- pipecat/services/anthropic/llm.py +2 -25
- pipecat/services/assemblyai/models.py +6 -0
- pipecat/services/assemblyai/stt.py +13 -5
- pipecat/services/asyncai/tts.py +5 -3
- pipecat/services/aws/__init__.py +1 -0
- pipecat/services/aws/llm.py +147 -105
- pipecat/services/aws/nova_sonic/__init__.py +0 -0
- pipecat/services/aws/nova_sonic/context.py +436 -0
- pipecat/services/aws/nova_sonic/frames.py +25 -0
- pipecat/services/aws/nova_sonic/llm.py +1265 -0
- pipecat/services/aws/stt.py +3 -3
- pipecat/services/aws_nova_sonic/__init__.py +19 -1
- pipecat/services/aws_nova_sonic/aws.py +11 -1151
- pipecat/services/aws_nova_sonic/context.py +8 -354
- pipecat/services/aws_nova_sonic/frames.py +13 -17
- pipecat/services/azure/llm.py +51 -1
- pipecat/services/azure/realtime/__init__.py +0 -0
- pipecat/services/azure/realtime/llm.py +65 -0
- pipecat/services/azure/stt.py +15 -0
- pipecat/services/cartesia/stt.py +77 -70
- pipecat/services/cartesia/tts.py +80 -13
- pipecat/services/deepgram/__init__.py +1 -0
- pipecat/services/deepgram/flux/__init__.py +0 -0
- pipecat/services/deepgram/flux/stt.py +640 -0
- pipecat/services/elevenlabs/__init__.py +4 -1
- pipecat/services/elevenlabs/stt.py +339 -0
- pipecat/services/elevenlabs/tts.py +87 -46
- pipecat/services/fish/tts.py +5 -2
- pipecat/services/gemini_multimodal_live/events.py +38 -524
- pipecat/services/gemini_multimodal_live/file_api.py +23 -173
- pipecat/services/gemini_multimodal_live/gemini.py +41 -1403
- pipecat/services/gladia/stt.py +56 -72
- pipecat/services/google/__init__.py +1 -0
- pipecat/services/google/gemini_live/__init__.py +3 -0
- pipecat/services/google/gemini_live/file_api.py +189 -0
- pipecat/services/google/gemini_live/llm.py +1582 -0
- pipecat/services/google/gemini_live/llm_vertex.py +184 -0
- pipecat/services/google/llm.py +15 -11
- pipecat/services/google/llm_openai.py +3 -3
- pipecat/services/google/llm_vertex.py +86 -16
- pipecat/services/google/stt.py +4 -0
- pipecat/services/google/tts.py +7 -3
- pipecat/services/heygen/api.py +2 -0
- pipecat/services/heygen/client.py +8 -4
- pipecat/services/heygen/video.py +2 -0
- pipecat/services/hume/__init__.py +5 -0
- pipecat/services/hume/tts.py +220 -0
- pipecat/services/inworld/tts.py +6 -6
- pipecat/services/llm_service.py +15 -5
- pipecat/services/lmnt/tts.py +4 -2
- pipecat/services/mcp_service.py +4 -2
- pipecat/services/mem0/memory.py +6 -5
- pipecat/services/mistral/llm.py +29 -8
- pipecat/services/moondream/vision.py +42 -16
- pipecat/services/neuphonic/tts.py +5 -2
- pipecat/services/openai/__init__.py +1 -0
- pipecat/services/openai/base_llm.py +27 -20
- pipecat/services/openai/realtime/__init__.py +0 -0
- pipecat/services/openai/realtime/context.py +272 -0
- pipecat/services/openai/realtime/events.py +1106 -0
- pipecat/services/openai/realtime/frames.py +37 -0
- pipecat/services/openai/realtime/llm.py +829 -0
- pipecat/services/openai/tts.py +49 -10
- pipecat/services/openai_realtime/__init__.py +27 -0
- pipecat/services/openai_realtime/azure.py +21 -0
- pipecat/services/openai_realtime/context.py +21 -0
- pipecat/services/openai_realtime/events.py +21 -0
- pipecat/services/openai_realtime/frames.py +21 -0
- pipecat/services/openai_realtime_beta/azure.py +16 -0
- pipecat/services/openai_realtime_beta/openai.py +17 -5
- pipecat/services/piper/tts.py +7 -9
- pipecat/services/playht/tts.py +34 -4
- pipecat/services/rime/tts.py +12 -12
- pipecat/services/riva/stt.py +3 -1
- pipecat/services/salesforce/__init__.py +9 -0
- pipecat/services/salesforce/llm.py +700 -0
- pipecat/services/sarvam/__init__.py +7 -0
- pipecat/services/sarvam/stt.py +540 -0
- pipecat/services/sarvam/tts.py +97 -13
- pipecat/services/simli/video.py +2 -2
- pipecat/services/speechmatics/stt.py +22 -10
- pipecat/services/stt_service.py +47 -0
- pipecat/services/tavus/video.py +2 -2
- pipecat/services/tts_service.py +75 -22
- pipecat/services/vision_service.py +7 -6
- pipecat/services/vistaar/llm.py +51 -9
- pipecat/tests/utils.py +4 -4
- pipecat/transcriptions/language.py +41 -1
- pipecat/transports/base_input.py +13 -34
- pipecat/transports/base_output.py +140 -104
- pipecat/transports/daily/transport.py +199 -26
- pipecat/transports/heygen/__init__.py +0 -0
- pipecat/transports/heygen/transport.py +381 -0
- pipecat/transports/livekit/transport.py +228 -63
- pipecat/transports/local/audio.py +6 -1
- pipecat/transports/local/tk.py +11 -2
- pipecat/transports/network/fastapi_websocket.py +1 -1
- pipecat/transports/smallwebrtc/connection.py +103 -19
- pipecat/transports/smallwebrtc/request_handler.py +246 -0
- pipecat/transports/smallwebrtc/transport.py +65 -23
- pipecat/transports/tavus/transport.py +23 -12
- pipecat/transports/websocket/client.py +41 -5
- pipecat/transports/websocket/fastapi.py +21 -11
- pipecat/transports/websocket/server.py +14 -7
- pipecat/transports/whatsapp/api.py +8 -0
- pipecat/transports/whatsapp/client.py +47 -0
- pipecat/utils/base_object.py +54 -22
- pipecat/utils/redis.py +58 -0
- pipecat/utils/string.py +13 -1
- pipecat/utils/tracing/service_decorators.py +21 -21
- pipecat/serializers/genesys.py +0 -95
- pipecat/services/google/test-google-chirp.py +0 -45
- pipecat/services/openai.py +0 -698
- {dv_pipecat_ai-0.0.82.dev857.dist-info → dv_pipecat_ai-0.0.85.dev837.dist-info}/WHEEL +0 -0
- {dv_pipecat_ai-0.0.82.dev857.dist-info → dv_pipecat_ai-0.0.85.dev837.dist-info}/licenses/LICENSE +0 -0
- {dv_pipecat_ai-0.0.82.dev857.dist-info → dv_pipecat_ai-0.0.85.dev837.dist-info}/top_level.txt +0 -0
- /pipecat/services/{aws_nova_sonic → aws/nova_sonic}/ready.wav +0 -0
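The largest addition in this release is the new pipecat/services/openai/realtime/llm.py module (+829 lines), reproduced in the hunk below. The old monolithic pipecat/services/openai.py (+0 -698) is removed, and small pipecat/services/openai_realtime/* modules (+21 to +27 lines each) appear alongside the new package. A rough sketch of the relocated imports, assuming the new modules are importable at their file paths (this diff does not show whether the older openai_realtime_beta paths keep re-exporting the same names):

    # Hypothetical import sketch based only on the file paths listed above.
    from pipecat.services.openai.realtime.llm import OpenAIRealtimeLLMService
    from pipecat.services.openai.realtime import events  # SessionProperties and the client/server event models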
@@ -0,0 +1,829 @@
+#
+# Copyright (c) 2024–2025, Daily
+#
+# SPDX-License-Identifier: BSD 2-Clause License
+#
+
+"""OpenAI Realtime LLM service implementation with WebSocket support."""
+
+import base64
+import json
+import time
+from dataclasses import dataclass
+from typing import Optional
+
+from loguru import logger
+
+from pipecat.adapters.services.open_ai_realtime_adapter import OpenAIRealtimeLLMAdapter
+from pipecat.frames.frames import (
+    BotStoppedSpeakingFrame,
+    CancelFrame,
+    EndFrame,
+    ErrorFrame,
+    Frame,
+    InputAudioRawFrame,
+    InterimTranscriptionFrame,
+    InterruptionFrame,
+    LLMContextFrame,
+    LLMFullResponseEndFrame,
+    LLMFullResponseStartFrame,
+    LLMMessagesAppendFrame,
+    LLMSetToolsFrame,
+    LLMTextFrame,
+    LLMUpdateSettingsFrame,
+    StartFrame,
+    TranscriptionFrame,
+    TTSAudioRawFrame,
+    TTSStartedFrame,
+    TTSStoppedFrame,
+    TTSTextFrame,
+    UserStartedSpeakingFrame,
+    UserStoppedSpeakingFrame,
+)
+from pipecat.metrics.metrics import LLMTokenUsage
+from pipecat.processors.aggregators.llm_response import (
+    LLMAssistantAggregatorParams,
+    LLMUserAggregatorParams,
+)
+from pipecat.processors.aggregators.openai_llm_context import (
+    OpenAILLMContext,
+    OpenAILLMContextFrame,
+)
+from pipecat.processors.frame_processor import FrameDirection
+from pipecat.services.llm_service import FunctionCallFromLLM, LLMService
+from pipecat.services.openai.llm import OpenAIContextAggregatorPair
+from pipecat.transcriptions.language import Language
+from pipecat.utils.time import time_now_iso8601
+from pipecat.utils.tracing.service_decorators import traced_openai_realtime, traced_stt
+
+from . import events
+from .context import (
+    OpenAIRealtimeAssistantContextAggregator,
+    OpenAIRealtimeLLMContext,
+    OpenAIRealtimeUserContextAggregator,
+)
+from .frames import RealtimeFunctionCallResultFrame, RealtimeMessagesUpdateFrame
+
+try:
+    from websockets.asyncio.client import connect as websocket_connect
+except ModuleNotFoundError as e:
+    logger.error(f"Exception: {e}")
+    logger.error("In order to use OpenAI, you need to `pip install pipecat-ai[openai]`.")
+    raise Exception(f"Missing module: {e}")
+
+
+@dataclass
+class CurrentAudioResponse:
+    """Tracks the current audio response from the assistant.
+
+    Parameters:
+        item_id: Unique identifier for the audio response item.
+        content_index: Index of the audio content within the item.
+        start_time_ms: Timestamp when the audio response started in milliseconds.
+        total_size: Total size of audio data received in bytes. Defaults to 0.
+    """
+
+    item_id: str
+    content_index: int
+    start_time_ms: int
+    total_size: int = 0
+
+
+class OpenAIRealtimeLLMService(LLMService):
+    """OpenAI Realtime LLM service providing real-time audio and text communication.
+
+    Implements the OpenAI Realtime API with WebSocket communication for low-latency
+    bidirectional audio and text interactions. Supports function calling, conversation
+    management, and real-time transcription.
+    """
+
+    # Overriding the default adapter to use the OpenAIRealtimeLLMAdapter one.
+    adapter_class = OpenAIRealtimeLLMAdapter
+
+    def __init__(
+        self,
+        *,
+        api_key: str,
+        model: str = "gpt-realtime",
+        base_url: str = "wss://api.openai.com/v1/realtime",
+        session_properties: Optional[events.SessionProperties] = None,
+        start_audio_paused: bool = False,
+        send_transcription_frames: bool = True,
+        **kwargs,
+    ):
+        """Initialize the OpenAI Realtime LLM service.
+
+        Args:
+            api_key: OpenAI API key for authentication.
+            model: OpenAI model name. Defaults to "gpt-4o-realtime-preview-2025-06-03".
+            base_url: WebSocket base URL for the realtime API.
+                Defaults to "wss://api.openai.com/v1/realtime".
+            session_properties: Configuration properties for the realtime session.
+                If None, uses default SessionProperties.
+            start_audio_paused: Whether to start with audio input paused. Defaults to False.
+            send_transcription_frames: Whether to emit transcription frames. Defaults to True.
+            **kwargs: Additional arguments passed to parent LLMService.
+        """
+        full_url = f"{base_url}?model={model}"
+        super().__init__(base_url=full_url, **kwargs)
+
+        self.api_key = api_key
+        self.base_url = full_url
+        self.set_model_name(model)
+
+        self._session_properties: events.SessionProperties = (
+            session_properties or events.SessionProperties()
+        )
+        self._audio_input_paused = start_audio_paused
+        self._send_transcription_frames = send_transcription_frames
+        self._websocket = None
+        self._receive_task = None
+        self._context = None
+
+        self._disconnecting = False
+        self._api_session_ready = False
+        self._run_llm_when_api_session_ready = False
+
+        self._current_assistant_response = None
+        self._current_audio_response = None
+
+        self._messages_added_manually = {}
+        self._user_and_response_message_tuple = None
+        self._pending_function_calls = {}  # Track function calls by call_id
+
+        self._register_event_handler("on_conversation_item_created")
+        self._register_event_handler("on_conversation_item_updated")
+        self._retrieve_conversation_item_futures = {}
+
+    def can_generate_metrics(self) -> bool:
+        """Check if the service can generate usage metrics.
+
+        Returns:
+            True if metrics generation is supported.
+        """
+        return True
+
+    def set_audio_input_paused(self, paused: bool):
+        """Set whether audio input is paused.
+
+        Args:
+            paused: True to pause audio input, False to resume.
+        """
+        self._audio_input_paused = paused
+
+    def _is_modality_enabled(self, modality: str) -> bool:
+        """Check if a specific modality is enabled, "text" or "audio"."""
+        modalities = self._session_properties.output_modalities or ["audio", "text"]
+        return modality in modalities
+
+    def _get_enabled_modalities(self) -> list[str]:
+        """Get the list of enabled modalities."""
+        modalities = self._session_properties.output_modalities or ["audio", "text"]
+        # API only supports single modality responses: either ["text"] or ["audio"]
+        if "audio" in modalities:
+            return ["audio"]
+        elif "text" in modalities:
+            return ["text"]
+
+    async def retrieve_conversation_item(self, item_id: str):
+        """Retrieve a conversation item by ID from the server.
+
+        Args:
+            item_id: The ID of the conversation item to retrieve.
+
+        Returns:
+            The retrieved conversation item.
+        """
+        future = self.get_event_loop().create_future()
+        retrieval_in_flight = False
+        if not self._retrieve_conversation_item_futures.get(item_id):
+            self._retrieve_conversation_item_futures[item_id] = []
+        else:
+            retrieval_in_flight = True
+        self._retrieve_conversation_item_futures[item_id].append(future)
+        if not retrieval_in_flight:
+            await self.send_client_event(
+                # Set event_id to "rci_{item_id}" so that we can identify an
+                # error later if the retrieval fails. We don't need a UUID
+                # suffix to the event_id because we're ensuring only one
+                # in-flight retrieval per item_id. (Note: "rci" = "retrieve
+                # conversation item")
+                events.ConversationItemRetrieveEvent(item_id=item_id, event_id=f"rci_{item_id}")
+            )
+        return await future
+
+    #
+    # standard AIService frame handling
+    #
+
+    async def start(self, frame: StartFrame):
+        """Start the service and establish WebSocket connection.
+
+        Args:
+            frame: The start frame triggering service initialization.
+        """
+        await super().start(frame)
+        await self._connect()
+
+    async def stop(self, frame: EndFrame):
+        """Stop the service and close WebSocket connection.
+
+        Args:
+            frame: The end frame triggering service shutdown.
+        """
+        await super().stop(frame)
+        await self._disconnect()
+
+    async def cancel(self, frame: CancelFrame):
+        """Cancel the service and close WebSocket connection.
+
+        Args:
+            frame: The cancel frame triggering service cancellation.
+        """
+        await super().cancel(frame)
+        await self._disconnect()
+
+    #
+    # speech and interruption handling
+    #
+
+    async def _handle_interruption(self):
+        # None and False are different. Check for False. None means we're using OpenAI's
+        # built-in turn detection defaults.
+        turn_detection_disabled = (
+            self._session_properties.audio
+            and self._session_properties.audio.input
+            and self._session_properties.audio.input.turn_detection is False
+        )
+        if turn_detection_disabled:
+            await self.send_client_event(events.InputAudioBufferClearEvent())
+            await self.send_client_event(events.ResponseCancelEvent())
+        await self._truncate_current_audio_response()
+        await self.stop_all_metrics()
+        if self._current_assistant_response:
+            await self.push_frame(LLMFullResponseEndFrame())
+            # Only push TTSStoppedFrame if audio modality is enabled
+            if self._is_modality_enabled("audio"):
+                await self.push_frame(TTSStoppedFrame())
+
+    async def _handle_user_started_speaking(self, frame):
+        pass
+
+    async def _handle_user_stopped_speaking(self, frame):
+        # None and False are different. Check for False. None means we're using OpenAI's
+        # built-in turn detection defaults.
+        turn_detection_disabled = (
+            self._session_properties.audio
+            and self._session_properties.audio.input
+            and self._session_properties.audio.input.turn_detection is False
+        )
+        if turn_detection_disabled:
+            await self.send_client_event(events.InputAudioBufferCommitEvent())
+            await self.send_client_event(events.ResponseCreateEvent())
+
+    async def _handle_bot_stopped_speaking(self):
+        self._current_audio_response = None
+
+    def _calculate_audio_duration_ms(
+        self, total_bytes: int, sample_rate: int = 24000, bytes_per_sample: int = 2
+    ) -> int:
+        """Calculate audio duration in milliseconds based on PCM audio parameters."""
+        samples = total_bytes / bytes_per_sample
+        duration_seconds = samples / sample_rate
+        return int(duration_seconds * 1000)
+
+    async def _truncate_current_audio_response(self):
+        """Truncates the current audio response at the appropriate duration.
+
+        Calculates the actual duration of the audio content and truncates at the shorter of
+        either the wall clock time or the actual audio duration to prevent invalid truncation
+        requests.
+        """
+        if not self._current_audio_response:
+            return
+
+        # if the bot is still speaking, truncate the last message
+        try:
+            current = self._current_audio_response
+            self._current_audio_response = None
+
+            # Calculate actual audio duration instead of using wall clock time
+            audio_duration_ms = self._calculate_audio_duration_ms(current.total_size)
+
+            # Use the shorter of wall clock time or actual audio duration
+            elapsed_ms = int(time.time() * 1000 - current.start_time_ms)
+            truncate_ms = min(elapsed_ms, audio_duration_ms)
+
+            logger.trace(
+                f"Truncating audio: duration={audio_duration_ms}ms, "
+                f"elapsed={elapsed_ms}ms, truncate={truncate_ms}ms"
+            )
+
+            await self.send_client_event(
+                events.ConversationItemTruncateEvent(
+                    item_id=current.item_id,
+                    content_index=current.content_index,
+                    audio_end_ms=truncate_ms,
+                )
+            )
+        except Exception as e:
+            # Log warning and don't re-raise - allow session to continue
+            logger.warning(f"Audio truncation failed (non-fatal): {e}")
+
+    #
+    # frame processing
+    #
+    # StartFrame, StopFrame, CancelFrame implemented in base class
+    #
+
+    async def process_frame(self, frame: Frame, direction: FrameDirection):
+        """Process incoming frames from the pipeline.
+
+        Args:
+            frame: The frame to process.
+            direction: The direction of frame flow in the pipeline.
+        """
+        await super().process_frame(frame, direction)
+
+        if isinstance(frame, TranscriptionFrame):
+            pass
+        elif isinstance(frame, OpenAILLMContextFrame):
+            context: OpenAIRealtimeLLMContext = OpenAIRealtimeLLMContext.upgrade_to_realtime(
+                frame.context
+            )
+            if not self._context:
+                self._context = context
+            elif frame.context is not self._context:
+                # If the context has changed, reset the conversation
+                self._context = context
+                await self.reset_conversation()
+            # Run the LLM at next opportunity
+            await self._create_response()
+        elif isinstance(frame, LLMContextFrame):
+            raise NotImplementedError(
+                "Universal LLMContext is not yet supported for OpenAI Realtime."
+            )
+        elif isinstance(frame, InputAudioRawFrame):
+            if not self._audio_input_paused:
+                await self._send_user_audio(frame)
+        elif isinstance(frame, InterruptionFrame):
+            await self._handle_interruption()
+        elif isinstance(frame, UserStartedSpeakingFrame):
+            await self._handle_user_started_speaking(frame)
+        elif isinstance(frame, UserStoppedSpeakingFrame):
+            await self._handle_user_stopped_speaking(frame)
+        elif isinstance(frame, BotStoppedSpeakingFrame):
+            await self._handle_bot_stopped_speaking()
+        elif isinstance(frame, LLMMessagesAppendFrame):
+            await self._handle_messages_append(frame)
+        elif isinstance(frame, RealtimeMessagesUpdateFrame):
+            self._context = frame.context
+        elif isinstance(frame, LLMUpdateSettingsFrame):
+            self._session_properties = events.SessionProperties(**frame.settings)
+            await self._update_settings()
+        elif isinstance(frame, LLMSetToolsFrame):
+            await self._update_settings()
+        elif isinstance(frame, RealtimeFunctionCallResultFrame):
+            await self._handle_function_call_result(frame.result_frame)
+
+        await self.push_frame(frame, direction)
+
+    async def _handle_messages_append(self, frame):
+        logger.error("!!! NEED TO IMPLEMENT MESSAGES APPEND")
+
+    async def _handle_function_call_result(self, frame):
+        item = events.ConversationItem(
+            type="function_call_output",
+            call_id=frame.tool_call_id,
+            output=json.dumps(frame.result),
+        )
+        await self.send_client_event(events.ConversationItemCreateEvent(item=item))
+
+    #
+    # websocket communication
+    #
+
+    async def send_client_event(self, event: events.ClientEvent):
+        """Send a client event to the OpenAI Realtime API.
+
+        Args:
+            event: The client event to send.
+        """
+        await self._ws_send(event.model_dump(exclude_none=True))
+
+    async def _connect(self):
+        try:
+            if self._websocket:
+                # Here we assume that if we have a websocket, we are connected. We
+                # handle disconnections in the send/recv code paths.
+                return
+            self._websocket = await websocket_connect(
+                uri=self.base_url,
+                additional_headers={
+                    "Authorization": f"Bearer {self.api_key}",
+                },
+            )
+            self._receive_task = self.create_task(self._receive_task_handler())
+        except Exception as e:
+            logger.error(f"{self} initialization error: {e}")
+            self._websocket = None
+
+    async def _disconnect(self):
+        try:
+            self._disconnecting = True
+            self._api_session_ready = False
+            await self.stop_all_metrics()
+            if self._websocket:
+                await self._websocket.close()
+                self._websocket = None
+            if self._receive_task:
+                await self.cancel_task(self._receive_task, timeout=1.0)
+                self._receive_task = None
+            self._disconnecting = False
+        except Exception as e:
+            logger.error(f"{self} error disconnecting: {e}")
+
+    async def _ws_send(self, realtime_message):
+        try:
+            if self._websocket:
+                await self._websocket.send(json.dumps(realtime_message))
+        except Exception as e:
+            if self._disconnecting:
+                return
+            logger.error(f"Error sending message to websocket: {e}")
+            # In server-to-server contexts, a WebSocket error should be quite rare. Given how hard
+            # it is to recover from a send-side error with proper state management, and that exponential
+            # backoff for retries can have cost/stability implications for a service cluster, let's just
+            # treat a send-side error as fatal.
+            await self.push_error(ErrorFrame(error=f"Error sending client event: {e}", fatal=True))
+
+    async def _update_settings(self):
+        settings = self._session_properties
+        # tools given in the context override the tools in the session properties
+        if self._context and self._context.tools:
+            settings.tools = self._context.tools
+        # instructions in the context come from an initial "system" message in the
+        # messages list, and override instructions in the session properties
+        if self._context and self._context._session_instructions:
+            settings.instructions = self._context._session_instructions
+        await self.send_client_event(events.SessionUpdateEvent(session=settings))
+
+    #
+    # inbound server event handling
+    # https://platform.openai.com/docs/api-reference/realtime-server-events
+    #
+
+    async def _receive_task_handler(self):
+        async for message in self._websocket:
+            evt = events.parse_server_event(message)
+            if evt.type == "session.created":
+                await self._handle_evt_session_created(evt)
+            elif evt.type == "session.updated":
+                await self._handle_evt_session_updated(evt)
+            elif evt.type == "response.output_audio.delta":
+                await self._handle_evt_audio_delta(evt)
+            elif evt.type == "response.output_audio.done":
+                await self._handle_evt_audio_done(evt)
+            elif evt.type == "conversation.item.added":
+                await self._handle_evt_conversation_item_added(evt)
+            elif evt.type == "conversation.item.done":
+                await self._handle_evt_conversation_item_done(evt)
+            elif evt.type == "conversation.item.input_audio_transcription.delta":
+                await self._handle_evt_input_audio_transcription_delta(evt)
+            elif evt.type == "conversation.item.input_audio_transcription.completed":
+                await self.handle_evt_input_audio_transcription_completed(evt)
+            elif evt.type == "conversation.item.retrieved":
+                await self._handle_conversation_item_retrieved(evt)
+            elif evt.type == "response.done":
+                await self._handle_evt_response_done(evt)
+            elif evt.type == "input_audio_buffer.speech_started":
+                await self._handle_evt_speech_started(evt)
+            elif evt.type == "input_audio_buffer.speech_stopped":
+                await self._handle_evt_speech_stopped(evt)
+            elif evt.type == "response.output_text.delta":
+                await self._handle_evt_text_delta(evt)
+            elif evt.type == "response.output_audio_transcript.delta":
+                await self._handle_evt_audio_transcript_delta(evt)
+            elif evt.type == "response.function_call_arguments.done":
+                await self._handle_evt_function_call_arguments_done(evt)
+            elif evt.type == "error":
+                if not await self._maybe_handle_evt_retrieve_conversation_item_error(evt):
+                    await self._handle_evt_error(evt)
+                    # errors are fatal, so exit the receive loop
+                    return
+
+    @traced_openai_realtime(operation="llm_setup")
+    async def _handle_evt_session_created(self, evt):
+        # session.created is received right after connecting. Send a message
+        # to configure the session properties.
+        await self._update_settings()
+
+    async def _handle_evt_session_updated(self, evt):
+        # If this is our first context frame, run the LLM
+        self._api_session_ready = True
+        # Now that we've configured the session, we can run the LLM if we need to.
+        if self._run_llm_when_api_session_ready:
+            self._run_llm_when_api_session_ready = False
+            await self._create_response()
+
+    async def _handle_evt_audio_delta(self, evt):
+        # note: ttfb is faster by 1/2 RTT than ttfb as measured for other services, since we're getting
+        # this event from the server
+        await self.stop_ttfb_metrics()
+        if not self._current_audio_response:
+            self._current_audio_response = CurrentAudioResponse(
+                item_id=evt.item_id,
+                content_index=evt.content_index,
+                start_time_ms=int(time.time() * 1000),
+            )
+            await self.push_frame(TTSStartedFrame())
+        audio = base64.b64decode(evt.delta)
+        self._current_audio_response.total_size += len(audio)
+        frame = TTSAudioRawFrame(
+            audio=audio,
+            sample_rate=24000,
+            num_channels=1,
+        )
+        await self.push_frame(frame)
+
+    async def _handle_evt_audio_done(self, evt):
+        if self._current_audio_response:
+            await self.push_frame(TTSStoppedFrame())
+            # Don't clear the self._current_audio_response here. We need to wait until we
+            # receive a BotStoppedSpeakingFrame from the output transport.
+
+    async def _handle_evt_conversation_item_added(self, evt):
+        """Handle conversation.item.added event - item is added but may still be processing."""
+        if evt.item.type == "function_call":
+            # Track this function call for when arguments are completed
+            # Only add if not already tracked (prevent duplicates)
+            if evt.item.call_id not in self._pending_function_calls:
+                self._pending_function_calls[evt.item.call_id] = evt.item
+            else:
+                logger.warning(f"Function call {evt.item.call_id} already tracked, skipping")
+
+        await self._call_event_handler("on_conversation_item_created", evt.item.id, evt.item)
+
+        # This will get sent from the server every time a new "message" is added
+        # to the server's conversation state, whether we create it via the API
+        # or the server creates it from LLM output.
+        if self._messages_added_manually.get(evt.item.id):
+            del self._messages_added_manually[evt.item.id]
+            return
+
+        if evt.item.role == "user":
+            # We need to wait for completion of both user message and response message. Then we'll
+            # add both to the context. User message is complete when we have a "transcript" field
+            # that is not None. Response message is complete when we get a "response.done" event.
+            self._user_and_response_message_tuple = (evt.item, {"done": False, "output": []})
+        elif evt.item.role == "assistant":
+            self._current_assistant_response = evt.item
+            await self.push_frame(LLMFullResponseStartFrame())
+
+    async def _handle_evt_conversation_item_done(self, evt):
+        """Handle conversation.item.done event - item is fully completed."""
+        await self._call_event_handler("on_conversation_item_updated", evt.item.id, evt.item)
+        # The item is now fully processed and ready
+        # For now, no additional logic needed beyond the event handler call
+
+    async def _handle_evt_input_audio_transcription_delta(self, evt):
+        if self._send_transcription_frames:
+            await self.push_frame(
+                # no way to get a language code?
+                InterimTranscriptionFrame(evt.delta, "", time_now_iso8601(), result=evt)
+            )
+
+    @traced_stt
+    async def _handle_user_transcription(
+        self, transcript: str, is_final: bool, language: Optional[Language] = None
+    ):
+        """Handle a transcription result with tracing."""
+        pass
+
+    async def handle_evt_input_audio_transcription_completed(self, evt):
+        """Handle completion of input audio transcription.
+
+        Args:
+            evt: The transcription completed event.
+        """
+        await self._call_event_handler("on_conversation_item_updated", evt.item_id, None)
+
+        if self._send_transcription_frames:
+            await self.push_frame(
+                # no way to get a language code?
+                TranscriptionFrame(evt.transcript, "", time_now_iso8601(), result=evt)
+            )
+            await self._handle_user_transcription(evt.transcript, True, Language.EN)
+        pair = self._user_and_response_message_tuple
+        if pair:
+            user, assistant = pair
+            user.content[0].transcript = evt.transcript
+            if assistant["done"]:
+                self._user_and_response_message_tuple = None
+                self._context.add_user_content_item_as_message(user)
+        else:
+            # User message without preceding conversation.item.created. Bug?
+            logger.warning(f"Transcript for unknown user message: {evt}")
+
+    async def _handle_conversation_item_retrieved(self, evt: events.ConversationItemRetrieved):
+        futures = self._retrieve_conversation_item_futures.pop(evt.item.id, None)
+        if futures:
+            for future in futures:
+                future.set_result(evt.item)
+
+    @traced_openai_realtime(operation="llm_response")
+    async def _handle_evt_response_done(self, evt):
+        # todo: figure out whether there's anything we need to do for "cancelled" events
+        # usage metrics
+        tokens = LLMTokenUsage(
+            prompt_tokens=evt.response.usage.input_tokens,
+            completion_tokens=evt.response.usage.output_tokens,
+            total_tokens=evt.response.usage.total_tokens,
+        )
+        await self.start_llm_usage_metrics(tokens)
+        await self.stop_processing_metrics()
+        await self.push_frame(LLMFullResponseEndFrame())
+        self._current_assistant_response = None
+        # error handling
+        if evt.response.status == "failed":
+            await self.push_error(
+                ErrorFrame(error=evt.response.status_details["error"]["message"], fatal=True)
+            )
+            return
+        # response content
+        for item in evt.response.output:
+            await self._call_event_handler("on_conversation_item_updated", item.id, item)
+        pair = self._user_and_response_message_tuple
+        if pair:
+            user, assistant = pair
+            assistant["done"] = True
+            assistant["output"] = evt.response.output
+            if user.content[0].transcript is not None:
+                self._user_and_response_message_tuple = None
+                self._context.add_user_content_item_as_message(user)
+        else:
+            # Response message without preceding user message (standalone response)
+            # Function calls in this response were already processed immediately when arguments were complete
+            logger.debug(f"Handling standalone response: {evt.response.id}")
+
+    async def _handle_evt_text_delta(self, evt):
+        if evt.delta:
+            await self.push_frame(LLMTextFrame(evt.delta))
+
+    async def _handle_evt_audio_transcript_delta(self, evt):
+        if evt.delta:
+            await self.push_frame(LLMTextFrame(evt.delta))
+            await self.push_frame(TTSTextFrame(evt.delta))
+
+    async def _handle_evt_function_call_arguments_done(self, evt):
+        """Handle completion of function call arguments.
+
+        Args:
+            evt: The response.function_call_arguments.done event.
+        """
+        # Process the function call immediately when arguments are complete
+        # This is needed because function calls might not trigger response.done
+        try:
+            # Parse the arguments
+            args = json.loads(evt.arguments)
+
+            # Get the function call item we tracked earlier
+            function_call_item = self._pending_function_calls.get(evt.call_id)
+            if function_call_item:
+                # Remove from pending calls FIRST to prevent duplicate processing
+                del self._pending_function_calls[evt.call_id]
+
+                # Create the function call and process it
+                function_calls = [
+                    FunctionCallFromLLM(
+                        context=self._context,
+                        tool_call_id=evt.call_id,
+                        function_name=function_call_item.name,
+                        arguments=args,
+                    )
+                ]
+
+                await self.run_function_calls(function_calls)
+                logger.debug(f"Processed function call: {function_call_item.name}")
+            else:
+                logger.warning(f"No tracked function call found for call_id: {evt.call_id}")
+                logger.warning(
+                    f"Available pending calls: {list(self._pending_function_calls.keys())}"
+                )
+
+        except Exception as e:
+            logger.error(f"Failed to process function call arguments: {e}")
+
+    async def _handle_evt_speech_started(self, evt):
+        await self._truncate_current_audio_response()
+        await self.push_interruption_task_frame_and_wait()
+        await self.push_frame(UserStartedSpeakingFrame())
+
+    async def _handle_evt_speech_stopped(self, evt):
+        await self.start_ttfb_metrics()
+        await self.start_processing_metrics()
+        await self.push_frame(UserStoppedSpeakingFrame())
+
+    async def _maybe_handle_evt_retrieve_conversation_item_error(self, evt: events.ErrorEvent):
+        """Maybe handle an error event related to retrieving a conversation item.
+
+        If the given error event is an error retrieving a conversation item:
+
+        - set an exception on the future that retrieve_conversation_item() is waiting on
+        - return true
+        Otherwise:
+        - return false
+        """
+        if evt.error.code == "item_retrieve_invalid_item_id":
+            item_id = evt.error.event_id.split("_", 1)[1]  # event_id is of the form "rci_{item_id}"
+            futures = self._retrieve_conversation_item_futures.pop(item_id, None)
+            if futures:
+                for future in futures:
+                    future.set_exception(Exception(evt.error.message))
+            return True
+        return False
+
+    async def _handle_evt_error(self, evt):
+        # Errors are fatal to this connection. Send an ErrorFrame.
+        await self.push_error(ErrorFrame(error=f"Error: {evt}", fatal=True))
+
+    #
+    # state and client events for the current conversation
+    # https://platform.openai.com/docs/api-reference/realtime-client-events
+    #
+
+    async def reset_conversation(self):
+        """Reset the conversation by disconnecting and reconnecting.
+
+        This is the safest way to start a new conversation. Note that this will
+        fail if called from the receive task.
+        """
+        logger.debug("Resetting conversation")
+        await self._disconnect()
+        if self._context:
+            self._context.llm_needs_settings_update = True
+            self._context.llm_needs_initial_messages = True
+        await self._connect()
+
+    @traced_openai_realtime(operation="llm_request")
+    async def _create_response(self):
+        if not self._api_session_ready:
+            self._run_llm_when_api_session_ready = True
+            return
+
+        if self._context.llm_needs_initial_messages:
+            messages = self._context.get_messages_for_initializing_history()
+            for item in messages:
+                evt = events.ConversationItemCreateEvent(item=item)
+                self._messages_added_manually[evt.item.id] = True
+                await self.send_client_event(evt)
+            self._context.llm_needs_initial_messages = False
+
+        if self._context.llm_needs_settings_update:
+            await self._update_settings()
+            self._context.llm_needs_settings_update = False
+
+        logger.debug(f"Creating response: {self._context.get_messages_for_logging()}")
+
+        await self.push_frame(LLMFullResponseStartFrame())
+        await self.start_processing_metrics()
+        await self.start_ttfb_metrics()
+        await self.send_client_event(
+            events.ResponseCreateEvent(
+                response=events.ResponseProperties(output_modalities=self._get_enabled_modalities())
+            )
+        )
+
+    async def _send_user_audio(self, frame):
+        payload = base64.b64encode(frame.audio).decode("utf-8")
+        await self.send_client_event(events.InputAudioBufferAppendEvent(audio=payload))
+
+    def create_context_aggregator(
+        self,
+        context: OpenAILLMContext,
+        *,
+        user_params: LLMUserAggregatorParams = LLMUserAggregatorParams(),
+        assistant_params: LLMAssistantAggregatorParams = LLMAssistantAggregatorParams(),
+    ) -> OpenAIContextAggregatorPair:
+        """Create an instance of OpenAIContextAggregatorPair from an OpenAILLMContext.
+
+        Constructor keyword arguments for both the user and assistant aggregators can be provided.
+
+        Args:
+            context: The LLM context.
+            user_params: User aggregator parameters.
+            assistant_params: Assistant aggregator parameters.
+
+        Returns:
+            OpenAIContextAggregatorPair: A pair of context aggregators, one for
+            the user and one for the assistant, encapsulated in an
+            OpenAIContextAggregatorPair.
+        """
+        context.set_llm_adapter(self.get_llm_adapter())
+
+        OpenAIRealtimeLLMContext.upgrade_to_realtime(context)
+        user = OpenAIRealtimeUserContextAggregator(context, params=user_params)
+
+        assistant_params.expect_stripped_words = False
+        assistant = OpenAIRealtimeAssistantContextAggregator(context, params=assistant_params)
+        return OpenAIContextAggregatorPair(_user=user, _assistant=assistant)