dv-pipecat-ai 0.0.82.dev815__py3-none-any.whl → 0.0.82.dev857__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Potentially problematic release: this version of dv-pipecat-ai might be problematic.
- {dv_pipecat_ai-0.0.82.dev815.dist-info → dv_pipecat_ai-0.0.82.dev857.dist-info}/METADATA +8 -3
- {dv_pipecat_ai-0.0.82.dev815.dist-info → dv_pipecat_ai-0.0.82.dev857.dist-info}/RECORD +106 -79
- pipecat/adapters/base_llm_adapter.py +44 -6
- pipecat/adapters/services/anthropic_adapter.py +302 -2
- pipecat/adapters/services/aws_nova_sonic_adapter.py +40 -2
- pipecat/adapters/services/bedrock_adapter.py +40 -2
- pipecat/adapters/services/gemini_adapter.py +276 -6
- pipecat/adapters/services/open_ai_adapter.py +88 -7
- pipecat/adapters/services/open_ai_realtime_adapter.py +39 -1
- pipecat/audio/dtmf/__init__.py +0 -0
- pipecat/audio/dtmf/types.py +47 -0
- pipecat/audio/dtmf/utils.py +70 -0
- pipecat/audio/filters/aic_filter.py +199 -0
- pipecat/audio/utils.py +9 -7
- pipecat/extensions/ivr/__init__.py +0 -0
- pipecat/extensions/ivr/ivr_navigator.py +452 -0
- pipecat/frames/frames.py +156 -43
- pipecat/pipeline/llm_switcher.py +76 -0
- pipecat/pipeline/parallel_pipeline.py +3 -3
- pipecat/pipeline/service_switcher.py +144 -0
- pipecat/pipeline/task.py +68 -28
- pipecat/pipeline/task_observer.py +10 -0
- pipecat/processors/aggregators/dtmf_aggregator.py +2 -2
- pipecat/processors/aggregators/llm_context.py +277 -0
- pipecat/processors/aggregators/llm_response.py +48 -15
- pipecat/processors/aggregators/llm_response_universal.py +840 -0
- pipecat/processors/aggregators/openai_llm_context.py +3 -3
- pipecat/processors/dtmf_aggregator.py +0 -2
- pipecat/processors/filters/stt_mute_filter.py +0 -2
- pipecat/processors/frame_processor.py +18 -11
- pipecat/processors/frameworks/rtvi.py +17 -10
- pipecat/processors/metrics/sentry.py +2 -0
- pipecat/runner/daily.py +137 -36
- pipecat/runner/run.py +1 -1
- pipecat/runner/utils.py +7 -7
- pipecat/serializers/asterisk.py +20 -4
- pipecat/serializers/exotel.py +1 -1
- pipecat/serializers/plivo.py +1 -1
- pipecat/serializers/telnyx.py +1 -1
- pipecat/serializers/twilio.py +1 -1
- pipecat/services/__init__.py +2 -2
- pipecat/services/anthropic/llm.py +113 -28
- pipecat/services/asyncai/tts.py +4 -0
- pipecat/services/aws/llm.py +82 -8
- pipecat/services/aws/tts.py +0 -10
- pipecat/services/aws_nova_sonic/aws.py +5 -0
- pipecat/services/cartesia/tts.py +28 -16
- pipecat/services/cerebras/llm.py +15 -10
- pipecat/services/deepgram/stt.py +8 -0
- pipecat/services/deepseek/llm.py +13 -8
- pipecat/services/fireworks/llm.py +13 -8
- pipecat/services/fish/tts.py +8 -6
- pipecat/services/gemini_multimodal_live/gemini.py +5 -0
- pipecat/services/gladia/config.py +7 -1
- pipecat/services/gladia/stt.py +23 -15
- pipecat/services/google/llm.py +159 -59
- pipecat/services/google/llm_openai.py +18 -3
- pipecat/services/grok/llm.py +2 -1
- pipecat/services/llm_service.py +38 -3
- pipecat/services/mem0/memory.py +2 -1
- pipecat/services/mistral/llm.py +5 -6
- pipecat/services/nim/llm.py +2 -1
- pipecat/services/openai/base_llm.py +88 -26
- pipecat/services/openai/image.py +6 -1
- pipecat/services/openai_realtime_beta/openai.py +5 -2
- pipecat/services/openpipe/llm.py +6 -8
- pipecat/services/perplexity/llm.py +13 -8
- pipecat/services/playht/tts.py +9 -6
- pipecat/services/rime/tts.py +1 -1
- pipecat/services/sambanova/llm.py +18 -13
- pipecat/services/sarvam/tts.py +415 -10
- pipecat/services/speechmatics/stt.py +2 -2
- pipecat/services/tavus/video.py +1 -1
- pipecat/services/tts_service.py +15 -5
- pipecat/services/vistaar/llm.py +2 -5
- pipecat/transports/base_input.py +32 -19
- pipecat/transports/base_output.py +39 -5
- pipecat/transports/daily/__init__.py +0 -0
- pipecat/transports/daily/transport.py +2371 -0
- pipecat/transports/daily/utils.py +410 -0
- pipecat/transports/livekit/__init__.py +0 -0
- pipecat/transports/livekit/transport.py +1042 -0
- pipecat/transports/network/fastapi_websocket.py +12 -546
- pipecat/transports/network/small_webrtc.py +12 -922
- pipecat/transports/network/webrtc_connection.py +9 -595
- pipecat/transports/network/websocket_client.py +12 -481
- pipecat/transports/network/websocket_server.py +12 -487
- pipecat/transports/services/daily.py +9 -2334
- pipecat/transports/services/helpers/daily_rest.py +12 -396
- pipecat/transports/services/livekit.py +12 -975
- pipecat/transports/services/tavus.py +12 -757
- pipecat/transports/smallwebrtc/__init__.py +0 -0
- pipecat/transports/smallwebrtc/connection.py +612 -0
- pipecat/transports/smallwebrtc/transport.py +936 -0
- pipecat/transports/tavus/__init__.py +0 -0
- pipecat/transports/tavus/transport.py +770 -0
- pipecat/transports/websocket/__init__.py +0 -0
- pipecat/transports/websocket/client.py +494 -0
- pipecat/transports/websocket/fastapi.py +559 -0
- pipecat/transports/websocket/server.py +500 -0
- pipecat/transports/whatsapp/__init__.py +0 -0
- pipecat/transports/whatsapp/api.py +345 -0
- pipecat/transports/whatsapp/client.py +364 -0
- {dv_pipecat_ai-0.0.82.dev815.dist-info → dv_pipecat_ai-0.0.82.dev857.dist-info}/WHEEL +0 -0
- {dv_pipecat_ai-0.0.82.dev815.dist-info → dv_pipecat_ai-0.0.82.dev857.dist-info}/licenses/LICENSE +0 -0
- {dv_pipecat_ai-0.0.82.dev815.dist-info → dv_pipecat_ai-0.0.82.dev857.dist-info}/top_level.txt +0 -0
pipecat/processors/aggregators/llm_response_universal.py
@@ -0,0 +1,840 @@
#
# Copyright (c) 2024–2025, Daily
#
# SPDX-License-Identifier: BSD 2-Clause License
#

"""LLM response aggregators for handling conversation context and message aggregation.

This module provides aggregators that process and accumulate LLM responses, user inputs,
and conversation context. These aggregators handle the flow between speech-to-text,
LLM processing, and text-to-speech components in conversational AI pipelines.
"""

import asyncio
import json
from dataclasses import dataclass
from typing import Any, Dict, List, Literal, Optional, Set

from loguru import logger

from pipecat.adapters.schemas.tools_schema import ToolsSchema
from pipecat.audio.interruptions.base_interruption_strategy import BaseInterruptionStrategy
from pipecat.audio.turn.smart_turn.base_smart_turn import SmartTurnParams
from pipecat.audio.vad.vad_analyzer import VADParams
from pipecat.frames.frames import (
    BotInterruptionFrame,
    BotStartedSpeakingFrame,
    BotStoppedSpeakingFrame,
    CancelFrame,
    EmulateUserStartedSpeakingFrame,
    EmulateUserStoppedSpeakingFrame,
    EndFrame,
    Frame,
    FunctionCallCancelFrame,
    FunctionCallInProgressFrame,
    FunctionCallResultFrame,
    FunctionCallsStartedFrame,
    InputAudioRawFrame,
    InterimTranscriptionFrame,
    LLMContextAssistantTimestampFrame,
    LLMContextFrame,
    LLMFullResponseEndFrame,
    LLMFullResponseStartFrame,
    LLMMessagesAppendFrame,
    LLMMessagesUpdateFrame,
    LLMRunFrame,
    LLMSetToolChoiceFrame,
    LLMSetToolsFrame,
    SpeechControlParamsFrame,
    StartFrame,
    StartInterruptionFrame,
    TextFrame,
    TranscriptionFrame,
    UserImageRawFrame,
    UserStartedSpeakingFrame,
    UserStoppedSpeakingFrame,
)
from pipecat.processors.aggregators.llm_context import (
    LLMContext,
    LLMContextMessage,
    LLMSpecificMessage,
    NotGiven,
)
from pipecat.processors.aggregators.llm_response import (
    LLMAssistantAggregatorParams,
    LLMUserAggregatorParams,
)
from pipecat.processors.frame_processor import FrameDirection, FrameProcessor
from pipecat.utils.time import time_now_iso8601


class LLMContextAggregator(FrameProcessor):
    """Base LLM aggregator that uses an LLMContext for conversation storage.

    This aggregator maintains conversation state using an LLMContext and
    pushes LLMContextFrame objects as aggregation frames. It provides
    common functionality for context-based conversation management.
    """

    def __init__(self, *, context: LLMContext, role: str, **kwargs):
        """Initialize the context response aggregator.

        Args:
            context: The LLM context to use for conversation storage.
            role: The role this aggregator represents (e.g. "user", "assistant").
            **kwargs: Additional arguments passed to parent class.
        """
        super().__init__(**kwargs)
        self._context = context
        self._role = role

        self._aggregation: str = ""

    @property
    def messages(self) -> List[LLMContextMessage]:
        """Get messages from the LLM context.

        Returns:
            List of message dictionaries from the context.
        """
        return self._context.get_messages()

    @property
    def role(self) -> str:
        """Get the role for this aggregator.

        Returns:
            The role string for this aggregator.
        """
        return self._role

    @property
    def context(self):
        """Get the LLM context.

        Returns:
            The LLMContext instance used by this aggregator.
        """
        return self._context

    def _get_context_frame(self) -> LLMContextFrame:
        """Create a context frame with the current context.

        Returns:
            LLMContextFrame containing the current context.
        """
        return LLMContextFrame(context=self._context)

    async def push_context_frame(self, direction: FrameDirection = FrameDirection.DOWNSTREAM):
        """Push a context frame in the specified direction.

        Args:
            direction: The direction to push the frame (upstream or downstream).
        """
        frame = self._get_context_frame()
        await self.push_frame(frame, direction)

    def add_messages(self, messages):
        """Add messages to the context.

        Args:
            messages: Messages to add to the conversation context.
        """
        self._context.add_messages(messages)

    def set_messages(self, messages):
        """Set the context messages.

        Args:
            messages: Messages to replace the current context messages.
        """
        self._context.set_messages(messages)

    def set_tools(self, tools: ToolsSchema | NotGiven):
        """Set tools in the context.

        Args:
            tools: List of tool definitions to set in the context.
        """
        self._context.set_tools(tools)

    def set_tool_choice(self, tool_choice: Literal["none", "auto", "required"] | dict):
        """Set tool choice in the context.

        Args:
            tool_choice: Tool choice configuration for the context.
        """
        self._context.set_tool_choice(tool_choice)

    async def reset(self):
        """Reset the aggregation state."""
        self._aggregation = ""


class LLMUserAggregator(LLMContextAggregator):
    """User LLM aggregator that processes speech-to-text transcriptions.

    This aggregator handles the complex logic of aggregating user speech transcriptions
    from STT services. It manages multiple scenarios including:

    - Transcriptions received between VAD events
    - Transcriptions received outside VAD events
    - Interim vs final transcriptions
    - User interruptions during bot speech
    - Emulated VAD for whispered or short utterances

    The aggregator uses timeouts to handle cases where transcriptions arrive
    after VAD events or when no VAD is available.
    """

    def __init__(
        self,
        context: LLMContext,
        *,
        params: Optional[LLMUserAggregatorParams] = None,
        **kwargs,
    ):
        """Initialize the user context aggregator.

        Args:
            context: The LLM context for conversation storage.
            params: Configuration parameters for aggregation behavior.
            **kwargs: Additional arguments. Supports deprecated 'aggregation_timeout'.
        """
        super().__init__(context=context, role="user", **kwargs)
        self._params = params or LLMUserAggregatorParams()
        self._vad_params: Optional[VADParams] = None
        self._turn_params: Optional[SmartTurnParams] = None

        if "aggregation_timeout" in kwargs:
            import warnings

            with warnings.catch_warnings():
                warnings.simplefilter("always")
                warnings.warn(
                    "Parameter 'aggregation_timeout' is deprecated, use 'params' instead.",
                    DeprecationWarning,
                )

            self._params.aggregation_timeout = kwargs["aggregation_timeout"]

        self._user_speaking = False
        self._bot_speaking = False
        self._was_bot_speaking = False
        self._emulating_vad = False
        self._seen_interim_results = False
        self._waiting_for_aggregation = False

        self._aggregation_event = asyncio.Event()
        self._aggregation_task = None

    async def reset(self):
        """Reset the aggregation state and interruption strategies."""
        await super().reset()
        self._was_bot_speaking = False
        self._seen_interim_results = False
        self._waiting_for_aggregation = False
        [await s.reset() for s in self._interruption_strategies]

    async def process_frame(self, frame: Frame, direction: FrameDirection):
        """Process frames for user speech aggregation and context management.

        Args:
            frame: The frame to process.
            direction: The direction of frame flow in the pipeline.
        """
        await super().process_frame(frame, direction)

        if isinstance(frame, StartFrame):
            # Push StartFrame before start(), because we want StartFrame to be
            # processed by every processor before any other frame is processed.
            await self.push_frame(frame, direction)
            await self._start(frame)
        elif isinstance(frame, EndFrame):
            # Push EndFrame before stop(), because stop() waits on the task to
            # finish and the task finishes when EndFrame is processed.
            await self.push_frame(frame, direction)
            await self._stop(frame)
        elif isinstance(frame, CancelFrame):
            await self._cancel(frame)
            await self.push_frame(frame, direction)
        elif isinstance(frame, InputAudioRawFrame):
            await self._handle_input_audio(frame)
            await self.push_frame(frame, direction)
        elif isinstance(frame, UserStartedSpeakingFrame):
            await self._handle_user_started_speaking(frame)
            await self.push_frame(frame, direction)
        elif isinstance(frame, UserStoppedSpeakingFrame):
            await self._handle_user_stopped_speaking(frame)
            await self.push_frame(frame, direction)
        elif isinstance(frame, BotStartedSpeakingFrame):
            await self._handle_bot_started_speaking(frame)
            await self.push_frame(frame, direction)
        elif isinstance(frame, BotStoppedSpeakingFrame):
            await self._handle_bot_stopped_speaking(frame)
            await self.push_frame(frame, direction)
        elif isinstance(frame, TranscriptionFrame):
            await self._handle_transcription(frame)
        elif isinstance(frame, InterimTranscriptionFrame):
            await self._handle_interim_transcription(frame)
        elif isinstance(frame, LLMRunFrame):
            await self._handle_llm_run(frame)
        elif isinstance(frame, LLMMessagesAppendFrame):
            await self._handle_llm_messages_append(frame)
        elif isinstance(frame, LLMMessagesUpdateFrame):
            await self._handle_llm_messages_update(frame)
        elif isinstance(frame, LLMSetToolsFrame):
            self.set_tools(frame.tools)
        elif isinstance(frame, LLMSetToolChoiceFrame):
            self.set_tool_choice(frame.tool_choice)
        elif isinstance(frame, SpeechControlParamsFrame):
            self._vad_params = frame.vad_params
            self._turn_params = frame.turn_params
            await self.push_frame(frame, direction)
        else:
            await self.push_frame(frame, direction)

    async def _process_aggregation(self):
        """Process the current aggregation and push it downstream."""
        aggregation = self._aggregation
        await self.reset()
        self._context.add_message({"role": self.role, "content": aggregation})
        frame = LLMContextFrame(self._context)
        await self.push_frame(frame)

    async def _push_aggregation(self):
        """Push the current aggregation based on interruption strategies and conditions."""
        if len(self._aggregation) > 0:
            if self.interruption_strategies and self._bot_speaking:
                should_interrupt = await self._should_interrupt_based_on_strategies()

                if should_interrupt:
                    logger.debug(
                        "Interruption conditions met - pushing BotInterruptionFrame and aggregation"
                    )
                    await self.push_frame(BotInterruptionFrame(), FrameDirection.UPSTREAM)
                    await self._process_aggregation()
                else:
                    logger.debug("Interruption conditions not met - not pushing aggregation")
                    # Don't process aggregation, just reset it
                    await self.reset()
            else:
                # No interruption config - normal behavior (always push aggregation)
                await self._process_aggregation()
        # Handles the case where both the user and the bot are not speaking,
        # and the bot was previously speaking before the user interruption.
        # Normally, when the user stops speaking, new text is expected,
        # which triggers the bot to respond. However, if no new text
        # is received, this safeguard ensures
        # the bot doesn't hang indefinitely while waiting to speak again.
        elif not self._seen_interim_results and self._was_bot_speaking and not self._bot_speaking:
            logger.warning("User stopped speaking but no new aggregation received.")
            # Resetting it so we don't trigger this twice
            self._was_bot_speaking = False
            # TODO: we are not enabling this for now, due to some STT services which can take as long as 2 seconds two return a transcription
            # So we need more tests and probably make this feature configurable, disabled it by default.
            # We are just pushing the same previous context to be processed again in this case
            # await self.push_frame(LLMContextFrame(self._context))

    async def _should_interrupt_based_on_strategies(self) -> bool:
        """Check if interruption should occur based on configured strategies.

        Returns:
            True if any interruption strategy indicates interruption should occur.
        """

        async def should_interrupt(strategy: BaseInterruptionStrategy):
            await strategy.append_text(self._aggregation)
            return await strategy.should_interrupt()

        return any([await should_interrupt(s) for s in self._interruption_strategies])

    async def _start(self, frame: StartFrame):
        self._create_aggregation_task()

    async def _stop(self, frame: EndFrame):
        await self._cancel_aggregation_task()

    async def _cancel(self, frame: CancelFrame):
        await self._cancel_aggregation_task()

    async def _handle_llm_run(self, frame: LLMRunFrame):
        await self.push_context_frame()

    async def _handle_llm_messages_append(self, frame: LLMMessagesAppendFrame):
        self.add_messages(frame.messages)
        if frame.run_llm:
            await self.push_context_frame()

    async def _handle_llm_messages_update(self, frame: LLMMessagesUpdateFrame):
        self.set_messages(frame.messages)
        if frame.run_llm:
            await self.push_context_frame()

    async def _handle_input_audio(self, frame: InputAudioRawFrame):
        for s in self.interruption_strategies:
            await s.append_audio(frame.audio, frame.sample_rate)

    async def _handle_user_started_speaking(self, frame: UserStartedSpeakingFrame):
        self._user_speaking = True
        self._waiting_for_aggregation = True
        self._was_bot_speaking = self._bot_speaking

        # If we get a non-emulated UserStartedSpeakingFrame but we are in the
        # middle of emulating VAD, let's stop emulating VAD (i.e. don't send the
        # EmulateUserStoppedSpeakingFrame).
        if not frame.emulated and self._emulating_vad:
            self._emulating_vad = False

    async def _handle_user_stopped_speaking(self, _: UserStoppedSpeakingFrame):
        self._user_speaking = False
        # We just stopped speaking. Let's see if there's some aggregation to
        # push. If the last thing we saw is an interim transcription, let's wait
        # pushing the aggregation as we will probably get a final transcription.
        if len(self._aggregation) > 0:
            if not self._seen_interim_results:
                await self._push_aggregation()
        # Handles the case where both the user and the bot are not speaking,
        # and the bot was previously speaking before the user interruption.
        # So in this case we are resetting the aggregation timer
        elif not self._seen_interim_results and self._was_bot_speaking and not self._bot_speaking:
            # Reset aggregation timer.
            self._aggregation_event.set()

    async def _handle_bot_started_speaking(self, _: BotStartedSpeakingFrame):
        self._bot_speaking = True

    async def _handle_bot_stopped_speaking(self, _: BotStoppedSpeakingFrame):
        self._bot_speaking = False

    async def _handle_transcription(self, frame: TranscriptionFrame):
        text = frame.text

        # Make sure we really have some text.
        if not text.strip():
            return

        self._aggregation += f" {text}" if self._aggregation else text
        # We just got a final result, so let's reset interim results.
        self._seen_interim_results = False
        # Reset aggregation timer.
        self._aggregation_event.set()

    async def _handle_interim_transcription(self, _: InterimTranscriptionFrame):
        self._seen_interim_results = True

    def _create_aggregation_task(self):
        if not self._aggregation_task:
            self._aggregation_task = self.create_task(self._aggregation_task_handler())

    async def _cancel_aggregation_task(self):
        if self._aggregation_task:
            await self.cancel_task(self._aggregation_task)
            self._aggregation_task = None

    async def _aggregation_task_handler(self):
        while True:
            try:
                # The _aggregation_task_handler handles two distinct timeout scenarios:
                #
                # 1. When emulating_vad=True: Wait for emulated VAD timeout before
                #    pushing aggregation (simulating VAD behavior when no actual VAD
                #    detection occurred).
                #
                # 2. When emulating_vad=False: Use aggregation_timeout as a buffer
                #    to wait for potential late-arriving transcription frames after
                #    a real VAD event.
                #
                # For emulated VAD scenarios, the timeout strategy depends on whether
                # a turn analyzer is configured:
                #
                # - WITH turn analyzer: Use turn_emulated_vad_timeout parameter because
                #   the VAD's stop_secs is set very low (e.g. 0.2s) for rapid speech
                #   chunking to feed the turn analyzer. This low value is too fast
                #   for emulated VAD scenarios where we need to allow users time to
                #   finish speaking (e.g. 0.8s).
                #
                # - WITHOUT turn analyzer: Use VAD's stop_secs directly to maintain
                #   consistent user experience between real VAD detection and
                #   emulated VAD scenarios.
                if not self._emulating_vad:
                    timeout = self._params.aggregation_timeout
                elif self._turn_params:
                    timeout = self._params.turn_emulated_vad_timeout
                else:
                    # Use VAD stop_secs when no turn analyzer is present, fallback if no VAD params
                    timeout = (
                        self._vad_params.stop_secs
                        if self._vad_params
                        else self._params.turn_emulated_vad_timeout
                    )
                await asyncio.wait_for(self._aggregation_event.wait(), timeout=timeout)
                await self._maybe_emulate_user_speaking()
            except asyncio.TimeoutError:
                if not self._user_speaking:
                    await self._push_aggregation()

                # If we are emulating VAD we still need to send the user stopped
                # speaking frame.
                if self._emulating_vad:
                    await self.push_frame(
                        EmulateUserStoppedSpeakingFrame(), FrameDirection.UPSTREAM
                    )
                    self._emulating_vad = False
            finally:
                self._aggregation_event.clear()

    async def _maybe_emulate_user_speaking(self):
        """Maybe emulate user speaking based on transcription.

        Emulate user speaking if we got a transcription but it was not
        detected by VAD. Behavior when bot is speaking depends on the
        enable_emulated_vad_interruptions parameter.
        """
        # Check if we received a transcription but VAD was not able to detect
        # voice (e.g. when you whisper a short utterance). In that case, we need
        # to emulate VAD (i.e. user start/stopped speaking), but we do it only
        # if the bot is not speaking. If the bot is speaking and we really have
        # a short utterance we don't really want to interrupt the bot.
        if (
            not self._user_speaking
            and not self._waiting_for_aggregation
            and len(self._aggregation) > 0
        ):
            if self._bot_speaking and not self._params.enable_emulated_vad_interruptions:
                # If emulated VAD interruptions are disabled and bot is speaking, ignore
                logger.debug("Ignoring user speaking emulation, bot is speaking.")
                await self.reset()
            else:
                # Either bot is not speaking, or emulated VAD interruptions are enabled
                # - trigger user speaking emulation.
                await self.push_frame(EmulateUserStartedSpeakingFrame(), FrameDirection.UPSTREAM)
                self._emulating_vad = True


class LLMAssistantAggregator(LLMContextAggregator):
    """Assistant LLM aggregator that processes bot responses and function calls.

    This aggregator handles the complex logic of processing assistant responses including:

    - Text frame aggregation between response start/end markers
    - Function call lifecycle management
    - Context updates with timestamps
    - Tool execution and result handling
    - Interruption handling during responses

    The aggregator manages function calls in progress and coordinates between
    text generation and tool execution phases of LLM responses.
    """

    def __init__(
        self,
        context: LLMContext,
        *,
        params: Optional[LLMAssistantAggregatorParams] = None,
        **kwargs,
    ):
        """Initialize the assistant context aggregator.

        Args:
            context: The OpenAI LLM context for conversation storage.
            params: Configuration parameters for aggregation behavior.
            **kwargs: Additional arguments. Supports deprecated 'expect_stripped_words'.
        """
        super().__init__(context=context, role="assistant", **kwargs)
        self._params = params or LLMAssistantAggregatorParams()

        if "expect_stripped_words" in kwargs:
            import warnings

            with warnings.catch_warnings():
                warnings.simplefilter("always")
                warnings.warn(
                    "Parameter 'expect_stripped_words' is deprecated, use 'params' instead.",
                    DeprecationWarning,
                )

            self._params.expect_stripped_words = kwargs["expect_stripped_words"]

        self._started = 0
        self._function_calls_in_progress: Dict[str, Optional[FunctionCallInProgressFrame]] = {}
        self._context_updated_tasks: Set[asyncio.Task] = set()

    @property
    def has_function_calls_in_progress(self) -> bool:
        """Check if there are any function calls currently in progress.

        Returns:
            True if function calls are in progress, False otherwise.
        """
        return bool(self._function_calls_in_progress)

    async def process_frame(self, frame: Frame, direction: FrameDirection):
        """Process frames for assistant response aggregation and function call management.

        Args:
            frame: The frame to process.
            direction: The direction of frame flow in the pipeline.
        """
        await super().process_frame(frame, direction)

        if isinstance(frame, StartInterruptionFrame):
            await self._handle_interruptions(frame)
            await self.push_frame(frame, direction)
        elif isinstance(frame, LLMFullResponseStartFrame):
            await self._handle_llm_start(frame)
        elif isinstance(frame, LLMFullResponseEndFrame):
            await self._handle_llm_end(frame)
        elif isinstance(frame, TextFrame):
            await self._handle_text(frame)
        elif isinstance(frame, LLMRunFrame):
            await self._handle_llm_run(frame)
        elif isinstance(frame, LLMMessagesAppendFrame):
            await self._handle_llm_messages_append(frame)
        elif isinstance(frame, LLMMessagesUpdateFrame):
            await self._handle_llm_messages_update(frame)
        elif isinstance(frame, LLMSetToolsFrame):
            self.set_tools(frame.tools)
        elif isinstance(frame, LLMSetToolChoiceFrame):
            self.set_tool_choice(frame.tool_choice)
        elif isinstance(frame, FunctionCallsStartedFrame):
            await self._handle_function_calls_started(frame)
        elif isinstance(frame, FunctionCallInProgressFrame):
            await self._handle_function_call_in_progress(frame)
        elif isinstance(frame, FunctionCallResultFrame):
            await self._handle_function_call_result(frame)
        elif isinstance(frame, FunctionCallCancelFrame):
            await self._handle_function_call_cancel(frame)
        elif isinstance(frame, UserImageRawFrame) and frame.request and frame.request.tool_call_id:
            await self._handle_user_image_frame(frame)
        elif isinstance(frame, BotStoppedSpeakingFrame):
            await self._push_aggregation()
            await self.push_frame(frame, direction)
        else:
            await self.push_frame(frame, direction)

    async def _push_aggregation(self):
        """Push the current assistant aggregation with timestamp."""
        if not self._aggregation:
            return

        aggregation = self._aggregation.strip()
        await self.reset()

        if aggregation:
            self._context.add_message({"role": "assistant", "content": aggregation})

        # Push context frame
        await self.push_context_frame()

        # Push timestamp frame with current time
        timestamp_frame = LLMContextAssistantTimestampFrame(timestamp=time_now_iso8601())
        await self.push_frame(timestamp_frame)

    async def _handle_llm_run(self, frame: LLMRunFrame):
        await self.push_context_frame(FrameDirection.UPSTREAM)

    async def _handle_llm_messages_append(self, frame: LLMMessagesAppendFrame):
        self.add_messages(frame.messages)
        if frame.run_llm:
            await self.push_context_frame(FrameDirection.UPSTREAM)

    async def _handle_llm_messages_update(self, frame: LLMMessagesUpdateFrame):
        self.set_messages(frame.messages)
        if frame.run_llm:
            await self.push_context_frame(FrameDirection.UPSTREAM)

    async def _handle_interruptions(self, frame: StartInterruptionFrame):
        await self._push_aggregation()
        self._started = 0
        await self.reset()

    async def _handle_function_calls_started(self, frame: FunctionCallsStartedFrame):
        function_names = [f"{f.function_name}:{f.tool_call_id}" for f in frame.function_calls]
        logger.debug(f"{self} FunctionCallsStartedFrame: {function_names}")
        for function_call in frame.function_calls:
            self._function_calls_in_progress[function_call.tool_call_id] = None

    async def _handle_function_call_in_progress(self, frame: FunctionCallInProgressFrame):
        logger.debug(
            f"{self} FunctionCallInProgressFrame: [{frame.function_name}:{frame.tool_call_id}]"
        )

        # Update context with the in-progress function call
        self._context.add_message(
            {
                "role": "assistant",
                "tool_calls": [
                    {
                        "id": frame.tool_call_id,
                        "function": {
                            "name": frame.function_name,
                            "arguments": json.dumps(frame.arguments),
                        },
                        "type": "function",
                    }
                ],
            }
        )
        self._context.add_message(
            {
                "role": "tool",
                "content": "IN_PROGRESS",
                "tool_call_id": frame.tool_call_id,
            }
        )

        self._function_calls_in_progress[frame.tool_call_id] = frame

    async def _handle_function_call_result(self, frame: FunctionCallResultFrame):
        logger.debug(
            f"{self} FunctionCallResultFrame: [{frame.function_name}:{frame.tool_call_id}]"
        )
        if frame.tool_call_id not in self._function_calls_in_progress:
            logger.warning(
                f"FunctionCallResultFrame tool_call_id [{frame.tool_call_id}] is not running"
            )
            return

        del self._function_calls_in_progress[frame.tool_call_id]

        properties = frame.properties

        # Update context with the function call result
        if frame.result:
            result = json.dumps(frame.result)
            self._update_function_call_result(frame.function_name, frame.tool_call_id, result)
        else:
            self._update_function_call_result(frame.function_name, frame.tool_call_id, "COMPLETED")

        run_llm = False

        # Run inference if the function call result requires it.
        if frame.result:
            if properties and properties.run_llm is not None:
                # If the tool call result has a run_llm property, use it.
                run_llm = properties.run_llm
            elif frame.run_llm is not None:
                # If the frame is indicating we should run the LLM, do it.
                run_llm = frame.run_llm
            else:
                # If this is the last function call in progress, run the LLM.
                run_llm = not bool(self._function_calls_in_progress)

        if run_llm:
            await self.push_context_frame(FrameDirection.UPSTREAM)

        # Call the `on_context_updated` callback once the function call result
        # is added to the context. Also, run this in a separate task to make
        # sure we don't block the pipeline.
        if properties and properties.on_context_updated:
            task_name = f"{frame.function_name}:{frame.tool_call_id}:on_context_updated"
            task = self.create_task(properties.on_context_updated(), task_name)
            self._context_updated_tasks.add(task)
            task.add_done_callback(self._context_updated_task_finished)

    async def _handle_function_call_cancel(self, frame: FunctionCallCancelFrame):
        logger.debug(
            f"{self} FunctionCallCancelFrame: [{frame.function_name}:{frame.tool_call_id}]"
        )
        if frame.tool_call_id not in self._function_calls_in_progress:
            return

        if self._function_calls_in_progress[frame.tool_call_id].cancel_on_interruption:
            # Update context with the function call cancellation
            self._update_function_call_result(frame.function_name, frame.tool_call_id, "CANCELLED")
            del self._function_calls_in_progress[frame.tool_call_id]

    def _update_function_call_result(self, function_name: str, tool_call_id: str, result: Any):
        for message in self._context.get_messages():
            if (
                not isinstance(message, LLMSpecificMessage)
                and message["role"] == "tool"
                and message["tool_call_id"]
                and message["tool_call_id"] == tool_call_id
            ):
                message["content"] = result

    async def _handle_user_image_frame(self, frame: UserImageRawFrame):
        logger.debug(
            f"{self} UserImageRawFrame: [{frame.request.function_name}:{frame.request.tool_call_id}]"
        )

        if frame.request.tool_call_id not in self._function_calls_in_progress:
            logger.warning(
                f"UserImageRawFrame tool_call_id [{frame.request.tool_call_id}] is not running"
            )
            return

        del self._function_calls_in_progress[frame.request.tool_call_id]

        # Update context with the image frame
        self._update_function_call_result(
            frame.request.function_name, frame.request.tool_call_id, "COMPLETED"
        )
        self._context.add_image_frame_message(
            format=frame.format,
            size=frame.size,
            image=frame.image,
            text=frame.request.context,
        )

        await self._push_aggregation()
        await self.push_context_frame(FrameDirection.UPSTREAM)

    async def _handle_llm_start(self, _: LLMFullResponseStartFrame):
        self._started += 1

    async def _handle_llm_end(self, _: LLMFullResponseEndFrame):
        self._started -= 1
        await self._push_aggregation()

    async def _handle_text(self, frame: TextFrame):
        if not self._started:
            return

        if self._params.expect_stripped_words:
            self._aggregation += f" {frame.text}" if self._aggregation else frame.text
        else:
            self._aggregation += frame.text

    def _context_updated_task_finished(self, task: asyncio.Task):
        self._context_updated_tasks.discard(task)


class LLMContextAggregatorPair:
    """Pair of LLM context aggregators for updating context with user and assistant messages."""

    def __init__(
        self,
        context: LLMContext,
        *,
        user_params: LLMUserAggregatorParams = LLMUserAggregatorParams(),
        assistant_params: LLMAssistantAggregatorParams = LLMAssistantAggregatorParams(),
    ):
        """Initialize the LLM context aggregator pair.

        Args:
            context: The context to be managed by the aggregators.
            user_params: Parameters for the user context aggregator.
            assistant_params: Parameters for the assistant context aggregator.
        """
        self._user = LLMUserAggregator(context, params=user_params)
        self._assistant = LLMAssistantAggregator(context, params=assistant_params)

    def user(self) -> LLMUserAggregator:
        """Get the user context aggregator.

        Returns:
            The user context aggregator instance.
        """
        return self._user

    def assistant(self) -> LLMAssistantAggregator:
        """Get the assistant context aggregator.

        Returns:
            The assistant context aggregator instance.
        """
        return self._assistant
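
For orientation, a brief hypothetical usage sketch of the LLMContextAggregatorPair added in this release follows. Only the aggregator API shown above (LLMContextAggregatorPair, user(), assistant(), add_messages(), and the aggregation_timeout parameter) comes from the diffed file; the LLMContext and params constructor calls and the pipeline-placement hints are assumptions for illustration, not part of this diff.

# Hypothetical sketch (not part of this release's code) of creating and using
# the new universal aggregator pair.
from pipecat.processors.aggregators.llm_context import LLMContext
from pipecat.processors.aggregators.llm_response import LLMUserAggregatorParams
from pipecat.processors.aggregators.llm_response_universal import LLMContextAggregatorPair

context = LLMContext()  # assumed default constructor; see llm_context.py in this diff
pair = LLMContextAggregatorPair(
    context,
    user_params=LLMUserAggregatorParams(aggregation_timeout=1.0),  # assumed keyword; the field is referenced in the diff
)

user_aggregator = pair.user()            # typically placed after STT in a pipeline
assistant_aggregator = pair.assistant()  # typically placed after the LLM/TTS output

# Either aggregator can seed or append conversation messages in the shared context.
user_aggregator.add_messages([{"role": "system", "content": "You are a helpful assistant."}])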