dv-pipecat-ai 0.0.82.dev857__py3-none-any.whl → 0.0.85.dev837__py3-none-any.whl
This diff shows the changes between publicly available package versions as released to one of the supported registries. It is provided for informational purposes only and reflects the packages as they appear in their respective public registries.
Potentially problematic release.
This version of dv-pipecat-ai might be problematic.
- {dv_pipecat_ai-0.0.82.dev857.dist-info → dv_pipecat_ai-0.0.85.dev837.dist-info}/METADATA +98 -130
- {dv_pipecat_ai-0.0.82.dev857.dist-info → dv_pipecat_ai-0.0.85.dev837.dist-info}/RECORD +192 -140
- pipecat/adapters/base_llm_adapter.py +38 -1
- pipecat/adapters/services/anthropic_adapter.py +9 -14
- pipecat/adapters/services/aws_nova_sonic_adapter.py +120 -5
- pipecat/adapters/services/bedrock_adapter.py +236 -13
- pipecat/adapters/services/gemini_adapter.py +12 -8
- pipecat/adapters/services/open_ai_adapter.py +19 -7
- pipecat/adapters/services/open_ai_realtime_adapter.py +5 -0
- pipecat/audio/dtmf/dtmf-0.wav +0 -0
- pipecat/audio/dtmf/dtmf-1.wav +0 -0
- pipecat/audio/dtmf/dtmf-2.wav +0 -0
- pipecat/audio/dtmf/dtmf-3.wav +0 -0
- pipecat/audio/dtmf/dtmf-4.wav +0 -0
- pipecat/audio/dtmf/dtmf-5.wav +0 -0
- pipecat/audio/dtmf/dtmf-6.wav +0 -0
- pipecat/audio/dtmf/dtmf-7.wav +0 -0
- pipecat/audio/dtmf/dtmf-8.wav +0 -0
- pipecat/audio/dtmf/dtmf-9.wav +0 -0
- pipecat/audio/dtmf/dtmf-pound.wav +0 -0
- pipecat/audio/dtmf/dtmf-star.wav +0 -0
- pipecat/audio/filters/krisp_viva_filter.py +193 -0
- pipecat/audio/filters/noisereduce_filter.py +15 -0
- pipecat/audio/turn/base_turn_analyzer.py +9 -1
- pipecat/audio/turn/smart_turn/base_smart_turn.py +14 -8
- pipecat/audio/turn/smart_turn/data/__init__.py +0 -0
- pipecat/audio/turn/smart_turn/data/smart-turn-v3.0.onnx +0 -0
- pipecat/audio/turn/smart_turn/http_smart_turn.py +6 -2
- pipecat/audio/turn/smart_turn/local_smart_turn.py +1 -1
- pipecat/audio/turn/smart_turn/local_smart_turn_v2.py +1 -1
- pipecat/audio/turn/smart_turn/local_smart_turn_v3.py +124 -0
- pipecat/audio/vad/data/README.md +10 -0
- pipecat/audio/vad/data/silero_vad_v2.onnx +0 -0
- pipecat/audio/vad/silero.py +9 -3
- pipecat/audio/vad/vad_analyzer.py +13 -1
- pipecat/extensions/voicemail/voicemail_detector.py +5 -5
- pipecat/frames/frames.py +277 -86
- pipecat/observers/loggers/debug_log_observer.py +3 -3
- pipecat/observers/loggers/llm_log_observer.py +7 -3
- pipecat/observers/loggers/user_bot_latency_log_observer.py +22 -10
- pipecat/pipeline/runner.py +18 -6
- pipecat/pipeline/service_switcher.py +64 -36
- pipecat/pipeline/task.py +125 -79
- pipecat/pipeline/tts_switcher.py +30 -0
- pipecat/processors/aggregators/dtmf_aggregator.py +2 -3
- pipecat/processors/aggregators/{gated_openai_llm_context.py → gated_llm_context.py} +9 -9
- pipecat/processors/aggregators/gated_open_ai_llm_context.py +12 -0
- pipecat/processors/aggregators/llm_context.py +40 -2
- pipecat/processors/aggregators/llm_response.py +32 -15
- pipecat/processors/aggregators/llm_response_universal.py +19 -15
- pipecat/processors/aggregators/user_response.py +6 -6
- pipecat/processors/aggregators/vision_image_frame.py +24 -2
- pipecat/processors/audio/audio_buffer_processor.py +43 -8
- pipecat/processors/dtmf_aggregator.py +174 -77
- pipecat/processors/filters/stt_mute_filter.py +17 -0
- pipecat/processors/frame_processor.py +110 -24
- pipecat/processors/frameworks/langchain.py +8 -2
- pipecat/processors/frameworks/rtvi.py +210 -68
- pipecat/processors/frameworks/strands_agents.py +170 -0
- pipecat/processors/logger.py +2 -2
- pipecat/processors/transcript_processor.py +26 -5
- pipecat/processors/user_idle_processor.py +35 -11
- pipecat/runner/daily.py +59 -20
- pipecat/runner/run.py +395 -93
- pipecat/runner/types.py +6 -4
- pipecat/runner/utils.py +51 -10
- pipecat/serializers/__init__.py +5 -1
- pipecat/serializers/asterisk.py +16 -2
- pipecat/serializers/convox.py +41 -4
- pipecat/serializers/custom.py +257 -0
- pipecat/serializers/exotel.py +5 -5
- pipecat/serializers/livekit.py +20 -0
- pipecat/serializers/plivo.py +5 -5
- pipecat/serializers/protobuf.py +6 -5
- pipecat/serializers/telnyx.py +2 -2
- pipecat/serializers/twilio.py +43 -23
- pipecat/serializers/vi.py +324 -0
- pipecat/services/ai_service.py +2 -6
- pipecat/services/anthropic/llm.py +2 -25
- pipecat/services/assemblyai/models.py +6 -0
- pipecat/services/assemblyai/stt.py +13 -5
- pipecat/services/asyncai/tts.py +5 -3
- pipecat/services/aws/__init__.py +1 -0
- pipecat/services/aws/llm.py +147 -105
- pipecat/services/aws/nova_sonic/__init__.py +0 -0
- pipecat/services/aws/nova_sonic/context.py +436 -0
- pipecat/services/aws/nova_sonic/frames.py +25 -0
- pipecat/services/aws/nova_sonic/llm.py +1265 -0
- pipecat/services/aws/stt.py +3 -3
- pipecat/services/aws_nova_sonic/__init__.py +19 -1
- pipecat/services/aws_nova_sonic/aws.py +11 -1151
- pipecat/services/aws_nova_sonic/context.py +8 -354
- pipecat/services/aws_nova_sonic/frames.py +13 -17
- pipecat/services/azure/llm.py +51 -1
- pipecat/services/azure/realtime/__init__.py +0 -0
- pipecat/services/azure/realtime/llm.py +65 -0
- pipecat/services/azure/stt.py +15 -0
- pipecat/services/cartesia/stt.py +77 -70
- pipecat/services/cartesia/tts.py +80 -13
- pipecat/services/deepgram/__init__.py +1 -0
- pipecat/services/deepgram/flux/__init__.py +0 -0
- pipecat/services/deepgram/flux/stt.py +640 -0
- pipecat/services/elevenlabs/__init__.py +4 -1
- pipecat/services/elevenlabs/stt.py +339 -0
- pipecat/services/elevenlabs/tts.py +87 -46
- pipecat/services/fish/tts.py +5 -2
- pipecat/services/gemini_multimodal_live/events.py +38 -524
- pipecat/services/gemini_multimodal_live/file_api.py +23 -173
- pipecat/services/gemini_multimodal_live/gemini.py +41 -1403
- pipecat/services/gladia/stt.py +56 -72
- pipecat/services/google/__init__.py +1 -0
- pipecat/services/google/gemini_live/__init__.py +3 -0
- pipecat/services/google/gemini_live/file_api.py +189 -0
- pipecat/services/google/gemini_live/llm.py +1582 -0
- pipecat/services/google/gemini_live/llm_vertex.py +184 -0
- pipecat/services/google/llm.py +15 -11
- pipecat/services/google/llm_openai.py +3 -3
- pipecat/services/google/llm_vertex.py +86 -16
- pipecat/services/google/stt.py +4 -0
- pipecat/services/google/tts.py +7 -3
- pipecat/services/heygen/api.py +2 -0
- pipecat/services/heygen/client.py +8 -4
- pipecat/services/heygen/video.py +2 -0
- pipecat/services/hume/__init__.py +5 -0
- pipecat/services/hume/tts.py +220 -0
- pipecat/services/inworld/tts.py +6 -6
- pipecat/services/llm_service.py +15 -5
- pipecat/services/lmnt/tts.py +4 -2
- pipecat/services/mcp_service.py +4 -2
- pipecat/services/mem0/memory.py +6 -5
- pipecat/services/mistral/llm.py +29 -8
- pipecat/services/moondream/vision.py +42 -16
- pipecat/services/neuphonic/tts.py +5 -2
- pipecat/services/openai/__init__.py +1 -0
- pipecat/services/openai/base_llm.py +27 -20
- pipecat/services/openai/realtime/__init__.py +0 -0
- pipecat/services/openai/realtime/context.py +272 -0
- pipecat/services/openai/realtime/events.py +1106 -0
- pipecat/services/openai/realtime/frames.py +37 -0
- pipecat/services/openai/realtime/llm.py +829 -0
- pipecat/services/openai/tts.py +49 -10
- pipecat/services/openai_realtime/__init__.py +27 -0
- pipecat/services/openai_realtime/azure.py +21 -0
- pipecat/services/openai_realtime/context.py +21 -0
- pipecat/services/openai_realtime/events.py +21 -0
- pipecat/services/openai_realtime/frames.py +21 -0
- pipecat/services/openai_realtime_beta/azure.py +16 -0
- pipecat/services/openai_realtime_beta/openai.py +17 -5
- pipecat/services/piper/tts.py +7 -9
- pipecat/services/playht/tts.py +34 -4
- pipecat/services/rime/tts.py +12 -12
- pipecat/services/riva/stt.py +3 -1
- pipecat/services/salesforce/__init__.py +9 -0
- pipecat/services/salesforce/llm.py +700 -0
- pipecat/services/sarvam/__init__.py +7 -0
- pipecat/services/sarvam/stt.py +540 -0
- pipecat/services/sarvam/tts.py +97 -13
- pipecat/services/simli/video.py +2 -2
- pipecat/services/speechmatics/stt.py +22 -10
- pipecat/services/stt_service.py +47 -0
- pipecat/services/tavus/video.py +2 -2
- pipecat/services/tts_service.py +75 -22
- pipecat/services/vision_service.py +7 -6
- pipecat/services/vistaar/llm.py +51 -9
- pipecat/tests/utils.py +4 -4
- pipecat/transcriptions/language.py +41 -1
- pipecat/transports/base_input.py +13 -34
- pipecat/transports/base_output.py +140 -104
- pipecat/transports/daily/transport.py +199 -26
- pipecat/transports/heygen/__init__.py +0 -0
- pipecat/transports/heygen/transport.py +381 -0
- pipecat/transports/livekit/transport.py +228 -63
- pipecat/transports/local/audio.py +6 -1
- pipecat/transports/local/tk.py +11 -2
- pipecat/transports/network/fastapi_websocket.py +1 -1
- pipecat/transports/smallwebrtc/connection.py +103 -19
- pipecat/transports/smallwebrtc/request_handler.py +246 -0
- pipecat/transports/smallwebrtc/transport.py +65 -23
- pipecat/transports/tavus/transport.py +23 -12
- pipecat/transports/websocket/client.py +41 -5
- pipecat/transports/websocket/fastapi.py +21 -11
- pipecat/transports/websocket/server.py +14 -7
- pipecat/transports/whatsapp/api.py +8 -0
- pipecat/transports/whatsapp/client.py +47 -0
- pipecat/utils/base_object.py +54 -22
- pipecat/utils/redis.py +58 -0
- pipecat/utils/string.py +13 -1
- pipecat/utils/tracing/service_decorators.py +21 -21
- pipecat/serializers/genesys.py +0 -95
- pipecat/services/google/test-google-chirp.py +0 -45
- pipecat/services/openai.py +0 -698
- {dv_pipecat_ai-0.0.82.dev857.dist-info → dv_pipecat_ai-0.0.85.dev837.dist-info}/WHEEL +0 -0
- {dv_pipecat_ai-0.0.82.dev857.dist-info → dv_pipecat_ai-0.0.85.dev837.dist-info}/licenses/LICENSE +0 -0
- {dv_pipecat_ai-0.0.82.dev857.dist-info → dv_pipecat_ai-0.0.85.dev837.dist-info}/top_level.txt +0 -0
- /pipecat/services/{aws_nova_sonic → aws/nova_sonic}/ready.wav +0 -0
@@ -15,9 +15,10 @@ service-specific adapter.
 """
 
 import base64
+import copy
 import io
 from dataclasses import dataclass
-from typing import Any, List, Optional, TypeAlias, Union
+from typing import TYPE_CHECKING, Any, List, Optional, TypeAlias, Union
 
 from loguru import logger
 from openai._types import NOT_GIVEN as OPEN_AI_NOT_GIVEN
@@ -31,6 +32,9 @@ from PIL import Image
 from pipecat.adapters.schemas.tools_schema import ToolsSchema
 from pipecat.frames.frames import AudioRawFrame
 
+if TYPE_CHECKING:
+    from pipecat.processors.aggregators.openai_llm_context import OpenAILLMContext
+
 # "Re-export" types from OpenAI that we're using as universal context types.
 # NOTE: if universal message types need to someday diverge from OpenAI's, we
 # should consider managing our own definitions. But we should do so carefully,
@@ -65,6 +69,26 @@ class LLMContext:
     and content formatting.
     """
 
+    @staticmethod
+    def from_openai_context(openai_context: "OpenAILLMContext") -> "LLMContext":
+        """Create a universal LLM context from an OpenAI-specific context.
+
+        NOTE: this should only be used internally, for facilitating migration
+        from OpenAILLMContext to LLMContext. New user code should use
+        LLMContext directly.
+
+        Args:
+            openai_context: The OpenAI LLM context to convert.
+
+        Returns:
+            New LLMContext instance with converted messages and settings.
+        """
+        return LLMContext(
+            messages=openai_context.get_messages(),
+            tools=openai_context.tools,
+            tool_choice=openai_context.tool_choice,
+        )
+
     def __init__(
         self,
         messages: Optional[List[LLMContextMessage]] = None,
@@ -82,6 +106,19 @@ class LLMContext:
         self._tools: ToolsSchema | NotGiven = LLMContext._normalize_and_validate_tools(tools)
         self._tool_choice: LLMContextToolChoice | NotGiven = tool_choice
 
+    @property
+    def messages(self) -> List[LLMContextMessage]:
+        """Get the current messages list.
+
+        NOTE: This is equivalent to calling `get_messages()` with no filter. If
+        you want to filter out LLM-specific messages that don't pertain to your
+        LLM, use `get_messages()` directly.
+
+        Returns:
+            List of conversation messages.
+        """
+        return self.get_messages()
+
     def get_messages(self, llm_specific_filter: Optional[str] = None) -> List[LLMContextMessage]:
         """Get the current messages list.
 
@@ -89,7 +126,8 @@ class LLMContext:
             llm_specific_filter: Optional filter to return LLM-specific
                 messages for the given LLM, in addition to the standard
                 messages. If messages end up being filtered, an error will be
-                logged
+                logged; this is intended to catch accidental use of
+                incompatible LLM-specific messages.
 
         Returns:
             List of conversation messages.
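Note on the hunks above: the new `from_openai_context()` helper and the `messages` property give a migration path from `OpenAILLMContext` to the universal `LLMContext`. A minimal sketch of that conversion, assuming an `OpenAILLMContext` built elsewhere (its constructor arguments here are illustrative, not taken from this diff):

    from pipecat.processors.aggregators.llm_context import LLMContext
    from pipecat.processors.aggregators.openai_llm_context import OpenAILLMContext

    # Assumed: an OpenAI-specific context created elsewhere in the pipeline.
    openai_context = OpenAILLMContext(messages=[{"role": "system", "content": "You are helpful."}])

    # Convert to the universal context (intended for internal migration, per the docstring above).
    context = LLMContext.from_openai_context(openai_context)

    # The new `messages` property is equivalent to get_messages() with no filter.
    assert context.messages == context.get_messages()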
@@ -23,7 +23,6 @@ from pipecat.audio.interruptions.base_interruption_strategy import BaseInterrupt
 from pipecat.audio.turn.smart_turn.base_smart_turn import SmartTurnParams
 from pipecat.audio.vad.vad_analyzer import VADParams
 from pipecat.frames.frames import (
-    BotInterruptionFrame,
     BotStartedSpeakingFrame,
     BotStoppedSpeakingFrame,
     CancelFrame,
@@ -37,6 +36,7 @@ from pipecat.frames.frames import (
     FunctionCallsStartedFrame,
     InputAudioRawFrame,
     InterimTranscriptionFrame,
+    InterruptionFrame,
     LLMFullResponseEndFrame,
     LLMFullResponseStartFrame,
     LLMMessagesAppendFrame,
@@ -49,8 +49,8 @@ from pipecat.frames.frames import (
     OpenAILLMContextAssistantTimestampFrame,
     SpeechControlParamsFrame,
     StartFrame,
-    StartInterruptionFrame,
     TextFrame,
+    TranscriptDropFrame,
     TranscriptionFrame,
     UserImageRawFrame,
     UserStartedSpeakingFrame,
@@ -139,7 +139,7 @@ class LLMFullResponseAggregator(FrameProcessor):
         """
         await super().process_frame(frame, direction)
 
-        if isinstance(frame,
+        if isinstance(frame, InterruptionFrame):
             await self._call_event_handler("on_completion", self._aggregation, False)
             self._aggregation = ""
             self._started = False
@@ -446,6 +446,7 @@ class LLMUserContextAggregator(LLMContextResponseAggregator):
         self._latest_final_transcript = ""
         self._last_user_speaking_time = 0
         self._last_aggregation_push_time = 0
+        self._pending_transcription_ids: List[int] = []
 
     async def reset(self):
         """Reset the aggregation state and interruption strategies."""
@@ -453,6 +454,7 @@ class LLMUserContextAggregator(LLMContextResponseAggregator):
         self._was_bot_speaking = False
         self._seen_interim_results = False
         self._waiting_for_aggregation = False
+        self._pending_transcription_ids.clear()
         [await s.reset() for s in self._interruption_strategies]
 
     async def handle_aggregation(self, aggregation: str):
@@ -470,8 +472,8 @@ class LLMUserContextAggregator(LLMContextResponseAggregator):
             frame: The frame to process.
             direction: The direction of frame flow in the pipeline.
         """
-        if isinstance(frame,
-            self.logger.debug("Received
+        if isinstance(frame, InterruptionFrame):
+            self.logger.debug("Received InterruptionFrame")
         await super().process_frame(frame, direction)
 
         if isinstance(frame, StartFrame):
@@ -516,9 +518,6 @@ class LLMUserContextAggregator(LLMContextResponseAggregator):
             self.set_tools(frame.tools)
         elif isinstance(frame, LLMSetToolChoiceFrame):
             self.set_tool_choice(frame.tool_choice)
-        elif isinstance(frame, LLMFullResponseStartFrame):
-            self._last_llm_response_start_time = time.time()
-            self._latest_final_transcript = ""
         elif isinstance(frame, SpeechControlParamsFrame):
             self._vad_params = frame.vad_params
             self._turn_params = frame.turn_params
@@ -545,13 +544,14 @@ class LLMUserContextAggregator(LLMContextResponseAggregator):
 
                 if should_interrupt:
                     self.logger.debug(
-                        "Interruption conditions met - pushing
+                        "Interruption conditions met - pushing interruption and aggregation"
                     )
-                    await self.
+                    await self.push_interruption_task_frame_and_wait()
                     await self._process_aggregation()
                 else:
                     self.logger.debug("Interruption conditions not met - not pushing aggregation")
-                    # Don't process aggregation,
+                    # Don't process aggregation, discard pending transcriptions and reset
+                    await self._discard_pending_transcriptions("interruption_conditions_not_met")
                     await self.reset()
             else:
                 if trigger_interruption:
@@ -559,7 +559,7 @@ class LLMUserContextAggregator(LLMContextResponseAggregator):
                         "Triggering interruption - pushing BotInterruptionFrame and aggregation"
                     )
                     # await self.push_frame(BotInterruptionFrame(), FrameDirection.UPSTREAM)
-                    await self.push_frame(
+                    await self.push_frame(InterruptionFrame(), FrameDirection.DOWNSTREAM)
                     self.logger.debug("Pushed BotInterruptionFrame")
                 # No interruption config - normal behavior (always push aggregation)
                 await self._process_aggregation()
@@ -591,6 +591,13 @@ class LLMUserContextAggregator(LLMContextResponseAggregator):
 
         return any([await should_interrupt(s) for s in self._interruption_strategies])
 
+    async def _discard_pending_transcriptions(self, reason: str):
+        """Notify upstream processors that pending transcripts should be dropped."""
+        if self._pending_transcription_ids:
+            drop_frame = TranscriptDropFrame(transcript_ids=list(self._pending_transcription_ids))
+            await self.push_frame(drop_frame, FrameDirection.UPSTREAM)
+            self._pending_transcription_ids.clear()
+
     async def _start(self, frame: StartFrame):
         self._create_aggregation_task()
 
@@ -617,10 +624,19 @@ class LLMUserContextAggregator(LLMContextResponseAggregator):
         for s in self.interruption_strategies:
             await s.append_audio(frame.audio, frame.sample_rate)
 
+    async def _discard_pending_transcriptions(self, reason: str):
+        """Notify upstream processors that pending transcripts should be dropped."""
+        if self._pending_transcription_ids:
+            drop_frame = TranscriptDropFrame(transcript_ids=list(self._pending_transcription_ids))
+            await self.push_frame(drop_frame, FrameDirection.UPSTREAM)
+            self._pending_transcription_ids.clear()
+
     async def _handle_user_started_speaking(self, frame: UserStartedSpeakingFrame):
         if len(self._aggregation) > 0:
             self.logger.debug(f"Dropping {self._aggregation}")
             self._aggregation = ""
+            await self._discard_pending_transcriptions("user_started_speaking")
+        self._latest_final_transcript = ""
         self._last_user_speaking_time = time.time()
         self._user_speaking = True
         self._waiting_for_aggregation = True
@@ -664,6 +680,7 @@ class LLMUserContextAggregator(LLMContextResponseAggregator):
             return
 
         self._aggregation += f" {text}" if self._aggregation else text
+        self._pending_transcription_ids.append(frame.id)
         # We just got a final result, so let's reset interim results.
         self._seen_interim_results = False
 
@@ -686,7 +703,6 @@ class LLMUserContextAggregator(LLMContextResponseAggregator):
         elif (
             not self._bot_speaking
            and time_since_stopped < 3.0
-            and time.time() - self._last_llm_response_start_time > 3.0
             and self._latest_final_transcript != text
         ):
             self.logger.debug(
@@ -794,6 +810,7 @@ class LLMUserContextAggregator(LLMContextResponseAggregator):
         if self._bot_speaking and not self._params.enable_emulated_vad_interruptions:
             # If emulated VAD interruptions are disabled and bot is speaking, ignore
             logger.debug("Ignoring user speaking emulation, bot is speaking.")
+            await self._discard_pending_transcriptions("emulated_vad_ignored")
             await self.reset()
         else:
             # Either bot is not speaking, or emulated VAD interruptions are enabled
@@ -908,7 +925,7 @@ class LLMAssistantContextAggregator(LLMContextResponseAggregator):
         """
         await super().process_frame(frame, direction)
 
-        if isinstance(frame,
+        if isinstance(frame, InterruptionFrame):
             await self._handle_interruptions(frame)
             await self.push_frame(frame, direction)
         elif isinstance(frame, LLMFullResponseStartFrame):
@@ -974,7 +991,7 @@ class LLMAssistantContextAggregator(LLMContextResponseAggregator):
             if frame.run_llm:
                 await self.push_context_frame(FrameDirection.UPSTREAM)
 
-    async def _handle_interruptions(self, frame:
+    async def _handle_interruptions(self, frame: InterruptionFrame):
         await self.push_aggregation()
         self._started = 0
         await self.reset()
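Note on the hunks above: `StartInterruptionFrame` and `BotInterruptionFrame` handling is replaced by the new `InterruptionFrame`. A minimal sketch of a custom processor following the same pattern (the processor itself is hypothetical; the frame and base classes are the ones imported in this diff):

    from pipecat.frames.frames import Frame, InterruptionFrame
    from pipecat.processors.frame_processor import FrameDirection, FrameProcessor

    class PartialTextAggregator(FrameProcessor):
        """Hypothetical processor that drops partial state when an interruption arrives."""

        def __init__(self):
            super().__init__()
            self._partial = ""

        async def process_frame(self, frame: Frame, direction: FrameDirection):
            await super().process_frame(frame, direction)
            if isinstance(frame, InterruptionFrame):
                # Reset partial aggregation on interruption, mirroring the aggregators above.
                self._partial = ""
            await self.push_frame(frame, direction)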
@@ -13,7 +13,7 @@ LLM processing, and text-to-speech components in conversational AI pipelines.
 
 import asyncio
 import json
-from
+from abc import abstractmethod
 from typing import Any, Dict, List, Literal, Optional, Set
 
 from loguru import logger
@@ -23,7 +23,6 @@ from pipecat.audio.interruptions.base_interruption_strategy import BaseInterrupt
 from pipecat.audio.turn.smart_turn.base_smart_turn import SmartTurnParams
 from pipecat.audio.vad.vad_analyzer import VADParams
 from pipecat.frames.frames import (
-    BotInterruptionFrame,
     BotStartedSpeakingFrame,
     BotStoppedSpeakingFrame,
     CancelFrame,
@@ -37,6 +36,7 @@ from pipecat.frames.frames import (
     FunctionCallsStartedFrame,
     InputAudioRawFrame,
     InterimTranscriptionFrame,
+    InterruptionFrame,
     LLMContextAssistantTimestampFrame,
     LLMContextFrame,
     LLMFullResponseEndFrame,
@@ -48,7 +48,6 @@ from pipecat.frames.frames import (
     LLMSetToolsFrame,
     SpeechControlParamsFrame,
     StartFrame,
-    StartInterruptionFrame,
     TextFrame,
     TranscriptionFrame,
     UserImageRawFrame,
@@ -171,6 +170,11 @@ class LLMContextAggregator(FrameProcessor):
         """Reset the aggregation state."""
         self._aggregation = ""
 
+    @abstractmethod
+    async def push_aggregation(self):
+        """Push the current aggregation downstream."""
+        pass
+
 
 class LLMUserAggregator(LLMContextAggregator):
     """User LLM aggregator that processes speech-to-text transcriptions.
@@ -303,7 +307,7 @@ class LLMUserAggregator(LLMContextAggregator):
         frame = LLMContextFrame(self._context)
         await self.push_frame(frame)
 
-    async def
+    async def push_aggregation(self):
         """Push the current aggregation based on interruption strategies and conditions."""
         if len(self._aggregation) > 0:
             if self.interruption_strategies and self._bot_speaking:
@@ -311,9 +315,9 @@ class LLMUserAggregator(LLMContextAggregator):
 
                 if should_interrupt:
                     logger.debug(
-                        "Interruption conditions met - pushing
+                        "Interruption conditions met - pushing interruption and aggregation"
                     )
-                    await self.
+                    await self.push_interruption_task_frame_and_wait()
                     await self._process_aggregation()
                 else:
                     logger.debug("Interruption conditions not met - not pushing aggregation")
@@ -394,7 +398,7 @@ class LLMUserAggregator(LLMContextAggregator):
             # pushing the aggregation as we will probably get a final transcription.
             if len(self._aggregation) > 0:
                 if not self._seen_interim_results:
-                    await self.
+                    await self.push_aggregation()
             # Handles the case where both the user and the bot are not speaking,
             # and the bot was previously speaking before the user interruption.
             # So in this case we are resetting the aggregation timer
@@ -473,7 +477,7 @@ class LLMUserAggregator(LLMContextAggregator):
                 await self._maybe_emulate_user_speaking()
             except asyncio.TimeoutError:
                 if not self._user_speaking:
-                    await self.
+                    await self.push_aggregation()
 
                 # If we are emulating VAD we still need to send the user stopped
                 # speaking frame.
@@ -579,7 +583,7 @@ class LLMAssistantAggregator(LLMContextAggregator):
         """
         await super().process_frame(frame, direction)
 
-        if isinstance(frame,
+        if isinstance(frame, InterruptionFrame):
             await self._handle_interruptions(frame)
             await self.push_frame(frame, direction)
         elif isinstance(frame, LLMFullResponseStartFrame):
@@ -609,12 +613,12 @@ class LLMAssistantAggregator(LLMContextAggregator):
         elif isinstance(frame, UserImageRawFrame) and frame.request and frame.request.tool_call_id:
             await self._handle_user_image_frame(frame)
         elif isinstance(frame, BotStoppedSpeakingFrame):
-            await self.
+            await self.push_aggregation()
             await self.push_frame(frame, direction)
         else:
             await self.push_frame(frame, direction)
 
-    async def
+    async def push_aggregation(self):
         """Push the current assistant aggregation with timestamp."""
         if not self._aggregation:
             return
@@ -645,8 +649,8 @@ class LLMAssistantAggregator(LLMContextAggregator):
             if frame.run_llm:
                 await self.push_context_frame(FrameDirection.UPSTREAM)
 
-    async def _handle_interruptions(self, frame:
-        await self.
+    async def _handle_interruptions(self, frame: InterruptionFrame):
+        await self.push_aggregation()
         self._started = 0
         await self.reset()
 
@@ -780,7 +784,7 @@ class LLMAssistantAggregator(LLMContextAggregator):
             text=frame.request.context,
         )
 
-        await self.
+        await self.push_aggregation()
         await self.push_context_frame(FrameDirection.UPSTREAM)
 
     async def _handle_llm_start(self, _: LLMFullResponseStartFrame):
@@ -788,7 +792,7 @@ class LLMAssistantAggregator(LLMContextAggregator):
 
     async def _handle_llm_end(self, _: LLMFullResponseEndFrame):
         self._started -= 1
-        await self.
+        await self.push_aggregation()
 
     async def _handle_text(self, frame: TextFrame):
         if not self._started:
@@ -12,14 +12,14 @@ in conversational pipelines.
 """
 
 from pipecat.frames.frames import TextFrame
-from pipecat.processors.aggregators.
-from pipecat.processors.aggregators.
+from pipecat.processors.aggregators.llm_context import LLMContext
+from pipecat.processors.aggregators.llm_response_universal import LLMUserAggregator
 
 
-class UserResponseAggregator(
+class UserResponseAggregator(LLMUserAggregator):
     """Aggregates user responses into TextFrame objects.
 
-    This aggregator extends
+    This aggregator extends LLMUserAggregator to specifically handle
     user input by collecting text responses and outputting them as TextFrame
     objects when the aggregation is complete.
     """
@@ -28,9 +28,9 @@ class UserResponseAggregator(LLMUserContextAggregator):
         """Initialize the user response aggregator.
 
         Args:
-            **kwargs: Additional arguments passed to parent
+            **kwargs: Additional arguments passed to parent LLMUserAggregator.
         """
-        super().__init__(context=
+        super().__init__(context=LLMContext(), **kwargs)
 
     async def push_aggregation(self):
         """Push the aggregated user response as a TextFrame.
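Note on the hunks above: `UserResponseAggregator` now builds on `LLMUserAggregator` and supplies its own `LLMContext()`, so no context argument is needed at construction time. A minimal sketch (the placement after an STT service is an assumption about typical usage, not shown in this diff):

    from pipecat.processors.aggregators.user_response import UserResponseAggregator

    # The aggregator creates an LLMContext() internally; construct it with no context argument.
    user_response = UserResponseAggregator()
    # Assumed typical placement: STT -> user_response -> consumers of the aggregated TextFrame.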
@@ -10,13 +10,22 @@ This module provides frame aggregation functionality to combine text and image
 frames into vision frames for multimodal processing.
 """
 
-from pipecat.frames.frames import Frame, InputImageRawFrame, TextFrame
+from pipecat.frames.frames import Frame, InputImageRawFrame, TextFrame
+from pipecat.processors.aggregators.openai_llm_context import (
+    OpenAILLMContext,
+    OpenAILLMContextFrame,
+)
 from pipecat.processors.frame_processor import FrameDirection, FrameProcessor
 
 
 class VisionImageFrameAggregator(FrameProcessor):
     """Aggregates consecutive text and image frames into vision frames.
 
+    .. deprecated:: 0.0.85
+        VisionImageRawFrame has been removed in favor of context frames
+        (LLMContextFrame or OpenAILLMContextFrame), so this aggregator is not
+        needed anymore. See the 12* examples for the new recommended pattern.
+
     This aggregator waits for a consecutive TextFrame and an InputImageRawFrame.
     After the InputImageRawFrame arrives it will output a VisionImageRawFrame
     combining both the text and image data for multimodal processing.
@@ -28,6 +37,17 @@ class VisionImageFrameAggregator(FrameProcessor):
         The aggregator starts with no cached text, waiting for the first
         TextFrame to arrive before it can create vision frames.
         """
+        import warnings
+
+        warnings.warn(
+            "VisionImageFrameAggregator is deprecated. "
+            "VisionImageRawFrame has been removed in favor of context frames "
+            "(LLMContextFrame or OpenAILLMContextFrame), so this aggregator is "
+            "not needed anymore. See the 12* examples for the new recommended "
+            "pattern.",
+            DeprecationWarning,
+            stacklevel=2,
+        )
         super().__init__()
         self._describe_text = None
 
@@ -47,12 +67,14 @@ class VisionImageFrameAggregator(FrameProcessor):
             self._describe_text = frame.text
         elif isinstance(frame, InputImageRawFrame):
             if self._describe_text:
-
+                context = OpenAILLMContext()
+                context.add_image_frame_message(
                     text=self._describe_text,
                     image=frame.image,
                     size=frame.size,
                     format=frame.format,
                 )
+                frame = OpenAILLMContextFrame(context)
                 await self.push_frame(frame)
                 self._describe_text = None
             else:
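Note on the deprecation above: context frames replace `VisionImageRawFrame`, and the aggregator's new body shows the pattern. A minimal sketch of building such a context frame directly (the image placeholder values are illustrative):

    from pipecat.processors.aggregators.openai_llm_context import (
        OpenAILLMContext,
        OpenAILLMContextFrame,
    )

    image_bytes = b"\x00" * (64 * 64 * 3)  # placeholder raw RGB image data
    context = OpenAILLMContext()
    context.add_image_frame_message(
        text="Describe this image",
        image=image_bytes,
        size=(64, 64),
        format="RGB",
    )
    frame = OpenAILLMContextFrame(context)
    # Push `frame` into the pipeline instead of relying on VisionImageFrameAggregator.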
@@ -137,12 +137,12 @@ class AudioBufferProcessor(FrameProcessor):
         return self._num_channels
 
     def has_audio(self) -> bool:
-        """Check if
+        """Check if either user or bot audio buffers contain data.
 
         Returns:
-            True if
+            True if either buffer contains audio data.
         """
-        return self._buffer_has_audio(self._user_audio_buffer)
+        return self._buffer_has_audio(self._user_audio_buffer) or self._buffer_has_audio(
             self._bot_audio_buffer
         )
 
@@ -229,9 +229,12 @@ class AudioBufferProcessor(FrameProcessor):
             # Save time of frame so we can compute silence.
             self._last_bot_frame_at = time.time()
 
-        if self._buffer_size > 0 and
+        if self._buffer_size > 0 and (
+            len(self._user_audio_buffer) >= self._buffer_size
+            or len(self._bot_audio_buffer) >= self._buffer_size
+        ):
             await self._call_on_audio_data_handler()
-            self.
+            self._reset_primary_audio_buffers()
 
         # Process turn recording with preprocessed data.
         if self._enable_turn_audio:
@@ -272,9 +275,15 @@ class AudioBufferProcessor(FrameProcessor):
 
     async def _call_on_audio_data_handler(self):
         """Call the audio data event handlers with buffered audio."""
-        if not self.
+        if not self._recording:
             return
 
+        if len(self._user_audio_buffer) == 0 and len(self._bot_audio_buffer) == 0:
+            return
+
+        self._align_track_buffers()
+        flush_time = time.time()
+
         # Call original handler with merged audio
         merged_audio = self.merge_audio_buffers()
         await self._call_event_handler(
@@ -290,23 +299,49 @@ class AudioBufferProcessor(FrameProcessor):
             self._num_channels,
         )
 
+        self._last_user_frame_at = flush_time
+        self._last_bot_frame_at = flush_time
+
     def _buffer_has_audio(self, buffer: bytearray) -> bool:
         """Check if a buffer contains audio data."""
         return buffer is not None and len(buffer) > 0
 
     def _reset_recording(self):
         """Reset recording state and buffers."""
-        self.
+        self._reset_all_audio_buffers()
         self._last_user_frame_at = time.time()
         self._last_bot_frame_at = time.time()
 
-    def
+    def _reset_all_audio_buffers(self):
         """Reset all audio buffers to empty state."""
+        self._reset_primary_audio_buffers()
+        self._reset_turn_audio_buffers()
+
+    def _reset_primary_audio_buffers(self):
+        """Clear user and bot buffers while preserving turn buffers and timestamps."""
         self._user_audio_buffer = bytearray()
         self._bot_audio_buffer = bytearray()
+
+    def _reset_turn_audio_buffers(self):
+        """Clear user and bot turn buffers while preserving primary buffers and timestamps."""
         self._user_turn_audio_buffer = bytearray()
         self._bot_turn_audio_buffer = bytearray()
 
+    def _align_track_buffers(self):
+        """Pad the shorter track with silence so both tracks stay in sync."""
+        user_len = len(self._user_audio_buffer)
+        bot_len = len(self._bot_audio_buffer)
+        if user_len == bot_len:
+            return
+
+        target_len = max(user_len, bot_len)
+        if user_len < target_len:
+            self._user_audio_buffer.extend(b"\x00" * (target_len - user_len))
+            self._last_user_frame_at = max(self._last_user_frame_at, self._last_bot_frame_at)
+        if bot_len < target_len:
+            self._bot_audio_buffer.extend(b"\x00" * (target_len - bot_len))
+            self._last_bot_frame_at = max(self._last_bot_frame_at, self._last_user_frame_at)
+
     async def _resample_input_audio(self, frame: InputAudioRawFrame) -> bytes:
         """Resample audio frame to the target sample rate."""
         return await self._input_resampler.resample(