dv-pipecat-ai 0.0.85.dev7__py3-none-any.whl → 0.0.85.dev698__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Potentially problematic release: this version of dv-pipecat-ai might be problematic.
- {dv_pipecat_ai-0.0.85.dev7.dist-info → dv_pipecat_ai-0.0.85.dev698.dist-info}/METADATA +78 -117
- {dv_pipecat_ai-0.0.85.dev7.dist-info → dv_pipecat_ai-0.0.85.dev698.dist-info}/RECORD +156 -122
- pipecat/adapters/base_llm_adapter.py +38 -1
- pipecat/adapters/services/anthropic_adapter.py +9 -14
- pipecat/adapters/services/aws_nova_sonic_adapter.py +5 -0
- pipecat/adapters/services/bedrock_adapter.py +236 -13
- pipecat/adapters/services/gemini_adapter.py +12 -8
- pipecat/adapters/services/open_ai_adapter.py +19 -7
- pipecat/adapters/services/open_ai_realtime_adapter.py +5 -0
- pipecat/audio/filters/krisp_viva_filter.py +193 -0
- pipecat/audio/filters/noisereduce_filter.py +15 -0
- pipecat/audio/turn/base_turn_analyzer.py +9 -1
- pipecat/audio/turn/smart_turn/base_smart_turn.py +14 -8
- pipecat/audio/turn/smart_turn/data/__init__.py +0 -0
- pipecat/audio/turn/smart_turn/data/smart-turn-v3.0.onnx +0 -0
- pipecat/audio/turn/smart_turn/http_smart_turn.py +6 -2
- pipecat/audio/turn/smart_turn/local_smart_turn.py +1 -1
- pipecat/audio/turn/smart_turn/local_smart_turn_v2.py +1 -1
- pipecat/audio/turn/smart_turn/local_smart_turn_v3.py +124 -0
- pipecat/audio/vad/data/README.md +10 -0
- pipecat/audio/vad/vad_analyzer.py +13 -1
- pipecat/extensions/voicemail/voicemail_detector.py +5 -5
- pipecat/frames/frames.py +120 -87
- pipecat/observers/loggers/debug_log_observer.py +3 -3
- pipecat/observers/loggers/llm_log_observer.py +7 -3
- pipecat/observers/loggers/user_bot_latency_log_observer.py +22 -10
- pipecat/pipeline/runner.py +12 -4
- pipecat/pipeline/service_switcher.py +64 -36
- pipecat/pipeline/task.py +85 -24
- pipecat/processors/aggregators/dtmf_aggregator.py +28 -22
- pipecat/processors/aggregators/{gated_openai_llm_context.py → gated_llm_context.py} +9 -9
- pipecat/processors/aggregators/gated_open_ai_llm_context.py +12 -0
- pipecat/processors/aggregators/llm_response.py +6 -7
- pipecat/processors/aggregators/llm_response_universal.py +19 -15
- pipecat/processors/aggregators/user_response.py +6 -6
- pipecat/processors/aggregators/vision_image_frame.py +24 -2
- pipecat/processors/audio/audio_buffer_processor.py +43 -8
- pipecat/processors/filters/stt_mute_filter.py +2 -0
- pipecat/processors/frame_processor.py +103 -17
- pipecat/processors/frameworks/langchain.py +8 -2
- pipecat/processors/frameworks/rtvi.py +209 -68
- pipecat/processors/frameworks/strands_agents.py +170 -0
- pipecat/processors/logger.py +2 -2
- pipecat/processors/transcript_processor.py +4 -4
- pipecat/processors/user_idle_processor.py +3 -6
- pipecat/runner/run.py +270 -50
- pipecat/runner/types.py +2 -0
- pipecat/runner/utils.py +51 -10
- pipecat/serializers/exotel.py +5 -5
- pipecat/serializers/livekit.py +20 -0
- pipecat/serializers/plivo.py +6 -9
- pipecat/serializers/protobuf.py +6 -5
- pipecat/serializers/telnyx.py +2 -2
- pipecat/serializers/twilio.py +43 -23
- pipecat/services/ai_service.py +2 -6
- pipecat/services/anthropic/llm.py +2 -25
- pipecat/services/asyncai/tts.py +2 -3
- pipecat/services/aws/__init__.py +1 -0
- pipecat/services/aws/llm.py +122 -97
- pipecat/services/aws/nova_sonic/__init__.py +0 -0
- pipecat/services/aws/nova_sonic/context.py +367 -0
- pipecat/services/aws/nova_sonic/frames.py +25 -0
- pipecat/services/aws/nova_sonic/llm.py +1155 -0
- pipecat/services/aws/stt.py +1 -3
- pipecat/services/aws_nova_sonic/__init__.py +19 -1
- pipecat/services/aws_nova_sonic/aws.py +11 -1151
- pipecat/services/aws_nova_sonic/context.py +13 -355
- pipecat/services/aws_nova_sonic/frames.py +13 -17
- pipecat/services/azure/realtime/__init__.py +0 -0
- pipecat/services/azure/realtime/llm.py +65 -0
- pipecat/services/azure/stt.py +15 -0
- pipecat/services/cartesia/tts.py +2 -2
- pipecat/services/deepgram/__init__.py +1 -0
- pipecat/services/deepgram/flux/__init__.py +0 -0
- pipecat/services/deepgram/flux/stt.py +636 -0
- pipecat/services/elevenlabs/__init__.py +2 -1
- pipecat/services/elevenlabs/stt.py +254 -276
- pipecat/services/elevenlabs/tts.py +5 -5
- pipecat/services/fish/tts.py +2 -2
- pipecat/services/gemini_multimodal_live/events.py +38 -524
- pipecat/services/gemini_multimodal_live/file_api.py +23 -173
- pipecat/services/gemini_multimodal_live/gemini.py +41 -1403
- pipecat/services/gladia/stt.py +56 -72
- pipecat/services/google/__init__.py +1 -0
- pipecat/services/google/gemini_live/__init__.py +3 -0
- pipecat/services/google/gemini_live/file_api.py +189 -0
- pipecat/services/google/gemini_live/llm.py +1582 -0
- pipecat/services/google/gemini_live/llm_vertex.py +184 -0
- pipecat/services/google/llm.py +15 -11
- pipecat/services/google/llm_openai.py +3 -3
- pipecat/services/google/llm_vertex.py +86 -16
- pipecat/services/google/tts.py +7 -3
- pipecat/services/heygen/api.py +2 -0
- pipecat/services/heygen/client.py +8 -4
- pipecat/services/heygen/video.py +2 -0
- pipecat/services/hume/__init__.py +5 -0
- pipecat/services/hume/tts.py +220 -0
- pipecat/services/inworld/tts.py +6 -6
- pipecat/services/llm_service.py +15 -5
- pipecat/services/lmnt/tts.py +2 -2
- pipecat/services/mcp_service.py +4 -2
- pipecat/services/mem0/memory.py +6 -5
- pipecat/services/mistral/llm.py +29 -8
- pipecat/services/moondream/vision.py +42 -16
- pipecat/services/neuphonic/tts.py +2 -2
- pipecat/services/openai/__init__.py +1 -0
- pipecat/services/openai/base_llm.py +27 -20
- pipecat/services/openai/realtime/__init__.py +0 -0
- pipecat/services/openai/realtime/context.py +272 -0
- pipecat/services/openai/realtime/events.py +1106 -0
- pipecat/services/openai/realtime/frames.py +37 -0
- pipecat/services/openai/realtime/llm.py +829 -0
- pipecat/services/openai/tts.py +16 -8
- pipecat/services/openai_realtime/__init__.py +27 -0
- pipecat/services/openai_realtime/azure.py +21 -0
- pipecat/services/openai_realtime/context.py +21 -0
- pipecat/services/openai_realtime/events.py +21 -0
- pipecat/services/openai_realtime/frames.py +21 -0
- pipecat/services/openai_realtime_beta/azure.py +16 -0
- pipecat/services/openai_realtime_beta/openai.py +17 -5
- pipecat/services/playht/tts.py +31 -4
- pipecat/services/rime/tts.py +3 -4
- pipecat/services/sarvam/tts.py +2 -6
- pipecat/services/simli/video.py +2 -2
- pipecat/services/speechmatics/stt.py +1 -7
- pipecat/services/stt_service.py +34 -0
- pipecat/services/tavus/video.py +2 -2
- pipecat/services/tts_service.py +9 -9
- pipecat/services/vision_service.py +7 -6
- pipecat/tests/utils.py +4 -4
- pipecat/transcriptions/language.py +41 -1
- pipecat/transports/base_input.py +17 -42
- pipecat/transports/base_output.py +42 -26
- pipecat/transports/daily/transport.py +199 -26
- pipecat/transports/heygen/__init__.py +0 -0
- pipecat/transports/heygen/transport.py +381 -0
- pipecat/transports/livekit/transport.py +228 -63
- pipecat/transports/local/audio.py +6 -1
- pipecat/transports/local/tk.py +11 -2
- pipecat/transports/network/fastapi_websocket.py +1 -1
- pipecat/transports/smallwebrtc/connection.py +98 -19
- pipecat/transports/smallwebrtc/request_handler.py +204 -0
- pipecat/transports/smallwebrtc/transport.py +65 -23
- pipecat/transports/tavus/transport.py +23 -12
- pipecat/transports/websocket/client.py +41 -5
- pipecat/transports/websocket/fastapi.py +21 -11
- pipecat/transports/websocket/server.py +14 -7
- pipecat/transports/whatsapp/api.py +8 -0
- pipecat/transports/whatsapp/client.py +47 -0
- pipecat/utils/base_object.py +54 -22
- pipecat/utils/string.py +12 -1
- pipecat/utils/tracing/service_decorators.py +21 -21
- {dv_pipecat_ai-0.0.85.dev7.dist-info → dv_pipecat_ai-0.0.85.dev698.dist-info}/WHEEL +0 -0
- {dv_pipecat_ai-0.0.85.dev7.dist-info → dv_pipecat_ai-0.0.85.dev698.dist-info}/licenses/LICENSE +0 -0
- {dv_pipecat_ai-0.0.85.dev7.dist-info → dv_pipecat_ai-0.0.85.dev698.dist-info}/top_level.txt +0 -0
- /pipecat/services/{aws_nova_sonic → aws/nova_sonic}/ready.wav +0 -0
pipecat/services/vision_service.py
CHANGED

@@ -14,7 +14,8 @@ visual content.
 from abc import abstractmethod
 from typing import AsyncGenerator
 
-from pipecat.frames.frames import Frame, VisionImageRawFrame
+from pipecat.frames.frames import Frame, LLMContextFrame
+from pipecat.processors.aggregators.llm_context import LLMContext
 from pipecat.processors.frame_processor import FrameDirection
 from pipecat.services.ai_service import AIService
 
@@ -37,15 +38,15 @@ class VisionService(AIService):
         self._describe_text = None
 
     @abstractmethod
-    async def run_vision(self, frame: VisionImageRawFrame) -> AsyncGenerator[Frame, None]:
-        """Process a vision frame and generate analysis results.
+    async def run_vision(self, context: LLMContext) -> AsyncGenerator[Frame, None]:
+        """Process the latest image in the context and generate results.
 
         This method must be implemented by subclasses to provide actual computer
         vision functionality such as image description, object detection, or
         visual question answering.
 
         Args:
-            frame: The vision frame containing the image to analyze.
+            context: The context to process, containing image data.
 
         Yields:
             Frame: Frames containing the vision analysis results, typically TextFrame
@@ -65,9 +66,9 @@ class VisionService(AIService):
         """
         await super().process_frame(frame, direction)
 
-        if isinstance(frame, VisionImageRawFrame):
+        if isinstance(frame, LLMContextFrame):
             await self.start_processing_metrics()
-            await self.process_generator(self.run_vision(frame))
+            await self.process_generator(self.run_vision(frame.context))
             await self.stop_processing_metrics()
         else:
             await self.push_frame(frame, direction)
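Read as a whole, these hunks move VisionService from consuming a single image frame to consuming an LLM context: the base class now unwraps LLMContextFrame and hands frame.context to run_vision(). A minimal sketch of a subclass against the new signature; CaptionVisionService and its caption text are illustrative, not part of the package, and a real service would pull the latest image out of the LLMContext before running inference.

    from typing import AsyncGenerator

    from pipecat.frames.frames import Frame, TextFrame
    from pipecat.processors.aggregators.llm_context import LLMContext
    from pipecat.services.vision_service import VisionService


    class CaptionVisionService(VisionService):
        # Hypothetical subclass targeting the new context-based run_vision().

        async def run_vision(self, context: LLMContext) -> AsyncGenerator[Frame, None]:
            # The base class now delivers the LLMContext unwrapped from an
            # LLMContextFrame instead of a VisionImageRawFrame.
            yield TextFrame("A description of the most recent image in the context.")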
pipecat/tests/utils.py
CHANGED

@@ -128,7 +128,7 @@ async def run_test(
     expected_up_frames: Optional[Sequence[type]] = None,
     ignore_start: bool = True,
     observers: Optional[List[BaseObserver]] = None,
-    start_metadata: Optional[Dict[str, Any]] = None,
+    pipeline_params: Optional[PipelineParams] = None,
     send_end_frame: bool = True,
 ) -> Tuple[Sequence[Frame], Sequence[Frame]]:
     """Run a test pipeline with the specified processor and validate frame flow.
@@ -144,7 +144,7 @@ async def run_test(
         expected_up_frames: Expected frame types flowing upstream (optional).
         ignore_start: Whether to ignore StartFrames in frame validation.
         observers: Optional list of observers to attach to the pipeline.
-        start_metadata: Metadata to include in the StartFrame.
+        pipeline_params: Optional pipeline parameters.
         send_end_frame: Whether to send an EndFrame at the end of the test.
 
     Returns:
@@ -154,7 +154,7 @@ async def run_test(
         AssertionError: If the received frames don't match the expected frame types.
     """
     observers = observers or []
-    start_metadata = start_metadata or {}
+    pipeline_params = pipeline_params or PipelineParams()
 
     received_up = asyncio.Queue()
     received_down = asyncio.Queue()
@@ -173,7 +173,7 @@ async def run_test(
 
     task = PipelineTask(
         pipeline,
-        params=PipelineParams(start_metadata=start_metadata),
+        params=pipeline_params,
         observers=observers,
         cancel_on_idle_timeout=False,
     )
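With this change, run_test() accepts a full PipelineParams object instead of only start metadata, so tests can exercise any pipeline configuration. A minimal sketch of a test using the new parameter, assuming a trivial passthrough processor; Passthrough and the frames used here are illustrative, not from the diff.

    import asyncio

    from pipecat.frames.frames import Frame, TextFrame
    from pipecat.pipeline.task import PipelineParams
    from pipecat.processors.frame_processor import FrameDirection, FrameProcessor
    from pipecat.tests.utils import run_test


    class Passthrough(FrameProcessor):
        # Forwards every frame unchanged so run_test() sees it downstream.
        async def process_frame(self, frame: Frame, direction: FrameDirection):
            await super().process_frame(frame, direction)
            await self.push_frame(frame, direction)


    async def main():
        await run_test(
            Passthrough(),
            frames_to_send=[TextFrame("hello")],
            expected_down_frames=[TextFrame],
            pipeline_params=PipelineParams(allow_interruptions=True),
        )


    asyncio.run(main())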
pipecat/transcriptions/language.py
CHANGED

@@ -68,6 +68,9 @@ class Language(StrEnum):
     AS = "as"
     AS_IN = "as-IN"
 
+    # Asturian
+    AST = "ast"
+
     # Azerbaijani
     AZ = "az"
     AZ_AZ = "az-AZ"
@@ -101,6 +104,9 @@ class Language(StrEnum):
     CA = "ca"
     CA_ES = "ca-ES"
 
+    # Cebuano
+    CEB = "ceb"
+
     # Mandarin Chinese
     CMN = "cmn"
     CMN_CN = "cmn-CN"
@@ -185,6 +191,9 @@ class Language(StrEnum):
     FA = "fa"
     FA_IR = "fa-IR"
 
+    # Fulah
+    FF = "ff"
+
     # Finnish
     FI = "fi"
     FI_FI = "fi-FI"
@@ -251,6 +260,9 @@ class Language(StrEnum):
     ID = "id"
     ID_ID = "id-ID"
 
+    # Igbo
+    IG = "ig"
+
     # Icelandic
     IS = "is"
     IS_IS = "is-IS"
@@ -279,6 +291,9 @@ class Language(StrEnum):
     KA = "ka"
     KA_GE = "ka-GE"
 
+    # Kabuverdianu
+    KEA = "kea"
+
     # Kazakh
     KK = "kk"
     KK_KZ = "kk-KZ"
@@ -295,6 +310,13 @@ class Language(StrEnum):
     KO = "ko"
     KO_KR = "ko-KR"
 
+    # Kurdish
+    KU = "ku"
+
+    # Kyrgyz
+    KY = "ky"
+    KY_KG = "ky-KG"
+
     # Latin
     LA = "la"
 
@@ -312,6 +334,12 @@ class Language(StrEnum):
     LT = "lt"
     LT_LT = "lt-LT"
 
+    # Ganda
+    LG = "lg"
+
+    # Luo
+    LUO = "luo"
+
     # Latvian
     LV = "lv"
     LV_LV = "lv-LV"
@@ -366,6 +394,12 @@ class Language(StrEnum):
     NL_BE = "nl-BE"
     NL_NL = "nl-NL"
 
+    # Northern Sotho
+    NSO = "nso"
+
+    # Chichewa
+    NY = "ny"
+
     # Occitan
     OC = "oc"
 
@@ -484,6 +518,9 @@ class Language(StrEnum):
     UK = "uk"
     UK_UA = "uk-UA"
 
+    # Umbundu
+    UMB = "umb"
+
     # Urdu
     UR = "ur"
     UR_IN = "ur-IN"
@@ -497,6 +534,9 @@ class Language(StrEnum):
     VI = "vi"
     VI_VN = "vi-VN"
 
+    # Wolof
+    WO = "wo"
+
     # Wu Chinese
     WUU = "wuu"
     WUU_CN = "wuu-CN"
@@ -507,7 +547,7 @@ class Language(StrEnum):
     # Yoruba
     YO = "yo"
 
-    # Yue Chinese
+    # Yue Chinese (Cantonese)
     YUE = "yue"
     YUE_CN = "yue-CN"
 
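Because Language is a StrEnum, the new members compare equal to their string values, which is how they get wired into STT/TTS service configs. A quick illustration of standard StrEnum behavior, not code from the diff:

    from pipecat.transcriptions.language import Language

    assert Language.AST == "ast"      # Asturian, new in this release
    assert Language.KY_KG == "ky-KG"  # regional variants keep BCP-47-style tags
    print(f"STT language: {Language.YUE}")  # prints "STT language: yue"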
pipecat/transports/base_input.py
CHANGED

@@ -11,7 +11,6 @@ input processing, including VAD, turn analysis, and interruption management.
 """
 
 import asyncio
-from concurrent.futures import ThreadPoolExecutor
 from typing import Optional
 
 from loguru import logger
@@ -22,7 +21,6 @@ from pipecat.audio.turn.base_turn_analyzer import (
 )
 from pipecat.audio.vad.vad_analyzer import VADAnalyzer, VADState
 from pipecat.frames.frames import (
-    BotInterruptionFrame,
     BotStartedSpeakingFrame,
     BotStoppedSpeakingFrame,
     CancelFrame,
@@ -36,7 +34,6 @@ from pipecat.frames.frames import (
     MetricsFrame,
     SpeechControlParamsFrame,
     StartFrame,
-    StartInterruptionFrame,
     StopFrame,
     SystemFrame,
     UserSpeakingFrame,
@@ -81,10 +78,6 @@ class BaseInputTransport(FrameProcessor):
         # Track user speaking state for interruption logic
         self._user_speaking = False
 
-        # We read audio from a single queue one at a time and we then run VAD in
-        # a thread. Therefore, only one thread should be necessary.
-        self._executor = ThreadPoolExecutor(max_workers=1)
-
         # Task to process incoming audio (VAD) and push audio frames downstream
         # if passthrough is enabled.
         self._audio_task = None
@@ -289,8 +282,6 @@ class BaseInputTransport(FrameProcessor):
         elif isinstance(frame, CancelFrame):
             await self.cancel(frame)
             await self.push_frame(frame, direction)
-        elif isinstance(frame, BotInterruptionFrame):
-            await self._handle_bot_interruption(frame)
         elif isinstance(frame, BotStartedSpeakingFrame):
             await self._handle_bot_started_speaking(frame)
             await self.push_frame(frame, direction)
@@ -298,22 +289,12 @@ class BaseInputTransport(FrameProcessor):
             await self._handle_bot_stopped_speaking(frame)
             await self.push_frame(frame, direction)
         elif isinstance(frame, EmulateUserStartedSpeakingFrame):
-            logger.debug("Emulating user started speaking")
+            self.logger.debug("Emulating user started speaking")
             await self._handle_user_interruption(VADState.SPEAKING, emulated=True)
         elif isinstance(frame, EmulateUserStoppedSpeakingFrame):
-            logger.debug("Emulating user stopped speaking")
+            self.logger.debug("Emulating user stopped speaking")
             await self._handle_user_interruption(VADState.QUIET, emulated=True)
         # All other system frames
-        elif isinstance(frame, VADParamsUpdateFrame):
-            if self.vad_analyzer:
-                self.vad_analyzer.set_params(frame.params, bot_logger=self.logger)
-            speech_frame = SpeechControlParamsFrame(
-                vad_params=frame.params,
-                turn_params=self._params.turn_analyzer.params
-                if self._params.turn_analyzer
-                else None,
-            )
-            await self.push_frame(speech_frame)
         elif isinstance(frame, SystemFrame):
             await self.push_frame(frame, direction)
         # Control frames
@@ -325,6 +306,16 @@ class BaseInputTransport(FrameProcessor):
         elif isinstance(frame, StopFrame):
             await self.push_frame(frame, direction)
             await self.pause(frame)
+        elif isinstance(frame, VADParamsUpdateFrame):
+            if self.vad_analyzer:
+                self.vad_analyzer.set_params(frame.params)
+            speech_frame = SpeechControlParamsFrame(
+                vad_params=frame.params,
+                turn_params=self._params.turn_analyzer.params
+                if self._params.turn_analyzer
+                else None,
+            )
+            await self.push_frame(speech_frame)
         elif isinstance(frame, FilterUpdateSettingsFrame) and self._params.audio_in_filter:
             await self._params.audio_in_filter.process_frame(frame)
         # Other frames
@@ -335,13 +326,6 @@ class BaseInputTransport(FrameProcessor):
     # Handle interruptions
     #
 
-    async def _handle_bot_interruption(self, frame: BotInterruptionFrame):
-        """Handle bot interruption frames."""
-        self.logger.debug("Bot interruption")
-        if self.interruptions_allowed:
-            await self._start_interruption()
-            await self.push_frame(StartInterruptionFrame())
-
     async def _handle_user_interruption(self, vad_state: VADState, emulated: bool = False):
         """Handle user interruption events based on speaking state."""
         if vad_state == VADState.SPEAKING:
@@ -353,7 +337,7 @@ class BaseInputTransport(FrameProcessor):
             await self.push_frame(downstream_frame)
             await self.push_frame(upstream_frame, FrameDirection.UPSTREAM)
 
-            # Only push StartInterruptionFrame if:
+            # Only push InterruptionFrame if:
             # 1. No interruption config is set, OR
             # 2. Interruption config is set but bot is not speaking
             should_push_immediate_interruption = (
@@ -362,13 +346,9 @@ class BaseInputTransport(FrameProcessor):
 
             # Make sure we notify about interruptions quickly out-of-band.
             if should_push_immediate_interruption and self.interruptions_allowed:
-                await self._start_interruption()
-                # Push an out-of-band frame (i.e. not using the ordered push
-                # frame task) to stop everything, specially at the output
-                # transport.
-                await self.push_frame(StartInterruptionFrame())
+                await self.push_interruption_task_frame_and_wait()
             elif self.interruption_strategies and self._bot_speaking:
-                logger.debug(
+                self.logger.debug(
                     "User started speaking while bot is speaking with interruption config - "
                     "deferring interruption to aggregator"
                 )
@@ -381,9 +361,6 @@ class BaseInputTransport(FrameProcessor):
             await self.push_frame(downstream_frame)
             await self.push_frame(upstream_frame, FrameDirection.UPSTREAM)
 
-            if self.interruptions_allowed:
-                await self._stop_interruption()
-
     #
     # Handle bot speaking state
     #
@@ -416,9 +393,7 @@ class BaseInputTransport(FrameProcessor):
         """Analyze audio frame for voice activity."""
         state = VADState.QUIET
         if self.vad_analyzer:
-            state = await self.get_event_loop().run_in_executor(
-                self._executor, self.vad_analyzer.analyze_audio, audio_frame.audio
-            )
+            state = await self.vad_analyzer.analyze_audio(audio_frame.audio)
         return state
 
     async def _handle_vad(self, audio_frame: InputAudioRawFrame, vad_state: VADState) -> VADState:
@@ -511,7 +486,7 @@ class BaseInputTransport(FrameProcessor):
                     self._audio_in_queue.task_done()
             except asyncio.TimeoutError:
                 if self._user_speaking:
-                    self.logger.warning(
+                    self.logger.warning(
                         "Forcing user stopped speaking due to timeout receiving audio frame!"
                     )
                     vad_state = VADState.QUIET
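The executor removal in _vad_analyze() means VADAnalyzer.analyze_audio() is now awaited directly on the event loop, so offloading blocking inference becomes the analyzer's responsibility rather than the transport's. A sketch of the offloading pattern an analyzer might use; the function names here are hypothetical.

    import asyncio

    from pipecat.audio.vad.vad_analyzer import VADState


    def blocking_vad_inference(buffer: bytes) -> VADState:
        # Placeholder for a CPU-heavy model call, e.g. an ONNX forward pass.
        return VADState.QUIET


    async def analyze_audio(buffer: bytes) -> VADState:
        # With the transport's single-thread executor gone, an analyzer that
        # still does blocking work should offload it itself instead of
        # stalling the event loop.
        return await asyncio.to_thread(blocking_vad_inference, buffer)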
pipecat/transports/base_output.py
CHANGED

@@ -29,20 +29,19 @@ from pipecat.frames.frames import (
     CancelFrame,
     EndFrame,
     Frame,
-    InputTransportMessageUrgentFrame,
+    InterruptionFrame,
     MixerControlFrame,
     OutputAudioRawFrame,
     OutputDTMFFrame,
     OutputDTMFUrgentFrame,
     OutputImageRawFrame,
+    OutputTransportMessageFrame,
+    OutputTransportMessageUrgentFrame,
     OutputTransportReadyFrame,
     SpeechOutputAudioRawFrame,
     SpriteFrame,
     StartFrame,
-    StartInterruptionFrame,
     SystemFrame,
-    TransportMessageFrame,
-    TransportMessageUrgentFrame,
     TTSAudioRawFrame,
 )
 from pipecat.processors.frame_processor import FrameDirection, FrameProcessor
@@ -179,7 +178,9 @@ class BaseOutputTransport(FrameProcessor):
         # Sending a frame indicating that the output transport is ready and able to receive frames.
         await self.push_frame(OutputTransportReadyFrame(), FrameDirection.UPSTREAM)
 
-    async def send_message(self, frame: TransportMessageFrame | TransportMessageUrgentFrame):
+    async def send_message(
+        self, frame: OutputTransportMessageFrame | OutputTransportMessageUrgentFrame
+    ):
         """Send a transport message.
 
         Args:
@@ -203,21 +204,27 @@ class BaseOutputTransport(FrameProcessor):
         """
         pass
 
-    async def write_video_frame(self, frame: OutputImageRawFrame):
+    async def write_video_frame(self, frame: OutputImageRawFrame) -> bool:
         """Write a video frame to the transport.
 
         Args:
             frame: The output video frame to write.
+
+        Returns:
+            True if the video frame was written successfully, False otherwise.
         """
-        pass
+        return False
 
-    async def write_audio_frame(self, frame: OutputAudioRawFrame):
+    async def write_audio_frame(self, frame: OutputAudioRawFrame) -> bool:
         """Write an audio frame to the transport.
 
         Args:
             frame: The output audio frame to write.
+
+        Returns:
+            True if the audio frame was written successfully, False otherwise.
         """
-        pass
+        return False
 
     async def write_dtmf(self, frame: OutputDTMFFrame | OutputDTMFUrgentFrame):
         """Write a DTMF tone using the transport's preferred method.
@@ -288,9 +295,8 @@ class BaseOutputTransport(FrameProcessor):
         await super().process_frame(frame, direction)
 
         #
-        # System frames (like StartInterruptionFrame) are pushed
-        # immediately. Other frames require order so they are put in the sink
-        # queue.
+        # System frames (like InterruptionFrame) are pushed immediately. Other
+        # frames require order so they are put in the sink queue.
         #
         if isinstance(frame, StartFrame):
             # Push StartFrame before start(), because we want StartFrame to be
@@ -300,12 +306,10 @@ class BaseOutputTransport(FrameProcessor):
         elif isinstance(frame, CancelFrame):
             await self.cancel(frame)
             await self.push_frame(frame, direction)
-        elif isinstance(frame, StartInterruptionFrame):
+        elif isinstance(frame, InterruptionFrame):
             await self.push_frame(frame, direction)
             await self._handle_frame(frame)
-        elif isinstance(frame, TransportMessageUrgentFrame) or isinstance(
-            frame, InputTransportMessageUrgentFrame
-        ):
+        elif isinstance(frame, OutputTransportMessageUrgentFrame):
             await self.send_message(frame)
         elif isinstance(frame, OutputDTMFUrgentFrame):
             await self.write_dtmf(frame)
@@ -341,7 +345,7 @@ class BaseOutputTransport(FrameProcessor):
 
         sender = self._media_senders[frame.transport_destination]
 
-        if isinstance(frame, StartInterruptionFrame):
+        if isinstance(frame, InterruptionFrame):
             await sender.handle_interruptions(frame)
         elif isinstance(frame, OutputAudioRawFrame):
             await sender.handle_audio_frame(frame)
@@ -492,7 +496,7 @@ class BaseOutputTransport(FrameProcessor):
             await self._cancel_clock_task()
             await self._cancel_video_task()
 
-        async def handle_interruptions(self, _: StartInterruptionFrame):
+        async def handle_interruptions(self, _: InterruptionFrame):
             """Handle interruption events by restarting tasks and clearing buffers.
 
             Args:
@@ -642,7 +646,7 @@ class BaseOutputTransport(FrameProcessor):
                 await self._set_video_image(frame)
             elif isinstance(frame, SpriteFrame):
                 await self._set_video_images(frame.images)
-            elif isinstance(frame, TransportMessageFrame):
+            elif isinstance(frame, OutputTransportMessageFrame):
                 await self._transport.send_message(frame)
             elif isinstance(frame, OutputDTMFFrame):
                 await self._transport.write_dtmf(frame)
@@ -661,6 +665,7 @@ class BaseOutputTransport(FrameProcessor):
                         self._audio_queue.get(), timeout=vad_stop_secs
                     )
                     yield frame
+                    self._audio_queue.task_done()
                 except asyncio.TimeoutError:
                     # Notify the bot stopped speaking upstream if necessary.
                     await self._bot_stopped_speaking()
@@ -673,8 +678,9 @@ class BaseOutputTransport(FrameProcessor):
                     frame = self._audio_queue.get_nowait()
                     if isinstance(frame, OutputAudioRawFrame):
                         frame.audio = await self._mixer.mix(frame.audio)
-
+                    last_frame_time = time.time()
                     yield frame
+                    self._audio_queue.task_done()
                 except asyncio.QueueEmpty:
                     # Notify the bot stopped speaking upstream if necessary.
                     diff_time = time.time() - last_frame_time
@@ -740,12 +746,22 @@ class BaseOutputTransport(FrameProcessor):
                 # Handle frame.
                 await self._handle_frame(frame)
 
-                # Also push the frame downstream in case anyone else needs it.
-                await self._transport.push_frame(frame)
-
-                # Send audio to the transport.
-                if isinstance(frame, OutputAudioRawFrame):
-                    await self._transport.write_audio_frame(frame)
+                # If we are not able to write to the transport we shouldn't
+                # pushb downstream.
+                push_downstream = True
+
+                # Try to send audio to the transport.
+                try:
+                    if isinstance(frame, OutputAudioRawFrame):
+                        push_downstream = await self._transport.write_audio_frame(frame)
+                except Exception as e:
+                    logger.error(f"{self} Error writing {frame} to transport: {e}")
+                    push_downstream = False
+
+                # If we were able to send to the transport, push the frame
+                # downstream in case anyone else needs it.
+                if push_downstream:
+                    await self._transport.push_frame(frame)
 
         #
         # Video handling