dv-pipecat-ai 0.0.82.dev857__py3-none-any.whl → 0.0.85.dev837__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- {dv_pipecat_ai-0.0.82.dev857.dist-info → dv_pipecat_ai-0.0.85.dev837.dist-info}/METADATA +98 -130
- {dv_pipecat_ai-0.0.82.dev857.dist-info → dv_pipecat_ai-0.0.85.dev837.dist-info}/RECORD +192 -140
- pipecat/adapters/base_llm_adapter.py +38 -1
- pipecat/adapters/services/anthropic_adapter.py +9 -14
- pipecat/adapters/services/aws_nova_sonic_adapter.py +120 -5
- pipecat/adapters/services/bedrock_adapter.py +236 -13
- pipecat/adapters/services/gemini_adapter.py +12 -8
- pipecat/adapters/services/open_ai_adapter.py +19 -7
- pipecat/adapters/services/open_ai_realtime_adapter.py +5 -0
- pipecat/audio/dtmf/dtmf-0.wav +0 -0
- pipecat/audio/dtmf/dtmf-1.wav +0 -0
- pipecat/audio/dtmf/dtmf-2.wav +0 -0
- pipecat/audio/dtmf/dtmf-3.wav +0 -0
- pipecat/audio/dtmf/dtmf-4.wav +0 -0
- pipecat/audio/dtmf/dtmf-5.wav +0 -0
- pipecat/audio/dtmf/dtmf-6.wav +0 -0
- pipecat/audio/dtmf/dtmf-7.wav +0 -0
- pipecat/audio/dtmf/dtmf-8.wav +0 -0
- pipecat/audio/dtmf/dtmf-9.wav +0 -0
- pipecat/audio/dtmf/dtmf-pound.wav +0 -0
- pipecat/audio/dtmf/dtmf-star.wav +0 -0
- pipecat/audio/filters/krisp_viva_filter.py +193 -0
- pipecat/audio/filters/noisereduce_filter.py +15 -0
- pipecat/audio/turn/base_turn_analyzer.py +9 -1
- pipecat/audio/turn/smart_turn/base_smart_turn.py +14 -8
- pipecat/audio/turn/smart_turn/data/__init__.py +0 -0
- pipecat/audio/turn/smart_turn/data/smart-turn-v3.0.onnx +0 -0
- pipecat/audio/turn/smart_turn/http_smart_turn.py +6 -2
- pipecat/audio/turn/smart_turn/local_smart_turn.py +1 -1
- pipecat/audio/turn/smart_turn/local_smart_turn_v2.py +1 -1
- pipecat/audio/turn/smart_turn/local_smart_turn_v3.py +124 -0
- pipecat/audio/vad/data/README.md +10 -0
- pipecat/audio/vad/data/silero_vad_v2.onnx +0 -0
- pipecat/audio/vad/silero.py +9 -3
- pipecat/audio/vad/vad_analyzer.py +13 -1
- pipecat/extensions/voicemail/voicemail_detector.py +5 -5
- pipecat/frames/frames.py +277 -86
- pipecat/observers/loggers/debug_log_observer.py +3 -3
- pipecat/observers/loggers/llm_log_observer.py +7 -3
- pipecat/observers/loggers/user_bot_latency_log_observer.py +22 -10
- pipecat/pipeline/runner.py +18 -6
- pipecat/pipeline/service_switcher.py +64 -36
- pipecat/pipeline/task.py +125 -79
- pipecat/pipeline/tts_switcher.py +30 -0
- pipecat/processors/aggregators/dtmf_aggregator.py +2 -3
- pipecat/processors/aggregators/{gated_openai_llm_context.py → gated_llm_context.py} +9 -9
- pipecat/processors/aggregators/gated_open_ai_llm_context.py +12 -0
- pipecat/processors/aggregators/llm_context.py +40 -2
- pipecat/processors/aggregators/llm_response.py +32 -15
- pipecat/processors/aggregators/llm_response_universal.py +19 -15
- pipecat/processors/aggregators/user_response.py +6 -6
- pipecat/processors/aggregators/vision_image_frame.py +24 -2
- pipecat/processors/audio/audio_buffer_processor.py +43 -8
- pipecat/processors/dtmf_aggregator.py +174 -77
- pipecat/processors/filters/stt_mute_filter.py +17 -0
- pipecat/processors/frame_processor.py +110 -24
- pipecat/processors/frameworks/langchain.py +8 -2
- pipecat/processors/frameworks/rtvi.py +210 -68
- pipecat/processors/frameworks/strands_agents.py +170 -0
- pipecat/processors/logger.py +2 -2
- pipecat/processors/transcript_processor.py +26 -5
- pipecat/processors/user_idle_processor.py +35 -11
- pipecat/runner/daily.py +59 -20
- pipecat/runner/run.py +395 -93
- pipecat/runner/types.py +6 -4
- pipecat/runner/utils.py +51 -10
- pipecat/serializers/__init__.py +5 -1
- pipecat/serializers/asterisk.py +16 -2
- pipecat/serializers/convox.py +41 -4
- pipecat/serializers/custom.py +257 -0
- pipecat/serializers/exotel.py +5 -5
- pipecat/serializers/livekit.py +20 -0
- pipecat/serializers/plivo.py +5 -5
- pipecat/serializers/protobuf.py +6 -5
- pipecat/serializers/telnyx.py +2 -2
- pipecat/serializers/twilio.py +43 -23
- pipecat/serializers/vi.py +324 -0
- pipecat/services/ai_service.py +2 -6
- pipecat/services/anthropic/llm.py +2 -25
- pipecat/services/assemblyai/models.py +6 -0
- pipecat/services/assemblyai/stt.py +13 -5
- pipecat/services/asyncai/tts.py +5 -3
- pipecat/services/aws/__init__.py +1 -0
- pipecat/services/aws/llm.py +147 -105
- pipecat/services/aws/nova_sonic/__init__.py +0 -0
- pipecat/services/aws/nova_sonic/context.py +436 -0
- pipecat/services/aws/nova_sonic/frames.py +25 -0
- pipecat/services/aws/nova_sonic/llm.py +1265 -0
- pipecat/services/aws/stt.py +3 -3
- pipecat/services/aws_nova_sonic/__init__.py +19 -1
- pipecat/services/aws_nova_sonic/aws.py +11 -1151
- pipecat/services/aws_nova_sonic/context.py +8 -354
- pipecat/services/aws_nova_sonic/frames.py +13 -17
- pipecat/services/azure/llm.py +51 -1
- pipecat/services/azure/realtime/__init__.py +0 -0
- pipecat/services/azure/realtime/llm.py +65 -0
- pipecat/services/azure/stt.py +15 -0
- pipecat/services/cartesia/stt.py +77 -70
- pipecat/services/cartesia/tts.py +80 -13
- pipecat/services/deepgram/__init__.py +1 -0
- pipecat/services/deepgram/flux/__init__.py +0 -0
- pipecat/services/deepgram/flux/stt.py +640 -0
- pipecat/services/elevenlabs/__init__.py +4 -1
- pipecat/services/elevenlabs/stt.py +339 -0
- pipecat/services/elevenlabs/tts.py +87 -46
- pipecat/services/fish/tts.py +5 -2
- pipecat/services/gemini_multimodal_live/events.py +38 -524
- pipecat/services/gemini_multimodal_live/file_api.py +23 -173
- pipecat/services/gemini_multimodal_live/gemini.py +41 -1403
- pipecat/services/gladia/stt.py +56 -72
- pipecat/services/google/__init__.py +1 -0
- pipecat/services/google/gemini_live/__init__.py +3 -0
- pipecat/services/google/gemini_live/file_api.py +189 -0
- pipecat/services/google/gemini_live/llm.py +1582 -0
- pipecat/services/google/gemini_live/llm_vertex.py +184 -0
- pipecat/services/google/llm.py +15 -11
- pipecat/services/google/llm_openai.py +3 -3
- pipecat/services/google/llm_vertex.py +86 -16
- pipecat/services/google/stt.py +4 -0
- pipecat/services/google/tts.py +7 -3
- pipecat/services/heygen/api.py +2 -0
- pipecat/services/heygen/client.py +8 -4
- pipecat/services/heygen/video.py +2 -0
- pipecat/services/hume/__init__.py +5 -0
- pipecat/services/hume/tts.py +220 -0
- pipecat/services/inworld/tts.py +6 -6
- pipecat/services/llm_service.py +15 -5
- pipecat/services/lmnt/tts.py +4 -2
- pipecat/services/mcp_service.py +4 -2
- pipecat/services/mem0/memory.py +6 -5
- pipecat/services/mistral/llm.py +29 -8
- pipecat/services/moondream/vision.py +42 -16
- pipecat/services/neuphonic/tts.py +5 -2
- pipecat/services/openai/__init__.py +1 -0
- pipecat/services/openai/base_llm.py +27 -20
- pipecat/services/openai/realtime/__init__.py +0 -0
- pipecat/services/openai/realtime/context.py +272 -0
- pipecat/services/openai/realtime/events.py +1106 -0
- pipecat/services/openai/realtime/frames.py +37 -0
- pipecat/services/openai/realtime/llm.py +829 -0
- pipecat/services/openai/tts.py +49 -10
- pipecat/services/openai_realtime/__init__.py +27 -0
- pipecat/services/openai_realtime/azure.py +21 -0
- pipecat/services/openai_realtime/context.py +21 -0
- pipecat/services/openai_realtime/events.py +21 -0
- pipecat/services/openai_realtime/frames.py +21 -0
- pipecat/services/openai_realtime_beta/azure.py +16 -0
- pipecat/services/openai_realtime_beta/openai.py +17 -5
- pipecat/services/piper/tts.py +7 -9
- pipecat/services/playht/tts.py +34 -4
- pipecat/services/rime/tts.py +12 -12
- pipecat/services/riva/stt.py +3 -1
- pipecat/services/salesforce/__init__.py +9 -0
- pipecat/services/salesforce/llm.py +700 -0
- pipecat/services/sarvam/__init__.py +7 -0
- pipecat/services/sarvam/stt.py +540 -0
- pipecat/services/sarvam/tts.py +97 -13
- pipecat/services/simli/video.py +2 -2
- pipecat/services/speechmatics/stt.py +22 -10
- pipecat/services/stt_service.py +47 -0
- pipecat/services/tavus/video.py +2 -2
- pipecat/services/tts_service.py +75 -22
- pipecat/services/vision_service.py +7 -6
- pipecat/services/vistaar/llm.py +51 -9
- pipecat/tests/utils.py +4 -4
- pipecat/transcriptions/language.py +41 -1
- pipecat/transports/base_input.py +13 -34
- pipecat/transports/base_output.py +140 -104
- pipecat/transports/daily/transport.py +199 -26
- pipecat/transports/heygen/__init__.py +0 -0
- pipecat/transports/heygen/transport.py +381 -0
- pipecat/transports/livekit/transport.py +228 -63
- pipecat/transports/local/audio.py +6 -1
- pipecat/transports/local/tk.py +11 -2
- pipecat/transports/network/fastapi_websocket.py +1 -1
- pipecat/transports/smallwebrtc/connection.py +103 -19
- pipecat/transports/smallwebrtc/request_handler.py +246 -0
- pipecat/transports/smallwebrtc/transport.py +65 -23
- pipecat/transports/tavus/transport.py +23 -12
- pipecat/transports/websocket/client.py +41 -5
- pipecat/transports/websocket/fastapi.py +21 -11
- pipecat/transports/websocket/server.py +14 -7
- pipecat/transports/whatsapp/api.py +8 -0
- pipecat/transports/whatsapp/client.py +47 -0
- pipecat/utils/base_object.py +54 -22
- pipecat/utils/redis.py +58 -0
- pipecat/utils/string.py +13 -1
- pipecat/utils/tracing/service_decorators.py +21 -21
- pipecat/serializers/genesys.py +0 -95
- pipecat/services/google/test-google-chirp.py +0 -45
- pipecat/services/openai.py +0 -698
- {dv_pipecat_ai-0.0.82.dev857.dist-info → dv_pipecat_ai-0.0.85.dev837.dist-info}/WHEEL +0 -0
- {dv_pipecat_ai-0.0.82.dev857.dist-info → dv_pipecat_ai-0.0.85.dev837.dist-info}/licenses/LICENSE +0 -0
- {dv_pipecat_ai-0.0.82.dev857.dist-info → dv_pipecat_ai-0.0.85.dev837.dist-info}/top_level.txt +0 -0
- /pipecat/services/{aws_nova_sonic → aws/nova_sonic}/ready.wav +0 -0
The remainder of the diff shown below is for `pipecat/transports/base_output.py`, which reworks how the output transport detects bot speech, reports termination, and writes media to the transport.

```diff
@@ -29,20 +29,19 @@ from pipecat.frames.frames import (
     CancelFrame,
     EndFrame,
     Frame,
-    InputTransportMessageUrgentFrame,
+    InterruptionFrame,
     MixerControlFrame,
     OutputAudioRawFrame,
     OutputDTMFFrame,
     OutputDTMFUrgentFrame,
     OutputImageRawFrame,
+    OutputTransportMessageFrame,
+    OutputTransportMessageUrgentFrame,
     OutputTransportReadyFrame,
     SpeechOutputAudioRawFrame,
     SpriteFrame,
     StartFrame,
-    StartInterruptionFrame,
     SystemFrame,
-    TransportMessageFrame,
-    TransportMessageUrgentFrame,
     TTSAudioRawFrame,
 )
 from pipecat.processors.frame_processor import FrameDirection, FrameProcessor
```
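The import hunk tracks two API moves: `StartInterruptionFrame` is now `InterruptionFrame`, and the generic `TransportMessageFrame`/`TransportMessageUrgentFrame` types give way to output-specific variants. Downstream code that still imports the old name can bridge the rename with a minimal sketch like this, assuming only the rename visible above:

```python
# Compatibility sketch for the StartInterruptionFrame -> InterruptionFrame rename.
try:
    from pipecat.frames.frames import InterruptionFrame
except ImportError:
    # Older releases only expose the pre-rename class.
    from pipecat.frames.frames import StartInterruptionFrame as InterruptionFrame


def is_interruption(frame) -> bool:
    """Return True for interruption frames on either release line."""
    return isinstance(frame, InterruptionFrame)
```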
```diff
@@ -51,6 +50,11 @@ from pipecat.utils.time import nanoseconds_to_seconds
 
 # TODO: When we use GeminiMultimodalLiveLLMService, we need to change this to 0.35 but that creates issue for faster TTS.
 BOT_VAD_STOP_SECS = 0.30
+# For the very first bot utterance (e.g., intro), we can safely
+# detect end-of-speech sooner to improve responsiveness for the
+# user’s first short reply. Keep conservative to avoid mid-utterance
+# false stops when TTS streams quickly.
+FIRST_BOT_VAD_STOP_SECS = 0.08
 
 
 class BaseOutputTransport(FrameProcessor):
```
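Taken together, the two constants give the sender a two-phase end-of-speech threshold: 300 ms of trailing silence normally, but only 80 ms until the bot's first stop, so the user's first reply is detected sooner. A standalone sketch of the selection logic the later hunks add (the `first_stop_pending` flag is introduced in the `MediaSender` constructor below):

```python
BOT_VAD_STOP_SECS = 0.30        # regular trailing-silence threshold (seconds)
FIRST_BOT_VAD_STOP_SECS = 0.08  # applies only until the first bot stop


def stop_threshold(first_stop_pending: bool) -> float:
    # Same conditional the diff adds in without_mixer() and with_mixer().
    return FIRST_BOT_VAD_STOP_SECS if first_stop_pending else BOT_VAD_STOP_SECS


assert stop_threshold(True) == 0.08   # intro turn: stop after 80 ms of silence
assert stop_threshold(False) == 0.30  # later turns: stop after 300 ms
```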
```diff
@@ -85,6 +89,7 @@ class BaseOutputTransport(FrameProcessor):
         # us to send multiple streams at the same time if the transport allows
         # it.
         self._media_senders: Dict[Any, "BaseOutputTransport.MediaSender"] = {}
+        self._register_event_handler("on_output_terminated")
 
     @property
     def sample_rate(self) -> int:
```
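The constructor now registers an `on_output_terminated` event, fired from `process_frame()` (see below) once the transport has stopped on `EndFrame` or been cancelled on `CancelFrame`. A usage sketch, assuming a `transport` instance built elsewhere and pipecat's standard `event_handler` decorator:

```python
# Sketch: subscribe to the new event. `transport` is any pipecat transport
# whose output side derives from BaseOutputTransport.
@transport.output().event_handler("on_output_terminated")
async def on_output_terminated(output, frame):
    # `frame` is the EndFrame or CancelFrame that ended the output
    # (see the process_frame hunk below).
    print(f"Output terminated by {type(frame).__name__}")
```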
```diff
@@ -179,7 +184,9 @@
         # Sending a frame indicating that the output transport is ready and able to receive frames.
         await self.push_frame(OutputTransportReadyFrame(), FrameDirection.UPSTREAM)
 
-    async def send_message(self, frame: TransportMessageFrame | TransportMessageUrgentFrame):
+    async def send_message(
+        self, frame: OutputTransportMessageFrame | OutputTransportMessageUrgentFrame
+    ):
         """Send a transport message.
 
         Args:
```
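`send_message()` now accepts the output-specific message frames added to the imports above. A sketch of sending an app message through it, assuming the new frames keep the `message` payload field of their `TransportMessageFrame` predecessors:

```python
from pipecat.frames.frames import OutputTransportMessageUrgentFrame


async def send_ping(output_transport):
    # Urgent messages bypass the sink queue; process_frame() delivers them
    # to send_message() immediately (see the process_frame hunk below).
    frame = OutputTransportMessageUrgentFrame(message={"type": "ping"})
    await output_transport.send_message(frame)
```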
```diff
@@ -203,21 +210,27 @@
         """
         pass
 
-    async def write_video_frame(self, frame: OutputImageRawFrame):
+    async def write_video_frame(self, frame: OutputImageRawFrame) -> bool:
         """Write a video frame to the transport.
 
         Args:
             frame: The output video frame to write.
+
+        Returns:
+            True if the video frame was written successfully, False otherwise.
         """
-        pass
+        return False
 
-    async def write_audio_frame(self, frame: OutputAudioRawFrame):
+    async def write_audio_frame(self, frame: OutputAudioRawFrame) -> bool:
         """Write an audio frame to the transport.
 
         Args:
             frame: The output audio frame to write.
+
+        Returns:
+            True if the audio frame was written successfully, False otherwise.
         """
-        pass
+        return False
 
     async def write_dtmf(self, frame: OutputDTMFFrame | OutputDTMFUrgentFrame):
         """Write a DTMF tone using the transport's preferred method.
```
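Both write methods now report success, and the base class returns `False` rather than silently passing, so `_audio_task_handler()` (last hunk) can decide whether to keep pushing the frame downstream. A sketch of a concrete override under the new contract; the `_connection` attribute is hypothetical:

```python
from pipecat.frames.frames import OutputAudioRawFrame
from pipecat.transports.base_output import BaseOutputTransport


class MyOutputTransport(BaseOutputTransport):
    async def write_audio_frame(self, frame: OutputAudioRawFrame) -> bool:
        # `self._connection` is hypothetical; returning False tells the audio
        # task not to push this frame downstream (see the last hunk below).
        if self._connection is None or self._connection.closed:
            return False
        await self._connection.send(frame.audio)
        return True
```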
```diff
@@ -287,45 +300,29 @@
         """
         await super().process_frame(frame, direction)
 
-        #
-        # System frames (like StartInterruptionFrame) are pushed
-        # immediately. Other frames require order so they are put in the sink
-        # queue.
-        #
         if isinstance(frame, StartFrame):
             # Push StartFrame before start(), because we want StartFrame to be
             # processed by every processor before any other frame is processed.
             await self.push_frame(frame, direction)
             await self.start(frame)
+        elif isinstance(frame, EndFrame):
+            await self.stop(frame)
+            await self._call_event_handler("on_output_terminated", frame)
+            # Keep pushing EndFrame down so all the pipeline stops nicely.
+            await self.push_frame(frame, direction)
         elif isinstance(frame, CancelFrame):
             await self.cancel(frame)
+            await self._call_event_handler("on_output_terminated", frame)
             await self.push_frame(frame, direction)
-        elif isinstance(frame, StartInterruptionFrame):
+        elif isinstance(frame, InterruptionFrame):
             await self.push_frame(frame, direction)
             await self._handle_frame(frame)
-        elif isinstance(frame, TransportMessageUrgentFrame) or isinstance(
-            frame, InputTransportMessageUrgentFrame
-        ):
+        elif isinstance(frame, OutputTransportMessageUrgentFrame):
             await self.send_message(frame)
         elif isinstance(frame, OutputDTMFUrgentFrame):
             await self.write_dtmf(frame)
         elif isinstance(frame, SystemFrame):
             await self.push_frame(frame, direction)
-        # Control frames.
-        elif isinstance(frame, EndFrame):
-            await self.stop(frame)
-            # Keep pushing EndFrame down so all the pipeline stops nicely.
-            await self.push_frame(frame, direction)
-        elif isinstance(frame, MixerControlFrame):
-            await self._handle_frame(frame)
-        # Other frames.
-        elif isinstance(frame, OutputAudioRawFrame):
-            await self._handle_frame(frame)
-        elif isinstance(frame, (OutputImageRawFrame, SpriteFrame)):
-            await self._handle_frame(frame)
-        # TODO(aleix): Images and audio should support presentation timestamps.
-        elif frame.pts:
-            await self._handle_frame(frame)
         elif direction == FrameDirection.UPSTREAM:
             await self.push_frame(frame, direction)
         else:
```
```diff
@@ -341,7 +338,7 @@
 
         sender = self._media_senders[frame.transport_destination]
 
-        if isinstance(frame, StartInterruptionFrame):
+        if isinstance(frame, InterruptionFrame):
             await sender.handle_interruptions(frame)
         elif isinstance(frame, OutputAudioRawFrame):
             await sender.handle_audio_frame(frame)
```
```diff
@@ -407,6 +404,16 @@
 
             # Indicates if the bot is currently speaking.
             self._bot_speaking = False
+            # Last time a BotSpeakingFrame was pushed.
+            self._bot_speaking_frame_time = 0
+            # How often a BotSpeakingFrame should be pushed (value should be
+            # lower than the audio chunks).
+            self._bot_speaking_frame_period = 0.2
+            # Last time the bot actually spoke.
+            self._bot_speech_last_time = 0
+            # Before the first stop event, we use a shorter silence
+            # threshold to make the first turn more responsive.
+            self._first_stop_pending = True
 
             self._audio_task: Optional[asyncio.Task] = None
             self._video_task: Optional[asyncio.Task] = None
```
```diff
@@ -492,7 +499,7 @@
             await self._cancel_clock_task()
             await self._cancel_video_task()
 
-        async def handle_interruptions(self, _: StartInterruptionFrame):
+        async def handle_interruptions(self, _: InterruptionFrame):
             """Handle interruption events by restarting tasks and clearing buffers.
 
             Args:
```
```diff
@@ -598,39 +605,75 @@
 
         async def _bot_started_speaking(self):
             """Handle bot started speaking event."""
-            if not self._bot_speaking:
-                logger.debug(
-                    f"Bot{f' [{self._destination}]' if self._destination else ''} started speaking"
-                )
+            if self._bot_speaking:
+                return
 
-                downstream_frame = BotStartedSpeakingFrame()
-                downstream_frame.transport_destination = self._destination
-                upstream_frame = BotStartedSpeakingFrame()
-                upstream_frame.transport_destination = self._destination
-                await self._transport.push_frame(downstream_frame)
-                await self._transport.push_frame(upstream_frame, FrameDirection.UPSTREAM)
+            self._transport.logger.debug(
+                f"Bot{f' [{self._destination}]' if self._destination else ''} started speaking"
+            )
 
-                self._bot_speaking = True
+            downstream_frame = BotStartedSpeakingFrame()
+            downstream_frame.transport_destination = self._destination
+            upstream_frame = BotStartedSpeakingFrame()
+            upstream_frame.transport_destination = self._destination
+            await self._transport.push_frame(downstream_frame)
+            await self._transport.push_frame(upstream_frame, FrameDirection.UPSTREAM)
+
+            self._bot_speaking = True
 
         async def _bot_stopped_speaking(self):
             """Handle bot stopped speaking event."""
-            if self._bot_speaking:
-                logger.debug(
-                    f"Bot{f' [{self._destination}]' if self._destination else ''} stopped speaking"
-                )
+            if not self._bot_speaking:
+                return
+
+            self._transport.logger.debug(
+                f"Bot{f' [{self._destination}]' if self._destination else ''} stopped speaking"
+            )
+
+            downstream_frame = BotStoppedSpeakingFrame()
+            downstream_frame.transport_destination = self._destination
+            upstream_frame = BotStoppedSpeakingFrame()
+            upstream_frame.transport_destination = self._destination
+            await self._transport.push_frame(downstream_frame)
+            await self._transport.push_frame(upstream_frame, FrameDirection.UPSTREAM)
+
+            self._bot_speaking = False
 
-                downstream_frame = BotStoppedSpeakingFrame()
-                downstream_frame.transport_destination = self._destination
-                upstream_frame = BotStoppedSpeakingFrame()
-                upstream_frame.transport_destination = self._destination
-                await self._transport.push_frame(downstream_frame)
-                await self._transport.push_frame(upstream_frame, FrameDirection.UPSTREAM)
+            # Mark that the first stop has been completed so subsequent
+            # stops use the regular (longer) VAD stop threshold.
+            self._first_stop_pending = False
 
-                self._bot_speaking = False
+            # Clean audio buffer (there could be tiny left overs if not multiple
+            # to our output chunk size).
+            self._audio_buffer = bytearray()
+
+        async def _bot_currently_speaking(self):
+            """Handle bot speaking event."""
+            await self._bot_started_speaking()
 
-                # Clean audio buffer (there could be tiny left overs if not multiple
-                # to our output chunk size).
-                self._audio_buffer = bytearray()
+            diff_time = time.time() - self._bot_speaking_frame_time
+            if diff_time >= self._bot_speaking_frame_period:
+                await self._transport.push_frame(BotSpeakingFrame())
+                await self._transport.push_frame(BotSpeakingFrame(), FrameDirection.UPSTREAM)
+                self._bot_speaking_frame_time = time.time()
+
+            self._bot_speech_last_time = time.time()
+
+        async def _maybe_bot_currently_speaking(self, frame: SpeechOutputAudioRawFrame):
+            if not is_silence(frame.audio):
+                await self._bot_currently_speaking()
+            else:
+                silence_duration = time.time() - self._bot_speech_last_time
+                if silence_duration > BOT_VAD_STOP_SECS:
+                    await self._bot_stopped_speaking()
+
+        async def _handle_bot_speech(self, frame: Frame):
+            # TTS case.
+            if isinstance(frame, TTSAudioRawFrame):
+                await self._bot_currently_speaking()
+            # Speech stream case.
+            elif isinstance(frame, SpeechOutputAudioRawFrame):
+                await self._maybe_bot_currently_speaking(frame)
 
         async def _handle_frame(self, frame: Frame):
             """Handle various frame types with appropriate processing.
```
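This hunk replaces the old chunk-counting heuristic (removed in the last hunk below) with wall-clock state on the sender: `_bot_currently_speaking()` throttles `BotSpeakingFrame` emission to once per `_bot_speaking_frame_period` (0.2 s) regardless of chunk size, and `_maybe_bot_currently_speaking()` declares a stop after `BOT_VAD_STOP_SECS` of silence. The throttle in isolation, as a sketch:

```python
import time


class PeriodicEmitter:
    """Emit at most once per period, mirroring _bot_currently_speaking()."""

    def __init__(self, period: float = 0.2):
        self._period = period
        self._last_time = 0.0

    def should_emit(self) -> bool:
        # Wall-clock gate: independent of how often the caller invokes us.
        if time.time() - self._last_time >= self._period:
            self._last_time = time.time()
            return True
        return False


emitter = PeriodicEmitter()
# Called once per 10 ms audio chunk, this still fires only every ~200 ms.
```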
```diff
@@ -638,11 +681,13 @@
             Args:
                 frame: The frame to handle.
             """
-            if isinstance(frame, OutputImageRawFrame):
+            if isinstance(frame, OutputAudioRawFrame):
+                await self._handle_bot_speech(frame)
+            elif isinstance(frame, OutputImageRawFrame):
                 await self._set_video_image(frame)
             elif isinstance(frame, SpriteFrame):
                 await self._set_video_images(frame.images)
-            elif isinstance(frame, TransportMessageFrame):
+            elif isinstance(frame, OutputTransportMessageFrame):
                 await self._transport.send_message(frame)
             elif isinstance(frame, OutputDTMFFrame):
                 await self._transport.write_dtmf(frame)
```
```diff
@@ -657,10 +702,16 @@
             async def without_mixer(vad_stop_secs: float) -> AsyncGenerator[Frame, None]:
                 while True:
                     try:
-                        frame = await asyncio.wait_for(
-                            self._audio_queue.get(), timeout=vad_stop_secs
+                        # Use a shorter timeout only for the first bot stop to
+                        # accelerate the initial turn handoff right after the intro.
+                        timeout = (
+                            FIRST_BOT_VAD_STOP_SECS
+                            if getattr(self, "_first_stop_pending", True)
+                            else BOT_VAD_STOP_SECS
                         )
+                        frame = await asyncio.wait_for(self._audio_queue.get(), timeout=timeout)
                         yield frame
+                        self._audio_queue.task_done()
                     except asyncio.TimeoutError:
                         # Notify the bot stopped speaking upstream if necessary.
                         await self._bot_stopped_speaking()
```
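In `without_mixer()` the stop signal falls out of the queue timeout itself: if no audio arrives within the selected threshold, `asyncio.wait_for()` raises `TimeoutError` and the sender reports that the bot stopped speaking. A minimal self-contained illustration of the pattern:

```python
import asyncio


async def drain(queue: asyncio.Queue, stop_secs: float) -> None:
    # Mirrors without_mixer(): an idle queue is read as trailing silence.
    while True:
        try:
            frame = await asyncio.wait_for(queue.get(), timeout=stop_secs)
            print("frame:", frame)
            queue.task_done()
        except asyncio.TimeoutError:
            print("bot stopped speaking")
            break


async def main() -> None:
    queue: asyncio.Queue = asyncio.Queue()
    await queue.put(b"\x00" * 320)  # one 10 ms chunk, then silence
    await drain(queue, stop_secs=0.3)


asyncio.run(main())
```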
```diff
@@ -673,12 +724,19 @@
                         frame = self._audio_queue.get_nowait()
                         if isinstance(frame, OutputAudioRawFrame):
                             frame.audio = await self._mixer.mix(frame.audio)
-                            last_frame_time = time.time()
+                        last_frame_time = time.time()
                         yield frame
+                        self._audio_queue.task_done()
                     except asyncio.QueueEmpty:
                         # Notify the bot stopped speaking upstream if necessary.
                         diff_time = time.time() - last_frame_time
-                        if diff_time > vad_stop_secs:
+                        # Use a shorter threshold for the first stop only.
+                        current_stop_secs = (
+                            FIRST_BOT_VAD_STOP_SECS
+                            if getattr(self, "_first_stop_pending", True)
+                            else BOT_VAD_STOP_SECS
+                        )
+                        if diff_time > current_stop_secs:
                             await self._bot_stopped_speaking()
                         # Generate an audio frame with only the mixer's part.
                         frame = OutputAudioRawFrame(
```
```diff
@@ -700,39 +758,7 @@
 
         async def _audio_task_handler(self):
             """Main audio processing task handler."""
-            # Push a BotSpeakingFrame every 200ms, we don't really need to push it
-            # at every audio chunk. If the audio chunk is bigger than 200ms, push at
-            # every audio chunk.
-            TOTAL_CHUNK_MS = self._params.audio_out_10ms_chunks * 10
-            BOT_SPEAKING_CHUNK_PERIOD = max(int(200 / TOTAL_CHUNK_MS), 1)
-            bot_speaking_counter = 0
-            speech_last_speaking_time = 0
-
             async for frame in self._next_frame():
-                # Notify the bot started speaking upstream if necessary and that
-                # it's actually speaking.
-                is_speaking = False
-                if isinstance(frame, TTSAudioRawFrame):
-                    is_speaking = True
-                elif isinstance(frame, SpeechOutputAudioRawFrame):
-                    if not is_silence(frame.audio):
-                        is_speaking = True
-                        speech_last_speaking_time = time.time()
-                    else:
-                        silence_duration = time.time() - speech_last_speaking_time
-                        if silence_duration > BOT_VAD_STOP_SECS:
-                            await self._bot_stopped_speaking()
-
-                if is_speaking:
-                    await self._bot_started_speaking()
-                    if bot_speaking_counter % BOT_SPEAKING_CHUNK_PERIOD == 0:
-                        await self._transport.push_frame(BotSpeakingFrame())
-                        await self._transport.push_frame(
-                            BotSpeakingFrame(), FrameDirection.UPSTREAM
-                        )
-                        bot_speaking_counter = 0
-                    bot_speaking_counter += 1
-
                 # No need to push EndFrame, it's pushed from process_frame().
                 if isinstance(frame, EndFrame):
                     break
@@ -740,12 +766,22 @@
                 # Handle frame.
                 await self._handle_frame(frame)
 
-                #
-                # Send audio.
-                #
-                if isinstance(frame, OutputAudioRawFrame):
-                    await self._transport.write_audio_frame(frame)
-                await self._transport.push_frame(frame)
+                # If we are not able to write to the transport we shouldn't
+                # push downstream.
+                push_downstream = True
+
+                # Try to send audio to the transport.
+                try:
+                    if isinstance(frame, OutputAudioRawFrame):
+                        push_downstream = await self._transport.write_audio_frame(frame)
+                except Exception as e:
+                    logger.error(f"{self} Error writing {frame} to transport: {e}")
+                    push_downstream = False
+
+                # If we were able to send to the transport, push the frame
+                # downstream in case anyone else needs it.
+                if push_downstream:
+                    await self._transport.push_frame(frame)
 
         #
         # Video handling
```