dv-pipecat-ai 0.0.82.dev857__py3-none-any.whl → 0.0.85.dev837__py3-none-any.whl

This diff covers publicly available package versions released to one of the supported registries. It is provided for informational purposes only and reflects the changes between the two versions as they appear in their respective public registries.

Potentially problematic release: the registry has flagged this version of dv-pipecat-ai as possibly problematic.
Files changed (195):
  1. {dv_pipecat_ai-0.0.82.dev857.dist-info → dv_pipecat_ai-0.0.85.dev837.dist-info}/METADATA +98 -130
  2. {dv_pipecat_ai-0.0.82.dev857.dist-info → dv_pipecat_ai-0.0.85.dev837.dist-info}/RECORD +192 -140
  3. pipecat/adapters/base_llm_adapter.py +38 -1
  4. pipecat/adapters/services/anthropic_adapter.py +9 -14
  5. pipecat/adapters/services/aws_nova_sonic_adapter.py +120 -5
  6. pipecat/adapters/services/bedrock_adapter.py +236 -13
  7. pipecat/adapters/services/gemini_adapter.py +12 -8
  8. pipecat/adapters/services/open_ai_adapter.py +19 -7
  9. pipecat/adapters/services/open_ai_realtime_adapter.py +5 -0
  10. pipecat/audio/dtmf/dtmf-0.wav +0 -0
  11. pipecat/audio/dtmf/dtmf-1.wav +0 -0
  12. pipecat/audio/dtmf/dtmf-2.wav +0 -0
  13. pipecat/audio/dtmf/dtmf-3.wav +0 -0
  14. pipecat/audio/dtmf/dtmf-4.wav +0 -0
  15. pipecat/audio/dtmf/dtmf-5.wav +0 -0
  16. pipecat/audio/dtmf/dtmf-6.wav +0 -0
  17. pipecat/audio/dtmf/dtmf-7.wav +0 -0
  18. pipecat/audio/dtmf/dtmf-8.wav +0 -0
  19. pipecat/audio/dtmf/dtmf-9.wav +0 -0
  20. pipecat/audio/dtmf/dtmf-pound.wav +0 -0
  21. pipecat/audio/dtmf/dtmf-star.wav +0 -0
  22. pipecat/audio/filters/krisp_viva_filter.py +193 -0
  23. pipecat/audio/filters/noisereduce_filter.py +15 -0
  24. pipecat/audio/turn/base_turn_analyzer.py +9 -1
  25. pipecat/audio/turn/smart_turn/base_smart_turn.py +14 -8
  26. pipecat/audio/turn/smart_turn/data/__init__.py +0 -0
  27. pipecat/audio/turn/smart_turn/data/smart-turn-v3.0.onnx +0 -0
  28. pipecat/audio/turn/smart_turn/http_smart_turn.py +6 -2
  29. pipecat/audio/turn/smart_turn/local_smart_turn.py +1 -1
  30. pipecat/audio/turn/smart_turn/local_smart_turn_v2.py +1 -1
  31. pipecat/audio/turn/smart_turn/local_smart_turn_v3.py +124 -0
  32. pipecat/audio/vad/data/README.md +10 -0
  33. pipecat/audio/vad/data/silero_vad_v2.onnx +0 -0
  34. pipecat/audio/vad/silero.py +9 -3
  35. pipecat/audio/vad/vad_analyzer.py +13 -1
  36. pipecat/extensions/voicemail/voicemail_detector.py +5 -5
  37. pipecat/frames/frames.py +277 -86
  38. pipecat/observers/loggers/debug_log_observer.py +3 -3
  39. pipecat/observers/loggers/llm_log_observer.py +7 -3
  40. pipecat/observers/loggers/user_bot_latency_log_observer.py +22 -10
  41. pipecat/pipeline/runner.py +18 -6
  42. pipecat/pipeline/service_switcher.py +64 -36
  43. pipecat/pipeline/task.py +125 -79
  44. pipecat/pipeline/tts_switcher.py +30 -0
  45. pipecat/processors/aggregators/dtmf_aggregator.py +2 -3
  46. pipecat/processors/aggregators/{gated_openai_llm_context.py → gated_llm_context.py} +9 -9
  47. pipecat/processors/aggregators/gated_open_ai_llm_context.py +12 -0
  48. pipecat/processors/aggregators/llm_context.py +40 -2
  49. pipecat/processors/aggregators/llm_response.py +32 -15
  50. pipecat/processors/aggregators/llm_response_universal.py +19 -15
  51. pipecat/processors/aggregators/user_response.py +6 -6
  52. pipecat/processors/aggregators/vision_image_frame.py +24 -2
  53. pipecat/processors/audio/audio_buffer_processor.py +43 -8
  54. pipecat/processors/dtmf_aggregator.py +174 -77
  55. pipecat/processors/filters/stt_mute_filter.py +17 -0
  56. pipecat/processors/frame_processor.py +110 -24
  57. pipecat/processors/frameworks/langchain.py +8 -2
  58. pipecat/processors/frameworks/rtvi.py +210 -68
  59. pipecat/processors/frameworks/strands_agents.py +170 -0
  60. pipecat/processors/logger.py +2 -2
  61. pipecat/processors/transcript_processor.py +26 -5
  62. pipecat/processors/user_idle_processor.py +35 -11
  63. pipecat/runner/daily.py +59 -20
  64. pipecat/runner/run.py +395 -93
  65. pipecat/runner/types.py +6 -4
  66. pipecat/runner/utils.py +51 -10
  67. pipecat/serializers/__init__.py +5 -1
  68. pipecat/serializers/asterisk.py +16 -2
  69. pipecat/serializers/convox.py +41 -4
  70. pipecat/serializers/custom.py +257 -0
  71. pipecat/serializers/exotel.py +5 -5
  72. pipecat/serializers/livekit.py +20 -0
  73. pipecat/serializers/plivo.py +5 -5
  74. pipecat/serializers/protobuf.py +6 -5
  75. pipecat/serializers/telnyx.py +2 -2
  76. pipecat/serializers/twilio.py +43 -23
  77. pipecat/serializers/vi.py +324 -0
  78. pipecat/services/ai_service.py +2 -6
  79. pipecat/services/anthropic/llm.py +2 -25
  80. pipecat/services/assemblyai/models.py +6 -0
  81. pipecat/services/assemblyai/stt.py +13 -5
  82. pipecat/services/asyncai/tts.py +5 -3
  83. pipecat/services/aws/__init__.py +1 -0
  84. pipecat/services/aws/llm.py +147 -105
  85. pipecat/services/aws/nova_sonic/__init__.py +0 -0
  86. pipecat/services/aws/nova_sonic/context.py +436 -0
  87. pipecat/services/aws/nova_sonic/frames.py +25 -0
  88. pipecat/services/aws/nova_sonic/llm.py +1265 -0
  89. pipecat/services/aws/stt.py +3 -3
  90. pipecat/services/aws_nova_sonic/__init__.py +19 -1
  91. pipecat/services/aws_nova_sonic/aws.py +11 -1151
  92. pipecat/services/aws_nova_sonic/context.py +8 -354
  93. pipecat/services/aws_nova_sonic/frames.py +13 -17
  94. pipecat/services/azure/llm.py +51 -1
  95. pipecat/services/azure/realtime/__init__.py +0 -0
  96. pipecat/services/azure/realtime/llm.py +65 -0
  97. pipecat/services/azure/stt.py +15 -0
  98. pipecat/services/cartesia/stt.py +77 -70
  99. pipecat/services/cartesia/tts.py +80 -13
  100. pipecat/services/deepgram/__init__.py +1 -0
  101. pipecat/services/deepgram/flux/__init__.py +0 -0
  102. pipecat/services/deepgram/flux/stt.py +640 -0
  103. pipecat/services/elevenlabs/__init__.py +4 -1
  104. pipecat/services/elevenlabs/stt.py +339 -0
  105. pipecat/services/elevenlabs/tts.py +87 -46
  106. pipecat/services/fish/tts.py +5 -2
  107. pipecat/services/gemini_multimodal_live/events.py +38 -524
  108. pipecat/services/gemini_multimodal_live/file_api.py +23 -173
  109. pipecat/services/gemini_multimodal_live/gemini.py +41 -1403
  110. pipecat/services/gladia/stt.py +56 -72
  111. pipecat/services/google/__init__.py +1 -0
  112. pipecat/services/google/gemini_live/__init__.py +3 -0
  113. pipecat/services/google/gemini_live/file_api.py +189 -0
  114. pipecat/services/google/gemini_live/llm.py +1582 -0
  115. pipecat/services/google/gemini_live/llm_vertex.py +184 -0
  116. pipecat/services/google/llm.py +15 -11
  117. pipecat/services/google/llm_openai.py +3 -3
  118. pipecat/services/google/llm_vertex.py +86 -16
  119. pipecat/services/google/stt.py +4 -0
  120. pipecat/services/google/tts.py +7 -3
  121. pipecat/services/heygen/api.py +2 -0
  122. pipecat/services/heygen/client.py +8 -4
  123. pipecat/services/heygen/video.py +2 -0
  124. pipecat/services/hume/__init__.py +5 -0
  125. pipecat/services/hume/tts.py +220 -0
  126. pipecat/services/inworld/tts.py +6 -6
  127. pipecat/services/llm_service.py +15 -5
  128. pipecat/services/lmnt/tts.py +4 -2
  129. pipecat/services/mcp_service.py +4 -2
  130. pipecat/services/mem0/memory.py +6 -5
  131. pipecat/services/mistral/llm.py +29 -8
  132. pipecat/services/moondream/vision.py +42 -16
  133. pipecat/services/neuphonic/tts.py +5 -2
  134. pipecat/services/openai/__init__.py +1 -0
  135. pipecat/services/openai/base_llm.py +27 -20
  136. pipecat/services/openai/realtime/__init__.py +0 -0
  137. pipecat/services/openai/realtime/context.py +272 -0
  138. pipecat/services/openai/realtime/events.py +1106 -0
  139. pipecat/services/openai/realtime/frames.py +37 -0
  140. pipecat/services/openai/realtime/llm.py +829 -0
  141. pipecat/services/openai/tts.py +49 -10
  142. pipecat/services/openai_realtime/__init__.py +27 -0
  143. pipecat/services/openai_realtime/azure.py +21 -0
  144. pipecat/services/openai_realtime/context.py +21 -0
  145. pipecat/services/openai_realtime/events.py +21 -0
  146. pipecat/services/openai_realtime/frames.py +21 -0
  147. pipecat/services/openai_realtime_beta/azure.py +16 -0
  148. pipecat/services/openai_realtime_beta/openai.py +17 -5
  149. pipecat/services/piper/tts.py +7 -9
  150. pipecat/services/playht/tts.py +34 -4
  151. pipecat/services/rime/tts.py +12 -12
  152. pipecat/services/riva/stt.py +3 -1
  153. pipecat/services/salesforce/__init__.py +9 -0
  154. pipecat/services/salesforce/llm.py +700 -0
  155. pipecat/services/sarvam/__init__.py +7 -0
  156. pipecat/services/sarvam/stt.py +540 -0
  157. pipecat/services/sarvam/tts.py +97 -13
  158. pipecat/services/simli/video.py +2 -2
  159. pipecat/services/speechmatics/stt.py +22 -10
  160. pipecat/services/stt_service.py +47 -0
  161. pipecat/services/tavus/video.py +2 -2
  162. pipecat/services/tts_service.py +75 -22
  163. pipecat/services/vision_service.py +7 -6
  164. pipecat/services/vistaar/llm.py +51 -9
  165. pipecat/tests/utils.py +4 -4
  166. pipecat/transcriptions/language.py +41 -1
  167. pipecat/transports/base_input.py +13 -34
  168. pipecat/transports/base_output.py +140 -104
  169. pipecat/transports/daily/transport.py +199 -26
  170. pipecat/transports/heygen/__init__.py +0 -0
  171. pipecat/transports/heygen/transport.py +381 -0
  172. pipecat/transports/livekit/transport.py +228 -63
  173. pipecat/transports/local/audio.py +6 -1
  174. pipecat/transports/local/tk.py +11 -2
  175. pipecat/transports/network/fastapi_websocket.py +1 -1
  176. pipecat/transports/smallwebrtc/connection.py +103 -19
  177. pipecat/transports/smallwebrtc/request_handler.py +246 -0
  178. pipecat/transports/smallwebrtc/transport.py +65 -23
  179. pipecat/transports/tavus/transport.py +23 -12
  180. pipecat/transports/websocket/client.py +41 -5
  181. pipecat/transports/websocket/fastapi.py +21 -11
  182. pipecat/transports/websocket/server.py +14 -7
  183. pipecat/transports/whatsapp/api.py +8 -0
  184. pipecat/transports/whatsapp/client.py +47 -0
  185. pipecat/utils/base_object.py +54 -22
  186. pipecat/utils/redis.py +58 -0
  187. pipecat/utils/string.py +13 -1
  188. pipecat/utils/tracing/service_decorators.py +21 -21
  189. pipecat/serializers/genesys.py +0 -95
  190. pipecat/services/google/test-google-chirp.py +0 -45
  191. pipecat/services/openai.py +0 -698
  192. {dv_pipecat_ai-0.0.82.dev857.dist-info → dv_pipecat_ai-0.0.85.dev837.dist-info}/WHEEL +0 -0
  193. {dv_pipecat_ai-0.0.82.dev857.dist-info → dv_pipecat_ai-0.0.85.dev837.dist-info}/licenses/LICENSE +0 -0
  194. {dv_pipecat_ai-0.0.82.dev857.dist-info → dv_pipecat_ai-0.0.85.dev837.dist-info}/top_level.txt +0 -0
  195. /pipecat/services/{aws_nova_sonic → aws/nova_sonic}/ready.wav +0 -0
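The diff below covers pipecat/transports/base_output.py (entry 168 in the list above):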
@@ -29,20 +29,19 @@ from pipecat.frames.frames import (
     CancelFrame,
     EndFrame,
     Frame,
-    InputTransportMessageUrgentFrame,
+    InterruptionFrame,
     MixerControlFrame,
     OutputAudioRawFrame,
     OutputDTMFFrame,
     OutputDTMFUrgentFrame,
     OutputImageRawFrame,
+    OutputTransportMessageFrame,
+    OutputTransportMessageUrgentFrame,
     OutputTransportReadyFrame,
     SpeechOutputAudioRawFrame,
     SpriteFrame,
     StartFrame,
-    StartInterruptionFrame,
     SystemFrame,
-    TransportMessageFrame,
-    TransportMessageUrgentFrame,
     TTSAudioRawFrame,
 )
 from pipecat.processors.frame_processor import FrameDirection, FrameProcessor
@@ -51,6 +50,11 @@ from pipecat.utils.time import nanoseconds_to_seconds

 # TODO: When we use GeminiMultimodalLiveLLMService, we need to change this to 0.35 but that creates issue for faster TTS.
 BOT_VAD_STOP_SECS = 0.30
+# For the very first bot utterance (e.g., intro), we can safely
+# detect end-of-speech sooner to improve responsiveness for the
+# user's first short reply. Keep conservative to avoid mid-utterance
+# false stops when TTS streams quickly.
+FIRST_BOT_VAD_STOP_SECS = 0.08


 class BaseOutputTransport(FrameProcessor):
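Both thresholds are read at call time from module scope (see the without_mixer/with_mixer hunks further below), so they can be inspected or tuned before a pipeline is built. A minimal sketch, assuming only that the module path matches entry 168 above:

    from pipecat.transports import base_output

    # Read the defaults shipped in this release.
    assert base_output.BOT_VAD_STOP_SECS == 0.30
    assert base_output.FIRST_BOT_VAD_STOP_SECS == 0.08

    # Hypothetical tuning, e.g. the 0.35 value mentioned in the TODO above.
    base_output.BOT_VAD_STOP_SECS = 0.35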
@@ -85,6 +89,7 @@ class BaseOutputTransport(FrameProcessor):
         # us to send multiple streams at the same time if the transport allows
         # it.
         self._media_senders: Dict[Any, "BaseOutputTransport.MediaSender"] = {}
+        self._register_event_handler("on_output_terminated")

     @property
     def sample_rate(self) -> int:
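The newly registered on_output_terminated event fires from process_frame() when an EndFrame or CancelFrame arrives (see that hunk below). A minimal subscription sketch, assuming pipecat's usual event_handler decorator and a transport whose output() is a BaseOutputTransport:

    @transport.output().event_handler("on_output_terminated")
    async def on_output_terminated(output, frame):
        # `frame` is the EndFrame or CancelFrame that shut the output down.
        print(f"Output terminated by {frame.name}")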
@@ -179,7 +184,9 @@
         # Sending a frame indicating that the output transport is ready and able to receive frames.
         await self.push_frame(OutputTransportReadyFrame(), FrameDirection.UPSTREAM)

-    async def send_message(self, frame: TransportMessageFrame | TransportMessageUrgentFrame):
+    async def send_message(
+        self, frame: OutputTransportMessageFrame | OutputTransportMessageUrgentFrame
+    ):
         """Send a transport message.

         Args:
@@ -203,21 +210,27 @@
         """
         pass

-    async def write_video_frame(self, frame: OutputImageRawFrame):
+    async def write_video_frame(self, frame: OutputImageRawFrame) -> bool:
         """Write a video frame to the transport.

         Args:
             frame: The output video frame to write.
+
+        Returns:
+            True if the video frame was written successfully, False otherwise.
         """
-        pass
+        return False

-    async def write_audio_frame(self, frame: OutputAudioRawFrame):
+    async def write_audio_frame(self, frame: OutputAudioRawFrame) -> bool:
         """Write an audio frame to the transport.

         Args:
             frame: The output audio frame to write.
+
+        Returns:
+            True if the audio frame was written successfully, False otherwise.
         """
-        pass
+        return False

     async def write_dtmf(self, frame: OutputDTMFFrame | OutputDTMFUrgentFrame):
         """Write a DTMF tone using the transport's preferred method.
@@ -287,45 +300,29 @@
         """
         await super().process_frame(frame, direction)

-        #
-        # System frames (like StartInterruptionFrame) are pushed
-        # immediately. Other frames require order so they are put in the sink
-        # queue.
-        #
         if isinstance(frame, StartFrame):
             # Push StartFrame before start(), because we want StartFrame to be
             # processed by every processor before any other frame is processed.
             await self.push_frame(frame, direction)
             await self.start(frame)
+        elif isinstance(frame, EndFrame):
+            await self.stop(frame)
+            await self._call_event_handler("on_output_terminated", frame)
+            # Keep pushing EndFrame down so all the pipeline stops nicely.
+            await self.push_frame(frame, direction)
         elif isinstance(frame, CancelFrame):
             await self.cancel(frame)
+            await self._call_event_handler("on_output_terminated", frame)
             await self.push_frame(frame, direction)
-        elif isinstance(frame, StartInterruptionFrame):
+        elif isinstance(frame, InterruptionFrame):
             await self.push_frame(frame, direction)
             await self._handle_frame(frame)
-        elif isinstance(frame, TransportMessageUrgentFrame) and not isinstance(
-            frame, InputTransportMessageUrgentFrame
-        ):
+        elif isinstance(frame, OutputTransportMessageUrgentFrame):
             await self.send_message(frame)
         elif isinstance(frame, OutputDTMFUrgentFrame):
             await self.write_dtmf(frame)
         elif isinstance(frame, SystemFrame):
             await self.push_frame(frame, direction)
-        # Control frames.
-        elif isinstance(frame, EndFrame):
-            await self.stop(frame)
-            # Keep pushing EndFrame down so all the pipeline stops nicely.
-            await self.push_frame(frame, direction)
-        elif isinstance(frame, MixerControlFrame):
-            await self._handle_frame(frame)
-        # Other frames.
-        elif isinstance(frame, OutputAudioRawFrame):
-            await self._handle_frame(frame)
-        elif isinstance(frame, (OutputImageRawFrame, SpriteFrame)):
-            await self._handle_frame(frame)
-        # TODO(aleix): Images and audio should support presentation timestamps.
-        elif frame.pts:
-            await self._handle_frame(frame)
         elif direction == FrameDirection.UPSTREAM:
             await self.push_frame(frame, direction)
         else:
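With the frame split, process_frame() routes only OutputTransportMessageUrgentFrame to send_message() immediately, so the old isinstance exclusion for input-side urgent messages is no longer needed. A usage sketch, assuming the new frame keeps the message payload field of the old TransportMessageUrgentFrame:

    frame = OutputTransportMessageUrgentFrame(message={"type": "custom-event"})
    await task.queue_frame(frame)  # `task` is a PipelineTask; sent out of band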
@@ -341,7 +338,7 @@

         sender = self._media_senders[frame.transport_destination]

-        if isinstance(frame, StartInterruptionFrame):
+        if isinstance(frame, InterruptionFrame):
             await sender.handle_interruptions(frame)
         elif isinstance(frame, OutputAudioRawFrame):
             await sender.handle_audio_frame(frame)
@@ -407,6 +404,16 @@

             # Indicates if the bot is currently speaking.
             self._bot_speaking = False
+            # Last time a BotSpeakingFrame was pushed.
+            self._bot_speaking_frame_time = 0
+            # How often a BotSpeakingFrame should be pushed (value should be
+            # lower than the audio chunks).
+            self._bot_speaking_frame_period = 0.2
+            # Last time the bot actually spoke.
+            self._bot_speech_last_time = 0
+            # Before the first stop event, we use a shorter silence
+            # threshold to make the first turn more responsive.
+            self._first_stop_pending = True

             self._audio_task: Optional[asyncio.Task] = None
             self._video_task: Optional[asyncio.Task] = None
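The new _bot_speaking_frame_period replaces the counter-based throttle removed from _audio_task_handler further below. A quick worked check that the cadence is unchanged, assuming the common 20 ms output chunk size (audio_out_10ms_chunks = 2):

    TOTAL_CHUNK_MS = 2 * 10  # 20 ms per output chunk
    BOT_SPEAKING_CHUNK_PERIOD = max(int(200 / TOTAL_CHUNK_MS), 1)  # old logic: every 10 chunks
    assert BOT_SPEAKING_CHUNK_PERIOD * TOTAL_CHUNK_MS / 1000 == 0.2  # == _bot_speaking_frame_period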
@@ -492,7 +499,7 @@
             await self._cancel_clock_task()
             await self._cancel_video_task()

-        async def handle_interruptions(self, _: StartInterruptionFrame):
+        async def handle_interruptions(self, _: InterruptionFrame):
             """Handle interruption events by restarting tasks and clearing buffers.

             Args:
@@ -598,39 +605,75 @@

         async def _bot_started_speaking(self):
             """Handle bot started speaking event."""
-            if not self._bot_speaking:
-                self._transport.logger.debug(
-                    f"Bot{f' [{self._destination}]' if self._destination else ''} started speaking"
-                )
+            if self._bot_speaking:
+                return

-                downstream_frame = BotStartedSpeakingFrame()
-                downstream_frame.transport_destination = self._destination
-                upstream_frame = BotStartedSpeakingFrame()
-                upstream_frame.transport_destination = self._destination
-                await self._transport.push_frame(downstream_frame)
-                await self._transport.push_frame(upstream_frame, FrameDirection.UPSTREAM)
+            self._transport.logger.debug(
+                f"Bot{f' [{self._destination}]' if self._destination else ''} started speaking"
+            )

-                self._bot_speaking = True
+            downstream_frame = BotStartedSpeakingFrame()
+            downstream_frame.transport_destination = self._destination
+            upstream_frame = BotStartedSpeakingFrame()
+            upstream_frame.transport_destination = self._destination
+            await self._transport.push_frame(downstream_frame)
+            await self._transport.push_frame(upstream_frame, FrameDirection.UPSTREAM)
+
+            self._bot_speaking = True

         async def _bot_stopped_speaking(self):
             """Handle bot stopped speaking event."""
-            if self._bot_speaking:
-                self._transport.logger.debug(
-                    f"Bot{f' [{self._destination}]' if self._destination else ''} stopped speaking"
-                )
+            if not self._bot_speaking:
+                return
+
+            self._transport.logger.debug(
+                f"Bot{f' [{self._destination}]' if self._destination else ''} stopped speaking"
+            )
+
+            downstream_frame = BotStoppedSpeakingFrame()
+            downstream_frame.transport_destination = self._destination
+            upstream_frame = BotStoppedSpeakingFrame()
+            upstream_frame.transport_destination = self._destination
+            await self._transport.push_frame(downstream_frame)
+            await self._transport.push_frame(upstream_frame, FrameDirection.UPSTREAM)
+
+            self._bot_speaking = False

-                downstream_frame = BotStoppedSpeakingFrame()
-                downstream_frame.transport_destination = self._destination
-                upstream_frame = BotStoppedSpeakingFrame()
-                upstream_frame.transport_destination = self._destination
-                await self._transport.push_frame(downstream_frame)
-                await self._transport.push_frame(upstream_frame, FrameDirection.UPSTREAM)
+            # Mark that the first stop has been completed so subsequent
+            # stops use the regular (longer) VAD stop threshold.
+            self._first_stop_pending = False

-                self._bot_speaking = False
+            # Clean audio buffer (there could be tiny left overs if not multiple
+            # to our output chunk size).
+            self._audio_buffer = bytearray()
+
+        async def _bot_currently_speaking(self):
+            """Handle bot speaking event."""
+            await self._bot_started_speaking()

-                # Clean audio buffer (there could be tiny left overs if not multiple
-                # to our output chunk size).
-                self._audio_buffer = bytearray()
+            diff_time = time.time() - self._bot_speaking_frame_time
+            if diff_time >= self._bot_speaking_frame_period:
+                await self._transport.push_frame(BotSpeakingFrame())
+                await self._transport.push_frame(BotSpeakingFrame(), FrameDirection.UPSTREAM)
+                self._bot_speaking_frame_time = time.time()
+
+            self._bot_speech_last_time = time.time()
+
+        async def _maybe_bot_currently_speaking(self, frame: SpeechOutputAudioRawFrame):
+            if not is_silence(frame.audio):
+                await self._bot_currently_speaking()
+            else:
+                silence_duration = time.time() - self._bot_speech_last_time
+                if silence_duration > BOT_VAD_STOP_SECS:
+                    await self._bot_stopped_speaking()
+
+        async def _handle_bot_speech(self, frame: Frame):
+            # TTS case.
+            if isinstance(frame, TTSAudioRawFrame):
+                await self._bot_currently_speaking()
+            # Speech stream case.
+            elif isinstance(frame, SpeechOutputAudioRawFrame):
+                await self._maybe_bot_currently_speaking(frame)

         async def _handle_frame(self, frame: Frame):
             """Handle various frame types with appropriate processing.
@@ -638,11 +681,13 @@
             Args:
                 frame: The frame to handle.
             """
-            if isinstance(frame, OutputImageRawFrame):
+            if isinstance(frame, OutputAudioRawFrame):
+                await self._handle_bot_speech(frame)
+            elif isinstance(frame, OutputImageRawFrame):
                 await self._set_video_image(frame)
             elif isinstance(frame, SpriteFrame):
                 await self._set_video_images(frame.images)
-            elif isinstance(frame, TransportMessageFrame):
+            elif isinstance(frame, OutputTransportMessageFrame):
                 await self._transport.send_message(frame)
             elif isinstance(frame, OutputDTMFFrame):
                 await self._transport.write_dtmf(frame)
@@ -657,10 +702,16 @@
            async def without_mixer(vad_stop_secs: float) -> AsyncGenerator[Frame, None]:
                while True:
                    try:
-                        frame = await asyncio.wait_for(
-                            self._audio_queue.get(), timeout=vad_stop_secs
+                        # Use a shorter timeout only for the first bot stop to
+                        # accelerate the initial turn handoff right after the intro.
+                        timeout = (
+                            FIRST_BOT_VAD_STOP_SECS
+                            if getattr(self, "_first_stop_pending", True)
+                            else BOT_VAD_STOP_SECS
                        )
+                        frame = await asyncio.wait_for(self._audio_queue.get(), timeout=timeout)
                        yield frame
+                        self._audio_queue.task_done()
                    except asyncio.TimeoutError:
                        # Notify the bot stopped speaking upstream if necessary.
                        await self._bot_stopped_speaking()
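The timeout on the queue read doubles as the bot's end-of-speech detector: when no audio frame arrives within the threshold, the TimeoutError branch declares the bot silent. A self-contained illustration:

    import asyncio

    async def main() -> None:
        queue: asyncio.Queue = asyncio.Queue()  # nothing is ever enqueued
        try:
            await asyncio.wait_for(queue.get(), timeout=0.08)  # FIRST_BOT_VAD_STOP_SECS
        except asyncio.TimeoutError:
            print("bot stopped speaking")  # fires after 80 ms without frames

    asyncio.run(main())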
@@ -673,12 +724,19 @@
                        frame = self._audio_queue.get_nowait()
                        if isinstance(frame, OutputAudioRawFrame):
                            frame.audio = await self._mixer.mix(frame.audio)
-                            last_frame_time = time.time()
+                        last_frame_time = time.time()
                        yield frame
+                        self._audio_queue.task_done()
                    except asyncio.QueueEmpty:
                        # Notify the bot stopped speaking upstream if necessary.
                        diff_time = time.time() - last_frame_time
-                        if diff_time > vad_stop_secs:
+                        # Use a shorter threshold for the first stop only.
+                        current_stop_secs = (
+                            FIRST_BOT_VAD_STOP_SECS
+                            if getattr(self, "_first_stop_pending", True)
+                            else BOT_VAD_STOP_SECS
+                        )
+                        if diff_time > current_stop_secs:
                            await self._bot_stopped_speaking()
                            # Generate an audio frame with only the mixer's part.
                            frame = OutputAudioRawFrame(
@@ -700,39 +758,7 @@

         async def _audio_task_handler(self):
             """Main audio processing task handler."""
-            # Push a BotSpeakingFrame every 200ms, we don't really need to push it
-            # at every audio chunk. If the audio chunk is bigger than 200ms, push at
-            # every audio chunk.
-            TOTAL_CHUNK_MS = self._params.audio_out_10ms_chunks * 10
-            BOT_SPEAKING_CHUNK_PERIOD = max(int(200 / TOTAL_CHUNK_MS), 1)
-            bot_speaking_counter = 0
-            speech_last_speaking_time = 0
-
             async for frame in self._next_frame():
-                # Notify the bot started speaking upstream if necessary and that
-                # it's actually speaking.
-                is_speaking = False
-                if isinstance(frame, TTSAudioRawFrame):
-                    is_speaking = True
-                elif isinstance(frame, SpeechOutputAudioRawFrame):
-                    if not is_silence(frame.audio):
-                        is_speaking = True
-                        speech_last_speaking_time = time.time()
-                    else:
-                        silence_duration = time.time() - speech_last_speaking_time
-                        if silence_duration > BOT_VAD_STOP_SECS:
-                            await self._bot_stopped_speaking()
-
-                if is_speaking:
-                    await self._bot_started_speaking()
-                    if bot_speaking_counter % BOT_SPEAKING_CHUNK_PERIOD == 0:
-                        await self._transport.push_frame(BotSpeakingFrame())
-                        await self._transport.push_frame(
-                            BotSpeakingFrame(), FrameDirection.UPSTREAM
-                        )
-                        bot_speaking_counter = 0
-                    bot_speaking_counter += 1
-
                 # No need to push EndFrame, it's pushed from process_frame().
                 if isinstance(frame, EndFrame):
                     break
@@ -740,12 +766,22 @@
                 # Handle frame.
                 await self._handle_frame(frame)

-                # Also, push frame downstream in case anyone else needs it.
-                await self._transport.push_frame(frame)
-
-                # Send audio.
-                if isinstance(frame, OutputAudioRawFrame):
-                    await self._transport.write_audio_frame(frame)
+                # If we are not able to write to the transport we shouldn't
+                # push downstream.
+                push_downstream = True
+
+                # Try to send audio to the transport.
+                try:
+                    if isinstance(frame, OutputAudioRawFrame):
+                        push_downstream = await self._transport.write_audio_frame(frame)
+                except Exception as e:
+                    logger.error(f"{self} Error writing {frame} to transport: {e}")
+                    push_downstream = False
+
+                # If we were able to send to the transport, push the frame
+                # downstream in case anyone else needs it.
+                if push_downstream:
+                    await self._transport.push_frame(frame)

         #
         # Video handling
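The try/except around write_audio_frame() keeps a transport failure from killing the audio task, and the push_downstream flag keeps undelivered audio from propagating. A self-contained toy version of the same loop shape, with a hypothetical flaky sender:

    import asyncio
    import random

    async def unreliable_send(item: str) -> bool:
        # Hypothetical flaky I/O: fails roughly half of the time.
        if random.random() < 0.5:
            raise ConnectionError("wire dropped")
        return True

    async def drain(queue: asyncio.Queue) -> None:
        while not queue.empty():
            item = await queue.get()
            try:
                ok = await unreliable_send(item)
            except Exception as e:
                print(f"send failed: {e}")  # logged; the loop keeps running
                ok = False
            if ok:
                print(f"delivered {item}")  # only delivered items propagate
            queue.task_done()

    async def main() -> None:
        queue: asyncio.Queue = asyncio.Queue()
        for i in range(5):
            queue.put_nowait(f"chunk-{i}")
        await drain(queue)

    asyncio.run(main())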