dv-pipecat-ai 0.0.85.dev698__py3-none-any.whl → 0.0.85.dev814__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Potentially problematic release: this version of dv-pipecat-ai might be problematic.

Files changed (45)
  1. {dv_pipecat_ai-0.0.85.dev698.dist-info → dv_pipecat_ai-0.0.85.dev814.dist-info}/METADATA +23 -18
  2. {dv_pipecat_ai-0.0.85.dev698.dist-info → dv_pipecat_ai-0.0.85.dev814.dist-info}/RECORD +45 -43
  3. pipecat/adapters/services/aws_nova_sonic_adapter.py +116 -6
  4. pipecat/pipeline/runner.py +6 -2
  5. pipecat/pipeline/task.py +40 -55
  6. pipecat/processors/aggregators/llm_context.py +40 -2
  7. pipecat/processors/frameworks/rtvi.py +1 -0
  8. pipecat/runner/daily.py +59 -20
  9. pipecat/runner/run.py +149 -67
  10. pipecat/runner/types.py +5 -5
  11. pipecat/services/assemblyai/models.py +6 -0
  12. pipecat/services/assemblyai/stt.py +13 -5
  13. pipecat/services/asyncai/tts.py +3 -0
  14. pipecat/services/aws/llm.py +33 -16
  15. pipecat/services/aws/nova_sonic/context.py +69 -0
  16. pipecat/services/aws/nova_sonic/llm.py +199 -89
  17. pipecat/services/aws/stt.py +2 -0
  18. pipecat/services/aws_nova_sonic/context.py +8 -12
  19. pipecat/services/cartesia/stt.py +77 -70
  20. pipecat/services/cartesia/tts.py +3 -1
  21. pipecat/services/deepgram/flux/stt.py +4 -0
  22. pipecat/services/elevenlabs/tts.py +82 -41
  23. pipecat/services/fish/tts.py +3 -0
  24. pipecat/services/google/stt.py +4 -0
  25. pipecat/services/lmnt/tts.py +2 -0
  26. pipecat/services/neuphonic/tts.py +3 -0
  27. pipecat/services/openai/tts.py +37 -6
  28. pipecat/services/piper/tts.py +7 -9
  29. pipecat/services/playht/tts.py +3 -0
  30. pipecat/services/rime/tts.py +9 -8
  31. pipecat/services/riva/stt.py +3 -1
  32. pipecat/services/salesforce/__init__.py +9 -0
  33. pipecat/services/salesforce/llm.py +465 -0
  34. pipecat/services/sarvam/tts.py +87 -10
  35. pipecat/services/speechmatics/stt.py +3 -1
  36. pipecat/services/stt_service.py +23 -10
  37. pipecat/services/tts_service.py +64 -13
  38. pipecat/transports/base_input.py +3 -0
  39. pipecat/transports/base_output.py +71 -77
  40. pipecat/transports/smallwebrtc/connection.py +5 -0
  41. pipecat/transports/smallwebrtc/request_handler.py +42 -0
  42. pipecat/utils/string.py +1 -0
  43. {dv_pipecat_ai-0.0.85.dev698.dist-info → dv_pipecat_ai-0.0.85.dev814.dist-info}/WHEEL +0 -0
  44. {dv_pipecat_ai-0.0.85.dev698.dist-info → dv_pipecat_ai-0.0.85.dev814.dist-info}/licenses/LICENSE +0 -0
  45. {dv_pipecat_ai-0.0.85.dev698.dist-info → dv_pipecat_ai-0.0.85.dev814.dist-info}/top_level.txt +0 -0
@@ -77,17 +77,29 @@ class SarvamHttpTTSService(TTSService):
 
     Example::
 
-        tts = SarvamTTSService(
+        tts = SarvamHttpTTSService(
             api_key="your-api-key",
             voice_id="anushka",
             model="bulbul:v2",
             aiohttp_session=session,
-            params=SarvamTTSService.InputParams(
+            params=SarvamHttpTTSService.InputParams(
                 language=Language.HI,
                 pitch=0.1,
                 pace=1.2
             )
         )
+
+        # For bulbul v3 beta with any speaker:
+        tts_v3 = SarvamHttpTTSService(
+            api_key="your-api-key",
+            voice_id="speaker_name",
+            model="bulbul:v3",
+            aiohttp_session=session,
+            params=SarvamHttpTTSService.InputParams(
+                language=Language.HI,
+                temperature=0.8
+            )
+        )
     """
 
     class InputParams(BaseModel):
@@ -106,6 +118,14 @@ class SarvamHttpTTSService(TTSService):
         pace: Optional[float] = Field(default=1.0, ge=0.3, le=3.0)
         loudness: Optional[float] = Field(default=1.0, ge=0.1, le=3.0)
         enable_preprocessing: Optional[bool] = False
+        temperature: Optional[float] = Field(
+            default=0.6,
+            ge=0.01,
+            le=1.0,
+            description="Controls the randomness of the output for bulbul v3 beta. "
+            "Lower values make the output more focused and deterministic, while "
+            "higher values make it more random. Range: 0.01 to 1.0. Default: 0.6.",
+        )
 
     def __init__(
         self,
@@ -125,7 +145,7 @@ class SarvamHttpTTSService(TTSService):
             api_key: Sarvam AI API subscription key.
             aiohttp_session: Shared aiohttp session for making requests.
             voice_id: Speaker voice ID (e.g., "anushka", "meera"). Defaults to "anushka".
-            model: TTS model to use ("bulbul:v1" or "bulbul:v2"). Defaults to "bulbul:v2".
+            model: TTS model to use ("bulbul:v2", "bulbul:v3-beta" or "bulbul:v3"). Defaults to "bulbul:v2".
             base_url: Sarvam AI API base URL. Defaults to "https://api.sarvam.ai".
             sample_rate: Audio sample rate in Hz (8000, 16000, 22050, 24000). If None, uses default.
             params: Additional voice and preprocessing parameters. If None, uses defaults.
@@ -139,16 +159,32 @@ class SarvamHttpTTSService(TTSService):
         self._base_url = base_url
         self._session = aiohttp_session
 
+        # Build base settings common to all models
         self._settings = {
             "language": (
                 self.language_to_service_language(params.language) if params.language else "en-IN"
             ),
-            "pitch": params.pitch,
-            "pace": params.pace,
-            "loudness": params.loudness,
             "enable_preprocessing": params.enable_preprocessing,
         }
 
+        # Add model-specific parameters
+        if model in ("bulbul:v3-beta", "bulbul:v3"):
+            self._settings.update(
+                {
+                    "temperature": getattr(params, "temperature", 0.6),
+                    "model": model,
+                }
+            )
+        else:
+            self._settings.update(
+                {
+                    "pitch": params.pitch,
+                    "pace": params.pace,
+                    "loudness": params.loudness,
+                    "model": model,
+                }
+            )
+
         self.set_model_name(model)
         self.set_voice(voice_id)
 
@@ -276,6 +312,18 @@ class SarvamTTSService(InterruptibleTTSService):
                 pace=1.2
             )
         )
+
+        # For bulbul v3 beta with any speaker and temperature:
+        # Note: pace and loudness are not supported for bulbul v3 and bulbul v3 beta
+        tts_v3 = SarvamTTSService(
+            api_key="your-api-key",
+            voice_id="speaker_name",
+            model="bulbul:v3",
+            params=SarvamTTSService.InputParams(
+                language=Language.HI,
+                temperature=0.8
+            )
+        )
     """
 
     class InputParams(BaseModel):
@@ -311,6 +359,14 @@ class SarvamTTSService(InterruptibleTTSService):
         output_audio_codec: Optional[str] = "linear16"
         output_audio_bitrate: Optional[str] = "128k"
         language: Optional[Language] = Language.EN
+        temperature: Optional[float] = Field(
+            default=0.6,
+            ge=0.01,
+            le=1.0,
+            description="Controls the randomness of the output for bulbul v3 beta. "
+            "Lower values make the output more focused and deterministic, while "
+            "higher values make it more random. Range: 0.01 to 1.0. Default: 0.6.",
+        )
 
     def __init__(
         self,
@@ -330,6 +386,7 @@ class SarvamTTSService(InterruptibleTTSService):
         Args:
             api_key: Sarvam API key for authenticating TTS requests.
             model: Identifier of the Sarvam speech model (default "bulbul:v2").
+                Supports "bulbul:v2", "bulbul:v3-beta" and "bulbul:v3".
             voice_id: Voice identifier for synthesis (default "anushka").
             url: WebSocket URL for connecting to the TTS backend (default production URL).
             aiohttp_session: Optional shared aiohttp session. To maintain backward compatibility.
@@ -372,15 +429,12 @@ class SarvamTTSService(InterruptibleTTSService):
         self._api_key = api_key
         self.set_model_name(model)
         self.set_voice(voice_id)
-        # Configuration parameters
+        # Build base settings common to all models
         self._settings = {
             "target_language_code": (
                 self.language_to_service_language(params.language) if params.language else "en-IN"
             ),
-            "pitch": params.pitch,
-            "pace": params.pace,
             "speaker": voice_id,
-            "loudness": params.loudness,
             "speech_sample_rate": 0,
             "enable_preprocessing": params.enable_preprocessing,
             "min_buffer_size": params.min_buffer_size,
@@ -388,6 +442,24 @@ class SarvamTTSService(InterruptibleTTSService):
             "output_audio_codec": params.output_audio_codec,
             "output_audio_bitrate": params.output_audio_bitrate,
         }
+
+        # Add model-specific parameters
+        if model in ("bulbul:v3-beta", "bulbul:v3"):
+            self._settings.update(
+                {
+                    "temperature": getattr(params, "temperature", 0.6),
+                    "model": model,
+                }
+            )
+        else:
+            self._settings.update(
+                {
+                    "pitch": params.pitch,
+                    "pace": params.pace,
+                    "loudness": params.loudness,
+                    "model": model,
+                }
+            )
         self._started = False
 
         self._receive_task = None
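For quick reference, a minimal, runnable sketch of the model-specific branch added in the two Sarvam hunks above. The key names and the 0.6 temperature default mirror the diff; SimpleNamespace and the literal values are placeholders used purely for illustration.

from types import SimpleNamespace


def model_specific_settings(model: str, params) -> dict:
    if model in ("bulbul:v3-beta", "bulbul:v3"):
        # v3 models: temperature replaces pitch/pace/loudness.
        return {"temperature": getattr(params, "temperature", 0.6), "model": model}
    return {
        "pitch": params.pitch,
        "pace": params.pace,
        "loudness": params.loudness,
        "model": model,
    }


v2_params = SimpleNamespace(pitch=0.1, pace=1.2, loudness=1.0)
v3_params = SimpleNamespace(temperature=0.8)
print(model_specific_settings("bulbul:v2", v2_params))
print(model_specific_settings("bulbul:v3", v3_params))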
@@ -526,6 +598,7 @@ class SarvamTTSService(InterruptibleTTSService):
             logger.debug("Connected to Sarvam TTS Websocket")
             await self._send_config()
 
+            await self._call_event_handler("on_connected")
         except Exception as e:
             logger.error(f"{self} initialization error: {e}")
             self._websocket = None
@@ -557,6 +630,10 @@ class SarvamTTSService(InterruptibleTTSService):
                 await self._websocket.close()
         except Exception as e:
             logger.error(f"{self} error closing websocket: {e}")
+        finally:
+            self._started = False
+            self._websocket = None
+            await self._call_event_handler("on_disconnected")
 
     def _get_websocket(self):
         if self._websocket:
@@ -577,6 +577,7 @@ class SpeechmaticsSTTService(STTService):
                 ),
             )
             logger.debug(f"{self} Connected to Speechmatics STT service")
+            await self._call_event_handler("on_connected")
         except Exception as e:
             logger.error(f"{self} Error connecting to Speechmatics: {e}")
             self._client = None
@@ -595,6 +596,7 @@ class SpeechmaticsSTTService(STTService):
             logger.error(f"{self} Error closing Speechmatics client: {e}")
         finally:
             self._client = None
+            await self._call_event_handler("on_disconnected")
 
     def _process_config(self) -> None:
         """Create a formatted STT transcription config.
@@ -618,7 +620,7 @@ class SpeechmaticsSTTService(STTService):
             transcription_config.additional_vocab = [
                 {
                     "content": e.content,
-                    "sounds_like": e.sounds_like,
+                    **({"sounds_like": e.sounds_like} if e.sounds_like else {}),
                 }
                 for e in self._params.additional_vocab
             ]
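The replacement line uses a small dict-unpacking idiom so that "sounds_like" is only included when it is set. A tiny self-contained illustration; the values are made up, only the pattern comes from the hunk above.

def vocab_entry(content: str, sounds_like=None) -> dict:
    # "sounds_like" is only added when it is non-empty, mirroring the hunk above.
    return {
        "content": content,
        **({"sounds_like": sounds_like} if sounds_like else {}),
    }


print(vocab_entry("Pipecat"))                # {'content': 'Pipecat'}
print(vocab_entry("Pipecat", ["pipe cat"]))  # adds 'sounds_like': ['pipe cat']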
@@ -36,6 +36,25 @@ class STTService(AIService):
     Provides common functionality for STT services including audio passthrough,
     muting, settings management, and audio processing. Subclasses must implement
     the run_stt method to provide actual speech recognition.
+
+    Event handlers:
+        on_connected: Called when connected to the STT service.
+        on_disconnected: Called when disconnected from the STT service.
+        on_connection_error: Called when an STT service connection error occurs.
+
+    Example::
+
+        @stt.event_handler("on_connected")
+        async def on_connected(stt: STTService):
+            logger.debug(f"STT connected")
+
+        @stt.event_handler("on_disconnected")
+        async def on_disconnected(stt: STTService):
+            logger.debug(f"STT disconnected")
+
+        @stt.event_handler("on_connection_error")
+        async def on_connection_error(stt: STTService, error: str):
+            logger.error(f"STT connection error: {error}")
     """
 
     def __init__(
@@ -66,6 +85,10 @@ class STTService(AIService):
         self._voicemail_detect: bool = False
         self._user_id: str = ""
 
+        self._register_event_handler("on_connected")
+        self._register_event_handler("on_disconnected")
+        self._register_event_handler("on_connection_error")
+
     @property
     def is_muted(self) -> bool:
         """Check if the STT service is currently muted.
@@ -307,15 +330,6 @@ class WebsocketSTTService(STTService, WebsocketService):
 
     Combines STT functionality with websocket connectivity, providing automatic
     error handling and reconnection capabilities.
-
-    Event handlers:
-        on_connection_error: Called when a websocket connection error occurs.
-
-    Example::
-
-        @stt.event_handler("on_connection_error")
-        async def on_connection_error(stt: STTService, error: str):
-            logger.error(f"STT connection error: {error}")
     """
 
     def __init__(self, *, reconnect_on_error: bool = True, **kwargs):
@@ -327,7 +341,6 @@ class WebsocketSTTService(STTService, WebsocketService):
         """
         STTService.__init__(self, **kwargs)
         WebsocketService.__init__(self, reconnect_on_error=reconnect_on_error, **kwargs)
-        self._register_event_handler("on_connection_error")
 
     async def _report_error(self, error: ErrorFrame):
         await self._call_event_handler("on_connection_error", error.error)
@@ -8,7 +8,17 @@
 
 import asyncio
 from abc import abstractmethod
-from typing import Any, AsyncGenerator, Callable, Dict, List, Mapping, Optional, Sequence, Tuple
+from typing import (
+    Any,
+    AsyncGenerator,
+    AsyncIterator,
+    Dict,
+    List,
+    Mapping,
+    Optional,
+    Sequence,
+    Tuple,
+)
 
 from loguru import logger
 
@@ -49,6 +59,25 @@ class TTSService(AIService):
     Provides common functionality for TTS services including text aggregation,
     filtering, audio generation, and frame management. Supports configurable
     sentence aggregation, silence insertion, and frame processing control.
+
+    Event handlers:
+        on_connected: Called when connected to the TTS service.
+        on_disconnected: Called when disconnected from the TTS service.
+        on_connection_error: Called when a TTS service connection error occurs.
+
+    Example::
+
+        @tts.event_handler("on_connected")
+        async def on_connected(tts: TTSService):
+            logger.debug(f"TTS connected")
+
+        @tts.event_handler("on_disconnected")
+        async def on_disconnected(tts: TTSService):
+            logger.debug(f"TTS disconnected")
+
+        @tts.event_handler("on_connection_error")
+        async def on_connection_error(tts: TTSService, error: str):
+            logger.error(f"TTS connection error: {error}")
     """
 
     def __init__(
@@ -124,7 +153,6 @@ class TTSService(AIService):
 
         self._tracing_enabled: bool = False
 
-
         if text_filter:
             import warnings
 
@@ -143,6 +171,10 @@ class TTSService(AIService):
 
         self._processing_text: bool = False
 
+        self._register_event_handler("on_connected")
+        self._register_event_handler("on_disconnected")
+        self._register_event_handler("on_connection_error")
+
     @property
     def sample_rate(self) -> int:
         """Get the current sample rate for audio output.
@@ -384,6 +416,36 @@ class TTSService(AIService):
         ):
             await self._stop_frame_queue.put(frame)
 
+    async def _stream_audio_frames_from_iterator(
+        self, iterator: AsyncIterator[bytes], *, strip_wav_header: bool
+    ) -> AsyncGenerator[Frame, None]:
+        buffer = bytearray()
+        need_to_strip_wav_header = strip_wav_header
+        async for chunk in iterator:
+            if need_to_strip_wav_header and chunk.startswith(b"RIFF"):
+                chunk = chunk[44:]
+                need_to_strip_wav_header = False
+
+            # Append to current buffer.
+            buffer.extend(chunk)
+
+            # Round to nearest even number.
+            aligned_length = len(buffer) & ~1  # 111111111...11110
+            if aligned_length > 0:
+                aligned_chunk = buffer[:aligned_length]
+                buffer = buffer[aligned_length:]  # keep any leftover byte
+
+                if len(aligned_chunk) > 0:
+                    frame = TTSAudioRawFrame(bytes(aligned_chunk), self.sample_rate, 1)
+                    yield frame
+
+        if len(buffer) > 0:
+            # Make sure we don't need an extra padding byte.
+            if len(buffer) % 2 == 1:
+                buffer.extend(b"\x00")
+            frame = TTSAudioRawFrame(bytes(buffer), self.sample_rate, 1)
+            yield frame
+
     async def _handle_interruption(self, frame: InterruptionFrame, direction: FrameDirection):
         self._processing_text = False
         await self._text_aggregator.handle_interruption()
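The new helper buffers streamed bytes so that every emitted audio frame has an even length (16-bit PCM samples are two bytes each), carrying any odd trailing byte over to the next chunk. A self-contained sketch of just that alignment idea, with made-up chunk sizes and without the WAV-header or Frame machinery:

import asyncio


async def fake_chunks():
    # Deliberately odd-sized chunks to show the single-byte carry-over.
    for chunk in (b"\x01\x02\x03", b"\x04\x05", b"\x06"):
        yield chunk


async def align_to_even(chunks):
    buffer = bytearray()
    async for chunk in chunks:
        buffer.extend(chunk)
        aligned = len(buffer) & ~1  # largest even prefix
        if aligned:
            yield bytes(buffer[:aligned])
            buffer = buffer[aligned:]
    if buffer:
        yield bytes(buffer + b"\x00")  # pad a trailing odd byte


async def main():
    async for piece in align_to_even(fake_chunks()):
        print(piece)  # b'\x01\x02', b'\x03\x04', b'\x05\x06'


asyncio.run(main())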
@@ -613,7 +675,6 @@ class WebsocketTTSService(TTSService, WebsocketService):
         """
         TTSService.__init__(self, **kwargs)
         WebsocketService.__init__(self, reconnect_on_error=reconnect_on_error, **kwargs)
-        self._register_event_handler("on_connection_error")
 
     async def _report_error(self, error: ErrorFrame):
         await self._call_event_handler("on_connection_error", error.error)
@@ -665,15 +726,6 @@ class WebsocketWordTTSService(WordTTSService, WebsocketService):
     """Base class for websocket-based TTS services that support word timestamps.
 
     Combines word timestamp functionality with websocket connectivity.
-
-    Event handlers:
-        on_connection_error: Called when a websocket connection error occurs.
-
-    Example::
-
-        @tts.event_handler("on_connection_error")
-        async def on_connection_error(tts: TTSService, error: str):
-            logger.error(f"TTS connection error: {error}")
     """
 
     def __init__(self, *, reconnect_on_error: bool = True, **kwargs):
@@ -685,7 +737,6 @@ class WebsocketWordTTSService(WordTTSService, WebsocketService):
         """
         WordTTSService.__init__(self, **kwargs)
         WebsocketService.__init__(self, reconnect_on_error=reconnect_on_error, **kwargs)
-        self._register_event_handler("on_connection_error")
 
     async def _report_error(self, error: ErrorFrame):
         await self._call_event_handler("on_connection_error", error.error)
@@ -232,6 +232,9 @@ class BaseInputTransport(FrameProcessor):
         """
         # Cancel and wait for the audio input task to finish.
         await self._cancel_audio_task()
+        # Stop audio filter.
+        if self._params.audio_in_filter:
+            await self._params.audio_in_filter.stop()
 
     async def set_transport_ready(self, frame: StartFrame):
         """Called when the transport is ready to stream.
@@ -294,15 +294,15 @@ class BaseOutputTransport(FrameProcessor):
         """
         await super().process_frame(frame, direction)
 
-        #
-        # System frames (like InterruptionFrame) are pushed immediately. Other
-        # frames require order so they are put in the sink queue.
-        #
         if isinstance(frame, StartFrame):
             # Push StartFrame before start(), because we want StartFrame to be
             # processed by every processor before any other frame is processed.
             await self.push_frame(frame, direction)
             await self.start(frame)
+        elif isinstance(frame, EndFrame):
+            await self.stop(frame)
+            # Keep pushing EndFrame down so all the pipeline stops nicely.
+            await self.push_frame(frame, direction)
         elif isinstance(frame, CancelFrame):
             await self.cancel(frame)
             await self.push_frame(frame, direction)
@@ -315,21 +315,6 @@ class BaseOutputTransport(FrameProcessor):
             await self.write_dtmf(frame)
         elif isinstance(frame, SystemFrame):
             await self.push_frame(frame, direction)
-        # Control frames.
-        elif isinstance(frame, EndFrame):
-            await self.stop(frame)
-            # Keep pushing EndFrame down so all the pipeline stops nicely.
-            await self.push_frame(frame, direction)
-        elif isinstance(frame, MixerControlFrame):
-            await self._handle_frame(frame)
-        # Other frames.
-        elif isinstance(frame, OutputAudioRawFrame):
-            await self._handle_frame(frame)
-        elif isinstance(frame, (OutputImageRawFrame, SpriteFrame)):
-            await self._handle_frame(frame)
-        # TODO(aleix): Images and audio should support presentation timestamps.
-        elif frame.pts:
-            await self._handle_frame(frame)
         elif direction == FrameDirection.UPSTREAM:
             await self.push_frame(frame, direction)
         else:
@@ -411,6 +396,13 @@ class BaseOutputTransport(FrameProcessor):
 
         # Indicates if the bot is currently speaking.
         self._bot_speaking = False
+        # Last time a BotSpeakingFrame was pushed.
+        self._bot_speaking_frame_time = 0
+        # How often a BotSpeakingFrame should be pushed (value should be
+        # lower than the audio chunk duration).
+        self._bot_speaking_frame_period = 0.2
+        # Last time the bot actually spoke.
+        self._bot_speech_last_time = 0
 
         self._audio_task: Optional[asyncio.Task] = None
         self._video_task: Optional[asyncio.Task] = None
@@ -602,39 +594,71 @@ class BaseOutputTransport(FrameProcessor):
 
     async def _bot_started_speaking(self):
         """Handle bot started speaking event."""
-        if not self._bot_speaking:
-            self._transport.logger.debug(
-                f"Bot{f' [{self._destination}]' if self._destination else ''} started speaking"
-            )
+        if self._bot_speaking:
+            return
 
-            downstream_frame = BotStartedSpeakingFrame()
-            downstream_frame.transport_destination = self._destination
-            upstream_frame = BotStartedSpeakingFrame()
-            upstream_frame.transport_destination = self._destination
-            await self._transport.push_frame(downstream_frame)
-            await self._transport.push_frame(upstream_frame, FrameDirection.UPSTREAM)
+        self._transport.logger.debug(
+            f"Bot{f' [{self._destination}]' if self._destination else ''} started speaking"
+        )
+
+        downstream_frame = BotStartedSpeakingFrame()
+        downstream_frame.transport_destination = self._destination
+        upstream_frame = BotStartedSpeakingFrame()
+        upstream_frame.transport_destination = self._destination
+        await self._transport.push_frame(downstream_frame)
+        await self._transport.push_frame(upstream_frame, FrameDirection.UPSTREAM)
 
-            self._bot_speaking = True
+        self._bot_speaking = True
 
     async def _bot_stopped_speaking(self):
         """Handle bot stopped speaking event."""
-        if self._bot_speaking:
-            self._transport.logger.debug(
-                f"Bot{f' [{self._destination}]' if self._destination else ''} stopped speaking"
-            )
+        if not self._bot_speaking:
+            return
 
-            downstream_frame = BotStoppedSpeakingFrame()
-            downstream_frame.transport_destination = self._destination
-            upstream_frame = BotStoppedSpeakingFrame()
-            upstream_frame.transport_destination = self._destination
-            await self._transport.push_frame(downstream_frame)
-            await self._transport.push_frame(upstream_frame, FrameDirection.UPSTREAM)
+        self._transport.logger.debug(
+            f"Bot{f' [{self._destination}]' if self._destination else ''} stopped speaking"
+        )
+
+        downstream_frame = BotStoppedSpeakingFrame()
+        downstream_frame.transport_destination = self._destination
+        upstream_frame = BotStoppedSpeakingFrame()
+        upstream_frame.transport_destination = self._destination
+        await self._transport.push_frame(downstream_frame)
+        await self._transport.push_frame(upstream_frame, FrameDirection.UPSTREAM)
 
-            self._bot_speaking = False
+        self._bot_speaking = False
+
+        # Clean audio buffer (there could be tiny left overs if not multiple
+        # to our output chunk size).
+        self._audio_buffer = bytearray()
 
-            # Clean audio buffer (there could be tiny left overs if not multiple
-            # to our output chunk size).
-            self._audio_buffer = bytearray()
+    async def _bot_currently_speaking(self):
+        """Handle bot speaking event."""
+        await self._bot_started_speaking()
+
+        diff_time = time.time() - self._bot_speaking_frame_time
+        if diff_time >= self._bot_speaking_frame_period:
+            await self._transport.push_frame(BotSpeakingFrame())
+            await self._transport.push_frame(BotSpeakingFrame(), FrameDirection.UPSTREAM)
+            self._bot_speaking_frame_time = time.time()
+
+        self._bot_speech_last_time = time.time()
+
+    async def _maybe_bot_currently_speaking(self, frame: SpeechOutputAudioRawFrame):
+        if not is_silence(frame.audio):
+            await self._bot_currently_speaking()
+        else:
+            silence_duration = time.time() - self._bot_speech_last_time
+            if silence_duration > BOT_VAD_STOP_SECS:
+                await self._bot_stopped_speaking()
+
+    async def _handle_bot_speech(self, frame: Frame):
+        # TTS case.
+        if isinstance(frame, TTSAudioRawFrame):
+            await self._bot_currently_speaking()
+        # Speech stream case.
+        elif isinstance(frame, SpeechOutputAudioRawFrame):
+            await self._maybe_bot_currently_speaking(frame)
 
     async def _handle_frame(self, frame: Frame):
         """Handle various frame types with appropriate processing.
@@ -642,7 +666,9 @@ class BaseOutputTransport(FrameProcessor):
         Args:
             frame: The frame to handle.
         """
-        if isinstance(frame, OutputImageRawFrame):
+        if isinstance(frame, OutputAudioRawFrame):
+            await self._handle_bot_speech(frame)
+        elif isinstance(frame, OutputImageRawFrame):
             await self._set_video_image(frame)
         elif isinstance(frame, SpriteFrame):
             await self._set_video_images(frame.images)
@@ -706,39 +732,7 @@ class BaseOutputTransport(FrameProcessor):
 
     async def _audio_task_handler(self):
         """Main audio processing task handler."""
-        # Push a BotSpeakingFrame every 200ms, we don't really need to push it
-        # at every audio chunk. If the audio chunk is bigger than 200ms, push at
-        # every audio chunk.
-        TOTAL_CHUNK_MS = self._params.audio_out_10ms_chunks * 10
-        BOT_SPEAKING_CHUNK_PERIOD = max(int(200 / TOTAL_CHUNK_MS), 1)
-        bot_speaking_counter = 0
-        speech_last_speaking_time = 0
-
         async for frame in self._next_frame():
-            # Notify the bot started speaking upstream if necessary and that
-            # it's actually speaking.
-            is_speaking = False
-            if isinstance(frame, TTSAudioRawFrame):
-                is_speaking = True
-            elif isinstance(frame, SpeechOutputAudioRawFrame):
-                if not is_silence(frame.audio):
-                    is_speaking = True
-                    speech_last_speaking_time = time.time()
-                else:
-                    silence_duration = time.time() - speech_last_speaking_time
-                    if silence_duration > BOT_VAD_STOP_SECS:
-                        await self._bot_stopped_speaking()
-
-            if is_speaking:
-                await self._bot_started_speaking()
-                if bot_speaking_counter % BOT_SPEAKING_CHUNK_PERIOD == 0:
-                    await self._transport.push_frame(BotSpeakingFrame())
-                    await self._transport.push_frame(
-                        BotSpeakingFrame(), FrameDirection.UPSTREAM
-                    )
-                    bot_speaking_counter = 0
-                bot_speaking_counter += 1
-
             # No need to push EndFrame, it's pushed from process_frame().
             if isinstance(frame, EndFrame):
                 break
@@ -689,3 +689,8 @@ class SmallWebRTCConnection(BaseObject):
             )()
             if track:
                 track.set_enabled(signalling_message.enabled)
+
+    async def add_ice_candidate(self, candidate):
+        """Handle incoming ICE candidates."""
+        logger.debug(f"Adding remote candidate: {candidate}")
+        await self.pc.addIceCandidate(candidate)