dv-pipecat-ai 0.0.85.dev698__py3-none-any.whl → 0.0.85.dev814__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Potentially problematic release: this version of dv-pipecat-ai might be problematic.

Files changed (45)
  1. {dv_pipecat_ai-0.0.85.dev698.dist-info → dv_pipecat_ai-0.0.85.dev814.dist-info}/METADATA +23 -18
  2. {dv_pipecat_ai-0.0.85.dev698.dist-info → dv_pipecat_ai-0.0.85.dev814.dist-info}/RECORD +45 -43
  3. pipecat/adapters/services/aws_nova_sonic_adapter.py +116 -6
  4. pipecat/pipeline/runner.py +6 -2
  5. pipecat/pipeline/task.py +40 -55
  6. pipecat/processors/aggregators/llm_context.py +40 -2
  7. pipecat/processors/frameworks/rtvi.py +1 -0
  8. pipecat/runner/daily.py +59 -20
  9. pipecat/runner/run.py +149 -67
  10. pipecat/runner/types.py +5 -5
  11. pipecat/services/assemblyai/models.py +6 -0
  12. pipecat/services/assemblyai/stt.py +13 -5
  13. pipecat/services/asyncai/tts.py +3 -0
  14. pipecat/services/aws/llm.py +33 -16
  15. pipecat/services/aws/nova_sonic/context.py +69 -0
  16. pipecat/services/aws/nova_sonic/llm.py +199 -89
  17. pipecat/services/aws/stt.py +2 -0
  18. pipecat/services/aws_nova_sonic/context.py +8 -12
  19. pipecat/services/cartesia/stt.py +77 -70
  20. pipecat/services/cartesia/tts.py +3 -1
  21. pipecat/services/deepgram/flux/stt.py +4 -0
  22. pipecat/services/elevenlabs/tts.py +82 -41
  23. pipecat/services/fish/tts.py +3 -0
  24. pipecat/services/google/stt.py +4 -0
  25. pipecat/services/lmnt/tts.py +2 -0
  26. pipecat/services/neuphonic/tts.py +3 -0
  27. pipecat/services/openai/tts.py +37 -6
  28. pipecat/services/piper/tts.py +7 -9
  29. pipecat/services/playht/tts.py +3 -0
  30. pipecat/services/rime/tts.py +9 -8
  31. pipecat/services/riva/stt.py +3 -1
  32. pipecat/services/salesforce/__init__.py +9 -0
  33. pipecat/services/salesforce/llm.py +465 -0
  34. pipecat/services/sarvam/tts.py +87 -10
  35. pipecat/services/speechmatics/stt.py +3 -1
  36. pipecat/services/stt_service.py +23 -10
  37. pipecat/services/tts_service.py +64 -13
  38. pipecat/transports/base_input.py +3 -0
  39. pipecat/transports/base_output.py +71 -77
  40. pipecat/transports/smallwebrtc/connection.py +5 -0
  41. pipecat/transports/smallwebrtc/request_handler.py +42 -0
  42. pipecat/utils/string.py +1 -0
  43. {dv_pipecat_ai-0.0.85.dev698.dist-info → dv_pipecat_ai-0.0.85.dev814.dist-info}/WHEEL +0 -0
  44. {dv_pipecat_ai-0.0.85.dev698.dist-info → dv_pipecat_ai-0.0.85.dev814.dist-info}/licenses/LICENSE +0 -0
  45. {dv_pipecat_ai-0.0.85.dev698.dist-info → dv_pipecat_ai-0.0.85.dev814.dist-info}/top_level.txt +0 -0
@@ -77,17 +77,29 @@ class SarvamHttpTTSService(TTSService):
 
     Example::
 
-        tts = SarvamTTSService(
+        tts = SarvamHttpTTSService(
             api_key="your-api-key",
             voice_id="anushka",
             model="bulbul:v2",
             aiohttp_session=session,
-            params=SarvamTTSService.InputParams(
+            params=SarvamHttpTTSService.InputParams(
                 language=Language.HI,
                 pitch=0.1,
                 pace=1.2
             )
         )
+
+        # For bulbul v3 beta with any speaker:
+        tts_v3 = SarvamHttpTTSService(
+            api_key="your-api-key",
+            voice_id="speaker_name",
+            model="bulbul:v3",
+            aiohttp_session=session,
+            params=SarvamHttpTTSService.InputParams(
+                language=Language.HI,
+                temperature=0.8
+            )
+        )
     """
 
     class InputParams(BaseModel):
@@ -106,6 +118,14 @@ class SarvamHttpTTSService(TTSService):
         pace: Optional[float] = Field(default=1.0, ge=0.3, le=3.0)
         loudness: Optional[float] = Field(default=1.0, ge=0.1, le=3.0)
         enable_preprocessing: Optional[bool] = False
+        temperature: Optional[float] = Field(
+            default=0.6,
+            ge=0.01,
+            le=1.0,
+            description="Controls the randomness of the output for bulbul v3 beta. "
+            "Lower values make the output more focused and deterministic, while "
+            "higher values make it more random. Range: 0.01 to 1.0. Default: 0.6.",
+        )
 
     def __init__(
         self,
@@ -125,7 +145,7 @@ class SarvamHttpTTSService(TTSService):
             api_key: Sarvam AI API subscription key.
             aiohttp_session: Shared aiohttp session for making requests.
             voice_id: Speaker voice ID (e.g., "anushka", "meera"). Defaults to "anushka".
-            model: TTS model to use ("bulbul:v1" or "bulbul:v2"). Defaults to "bulbul:v2".
+            model: TTS model to use ("bulbul:v2", "bulbul:v3-beta" or "bulbul:v3"). Defaults to "bulbul:v2".
             base_url: Sarvam AI API base URL. Defaults to "https://api.sarvam.ai".
             sample_rate: Audio sample rate in Hz (8000, 16000, 22050, 24000). If None, uses default.
             params: Additional voice and preprocessing parameters. If None, uses defaults.
@@ -139,16 +159,32 @@ class SarvamHttpTTSService(TTSService):
         self._base_url = base_url
         self._session = aiohttp_session
 
+        # Build base settings common to all models
         self._settings = {
             "language": (
                 self.language_to_service_language(params.language) if params.language else "en-IN"
             ),
-            "pitch": params.pitch,
-            "pace": params.pace,
-            "loudness": params.loudness,
             "enable_preprocessing": params.enable_preprocessing,
         }
 
+        # Add model-specific parameters
+        if model in ("bulbul:v3-beta", "bulbul:v3"):
+            self._settings.update(
+                {
+                    "temperature": getattr(params, "temperature", 0.6),
+                    "model": model,
+                }
+            )
+        else:
+            self._settings.update(
+                {
+                    "pitch": params.pitch,
+                    "pace": params.pace,
+                    "loudness": params.loudness,
+                    "model": model,
+                }
+            )
+
         self.set_model_name(model)
         self.set_voice(voice_id)
 
@@ -276,6 +312,18 @@ class SarvamTTSService(InterruptibleTTSService):
                 pace=1.2
             )
         )
+
+        # For bulbul v3 beta with any speaker and temperature:
+        # Note: pace and loudness are not supported for bulbul v3 and bulbul v3 beta
+        tts_v3 = SarvamTTSService(
+            api_key="your-api-key",
+            voice_id="speaker_name",
+            model="bulbul:v3",
+            params=SarvamTTSService.InputParams(
+                language=Language.HI,
+                temperature=0.8
+            )
+        )
     """
 
     class InputParams(BaseModel):
@@ -311,6 +359,14 @@ class SarvamTTSService(InterruptibleTTSService):
         output_audio_codec: Optional[str] = "linear16"
         output_audio_bitrate: Optional[str] = "128k"
         language: Optional[Language] = Language.EN
+        temperature: Optional[float] = Field(
+            default=0.6,
+            ge=0.01,
+            le=1.0,
+            description="Controls the randomness of the output for bulbul v3 beta. "
+            "Lower values make the output more focused and deterministic, while "
+            "higher values make it more random. Range: 0.01 to 1.0. Default: 0.6.",
+        )
 
     def __init__(
         self,
@@ -330,6 +386,7 @@ class SarvamTTSService(InterruptibleTTSService):
         Args:
             api_key: Sarvam API key for authenticating TTS requests.
             model: Identifier of the Sarvam speech model (default "bulbul:v2").
+                Supports "bulbul:v2", "bulbul:v3-beta" and "bulbul:v3".
             voice_id: Voice identifier for synthesis (default "anushka").
             url: WebSocket URL for connecting to the TTS backend (default production URL).
             aiohttp_session: Optional shared aiohttp session. To maintain backward compatibility.
@@ -372,15 +429,12 @@ class SarvamTTSService(InterruptibleTTSService):
         self._api_key = api_key
         self.set_model_name(model)
         self.set_voice(voice_id)
-        # Configuration parameters
+        # Build base settings common to all models
         self._settings = {
             "target_language_code": (
                 self.language_to_service_language(params.language) if params.language else "en-IN"
             ),
-            "pitch": params.pitch,
-            "pace": params.pace,
             "speaker": voice_id,
-            "loudness": params.loudness,
             "speech_sample_rate": 0,
             "enable_preprocessing": params.enable_preprocessing,
             "min_buffer_size": params.min_buffer_size,
@@ -388,6 +442,24 @@ class SarvamTTSService(InterruptibleTTSService):
             "output_audio_codec": params.output_audio_codec,
             "output_audio_bitrate": params.output_audio_bitrate,
         }
+
+        # Add model-specific parameters
+        if model in ("bulbul:v3-beta", "bulbul:v3"):
+            self._settings.update(
+                {
+                    "temperature": getattr(params, "temperature", 0.6),
+                    "model": model,
+                }
+            )
+        else:
+            self._settings.update(
+                {
+                    "pitch": params.pitch,
+                    "pace": params.pace,
+                    "loudness": params.loudness,
+                    "model": model,
+                }
+            )
         self._started = False
 
         self._receive_task = None
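For quick reference, a minimal, runnable sketch of the model-specific branch added in the two Sarvam hunks above. The key names and the 0.6 temperature default mirror the diff; SimpleNamespace and the literal values are placeholders used purely for illustration.

from types import SimpleNamespace


def model_specific_settings(model: str, params) -> dict:
    if model in ("bulbul:v3-beta", "bulbul:v3"):
        # v3 models: temperature replaces pitch/pace/loudness.
        return {"temperature": getattr(params, "temperature", 0.6), "model": model}
    return {
        "pitch": params.pitch,
        "pace": params.pace,
        "loudness": params.loudness,
        "model": model,
    }


v2_params = SimpleNamespace(pitch=0.1, pace=1.2, loudness=1.0)
v3_params = SimpleNamespace(temperature=0.8)
print(model_specific_settings("bulbul:v2", v2_params))
print(model_specific_settings("bulbul:v3", v3_params))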
@@ -526,6 +598,7 @@ class SarvamTTSService(InterruptibleTTSService):
             logger.debug("Connected to Sarvam TTS Websocket")
             await self._send_config()
 
+            await self._call_event_handler("on_connected")
         except Exception as e:
             logger.error(f"{self} initialization error: {e}")
             self._websocket = None
@@ -557,6 +630,10 @@ class SarvamTTSService(InterruptibleTTSService):
                 await self._websocket.close()
         except Exception as e:
             logger.error(f"{self} error closing websocket: {e}")
+        finally:
+            self._started = False
+            self._websocket = None
+            await self._call_event_handler("on_disconnected")
 
     def _get_websocket(self):
         if self._websocket:
@@ -577,6 +577,7 @@ class SpeechmaticsSTTService(STTService):
                 ),
             )
             logger.debug(f"{self} Connected to Speechmatics STT service")
+            await self._call_event_handler("on_connected")
         except Exception as e:
             logger.error(f"{self} Error connecting to Speechmatics: {e}")
             self._client = None
@@ -595,6 +596,7 @@ class SpeechmaticsSTTService(STTService):
             logger.error(f"{self} Error closing Speechmatics client: {e}")
         finally:
             self._client = None
+            await self._call_event_handler("on_disconnected")
 
     def _process_config(self) -> None:
         """Create a formatted STT transcription config.
@@ -618,7 +620,7 @@ class SpeechmaticsSTTService(STTService):
             transcription_config.additional_vocab = [
                 {
                     "content": e.content,
-                    "sounds_like": e.sounds_like,
+                    **({"sounds_like": e.sounds_like} if e.sounds_like else {}),
                 }
                 for e in self._params.additional_vocab
             ]
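The replacement line uses a small dict-unpacking idiom so that "sounds_like" is only included when it is set. A tiny self-contained illustration; the values are made up, only the pattern comes from the hunk above.

def vocab_entry(content: str, sounds_like=None) -> dict:
    # "sounds_like" is only added when it is non-empty, mirroring the hunk above.
    return {
        "content": content,
        **({"sounds_like": sounds_like} if sounds_like else {}),
    }


print(vocab_entry("Pipecat"))                # {'content': 'Pipecat'}
print(vocab_entry("Pipecat", ["pipe cat"]))  # adds 'sounds_like': ['pipe cat']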
@@ -36,6 +36,25 @@ class STTService(AIService):
     Provides common functionality for STT services including audio passthrough,
     muting, settings management, and audio processing. Subclasses must implement
     the run_stt method to provide actual speech recognition.
+
+    Event handlers:
+        on_connected: Called when connected to the STT service.
+        on_disconnected: Called when disconnected from the STT service.
+        on_connection_error: Called when an STT service connection error occurs.
+
+    Example::
+
+        @stt.event_handler("on_connected")
+        async def on_connected(stt: STTService):
+            logger.debug(f"STT connected")
+
+        @stt.event_handler("on_disconnected")
+        async def on_disconnected(stt: STTService):
+            logger.debug(f"STT disconnected")
+
+        @stt.event_handler("on_connection_error")
+        async def on_connection_error(stt: STTService, error: str):
+            logger.error(f"STT connection error: {error}")
     """
 
     def __init__(
@@ -66,6 +85,10 @@ class STTService(AIService):
         self._voicemail_detect: bool = False
         self._user_id: str = ""
 
+        self._register_event_handler("on_connected")
+        self._register_event_handler("on_disconnected")
+        self._register_event_handler("on_connection_error")
+
     @property
     def is_muted(self) -> bool:
         """Check if the STT service is currently muted.
@@ -307,15 +330,6 @@ class WebsocketSTTService(STTService, WebsocketService):
 
     Combines STT functionality with websocket connectivity, providing automatic
     error handling and reconnection capabilities.
-
-    Event handlers:
-        on_connection_error: Called when a websocket connection error occurs.
-
-    Example::
-
-        @stt.event_handler("on_connection_error")
-        async def on_connection_error(stt: STTService, error: str):
-            logger.error(f"STT connection error: {error}")
     """
 
     def __init__(self, *, reconnect_on_error: bool = True, **kwargs):
@@ -327,7 +341,6 @@ class WebsocketSTTService(STTService, WebsocketService):
         """
         STTService.__init__(self, **kwargs)
         WebsocketService.__init__(self, reconnect_on_error=reconnect_on_error, **kwargs)
-        self._register_event_handler("on_connection_error")
 
     async def _report_error(self, error: ErrorFrame):
         await self._call_event_handler("on_connection_error", error.error)
@@ -8,7 +8,17 @@
 
 import asyncio
 from abc import abstractmethod
-from typing import Any, AsyncGenerator, Callable, Dict, List, Mapping, Optional, Sequence, Tuple
+from typing import (
+    Any,
+    AsyncGenerator,
+    AsyncIterator,
+    Dict,
+    List,
+    Mapping,
+    Optional,
+    Sequence,
+    Tuple,
+)
 
 from loguru import logger
 
@@ -49,6 +59,25 @@ class TTSService(AIService):
     Provides common functionality for TTS services including text aggregation,
     filtering, audio generation, and frame management. Supports configurable
     sentence aggregation, silence insertion, and frame processing control.
+
+    Event handlers:
+        on_connected: Called when connected to the TTS service.
+        on_disconnected: Called when disconnected from the TTS service.
+        on_connection_error: Called when a TTS service connection error occurs.
+
+    Example::
+
+        @tts.event_handler("on_connected")
+        async def on_connected(tts: TTSService):
+            logger.debug(f"TTS connected")
+
+        @tts.event_handler("on_disconnected")
+        async def on_disconnected(tts: TTSService):
+            logger.debug(f"TTS disconnected")
+
+        @tts.event_handler("on_connection_error")
+        async def on_connection_error(tts: TTSService, error: str):
+            logger.error(f"TTS connection error: {error}")
     """
 
     def __init__(
@@ -124,7 +153,6 @@ class TTSService(AIService):
 
         self._tracing_enabled: bool = False
 
-
         if text_filter:
             import warnings
 
@@ -143,6 +171,10 @@ class TTSService(AIService):
 
         self._processing_text: bool = False
 
+        self._register_event_handler("on_connected")
+        self._register_event_handler("on_disconnected")
+        self._register_event_handler("on_connection_error")
+
     @property
     def sample_rate(self) -> int:
         """Get the current sample rate for audio output.
@@ -384,6 +416,36 @@ class TTSService(AIService):
         ):
             await self._stop_frame_queue.put(frame)
 
+    async def _stream_audio_frames_from_iterator(
+        self, iterator: AsyncIterator[bytes], *, strip_wav_header: bool
+    ) -> AsyncGenerator[Frame, None]:
+        buffer = bytearray()
+        need_to_strip_wav_header = strip_wav_header
+        async for chunk in iterator:
+            if need_to_strip_wav_header and chunk.startswith(b"RIFF"):
+                chunk = chunk[44:]
+                need_to_strip_wav_header = False
+
+            # Append to current buffer.
+            buffer.extend(chunk)
+
+            # Round to nearest even number.
+            aligned_length = len(buffer) & ~1  # 111111111...11110
+            if aligned_length > 0:
+                aligned_chunk = buffer[:aligned_length]
+                buffer = buffer[aligned_length:]  # keep any leftover byte
+
+                if len(aligned_chunk) > 0:
+                    frame = TTSAudioRawFrame(bytes(aligned_chunk), self.sample_rate, 1)
+                    yield frame
+
+        if len(buffer) > 0:
+            # Make sure we don't need an extra padding byte.
+            if len(buffer) % 2 == 1:
+                buffer.extend(b"\x00")
+            frame = TTSAudioRawFrame(bytes(buffer), self.sample_rate, 1)
+            yield frame
+
     async def _handle_interruption(self, frame: InterruptionFrame, direction: FrameDirection):
         self._processing_text = False
         await self._text_aggregator.handle_interruption()
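The new helper buffers streamed bytes so that every emitted audio frame has an even length (16-bit PCM samples are two bytes each), carrying any odd trailing byte over to the next chunk. A self-contained sketch of just that alignment idea, with made-up chunk sizes and without the WAV-header or Frame machinery:

import asyncio


async def fake_chunks():
    # Deliberately odd-sized chunks to show the single-byte carry-over.
    for chunk in (b"\x01\x02\x03", b"\x04\x05", b"\x06"):
        yield chunk


async def align_to_even(chunks):
    buffer = bytearray()
    async for chunk in chunks:
        buffer.extend(chunk)
        aligned = len(buffer) & ~1  # largest even prefix
        if aligned:
            yield bytes(buffer[:aligned])
            buffer = buffer[aligned:]
    if buffer:
        yield bytes(buffer + b"\x00")  # pad a trailing odd byte


async def main():
    async for piece in align_to_even(fake_chunks()):
        print(piece)  # b'\x01\x02', b'\x03\x04', b'\x05\x06'


asyncio.run(main())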
@@ -613,7 +675,6 @@ class WebsocketTTSService(TTSService, WebsocketService):
         """
         TTSService.__init__(self, **kwargs)
         WebsocketService.__init__(self, reconnect_on_error=reconnect_on_error, **kwargs)
-        self._register_event_handler("on_connection_error")
 
     async def _report_error(self, error: ErrorFrame):
         await self._call_event_handler("on_connection_error", error.error)
@@ -665,15 +726,6 @@ class WebsocketWordTTSService(WordTTSService, WebsocketService):
     """Base class for websocket-based TTS services that support word timestamps.
 
     Combines word timestamp functionality with websocket connectivity.
-
-    Event handlers:
-        on_connection_error: Called when a websocket connection error occurs.
-
-    Example::
-
-        @tts.event_handler("on_connection_error")
-        async def on_connection_error(tts: TTSService, error: str):
-            logger.error(f"TTS connection error: {error}")
     """
 
     def __init__(self, *, reconnect_on_error: bool = True, **kwargs):
@@ -685,7 +737,6 @@ class WebsocketWordTTSService(WordTTSService, WebsocketService):
         """
         WordTTSService.__init__(self, **kwargs)
         WebsocketService.__init__(self, reconnect_on_error=reconnect_on_error, **kwargs)
-        self._register_event_handler("on_connection_error")
 
     async def _report_error(self, error: ErrorFrame):
         await self._call_event_handler("on_connection_error", error.error)
@@ -232,6 +232,9 @@ class BaseInputTransport(FrameProcessor):
         """
         # Cancel and wait for the audio input task to finish.
         await self._cancel_audio_task()
+        # Stop audio filter.
+        if self._params.audio_in_filter:
+            await self._params.audio_in_filter.stop()
 
     async def set_transport_ready(self, frame: StartFrame):
         """Called when the transport is ready to stream.
@@ -294,15 +294,15 @@ class BaseOutputTransport(FrameProcessor):
         """
         await super().process_frame(frame, direction)
 
-        #
-        # System frames (like InterruptionFrame) are pushed immediately. Other
-        # frames require order so they are put in the sink queue.
-        #
         if isinstance(frame, StartFrame):
             # Push StartFrame before start(), because we want StartFrame to be
             # processed by every processor before any other frame is processed.
             await self.push_frame(frame, direction)
             await self.start(frame)
+        elif isinstance(frame, EndFrame):
+            await self.stop(frame)
+            # Keep pushing EndFrame down so all the pipeline stops nicely.
+            await self.push_frame(frame, direction)
         elif isinstance(frame, CancelFrame):
             await self.cancel(frame)
             await self.push_frame(frame, direction)
@@ -315,21 +315,6 @@ class BaseOutputTransport(FrameProcessor):
             await self.write_dtmf(frame)
         elif isinstance(frame, SystemFrame):
             await self.push_frame(frame, direction)
-        # Control frames.
-        elif isinstance(frame, EndFrame):
-            await self.stop(frame)
-            # Keep pushing EndFrame down so all the pipeline stops nicely.
-            await self.push_frame(frame, direction)
-        elif isinstance(frame, MixerControlFrame):
-            await self._handle_frame(frame)
-        # Other frames.
-        elif isinstance(frame, OutputAudioRawFrame):
-            await self._handle_frame(frame)
-        elif isinstance(frame, (OutputImageRawFrame, SpriteFrame)):
-            await self._handle_frame(frame)
-        # TODO(aleix): Images and audio should support presentation timestamps.
-        elif frame.pts:
-            await self._handle_frame(frame)
         elif direction == FrameDirection.UPSTREAM:
             await self.push_frame(frame, direction)
         else:
@@ -411,6 +396,13 @@ class BaseOutputTransport(FrameProcessor):
 
         # Indicates if the bot is currently speaking.
         self._bot_speaking = False
+        # Last time a BotSpeakingFrame was pushed.
+        self._bot_speaking_frame_time = 0
+        # How often a BotSpeakingFrame should be pushed (value should be
+        # lower than the audio chunk duration).
+        self._bot_speaking_frame_period = 0.2
+        # Last time the bot actually spoke.
+        self._bot_speech_last_time = 0
 
         self._audio_task: Optional[asyncio.Task] = None
         self._video_task: Optional[asyncio.Task] = None
@@ -602,39 +594,71 @@ class BaseOutputTransport(FrameProcessor):
 
     async def _bot_started_speaking(self):
         """Handle bot started speaking event."""
-        if not self._bot_speaking:
-            self._transport.logger.debug(
-                f"Bot{f' [{self._destination}]' if self._destination else ''} started speaking"
-            )
+        if self._bot_speaking:
+            return
 
-            downstream_frame = BotStartedSpeakingFrame()
-            downstream_frame.transport_destination = self._destination
-            upstream_frame = BotStartedSpeakingFrame()
-            upstream_frame.transport_destination = self._destination
-            await self._transport.push_frame(downstream_frame)
-            await self._transport.push_frame(upstream_frame, FrameDirection.UPSTREAM)
+        self._transport.logger.debug(
+            f"Bot{f' [{self._destination}]' if self._destination else ''} started speaking"
+        )
+
+        downstream_frame = BotStartedSpeakingFrame()
+        downstream_frame.transport_destination = self._destination
+        upstream_frame = BotStartedSpeakingFrame()
+        upstream_frame.transport_destination = self._destination
+        await self._transport.push_frame(downstream_frame)
+        await self._transport.push_frame(upstream_frame, FrameDirection.UPSTREAM)
 
-            self._bot_speaking = True
+        self._bot_speaking = True
 
     async def _bot_stopped_speaking(self):
         """Handle bot stopped speaking event."""
-        if self._bot_speaking:
-            self._transport.logger.debug(
-                f"Bot{f' [{self._destination}]' if self._destination else ''} stopped speaking"
-            )
+        if not self._bot_speaking:
+            return
 
-            downstream_frame = BotStoppedSpeakingFrame()
-            downstream_frame.transport_destination = self._destination
-            upstream_frame = BotStoppedSpeakingFrame()
-            upstream_frame.transport_destination = self._destination
-            await self._transport.push_frame(downstream_frame)
-            await self._transport.push_frame(upstream_frame, FrameDirection.UPSTREAM)
+        self._transport.logger.debug(
+            f"Bot{f' [{self._destination}]' if self._destination else ''} stopped speaking"
+        )
+
+        downstream_frame = BotStoppedSpeakingFrame()
+        downstream_frame.transport_destination = self._destination
+        upstream_frame = BotStoppedSpeakingFrame()
+        upstream_frame.transport_destination = self._destination
+        await self._transport.push_frame(downstream_frame)
+        await self._transport.push_frame(upstream_frame, FrameDirection.UPSTREAM)
 
-            self._bot_speaking = False
+        self._bot_speaking = False
+
+        # Clean audio buffer (there could be tiny left overs if not multiple
+        # to our output chunk size).
+        self._audio_buffer = bytearray()
 
-            # Clean audio buffer (there could be tiny left overs if not multiple
-            # to our output chunk size).
-            self._audio_buffer = bytearray()
+    async def _bot_currently_speaking(self):
+        """Handle bot speaking event."""
+        await self._bot_started_speaking()
+
+        diff_time = time.time() - self._bot_speaking_frame_time
+        if diff_time >= self._bot_speaking_frame_period:
+            await self._transport.push_frame(BotSpeakingFrame())
+            await self._transport.push_frame(BotSpeakingFrame(), FrameDirection.UPSTREAM)
+            self._bot_speaking_frame_time = time.time()
+
+        self._bot_speech_last_time = time.time()
+
+    async def _maybe_bot_currently_speaking(self, frame: SpeechOutputAudioRawFrame):
+        if not is_silence(frame.audio):
+            await self._bot_currently_speaking()
+        else:
+            silence_duration = time.time() - self._bot_speech_last_time
+            if silence_duration > BOT_VAD_STOP_SECS:
+                await self._bot_stopped_speaking()
+
+    async def _handle_bot_speech(self, frame: Frame):
+        # TTS case.
+        if isinstance(frame, TTSAudioRawFrame):
+            await self._bot_currently_speaking()
+        # Speech stream case.
+        elif isinstance(frame, SpeechOutputAudioRawFrame):
+            await self._maybe_bot_currently_speaking(frame)
 
     async def _handle_frame(self, frame: Frame):
         """Handle various frame types with appropriate processing.
@@ -642,7 +666,9 @@ class BaseOutputTransport(FrameProcessor):
         Args:
             frame: The frame to handle.
         """
-        if isinstance(frame, OutputImageRawFrame):
+        if isinstance(frame, OutputAudioRawFrame):
+            await self._handle_bot_speech(frame)
+        elif isinstance(frame, OutputImageRawFrame):
             await self._set_video_image(frame)
         elif isinstance(frame, SpriteFrame):
             await self._set_video_images(frame.images)
@@ -706,39 +732,7 @@ class BaseOutputTransport(FrameProcessor):
 
     async def _audio_task_handler(self):
         """Main audio processing task handler."""
-        # Push a BotSpeakingFrame every 200ms, we don't really need to push it
-        # at every audio chunk. If the audio chunk is bigger than 200ms, push at
-        # every audio chunk.
-        TOTAL_CHUNK_MS = self._params.audio_out_10ms_chunks * 10
-        BOT_SPEAKING_CHUNK_PERIOD = max(int(200 / TOTAL_CHUNK_MS), 1)
-        bot_speaking_counter = 0
-        speech_last_speaking_time = 0
-
         async for frame in self._next_frame():
-            # Notify the bot started speaking upstream if necessary and that
-            # it's actually speaking.
-            is_speaking = False
-            if isinstance(frame, TTSAudioRawFrame):
-                is_speaking = True
-            elif isinstance(frame, SpeechOutputAudioRawFrame):
-                if not is_silence(frame.audio):
-                    is_speaking = True
-                    speech_last_speaking_time = time.time()
-                else:
-                    silence_duration = time.time() - speech_last_speaking_time
-                    if silence_duration > BOT_VAD_STOP_SECS:
-                        await self._bot_stopped_speaking()
-
-            if is_speaking:
-                await self._bot_started_speaking()
-                if bot_speaking_counter % BOT_SPEAKING_CHUNK_PERIOD == 0:
-                    await self._transport.push_frame(BotSpeakingFrame())
-                    await self._transport.push_frame(
-                        BotSpeakingFrame(), FrameDirection.UPSTREAM
-                    )
-                    bot_speaking_counter = 0
-                bot_speaking_counter += 1
-
             # No need to push EndFrame, it's pushed from process_frame().
             if isinstance(frame, EndFrame):
                 break
@@ -689,3 +689,8 @@ class SmallWebRTCConnection(BaseObject):
             )()
             if track:
                 track.set_enabled(signalling_message.enabled)
+
+    async def add_ice_candidate(self, candidate):
+        """Handle incoming ICE candidates."""
+        logger.debug(f"Adding remote candidate: {candidate}")
+        await self.pc.addIceCandidate(candidate)