dv-pipecat-ai 0.0.82.dev857__py3-none-any.whl → 0.0.85.dev837__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Potentially problematic release: this version of dv-pipecat-ai might be problematic.

Files changed (195)
  1. {dv_pipecat_ai-0.0.82.dev857.dist-info → dv_pipecat_ai-0.0.85.dev837.dist-info}/METADATA +98 -130
  2. {dv_pipecat_ai-0.0.82.dev857.dist-info → dv_pipecat_ai-0.0.85.dev837.dist-info}/RECORD +192 -140
  3. pipecat/adapters/base_llm_adapter.py +38 -1
  4. pipecat/adapters/services/anthropic_adapter.py +9 -14
  5. pipecat/adapters/services/aws_nova_sonic_adapter.py +120 -5
  6. pipecat/adapters/services/bedrock_adapter.py +236 -13
  7. pipecat/adapters/services/gemini_adapter.py +12 -8
  8. pipecat/adapters/services/open_ai_adapter.py +19 -7
  9. pipecat/adapters/services/open_ai_realtime_adapter.py +5 -0
  10. pipecat/audio/dtmf/dtmf-0.wav +0 -0
  11. pipecat/audio/dtmf/dtmf-1.wav +0 -0
  12. pipecat/audio/dtmf/dtmf-2.wav +0 -0
  13. pipecat/audio/dtmf/dtmf-3.wav +0 -0
  14. pipecat/audio/dtmf/dtmf-4.wav +0 -0
  15. pipecat/audio/dtmf/dtmf-5.wav +0 -0
  16. pipecat/audio/dtmf/dtmf-6.wav +0 -0
  17. pipecat/audio/dtmf/dtmf-7.wav +0 -0
  18. pipecat/audio/dtmf/dtmf-8.wav +0 -0
  19. pipecat/audio/dtmf/dtmf-9.wav +0 -0
  20. pipecat/audio/dtmf/dtmf-pound.wav +0 -0
  21. pipecat/audio/dtmf/dtmf-star.wav +0 -0
  22. pipecat/audio/filters/krisp_viva_filter.py +193 -0
  23. pipecat/audio/filters/noisereduce_filter.py +15 -0
  24. pipecat/audio/turn/base_turn_analyzer.py +9 -1
  25. pipecat/audio/turn/smart_turn/base_smart_turn.py +14 -8
  26. pipecat/audio/turn/smart_turn/data/__init__.py +0 -0
  27. pipecat/audio/turn/smart_turn/data/smart-turn-v3.0.onnx +0 -0
  28. pipecat/audio/turn/smart_turn/http_smart_turn.py +6 -2
  29. pipecat/audio/turn/smart_turn/local_smart_turn.py +1 -1
  30. pipecat/audio/turn/smart_turn/local_smart_turn_v2.py +1 -1
  31. pipecat/audio/turn/smart_turn/local_smart_turn_v3.py +124 -0
  32. pipecat/audio/vad/data/README.md +10 -0
  33. pipecat/audio/vad/data/silero_vad_v2.onnx +0 -0
  34. pipecat/audio/vad/silero.py +9 -3
  35. pipecat/audio/vad/vad_analyzer.py +13 -1
  36. pipecat/extensions/voicemail/voicemail_detector.py +5 -5
  37. pipecat/frames/frames.py +277 -86
  38. pipecat/observers/loggers/debug_log_observer.py +3 -3
  39. pipecat/observers/loggers/llm_log_observer.py +7 -3
  40. pipecat/observers/loggers/user_bot_latency_log_observer.py +22 -10
  41. pipecat/pipeline/runner.py +18 -6
  42. pipecat/pipeline/service_switcher.py +64 -36
  43. pipecat/pipeline/task.py +125 -79
  44. pipecat/pipeline/tts_switcher.py +30 -0
  45. pipecat/processors/aggregators/dtmf_aggregator.py +2 -3
  46. pipecat/processors/aggregators/{gated_openai_llm_context.py → gated_llm_context.py} +9 -9
  47. pipecat/processors/aggregators/gated_open_ai_llm_context.py +12 -0
  48. pipecat/processors/aggregators/llm_context.py +40 -2
  49. pipecat/processors/aggregators/llm_response.py +32 -15
  50. pipecat/processors/aggregators/llm_response_universal.py +19 -15
  51. pipecat/processors/aggregators/user_response.py +6 -6
  52. pipecat/processors/aggregators/vision_image_frame.py +24 -2
  53. pipecat/processors/audio/audio_buffer_processor.py +43 -8
  54. pipecat/processors/dtmf_aggregator.py +174 -77
  55. pipecat/processors/filters/stt_mute_filter.py +17 -0
  56. pipecat/processors/frame_processor.py +110 -24
  57. pipecat/processors/frameworks/langchain.py +8 -2
  58. pipecat/processors/frameworks/rtvi.py +210 -68
  59. pipecat/processors/frameworks/strands_agents.py +170 -0
  60. pipecat/processors/logger.py +2 -2
  61. pipecat/processors/transcript_processor.py +26 -5
  62. pipecat/processors/user_idle_processor.py +35 -11
  63. pipecat/runner/daily.py +59 -20
  64. pipecat/runner/run.py +395 -93
  65. pipecat/runner/types.py +6 -4
  66. pipecat/runner/utils.py +51 -10
  67. pipecat/serializers/__init__.py +5 -1
  68. pipecat/serializers/asterisk.py +16 -2
  69. pipecat/serializers/convox.py +41 -4
  70. pipecat/serializers/custom.py +257 -0
  71. pipecat/serializers/exotel.py +5 -5
  72. pipecat/serializers/livekit.py +20 -0
  73. pipecat/serializers/plivo.py +5 -5
  74. pipecat/serializers/protobuf.py +6 -5
  75. pipecat/serializers/telnyx.py +2 -2
  76. pipecat/serializers/twilio.py +43 -23
  77. pipecat/serializers/vi.py +324 -0
  78. pipecat/services/ai_service.py +2 -6
  79. pipecat/services/anthropic/llm.py +2 -25
  80. pipecat/services/assemblyai/models.py +6 -0
  81. pipecat/services/assemblyai/stt.py +13 -5
  82. pipecat/services/asyncai/tts.py +5 -3
  83. pipecat/services/aws/__init__.py +1 -0
  84. pipecat/services/aws/llm.py +147 -105
  85. pipecat/services/aws/nova_sonic/__init__.py +0 -0
  86. pipecat/services/aws/nova_sonic/context.py +436 -0
  87. pipecat/services/aws/nova_sonic/frames.py +25 -0
  88. pipecat/services/aws/nova_sonic/llm.py +1265 -0
  89. pipecat/services/aws/stt.py +3 -3
  90. pipecat/services/aws_nova_sonic/__init__.py +19 -1
  91. pipecat/services/aws_nova_sonic/aws.py +11 -1151
  92. pipecat/services/aws_nova_sonic/context.py +8 -354
  93. pipecat/services/aws_nova_sonic/frames.py +13 -17
  94. pipecat/services/azure/llm.py +51 -1
  95. pipecat/services/azure/realtime/__init__.py +0 -0
  96. pipecat/services/azure/realtime/llm.py +65 -0
  97. pipecat/services/azure/stt.py +15 -0
  98. pipecat/services/cartesia/stt.py +77 -70
  99. pipecat/services/cartesia/tts.py +80 -13
  100. pipecat/services/deepgram/__init__.py +1 -0
  101. pipecat/services/deepgram/flux/__init__.py +0 -0
  102. pipecat/services/deepgram/flux/stt.py +640 -0
  103. pipecat/services/elevenlabs/__init__.py +4 -1
  104. pipecat/services/elevenlabs/stt.py +339 -0
  105. pipecat/services/elevenlabs/tts.py +87 -46
  106. pipecat/services/fish/tts.py +5 -2
  107. pipecat/services/gemini_multimodal_live/events.py +38 -524
  108. pipecat/services/gemini_multimodal_live/file_api.py +23 -173
  109. pipecat/services/gemini_multimodal_live/gemini.py +41 -1403
  110. pipecat/services/gladia/stt.py +56 -72
  111. pipecat/services/google/__init__.py +1 -0
  112. pipecat/services/google/gemini_live/__init__.py +3 -0
  113. pipecat/services/google/gemini_live/file_api.py +189 -0
  114. pipecat/services/google/gemini_live/llm.py +1582 -0
  115. pipecat/services/google/gemini_live/llm_vertex.py +184 -0
  116. pipecat/services/google/llm.py +15 -11
  117. pipecat/services/google/llm_openai.py +3 -3
  118. pipecat/services/google/llm_vertex.py +86 -16
  119. pipecat/services/google/stt.py +4 -0
  120. pipecat/services/google/tts.py +7 -3
  121. pipecat/services/heygen/api.py +2 -0
  122. pipecat/services/heygen/client.py +8 -4
  123. pipecat/services/heygen/video.py +2 -0
  124. pipecat/services/hume/__init__.py +5 -0
  125. pipecat/services/hume/tts.py +220 -0
  126. pipecat/services/inworld/tts.py +6 -6
  127. pipecat/services/llm_service.py +15 -5
  128. pipecat/services/lmnt/tts.py +4 -2
  129. pipecat/services/mcp_service.py +4 -2
  130. pipecat/services/mem0/memory.py +6 -5
  131. pipecat/services/mistral/llm.py +29 -8
  132. pipecat/services/moondream/vision.py +42 -16
  133. pipecat/services/neuphonic/tts.py +5 -2
  134. pipecat/services/openai/__init__.py +1 -0
  135. pipecat/services/openai/base_llm.py +27 -20
  136. pipecat/services/openai/realtime/__init__.py +0 -0
  137. pipecat/services/openai/realtime/context.py +272 -0
  138. pipecat/services/openai/realtime/events.py +1106 -0
  139. pipecat/services/openai/realtime/frames.py +37 -0
  140. pipecat/services/openai/realtime/llm.py +829 -0
  141. pipecat/services/openai/tts.py +49 -10
  142. pipecat/services/openai_realtime/__init__.py +27 -0
  143. pipecat/services/openai_realtime/azure.py +21 -0
  144. pipecat/services/openai_realtime/context.py +21 -0
  145. pipecat/services/openai_realtime/events.py +21 -0
  146. pipecat/services/openai_realtime/frames.py +21 -0
  147. pipecat/services/openai_realtime_beta/azure.py +16 -0
  148. pipecat/services/openai_realtime_beta/openai.py +17 -5
  149. pipecat/services/piper/tts.py +7 -9
  150. pipecat/services/playht/tts.py +34 -4
  151. pipecat/services/rime/tts.py +12 -12
  152. pipecat/services/riva/stt.py +3 -1
  153. pipecat/services/salesforce/__init__.py +9 -0
  154. pipecat/services/salesforce/llm.py +700 -0
  155. pipecat/services/sarvam/__init__.py +7 -0
  156. pipecat/services/sarvam/stt.py +540 -0
  157. pipecat/services/sarvam/tts.py +97 -13
  158. pipecat/services/simli/video.py +2 -2
  159. pipecat/services/speechmatics/stt.py +22 -10
  160. pipecat/services/stt_service.py +47 -0
  161. pipecat/services/tavus/video.py +2 -2
  162. pipecat/services/tts_service.py +75 -22
  163. pipecat/services/vision_service.py +7 -6
  164. pipecat/services/vistaar/llm.py +51 -9
  165. pipecat/tests/utils.py +4 -4
  166. pipecat/transcriptions/language.py +41 -1
  167. pipecat/transports/base_input.py +13 -34
  168. pipecat/transports/base_output.py +140 -104
  169. pipecat/transports/daily/transport.py +199 -26
  170. pipecat/transports/heygen/__init__.py +0 -0
  171. pipecat/transports/heygen/transport.py +381 -0
  172. pipecat/transports/livekit/transport.py +228 -63
  173. pipecat/transports/local/audio.py +6 -1
  174. pipecat/transports/local/tk.py +11 -2
  175. pipecat/transports/network/fastapi_websocket.py +1 -1
  176. pipecat/transports/smallwebrtc/connection.py +103 -19
  177. pipecat/transports/smallwebrtc/request_handler.py +246 -0
  178. pipecat/transports/smallwebrtc/transport.py +65 -23
  179. pipecat/transports/tavus/transport.py +23 -12
  180. pipecat/transports/websocket/client.py +41 -5
  181. pipecat/transports/websocket/fastapi.py +21 -11
  182. pipecat/transports/websocket/server.py +14 -7
  183. pipecat/transports/whatsapp/api.py +8 -0
  184. pipecat/transports/whatsapp/client.py +47 -0
  185. pipecat/utils/base_object.py +54 -22
  186. pipecat/utils/redis.py +58 -0
  187. pipecat/utils/string.py +13 -1
  188. pipecat/utils/tracing/service_decorators.py +21 -21
  189. pipecat/serializers/genesys.py +0 -95
  190. pipecat/services/google/test-google-chirp.py +0 -45
  191. pipecat/services/openai.py +0 -698
  192. {dv_pipecat_ai-0.0.82.dev857.dist-info → dv_pipecat_ai-0.0.85.dev837.dist-info}/WHEEL +0 -0
  193. {dv_pipecat_ai-0.0.82.dev857.dist-info → dv_pipecat_ai-0.0.85.dev837.dist-info}/licenses/LICENSE +0 -0
  194. {dv_pipecat_ai-0.0.82.dev857.dist-info → dv_pipecat_ai-0.0.85.dev837.dist-info}/top_level.txt +0 -0
  195. /pipecat/services/{aws_nova_sonic → aws/nova_sonic}/ready.wav +0 -0
@@ -20,9 +20,9 @@ from pipecat.frames.frames import (
     EndFrame,
     ErrorFrame,
     Frame,
+    InterruptionFrame,
     LLMFullResponseEndFrame,
     StartFrame,
-    StartInterruptionFrame,
     TTSAudioRawFrame,
     TTSStartedFrame,
     TTSStoppedFrame,
@@ -76,17 +76,29 @@ class SarvamHttpTTSService(TTSService):
 
     Example::
 
-        tts = SarvamTTSService(
+        tts = SarvamHttpTTSService(
             api_key="your-api-key",
             voice_id="anushka",
             model="bulbul:v2",
             aiohttp_session=session,
-            params=SarvamTTSService.InputParams(
+            params=SarvamHttpTTSService.InputParams(
                 language=Language.HI,
                 pitch=0.1,
                 pace=1.2
             )
         )
+
+        # For bulbul v3 beta with any speaker:
+        tts_v3 = SarvamHttpTTSService(
+            api_key="your-api-key",
+            voice_id="speaker_name",
+            model="bulbul:v3",
+            aiohttp_session=session,
+            params=SarvamHttpTTSService.InputParams(
+                language=Language.HI,
+                temperature=0.8
+            )
+        )
     """
 
     class InputParams(BaseModel):
@@ -105,6 +117,14 @@ class SarvamHttpTTSService(TTSService):
         pace: Optional[float] = Field(default=1.0, ge=0.3, le=3.0)
         loudness: Optional[float] = Field(default=1.0, ge=0.1, le=3.0)
         enable_preprocessing: Optional[bool] = False
+        temperature: Optional[float] = Field(
+            default=0.6,
+            ge=0.01,
+            le=1.0,
+            description="Controls the randomness of the output for bulbul v3 beta. "
+            "Lower values make the output more focused and deterministic, while "
+            "higher values make it more random. Range: 0.01 to 1.0. Default: 0.6.",
+        )
 
     def __init__(
         self,
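
Since `temperature` is declared as a pydantic `Field` with `ge=0.01` and `le=1.0`, out-of-range values are rejected when `InputParams` is constructed rather than when a request is sent. A minimal sketch of what callers can expect (assuming only that `InputParams` remains a pydantic `BaseModel`):

    from pydantic import ValidationError

    params = SarvamHttpTTSService.InputParams(temperature=0.8)  # accepted

    try:
        SarvamHttpTTSService.InputParams(temperature=1.5)  # above le=1.0
    except ValidationError:
        print("temperature must be between 0.01 and 1.0")
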
@@ -124,7 +144,7 @@ class SarvamHttpTTSService(TTSService):
             api_key: Sarvam AI API subscription key.
             aiohttp_session: Shared aiohttp session for making requests.
             voice_id: Speaker voice ID (e.g., "anushka", "meera"). Defaults to "anushka".
-            model: TTS model to use ("bulbul:v1" or "bulbul:v2"). Defaults to "bulbul:v2".
+            model: TTS model to use ("bulbul:v2", "bulbul:v3-beta", or "bulbul:v3"). Defaults to "bulbul:v2".
             base_url: Sarvam AI API base URL. Defaults to "https://api.sarvam.ai".
             sample_rate: Audio sample rate in Hz (8000, 16000, 22050, 24000). If None, uses default.
             params: Additional voice and preprocessing parameters. If None, uses defaults.
@@ -138,16 +158,32 @@ class SarvamHttpTTSService(TTSService):
         self._base_url = base_url
         self._session = aiohttp_session
 
+        # Build base settings common to all models
         self._settings = {
             "language": (
                 self.language_to_service_language(params.language) if params.language else "en-IN"
             ),
-            "pitch": params.pitch,
-            "pace": params.pace,
-            "loudness": params.loudness,
             "enable_preprocessing": params.enable_preprocessing,
         }
 
+        # Add model-specific parameters
+        if model in ("bulbul:v3-beta", "bulbul:v3"):
+            self._settings.update(
+                {
+                    "temperature": getattr(params, "temperature", 0.6),
+                    "model": model,
+                }
+            )
+        else:
+            self._settings.update(
+                {
+                    "pitch": params.pitch,
+                    "pace": params.pace,
+                    "loudness": params.loudness,
+                    "model": model,
+                }
+            )
+
         self.set_model_name(model)
         self.set_voice(voice_id)
 
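
Isolated for clarity, the model-specific branch above reduces to the following sketch (not the service code itself): v3-family models carry `temperature`, while older models keep the prosody controls.

    def build_model_settings(model: str, params) -> dict:
        # v3 models accept temperature; pitch/pace/loudness apply to older models.
        if model in ("bulbul:v3-beta", "bulbul:v3"):
            return {"temperature": params.temperature, "model": model}
        return {
            "pitch": params.pitch,
            "pace": params.pace,
            "loudness": params.loudness,
            "model": model,
        }
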
@@ -275,6 +311,18 @@ class SarvamTTSService(InterruptibleTTSService):
                 pace=1.2
             )
         )
+
+        # For bulbul v3 beta with any speaker and temperature:
+        # Note: pace and loudness are not supported for bulbul v3 and bulbul v3 beta
+        tts_v3 = SarvamTTSService(
+            api_key="your-api-key",
+            voice_id="speaker_name",
+            model="bulbul:v3",
+            params=SarvamTTSService.InputParams(
+                language=Language.HI,
+                temperature=0.8
+            )
+        )
     """
 
     class InputParams(BaseModel):
@@ -310,6 +358,14 @@ class SarvamTTSService(InterruptibleTTSService):
         output_audio_codec: Optional[str] = "linear16"
         output_audio_bitrate: Optional[str] = "128k"
         language: Optional[Language] = Language.EN
+        temperature: Optional[float] = Field(
+            default=0.6,
+            ge=0.01,
+            le=1.0,
+            description="Controls the randomness of the output for bulbul v3 beta. "
+            "Lower values make the output more focused and deterministic, while "
+            "higher values make it more random. Range: 0.01 to 1.0. Default: 0.6.",
+        )
 
     def __init__(
         self,
@@ -329,6 +385,7 @@ class SarvamTTSService(InterruptibleTTSService):
         Args:
             api_key: Sarvam API key for authenticating TTS requests.
             model: Identifier of the Sarvam speech model (default "bulbul:v2").
+                Supports "bulbul:v2", "bulbul:v3-beta", and "bulbul:v3".
             voice_id: Voice identifier for synthesis (default "anushka").
             url: WebSocket URL for connecting to the TTS backend (default production URL).
             aiohttp_session: Optional shared aiohttp session. To maintain backward compatibility.
@@ -371,15 +428,12 @@ class SarvamTTSService(InterruptibleTTSService):
         self._api_key = api_key
         self.set_model_name(model)
         self.set_voice(voice_id)
-        # Configuration parameters
+        # Build base settings common to all models
         self._settings = {
             "target_language_code": (
                 self.language_to_service_language(params.language) if params.language else "en-IN"
             ),
-            "pitch": params.pitch,
-            "pace": params.pace,
             "speaker": voice_id,
-            "loudness": params.loudness,
             "speech_sample_rate": 0,
             "enable_preprocessing": params.enable_preprocessing,
             "min_buffer_size": params.min_buffer_size,
@@ -387,6 +441,24 @@ class SarvamTTSService(InterruptibleTTSService):
             "output_audio_codec": params.output_audio_codec,
             "output_audio_bitrate": params.output_audio_bitrate,
         }
+
+        # Add model-specific parameters
+        if model in ("bulbul:v3-beta", "bulbul:v3"):
+            self._settings.update(
+                {
+                    "temperature": getattr(params, "temperature", 0.6),
+                    "model": model,
+                }
+            )
+        else:
+            self._settings.update(
+                {
+                    "pitch": params.pitch,
+                    "pace": params.pace,
+                    "loudness": params.loudness,
+                    "model": model,
+                }
+            )
         self._started = False
 
         self._receive_task = None
@@ -455,7 +527,7 @@ class SarvamTTSService(InterruptibleTTSService):
             direction: The direction to push the frame.
         """
         await super().push_frame(frame, direction)
-        if isinstance(frame, (TTSStoppedFrame, StartInterruptionFrame)):
+        if isinstance(frame, (TTSStoppedFrame, InterruptionFrame)):
            self._started = False
 
    async def process_frame(self, frame: Frame, direction: FrameDirection):
@@ -525,6 +597,7 @@ class SarvamTTSService(InterruptibleTTSService):
             logger.debug("Connected to Sarvam TTS Websocket")
             await self._send_config()
 
+            await self._call_event_handler("on_connected")
         except Exception as e:
             logger.error(f"{self} initialization error: {e}")
             self._websocket = None
@@ -556,6 +629,10 @@ class SarvamTTSService(InterruptibleTTSService):
                 await self._websocket.close()
         except Exception as e:
             logger.error(f"{self} error closing websocket: {e}")
+        finally:
+            self._started = False
+            self._websocket = None
+            await self._call_event_handler("on_disconnected")
 
     def _get_websocket(self):
         if self._websocket:
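
With `on_connected` and `on_disconnected` now fired around the websocket lifecycle, callers can observe connection state using the same decorator pattern the service docstrings in this diff show; a minimal sketch:

    tts = SarvamTTSService(api_key="your-api-key", voice_id="anushka")

    @tts.event_handler("on_connected")
    async def handle_connected(tts):
        logger.debug("Sarvam TTS websocket connected")

    @tts.event_handler("on_disconnected")
    async def handle_disconnected(tts):
        logger.debug("Sarvam TTS websocket disconnected")
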
@@ -605,8 +682,15 @@ class SarvamTTSService(InterruptibleTTSService):
             logger.warning("Service is disconnecting, ignoring text send")
             return
 
+        # Validate text input
+        if not text or not isinstance(text, str) or not text.strip():
+            logger.warning(f"Invalid text input for Sarvam TTS: {repr(text)}")
+            return
+
         if self._websocket and self._websocket.state == State.OPEN:
-            msg = {"type": "text", "data": {"text": text}}
+            msg = {"type": "text", "data": {"text": text.strip()}}
+            logger.info(f"Sarvam TTS: Sending text message: {repr(text.strip())}")
+            logger.debug(f"Sarvam TTS: Full message payload: {msg}")
             await self._websocket.send(json.dumps(msg))
         else:
             logger.warning("WebSocket not ready, cannot send text")
@@ -15,8 +15,8 @@ from pipecat.frames.frames import (
     CancelFrame,
     EndFrame,
     Frame,
+    InterruptionFrame,
     OutputImageRawFrame,
-    StartInterruptionFrame,
     TTSAudioRawFrame,
     TTSStoppedFrame,
     UserStartedSpeakingFrame,
@@ -179,7 +179,7 @@ class SimliVideoService(FrameProcessor):
             return
         elif isinstance(frame, (EndFrame, CancelFrame)):
             await self._stop()
-        elif isinstance(frame, (StartInterruptionFrame, UserStartedSpeakingFrame)):
+        elif isinstance(frame, (InterruptionFrame, UserStartedSpeakingFrame)):
             if not self._previously_interrupted:
                 await self._simli_client.clearBuffer()
             self._previously_interrupted = self._is_trinity_avatar
@@ -19,7 +19,6 @@ from loguru import logger
 from pydantic import BaseModel
 
 from pipecat.frames.frames import (
-    BotInterruptionFrame,
     CancelFrame,
     EndFrame,
     ErrorFrame,
@@ -551,7 +550,7 @@ class SpeechmaticsSTTService(STTService):
 
         @self._client.on(ServerMessageType.END_OF_UTTERANCE)
         def _evt_on_end_of_utterance(message: dict[str, Any]):
-            logger.debug("End of utterance received from STT")
+            self.logger.debug("End of utterance received from STT")
             asyncio.run_coroutine_threadsafe(
                 self._handle_end_of_utterance(), self.get_event_loop()
             )
@@ -578,6 +577,7 @@ class SpeechmaticsSTTService(STTService):
                 ),
             )
             logger.debug(f"{self} Connected to Speechmatics STT service")
+            await self._call_event_handler("on_connected")
         except Exception as e:
             logger.error(f"{self} Error connecting to Speechmatics: {e}")
             self._client = None
@@ -596,6 +596,7 @@ class SpeechmaticsSTTService(STTService):
             logger.error(f"{self} Error closing Speechmatics client: {e}")
         finally:
             self._client = None
+            await self._call_event_handler("on_disconnected")
 
     def _process_config(self) -> None:
         """Create a formatted STT transcription config.
@@ -619,7 +620,7 @@ class SpeechmaticsSTTService(STTService):
             transcription_config.additional_vocab = [
                 {
                     "content": e.content,
-                    "sounds_like": e.sounds_like,
+                    **({"sounds_like": e.sounds_like} if e.sounds_like else {}),
                 }
                 for e in self._params.additional_vocab
             ]
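
The replacement uses Python's conditional dict-unpacking idiom, so `sounds_like` is omitted from the entry entirely when it is unset instead of being sent as an explicit null; for example:

    def vocab_entry(content, sounds_like=None):
        # "sounds_like" appears only when a non-empty value is provided.
        return {"content": content, **({"sounds_like": sounds_like} if sounds_like else {})}

    vocab_entry("Pipecat")                # -> {'content': 'Pipecat'}
    vocab_entry("Pipecat", ["pipe cat"])  # -> {'content': 'Pipecat', 'sounds_like': ['pipe cat']}
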
@@ -749,14 +750,13 @@ class SpeechmaticsSTTService(STTService):
             return
 
         # Frames to send
-        upstream_frames: list[Frame] = []
         downstream_frames: list[Frame] = []
 
         # If VAD is enabled, then send a speaking frame
         if self._params.enable_vad and not self._is_speaking:
             logger.debug("User started speaking")
             self._is_speaking = True
-            upstream_frames += [BotInterruptionFrame()]
+            await self.push_interruption_task_frame_and_wait()
             downstream_frames += [UserStartedSpeakingFrame()]
 
         # If final, then re-parse into TranscriptionFrame
@@ -775,7 +775,7 @@ class SpeechmaticsSTTService(STTService):
             ]
 
             # Log transcript(s)
-            logger.debug(f"Finalized transcript: {[f.text for f in downstream_frames]}")
+            self.logger.debug(f"Finalized transcript: {[f.text for f in downstream_frames]}")
 
         # Return as interim results (unformatted)
         else:
@@ -794,10 +794,6 @@ class SpeechmaticsSTTService(STTService):
             self._is_speaking = False
             downstream_frames += [UserStoppedSpeakingFrame()]
 
-        # Send UPSTREAM frames
-        for frame in upstream_frames:
-            await self.push_frame(frame, FrameDirection.UPSTREAM)
-
         # Send the DOWNSTREAM frames
         for frame in downstream_frames:
             await self.push_frame(frame, FrameDirection.DOWNSTREAM)
@@ -996,6 +992,22 @@ def _language_to_speechmatics_language(language: Language) -> str:
     # List of supported input languages
     BASE_LANGUAGES = {
         Language.AR: "ar",
+        Language.AR_AE: "ar",
+        Language.AR_BH: "ar",
+        Language.AR_DZ: "ar",
+        Language.AR_EG: "ar",
+        Language.AR_IQ: "ar",
+        Language.AR_JO: "ar",
+        Language.AR_KW: "ar",
+        Language.AR_LB: "ar",
+        Language.AR_LY: "ar",
+        Language.AR_MA: "ar",
+        Language.AR_OM: "ar",
+        Language.AR_QA: "ar",
+        Language.AR_SA: "ar",
+        Language.AR_SY: "ar",
+        Language.AR_TN: "ar",
+        Language.AR_YE: "ar",
         Language.BA: "ba",
         Language.EU: "eu",
         Language.BE: "be",
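
The net effect is that every regional Arabic variant now resolves to Speechmatics' single `ar` code; roughly (a sketch, assuming the helper consults this table directly):

    for lang in (Language.AR, Language.AR_EG, Language.AR_SA):
        assert _language_to_speechmatics_language(lang) == "ar"
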
@@ -16,6 +16,7 @@ from loguru import logger
 from pipecat.frames.frames import (
     AudioRawFrame,
     BotStoppedSpeakingFrame,
+    ErrorFrame,
     Frame,
     StartFrame,
     STTMuteFrame,
@@ -25,6 +26,7 @@ from pipecat.frames.frames import (
 )
 from pipecat.processors.frame_processor import FrameDirection
 from pipecat.services.ai_service import AIService
+from pipecat.services.websocket_service import WebsocketService
 from pipecat.transcriptions.language import Language
 
 
@@ -34,6 +36,25 @@ class STTService(AIService):
     Provides common functionality for STT services including audio passthrough,
     muting, settings management, and audio processing. Subclasses must implement
     the run_stt method to provide actual speech recognition.
+
+    Event handlers:
+        on_connected: Called when connected to the STT service.
+        on_disconnected: Called when disconnected from the STT service.
+        on_connection_error: Called when a connection error occurs with the STT service.
+
+    Example::
+
+        @stt.event_handler("on_connected")
+        async def on_connected(stt: STTService):
+            logger.debug("STT connected")
+
+        @stt.event_handler("on_disconnected")
+        async def on_disconnected(stt: STTService):
+            logger.debug("STT disconnected")
+
+        @stt.event_handler("on_connection_error")
+        async def on_connection_error(stt: STTService, error: str):
+            logger.error(f"STT connection error: {error}")
     """
 
     def __init__(
@@ -64,6 +85,10 @@ class STTService(AIService):
         self._voicemail_detect: bool = False
         self._user_id: str = ""
 
+        self._register_event_handler("on_connected")
+        self._register_event_handler("on_disconnected")
+        self._register_event_handler("on_connection_error")
+
     @property
     def is_muted(self) -> bool:
         """Check if the STT service is currently muted.
@@ -298,3 +323,25 @@ class SegmentedSTTService(STTService):
         if not self._user_speaking and len(self._audio_buffer) > self._audio_buffer_size_1s:
             discarded = len(self._audio_buffer) - self._audio_buffer_size_1s
             self._audio_buffer = self._audio_buffer[discarded:]
+
+
+class WebsocketSTTService(STTService, WebsocketService):
+    """Base class for websocket-based STT services.
+
+    Combines STT functionality with websocket connectivity, providing automatic
+    error handling and reconnection capabilities.
+    """
+
+    def __init__(self, *, reconnect_on_error: bool = True, **kwargs):
+        """Initialize the Websocket STT service.
+
+        Args:
+            reconnect_on_error: Whether to automatically reconnect on websocket errors.
+            **kwargs: Additional arguments passed to parent classes.
+        """
+        STTService.__init__(self, **kwargs)
+        WebsocketService.__init__(self, reconnect_on_error=reconnect_on_error, **kwargs)
+
+    async def _report_error(self, error: ErrorFrame):
+        await self._call_event_handler("on_connection_error", error.error)
+        await self.push_error(error)
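
A concrete subclass would combine the two bases the way the websocket TTS variants in this diff already do; a hypothetical sketch (only `run_stt` is required by `STTService`, everything else here is illustrative):

    class MyWebsocketSTTService(WebsocketSTTService):
        def __init__(self, *, api_key: str, **kwargs):
            super().__init__(**kwargs)
            self._api_key = api_key

        async def run_stt(self, audio: bytes):
            # Stream audio over the websocket and yield transcription frames.
            # Connection failures routed through _report_error() fire the
            # "on_connection_error" handler and push an ErrorFrame downstream.
            ...
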
@@ -23,12 +23,12 @@ from pipecat.frames.frames import (
     CancelFrame,
     EndFrame,
     Frame,
+    InterruptionFrame,
     OutputAudioRawFrame,
     OutputImageRawFrame,
     OutputTransportReadyFrame,
     SpeechOutputAudioRawFrame,
     StartFrame,
-    StartInterruptionFrame,
     TTSAudioRawFrame,
     TTSStartedFrame,
 )
@@ -222,7 +222,7 @@ class TavusVideoService(AIService):
         """
         await super().process_frame(frame, direction)
 
-        if isinstance(frame, StartInterruptionFrame):
+        if isinstance(frame, InterruptionFrame):
             await self._handle_interruptions()
             await self.push_frame(frame, direction)
         elif isinstance(frame, TTSAudioRawFrame):
@@ -8,7 +8,18 @@
 
 import asyncio
 from abc import abstractmethod
-from typing import Any, AsyncGenerator, Callable, Dict, List, Mapping, Optional, Sequence, Tuple
+from typing import (
+    Any,
+    AsyncGenerator,
+    AsyncIterator,
+    Callable,
+    Dict,
+    List,
+    Mapping,
+    Optional,
+    Sequence,
+    Tuple,
+)
 
 from loguru import logger
 
@@ -20,10 +31,10 @@ from pipecat.frames.frames import (
     ErrorFrame,
     Frame,
     InterimTranscriptionFrame,
+    InterruptionFrame,
     LLMFullResponseEndFrame,
     LLMFullResponseStartFrame,
     StartFrame,
-    StartInterruptionFrame,
     TextFrame,
     TranscriptionFrame,
     TTSAudioRawFrame,
@@ -49,6 +60,25 @@ class TTSService(AIService):
     Provides common functionality for TTS services including text aggregation,
     filtering, audio generation, and frame management. Supports configurable
     sentence aggregation, silence insertion, and frame processing control.
+
+    Event handlers:
+        on_connected: Called when connected to the TTS service.
+        on_disconnected: Called when disconnected from the TTS service.
+        on_connection_error: Called when a connection error occurs with the TTS service.
+
+    Example::
+
+        @tts.event_handler("on_connected")
+        async def on_connected(tts: TTSService):
+            logger.debug("TTS connected")
+
+        @tts.event_handler("on_disconnected")
+        async def on_disconnected(tts: TTSService):
+            logger.debug("TTS disconnected")
+
+        @tts.event_handler("on_connection_error")
+        async def on_connection_error(tts: TTSService, error: str):
+            logger.error(f"TTS connection error: {error}")
     """
 
     def __init__(
@@ -98,6 +128,7 @@ class TTSService(AIService):
 
                 .. deprecated:: 0.0.59
                     Use `text_filters` instead, which allows multiple filters.
+            text_formatter: Optional callable receiving text and language code, returning formatted text.
 
             transport_destination: Destination for generated audio frames.
             **kwargs: Additional arguments passed to the parent AIService.
@@ -124,7 +155,6 @@ class TTSService(AIService):
 
         self._tracing_enabled: bool = False
 
-
         if text_filter:
             import warnings
 
@@ -143,6 +173,10 @@ class TTSService(AIService):
 
         self._processing_text: bool = False
 
+        self._register_event_handler("on_connected")
+        self._register_event_handler("on_disconnected")
+        self._register_event_handler("on_connection_error")
+
     @property
     def sample_rate(self) -> int:
         """Get the current sample rate for audio output.
@@ -319,7 +353,7 @@ class TTSService(AIService):
             and not isinstance(frame, TranscriptionFrame)
         ):
             await self._process_text_frame(frame)
-        elif isinstance(frame, StartInterruptionFrame):
+        elif isinstance(frame, InterruptionFrame):
             await self._handle_interruption(frame, direction)
             await self.push_frame(frame, direction)
         elif isinstance(frame, (LLMFullResponseEndFrame, EndFrame)):
@@ -377,14 +411,44 @@ class TTSService(AIService):
         await super().push_frame(frame, direction)
 
         if self._push_stop_frames and (
-            isinstance(frame, StartInterruptionFrame)
+            isinstance(frame, InterruptionFrame)
             or isinstance(frame, TTSStartedFrame)
             or isinstance(frame, TTSAudioRawFrame)
            or isinstance(frame, TTSStoppedFrame)
         ):
             await self._stop_frame_queue.put(frame)
 
-    async def _handle_interruption(self, frame: StartInterruptionFrame, direction: FrameDirection):
+    async def _stream_audio_frames_from_iterator(
+        self, iterator: AsyncIterator[bytes], *, strip_wav_header: bool
+    ) -> AsyncGenerator[Frame, None]:
+        buffer = bytearray()
+        need_to_strip_wav_header = strip_wav_header
+        async for chunk in iterator:
+            if need_to_strip_wav_header and chunk.startswith(b"RIFF"):
+                chunk = chunk[44:]
+                need_to_strip_wav_header = False
+
+            # Append to current buffer.
+            buffer.extend(chunk)
+
+            # Round to nearest even number.
+            aligned_length = len(buffer) & ~1  # 111111111...11110
+            if aligned_length > 0:
+                aligned_chunk = buffer[:aligned_length]
+                buffer = buffer[aligned_length:]  # keep any leftover byte
+
+                if len(aligned_chunk) > 0:
+                    frame = TTSAudioRawFrame(bytes(aligned_chunk), self.sample_rate, 1)
+                    yield frame
+
+        if len(buffer) > 0:
+            # Make sure we don't need an extra padding byte.
+            if len(buffer) % 2 == 1:
+                buffer.extend(b"\x00")
+            frame = TTSAudioRawFrame(bytes(buffer), self.sample_rate, 1)
+            yield frame
+
+    async def _handle_interruption(self, frame: InterruptionFrame, direction: FrameDirection):
         self._processing_text = False
         await self._text_aggregator.handle_interruption()
         for filter in self._text_filters:
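
The `len(buffer) & ~1` mask clears the lowest bit, rounding the buffer length down to the nearest even number so every emitted chunk holds whole 16-bit PCM samples; any leftover byte waits for the next chunk, or is zero-padded at the very end. A quick illustration:

    for n in (5, 6, 7, 8):
        print(n, n & ~1)  # 5 -> 4, 6 -> 6, 7 -> 6, 8 -> 8
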
@@ -465,7 +529,7 @@ class TTSService(AIService):
                 )
                 if isinstance(frame, TTSStartedFrame):
                     has_started = True
-                elif isinstance(frame, (TTSStoppedFrame, StartInterruptionFrame)):
+                elif isinstance(frame, (TTSStoppedFrame, InterruptionFrame)):
                     has_started = False
             except asyncio.TimeoutError:
                 if has_started:
@@ -550,7 +614,7 @@ class WordTTSService(TTSService):
         elif isinstance(frame, (LLMFullResponseEndFrame, EndFrame)):
             await self.flush_audio()
 
-    async def _handle_interruption(self, frame: StartInterruptionFrame, direction: FrameDirection):
+    async def _handle_interruption(self, frame: InterruptionFrame, direction: FrameDirection):
         await super()._handle_interruption(frame, direction)
         self._llm_response_started = False
         self.reset_word_timestamps()
@@ -613,7 +677,6 @@ class WebsocketTTSService(TTSService, WebsocketService):
         """
         TTSService.__init__(self, **kwargs)
         WebsocketService.__init__(self, reconnect_on_error=reconnect_on_error, **kwargs)
-        self._register_event_handler("on_connection_error")
 
     async def _report_error(self, error: ErrorFrame):
         await self._call_event_handler("on_connection_error", error.error)
@@ -640,7 +703,7 @@ class InterruptibleTTSService(WebsocketTTSService):
         # user interrupts we need to reconnect.
         self._bot_speaking = False
 
-    async def _handle_interruption(self, frame: StartInterruptionFrame, direction: FrameDirection):
+    async def _handle_interruption(self, frame: InterruptionFrame, direction: FrameDirection):
         await super()._handle_interruption(frame, direction)
         if self._bot_speaking:
             await self._disconnect()
@@ -665,15 +728,6 @@ class WebsocketWordTTSService(WordTTSService, WebsocketService):
     """Base class for websocket-based TTS services that support word timestamps.
 
     Combines word timestamp functionality with websocket connectivity.
-
-    Event handlers:
-        on_connection_error: Called when a websocket connection error occurs.
-
-    Example::
-
-        @tts.event_handler("on_connection_error")
-        async def on_connection_error(tts: TTSService, error: str):
-            logger.error(f"TTS connection error: {error}")
     """
 
     def __init__(self, *, reconnect_on_error: bool = True, **kwargs):
@@ -685,7 +739,6 @@ class WebsocketWordTTSService(WordTTSService, WebsocketService):
         """
         WordTTSService.__init__(self, **kwargs)
         WebsocketService.__init__(self, reconnect_on_error=reconnect_on_error, **kwargs)
-        self._register_event_handler("on_connection_error")
 
     async def _report_error(self, error: ErrorFrame):
         await self._call_event_handler("on_connection_error", error.error)
@@ -712,7 +765,7 @@ class InterruptibleWordTTSService(WebsocketWordTTSService):
         # user interrupts we need to reconnect.
         self._bot_speaking = False
 
-    async def _handle_interruption(self, frame: StartInterruptionFrame, direction: FrameDirection):
+    async def _handle_interruption(self, frame: InterruptionFrame, direction: FrameDirection):
         await super()._handle_interruption(frame, direction)
         if self._bot_speaking:
             await self._disconnect()
@@ -840,7 +893,7 @@ class AudioContextWordTTSService(WebsocketWordTTSService):
         await super().cancel(frame)
         await self._stop_audio_context_task()
 
-    async def _handle_interruption(self, frame: StartInterruptionFrame, direction: FrameDirection):
+    async def _handle_interruption(self, frame: InterruptionFrame, direction: FrameDirection):
         await super()._handle_interruption(frame, direction)
         await self._stop_audio_context_task()
         self._create_audio_context_task()