dv-pipecat-ai 0.0.82.dev857__py3-none-any.whl → 0.0.85.dev837__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Potentially problematic release.
Files changed (195)
  1. {dv_pipecat_ai-0.0.82.dev857.dist-info → dv_pipecat_ai-0.0.85.dev837.dist-info}/METADATA +98 -130
  2. {dv_pipecat_ai-0.0.82.dev857.dist-info → dv_pipecat_ai-0.0.85.dev837.dist-info}/RECORD +192 -140
  3. pipecat/adapters/base_llm_adapter.py +38 -1
  4. pipecat/adapters/services/anthropic_adapter.py +9 -14
  5. pipecat/adapters/services/aws_nova_sonic_adapter.py +120 -5
  6. pipecat/adapters/services/bedrock_adapter.py +236 -13
  7. pipecat/adapters/services/gemini_adapter.py +12 -8
  8. pipecat/adapters/services/open_ai_adapter.py +19 -7
  9. pipecat/adapters/services/open_ai_realtime_adapter.py +5 -0
  10. pipecat/audio/dtmf/dtmf-0.wav +0 -0
  11. pipecat/audio/dtmf/dtmf-1.wav +0 -0
  12. pipecat/audio/dtmf/dtmf-2.wav +0 -0
  13. pipecat/audio/dtmf/dtmf-3.wav +0 -0
  14. pipecat/audio/dtmf/dtmf-4.wav +0 -0
  15. pipecat/audio/dtmf/dtmf-5.wav +0 -0
  16. pipecat/audio/dtmf/dtmf-6.wav +0 -0
  17. pipecat/audio/dtmf/dtmf-7.wav +0 -0
  18. pipecat/audio/dtmf/dtmf-8.wav +0 -0
  19. pipecat/audio/dtmf/dtmf-9.wav +0 -0
  20. pipecat/audio/dtmf/dtmf-pound.wav +0 -0
  21. pipecat/audio/dtmf/dtmf-star.wav +0 -0
  22. pipecat/audio/filters/krisp_viva_filter.py +193 -0
  23. pipecat/audio/filters/noisereduce_filter.py +15 -0
  24. pipecat/audio/turn/base_turn_analyzer.py +9 -1
  25. pipecat/audio/turn/smart_turn/base_smart_turn.py +14 -8
  26. pipecat/audio/turn/smart_turn/data/__init__.py +0 -0
  27. pipecat/audio/turn/smart_turn/data/smart-turn-v3.0.onnx +0 -0
  28. pipecat/audio/turn/smart_turn/http_smart_turn.py +6 -2
  29. pipecat/audio/turn/smart_turn/local_smart_turn.py +1 -1
  30. pipecat/audio/turn/smart_turn/local_smart_turn_v2.py +1 -1
  31. pipecat/audio/turn/smart_turn/local_smart_turn_v3.py +124 -0
  32. pipecat/audio/vad/data/README.md +10 -0
  33. pipecat/audio/vad/data/silero_vad_v2.onnx +0 -0
  34. pipecat/audio/vad/silero.py +9 -3
  35. pipecat/audio/vad/vad_analyzer.py +13 -1
  36. pipecat/extensions/voicemail/voicemail_detector.py +5 -5
  37. pipecat/frames/frames.py +277 -86
  38. pipecat/observers/loggers/debug_log_observer.py +3 -3
  39. pipecat/observers/loggers/llm_log_observer.py +7 -3
  40. pipecat/observers/loggers/user_bot_latency_log_observer.py +22 -10
  41. pipecat/pipeline/runner.py +18 -6
  42. pipecat/pipeline/service_switcher.py +64 -36
  43. pipecat/pipeline/task.py +125 -79
  44. pipecat/pipeline/tts_switcher.py +30 -0
  45. pipecat/processors/aggregators/dtmf_aggregator.py +2 -3
  46. pipecat/processors/aggregators/{gated_openai_llm_context.py → gated_llm_context.py} +9 -9
  47. pipecat/processors/aggregators/gated_open_ai_llm_context.py +12 -0
  48. pipecat/processors/aggregators/llm_context.py +40 -2
  49. pipecat/processors/aggregators/llm_response.py +32 -15
  50. pipecat/processors/aggregators/llm_response_universal.py +19 -15
  51. pipecat/processors/aggregators/user_response.py +6 -6
  52. pipecat/processors/aggregators/vision_image_frame.py +24 -2
  53. pipecat/processors/audio/audio_buffer_processor.py +43 -8
  54. pipecat/processors/dtmf_aggregator.py +174 -77
  55. pipecat/processors/filters/stt_mute_filter.py +17 -0
  56. pipecat/processors/frame_processor.py +110 -24
  57. pipecat/processors/frameworks/langchain.py +8 -2
  58. pipecat/processors/frameworks/rtvi.py +210 -68
  59. pipecat/processors/frameworks/strands_agents.py +170 -0
  60. pipecat/processors/logger.py +2 -2
  61. pipecat/processors/transcript_processor.py +26 -5
  62. pipecat/processors/user_idle_processor.py +35 -11
  63. pipecat/runner/daily.py +59 -20
  64. pipecat/runner/run.py +395 -93
  65. pipecat/runner/types.py +6 -4
  66. pipecat/runner/utils.py +51 -10
  67. pipecat/serializers/__init__.py +5 -1
  68. pipecat/serializers/asterisk.py +16 -2
  69. pipecat/serializers/convox.py +41 -4
  70. pipecat/serializers/custom.py +257 -0
  71. pipecat/serializers/exotel.py +5 -5
  72. pipecat/serializers/livekit.py +20 -0
  73. pipecat/serializers/plivo.py +5 -5
  74. pipecat/serializers/protobuf.py +6 -5
  75. pipecat/serializers/telnyx.py +2 -2
  76. pipecat/serializers/twilio.py +43 -23
  77. pipecat/serializers/vi.py +324 -0
  78. pipecat/services/ai_service.py +2 -6
  79. pipecat/services/anthropic/llm.py +2 -25
  80. pipecat/services/assemblyai/models.py +6 -0
  81. pipecat/services/assemblyai/stt.py +13 -5
  82. pipecat/services/asyncai/tts.py +5 -3
  83. pipecat/services/aws/__init__.py +1 -0
  84. pipecat/services/aws/llm.py +147 -105
  85. pipecat/services/aws/nova_sonic/__init__.py +0 -0
  86. pipecat/services/aws/nova_sonic/context.py +436 -0
  87. pipecat/services/aws/nova_sonic/frames.py +25 -0
  88. pipecat/services/aws/nova_sonic/llm.py +1265 -0
  89. pipecat/services/aws/stt.py +3 -3
  90. pipecat/services/aws_nova_sonic/__init__.py +19 -1
  91. pipecat/services/aws_nova_sonic/aws.py +11 -1151
  92. pipecat/services/aws_nova_sonic/context.py +8 -354
  93. pipecat/services/aws_nova_sonic/frames.py +13 -17
  94. pipecat/services/azure/llm.py +51 -1
  95. pipecat/services/azure/realtime/__init__.py +0 -0
  96. pipecat/services/azure/realtime/llm.py +65 -0
  97. pipecat/services/azure/stt.py +15 -0
  98. pipecat/services/cartesia/stt.py +77 -70
  99. pipecat/services/cartesia/tts.py +80 -13
  100. pipecat/services/deepgram/__init__.py +1 -0
  101. pipecat/services/deepgram/flux/__init__.py +0 -0
  102. pipecat/services/deepgram/flux/stt.py +640 -0
  103. pipecat/services/elevenlabs/__init__.py +4 -1
  104. pipecat/services/elevenlabs/stt.py +339 -0
  105. pipecat/services/elevenlabs/tts.py +87 -46
  106. pipecat/services/fish/tts.py +5 -2
  107. pipecat/services/gemini_multimodal_live/events.py +38 -524
  108. pipecat/services/gemini_multimodal_live/file_api.py +23 -173
  109. pipecat/services/gemini_multimodal_live/gemini.py +41 -1403
  110. pipecat/services/gladia/stt.py +56 -72
  111. pipecat/services/google/__init__.py +1 -0
  112. pipecat/services/google/gemini_live/__init__.py +3 -0
  113. pipecat/services/google/gemini_live/file_api.py +189 -0
  114. pipecat/services/google/gemini_live/llm.py +1582 -0
  115. pipecat/services/google/gemini_live/llm_vertex.py +184 -0
  116. pipecat/services/google/llm.py +15 -11
  117. pipecat/services/google/llm_openai.py +3 -3
  118. pipecat/services/google/llm_vertex.py +86 -16
  119. pipecat/services/google/stt.py +4 -0
  120. pipecat/services/google/tts.py +7 -3
  121. pipecat/services/heygen/api.py +2 -0
  122. pipecat/services/heygen/client.py +8 -4
  123. pipecat/services/heygen/video.py +2 -0
  124. pipecat/services/hume/__init__.py +5 -0
  125. pipecat/services/hume/tts.py +220 -0
  126. pipecat/services/inworld/tts.py +6 -6
  127. pipecat/services/llm_service.py +15 -5
  128. pipecat/services/lmnt/tts.py +4 -2
  129. pipecat/services/mcp_service.py +4 -2
  130. pipecat/services/mem0/memory.py +6 -5
  131. pipecat/services/mistral/llm.py +29 -8
  132. pipecat/services/moondream/vision.py +42 -16
  133. pipecat/services/neuphonic/tts.py +5 -2
  134. pipecat/services/openai/__init__.py +1 -0
  135. pipecat/services/openai/base_llm.py +27 -20
  136. pipecat/services/openai/realtime/__init__.py +0 -0
  137. pipecat/services/openai/realtime/context.py +272 -0
  138. pipecat/services/openai/realtime/events.py +1106 -0
  139. pipecat/services/openai/realtime/frames.py +37 -0
  140. pipecat/services/openai/realtime/llm.py +829 -0
  141. pipecat/services/openai/tts.py +49 -10
  142. pipecat/services/openai_realtime/__init__.py +27 -0
  143. pipecat/services/openai_realtime/azure.py +21 -0
  144. pipecat/services/openai_realtime/context.py +21 -0
  145. pipecat/services/openai_realtime/events.py +21 -0
  146. pipecat/services/openai_realtime/frames.py +21 -0
  147. pipecat/services/openai_realtime_beta/azure.py +16 -0
  148. pipecat/services/openai_realtime_beta/openai.py +17 -5
  149. pipecat/services/piper/tts.py +7 -9
  150. pipecat/services/playht/tts.py +34 -4
  151. pipecat/services/rime/tts.py +12 -12
  152. pipecat/services/riva/stt.py +3 -1
  153. pipecat/services/salesforce/__init__.py +9 -0
  154. pipecat/services/salesforce/llm.py +700 -0
  155. pipecat/services/sarvam/__init__.py +7 -0
  156. pipecat/services/sarvam/stt.py +540 -0
  157. pipecat/services/sarvam/tts.py +97 -13
  158. pipecat/services/simli/video.py +2 -2
  159. pipecat/services/speechmatics/stt.py +22 -10
  160. pipecat/services/stt_service.py +47 -0
  161. pipecat/services/tavus/video.py +2 -2
  162. pipecat/services/tts_service.py +75 -22
  163. pipecat/services/vision_service.py +7 -6
  164. pipecat/services/vistaar/llm.py +51 -9
  165. pipecat/tests/utils.py +4 -4
  166. pipecat/transcriptions/language.py +41 -1
  167. pipecat/transports/base_input.py +13 -34
  168. pipecat/transports/base_output.py +140 -104
  169. pipecat/transports/daily/transport.py +199 -26
  170. pipecat/transports/heygen/__init__.py +0 -0
  171. pipecat/transports/heygen/transport.py +381 -0
  172. pipecat/transports/livekit/transport.py +228 -63
  173. pipecat/transports/local/audio.py +6 -1
  174. pipecat/transports/local/tk.py +11 -2
  175. pipecat/transports/network/fastapi_websocket.py +1 -1
  176. pipecat/transports/smallwebrtc/connection.py +103 -19
  177. pipecat/transports/smallwebrtc/request_handler.py +246 -0
  178. pipecat/transports/smallwebrtc/transport.py +65 -23
  179. pipecat/transports/tavus/transport.py +23 -12
  180. pipecat/transports/websocket/client.py +41 -5
  181. pipecat/transports/websocket/fastapi.py +21 -11
  182. pipecat/transports/websocket/server.py +14 -7
  183. pipecat/transports/whatsapp/api.py +8 -0
  184. pipecat/transports/whatsapp/client.py +47 -0
  185. pipecat/utils/base_object.py +54 -22
  186. pipecat/utils/redis.py +58 -0
  187. pipecat/utils/string.py +13 -1
  188. pipecat/utils/tracing/service_decorators.py +21 -21
  189. pipecat/serializers/genesys.py +0 -95
  190. pipecat/services/google/test-google-chirp.py +0 -45
  191. pipecat/services/openai.py +0 -698
  192. {dv_pipecat_ai-0.0.82.dev857.dist-info → dv_pipecat_ai-0.0.85.dev837.dist-info}/WHEEL +0 -0
  193. {dv_pipecat_ai-0.0.82.dev857.dist-info → dv_pipecat_ai-0.0.85.dev837.dist-info}/licenses/LICENSE +0 -0
  194. {dv_pipecat_ai-0.0.82.dev857.dist-info → dv_pipecat_ai-0.0.85.dev837.dist-info}/top_level.txt +0 -0
  195. /pipecat/services/{aws_nova_sonic → aws/nova_sonic}/ready.wav +0 -0
pipecat/services/elevenlabs/stt.py

@@ -0,0 +1,339 @@
+#
+# Copyright (c) 2024–2025, Daily
+#
+# SPDX-License-Identifier: BSD 2-Clause License
+#
+
+"""ElevenLabs speech-to-text service implementation.
+
+This module provides integration with ElevenLabs' Speech-to-Text API for transcription
+using segmented audio processing. The service uploads audio files and receives
+transcription results directly.
+"""
+
+import io
+from typing import AsyncGenerator, Optional
+
+import aiohttp
+from loguru import logger
+from pydantic import BaseModel
+
+from pipecat.frames.frames import ErrorFrame, Frame, TranscriptionFrame
+from pipecat.services.stt_service import SegmentedSTTService
+from pipecat.transcriptions.language import Language
+from pipecat.utils.time import time_now_iso8601
+from pipecat.utils.tracing.service_decorators import traced_stt
+
+
+def language_to_elevenlabs_language(language: Language) -> Optional[str]:
+    """Convert a Language enum to ElevenLabs language code.
+
+    Source:
+        https://elevenlabs.io/docs/capabilities/speech-to-text
+
+    Args:
+        language: The Language enum value to convert.
+
+    Returns:
+        The corresponding ElevenLabs language code, or None if not supported.
+    """
+    BASE_LANGUAGES = {
+        Language.AF: "afr",  # Afrikaans
+        Language.AM: "amh",  # Amharic
+        Language.AR: "ara",  # Arabic
+        Language.HY: "hye",  # Armenian
+        Language.AS: "asm",  # Assamese
+        Language.AST: "ast",  # Asturian
+        Language.AZ: "aze",  # Azerbaijani
+        Language.BE: "bel",  # Belarusian
+        Language.BN: "ben",  # Bengali
+        Language.BS: "bos",  # Bosnian
+        Language.BG: "bul",  # Bulgarian
+        Language.MY: "mya",  # Burmese
+        Language.YUE: "yue",  # Cantonese
+        Language.CA: "cat",  # Catalan
+        Language.CEB: "ceb",  # Cebuano
+        Language.NY: "nya",  # Chichewa
+        Language.HR: "hrv",  # Croatian
+        Language.CS: "ces",  # Czech
+        Language.DA: "dan",  # Danish
+        Language.NL: "nld",  # Dutch
+        Language.EN: "eng",  # English
+        Language.ET: "est",  # Estonian
+        Language.FIL: "fil",  # Filipino
+        Language.FI: "fin",  # Finnish
+        Language.FR: "fra",  # French
+        Language.FF: "ful",  # Fulah
+        Language.GL: "glg",  # Galician
+        Language.LG: "lug",  # Ganda
+        Language.KA: "kat",  # Georgian
+        Language.DE: "deu",  # German
+        Language.EL: "ell",  # Greek
+        Language.GU: "guj",  # Gujarati
+        Language.HA: "hau",  # Hausa
+        Language.HE: "heb",  # Hebrew
+        Language.HI: "hin",  # Hindi
+        Language.HU: "hun",  # Hungarian
+        Language.IS: "isl",  # Icelandic
+        Language.IG: "ibo",  # Igbo
+        Language.ID: "ind",  # Indonesian
+        Language.GA: "gle",  # Irish
+        Language.IT: "ita",  # Italian
+        Language.JA: "jpn",  # Japanese
+        Language.JV: "jav",  # Javanese
+        Language.KEA: "kea",  # Kabuverdianu
+        Language.KN: "kan",  # Kannada
+        Language.KK: "kaz",  # Kazakh
+        Language.KM: "khm",  # Khmer
+        Language.KO: "kor",  # Korean
+        Language.KU: "kur",  # Kurdish
+        Language.KY: "kir",  # Kyrgyz
+        Language.LO: "lao",  # Lao
+        Language.LV: "lav",  # Latvian
+        Language.LN: "lin",  # Lingala
+        Language.LT: "lit",  # Lithuanian
+        Language.LUO: "luo",  # Luo
+        Language.LB: "ltz",  # Luxembourgish
+        Language.MK: "mkd",  # Macedonian
+        Language.MS: "msa",  # Malay
+        Language.ML: "mal",  # Malayalam
+        Language.MT: "mlt",  # Maltese
+        Language.ZH: "zho",  # Mandarin Chinese
+        Language.MI: "mri",  # Māori
+        Language.MR: "mar",  # Marathi
+        Language.MN: "mon",  # Mongolian
+        Language.NE: "nep",  # Nepali
+        Language.NSO: "nso",  # Northern Sotho
+        Language.NO: "nor",  # Norwegian
+        Language.OC: "oci",  # Occitan
+        Language.OR: "ori",  # Odia
+        Language.PS: "pus",  # Pashto
+        Language.FA: "fas",  # Persian
+        Language.PL: "pol",  # Polish
+        Language.PT: "por",  # Portuguese
+        Language.PA: "pan",  # Punjabi
+        Language.RO: "ron",  # Romanian
+        Language.RU: "rus",  # Russian
+        Language.SR: "srp",  # Serbian
+        Language.SN: "sna",  # Shona
+        Language.SD: "snd",  # Sindhi
+        Language.SK: "slk",  # Slovak
+        Language.SL: "slv",  # Slovenian
+        Language.SO: "som",  # Somali
+        Language.ES: "spa",  # Spanish
+        Language.SW: "swa",  # Swahili
+        Language.SV: "swe",  # Swedish
+        Language.TA: "tam",  # Tamil
+        Language.TG: "tgk",  # Tajik
+        Language.TE: "tel",  # Telugu
+        Language.TH: "tha",  # Thai
+        Language.TR: "tur",  # Turkish
+        Language.UK: "ukr",  # Ukrainian
+        Language.UMB: "umb",  # Umbundu
+        Language.UR: "urd",  # Urdu
+        Language.UZ: "uzb",  # Uzbek
+        Language.VI: "vie",  # Vietnamese
+        Language.CY: "cym",  # Welsh
+        Language.WO: "wol",  # Wolof
+        Language.XH: "xho",  # Xhosa
+        Language.ZU: "zul",  # Zulu
+    }
+
+    result = BASE_LANGUAGES.get(language)
+
+    # If not found in base languages, try to find the base language from a variant
+    if not result:
+        lang_str = str(language.value)
+        base_code = lang_str.split("-")[0].lower()
+        result = base_code if base_code in BASE_LANGUAGES.values() else None
+
+    return result
+
+
+class ElevenLabsSTTService(SegmentedSTTService):
+    """Speech-to-text service using ElevenLabs' file-based API.
+
+    This service uses ElevenLabs' Speech-to-Text API to perform transcription on audio
+    segments. It inherits from SegmentedSTTService to handle audio buffering and speech detection.
+    The service uploads audio files to ElevenLabs and receives transcription results directly.
+    """
+
+    class InputParams(BaseModel):
+        """Configuration parameters for ElevenLabs STT API.
+
+        Parameters:
+            language: Target language for transcription.
+            tag_audio_events: Whether to include audio events like (laughter) and (coughing) in the transcription.
+        """
+
+        language: Optional[Language] = None
+        tag_audio_events: bool = True
+
+    def __init__(
+        self,
+        *,
+        api_key: str,
+        aiohttp_session: aiohttp.ClientSession,
+        base_url: str = "https://api.elevenlabs.io",
+        model: str = "scribe_v1",
+        sample_rate: Optional[int] = None,
+        params: Optional[InputParams] = None,
+        **kwargs,
+    ):
+        """Initialize the ElevenLabs STT service.
+
+        Args:
+            api_key: ElevenLabs API key for authentication.
+            aiohttp_session: aiohttp ClientSession for HTTP requests.
+            base_url: Base URL for ElevenLabs API.
+            model: Model ID for transcription. Defaults to "scribe_v1".
+            sample_rate: Audio sample rate in Hz. If not provided, uses the pipeline's rate.
+            params: Configuration parameters for the STT service.
+            **kwargs: Additional arguments passed to SegmentedSTTService.
+        """
+        super().__init__(
+            sample_rate=sample_rate,
+            **kwargs,
+        )
+
+        params = params or ElevenLabsSTTService.InputParams()
+
+        self._api_key = api_key
+        self._base_url = base_url
+        self._session = aiohttp_session
+        self._model_id = model
+        self._tag_audio_events = params.tag_audio_events
+
+        self._settings = {
+            "language": self.language_to_service_language(params.language)
+            if params.language
+            else "eng",
+        }
+
+    def can_generate_metrics(self) -> bool:
+        """Check if the service can generate processing metrics.
+
+        Returns:
+            True, as ElevenLabs STT service supports metrics generation.
+        """
+        return True
+
+    def language_to_service_language(self, language: Language) -> Optional[str]:
+        """Convert a Language enum to ElevenLabs service-specific language code.
+
+        Args:
+            language: The language to convert.
+
+        Returns:
+            The ElevenLabs-specific language code, or None if not supported.
+        """
+        return language_to_elevenlabs_language(language)
+
+    async def set_language(self, language: Language):
+        """Set the transcription language.
+
+        Args:
+            language: The language to use for speech-to-text transcription.
+        """
+        self.logger.info(f"Switching STT language to: [{language}]")
+        self._settings["language"] = self.language_to_service_language(language)
+
+    async def set_model(self, model: str):
+        """Set the STT model.
+
+        Args:
+            model: The model name to use for transcription.
+
+        Note:
+            ElevenLabs STT API does not currently support model selection.
+            This method is provided for interface compatibility.
+        """
+        await super().set_model(model)
+        self.logger.info(f"Model setting [{model}] noted, but ElevenLabs STT uses default model")
+
+    async def _transcribe_audio(self, audio_data: bytes) -> dict:
+        """Upload audio data to ElevenLabs and get transcription result.
+
+        Args:
+            audio_data: Raw audio bytes in WAV format.
+
+        Returns:
+            The transcription result data.
+
+        Raises:
+            Exception: If transcription fails or returns an error.
+        """
+        url = f"{self._base_url}/v1/speech-to-text"
+        headers = {"xi-api-key": self._api_key}
+
+        # Create form data with the audio file
+        data = aiohttp.FormData()
+        data.add_field(
+            "file",
+            io.BytesIO(audio_data),
+            filename="audio.wav",
+            content_type="audio/x-wav",
+        )
+
+        # Add required model_id, language_code, and tag_audio_events
+        data.add_field("model_id", self._model_id)
+        data.add_field("language_code", self._settings["language"])
+        data.add_field("tag_audio_events", str(self._tag_audio_events).lower())
+
+        async with self._session.post(url, data=data, headers=headers) as response:
+            if response.status != 200:
+                error_text = await response.text()
+                self.logger.error(f"ElevenLabs transcription error: {error_text}")
+                raise Exception(f"Transcription failed with status {response.status}: {error_text}")
+
+            result = await response.json()
+            return result
+
+    @traced_stt
+    async def _handle_transcription(
+        self, transcript: str, is_final: bool, language: Optional[str] = None
+    ):
+        """Handle a transcription result with tracing."""
+        await self.stop_ttfb_metrics()
+        await self.stop_processing_metrics()
+
+    async def run_stt(self, audio: bytes) -> AsyncGenerator[Frame, None]:
+        """Transcribe an audio segment using ElevenLabs' STT API.
+
+        Args:
+            audio: Raw audio bytes in WAV format (already converted by base class).
+
+        Yields:
+            Frame: TranscriptionFrame containing the transcribed text, or ErrorFrame on failure.
+
+        Note:
+            The audio is already in WAV format from the SegmentedSTTService.
+            Only non-empty transcriptions are yielded.
+        """
+        try:
+            await self.start_processing_metrics()
+            await self.start_ttfb_metrics()
+
+            # Upload audio and get transcription result directly
+            result = await self._transcribe_audio(audio)
+
+            # Extract transcription text
+            text = result.get("text", "").strip()
+            if text:
+                # Use the language_code returned by the API
+                detected_language = result.get("language_code", "eng")
+
+                await self._handle_transcription(text, True, detected_language)
+                self.logger.debug(f"Transcription: [{text}]")
+
+                yield TranscriptionFrame(
+                    text,
+                    self._user_id,
+                    time_now_iso8601(),
+                    detected_language,
+                    result=result,
+                )
+
+        except Exception as e:
+            self.logger.error(f"ElevenLabs STT error: {e}")
+            yield ErrorFrame(f"ElevenLabs STT error: {str(e)}")
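
For orientation, a minimal sketch of how the new ElevenLabsSTTService might be wired up. The session handling and parameter values below are illustrative, not part of this diff:

    import aiohttp

    from pipecat.services.elevenlabs.stt import ElevenLabsSTTService
    from pipecat.transcriptions.language import Language

    async def make_stt() -> ElevenLabsSTTService:
        # SegmentedSTTService buffers VAD-segmented audio and calls run_stt()
        # once per utterance, so no streaming connection is needed.
        session = aiohttp.ClientSession()  # caller is responsible for closing it
        return ElevenLabsSTTService(
            api_key="ELEVENLABS_API_KEY",  # placeholder, not a real key
            aiohttp_session=session,
            params=ElevenLabsSTTService.InputParams(
                language=Language.EN,    # sent as language_code="eng"
                tag_audio_events=False,  # drop (laughter)-style annotations
            ),
        )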
pipecat/services/elevenlabs/tts.py

@@ -25,9 +25,9 @@ from pipecat.frames.frames import (
     EndFrame,
     ErrorFrame,
     Frame,
+    InterruptionFrame,
     LLMFullResponseEndFrame,
     StartFrame,
-    StartInterruptionFrame,
     TTSAudioRawFrame,
     TTSStartedFrame,
     TTSStoppedFrame,
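
Note the rename above: StartInterruptionFrame is replaced by InterruptionFrame throughout this file. Downstream code that matches on the old name needs the same one-line change; a hypothetical handler for illustration:

    from pipecat.frames.frames import InterruptionFrame  # was StartInterruptionFrame

    async def process_frame(self, frame, direction):
        # Hypothetical processor method, not part of this diff.
        if isinstance(frame, InterruptionFrame):
            ...  # cancel in-flight synthesis, reset per-utterance state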
@@ -172,16 +172,24 @@ def build_elevenlabs_voice_settings(


 def calculate_word_times(
-    alignment_info: Mapping[str, Any], cumulative_time: float
-) -> List[Tuple[str, float]]:
+    alignment_info: Mapping[str, Any],
+    cumulative_time: float,
+    partial_word: str = "",
+    partial_word_start_time: float = 0.0,
+) -> tuple[List[Tuple[str, float]], str, float]:
     """Calculate word timestamps from character alignment information.

     Args:
         alignment_info: Character alignment data from ElevenLabs API.
         cumulative_time: Base time offset for this chunk.
+        partial_word: Partial word carried over from previous chunk.
+        partial_word_start_time: Start time of the partial word.

     Returns:
-        List of (word, timestamp) tuples.
+        Tuple of (word_times, new_partial_word, new_partial_word_start_time):
+        - word_times: List of (word, timestamp) tuples for complete words
+        - new_partial_word: Incomplete word at end of chunk (empty if chunk ends with space)
+        - new_partial_word_start_time: Start time of the incomplete word
     """
     chars = alignment_info["chars"]
     char_start_times_ms = alignment_info["charStartTimesMs"]
@@ -190,41 +198,37 @@ def calculate_word_times(
         logger.error(
             f"calculate_word_times: length mismatch - chars={len(chars)}, times={len(char_start_times_ms)}"
         )
-        return []
+        return ([], partial_word, partial_word_start_time)

     # Build words and track their start positions
     words = []
-    word_start_indices = []
-    current_word = ""
-    word_start_index = None
+    word_start_times = []
+    current_word = partial_word  # Start with any partial word from previous chunk
+    word_start_time = partial_word_start_time if partial_word else None

     for i, char in enumerate(chars):
         if char == " ":
             # End of current word
             if current_word:  # Only add non-empty words
                 words.append(current_word)
-                word_start_indices.append(word_start_index)
+                word_start_times.append(word_start_time)
                 current_word = ""
-                word_start_index = None
+                word_start_time = None
         else:
             # Building a word
-            if word_start_index is None:  # First character of new word
-                word_start_index = i
+            if word_start_time is None:  # First character of new word
+                # Convert from milliseconds to seconds and add cumulative offset
+                word_start_time = cumulative_time + (char_start_times_ms[i] / 1000.0)
             current_word += char

-    # Handle the last word if there's no trailing space
-    if current_word and word_start_index is not None:
-        words.append(current_word)
-        word_start_indices.append(word_start_index)
+    # Build result for complete words
+    word_times = list(zip(words, word_start_times))

-    # Calculate timestamps for each word
-    word_times = []
-    for word, start_idx in zip(words, word_start_indices):
-        # Convert from milliseconds to seconds and add cumulative offset
-        start_time_seconds = cumulative_time + (char_start_times_ms[start_idx] / 1000.0)
-        word_times.append((word, start_time_seconds))
+    # Return any incomplete word at the end of this chunk
+    new_partial_word = current_word if current_word else ""
+    new_partial_word_start_time = word_start_time if word_start_time is not None else 0.0

-    return word_times
+    return (word_times, new_partial_word, new_partial_word_start_time)


 class ElevenLabsTTSService(AudioContextWordTTSService):
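
A worked example of the new carry-over contract (values are illustrative): a word split across two alignment chunks is returned as a partial by the first call and completed, with its original start time, by the second:

    from pipecat.services.elevenlabs.tts import calculate_word_times

    # Chunk 1: "Hello wo" - "wo" is cut off at the chunk boundary.
    a1 = {"chars": list("Hello wo"),
          "charStartTimesMs": [0, 50, 100, 150, 200, 250, 300, 350]}
    word_times, partial, partial_t = calculate_word_times(a1, cumulative_time=0.0)
    # word_times == [("Hello", 0.0)], partial == "wo", partial_t == 0.3

    # Chunk 2: "rld " completes the word; its start time is carried over.
    a2 = {"chars": list("rld "), "charStartTimesMs": [0, 40, 80, 120]}
    word_times, partial, partial_t = calculate_word_times(a2, 0.4, partial, partial_t)
    # word_times == [("world", 0.3)], partial == "" - nothing left dangling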
@@ -336,6 +340,9 @@ class ElevenLabsTTSService(AudioContextWordTTSService):
         # there's an interruption or TTSStoppedFrame.
         self._started = False
         self._cumulative_time = 0
+        # Track partial words that span across alignment chunks
+        self._partial_word = ""
+        self._partial_word_start_time = 0.0

         # Context management for v1 multi API
         self._context_id = None
@@ -465,7 +472,7 @@
             direction: The direction to push the frame.
         """
         await super().push_frame(frame, direction)
-        if isinstance(frame, (TTSStoppedFrame, StartInterruptionFrame)):
+        if isinstance(frame, (TTSStoppedFrame, InterruptionFrame)):
             self._started = False
             if isinstance(frame, TTSStoppedFrame):
                 await self.add_word_timestamps([("Reset", 0)])
@@ -526,6 +533,7 @@
                 url, max_size=16 * 1024 * 1024, additional_headers={"xi-api-key": self._api_key}
             )

+            await self._call_event_handler("on_connected")
         except Exception as e:
             self.logger.error(f"{self} initialization error: {e}")
             self._websocket = None
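
This hunk (together with the disconnect hunk below) surfaces the websocket lifecycle as service events. A sketch of how a caller might subscribe, assuming Pipecat's usual event_handler decorator and handler signature:

    tts = ElevenLabsTTSService(api_key="...", voice_id="...")

    @tts.event_handler("on_connected")
    async def on_connected(service):  # first argument assumed to be the service
        logger.info("ElevenLabs websocket connected")

    @tts.event_handler("on_disconnected")
    async def on_disconnected(service):
        logger.info("ElevenLabs websocket closed")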
@@ -544,13 +552,18 @@
             logger.debug("Disconnected from ElevenLabs")
         except Exception as e:
             self.logger.error(f"{self} error closing websocket: {e}")
+        finally:
+            self._started = False
+            self._context_id = None
+            self._websocket = None
+            await self._call_event_handler("on_disconnected")

     def _get_websocket(self):
         if self._websocket:
             return self._websocket
         raise Exception("Websocket not connected")

-    async def _handle_interruption(self, frame: StartInterruptionFrame, direction: FrameDirection):
+    async def _handle_interruption(self, frame: InterruptionFrame, direction: FrameDirection):
         """Handle interruption by closing the current context."""
         await super()._handle_interruption(frame, direction)

@@ -559,7 +572,7 @@
             logger.trace(f"Closing context {self._context_id} due to interruption")
             try:
                 # ElevenLabs requires that Pipecat manages the contexts and closes them
-                # when they're not longer in use. Since a StartInterruptionFrame is pushed
+                # when they're not longer in use. Since an InterruptionFrame is pushed
                 # every time the user speaks, we'll use this as a trigger to close the context
                 # and reset the state.
                 # Note: We do not need to call remove_audio_context here, as the context is
@@ -571,6 +584,8 @@
                 logger.error(f"Error closing context on interruption: {e}")
             self._context_id = None
             self._started = False
+            self._partial_word = ""
+            self._partial_word_start_time = 0.0

     async def _receive_messages(self):
         """Handle incoming WebSocket messages from ElevenLabs."""
@@ -610,7 +625,14 @@

                 if msg.get("alignment"):
                     alignment = msg["alignment"]
-                    word_times = calculate_word_times(alignment, self._cumulative_time)
+                    word_times, self._partial_word, self._partial_word_start_time = (
+                        calculate_word_times(
+                            alignment,
+                            self._cumulative_time,
+                            self._partial_word,
+                            self._partial_word_start_time,
+                        )
+                    )

                     if word_times:
                         await self.add_word_timestamps(word_times)
@@ -685,6 +707,8 @@
                 yield TTSStartedFrame()
                 self._started = True
                 self._cumulative_time = 0
+                self._partial_word = ""
+                self._partial_word_start_time = 0.0
                 # If a context ID does not exist, create a new one and
                 # register it. If an ID exists, that means the Pipeline is
                 # configured for allow_interruptions=False, so continue
@@ -758,6 +782,7 @@ class ElevenLabsHttpTTSService(WordTTSService):
         base_url: str = "https://api.elevenlabs.io",
         sample_rate: Optional[int] = None,
         params: Optional[InputParams] = None,
+        aggregate_sentences: Optional[bool] = True,
         **kwargs,
     ):
         """Initialize the ElevenLabs HTTP TTS service.
@@ -770,10 +795,11 @@
             base_url: Base URL for ElevenLabs HTTP API.
             sample_rate: Audio sample rate. If None, uses default.
             params: Additional input parameters for voice customization.
+            aggregate_sentences: Whether to aggregate sentences within the TTSService.
             **kwargs: Additional arguments passed to the parent service.
         """
         super().__init__(
-            aggregate_sentences=True,
+            aggregate_sentences=aggregate_sentences,
             push_text_frames=False,
             push_stop_frames=True,
             sample_rate=sample_rate,
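
The new aggregate_sentences flag lets callers opt out of the service's internal sentence aggregation, e.g. when an upstream processor already emits sentence-sized chunks. A hypothetical configuration:

    tts = ElevenLabsHttpTTSService(
        api_key="...",
        voice_id="...",
        aiohttp_session=session,        # session creation not shown
        aggregate_sentences=False,      # default True keeps the old behavior
    )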
@@ -811,6 +837,10 @@
         # Store previous text for context within a turn
         self._previous_text = ""

+        # Track partial words that span across alignment chunks
+        self._partial_word = ""
+        self._partial_word_start_time = 0.0
+
     def language_to_service_language(self, language: Language) -> Optional[str]:
         """Convert pipecat Language to ElevenLabs language code.

@@ -838,6 +868,8 @@
         self._cumulative_time = 0
         self._started = False
         self._previous_text = ""
+        self._partial_word = ""
+        self._partial_word_start_time = 0.0
         logger.debug(f"{self}: Reset internal state")

     async def start(self, frame: StartFrame):
@@ -858,7 +890,7 @@
             direction: The direction to push the frame.
         """
         await super().push_frame(frame, direction)
-        if isinstance(frame, (StartInterruptionFrame, TTSStoppedFrame)):
+        if isinstance(frame, (InterruptionFrame, TTSStoppedFrame)):
             # Reset timing on interruption or stop
             self._reset_state()

@@ -872,11 +904,13 @@ class ElevenLabsHttpTTSService(WordTTSService):
     def calculate_word_times(self, alignment_info: Mapping[str, Any]) -> List[Tuple[str, float]]:
         """Calculate word timing from character alignment data.

+        This method handles partial words that may span across multiple alignment chunks.
+
         Args:
             alignment_info: Character timing data from ElevenLabs.

         Returns:
-            List of (word, timestamp) pairs.
+            List of (word, timestamp) pairs for complete words in this chunk.

         Example input data::

@@ -902,30 +936,28 @@
         # Build the words and find their start times
         words = []
         word_start_times = []
-        current_word = ""
-        first_char_idx = -1
+        # Start with any partial word from previous chunk
+        current_word = self._partial_word
+        word_start_time = self._partial_word_start_time if self._partial_word else None

         for i, char in enumerate(chars):
             if char == " ":
                 if current_word:  # Only add non-empty words
                     words.append(current_word)
-                    # Use time of the first character of the word, offset by cumulative time
-                    word_start_times.append(
-                        self._cumulative_time + char_start_times[first_char_idx]
-                    )
+                    word_start_times.append(word_start_time)
                     current_word = ""
-                    first_char_idx = -1
+                    word_start_time = None
             else:
-                if not current_word:  # This is the first character of a new word
-                    first_char_idx = i
+                if word_start_time is None:  # First character of a new word
+                    # Use time of the first character of the word, offset by cumulative time
+                    word_start_time = self._cumulative_time + char_start_times[i]
                 current_word += char

-        # Don't forget the last word if there's no trailing space
-        if current_word and first_char_idx >= 0:
-            words.append(current_word)
-            word_start_times.append(self._cumulative_time + char_start_times[first_char_idx])
+        # Store any incomplete word at the end of this chunk
+        self._partial_word = current_word if current_word else ""
+        self._partial_word_start_time = word_start_time if word_start_time is not None else 0.0

-        # Create word-time pairs
+        # Create word-time pairs for complete words only
         word_times = list(zip(words, word_start_times))

         return word_times
@@ -961,6 +993,9 @@
         if self._voice_settings:
             payload["voice_settings"] = self._voice_settings

+        if self._settings["apply_text_normalization"] is not None:
+            payload["apply_text_normalization"] = self._settings["apply_text_normalization"]
+
         language = self._settings["language"]
         if self._model_name in ELEVENLABS_MULTILINGUAL_MODELS and language:
             payload["language_code"] = language
@@ -981,8 +1016,6 @@
         }
         if self._settings["optimize_streaming_latency"] is not None:
             params["optimize_streaming_latency"] = self._settings["optimize_streaming_latency"]
-        if self._settings["apply_text_normalization"] is not None:
-            params["apply_text_normalization"] = self._settings["apply_text_normalization"]

         self.logger.debug(f"ElevenLabs request - payload: {payload}, params: {params}")

@@ -1045,6 +1078,14 @@
                     logger.error(f"Error processing response: {e}", exc_info=True)
                     continue

+            # After processing all chunks, emit any remaining partial word
+            # since this is the end of the utterance
+            if self._partial_word:
+                final_word_time = [(self._partial_word, self._partial_word_start_time)]
+                await self.add_word_timestamps(final_word_time)
+                self._partial_word = ""
+                self._partial_word_start_time = 0.0
+
             # After processing all chunks, add the total utterance duration
             # to the cumulative time to ensure next utterance starts after this one
             if utterance_duration > 0: