dv-pipecat-ai 0.0.74.dev770__py3-none-any.whl → 0.0.82.dev776__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Potentially problematic release: this version of dv-pipecat-ai has been flagged as potentially problematic by the registry.
- {dv_pipecat_ai-0.0.74.dev770.dist-info → dv_pipecat_ai-0.0.82.dev776.dist-info}/METADATA +137 -93
- dv_pipecat_ai-0.0.82.dev776.dist-info/RECORD +340 -0
- pipecat/__init__.py +17 -0
- pipecat/adapters/base_llm_adapter.py +36 -1
- pipecat/adapters/schemas/direct_function.py +296 -0
- pipecat/adapters/schemas/function_schema.py +15 -6
- pipecat/adapters/schemas/tools_schema.py +55 -7
- pipecat/adapters/services/anthropic_adapter.py +22 -3
- pipecat/adapters/services/aws_nova_sonic_adapter.py +23 -3
- pipecat/adapters/services/bedrock_adapter.py +22 -3
- pipecat/adapters/services/gemini_adapter.py +16 -3
- pipecat/adapters/services/open_ai_adapter.py +17 -2
- pipecat/adapters/services/open_ai_realtime_adapter.py +23 -3
- pipecat/audio/filters/base_audio_filter.py +30 -6
- pipecat/audio/filters/koala_filter.py +37 -2
- pipecat/audio/filters/krisp_filter.py +59 -6
- pipecat/audio/filters/noisereduce_filter.py +37 -0
- pipecat/audio/interruptions/base_interruption_strategy.py +25 -5
- pipecat/audio/interruptions/min_words_interruption_strategy.py +21 -4
- pipecat/audio/mixers/base_audio_mixer.py +30 -7
- pipecat/audio/mixers/soundfile_mixer.py +53 -6
- pipecat/audio/resamplers/base_audio_resampler.py +17 -9
- pipecat/audio/resamplers/resampy_resampler.py +26 -1
- pipecat/audio/resamplers/soxr_resampler.py +32 -1
- pipecat/audio/resamplers/soxr_stream_resampler.py +101 -0
- pipecat/audio/utils.py +194 -1
- pipecat/audio/vad/silero.py +60 -3
- pipecat/audio/vad/vad_analyzer.py +114 -30
- pipecat/clocks/base_clock.py +19 -0
- pipecat/clocks/system_clock.py +25 -0
- pipecat/extensions/voicemail/__init__.py +0 -0
- pipecat/extensions/voicemail/voicemail_detector.py +707 -0
- pipecat/frames/frames.py +590 -156
- pipecat/metrics/metrics.py +64 -1
- pipecat/observers/base_observer.py +58 -19
- pipecat/observers/loggers/debug_log_observer.py +56 -64
- pipecat/observers/loggers/llm_log_observer.py +8 -1
- pipecat/observers/loggers/transcription_log_observer.py +19 -7
- pipecat/observers/loggers/user_bot_latency_log_observer.py +32 -5
- pipecat/observers/turn_tracking_observer.py +26 -1
- pipecat/pipeline/base_pipeline.py +5 -7
- pipecat/pipeline/base_task.py +52 -9
- pipecat/pipeline/parallel_pipeline.py +121 -177
- pipecat/pipeline/pipeline.py +129 -20
- pipecat/pipeline/runner.py +50 -1
- pipecat/pipeline/sync_parallel_pipeline.py +132 -32
- pipecat/pipeline/task.py +263 -280
- pipecat/pipeline/task_observer.py +85 -34
- pipecat/pipeline/to_be_updated/merge_pipeline.py +32 -2
- pipecat/processors/aggregators/dtmf_aggregator.py +29 -22
- pipecat/processors/aggregators/gated.py +25 -24
- pipecat/processors/aggregators/gated_openai_llm_context.py +22 -2
- pipecat/processors/aggregators/llm_response.py +398 -89
- pipecat/processors/aggregators/openai_llm_context.py +161 -13
- pipecat/processors/aggregators/sentence.py +25 -14
- pipecat/processors/aggregators/user_response.py +28 -3
- pipecat/processors/aggregators/vision_image_frame.py +24 -14
- pipecat/processors/async_generator.py +28 -0
- pipecat/processors/audio/audio_buffer_processor.py +78 -37
- pipecat/processors/consumer_processor.py +25 -6
- pipecat/processors/filters/frame_filter.py +23 -0
- pipecat/processors/filters/function_filter.py +30 -0
- pipecat/processors/filters/identity_filter.py +17 -2
- pipecat/processors/filters/null_filter.py +24 -1
- pipecat/processors/filters/stt_mute_filter.py +56 -21
- pipecat/processors/filters/wake_check_filter.py +46 -3
- pipecat/processors/filters/wake_notifier_filter.py +21 -3
- pipecat/processors/frame_processor.py +488 -131
- pipecat/processors/frameworks/langchain.py +38 -3
- pipecat/processors/frameworks/rtvi.py +719 -34
- pipecat/processors/gstreamer/pipeline_source.py +41 -0
- pipecat/processors/idle_frame_processor.py +26 -3
- pipecat/processors/logger.py +23 -0
- pipecat/processors/metrics/frame_processor_metrics.py +77 -4
- pipecat/processors/metrics/sentry.py +42 -4
- pipecat/processors/producer_processor.py +34 -14
- pipecat/processors/text_transformer.py +22 -10
- pipecat/processors/transcript_processor.py +48 -29
- pipecat/processors/user_idle_processor.py +31 -21
- pipecat/runner/__init__.py +1 -0
- pipecat/runner/daily.py +132 -0
- pipecat/runner/livekit.py +148 -0
- pipecat/runner/run.py +543 -0
- pipecat/runner/types.py +67 -0
- pipecat/runner/utils.py +515 -0
- pipecat/serializers/base_serializer.py +42 -0
- pipecat/serializers/exotel.py +17 -6
- pipecat/serializers/genesys.py +95 -0
- pipecat/serializers/livekit.py +33 -0
- pipecat/serializers/plivo.py +16 -15
- pipecat/serializers/protobuf.py +37 -1
- pipecat/serializers/telnyx.py +18 -17
- pipecat/serializers/twilio.py +32 -16
- pipecat/services/ai_service.py +5 -3
- pipecat/services/anthropic/llm.py +113 -43
- pipecat/services/assemblyai/models.py +63 -5
- pipecat/services/assemblyai/stt.py +64 -11
- pipecat/services/asyncai/__init__.py +0 -0
- pipecat/services/asyncai/tts.py +501 -0
- pipecat/services/aws/llm.py +185 -111
- pipecat/services/aws/stt.py +217 -23
- pipecat/services/aws/tts.py +118 -52
- pipecat/services/aws/utils.py +101 -5
- pipecat/services/aws_nova_sonic/aws.py +82 -64
- pipecat/services/aws_nova_sonic/context.py +15 -6
- pipecat/services/azure/common.py +10 -2
- pipecat/services/azure/image.py +32 -0
- pipecat/services/azure/llm.py +9 -7
- pipecat/services/azure/stt.py +65 -2
- pipecat/services/azure/tts.py +154 -23
- pipecat/services/cartesia/stt.py +125 -8
- pipecat/services/cartesia/tts.py +102 -38
- pipecat/services/cerebras/llm.py +15 -23
- pipecat/services/deepgram/stt.py +19 -11
- pipecat/services/deepgram/tts.py +36 -0
- pipecat/services/deepseek/llm.py +14 -23
- pipecat/services/elevenlabs/tts.py +330 -64
- pipecat/services/fal/image.py +43 -0
- pipecat/services/fal/stt.py +48 -10
- pipecat/services/fireworks/llm.py +14 -21
- pipecat/services/fish/tts.py +109 -9
- pipecat/services/gemini_multimodal_live/__init__.py +1 -0
- pipecat/services/gemini_multimodal_live/events.py +83 -2
- pipecat/services/gemini_multimodal_live/file_api.py +189 -0
- pipecat/services/gemini_multimodal_live/gemini.py +218 -21
- pipecat/services/gladia/config.py +17 -10
- pipecat/services/gladia/stt.py +82 -36
- pipecat/services/google/frames.py +40 -0
- pipecat/services/google/google.py +2 -0
- pipecat/services/google/image.py +39 -2
- pipecat/services/google/llm.py +176 -58
- pipecat/services/google/llm_openai.py +26 -4
- pipecat/services/google/llm_vertex.py +37 -15
- pipecat/services/google/rtvi.py +41 -0
- pipecat/services/google/stt.py +65 -17
- pipecat/services/google/test-google-chirp.py +45 -0
- pipecat/services/google/tts.py +390 -19
- pipecat/services/grok/llm.py +8 -6
- pipecat/services/groq/llm.py +8 -6
- pipecat/services/groq/stt.py +13 -9
- pipecat/services/groq/tts.py +40 -0
- pipecat/services/hamsa/__init__.py +9 -0
- pipecat/services/hamsa/stt.py +241 -0
- pipecat/services/heygen/__init__.py +5 -0
- pipecat/services/heygen/api.py +281 -0
- pipecat/services/heygen/client.py +620 -0
- pipecat/services/heygen/video.py +338 -0
- pipecat/services/image_service.py +5 -3
- pipecat/services/inworld/__init__.py +1 -0
- pipecat/services/inworld/tts.py +592 -0
- pipecat/services/llm_service.py +127 -45
- pipecat/services/lmnt/tts.py +80 -7
- pipecat/services/mcp_service.py +85 -44
- pipecat/services/mem0/memory.py +42 -13
- pipecat/services/minimax/tts.py +74 -15
- pipecat/services/mistral/__init__.py +0 -0
- pipecat/services/mistral/llm.py +185 -0
- pipecat/services/moondream/vision.py +55 -10
- pipecat/services/neuphonic/tts.py +275 -48
- pipecat/services/nim/llm.py +8 -6
- pipecat/services/ollama/llm.py +27 -7
- pipecat/services/openai/base_llm.py +54 -16
- pipecat/services/openai/image.py +30 -0
- pipecat/services/openai/llm.py +7 -5
- pipecat/services/openai/stt.py +13 -9
- pipecat/services/openai/tts.py +42 -10
- pipecat/services/openai_realtime_beta/azure.py +11 -9
- pipecat/services/openai_realtime_beta/context.py +7 -5
- pipecat/services/openai_realtime_beta/events.py +10 -7
- pipecat/services/openai_realtime_beta/openai.py +37 -18
- pipecat/services/openpipe/llm.py +30 -24
- pipecat/services/openrouter/llm.py +9 -7
- pipecat/services/perplexity/llm.py +15 -19
- pipecat/services/piper/tts.py +26 -12
- pipecat/services/playht/tts.py +227 -65
- pipecat/services/qwen/llm.py +8 -6
- pipecat/services/rime/tts.py +128 -17
- pipecat/services/riva/stt.py +160 -22
- pipecat/services/riva/tts.py +67 -2
- pipecat/services/sambanova/llm.py +19 -17
- pipecat/services/sambanova/stt.py +14 -8
- pipecat/services/sarvam/tts.py +60 -13
- pipecat/services/simli/video.py +82 -21
- pipecat/services/soniox/__init__.py +0 -0
- pipecat/services/soniox/stt.py +398 -0
- pipecat/services/speechmatics/stt.py +29 -17
- pipecat/services/stt_service.py +47 -11
- pipecat/services/tavus/video.py +94 -25
- pipecat/services/together/llm.py +8 -6
- pipecat/services/tts_service.py +77 -53
- pipecat/services/ultravox/stt.py +46 -43
- pipecat/services/vision_service.py +5 -3
- pipecat/services/websocket_service.py +12 -11
- pipecat/services/whisper/base_stt.py +58 -12
- pipecat/services/whisper/stt.py +69 -58
- pipecat/services/xtts/tts.py +59 -2
- pipecat/sync/base_notifier.py +19 -0
- pipecat/sync/event_notifier.py +24 -0
- pipecat/tests/utils.py +73 -5
- pipecat/transcriptions/language.py +24 -0
- pipecat/transports/base_input.py +112 -8
- pipecat/transports/base_output.py +235 -13
- pipecat/transports/base_transport.py +119 -0
- pipecat/transports/local/audio.py +76 -0
- pipecat/transports/local/tk.py +84 -0
- pipecat/transports/network/fastapi_websocket.py +174 -15
- pipecat/transports/network/small_webrtc.py +383 -39
- pipecat/transports/network/webrtc_connection.py +214 -8
- pipecat/transports/network/websocket_client.py +171 -1
- pipecat/transports/network/websocket_server.py +147 -9
- pipecat/transports/services/daily.py +792 -70
- pipecat/transports/services/helpers/daily_rest.py +122 -129
- pipecat/transports/services/livekit.py +339 -4
- pipecat/transports/services/tavus.py +273 -38
- pipecat/utils/asyncio/task_manager.py +92 -186
- pipecat/utils/base_object.py +83 -1
- pipecat/utils/network.py +2 -0
- pipecat/utils/string.py +114 -58
- pipecat/utils/text/base_text_aggregator.py +44 -13
- pipecat/utils/text/base_text_filter.py +46 -0
- pipecat/utils/text/markdown_text_filter.py +70 -14
- pipecat/utils/text/pattern_pair_aggregator.py +18 -14
- pipecat/utils/text/simple_text_aggregator.py +43 -2
- pipecat/utils/text/skip_tags_aggregator.py +21 -13
- pipecat/utils/time.py +36 -0
- pipecat/utils/tracing/class_decorators.py +32 -7
- pipecat/utils/tracing/conversation_context_provider.py +12 -2
- pipecat/utils/tracing/service_attributes.py +80 -64
- pipecat/utils/tracing/service_decorators.py +48 -21
- pipecat/utils/tracing/setup.py +13 -7
- pipecat/utils/tracing/turn_context_provider.py +12 -2
- pipecat/utils/tracing/turn_trace_observer.py +27 -0
- pipecat/utils/utils.py +14 -14
- dv_pipecat_ai-0.0.74.dev770.dist-info/RECORD +0 -319
- pipecat/examples/daily_runner.py +0 -64
- pipecat/examples/run.py +0 -265
- pipecat/utils/asyncio/watchdog_async_iterator.py +0 -72
- pipecat/utils/asyncio/watchdog_event.py +0 -42
- pipecat/utils/asyncio/watchdog_priority_queue.py +0 -48
- pipecat/utils/asyncio/watchdog_queue.py +0 -48
- {dv_pipecat_ai-0.0.74.dev770.dist-info → dv_pipecat_ai-0.0.82.dev776.dist-info}/WHEEL +0 -0
- {dv_pipecat_ai-0.0.74.dev770.dist-info → dv_pipecat_ai-0.0.82.dev776.dist-info}/licenses/LICENSE +0 -0
- {dv_pipecat_ai-0.0.74.dev770.dist-info → dv_pipecat_ai-0.0.82.dev776.dist-info}/top_level.txt +0 -0
- /pipecat/{examples → extensions}/__init__.py +0 -0
--- a/pipecat/services/elevenlabs/tts.py
+++ b/pipecat/services/elevenlabs/tts.py
@@ -4,6 +4,12 @@
 # SPDX-License-Identifier: BSD 2-Clause License
 #
 
+"""ElevenLabs text-to-speech service implementations.
+
+This module provides WebSocket and HTTP-based TTS services using ElevenLabs API
+with support for streaming audio, word timestamps, and voice customization.
+"""
+
 import asyncio
 import base64
 import json
@@ -32,12 +38,13 @@ from pipecat.services.tts_service import (
     WordTTSService,
 )
 from pipecat.transcriptions.language import Language
-from pipecat.utils.asyncio.watchdog_async_iterator import WatchdogAsyncIterator
 from pipecat.utils.tracing.service_decorators import traced_tts
 
 # See .env.example for ElevenLabs configuration needed
 try:
     import websockets
+    from websockets.asyncio.client import connect as websocket_connect
+    from websockets.protocol import State
 except ModuleNotFoundError as e:
     logger.error(f"Exception: {e}")
     logger.error("In order to use ElevenLabs, you need to `pip install pipecat-ai[elevenlabs]`.")
@@ -57,6 +64,14 @@ ELEVENLABS_MULTILINGUAL_MODELS = {
 
 
 def language_to_elevenlabs_language(language: Language) -> Optional[str]:
+    """Convert a Language enum to ElevenLabs language code.
+
+    Args:
+        language: The Language enum value to convert.
+
+    Returns:
+        The corresponding ElevenLabs language code, or None if not supported.
+    """
     BASE_LANGUAGES = {
         Language.AR: "ar",
         Language.BG: "bg",
@@ -106,6 +121,14 @@ def language_to_elevenlabs_language(language: Language) -> Optional[str]:
 
 
 def output_format_from_sample_rate(sample_rate: int) -> str:
+    """Get the appropriate output format string for a given sample rate.
+
+    Args:
+        sample_rate: The audio sample rate in Hz.
+
+    Returns:
+        The ElevenLabs output format string.
+    """
     match sample_rate:
         case 8000:
             return "pcm_8000"
@@ -129,10 +152,10 @@ def build_elevenlabs_voice_settings(
     """Build voice settings dictionary for ElevenLabs based on provided settings.
 
     Args:
-        settings: Dictionary containing voice settings parameters
+        settings: Dictionary containing voice settings parameters.
 
     Returns:
-        Dictionary of voice settings or None if no valid settings are provided
+        Dictionary of voice settings or None if no valid settings are provided.
     """
     voice_setting_keys = ["stability", "similarity_boost", "style", "use_speaker_boost", "speed"]
 
@@ -151,26 +174,83 @@ def build_elevenlabs_voice_settings(
 def calculate_word_times(
     alignment_info: Mapping[str, Any], cumulative_time: float
 ) -> List[Tuple[str, float]]:
-    zipped_times = list(zip(alignment_info["chars"], alignment_info["charStartTimesMs"]))
+    """Calculate word timestamps from character alignment information.
 
-    words = "".join(alignment_info["chars"]).split(" ")
+    Args:
+        alignment_info: Character alignment data from ElevenLabs API.
+        cumulative_time: Base time offset for this chunk.
 
-    # Calculate start time for each word. We do this by finding a space character
-    # and using the previous word time, also taking into account there might not
-    # be a space at the end.
-    times = []
-    for i, (a, b) in enumerate(zipped_times):
-        if a == " " or i == len(zipped_times) - 1:
-            t = cumulative_time + (zipped_times[i - 1][1] / 1000.0)
-            times.append(t)
+    Returns:
+        List of (word, timestamp) tuples.
+    """
+    chars = alignment_info["chars"]
+    char_start_times_ms = alignment_info["charStartTimesMs"]
 
-    word_times = list(zip(words, times))
+    if len(chars) != len(char_start_times_ms):
+        logger.error(
+            f"calculate_word_times: length mismatch - chars={len(chars)}, times={len(char_start_times_ms)}"
+        )
+        return []
+
+    # Build words and track their start positions
+    words = []
+    word_start_indices = []
+    current_word = ""
+    word_start_index = None
+
+    for i, char in enumerate(chars):
+        if char == " ":
+            # End of current word
+            if current_word:  # Only add non-empty words
+                words.append(current_word)
+                word_start_indices.append(word_start_index)
+            current_word = ""
+            word_start_index = None
+        else:
+            # Building a word
+            if word_start_index is None:  # First character of new word
+                word_start_index = i
+            current_word += char
+
+    # Handle the last word if there's no trailing space
+    if current_word and word_start_index is not None:
+        words.append(current_word)
+        word_start_indices.append(word_start_index)
+
+    # Calculate timestamps for each word
+    word_times = []
+    for word, start_idx in zip(words, word_start_indices):
+        # Convert from milliseconds to seconds and add cumulative offset
+        start_time_seconds = cumulative_time + (char_start_times_ms[start_idx] / 1000.0)
+        word_times.append((word, start_time_seconds))
 
     return word_times
 
 
 class ElevenLabsTTSService(AudioContextWordTTSService):
+    """ElevenLabs WebSocket-based TTS service with word timestamps.
+
+    Provides real-time text-to-speech using ElevenLabs' WebSocket streaming API.
+    Supports word-level timestamps, audio context management, and various voice
+    customization options including stability, similarity boost, and speed controls.
+    """
+
     class InputParams(BaseModel):
+        """Input parameters for ElevenLabs TTS configuration.
+
+        Parameters:
+            language: Language to use for synthesis.
+            stability: Voice stability control (0.0 to 1.0).
+            similarity_boost: Similarity boost control (0.0 to 1.0).
+            style: Style control for voice expression (0.0 to 1.0).
+            use_speaker_boost: Whether to use speaker boost enhancement.
+            speed: Voice speed control (0.7 to 1.2).
+            auto_mode: Whether to enable automatic mode optimization.
+            enable_ssml_parsing: Whether to parse SSML tags in text.
+            enable_logging: Whether to enable ElevenLabs logging.
+            apply_text_normalization: Text normalization mode ("auto", "on", "off").
+        """
+
         language: Optional[Language] = None
         stability: Optional[float] = None
         similarity_boost: Optional[float] = None
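The rewritten `calculate_word_times` replaces the old space-scanning heuristic with an explicit grouping pass: characters are accumulated into words, and each word's timestamp is taken from its first character's start time. A standalone sketch of the same grouping logic (plain Python, independent of pipecat; the function name and sample data are illustrative only):

```python
# Sketch mirroring the grouping logic added above, not the package function itself.
def word_times_from_alignment(chars, char_start_times_ms, cumulative_time=0.0):
    words, starts = [], []
    current, start_idx = "", None
    for i, ch in enumerate(chars):
        if ch == " ":
            if current:  # close out the word we were building
                words.append(current)
                starts.append(start_idx)
            current, start_idx = "", None
        else:
            if start_idx is None:  # first character of a new word
                start_idx = i
            current += ch
    if current and start_idx is not None:  # last word, no trailing space
        words.append(current)
        starts.append(start_idx)
    # A word's timestamp is its first character's start time (ms -> s),
    # offset by the cumulative time of previous chunks.
    return [(w, cumulative_time + char_start_times_ms[i] / 1000.0) for w, i in zip(words, starts)]


chars = list("Hello world")  # 11 characters
starts_ms = [0, 50, 100, 150, 200, 250, 400, 450, 500, 550, 600]
print(word_times_from_alignment(chars, starts_ms))  # [('Hello', 0.0), ('world', 0.4)]
```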
@@ -180,18 +260,32 @@ class ElevenLabsTTSService(AudioContextWordTTSService):
         auto_mode: Optional[bool] = True
         enable_ssml_parsing: Optional[bool] = None
         enable_logging: Optional[bool] = None
+        apply_text_normalization: Optional[Literal["auto", "on", "off"]] = None
 
     def __init__(
         self,
         *,
         api_key: str,
         voice_id: str,
-        model: str = "eleven_flash_v2_5",
+        model: str = "eleven_turbo_v2_5",
         url: str = "wss://api.elevenlabs.io",
         sample_rate: Optional[int] = None,
         params: Optional[InputParams] = None,
+        aggregate_sentences: Optional[bool] = True,
         **kwargs,
     ):
+        """Initialize the ElevenLabs TTS service.
+
+        Args:
+            api_key: ElevenLabs API key for authentication.
+            voice_id: ID of the voice to use for synthesis.
+            model: TTS model to use (e.g., "eleven_turbo_v2_5").
+            url: WebSocket URL for ElevenLabs TTS API.
+            sample_rate: Audio sample rate. If None, uses default.
+            params: Additional input parameters for voice customization.
+            aggregate_sentences: Whether to aggregate sentences within the TTSService.
+            **kwargs: Additional arguments passed to the parent service.
+        """
         # Aggregating sentences still gives cleaner-sounding results and fewer
         # artifacts than streaming one word at a time. On average, waiting for a
         # full sentence should only "cost" us 15ms or so with GPT-4o or a Llama
@@ -207,7 +301,7 @@ class ElevenLabsTTSService(AudioContextWordTTSService):
         # speaking for a while, so we want the parent class to send TTSStopFrame
         # after a short period not receiving any audio.
         super().__init__(
-            aggregate_sentences=True,
+            aggregate_sentences=aggregate_sentences,
             push_text_frames=False,
             push_stop_frames=True,
             pause_frame_processing=True,
@@ -231,6 +325,7 @@ class ElevenLabsTTSService(AudioContextWordTTSService):
             "auto_mode": str(params.auto_mode).lower(),
             "enable_ssml_parsing": params.enable_ssml_parsing,
             "enable_logging": params.enable_logging,
+            "apply_text_normalization": params.apply_text_normalization,
         }
         self.set_model_name(model)
         self.set_voice(voice_id)
@@ -248,43 +343,114 @@ class ElevenLabsTTSService(AudioContextWordTTSService):
         self._keepalive_task = None
 
     def can_generate_metrics(self) -> bool:
+        """Check if this service can generate processing metrics.
+
+        Returns:
+            True, as ElevenLabs service supports metrics generation.
+        """
         return True
 
     def language_to_service_language(self, language: Language) -> Optional[str]:
+        """Convert a Language enum to ElevenLabs language format.
+
+        Args:
+            language: The language to convert.
+
+        Returns:
+            The ElevenLabs-specific language code, or None if not supported.
+        """
         return language_to_elevenlabs_language(language)
 
     def _set_voice_settings(self):
         return build_elevenlabs_voice_settings(self._settings)
 
     async def set_model(self, model: str):
+        """Set the TTS model and reconnect.
+
+        Args:
+            model: The model name to use for synthesis.
+        """
         await super().set_model(model)
         self.logger.info(f"Switching TTS model to: [{model}]")
         await self._disconnect()
         await self._connect()
 
     async def _update_settings(self, settings: Mapping[str, Any]):
+        """Update service settings and reconnect if voice, model, or language changed."""
+        # Track previous values for settings that require reconnection
         prev_voice = self._voice_id
+        prev_model = self.model_name
+        prev_language = self._settings.get("language")
+        # Create snapshot of current voice settings to detect changes after update
+        prev_voice_settings = self._voice_settings.copy() if self._voice_settings else None
+
         await super()._update_settings(settings)
-
-        if prev_voice != self._voice_id:
+
+        # Update voice settings for the next context creation
+        self._voice_settings = self._set_voice_settings()
+
+        # Check if URL-level settings changed (these require reconnection)
+        url_changed = (
+            prev_voice != self._voice_id
+            or prev_model != self.model_name
+            or prev_language != self._settings.get("language")
+        )
+
+        # Check if only voice settings changed (speed, stability, etc.)
+        voice_settings_changed = prev_voice_settings != self._voice_settings
+
+        if url_changed:
+            # These settings are in the WebSocket URL, so we need to reconnect
+            logger.debug(
+                f"URL-level setting changed (voice/model/language), reconnecting WebSocket"
+            )
             await self._disconnect()
             await self._connect()
             self.logger.info(f"Switching TTS voice to: [{self._voice_id}]")
+        elif voice_settings_changed and self._context_id:
+            # Voice settings can be updated by closing current context
+            # so new one gets created with updated voice settings
+            logger.debug(f"Voice settings changed, closing current context to apply changes")
+            try:
+                if self._websocket:
+                    await self._websocket.send(
+                        json.dumps({"context_id": self._context_id, "close_context": True})
+                    )
+            except Exception as e:
+                logger.warning(f"Error closing context for voice settings update: {e}")
+            self._context_id = None
+            self._started = False
 
     async def start(self, frame: StartFrame):
+        """Start the ElevenLabs TTS service.
+
+        Args:
+            frame: The start frame containing initialization parameters.
+        """
         await super().start(frame)
         self._output_format = output_format_from_sample_rate(self.sample_rate)
         await self._connect()
 
     async def stop(self, frame: EndFrame):
+        """Stop the ElevenLabs TTS service.
+
+        Args:
+            frame: The end frame.
+        """
         await super().stop(frame)
         await self._disconnect()
 
     async def cancel(self, frame: CancelFrame):
+        """Cancel the ElevenLabs TTS service.
+
+        Args:
+            frame: The cancel frame.
+        """
         await super().cancel(frame)
         await self._disconnect()
 
     async def flush_audio(self):
+        """Flush any pending audio and finalize the current context."""
         if not self._context_id or not self._websocket:
             return
         self.logger.trace(f"{self}: flushing audio")
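The new `_update_settings` path distinguishes two classes of changes: voice, model, and language are baked into the WebSocket URL and force a full reconnect, while voice settings (stability, speed, and so on) only require closing the current context so the next context is created with the updated values. A condensed sketch of that decision, using hypothetical before/after snapshots:

```python
# Hypothetical before/after values illustrating the two change classes.
prev = {"voice": "v1", "model": "eleven_turbo_v2_5", "language": "en",
        "voice_settings": {"stability": 0.5, "speed": 1.0}}
new = {"voice": "v1", "model": "eleven_turbo_v2_5", "language": "en",
       "voice_settings": {"stability": 0.5, "speed": 1.2}}

url_changed = any(prev[k] != new[k] for k in ("voice", "model", "language"))
voice_settings_changed = prev["voice_settings"] != new["voice_settings"]

if url_changed:
    action = "disconnect and reconnect"  # settings live in the WebSocket URL
elif voice_settings_changed:
    action = "close current context"     # next context picks up the new settings
else:
    action = "no-op"
print(action)  # -> close current context
```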
@@ -292,6 +458,12 @@ class ElevenLabsTTSService(AudioContextWordTTSService):
         await self._websocket.send(json.dumps(msg))
 
     async def push_frame(self, frame: Frame, direction: FrameDirection = FrameDirection.DOWNSTREAM):
+        """Push a frame and handle state changes.
+
+        Args:
+            frame: The frame to push.
+            direction: The direction to push the frame.
+        """
         await super().push_frame(frame, direction)
         if isinstance(frame, (TTSStoppedFrame, StartInterruptionFrame)):
             self._started = False
@@ -320,7 +492,7 @@ class ElevenLabsTTSService(AudioContextWordTTSService):
 
     async def _connect_websocket(self):
         try:
-            if self._websocket and self._websocket.open:
+            if self._websocket and self._websocket.state is State.OPEN:
                 return
 
             self.logger.debug("Connecting to ElevenLabs")
@@ -336,6 +508,9 @@ class ElevenLabsTTSService(AudioContextWordTTSService):
             if self._settings["enable_logging"]:
                 url += f"&enable_logging={self._settings['enable_logging']}"
 
+            if self._settings["apply_text_normalization"] is not None:
+                url += f"&apply_text_normalization={self._settings['apply_text_normalization']}"
+
             # Language can only be used with the ELEVENLABS_MULTILINGUAL_MODELS
             language = self._settings["language"]
             if model in ELEVENLABS_MULTILINGUAL_MODELS and language is not None:
@@ -347,8 +522,8 @@ class ElevenLabsTTSService(AudioContextWordTTSService):
                 )
 
             # Set max websocket message size to 16MB for large audio responses
-            self._websocket = await websockets.connect(
-                url, max_size=16 * 1024 * 1024, extra_headers={"xi-api-key": self._api_key}
+            self._websocket = await websocket_connect(
+                url, max_size=16 * 1024 * 1024, additional_headers={"xi-api-key": self._api_key}
             )
 
         except Exception as e:
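The connection-handling hunks track the websockets library's asyncio rewrite: `websockets.asyncio.client.connect` (websockets >= 13) replaces the legacy `websockets.connect`, the removed `.open`/`.closed` boolean properties give way to checking `.state` against `websockets.protocol.State`, and extra HTTP headers are passed as `additional_headers` instead of the legacy `extra_headers`. A minimal sketch of the new pattern (the echo-server URL is only an assumed test endpoint):

```python
import asyncio

from websockets.asyncio.client import connect  # websockets >= 13 asyncio client
from websockets.protocol import State


async def main():
    async with connect(
        "wss://echo.websocket.org",           # assumed public echo endpoint
        max_size=16 * 1024 * 1024,            # allow large messages, as in the diff
        additional_headers={"x-demo": "1"},   # replaces legacy extra_headers
    ) as ws:
        # .state replaces the legacy .open/.closed properties.
        if ws.state is State.OPEN:
            await ws.send("ping")
            print(await ws.recv())


asyncio.run(main())
```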
@@ -366,6 +541,7 @@ class ElevenLabsTTSService(AudioContextWordTTSService):
             if self._context_id:
                 await self._websocket.send(json.dumps({"close_socket": True}))
             await self._websocket.close()
+            logger.debug("Disconnected from ElevenLabs")
         except Exception as e:
             self.logger.error(f"{self} error closing websocket: {e}")
 
@@ -375,6 +551,7 @@ class ElevenLabsTTSService(AudioContextWordTTSService):
             raise Exception("Websocket not connected")
 
     async def _handle_interruption(self, frame: StartInterruptionFrame, direction: FrameDirection):
+        """Handle interruption by closing the current context."""
         await super()._handle_interruption(frame, direction)
 
         # Close the current context when interrupted without closing the websocket
@@ -396,9 +573,8 @@ class ElevenLabsTTSService(AudioContextWordTTSService):
         self._started = False
 
     async def _receive_messages(self):
-        async for message in WatchdogAsyncIterator(
-            self._get_websocket(), manager=self.task_manager
-        ):
+        """Handle incoming WebSocket messages from ElevenLabs."""
+        async for message in self._get_websocket():
             msg = json.loads(message)
 
             received_ctx_id = msg.get("contextId")
@@ -411,10 +587,18 @@ class ElevenLabsTTSService(AudioContextWordTTSService):
                 continue
 
             # Check if this message belongs to the current context.
-            # This should never happen, so warn about it.
             if not self.audio_context_available(received_ctx_id):
-                logger.warning(f"Ignoring message from unavailable context: {received_ctx_id}")
-                continue
+                if self._context_id == received_ctx_id:
+                    logger.debug(
+                        f"Received a delayed message, recreating the context: {self._context_id}"
+                    )
+                    await self.create_audio_context(self._context_id)
+                else:
+                    # This can happen if a message is received _after_ we have closed a context
+                    # due to user interruption but _before_ the `isFinal` message for the context
+                    # is received.
+                    logger.debug(f"Ignoring message from unavailable context: {received_ctx_id}")
+                    continue
 
             if msg.get("audio"):
                 await self.stop_ttfb_metrics()
@@ -423,18 +607,37 @@ class ElevenLabsTTSService(AudioContextWordTTSService):
                 audio = base64.b64decode(msg["audio"])
                 frame = TTSAudioRawFrame(audio, self.sample_rate, 1)
                 await self.append_to_audio_context(received_ctx_id, frame)
+
             if msg.get("alignment"):
-                word_times = calculate_word_times(msg["alignment"], self._cumulative_time)
-                await self.add_word_timestamps(word_times)
-                self._cumulative_time = word_times[-1][1]
+                alignment = msg["alignment"]
+                word_times = calculate_word_times(alignment, self._cumulative_time)
+
+                if word_times:
+                    await self.add_word_timestamps(word_times)
+
+                    # Calculate the actual end time of this audio chunk
+                    char_start_times_ms = alignment.get("charStartTimesMs", [])
+                    char_durations_ms = alignment.get("charDurationsMs", [])
+
+                    if char_start_times_ms and char_durations_ms:
+                        # End time = start time of last character + duration of last character
+                        chunk_end_time_ms = char_start_times_ms[-1] + char_durations_ms[-1]
+                        chunk_end_time_seconds = chunk_end_time_ms / 1000.0
+                        self._cumulative_time += chunk_end_time_seconds
+                    else:
+                        # Fallback: use the last word's start time (current behavior)
+                        self._cumulative_time = word_times[-1][1]
+                        logger.warning(
+                            "_receive_messages: using fallback timing method - consider investigating alignment data structure"
+                        )
 
     async def _keepalive_task_handler(self):
-        KEEPALIVE_SLEEP = 10 if self.task_manager.task_watchdog_enabled else 3
+        """Send periodic keepalive messages to maintain WebSocket connection."""
+        KEEPALIVE_SLEEP = 10
         while True:
-            self.reset_watchdog()
             await asyncio.sleep(KEEPALIVE_SLEEP)
             try:
-                if self._websocket and self._websocket.open:
+                if self._websocket and self._websocket.state is State.OPEN:
                     if self._context_id:
                         # Send keepalive with context ID to keep the connection alive
                         keepalive_message = {
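The end-of-chunk bookkeeping above is plain arithmetic: a chunk ends at the last character's start time plus its duration, and that value is accumulated so word timestamps from later chunks land on one continuous timeline. With made-up alignment values:

```python
# Made-up alignment values for one chunk (milliseconds).
char_start_times_ms = [0, 80, 160, 240]
char_durations_ms = [80, 80, 80, 120]

# End of this chunk = start of the last character + its duration.
chunk_end_ms = char_start_times_ms[-1] + char_durations_ms[-1]  # 240 + 120 = 360
cumulative_time = chunk_end_ms / 1000.0                         # 0.36 s

# A word starting 100 ms into the next chunk is stamped at the global time:
print(cumulative_time + 100 / 1000.0)  # 0.46
```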
@@ -454,16 +657,25 @@ class ElevenLabsTTSService(AudioContextWordTTSService):
                     break
 
     async def _send_text(self, text: str):
+        """Send text to the WebSocket for synthesis."""
        if self._websocket and self._context_id:
            msg = {"text": text, "context_id": self._context_id}
            await self._websocket.send(json.dumps(msg))
 
     @traced_tts
     async def run_tts(self, text: str) -> AsyncGenerator[Frame, None]:
+        """Generate speech from text using ElevenLabs' streaming WebSocket API.
+
+        Args:
+            text: The text to synthesize into speech.
+
+        Yields:
+            Frame: Audio frames containing the synthesized speech.
+        """
         self.logger.debug(f"{self}: Generating TTS [{text}]")
 
         try:
-            if not self._websocket or self._websocket.closed:
+            if not self._websocket or self._websocket.state is State.CLOSED:
                 await self._connect()
                 self.logger.debug("Connected to ElevenLabs")
 
@@ -473,9 +685,16 @@ class ElevenLabsTTSService(AudioContextWordTTSService):
                 yield TTSStartedFrame()
                 self._started = True
                 self._cumulative_time = 0
-                #
-                self._context_id = str(uuid.uuid4())
-                await self.create_audio_context(self._context_id)
+                # If a context ID does not exist, create a new one and
+                # register it. If an ID exists, that means the Pipeline is
+                # configured for allow_interruptions=False, so continue
+                # using the current ID. When interruptions are enabled
+                # (e.g. allow_interruptions=True), user speech results in
+                # an interruption, which resets the context ID.
+                if not self._context_id:
+                    self._context_id = str(uuid.uuid4())
+                if not self.audio_context_available(self._context_id):
+                    await self.create_audio_context(self._context_id)
 
                 # Initialize context with voice settings
                 msg = {"text": " ", "context_id": self._context_id}
@@ -499,19 +718,27 @@ class ElevenLabsTTSService(AudioContextWordTTSService):
 
 
 class ElevenLabsHttpTTSService(WordTTSService):
-    """ElevenLabs
+    """ElevenLabs HTTP-based TTS service with word timestamps.
 
-
-
-
-        aiohttp_session: aiohttp ClientSession
-        model: Model ID (default: "eleven_flash_v2_5" for low latency)
-        base_url: API base URL
-        sample_rate: Output sample rate
-        params: Additional parameters for voice configuration
+    Provides text-to-speech using ElevenLabs' HTTP streaming API for simpler,
+    non-WebSocket integration. Suitable for use cases where streaming WebSocket
+    connection is not required or desired.
     """
 
     class InputParams(BaseModel):
+        """Input parameters for ElevenLabs HTTP TTS configuration.
+
+        Parameters:
+            language: Language to use for synthesis.
+            optimize_streaming_latency: Latency optimization level (0-4).
+            stability: Voice stability control (0.0 to 1.0).
+            similarity_boost: Similarity boost control (0.0 to 1.0).
+            style: Style control for voice expression (0.0 to 1.0).
+            use_speaker_boost: Whether to use speaker boost enhancement.
+            speed: Voice speed control (0.25 to 4.0).
+            apply_text_normalization: Text normalization mode ("auto", "on", "off").
+        """
+
         language: Optional[Language] = None
         optimize_streaming_latency: Optional[int] = None
         stability: Optional[float] = None
@@ -519,6 +746,7 @@ class ElevenLabsHttpTTSService(WordTTSService):
         style: Optional[float] = None
         use_speaker_boost: Optional[bool] = None
         speed: Optional[float] = None
+        apply_text_normalization: Optional[Literal["auto", "on", "off"]] = None
 
     def __init__(
         self,
@@ -526,12 +754,24 @@ class ElevenLabsHttpTTSService(WordTTSService):
         api_key: str,
         voice_id: str,
         aiohttp_session: aiohttp.ClientSession,
-        model: str = "eleven_flash_v2_5",
+        model: str = "eleven_turbo_v2_5",
         base_url: str = "https://api.elevenlabs.io",
         sample_rate: Optional[int] = None,
         params: Optional[InputParams] = None,
         **kwargs,
     ):
+        """Initialize the ElevenLabs HTTP TTS service.
+
+        Args:
+            api_key: ElevenLabs API key for authentication.
+            voice_id: ID of the voice to use for synthesis.
+            aiohttp_session: aiohttp ClientSession for HTTP requests.
+            model: TTS model to use (e.g., "eleven_turbo_v2_5").
+            base_url: Base URL for ElevenLabs HTTP API.
+            sample_rate: Audio sample rate. If None, uses default.
+            params: Additional input parameters for voice customization.
+            **kwargs: Additional arguments passed to the parent service.
+        """
         super().__init__(
             aggregate_sentences=True,
             push_text_frames=False,
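With `apply_text_normalization` now threaded through `InputParams`, the settings dictionary, and the request query parameters, normalization can be requested per service instance. An illustrative construction sketch (credentials are placeholders, and the "on"/"auto"/"off" semantics follow the ElevenLabs API documentation):

```python
import aiohttp

from pipecat.services.elevenlabs.tts import ElevenLabsHttpTTSService


async def make_tts(session: aiohttp.ClientSession) -> ElevenLabsHttpTTSService:
    # "on" asks ElevenLabs to normalize numbers, dates, etc. before synthesis;
    # "auto" leaves the decision to the service, "off" disables it.
    params = ElevenLabsHttpTTSService.InputParams(apply_text_normalization="on")
    return ElevenLabsHttpTTSService(
        api_key="ELEVENLABS_API_KEY",  # placeholder
        voice_id="VOICE_ID",           # placeholder
        aiohttp_session=session,
        params=params,
    )
```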
@@ -557,6 +797,7 @@ class ElevenLabsHttpTTSService(WordTTSService):
             "style": params.style,
             "use_speaker_boost": params.use_speaker_boost,
             "speed": params.speed,
+            "apply_text_normalization": params.apply_text_normalization,
         }
         self.set_model_name(model)
         self.set_voice(voice_id)
@@ -571,11 +812,22 @@ class ElevenLabsHttpTTSService(WordTTSService):
         self._previous_text = ""
 
     def language_to_service_language(self, language: Language) -> Optional[str]:
-        """Convert pipecat Language to ElevenLabs language code."""
+        """Convert pipecat Language to ElevenLabs language code.
+
+        Args:
+            language: The language to convert.
+
+        Returns:
+            The ElevenLabs-specific language code, or None if not supported.
+        """
         return language_to_elevenlabs_language(language)
 
     def can_generate_metrics(self) -> bool:
-        """
+        """Check if this service can generate processing metrics.
+
+        Returns:
+            True, as ElevenLabs HTTP service supports metrics generation.
+        """
         return True
 
     def _set_voice_settings(self):
@@ -589,12 +841,22 @@ class ElevenLabsHttpTTSService(WordTTSService):
         logger.debug(f"{self}: Reset internal state")
 
     async def start(self, frame: StartFrame):
-        """
+        """Start the ElevenLabs HTTP TTS service.
+
+        Args:
+            frame: The start frame containing initialization parameters.
+        """
         await super().start(frame)
         self._output_format = output_format_from_sample_rate(self.sample_rate)
         self._reset_state()
 
     async def push_frame(self, frame: Frame, direction: FrameDirection = FrameDirection.DOWNSTREAM):
+        """Push a frame and handle state changes.
+
+        Args:
+            frame: The frame to push.
+            direction: The direction to push the frame.
+        """
         await super().push_frame(frame, direction)
         if isinstance(frame, (StartInterruptionFrame, TTSStoppedFrame)):
             # Reset timing on interruption or stop
@@ -610,21 +872,23 @@ class ElevenLabsHttpTTSService(WordTTSService):
     def calculate_word_times(self, alignment_info: Mapping[str, Any]) -> List[Tuple[str, float]]:
         """Calculate word timing from character alignment data.
 
-        Example input data:
-            {
-                "characters": [" ", "H", "e", "l", "l", "o", " ", "w", "o", "r", "l", "d"],
-                "character_start_times_seconds": [0.0, 0.1, 0.15, 0.2, 0.25, 0.3, 0.4, 0.5, 0.6, 0.7, 0.8, 0.9],
-                "character_end_times_seconds": [0.1, 0.15, 0.2, 0.25, 0.3, 0.4, 0.5, 0.6, 0.7, 0.8, 0.9, 1.0]
-            }
-
-        Would produce word times (with cumulative_time=0):
-            [("Hello", 0.1), ("world", 0.5)]
-
         Args:
-            alignment_info: Character timing data from ElevenLabs
+            alignment_info: Character timing data from ElevenLabs.
 
         Returns:
-            List of (word, timestamp) pairs
+            List of (word, timestamp) pairs.
+
+        Example input data::
+
+            {
+                "characters": [" ", "H", "e", "l", "l", "o", " ", "w", "o", "r", "l", "d"],
+                "character_start_times_seconds": [0.0, 0.1, 0.15, 0.2, 0.25, 0.3, 0.4, 0.5, 0.6, 0.7, 0.8, 0.9],
+                "character_end_times_seconds": [0.1, 0.15, 0.2, 0.25, 0.3, 0.4, 0.5, 0.6, 0.7, 0.8, 0.9, 1.0]
+            }
+
+        Would produce word times (with cumulative_time=0)::
+
+            [("Hello", 0.1), ("world", 0.5)]
         """
         chars = alignment_info.get("characters", [])
         char_start_times = alignment_info.get("character_start_times_seconds", [])
@@ -675,10 +939,10 @@ class ElevenLabsHttpTTSService(WordTTSService):
         Includes previous text as context for better prosody continuity.
 
         Args:
-            text: Text to convert to speech
+            text: Text to convert to speech.
 
         Yields:
-            Audio and control frames
+            Frame: Audio and control frames containing the synthesized speech.
         """
         self.logger.debug(f"{self}: Generating TTS [{text}]")
 
@@ -717,6 +981,8 @@ class ElevenLabsHttpTTSService(WordTTSService):
         }
         if self._settings["optimize_streaming_latency"] is not None:
             params["optimize_streaming_latency"] = self._settings["optimize_streaming_latency"]
+        if self._settings["apply_text_normalization"] is not None:
+            params["apply_text_normalization"] = self._settings["apply_text_normalization"]
 
         self.logger.debug(f"ElevenLabs request - payload: {payload}, params: {params}")
 
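Taken together, the WebSocket-service changes surface two new constructor-level knobs: sentence aggregation is now an argument rather than hardcoded, and text normalization joins the voice parameters. An illustrative sketch (credentials are placeholders):

```python
from pipecat.services.elevenlabs.tts import ElevenLabsTTSService

tts = ElevenLabsTTSService(
    api_key="ELEVENLABS_API_KEY",  # placeholder
    voice_id="VOICE_ID",           # placeholder
    model="eleven_turbo_v2_5",     # the new default in this release
    params=ElevenLabsTTSService.InputParams(
        speed=1.1,
        apply_text_normalization="auto",
    ),
    aggregate_sentences=True,  # previously hardcoded, now configurable
)
```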