PyPI - intellema-vdk - Versions diffs - 0.2.0__py3-none-any.whl → 0.2.2__py3-none-any.whl - Mend

intellema-vdk 0.2.0__py3-none-any.whl → 0.2.2__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Files changed (37) hide show

intellema_vdk/__init__.py +67 -10
intellema_vdk/config.py +14 -0
intellema_vdk/providers/__init__.py +35 -0
intellema_vdk/providers/livekit/__init__.py +19 -0
intellema_vdk/providers/livekit/client.py +612 -0
intellema_vdk/providers/livekit/exceptions.py +23 -0
intellema_vdk/providers/protocols.py +33 -0
intellema_vdk/providers/retell/__init__.py +17 -0
intellema_vdk/providers/retell/client.py +468 -0
intellema_vdk/providers/retell/exceptions.py +19 -0
intellema_vdk/{retell_lib → providers/retell}/import_phone_number.py +1 -1
intellema_vdk/stt/__init__.py +17 -0
intellema_vdk/stt/client.py +482 -0
intellema_vdk/stt/exceptions.py +19 -0
intellema_vdk/tts/__init__.py +15 -0
intellema_vdk/tts/__pycache__/__init__.cpython-312.pyc +0 -0
intellema_vdk/tts/__pycache__/client.cpython-312.pyc +0 -0
intellema_vdk/tts/__pycache__/exceptions.cpython-312.pyc +0 -0
intellema_vdk/tts/__pycache__/providers.cpython-312.pyc +0 -0
intellema_vdk/tts/client.py +541 -0
intellema_vdk/tts/exceptions.py +15 -0
intellema_vdk/tts/providers.py +293 -0
intellema_vdk/utils/logger_config.py +41 -0
intellema_vdk-0.2.2.dist-info/METADATA +311 -0
intellema_vdk-0.2.2.dist-info/RECORD +29 -0
{intellema_vdk-0.2.0.dist-info → intellema_vdk-0.2.2.dist-info}/WHEEL +1 -1
intellema_vdk/livekit_lib/__init__.py +0 -3
intellema_vdk/livekit_lib/client.py +0 -280
intellema_vdk/retell_lib/retell_client.py +0 -248
intellema_vdk/speech_lib/__init__.py +0 -2
intellema_vdk/speech_lib/stt_client.py +0 -108
intellema_vdk/speech_lib/tts_streamer.py +0 -188
intellema_vdk-0.2.0.dist-info/METADATA +0 -221
intellema_vdk-0.2.0.dist-info/RECORD +0 -14
/intellema_vdk/{retell_lib/__init__.py → stt/providers.py} +0 -0
{intellema_vdk-0.2.0.dist-info → intellema_vdk-0.2.2.dist-info}/licenses/LICENSE +0 -0
{intellema_vdk-0.2.0.dist-info → intellema_vdk-0.2.2.dist-info}/top_level.txt +0 -0

intellema_vdk/tts/client.py ADDED Viewed

@@ -0,0 +1,541 @@
+"""TTS streaming client for real-time text-to-speech audio playback."""
+import queue
+import threading
+import time
+import logging
+import subprocess
+import sys
+import platform
+from typing import Optional, Union, Literal, overload, TYPE_CHECKING
+# Lazy import pyaudio - only load when TTSStreamer is instantiated
+if TYPE_CHECKING:
+    import pyaudio
+else:
+    pyaudio = None
+from .providers import (
+    TTSProvider,
+    TogetherTTSProvider,
+    OpenAITTSProvider,
+    TogetherTTSConfig,
+    OpenAITTSConfig,
+)
+from ..config import (
+    get_env,
+    TTS_AUDIO_SAMPLE_RATE,
+)
+from .exceptions import (
+    TTSConfigurationError,
+    TTSStreamError,
+    TTSAPIError,
+    TTSError
+)
+# Setup logger for this module.
+logger = logging.getLogger(__name__)
+class TTSStreamer:
+    """
+    Real-time text-to-speech streaming and playback.
+    Streams text-to-speech audio from various providers (Together AI, OpenAI)
+    and plays it in real-time with minimal latency. Designed for continuous
+    text streams (e.g., from language models) with immediate audio feedback.
+    The streamer uses separate threads for fetching and playing audio to
+    ensure smooth, non-blocking playback. Text is buffered until sentence
+    boundaries are detected, then immediately converted to speech.
+    Supported Providers:
+        - Together AI: Low-latency streaming with Orpheus model
+        - OpenAI: High-quality voices with tts-1 and tts-1-hd models
+    Attributes:
+        provider: The TTS provider instance for generating audio.
+        p: PyAudio instance for audio I/O.
+        stream: Audio stream for playback (24kHz, 16-bit PCM, mono).
+        text_queue: Thread-safe queue for incoming text sentences.
+        audio_queue: Thread-safe queue for outgoing audio chunks.
+        text_buffer: Buffer accumulating text until sentence completion.
+        is_running: Flag controlling thread execution state.
+        threads_started: Flag indicating if worker threads are active.
+    Examples:
+        Basic usage with Together AI:
+            >>> streamer = TTSStreamer(provider="together")
+            >>> streamer.feed("Hello, world. ")
+            >>> streamer.feed("How are you?")
+            >>> streamer.flush()
+            >>> streamer.close()
+        Using OpenAI with custom voice:
+            >>> streamer = TTSStreamer(
+            ...     provider="openai",
+            ...     voice="nova",
+            ...     model="tts-1-hd"
+            ... )
+            >>> for token in text.split():
+            ...     streamer.feed(token + " ")
+            >>> streamer.flush()
+            >>> streamer.close()
+        Streaming LLM output:
+            >>> streamer = TTSStreamer(provider="together")
+            >>> for chunk in llm_stream:
+            ...     streamer.feed(chunk)
+            >>> streamer.flush()
+            >>> streamer.close()
+    Notes:
+        - Call flush() to wait for current audio to finish playing
+        - Call close() to stop all threads and release audio resources
+        - Requires system audio output and PyAudio installed
+        - Works best with whole words/phrases but also handles character streaming
+        - Sentence boundaries (. ! ? \n) trigger immediate speech synthesis
+    """
+    @overload
+    def __init__(
+        self,
+        provider: Literal["together"],
+        api_key: Optional[str] = None,
+        model: Literal["canopylabs/orpheus-3b-0.1-ft"] = "canopylabs/orpheus-3b-0.1-ft",
+        voice: Literal["tara"] = "tara",
+    ) -> None: ...
+    @overload
+    def __init__(
+        self,
+        provider: Literal["openai"],
+        api_key: Optional[str] = None,
+        model: Literal["tts-1", "tts-1-hd"] = "tts-1",
+        voice: Literal["alloy", "echo", "fable", "onyx", "nova", "shimmer"] = "alloy",
+    ) -> None: ...
+    @overload
+    def __init__(
+        self,
+        provider: TTSProvider,
+        api_key: Optional[str] = None,
+        **provider_kwargs
+    ) -> None: ...
+    def __init__(self,
+                 provider: Union[Literal["together", "openai"], TTSProvider] = "together",
+                 api_key: Optional[str] = None,
+                 **provider_kwargs) -> None:
+        """
+        Initialize the TTS streamer with a provider.
+        Args:
+            provider: Either a provider name or a custom TTSProvider instance.
+                - "together": Use Together AI (requires TOGETHER_API_KEY)
+                - "openai": Use OpenAI (requires OPENAI_API_KEY)
+                - TTSProvider instance: Use custom provider
+            api_key: API key for the provider. If not provided, reads from
+                environment variables (TOGETHER_API_KEY or OPENAI_API_KEY).
+            **provider_kwargs: Provider-specific configuration options:
+                For Together AI:
+                    model: Model identifier (default: "canopylabs/orpheus-3b-0.1-ft")
+                    voice: Voice identifier (default: "tara")
+                For OpenAI:
+                    model: "tts-1" (fast) or "tts-1-hd" (high quality)
+                    voice: "alloy", "echo", "fable", "onyx", "nova", or "shimmer"
+        Raises:
+            TTSConfigurationError: If API key is missing or provider is invalid.
+            TTSStreamError: If audio stream initialization fails.
+            ImportError: If required dependencies (pyaudio, provider SDK) are missing.
+        Examples:
+            >>> # Together AI with defaults
+            >>> streamer = TTSStreamer(provider="together")
+            >>> # OpenAI with custom voice
+            >>> streamer = TTSStreamer(
+            ...     provider="openai",
+            ...     voice="nova",
+            ...     model="tts-1-hd"
+            ... )
+            >>> # Custom provider
+            >>> custom = MyCustomProvider()
+            >>> streamer = TTSStreamer(provider=custom)
+        """
+        # Lazy import pyaudio - only install when actually used
+        global pyaudio
+        if pyaudio is None:
+            try:
+                import pyaudio as _pyaudio
+                pyaudio = _pyaudio
+            except ImportError:
+                print("\n" + "="*70)
+                print("PyAudio is not installed.")
+                print("="*70)
+                print("\nPyAudio requires the PortAudio library to be installed on your system.")
+                print("\nInstallation instructions by platform:")
+                print("\n  Windows:")
+                print("    Option 1: pip install pipwin && pipwin install pyaudio")
+                print("    Option 2: pip install pyaudio")
+                print("    Option 3: Download wheel from https://www.lfd.uci.edu/~gohlke/pythonlibs/#pyaudio")
+                print("\n  macOS:")
+                print("    brew install portaudio")
+                print("    pip install pyaudio")
+                print("\n  Linux (Debian/Ubuntu):")
+                print("    sudo apt-get install portaudio19-dev")
+                print("    pip install pyaudio")
+                print("\n  Linux (Fedora):")
+                print("    sudo dnf install portaudio-devel")
+                print("    pip install pyaudio")
+                print("\n" + "="*70)
+                # Attempt automatic installation
+                current_os = platform.system()
+                print(f"\nDetected OS: {current_os}")
+                print("Attempting automatic installation...")
+                try:
+                    if current_os == "Windows":
+                        # Try pipwin first, fall back to pip
+                        try:
+                            subprocess.check_call([sys.executable, "-m", "pip", "install", "pipwin"],
+                                                stdout=subprocess.DEVNULL, stderr=subprocess.DEVNULL)
+                            subprocess.check_call([sys.executable, "-m", "pipwin", "install", "pyaudio"])
+                            print("✓ PyAudio installed successfully via pipwin!")
+                        except:
+                            # Fall back to pip
+                            subprocess.check_call([sys.executable, "-m", "pip", "install", "pyaudio>=0.2.13"])
+                            print("✓ PyAudio installed successfully via pip!")
+                    else:
+                        # For macOS/Linux, just try pip (user needs to install PortAudio first)
+                        subprocess.check_call([sys.executable, "-m", "pip", "install", "pyaudio>=0.2.13"])
+                        print("✓ PyAudio installed successfully!")
+                    import pyaudio as _pyaudio
+                    pyaudio = _pyaudio
+                except Exception as e:
+                    error_msg = (
+                        "\nFailed to install PyAudio automatically. Please install manually:\n"
+                        "  pip install intellema-vdk[audio]\n"
+                        "or follow the platform-specific instructions above.\n"
+                    )
+                    if current_os != "Windows":
+                        error_msg += "\nMake sure PortAudio is installed on your system first!\n"
+                    raise TTSConfigurationError(error_msg) from e
+        # Initialize the provider
+        if isinstance(provider, str):
+            if provider == "together":
+                api_key = api_key or get_env("TOGETHER_API_KEY")
+                if not api_key:
+                    raise TTSConfigurationError(
+                        "Together API Key is missing. Set TOGETHER_API_KEY env var or pass api_key."
+                    )
+                self.provider = TogetherTTSProvider(api_key=api_key, **provider_kwargs)
+            elif provider == "openai":
+                api_key = api_key or get_env("OPENAI_API_KEY")
+                if not api_key:
+                    raise TTSConfigurationError(
+                        "OpenAI API Key is missing. Set OPENAI_API_KEY env var or pass api_key."
+                    )
+                self.provider = OpenAITTSProvider(api_key=api_key, **provider_kwargs)
+            else:
+                raise TTSConfigurationError(
+                    f"Unknown provider: {provider}. Supported providers: 'together', 'openai'."
+                )
+        else:
+            # Custom provider instance
+            self.provider = provider
+        # Audio configuration.
+        try:
+            self.p = pyaudio.PyAudio()
+            self.stream = self.p.open(
+                format=pyaudio.paInt16, channels=1, rate=TTS_AUDIO_SAMPLE_RATE, output=True
+            )
+        except Exception as e:
+            raise TTSStreamError(
+                f"Failed to initialize audio stream: {e}") from e
+        # Queues for inter-thread communication.
+        self.text_queue = queue.Queue()
+        self.audio_queue = queue.Queue()
+        # State management.
+        self.text_buffer = ""
+        self.is_running = True
+        self.threads_started = False
+        # Thread placeholders.
+        self.fetcher_thread = None
+        self.player_thread = None
+    def _ensure_started(self) -> None:
+        """Initialize and start worker threads for audio fetching and playback.
+        Creates and starts two daemon threads:
+        - fetcher_thread: Converts text to audio using the TTS provider
+        - player_thread: Plays audio chunks through the audio device
+        This method is called lazily on the first call to feed().
+        """
+        if self.threads_started:
+            return
+        # Start the threads for fetching and playing audio.
+        self.fetcher_thread = threading.Thread(
+            target=self._tts_fetcher, daemon=True)
+        self.player_thread = threading.Thread(
+            target=self._audio_player, daemon=True)
+        self.fetcher_thread.start()
+        self.player_thread.start()
+        self.threads_started = True
+    def feed(self, text_chunk: str) -> None:
+        """
+        Feed a chunk of text to the streamer for conversion to speech.
+        Text is buffered until a sentence-ending punctuation mark is detected
+        (. ! ? or newline), then the complete sentence is queued for TTS processing.
+        Audio playback begins as soon as the first audio chunks are received.
+        This method is thread-safe and can be called repeatedly to stream text.
+        Args:
+            text_chunk: A piece of text to convert to speech. Can be a single
+                character, word, or multiple sentences. Empty strings are ignored.
+        Examples:
+            >>> streamer.feed("Hello, world. ")  # Processes "Hello, world."
+            >>> streamer.feed("How are you?")   # Processes "How are you?"
+            >>>
+            >>> # Streaming word by word
+            >>> for word in sentence.split():
+            ...     streamer.feed(word + " ")
+            >>>
+            >>> # Streaming character by character
+            >>> for char in text:
+            ...     streamer.feed(char)
+        Note:
+            The streamer will only speak complete sentences. Any partial text
+            at the end (not ending with . ! ? or \n) remains in the buffer until
+            more text is fed or flush() is called.
+        """
+        if not self.is_running or not text_chunk:
+            return
+        self._ensure_started()
+        self.text_buffer += text_chunk
+        sentence_endings = [".", "!", "?", "\n"]
+        # Split text into sentences and queue them.
+        for ending in sentence_endings:
+            if ending in self.text_buffer:
+                parts = self.text_buffer.split(ending)
+                for sentence in parts[:-1]:
+                    if sentence.strip():
+                        full_sentence = sentence.strip() + ending
+                        self.text_queue.put(full_sentence)
+                self.text_buffer = parts[-1]  # Keep the remainder.
+    def flush(self) -> None:
+        """
+        Process any remaining buffered text and wait for all audio to finish playing.
+        This method:
+        1. Converts any remaining text in the buffer to speech
+        2. Waits for the text queue to be fully processed
+        3. Waits for the audio queue to be fully played
+        4. Adds a small delay for the hardware audio buffer to drain
+        This method is non-destructive - the streamer can be reused after flushing.
+        Call this when you've finished feeding text and want to ensure all audio
+        has been spoken before continuing.
+        Examples:
+            >>> streamer.feed("Hello world")
+            >>> streamer.flush()  # Wait for "Hello world" to finish playing
+            >>> streamer.feed("More text")  # Can continue using the streamer
+            >>> streamer.flush()
+            >>> streamer.close()  # Final cleanup
+        Note:
+            If no text has been fed yet (threads not started), this method returns
+            immediately without doing anything.
+        """
+        if not self.threads_started:
+            return  # Nothing to flush if never started.
+        # Push any remaining text from the buffer.
+        if self.text_buffer.strip():
+            self.text_queue.put(self.text_buffer.strip())
+        self.text_buffer = ""
+        # Wait for both queues to be empty.
+        self.text_queue.join()
+        self.audio_queue.join()
+        # A small delay to allow the hardware audio buffer to drain.
+        time.sleep(0.5)
+    def close(self) -> None:
+        """
+        Stop all threads and immediately close the audio stream.
+        This method performs complete cleanup:
+        1. Sets the running flag to False to signal threads to stop
+        2. Sends poison pills to both queues to unblock waiting threads
+        3. Clears both queues of any pending items
+        4. Closes the PyAudio stream and terminates PyAudio
+        After calling close(), the streamer cannot be reused. Create a new
+        instance if you need to stream more audio.
+        It's safe to call this method multiple times - subsequent calls will
+        have no effect.
+        Examples:
+            >>> streamer = TTSStreamer(provider="together")
+            >>> streamer.feed("Hello world.")
+            >>> streamer.flush()  # Wait for audio to finish
+            >>> streamer.close()  # Clean up resources
+            >>>
+            >>> # Using with context manager pattern
+            >>> streamer = TTSStreamer(provider="openai")
+            >>> try:
+            ...     streamer.feed("Some text")
+            ...     streamer.flush()
+            >>> finally:
+            ...     streamer.close()
+        Note:
+            Always call close() when finished with the streamer to free system
+            resources and prevent audio device locks.
+        """
+        if not self.is_running:
+            return
+        self.is_running = False
+        # Send 'poison pill' to threads to signal them to stop.
+        self.text_queue.put(None)
+        self.audio_queue.put(None)
+        # Clear queues to unblock threads that might be waiting.
+        with self.text_queue.mutex:
+            self.text_queue.queue.clear()
+        with self.audio_queue.mutex:
+            self.audio_queue.queue.clear()
+        # Close the audio stream.
+        try:
+            self.stream.stop_stream()
+            self.stream.close()
+            self.p.terminate()
+        except Exception as e:
+            logger.warning(f"Error during audio stream closure: {e}")
+    def _tts_fetcher(self) -> None:
+        """
+        Worker thread that fetches TTS audio from the provider API.
+        Continuously pulls text from the text_queue, sends it to the TTS provider's
+        stream() method, and pushes received audio chunks to the audio_queue.
+        Implements retry logic with up to 2 retries for failed API calls.
+        If all retries fail, the text chunk is dropped and an error is logged.
+        The thread runs until:
+        - is_running flag is set to False, or
+        - A poison pill (None) is received from the text queue
+        Note:
+            This is a daemon thread started automatically by _ensure_started().
+        """
+        while self.is_running:
+            try:
+                text = self.text_queue.get(timeout=0.5)
+            except queue.Empty:
+                continue
+            if text is None:  # Poison pill received.
+                break
+            max_retries = 2
+            for attempt in range(max_retries + 1):
+                try:
+                    # Use the provider's stream method
+                    for audio_data in self.provider.stream(text):
+                        if not self.is_running:
+                            break
+                        if audio_data:
+                            self.audio_queue.put(audio_data)
+                    break  # Success, exit retry loop.
+                except Exception as e:
+                    logger.error(
+                        f"TTS Error (Attempt {attempt + 1}/{max_retries + 1}): {e}")
+                    if attempt >= max_retries:
+                        logger.error(
+                            f"Max retries reached. Dropping text chunk: {text[:50]}...")
+            self.text_queue.task_done()
+    def _audio_player(self) -> None:
+        """
+        Worker thread that plays audio chunks from the audio queue.
+        Continuously pulls audio data from the audio_queue and writes it to the
+        PyAudio stream for playback. Maintains a small buffer to handle frame
+        alignment (audio must be written in complete 16-bit samples).
+        The thread runs until:
+        - is_running flag is set to False, or
+        - A poison pill (None) is received from the audio queue
+        Error Handling:
+            If an OSError occurs while writing to the audio stream (e.g., device
+            disconnected), the error is logged and the thread exits gracefully.
+        Note:
+            This is a daemon thread started automatically by _ensure_started().
+        """
+        buffer = b""
+        while self.is_running:
+            try:
+                audio_data = self.audio_queue.get(timeout=0.5)
+            except queue.Empty:
+                continue
+            if audio_data is None:  # Poison pill received.
+                break
+            buffer += audio_data
+            # Play audio in chunks aligned with frame boundaries.
+            if len(buffer) >= 2:
+                frame_count = len(buffer) // 2
+                bytes_to_play = frame_count * 2
+                play_chunk = buffer[:bytes_to_play]
+                buffer = buffer[bytes_to_play:]
+                try:
+                    self.stream.write(play_chunk)
+                except OSError as e:
+                    logger.error(f"Error writing to audio stream: {e}")
+                    self.audio_queue.task_done()
+                    break
+            self.audio_queue.task_done()

intellema_vdk/tts/exceptions.py ADDED Viewed

@@ -0,0 +1,15 @@
+class TTSError(Exception):
+    """Base exception for all TTS-related errors."""
+    pass
+class TTSConfigurationError(TTSError):
+    """Raised when configuration (API keys) is missing or invalid."""
+    pass
+class TTSStreamError(TTSError):
+    """Raised when there are issues with the audio stream."""
+    pass
+class TTSAPIError(TTSError):
+    """Raised when the TTS API (Together) returns an error."""
+    pass

intellema-vdk 0.2.0__py3-none-any.whl → 0.2.2__py3-none-any.whl

intellema-vdk 0.2.0__py3-none-any.whl → 0.2.2__py3-none-any.whl