smallestai 3.1.0__py3-none-any.whl → 4.0.1__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Potentially problematic release.


This version of smallestai might be problematic. Click here for more details.

Files changed (135):
  1. smallestai/__init__.py +35 -45
  2. smallestai/atoms/__init__.py +249 -123
  3. smallestai/atoms/api/__init__.py +0 -1
  4. smallestai/atoms/api/agent_templates_api.py +26 -26
  5. smallestai/atoms/api/agents_api.py +1316 -190
  6. smallestai/atoms/api/calls_api.py +29 -29
  7. smallestai/atoms/api/campaigns_api.py +165 -165
  8. smallestai/atoms/api/knowledge_base_api.py +290 -290
  9. smallestai/atoms/api/logs_api.py +13 -13
  10. smallestai/atoms/api/organization_api.py +13 -13
  11. smallestai/atoms/api/user_api.py +13 -13
  12. smallestai/atoms/atoms_client.py +77 -49
  13. smallestai/atoms/models/__init__.py +103 -43
  14. smallestai/atoms/models/agent_agent_id_webhook_subscriptions_delete200_response.py +89 -0
  15. smallestai/atoms/models/{get_agent_templates200_response.py → agent_agent_id_webhook_subscriptions_get200_response.py} +7 -7
  16. smallestai/atoms/models/agent_agent_id_webhook_subscriptions_get404_response.py +89 -0
  17. smallestai/atoms/models/agent_agent_id_webhook_subscriptions_post201_response.py +89 -0
  18. smallestai/atoms/models/agent_agent_id_webhook_subscriptions_post400_response.py +89 -0
  19. smallestai/atoms/models/agent_agent_id_webhook_subscriptions_post_request.py +97 -0
  20. smallestai/atoms/models/agent_dto.py +8 -6
  21. smallestai/atoms/models/agent_dto_language.py +17 -3
  22. smallestai/atoms/models/agent_dto_language_switching.py +95 -0
  23. smallestai/atoms/models/agent_dto_synthesizer.py +1 -1
  24. smallestai/atoms/models/{create_agent_from_template200_response.py → agent_from_template_post200_response.py} +4 -4
  25. smallestai/atoms/models/{get_agents200_response.py → agent_get200_response.py} +7 -7
  26. smallestai/atoms/models/{get_agents200_response_data.py → agent_get200_response_data.py} +9 -13
  27. smallestai/atoms/models/{delete_agent200_response.py → agent_id_delete200_response.py} +4 -4
  28. smallestai/atoms/models/{get_agent_by_id200_response.py → agent_id_get200_response.py} +4 -4
  29. smallestai/atoms/models/{update_agent200_response.py → agent_id_patch200_response.py} +4 -4
  30. smallestai/atoms/models/{update_agent_request.py → agent_id_patch_request.py} +17 -15
  31. smallestai/atoms/models/{update_agent_request_language.py → agent_id_patch_request_language.py} +14 -10
  32. smallestai/atoms/models/agent_id_patch_request_language_switching.py +96 -0
  33. smallestai/atoms/models/{update_agent_request_synthesizer.py → agent_id_patch_request_synthesizer.py} +6 -6
  34. smallestai/atoms/models/{update_agent_request_synthesizer_voice_config.py → agent_id_patch_request_synthesizer_voice_config.py} +27 -27
  35. smallestai/atoms/models/{update_agent_request_synthesizer_voice_config_one_of.py → agent_id_patch_request_synthesizer_voice_config_one_of.py} +4 -4
  36. smallestai/atoms/models/{update_agent_request_synthesizer_voice_config_one_of1.py → agent_id_patch_request_synthesizer_voice_config_one_of1.py} +4 -4
  37. smallestai/atoms/models/{get_campaign_by_id200_response.py → agent_id_workflow_get200_response.py} +7 -7
  38. smallestai/atoms/models/agent_id_workflow_get200_response_data.py +105 -0
  39. smallestai/atoms/models/agent_id_workflow_get200_response_data_edges_inner.py +127 -0
  40. smallestai/atoms/models/agent_id_workflow_get200_response_data_edges_inner_data.py +91 -0
  41. smallestai/atoms/models/agent_id_workflow_get200_response_data_edges_inner_marker_end.py +91 -0
  42. smallestai/atoms/models/agent_id_workflow_get200_response_data_nodes_inner.py +114 -0
  43. smallestai/atoms/models/agent_id_workflow_get200_response_data_nodes_inner_data.py +115 -0
  44. smallestai/atoms/models/agent_id_workflow_get200_response_data_nodes_inner_data_variables.py +97 -0
  45. smallestai/atoms/models/agent_id_workflow_get200_response_data_nodes_inner_data_variables_data_inner.py +91 -0
  46. smallestai/atoms/models/agent_id_workflow_get200_response_data_nodes_inner_position.py +89 -0
  47. smallestai/atoms/models/agent_id_workflow_get404_response.py +89 -0
  48. smallestai/atoms/models/agent_template_get200_response.py +97 -0
  49. smallestai/atoms/models/{get_agent_templates200_response_data_inner.py → agent_template_get200_response_data_inner.py} +6 -6
  50. smallestai/atoms/models/{get_campaigns200_response.py → audience_get200_response.py} +7 -7
  51. smallestai/atoms/models/{create_campaign201_response_data.py → audience_get200_response_data_inner.py} +16 -18
  52. smallestai/atoms/models/audience_id_delete200_response.py +89 -0
  53. smallestai/atoms/models/audience_id_delete400_response.py +89 -0
  54. smallestai/atoms/models/{get_current_user200_response.py → audience_id_get200_response.py} +7 -7
  55. smallestai/atoms/models/audience_id_get400_response.py +89 -0
  56. smallestai/atoms/models/audience_id_get403_response.py +89 -0
  57. smallestai/atoms/models/audience_id_get404_response.py +89 -0
  58. smallestai/atoms/models/audience_id_members_delete200_response.py +93 -0
  59. smallestai/atoms/models/audience_id_members_delete200_response_data.py +87 -0
  60. smallestai/atoms/models/audience_id_members_delete_request.py +87 -0
  61. smallestai/atoms/models/audience_id_members_get200_response.py +93 -0
  62. smallestai/atoms/models/audience_id_members_get200_response_data.py +101 -0
  63. smallestai/atoms/models/{get_campaigns200_response_data_inner_audience.py → audience_id_members_get200_response_data_members_inner.py} +8 -8
  64. smallestai/atoms/models/audience_id_members_get400_response.py +89 -0
  65. smallestai/atoms/models/audience_id_members_get500_response.py +89 -0
  66. smallestai/atoms/models/audience_id_members_post200_response.py +97 -0
  67. smallestai/atoms/models/audience_id_members_post200_response_data_inner.py +93 -0
  68. smallestai/atoms/models/audience_id_members_post200_response_data_inner_data.py +89 -0
  69. smallestai/atoms/models/audience_id_members_post400_response.py +89 -0
  70. smallestai/atoms/models/audience_id_members_post403_response.py +89 -0
  71. smallestai/atoms/models/audience_id_members_post_request.py +87 -0
  72. smallestai/atoms/models/audience_id_members_search_get200_response.py +93 -0
  73. smallestai/atoms/models/audience_id_members_search_get200_response_data.py +101 -0
  74. smallestai/atoms/models/audience_id_members_search_get200_response_data_search_info.py +103 -0
  75. smallestai/atoms/models/audience_id_members_search_get400_response.py +89 -0
  76. smallestai/atoms/models/audience_id_members_search_get500_response.py +89 -0
  77. smallestai/atoms/models/{create_campaign201_response.py → audience_post200_response.py} +7 -7
  78. smallestai/atoms/models/audience_post200_response_data.py +104 -0
  79. smallestai/atoms/models/audience_post400_response.py +89 -0
  80. smallestai/atoms/models/campaign_get200_response.py +93 -0
  81. smallestai/atoms/models/campaign_get200_response_data.py +87 -0
  82. smallestai/atoms/models/{get_campaigns_request.py → campaign_get_request.py} +4 -4
  83. smallestai/atoms/models/campaign_id_get200_response.py +93 -0
  84. smallestai/atoms/models/{get_campaign_by_id200_response_data.py → campaign_id_get200_response_data.py} +4 -4
  85. smallestai/atoms/models/campaign_post201_response.py +89 -0
  86. smallestai/atoms/models/{create_campaign_request.py → campaign_post_request.py} +4 -4
  87. smallestai/atoms/models/{start_outbound_call200_response.py → conversation_id_get200_response.py} +7 -7
  88. smallestai/atoms/models/{get_conversation_logs200_response_data.py → conversation_id_get200_response_data.py} +4 -4
  89. smallestai/atoms/models/conversation_outbound_post200_response.py +93 -0
  90. smallestai/atoms/models/{start_outbound_call200_response_data.py → conversation_outbound_post200_response_data.py} +4 -4
  91. smallestai/atoms/models/{start_outbound_call_request.py → conversation_outbound_post_request.py} +4 -4
  92. smallestai/atoms/models/create_agent_request.py +10 -6
  93. smallestai/atoms/models/create_agent_request_language.py +11 -7
  94. smallestai/atoms/models/create_agent_request_language_synthesizer_voice_config.py +24 -24
  95. smallestai/atoms/models/{knowledge_base_dto.py → knowledge_base.py} +15 -8
  96. smallestai/atoms/models/{knowledge_base_item_dto.py → knowledge_base_item.py} +19 -17
  97. smallestai/atoms/models/{get_knowledge_bases200_response.py → knowledgebase_get200_response.py} +7 -7
  98. smallestai/atoms/models/{get_knowledge_base_by_id200_response.py → knowledgebase_id_get200_response.py} +7 -7
  99. smallestai/atoms/models/{get_knowledge_base_items200_response.py → knowledgebase_id_items_get200_response.py} +7 -7
  100. smallestai/atoms/models/{upload_text_to_knowledge_base_request.py → knowledgebase_id_items_upload_text_post_request.py} +4 -4
  101. smallestai/atoms/models/{create_knowledge_base201_response.py → knowledgebase_post201_response.py} +4 -4
  102. smallestai/atoms/models/{create_knowledge_base_request.py → knowledgebase_post_request.py} +4 -4
  103. smallestai/atoms/models/{get_organization200_response.py → organization_get200_response.py} +7 -7
  104. smallestai/atoms/models/{get_organization200_response_data.py → organization_get200_response_data.py} +10 -10
  105. smallestai/atoms/models/{get_organization200_response_data_members_inner.py → organization_get200_response_data_members_inner.py} +4 -4
  106. smallestai/atoms/models/{get_organization200_response_data_subscription.py → organization_get200_response_data_subscription.py} +4 -4
  107. smallestai/atoms/models/product_phone_numbers_get200_response.py +97 -0
  108. smallestai/atoms/models/product_phone_numbers_get200_response_data_inner.py +100 -0
  109. smallestai/atoms/models/product_phone_numbers_get200_response_data_inner_attributes.py +89 -0
  110. smallestai/atoms/models/user_get200_response.py +93 -0
  111. smallestai/atoms/models/{get_current_user200_response_data.py → user_get200_response_data.py} +4 -4
  112. smallestai/atoms/models/webhook.py +124 -0
  113. smallestai/atoms/models/{get_campaigns200_response_data_inner_agent.py → webhook_agent.py} +8 -6
  114. smallestai/atoms/models/webhook_event.py +98 -0
  115. smallestai/atoms/models/webhook_get200_response.py +93 -0
  116. smallestai/atoms/models/webhook_get200_response_data.py +140 -0
  117. smallestai/atoms/models/webhook_id_delete404_response.py +89 -0
  118. smallestai/atoms/models/webhook_post201_response.py +89 -0
  119. smallestai/atoms/models/webhook_post_request.py +99 -0
  120. smallestai/atoms/models/webhook_post_request_events_inner.py +99 -0
  121. smallestai/atoms/models/webhook_subscription.py +108 -0
  122. smallestai/atoms/models/webhook_subscription_populated.py +112 -0
  123. smallestai/waves/__init__.py +2 -2
  124. smallestai/waves/async_waves_client.py +42 -69
  125. smallestai/waves/stream_tts.py +189 -254
  126. smallestai/waves/utils.py +3 -49
  127. smallestai/waves/waves_client.py +41 -69
  128. {smallestai-3.1.0.dist-info → smallestai-4.0.1.dist-info}/METADATA +3 -2
  129. smallestai-4.0.1.dist-info/RECORD +147 -0
  130. {smallestai-3.1.0.dist-info → smallestai-4.0.1.dist-info}/WHEEL +1 -1
  131. smallestai/atoms/models/get_campaigns200_response_data_inner.py +0 -118
  132. smallestai/atoms/models/get_conversation_logs200_response.py +0 -93
  133. smallestai-3.1.0.dist-info/RECORD +0 -87
  134. {smallestai-3.1.0.dist-info → smallestai-4.0.1.dist-info}/licenses/LICENSE +0 -0
  135. {smallestai-3.1.0.dist-info → smallestai-4.0.1.dist-info}/top_level.txt +0 -0
@@ -1,272 +1,207 @@
1
- import asyncio
1
+ import json
2
+ import base64
2
3
  import time
3
- from threading import Thread
4
- from queue import Queue, Empty
5
- from typing import AsyncGenerator, Optional, Union, List, Dict, Any
6
-
7
- from smallestai.waves.waves_client import WavesClient
8
- from smallestai.waves.exceptions import APIError
9
- from smallestai.waves.async_waves_client import AsyncWavesClient
10
- from smallestai.waves.utils import SENTENCE_END_REGEX
11
-
12
- class TextToAudioStream:
13
- def __init__(
14
- self,
15
- tts_instance: Union[WavesClient, AsyncWavesClient],
16
- queue_timeout: Optional[float] = 5.0,
17
- max_retries: Optional[int] = 3,
18
- verbose: bool = False
19
- ):
20
- """
21
- A real-time text-to-speech processor that converts streaming text into audio output.
22
- Useful for applications requiring immediate audio feedback from text generation,
23
- such as voice assistants, live captioning, or interactive chatbots.
24
-
25
- ⚠️ `add_wav_header` is disabled by default for streaming efficiency. Refer to the README for more information.
26
-
27
- Features:
28
- - Streams audio chunks as soon as text is available.
29
- - Handles both sync and async text-to-speech engines.
30
- - Automatically retries failed synthesis attempts.
31
- - Low latency between text generation and speech output.
32
-
33
- Args:
34
- tts_instance: The text-to-speech engine to use (Smallest or AsyncSmallest)
35
- queue_timeout: How long to wait for new text (seconds, default: 1.0)
36
- max_retries: Number of retry attempts for failed synthesis (default: 3)
37
- verbose: Whether to log detailed metrics about TTS requests (default: False)
38
- """
39
- self.tts_instance = tts_instance
40
- self.tts_instance.opts.add_wav_header = False
41
- self.sentence_end_regex = SENTENCE_END_REGEX
42
- self.queue_timeout = queue_timeout
43
- self.max_retries = max_retries
44
- self.queue = Queue()
45
- self.buffer_size = 250
46
- self.stop_flag = False
47
- self.verbose = verbose
4
+ import threading
5
+ import queue
6
+ from typing import Generator
7
+ from dataclasses import dataclass
8
+ from websocket import WebSocketApp
9
+
10
@dataclass
class TTSConfig:
    """Configuration for a Waves streaming TTS session.

    Attributes:
        voice_id: Identifier of the voice to synthesize with.
        api_key: Bearer token used to authenticate the WebSocket connection.
        language: Language code sent with each payload (default "en").
        sample_rate: Output audio sample rate in Hz.
        speed: Speech speed multiplier.
        consistency: Voice consistency control sent to the server.
        enhancement: Audio enhancement level sent to the server.
        similarity: Voice similarity control sent to the server.
        max_buffer_flush_ms: Server-side buffer flush interval in ms
            (0 leaves flushing to explicit flush messages).
    """
    voice_id: str
    api_key: str
    language: str = "en"
    sample_rate: int = 24000
    speed: float = 1.0
    consistency: float = 0.5
    enhancement: int = 1
    # Was the int literal `0`; use a float literal to match the annotation so
    # the value serialized into payloads has a consistent type.
    similarity: float = 0.0
    max_buffer_flush_ms: int = 0
21
+
22
class WavesStreamingTTS:
    """Synchronous client for the Waves lightning-v2 streaming TTS WebSocket."""

    def __init__(self, config: TTSConfig):
        """Store the session configuration and initialize connection state."""
        self.config = config
        self.ws_url = "wss://waves-api.smallest.ai/api/v1/lightning-v2/get_speech/stream"
        self.ws = None
        self.request_id = None
        self.is_connected = False
        self.is_complete = False
        # Socket callbacks hand audio chunks (bytes) and errors to the
        # consuming generator through these thread-safe queues.
        self.audio_queue = queue.Queue()
        self.error_queue = queue.Queue()
48
32
 
49
- # Metrics tracking
50
- self.request_count = 0
51
- self.request_logs: List[Dict[str, Any]] = []
52
- self.start_time = 0
53
- self.first_api_response_time = None
54
- self.end_time = 0
55
-
56
- if self.tts_instance.opts.model == 'lightning-large':
57
- self.buffer_size = 140
58
-
59
-
60
- async def _stream_llm_output(self, llm_output: AsyncGenerator[str, None]) -> None:
61
- """
62
- Streams the LLM output, splitting it into chunks based on sentence boundaries
63
- or space characters if no sentence boundary is found before reaching buffer_size.
64
-
65
- Parameters:
66
- - llm_output (AsyncGenerator[str, None]): An async generator yielding LLM output.
67
- """
68
- buffer = ""
69
-
70
- async for chunk in llm_output:
71
- buffer += chunk
72
-
73
- while len(buffer) > self.buffer_size:
74
- chunk_text = buffer[:self.buffer_size]
75
- last_break_index = -1
76
-
77
- # Find last sentence boundary using regex
78
- for i in range(len(chunk_text) - 1, -1, -1):
79
- if self.sentence_end_regex.match(chunk_text[:i + 1]):
80
- last_break_index = i
81
- break
82
-
83
- if last_break_index == -1:
84
- # Fallback to space if no sentence boundary found
85
- last_space = chunk_text.rfind(' ')
86
- if last_space != -1:
87
- last_break_index = last_space
88
- else:
89
- last_break_index = self.buffer_size - 1
90
-
91
- # Add chunk to queue and update buffer
92
- self.queue.put(f'{buffer[:last_break_index + 1].replace("—", " ").strip()} ')
93
- buffer = buffer[last_break_index + 1:].strip()
94
-
95
- # Don't forget the remaining text
96
- if buffer:
97
- self.queue.put(f'{buffer.replace("—", " ").strip()} ')
98
-
99
- self.stop_flag = True
100
-
101
-
102
- def _synthesize_sync(self, sentence: str, retries: int = 0) -> Optional[bytes]:
103
- """Synchronously synthesizes a given sentence."""
104
- request_start_time = time.time()
105
- request_id = self.request_count + 1
33
class WavesStreamingTTS:
    def _get_headers(self):
        """Build the WebSocket handshake header list (bearer-token auth)."""
        token = self.config.api_key
        return ["Authorization: Bearer " + token]
35
+
36
class WavesStreamingTTS:
    def _create_payload(self, text: str, continue_stream: bool = False, flush: bool = False):
        """Assemble one JSON-serializable message for the speech stream.

        Args:
            text: Text to synthesize (may be empty for a pure flush message).
            continue_stream: Sent as the "continue" flag — whether more text
                will follow on this stream.
            flush: Sent as the "flush" flag — ask the server to flush its
                text buffer now.

        Returns:
            A dict mirroring the session config plus the per-message fields.
        """
        cfg = self.config
        payload = {
            "voice_id": cfg.voice_id,
            "text": text,
            "language": cfg.language,
            "sample_rate": cfg.sample_rate,
            "speed": cfg.speed,
            "consistency": cfg.consistency,
            "similarity": cfg.similarity,
            "enhancement": cfg.enhancement,
            "max_buffer_flush_ms": cfg.max_buffer_flush_ms,
        }
        payload["continue"] = continue_stream
        payload["flush"] = flush
        return payload
50
+
51
class WavesStreamingTTS:
    def _on_open(self, ws):
        """websocket-client callback: mark the connection as established."""
        self.is_connected = True
106
53
 
54
class WavesStreamingTTS:
    def _on_message(self, ws, message):
        """websocket-client callback: route one server frame.

        Decodes the JSON frame; forwards server-reported errors to
        ``error_queue``, records the first ``request_id`` seen, pushes
        base64-decoded audio bytes onto ``audio_queue``, and enqueues a
        ``None`` sentinel once the server reports completion. Any decoding
        failure is surfaced through ``error_queue`` rather than raised.
        """
        try:
            frame = json.loads(message)
            status = frame.get("status", "")

            if status == "error":
                self.error_queue.put(Exception(frame.get("message", "Unknown error")))
                return

            if not self.request_id:
                self.request_id = frame.get("request_id")

            encoded = frame.get("data", {}).get("audio")
            if encoded:
                self.audio_queue.put(base64.b64decode(encoded))

            if status == "complete":
                self.is_complete = True
                self.audio_queue.put(None)  # sentinel: end of audio

        except Exception as exc:
            self.error_queue.put(exc)
76
+
77
class WavesStreamingTTS:
    def _on_error(self, ws, error):
        """websocket-client callback: surface transport errors to the consumer."""
        self.error_queue.put(error)
79
+
80
class WavesStreamingTTS:
    def _on_close(self, ws, *args):
        """websocket-client callback: unblock the consumer on disconnect.

        If the server never sent a "complete" status, push the ``None``
        sentinel so a generator draining ``audio_queue`` terminates instead
        of waiting forever.
        """
        self.is_connected = False
        if self.is_complete:
            return
        self.audio_queue.put(None)
84
+
85
class WavesStreamingTTS:
    def _connect(self):
        """(Re)open the WebSocket and block until the handshake completes.

        Runs ``WebSocketApp.run_forever`` on a daemon thread and polls
        ``is_connected`` (set by ``_on_open``) every 100 ms for up to
        5 seconds.

        Raises:
            Exception: If the connection is not established within the timeout.
        """
        # Drop any previous session before reconnecting.
        if self.ws:
            self.ws.close()

        self.ws = WebSocketApp(
            self.ws_url,
            header=self._get_headers(),
            on_open=self._on_open,
            on_message=self._on_message,
            on_error=self._on_error,
            on_close=self._on_close
        )

        worker = threading.Thread(target=self.ws.run_forever)
        worker.daemon = True
        worker.start()

        deadline = time.time() + 5.0
        while not self.is_connected and time.time() < deadline:
            time.sleep(0.1)

        if not self.is_connected:
            raise Exception("Failed to connect to WebSocket")
109
+
110
class WavesStreamingTTS:
    def synthesize(self, text: str) -> Generator[bytes, None, None]:
        """Synthesize ``text`` in one shot, yielding raw audio chunks.

        Opens a fresh connection, sends a single non-continuing payload and
        drains ``audio_queue`` until the ``None`` sentinel (or until an
        error surfaced by the socket callbacks is raised).

        Args:
            text: The full text to synthesize.

        Yields:
            Raw decoded audio bytes as they arrive from the server.

        Raises:
            Exception: Any error reported by the server or the transport.
        """
        self._reset_state()
        self._connect()

        try:
            self.ws.send(json.dumps(self._create_payload(text)))

            while True:
                # Fail fast on any error surfaced by the socket callbacks.
                if not self.error_queue.empty():
                    raise self.error_queue.get()

                try:
                    chunk = self.audio_queue.get(timeout=1.0)
                except queue.Empty:
                    if self.is_complete:
                        break
                    continue
                if chunk is None:  # completion sentinel from _on_message/_on_close
                    break
                yield chunk
        finally:
            # Fix: the original leaked the socket when an error was raised or
            # the generator was abandoned mid-stream; always close it.
            self.ws.close()
132
+
133
class WavesStreamingTTS:
    def synthesize_streaming(self, text_stream: Generator[str, None, None],
                             continue_stream: bool = True,
                             auto_flush: bool = True) -> Generator[bytes, None, None]:
        """Synthesize text arriving from a generator, yielding audio chunks.

        A background daemon thread forwards each non-blank fragment of
        ``text_stream`` to the server (optionally followed by a final flush
        message), while the caller's thread drains ``audio_queue`` until the
        ``None`` sentinel or an error.

        Args:
            text_stream: Generator producing text fragments to synthesize.
            continue_stream: Sent as the "continue" flag on each text payload.
            auto_flush: Send a final empty "flush" payload after the stream is
                exhausted so buffered text is synthesized.

        Yields:
            Raw decoded audio bytes as they arrive from the server.

        Raises:
            Exception: Any error surfaced by the sender thread or the socket.
        """
        self._reset_state()
        self._connect()

        def _pump_text():
            # Runs on a daemon thread; failures are reported via error_queue.
            try:
                for fragment in text_stream:
                    if fragment.strip():
                        msg = self._create_payload(fragment, continue_stream=continue_stream)
                        self.ws.send(json.dumps(msg))

                if auto_flush:
                    self.ws.send(json.dumps(self._create_payload("", flush=True)))
            except Exception as exc:
                self.error_queue.put(exc)

        sender = threading.Thread(target=_pump_text)
        sender.daemon = True
        sender.start()

        try:
            while True:
                if not self.error_queue.empty():
                    raise self.error_queue.get()

                try:
                    chunk = self.audio_queue.get(timeout=1.0)
                except queue.Empty:
                    if self.is_complete:
                        break
                    continue
                if chunk is None:  # completion sentinel
                    break
                yield chunk
        finally:
            # Fix: the original leaked the socket when an error was raised or
            # the generator was abandoned mid-stream; always close it.
            self.ws.close()
171
+
172
class WavesStreamingTTS:
    def send_text_chunk(self, text: str, continue_stream: bool = True, flush: bool = False):
        """Send one text payload over an already-open streaming session.

        Args:
            text: Text fragment to forward to the server.
            continue_stream: Sent as the "continue" flag on the payload.
            flush: Sent as the "flush" flag on the payload.

        Raises:
            Exception: If no WebSocket connection is currently open.
        """
        if not self.is_connected:
            raise Exception("WebSocket not connected")
        message = self._create_payload(text, continue_stream=continue_stream, flush=flush)
        self.ws.send(json.dumps(message))
177
+
178
class WavesStreamingTTS:
    def flush_buffer(self):
        """Ask the server to synthesize any text still held in its buffer.

        Raises:
            Exception: If no WebSocket connection is currently open.
        """
        if not self.is_connected:
            raise Exception("WebSocket not connected")
        # An empty text payload with flush=True triggers a server-side flush.
        self.ws.send(json.dumps(self._create_payload("", flush=True)))
183
+
184
class WavesStreamingTTS:
    def start_streaming_session(self) -> Generator[bytes, None, None]:
        """Open a session for manual streaming and yield audio as it arrives.

        Intended to be consumed while the caller pushes text with
        ``send_text_chunk`` / ``flush_buffer``. NOTE(review): being a
        generator, nothing runs (including the connect) until the first
        iteration — iterate once before sending text.

        Yields:
            Raw decoded audio bytes until the completion sentinel or close.

        Raises:
            Exception: Any error surfaced by the socket callbacks.
        """
        self._reset_state()
        self._connect()

        while True:
            if not self.error_queue.empty():
                raise self.error_queue.get()

            try:
                chunk = self.audio_queue.get(timeout=0.1)
            except queue.Empty:
                if self.is_complete:
                    break
                continue
            if chunk is None:  # completion sentinel
                break
            yield chunk
201
+
202
class WavesStreamingTTS:
    def _reset_state(self):
        """Discard per-request state so a new synthesis can start cleanly."""
        # Fresh queues drop any audio/errors left over from a prior request.
        self.audio_queue = queue.Queue()
        self.error_queue = queue.Queue()
        self.request_id = None
        self.is_connected = False
        self.is_complete = False
smallestai/waves/utils.py CHANGED
@@ -1,8 +1,5 @@
1
- import re
2
- import io
3
1
  from typing import List
4
2
  from typing import Optional
5
- from pydub import AudioSegment
6
3
  from dataclasses import dataclass
7
4
 
8
5
  from smallestai.waves.exceptions import ValidationError
@@ -10,7 +7,7 @@ from smallestai.waves.models import TTSModels, TTSLanguages_lightning, TTSLangua
10
7
 
11
8
 
12
9
  API_BASE_URL = "https://waves-api.smallest.ai/api/v1"
13
- SENTENCE_END_REGEX = re.compile(r'.*[-.—!?,;:…।|]$')
10
+ WEBSOCKET_URL = "wss://waves-api.smallest.ai/api/v1/lightning-v2/get_speech/stream"
14
11
  SAMPLE_WIDTH = 2
15
12
  CHANNELS = 1
16
13
  ALLOWED_AUDIO_EXTENSIONS = ['.mp3', '.wav']
@@ -22,11 +19,12 @@ class TTSOptions:
22
19
  sample_rate: int
23
20
  voice_id: str
24
21
  api_key: str
25
- add_wav_header: bool
26
22
  speed: float
27
23
  consistency: float
28
24
  similarity: float
29
25
  enhancement: int
26
+ language: str
27
+ output_format: str
30
28
 
31
29
 
32
30
  def validate_input(text: str, model: str, sample_rate: int, speed: float, consistency: Optional[float] = None, similarity: Optional[float] = None, enhancement: Optional[int] = None):
@@ -46,50 +44,6 @@ def validate_input(text: str, model: str, sample_rate: int, speed: float, consis
46
44
  raise ValidationError(f"Invalid enhancement: {enhancement}. Must be between 0 and 2.")
47
45
 
48
46
 
49
- def add_wav_header(frame_input: bytes, sample_rate: int = 24000, sample_width: int = 2, channels: int = 1) -> bytes:
50
- audio = AudioSegment(data=frame_input, sample_width=sample_width, frame_rate=sample_rate, channels=channels)
51
- wav_buf = io.BytesIO()
52
- audio.export(wav_buf, format="wav")
53
- wav_buf.seek(0)
54
- return wav_buf.read()
55
-
56
-
57
- def preprocess_text(text: str) -> str:
58
- text = text.replace("\n", " ").replace("\t", " ")
59
- text = re.sub(r'\s+', ' ', text)
60
- return text.strip()
61
-
62
-
63
- def chunk_text(text: str, chunk_size: int = 250) -> List[str]:
64
- chunks = []
65
- while text:
66
- if len(text) <= chunk_size:
67
- chunks.append(text.strip())
68
- break
69
-
70
- chunk_text = text[:chunk_size]
71
- last_break_index = -1
72
-
73
- # Find last sentence boundary using regex
74
- for i in range(len(chunk_text) - 1, -1, -1):
75
- if SENTENCE_END_REGEX.match(chunk_text[:i + 1]):
76
- last_break_index = i
77
- break
78
-
79
- if last_break_index == -1:
80
- # Fallback to space if no sentence boundary found
81
- last_space = chunk_text.rfind(' ')
82
- if last_space != -1:
83
- last_break_index = last_space
84
- else:
85
- last_break_index = chunk_size - 1
86
-
87
- chunks.append(text[:last_break_index + 1].strip())
88
- text = text[last_break_index + 1:].strip()
89
-
90
- return chunks
91
-
92
-
93
47
  def get_smallest_languages(model: str = 'lightning') -> List[str]:
94
48
  if model == 'lightning':
95
49
  return TTSLanguages_lightning