PyPI - smallestai - Versions diffs - 2.0.0__py3-none-any.whl → 2.2.0__py3-none-any.whl - Mend

smallestai 2.0.0-py3-none-any.whl → 2.2.0-py3-none-any.whl

This diff shows the contents of publicly available package versions that have been released to one of the supported registries. It is provided for informational purposes only and reflects the changes between package versions as they appear in their respective public registries.

Files changed (10)

smallest/async_tts.py +108 -44
smallest/stream_tts.py +147 -28
smallest/tts.py +91 -40
smallest/utils.py +17 -16
{smallestai-2.0.0.dist-info → smallestai-2.2.0.dist-info}/METADATA +118 -24
smallestai-2.2.0.dist-info/RECORD +12 -0
{smallestai-2.0.0.dist-info → smallestai-2.2.0.dist-info}/WHEEL +1 -1
smallestai-2.0.0.dist-info/RECORD +0 -12
{smallestai-2.0.0.dist-info → smallestai-2.2.0.dist-info}/LICENSE +0 -0
{smallestai-2.0.0.dist-info → smallestai-2.2.0.dist-info}/top_level.txt +0 -0

smallest/async_tts.py CHANGED Viewed

@@ -4,11 +4,11 @@ import json
 import aiohttp
 import aiofiles
 import requests
-from typing import Optional, Union, List
+from typing import Optional, Union, List, AsyncIterator
 from smallest.exceptions import TTSError, APIError
 from smallest.utils import (TTSOptions, validate_input, preprocess_text, add_wav_header, chunk_text,
-                     get_smallest_languages, get_smallest_models, API_BASE_URL)
+                     get_smallest_languages, get_smallest_models, ALLOWED_AUDIO_EXTENSIONS, API_BASE_URL)
 class AsyncSmallest:
@@ -19,9 +19,10 @@ class AsyncSmallest:
         sample_rate: Optional[int] = 24000,
         voice_id: Optional[str] = "emily",
         speed: Optional[float] = 1.0,
-        add_wav_header: Optional[bool] = True,
-        transliterate: Optional[bool] = False,
-        remove_extra_silence: Optional[bool] = False
+        consistency: Optional[float] = 0.5,
+        similarity: Optional[float] = 0.0,
+        enhancement: Optional[int] = 1,
+        add_wav_header: Optional[bool] = True
     ) -> None:
         """
         AsyncSmallest Instance for asynchronous text-to-speech synthesis.
@@ -36,9 +37,10 @@ class AsyncSmallest:
         - sample_rate (int): The sample rate for the audio output.
         - voice_id (TTSVoices): The voice to be used for synthesis.
         - speed (float): The speed of the speech synthesis.
+        - consistency (float): This parameter controls word repetition and skipping. Decrease it to prevent skipped words, and increase it to prevent repetition. Only supported in `lightning-large` model. Range - [0, 1]
+        - similarity (float): This parameter controls the similarity between the synthesized audio and the reference audio. Increase it to make the speech more similar to the reference audio. Only supported in `lightning-large` model. Range - [0, 1]
+        - enhancement (int): Enhances speech quality at the cost of increased latency. Only supported in `lightning-large` model. Range - [0, 2].
         - add_wav_header (bool): Whether to add a WAV header to the output audio.
-        - transliterate (bool): Whether to transliterate the text.
-        - remove_extra_silence (bool): Whether to remove extra silence from the synthesized audio.
         Methods:
         - get_languages: Returns a list of available languages for synthesis.
@@ -49,6 +51,9 @@ class AsyncSmallest:
         self.api_key = api_key or os.environ.get("SMALLEST_API_KEY")
         if not self.api_key:
             raise TTSError()
+        if model == "lightning-large" and voice_id is None:
+            voice_id = "lakshya"
         self.chunk_size = 250
         self.opts = TTSOptions(
@@ -58,8 +63,9 @@ class AsyncSmallest:
             api_key=self.api_key,
             add_wav_header=add_wav_header,
             speed=speed,
-            transliterate=transliterate,
-            remove_extra_silence=remove_extra_silence,
+            consistency=consistency,
+            similarity=similarity,
+            enhancement=enhancement
         )
         self.session = None
@@ -124,67 +130,92 @@ class AsyncSmallest:
     async def synthesize(
             self,
             text: str,
+            stream: Optional[bool] = False,
             save_as: Optional[str] = None,
             **kwargs
-        ) -> Union[bytes, None]:
+        ) -> Union[bytes, None, AsyncIterator[bytes]]:
         """
         Asynchronously synthesize speech from the provided text.
         Args:
         - text (str): The text to be converted to speech.
+        - stream (Optional[bool]): If True, returns an iterator yielding audio chunks instead of a full byte array.
         - save_as (Optional[str]): If provided, the synthesized audio will be saved to this file path.
                                    The file must have a .wav extension.
         - kwargs: Additional optional parameters to override `__init__` options for this call.
         Returns:
-        - Union[bytes, None]: The synthesized audio content in bytes if `save_as` is not specified;
-                              otherwise, returns None after saving the audio to the specified file.
+        - Union[bytes, None, Iterator[bytes]]:
+            - If `stream=True`, returns an iterator yielding audio chunks.
+            - If `save_as` is provided, saves the file and returns None.
+            - Otherwise, returns the synthesized audio content as bytes.
         Raises:
         - TTSError: If the provided file name does not have a .wav extension when `save_as` is specified.
         - APIError: If the API request fails or returns an error.
+        - ValueError: If an unexpected parameter is passed in `kwargs`.
         """
-        should_cleanup = await self._ensure_session()
+        should_cleanup = False
+        if self.session is None or self.session.closed:
+            self.session = aiohttp.ClientSession()
+            should_cleanup = True  # Cleanup only if we created a new session
         try:
             opts = copy.deepcopy(self.opts)
+            valid_keys = set(vars(opts).keys())
+            invalid_keys = [key for key in kwargs if key not in valid_keys]
+            if invalid_keys:
+                raise ValueError(f"Invalid parameter(s) in kwargs: {', '.join(invalid_keys)}. Allowed parameters are: {', '.join(valid_keys)}")
             for key, value in kwargs.items():
                 setattr(opts, key, value)
-            validate_input(preprocess_text(text), opts.model, opts.sample_rate, opts.speed)
+            text = preprocess_text(text)
+            validate_input(text, opts.model, opts.sample_rate, opts.speed, opts.consistency, opts.similarity, opts.enhancement)
             self.chunk_size = 250
-            if opts.model == 'ligtning-large':
+            if opts.model == 'lightning-large':
                 self.chunk_size = 140
             chunks = chunk_text(text, self.chunk_size)
-            audio_content = b""
-            for chunk in chunks:
-                payload = {
-                    "text": preprocess_text(chunk),
-                    "sample_rate": opts.sample_rate,
-                    "voice_id": opts.voice_id,
-                    "add_wav_header": False,
-                    "speed": opts.speed,
-                    "model": opts.model,
-                    "transliterate": opts.transliterate,
-                    "remove_extra_silence": opts.remove_extra_silence
-                }
-                headers = {
-                    "Authorization": f"Bearer {self.api_key}",
-                    "Content-Type": "application/json",
-                }
-                if not self.session:
-                    self.session = aiohttp.ClientSession()
-                async with self.session.post(f"{API_BASE_URL}/{opts.model}/get_speech", json=payload, headers=headers) as res:
-                    if res.status != 200:
-                        raise APIError(f"Failed to synthesize speech: {await res.text()}. For more information, visit https://waves.smallest.ai/")
-                    audio_content += await res.read()
+            async def audio_stream():
+                for chunk in chunks:
+                    payload = {
+                        "text": chunk,
+                        "sample_rate": opts.sample_rate,
+                        "voice_id": opts.voice_id,
+                        "add_wav_header": False,
+                        "speed": opts.speed,
+                        "model": opts.model
+                    }
+                    if opts.model == "lightning-large":
+                        if opts.consistency is not None:
+                            payload["consistency"] = opts.consistency
+                        if opts.similarity is not None:
+                            payload["similarity"] = opts.similarity
+                        if opts.enhancement is not None:
+                            payload["enhancement"] = opts.enhancement
+                    headers = {
+                        "Authorization": f"Bearer {self.api_key}",
+                        "Content-Type": "application/json",
+                    }
+                    async with self.session.post(f"{API_BASE_URL}/{opts.model}/get_speech", json=payload, headers=headers) as res:
+                        if res.status != 200:
+                            raise APIError(f"Failed to synthesize speech: {await res.text()}. For more information, visit https://waves.smallest.ai/")
+                        yield await res.read()
+            if stream:
+                return audio_stream()
+            audio_content = b"".join([chunk async for chunk in audio_stream()])
             if save_as:
                 if not save_as.endswith(".wav"):
@@ -199,7 +230,7 @@ class AsyncSmallest:
                 return add_wav_header(audio_content, opts.sample_rate)
             return audio_content
         finally:
             if should_cleanup and self.session:
                 await self.session.close()
@@ -226,7 +257,6 @@ class AsyncSmallest:
         if not os.path.exists(file_path):
             raise TTSError("Invalid file path. File does not exist.")
-        ALLOWED_AUDIO_EXTENSIONS = ['.mp3', '.wav']
         file_extension = os.path.splitext(file_path)[1].lower()
         if file_extension not in ALLOWED_AUDIO_EXTENSIONS:
             raise TTSError(f"Invalid file type. Supported formats are: {ALLOWED_AUDIO_EXTENSIONS}")
@@ -257,4 +287,38 @@ class AsyncSmallest:
             if should_cleanup and self.session:
                 await self.session.close()
                 self.session = None
+    async def delete_voice(self, voice_id: str) -> str:
+        """
+        Delete a cloned voice asynchronously.
+        Args:
+        - voice_id (str): The ID of the voice to be deleted.
+        Returns:
+        - str: The response from the API.
+        Raises:
+        - APIError: If the API request fails or returns an error.
+        """
+        url = f"{API_BASE_URL}/lightning-large"
+        payload = {'voiceId': voice_id}
+        headers = {
+            "Authorization": f"Bearer {self.api_key}",
+        }
+        should_cleanup = await self._ensure_session()
+        try:
+            async with self.session.delete(url, headers=headers, json=payload) as res:
+                if res.status != 200:
+                    raise APIError(f"Failed to delete voice: {await res.text()}. For more information, visit https://waves.smallest.ai/")
+                return await res.text()
+        finally:
+            if should_cleanup and self.session:
+                await self.session.close()
+                self.session = None

smallest/stream_tts.py CHANGED Viewed

@@ -1,7 +1,8 @@
 import asyncio
+import time
 from threading import Thread
 from queue import Queue, Empty
-from typing import AsyncGenerator, Optional, Union
+from typing import AsyncGenerator, Optional, Union, List, Dict, Any
 from smallest.tts import Smallest
 from smallest.exceptions import APIError
@@ -13,7 +14,8 @@ class TextToAudioStream:
         self,
         tts_instance: Union[Smallest, AsyncSmallest],
         queue_timeout: Optional[float] = 5.0,
-        max_retries: Optional[int] = 3
+        max_retries: Optional[int] = 3,
+        verbose: bool = False
     ):
         """
         A real-time text-to-speech processor that converts streaming text into audio output.
@@ -32,6 +34,7 @@ class TextToAudioStream:
             tts_instance: The text-to-speech engine to use (Smallest or AsyncSmallest)
             queue_timeout: How long to wait for new text (seconds, default: 1.0)
             max_retries: Number of retry attempts for failed synthesis (default: 3)
+            verbose: Whether to log detailed metrics about TTS requests (default: False)
         """
         self.tts_instance = tts_instance
         self.tts_instance.opts.add_wav_header = False
@@ -41,6 +44,14 @@ class TextToAudioStream:
         self.queue = Queue()
         self.buffer_size = 250
         self.stop_flag = False
+        self.verbose = verbose
+        # Metrics tracking
+        self.request_count = 0
+        self.request_logs: List[Dict[str, Any]] = []
+        self.start_time = 0
+        self.first_api_response_time = None
+        self.end_time = 0
         if self.tts_instance.opts.model == 'lightning-large':
             self.buffer_size = 140
@@ -48,60 +59,117 @@ class TextToAudioStream:
     async def _stream_llm_output(self, llm_output: AsyncGenerator[str, None]) -> None:
         """
-        Streams the LLM output, splitting it into sentences based on the regex
-        and chunk size, and adding each chunk to the queue.
+        Streams the LLM output, splitting it into chunks based on sentence boundaries
+        or space characters if no sentence boundary is found before reaching buffer_size.
         Parameters:
         - llm_output (AsyncGenerator[str, None]): An async generator yielding LLM output.
         """
         buffer = ""
-        last_break_index = 0
         async for chunk in llm_output:
             buffer += chunk
-            i = 0
-            while i < len(buffer):
-                current_chunk = buffer[:i + 1]
-                if self.sentence_end_regex.match(current_chunk):
-                    last_break_index = i
-                if len(current_chunk) >= self.buffer_size:
-                    if last_break_index > 0:
-                        self.queue.put(f'{buffer[:last_break_index + 1].replace("—", " ").strip()} ')
-                        buffer = buffer[last_break_index + 1:]
+            while len(buffer) > self.buffer_size:
+                chunk_text = buffer[:self.buffer_size]
+                last_break_index = -1
+                # Find last sentence boundary using regex
+                for i in range(len(chunk_text) - 1, -1, -1):
+                    if self.sentence_end_regex.match(chunk_text[:i + 1]):
+                        last_break_index = i
+                        break
+                if last_break_index == -1:
+                    # Fallback to space if no sentence boundary found
+                    last_space = chunk_text.rfind(' ')
+                    if last_space != -1:
+                        last_break_index = last_space
                     else:
-                        # No sentence boundary, split at max chunk size
-                        self.queue.put(f'{buffer[:self.buffer_size].replace("—", " ").strip()} ')
-                        buffer = buffer[self.buffer_size:]
-                    last_break_index = 0
-                    i = -1
-                i += 1
+                        last_break_index = self.buffer_size - 1
+                # Add chunk to queue and update buffer
+                self.queue.put(f'{buffer[:last_break_index + 1].replace("—", " ").strip()} ')
+                buffer = buffer[last_break_index + 1:].strip()
+        # Don't forget the remaining text
         if buffer:
             self.queue.put(f'{buffer.replace("—", " ").strip()} ')
         self.stop_flag = True
     def _synthesize_sync(self, sentence: str, retries: int = 0) -> Optional[bytes]:
         """Synchronously synthesizes a given sentence."""
+        request_start_time = time.time()
+        request_id = self.request_count + 1
         try:
-            return self.tts_instance.synthesize(sentence)
+            audio_content = self.tts_instance.synthesize(sentence)
+            self.request_count += 1
+            request_end_time = time.time()
+            if self.verbose:
+                request_duration = request_end_time - request_start_time
+                if self.first_api_response_time is None:
+                    self.first_api_response_time = time.time() - self.start_time
+                self.request_logs.append({
+                    "id": request_id,
+                    "text": sentence,
+                    "start_time": request_start_time - self.start_time,
+                    "end_time": request_end_time - self.start_time,
+                    "duration": request_duration,
+                    "char_count": len(sentence),
+                    "retries": retries
+                })
+            return audio_content
         except APIError as e:
             if retries < self.max_retries:
+                if self.verbose:
+                    print(f"Retry {retries + 1}/{self.max_retries} for request: '{sentence[:30]}...'")
                 return self._synthesize_sync(sentence, retries + 1)
             else:
-                print(f"Synthesis failed for sentence: {sentence} - Error: {e}. Retries Exhausted, for more information, visit https://waves.smallest.ai/")
+                if self.verbose:
+                    print(f"Synthesis failed for sentence: {sentence} - Error: {e}. Retries Exhausted, for more information, visit https://waves.smallest.ai/")
                 return None
     async def _synthesize_async(self, sentence: str, retries: int = 0) -> Optional[bytes]:
         """Asynchronously synthesizes a given sentence."""
+        request_start_time = time.time()
+        request_id = self.request_count + 1
         try:
-            return await self.tts_instance.synthesize(sentence)
+            audio_content = await self.tts_instance.synthesize(sentence)
+            self.request_count += 1
+            request_end_time = time.time()
+            if self.verbose:
+                request_duration = request_end_time - request_start_time
+                if self.first_api_response_time is None:
+                    self.first_api_response_time = time.time() - self.start_time
+                self.request_logs.append({
+                    "id": request_id,
+                    "text": sentence,
+                    "start_time": request_start_time - self.start_time,
+                    "end_time": request_end_time - self.start_time,
+                    "duration": request_duration,
+                    "char_count": len(sentence),
+                    "retries": retries
+                })
+            return audio_content
         except APIError as e:
             if retries < self.max_retries:
+                if self.verbose:
+                    print(f"Retry {retries + 1}/{self.max_retries} for request: '{sentence[:30]}...'")
                 return await self._synthesize_async(sentence, retries + 1)
             else:
-                print(f"Synthesis failed for sentence: {sentence} - Error: {e}. Retries Exhausted, for more information, visit https://waves.smallest.ai/")
+                if self.verbose:
+                    print(f"Synthesis failed for sentence: {sentence} - Error: {e}. Retries Exhausted, for more information, visit https://waves.smallest.ai/")
                 return None
@@ -112,7 +180,8 @@ class TextToAudioStream:
         """
         while not self.stop_flag or not self.queue.empty():
             try:
-                sentence = self.queue.get(timeout=self.queue_timeout)
+                sentence = self.queue.get_nowait()
                 if isinstance(self.tts_instance, AsyncSmallest):
                     audio_content = await self._synthesize_async(sentence)
                 else:
@@ -121,10 +190,55 @@ class TextToAudioStream:
                 if audio_content:
                     yield audio_content
             except Empty:
-                if self.stop_flag:
+                # Quick check if we should exit
+                if self.stop_flag and self.queue.empty():
                     break
-                await asyncio.sleep(0.1)  # avoid busy waiting if the queue is empty
+                # Short sleep to avoid busy-waiting
+                await asyncio.sleep(0.01)  # Much shorter sleep time (10ms)
+    def _print_verbose_summary(self) -> None:
+        """Print a summary of all metrics if verbose mode is enabled."""
+        if not self.verbose:
+            return
+        total_duration = self.end_time - self.start_time
+        print("\n" + "="*100)
+        print(f"TEXT-TO-AUDIO STREAM METRICS")
+        print("="*100)
+        print(f"\nOVERALL STATISTICS:")
+        print(f"  Total requests made: {self.request_count}")
+        print(f"  Time to first API response: {self.first_api_response_time:.3f}s")
+        print(f"  Total processing time: {total_duration:.3f}s")
+        # Print table header
+        print("\nREQUEST DETAILS:")
+        header = f"{'#':4} {'Start (s)':10} {'End (s)':10} {'Duration (s)':12} {'Characters':15} {'Text'}"
+        print("\n" + header)
+        print("-" * 100)
+        # Print table rows
+        for log in self.request_logs:
+            row = (
+                f"{log['id']:4} "
+                f"{log['start_time']:10.3f} "
+                f"{log['end_time']:10.3f} "
+                f"{log['duration']:12.3f} "
+                f"{log['char_count']:15} "
+                f"{log['text'][:50]}{'...' if len(log['text']) > 50 else ''}"
+            )
+            print(row)
+            # Print retry information if any
+            if log['retries'] > 0:
+                print(f"{'':4} {'':10} {'':10} {'':12} {'':15} Retries: {log['retries']}")
+        print("\n" + "="*100)
     async def process(self, llm_output: AsyncGenerator[str, None]) -> AsyncGenerator[bytes, None]:
@@ -144,6 +258,8 @@ class TextToAudioStream:
             - Streamed over a network
             - Further processed as needed
         """
+        self.start_time = time.time()
         llm_thread = Thread(target=asyncio.run, args=(self._stream_llm_output(llm_output),))
         llm_thread.start()
@@ -151,3 +267,6 @@ class TextToAudioStream:
             yield audio_content
         llm_thread.join()
+        self.end_time = time.time()
+        self._print_verbose_summary()

smallest/tts.py CHANGED Viewed

@@ -3,11 +3,11 @@ import json
 import wave
 import copy
 import requests
-from typing import Optional, Union, List
+from typing import Optional, Union, List, Iterator
 from smallest.exceptions import TTSError, APIError
 from smallest.utils import (TTSOptions, validate_input, preprocess_text, add_wav_header, chunk_text,
-get_smallest_languages, get_smallest_models, API_BASE_URL)
+get_smallest_languages, get_smallest_models, ALLOWED_AUDIO_EXTENSIONS, API_BASE_URL)
 class Smallest:
     def __init__(
@@ -17,9 +17,10 @@ class Smallest:
         sample_rate: Optional[int] = 24000,
         voice_id: Optional[str] = "emily",
         speed: Optional[float] = 1.0,
-        add_wav_header: Optional[bool] = True,
-        transliterate: Optional[bool] = False,
-        remove_extra_silence: Optional[bool] = True
+        consistency: Optional[float] = 0.5,
+        similarity: Optional[float] = 0.0,
+        enhancement: Optional[int] = 1,
+        add_wav_header: Optional[bool] = True
     ) -> None:
         """
         Smallest Instance for text-to-speech synthesis.
@@ -33,9 +34,10 @@ class Smallest:
         - sample_rate (int): The sample rate for the audio output.
         - voice_id (TTSVoices): The voice to be used for synthesis.
         - speed (float): The speed of the speech synthesis.
+        - consistency (float): This parameter controls word repetition and skipping. Decrease it to prevent skipped words, and increase it to prevent repetition. Only supported in `lightning-large` model. Range - [0, 1]
+        - similarity (float): This parameter controls the similarity between the synthesized audio and the reference audio. Increase it to make the speech more similar to the reference audio. Only supported in `lightning-large` model. Range - [0, 1]
+        - enhancement (int): Enhances speech quality at the cost of increased latency. Only supported in `lightning-large` model. Range - [0, 2].
         - add_wav_header (bool): Whether to add a WAV header to the output audio.
-        - transliterate (bool): Whether to transliterate the text.
-        - remove_extra_silence (bool): Whether to remove extra silence from the synthesized audio.
         Methods:
         - get_languages: Returns a list of available languages for synthesis.
@@ -46,7 +48,9 @@ class Smallest:
         self.api_key = api_key or os.environ.get("SMALLEST_API_KEY")
         if not self.api_key:
             raise TTSError()
+        if model == "lightning-large" and voice_id is None:
+            voice_id = "lakshya"
         self.chunk_size = 250
         self.opts = TTSOptions(
@@ -56,8 +60,9 @@ class Smallest:
             api_key=self.api_key,
             add_wav_header=add_wav_header,
             speed=speed,
-            transliterate=transliterate,
-            remove_extra_silence=remove_extra_silence
+            consistency=consistency,
+            similarity=similarity,
+            enhancement=enhancement
         )
@@ -102,61 +107,81 @@ class Smallest:
     def synthesize(
             self,
             text: str,
+            stream: Optional[bool] = False,
             save_as: Optional[str] = None,
             **kwargs
-        ) -> Union[bytes, None]:
+        ) -> Union[bytes, None, Iterator[bytes]]:
         """
         Synthesize speech from the provided text.
-        Args:
         - text (str): The text to be converted to speech.
-        - save_as (Optional[str]): If provided, the synthesized audio will be saved to this file path.
+        - stream (Optional[bool]): If True, returns an iterator yielding audio chunks instead of a full byte array.
+        - save_as (Optional[str]): If provided, the synthesized audio will be saved to this file path.
                                    The file must have a .wav extension.
         - kwargs: Additional optional parameters to override `__init__` options for this call.
         Returns:
-        - Union[bytes, None]: The synthesized audio content in bytes if `save_as` is not specified;
-                              otherwise, returns None after saving the audio to the specified file.
+        - Union[bytes, None, Iterator[bytes]]:
+            - If `stream=True`, returns an iterator yielding audio chunks.
+            - If `save_as` is provided, saves the file and returns None.
+            - Otherwise, returns the synthesized audio content as bytes.
         Raises:
         - TTSError: If the provided file name does not have a .wav extension when `save_as` is specified.
         - APIError: If the API request fails or returns an error.
         """
         opts = copy.deepcopy(self.opts)
+        valid_keys = set(vars(opts).keys())
+        invalid_keys = [key for key in kwargs if key not in valid_keys]
+        if invalid_keys:
+            raise ValueError(f"Invalid parameter(s) in kwargs: {', '.join(invalid_keys)}. Allowed parameters are: {', '.join(valid_keys)}")
         for key, value in kwargs.items():
             setattr(opts, key, value)
-        validate_input(preprocess_text(text), opts.model, opts.sample_rate, opts.speed)
+        text = preprocess_text(text)
+        validate_input(text, opts.model, opts.sample_rate, opts.speed, opts.consistency, opts.similarity, opts.enhancement)
         self.chunk_size = 250
         if opts.model == "lightning-large":
             self.chunk_size = 140
         chunks = chunk_text(text, self.chunk_size)
-        audio_content = b""
-        for chunk in chunks:
-            payload = {
-                "text": preprocess_text(chunk),
-                "sample_rate": opts.sample_rate,
-                "voice_id": opts.voice_id,
-                "add_wav_header": False,
-                "speed": opts.speed,
-                "model": opts.model,
-                "transliterate": opts.transliterate,
-                "remove_extra_silence": opts.remove_extra_silence,
-            }
-            headers = {
-                "Authorization": f"Bearer {self.api_key}",
-                "Content-Type": "application/json",
-            }
-            res = requests.post(f"{API_BASE_URL}/{opts.model}/get_speech", json=payload, headers=headers)
-            if res.status_code != 200:
-                raise APIError(f"Failed to synthesize speech: {res.text}. Please check if you have set the correct API key. For more information, visit https://waves.smallest.ai/")
+        def audio_stream():
+            for chunk in chunks:
+                payload = {
+                    "text": chunk,
+                    "sample_rate": opts.sample_rate,
+                    "voice_id": opts.voice_id,
+                    "add_wav_header": False,
+                    "speed": opts.speed,
+                }
+                if opts.model == "lightning-large":
+                    if opts.consistency is not None:
+                        payload["consistency"] = opts.consistency
+                    if opts.similarity is not None:
+                        payload["similarity"] = opts.similarity
+                    if opts.enhancement is not None:
+                        payload["enhancement"] = opts.enhancement
+                headers = {
+                    "Authorization": f"Bearer {self.api_key}",
+                    "Content-Type": "application/json",
+                }
+                res = requests.post(f"{API_BASE_URL}/{opts.model}/get_speech", json=payload, headers=headers)
+                if res.status_code != 200:
+                    raise APIError(f"Failed to synthesize speech: {res.text}. Please check if you have set the correct API key. For more information, visit https://waves.smallest.ai/")
+                yield res.content
-            audio_content += res.content
+        if stream:
+            return audio_stream()
+        audio_content = b"".join(audio_stream())
         if save_as:
             if not save_as.endswith(".wav"):
@@ -193,7 +218,6 @@ class Smallest:
         if not os.path.isfile(file_path):
             raise TTSError("Invalid file path. File does not exist.")
-        ALLOWED_AUDIO_EXTENSIONS = ['.mp3', '.wav']
         file_extension = os.path.splitext(file_path)[1].lower()
         if file_extension not in ALLOWED_AUDIO_EXTENSIONS:
             raise TTSError(f"Invalid file type. Supported formats are: {ALLOWED_AUDIO_EXTENSIONS}")
@@ -212,3 +236,30 @@ class Smallest:
             raise APIError(f"Failed to add voice: {response.text}. For more information, visit https://waves.smallest.ai/")
         return json.dumps(response.json(), indent=4, ensure_ascii=False)
+    def delete_voice(self, voice_id: str) -> str:
+        """
+        Delete a cloned voice synchronously.
+        Args:
+        - voice_id (str): The ID of the voice to be deleted.
+        Returns:
+        - str: The response from the API.
+        Raises:
+        - APIError: If the API request fails or returns an error.
+        """
+        url = f"{API_BASE_URL}/lightning-large"
+        payload = {'voiceId': voice_id}
+        headers = {
+            'Authorization': f"Bearer {self.api_key}",
+        }
+        response = requests.delete(url, headers=headers, json=payload)
+        if response.status_code != 200:
+            raise APIError(f"Failed to delete voice: {response.text}. For more information, visit https://waves.smallest.ai/")
+        return json.dumps(response.json(), indent=4, ensure_ascii=False)

smallest/utils.py CHANGED Viewed

@@ -1,9 +1,9 @@
 import re
 import io
 from typing import List
+from typing import Optional
 from pydub import AudioSegment
 from dataclasses import dataclass
-from sacremoses import MosesPunctNormalizer
 from smallest.exceptions import ValidationError
 from smallest.models import TTSModels, TTSLanguages
@@ -11,9 +11,9 @@ from smallest.models import TTSModels, TTSLanguages
 API_BASE_URL = "https://waves-api.smallest.ai/api/v1"
 SENTENCE_END_REGEX = re.compile(r'.*[-.—!?,;:…।|]$')
-mpn = MosesPunctNormalizer()
 SAMPLE_WIDTH = 2
 CHANNELS = 1
+ALLOWED_AUDIO_EXTENSIONS = ['.mp3', '.wav']
 @dataclass
@@ -24,11 +24,12 @@ class TTSOptions:
     api_key: str
     add_wav_header: bool
     speed: float
-    transliterate: bool
-    remove_extra_silence: bool
+    consistency: float
+    similarity: float
+    enhancement: int
-def validate_input(text: str, model: str, sample_rate: int, speed: float):
+def validate_input(text: str, model: str, sample_rate: int, speed: float, consistency: Optional[float] = None, similarity: Optional[float] = None, enhancement: Optional[int] = None):
     if not text:
         raise ValidationError("Text cannot be empty.")
     if model not in TTSModels:
@@ -37,29 +38,29 @@ def validate_input(text: str, model: str, sample_rate: int, speed: float):
         raise ValidationError(f"Invalid sample rate: {sample_rate}. Must be between 8000 and 24000")
     if not 0.5 <= speed <= 2.0:
         raise ValidationError(f"Invalid speed: {speed}. Must be between 0.5 and 2.0")
+    if consistency is not None and not 0.0 <= consistency <= 1.0:
+        raise ValidationError(f"Invalid consistency: {consistency}. Must be between 0.0 and 1.0")
+    if similarity is not None and not 0.0 <= similarity <= 1.0:
+        raise ValidationError(f"Invalid similarity: {similarity}. Must be between 0.0 and 1.0")
+    if enhancement is not None and not 0 <= enhancement <= 2:
+        raise ValidationError(f"Invalid enhancement: {enhancement}. Must be between 0 and 2.")
 def add_wav_header(frame_input: bytes, sample_rate: int = 24000, sample_width: int = 2, channels: int = 1) -> bytes:
-        audio = AudioSegment(data=frame_input, sample_width=sample_width, frame_rate=sample_rate, channels=channels)
-        wav_buf = io.BytesIO()
-        audio.export(wav_buf, format="wav")
-        wav_buf.seek(0)
-        return wav_buf.read()
+    audio = AudioSegment(data=frame_input, sample_width=sample_width, frame_rate=sample_rate, channels=channels)
+    wav_buf = io.BytesIO()
+    audio.export(wav_buf, format="wav")
+    wav_buf.seek(0)
+    return wav_buf.read()
 def preprocess_text(text: str) -> str:
     text = text.replace("\n", " ").replace("\t", " ").replace("—", " ").replace("-", " ").replace("–", " ")
     text = re.sub(r'\s+', ' ', text)
-    text = mpn.normalize(text)
     return text.strip()
 def chunk_text(text: str, chunk_size: int = 250) -> List[str]:
-    """
-    Splits the input text into chunks based on sentence boundaries
-    defined by SENTENCE_END_REGEX and the maximum chunk size.
-    Only splits at valid sentence boundaries to avoid breaking words.
-    """
     chunks = []
     while text:
         if len(text) <= chunk_size:

{smallestai-2.0.0.dist-info → smallestai-2.2.0.dist-info}/METADATA RENAMED Viewed

@@ -1,6 +1,6 @@
 Metadata-Version: 2.2
 Name: smallestai
-Version: 2.0.0
+Version: 2.2.0
 Summary: Official Python client for the Smallest AI API
 Author-email: Smallest <support@smallest.ai>
 License: MIT
@@ -15,7 +15,6 @@ License-File: LICENSE
 Requires-Dist: aiohttp
 Requires-Dist: aiofiles
 Requires-Dist: requests
-Requires-Dist: sacremoses
 Requires-Dist: pydub
 Provides-Extra: test
 Requires-Dist: jiwer; extra == "test"
@@ -59,8 +58,11 @@ Currently, the library supports direct synthesis and the ability to synthesize s
  - [Asynchronous](#Synchronous)
   - [LLM to Speech](#llm-to-speech)
   - [Add your Voice](#add-your-voice)
-    - [Synchronously](#synchronously)
-    - [Asynchronously](#asynchronously)
+    - [Synchronously](#add-synchronously)
+    - [Asynchronously](#add-asynchronously)
+  - [Delete your Voice](#delete-your-voice)
+    - [Synchronously](#delete-synchronously)
+    - [Asynchronously](#delete-asynchronously)
 - [Available Methods](#available-methods)
 - [Technical Note: WAV Headers in Streaming Audio](#technical-note-wav-headers-in-streaming-audio)
@@ -80,14 +82,6 @@ When using an SDK in your application, make sure to pin to at least the major ve
 3. Create a new API Key and copy it.
 4. Export the API Key in your environment with the name `SMALLEST_API_KEY`, ensuring that your application can access it securely for authentication.
-## Best Practices for Input Text
-While the `transliterate` parameter is provided, please note that it is not fully supported and may not perform consistently across all cases. It is recommended to use the model without relying on this parameter.
-For optimal voice generation results:
-1. For English, provide the input in Latin script (e.g., "Hello, how are you?").
-2. For Hindi, provide the input in Devanagari script (e.g., "नमस्ते, आप कैसे हैं?").
-3. For code-mixed input, use Latin script for English and Devanagari script for Hindi (e.g., "Hello, आप कैसे हैं?").
 ## Examples
@@ -115,9 +109,10 @@ if __name__ == "__main__":
 - `sample_rate`: Audio sample rate (default: 24000)
 - `voice_id`: Voice ID (default: "emily")
 - `speed`: Speech speed multiplier (default: 1.0)
-- `add_wav_header`: Include WAV header in output (default: True)
-- `transliterate`: Enable text transliteration (default: False)
-- `remove_extra_silence`: Remove additional silence (default: True)
+- `consistency`: Controls word repetition and skipping. Decrease it to prevent skipped words, and increase it to prevent repetition. Only supported in `lightning-large` model. (default: 0.5)
+- `similarity`: Controls the similarity between the synthesized audio and the reference audio. Increase it to make the speech more similar to the reference audio. Only supported in `lightning-large` model. (default: 0)
+- `enhancement`: Enhances speech quality at the cost of increased latency. Accepts an integer from 0 to 2. Only supported in the `lightning-large` model. (default: 1)
+- `add_wav_header`: Whether to add a WAV header to the output audio.
 These parameters are part of the `Smallest` instance. They can be set when creating the instance (as shown above). However, the `synthesize` function also accepts `kwargs`, allowing you to override these parameters for a specific synthesis request.
@@ -141,9 +136,8 @@ import asyncio
 import aiofiles
 from smallest import AsyncSmallest
-client = AsyncSmallest(api_key="SMALLEST_API_KEY")
 async def main():
+    client = AsyncSmallest(api_key="SMALLEST_API_KEY")
     async with client as tts:
         audio_bytes = await tts.synthesize("Hello, this is a test of the async synthesis function.")
         async with aiofiles.open("async_synthesize.wav", "wb") as f:
@@ -153,15 +147,33 @@ if __name__ == "__main__":
     asyncio.run(main())
 ```
+**Running Asynchronously in a Jupyter Notebook**
+If you are using a Jupyter Notebook, use the following approach to execute the asynchronous function within an existing event loop:
+```python
+import asyncio
+import aiofiles
+from smallest import AsyncSmallest
+async def main():
+    client = AsyncSmallest(api_key="SMALLEST_API_KEY")
+    async with client as tts:
+        audio_bytes = await tts.synthesize("Hello, this is a test of the async synthesis function.")
+        async with aiofiles.open("async_synthesize.wav", "wb") as f:
+            await f.write(audio_bytes) # alternatively you can use the `save_as` parameter.
+await main()
+```
 **Parameters:**
 - `api_key`: Your API key (can be set via SMALLEST_API_KEY environment variable)
 - `model`: TTS model to use (default: "lightning")
 - `sample_rate`: Audio sample rate (default: 24000)
 - `voice_id`: Voice ID (default: "emily")
 - `speed`: Speech speed multiplier (default: 1.0)
-- `add_wav_header`: Include WAV header in output (default: True)
-- `transliterate`: Enable text transliteration (default: False)
-- `remove_extra_silence`: Remove additional silence (default: True)
+- `consistency`: Controls word repetition and skipping. Decrease it to prevent skipped words, and increase it to prevent repetition. Only supported in `lightning-large` model.
+- `similarity`: Controls the similarity between the synthesized audio and the reference audio. Increase it to make the speech more similar to the reference audio. Only supported in `lightning-large` model.
+- `enhancement`: Enhances speech quality at the cost of increased latency. Only supported in `lightning-large` model.
+- `add_wav_header`: Whether to add a WAV header to the output audio.
 These parameters are part of the `AsyncSmallest` instance. They can be set when creating the instance (as shown above). However, the `synthesize` function also accepts `kwargs`, allowing you to override any of these parameters on a per-request basis.
@@ -178,6 +190,58 @@ audio_bytes = await tts.synthesize(
 The `TextToAudioStream` class provides real-time text-to-speech processing, converting streaming text into audio output. It's particularly useful for applications like voice assistants, live captioning, or interactive chatbots that require immediate audio feedback from text generation. It supports both synchronous and asynchronous TTS instances.
+#### Stream through a WebSocket
+```python
+import asyncio
+import websockets
+from groq import Groq
+from smallest import Smallest, TextToAudioStream
+# Initialize Groq (LLM) and Smallest (TTS) instances
+llm = Groq(api_key="GROQ_API_KEY")
+tts = Smallest(api_key="SMALLEST_API_KEY")
+WEBSOCKET_URL = "wss://echo.websocket.events" # Mock WebSocket server
+# Async function to stream text generation from LLM
+async def generate_text(prompt):
+    completion = llm.chat.completions.create(
+        messages=[{"role": "user", "content": prompt}],
+        model="llama3-8b-8192",
+        stream=True,
+    )
+    # Yield text as it is generated
+    for chunk in completion:
+        text = chunk.choices[0].delta.content
+        if text:
+            yield text
+# Main function to run the process
+async def main():
+    # Initialize the TTS processor
+    processor = TextToAudioStream(tts_instance=tts)
+    # Generate text from LLM
+    llm_output = generate_text("Explain text to speech like I am five in 5 sentences.")
+    # Stream the generated speech through a WebSocket
+    async with websockets.connect(WEBSOCKET_URL) as ws:
+        print("Connected to WebSocket server.")
+        # Stream the generated speech
+        async for audio_chunk in processor.process(llm_output):
+            await ws.send(audio_chunk)  # Send audio chunk
+            echoed_data = await ws.recv()  # Receive the echoed message
+            print("Received from server:", echoed_data[:20], "...")  # Print first 20 bytes
+        print("WebSocket connection closed.")
+if __name__ == "__main__":
+    asyncio.run(main())
+```
+#### Save to a File
 ```python
 import wave
 import asyncio
@@ -245,12 +309,12 @@ The processor yields raw audio data chunks without WAV headers for streaming eff
 ## Add your Voice
 The Smallest AI SDK allows you to clone your voice by uploading an audio file. This feature is available both synchronously and asynchronously, making it flexible for different use cases. Below are examples of how to use this functionality.
-### Synchronously
+### Add Synchronously
 ```python
 from smallest import Smallest
 def main():
-    client = Smallest(api_key="YOUR_API_KEY")
+    client = Smallest(api_key="SMALLEST_API_KEY")
     res = client.add_voice(display_name="My Voice", file_path="my_voice.wav")
     print(res)
@@ -258,13 +322,13 @@ if __name__ == "__main__":
     main()
 ```
-### Asynchronously
+### Add Asynchronously
 ```python
 import asyncio
 from smallest import AsyncSmallest
 async def main():
-    client = AsyncSmallest(api_key="YOUR_API_KEY")
+    client = AsyncSmallest(api_key="SMALLEST_API_KEY")
     res = await client.add_voice(display_name="My Voice", file_path="my_voice.wav")
     print(res)
@@ -272,6 +336,36 @@ if __name__ == "__main__":
     asyncio.run(main())
 ```
+## Delete your Voice
+The Smallest AI SDK allows you to delete your cloned voice. This feature is available both synchronously and asynchronously, making it flexible for different use cases. Below are examples of how to use this functionality.
+### Delete Synchronously
+```python
+from smallest import Smallest
+def main():
+    client = Smallest(api_key="SMALLEST_API_KEY")
+    res = client.delete_voice(voice_id="voice_id")
+    print(res)
+if __name__ == "__main__":
+    main()
+```
+### Delete Asynchronously
+```python
+import asyncio
+from smallest import AsyncSmallest
+async def main():
+    client = AsyncSmallest(api_key="SMALLEST_API_KEY")
+    res = await client.delete_voice(voice_id="voice_id")
+    print(res)
+if __name__ == "__main__":
+    asyncio.run(main())
+```
 ## Available Methods
 ```python

smallestai-2.2.0.dist-info/RECORD ADDED Viewed

@@ -0,0 +1,12 @@
+smallest/__init__.py,sha256=vaoIBml_IobavpVvFazB86iikg2iEy4h3ddxqv_0Fy4,190
+smallest/async_tts.py,sha256=fyl1yBd4uqD2KthZMdnfsiY9ZlQlMXDK2JCWmjR03I4,12639
+smallest/exceptions.py,sha256=nY6I8fCXe2By54CytQ0-i3hFiYtt8TYAKj0g6OYsCjc,585
+smallest/models.py,sha256=g2e_4nU5P48vyXZandKLWqZC1TkoEGeLvYKqJIqurSI,83
+smallest/stream_tts.py,sha256=MuQSOgOsZEAYcy-Hbs-ZhCNmtn0u3v9tfOk1RbfAsvY,10893
+smallest/tts.py,sha256=_0OG-1DU0Fx3ZeVlJpNGk3fz6ZceaMfvb5ktkEH3tMw,10721
+smallest/utils.py,sha256=7N4Pghv-6FQENdvWArxGpAuUF5xvcEJm2OxejJTIYnM,3349
+smallestai-2.2.0.dist-info/LICENSE,sha256=kK3HNKhN7luQhkjkNWIvy9_gizbEDUM4mSv_HWq9uuM,1068
+smallestai-2.2.0.dist-info/METADATA,sha256=-02Yij0bHSd6l1PKjJkZyuj7D5Zc5-fgZCeb5cA8T5c,14878
+smallestai-2.2.0.dist-info/WHEEL,sha256=52BFRY2Up02UkjOa29eZOS2VxUrpPORXg1pkohGGUS8,91
+smallestai-2.2.0.dist-info/top_level.txt,sha256=i5ktbWkG-2aS28vrYTeuhKtA-tY2ZG7SHgLHi87QTLw,9
+smallestai-2.2.0.dist-info/RECORD,,

{smallestai-2.0.0.dist-info → smallestai-2.2.0.dist-info}/WHEEL RENAMED Viewed

@@ -1,5 +1,5 @@
 Wheel-Version: 1.0
-Generator: setuptools (75.8.0)
+Generator: setuptools (76.0.0)
 Root-Is-Purelib: true
 Tag: py3-none-any

smallestai-2.0.0.dist-info/RECORD DELETED Viewed

@@ -1,12 +0,0 @@
-smallest/__init__.py,sha256=vaoIBml_IobavpVvFazB86iikg2iEy4h3ddxqv_0Fy4,190
-smallest/async_tts.py,sha256=5qW7owlMeSWFx0rpn9dYfbO76mmNY0DXcytNjLfbbz8,9727
-smallest/exceptions.py,sha256=nY6I8fCXe2By54CytQ0-i3hFiYtt8TYAKj0g6OYsCjc,585
-smallest/models.py,sha256=g2e_4nU5P48vyXZandKLWqZC1TkoEGeLvYKqJIqurSI,83
-smallest/stream_tts.py,sha256=SeP9A9zXJWiV62Eezv0L1J5sRIR304Llc_mwVtOOSUI,6348
-smallest/tts.py,sha256=xBBEk_byRPGT6SYkE6qvhfEupgHl6XBdAqtxmzw2rF8,8311
-smallest/utils.py,sha256=FCZkvbbHJBoN0jpBSqmt1hJjvks56t8i82we4XnqjYk,3016
-smallestai-2.0.0.dist-info/LICENSE,sha256=kK3HNKhN7luQhkjkNWIvy9_gizbEDUM4mSv_HWq9uuM,1068
-smallestai-2.0.0.dist-info/METADATA,sha256=EIyZZqzAvHgQ7jfEs5x5LUx3HjzoCUhzJoXfkb3CuoI,11538
-smallestai-2.0.0.dist-info/WHEEL,sha256=In9FTNxeP60KnTkGw7wk6mJPYd_dQSjEZmXdBdMCI-8,91
-smallestai-2.0.0.dist-info/top_level.txt,sha256=i5ktbWkG-2aS28vrYTeuhKtA-tY2ZG7SHgLHi87QTLw,9
-smallestai-2.0.0.dist-info/RECORD,,

{smallestai-2.0.0.dist-info → smallestai-2.2.0.dist-info}/LICENSE RENAMED Viewed

File without changes

{smallestai-2.0.0.dist-info → smallestai-2.2.0.dist-info}/top_level.txt RENAMED Viewed

File without changes

smallestai 2.0.0__py3-none-any.whl → 2.2.0__py3-none-any.whl

smallestai 2.0.0py3-none-any.whl → 2.2.0py3-none-any.whl