smallestai 0.1.0__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Potentially problematic release.
This version of smallestai might be problematic. Click here for more details.
- smallest/__init__.py +5 -0
- smallest/async_tts.py +151 -0
- smallest/exceptions.py +15 -0
- smallest/models.py +7 -0
- smallest/stream_tts.py +135 -0
- smallest/tts.py +134 -0
- smallest/utils.py +69 -0
- smallestai-0.1.0.dist-info/LICENSE +21 -0
- smallestai-0.1.0.dist-info/METADATA +246 -0
- smallestai-0.1.0.dist-info/RECORD +12 -0
- smallestai-0.1.0.dist-info/WHEEL +5 -0
- smallestai-0.1.0.dist-info/top_level.txt +1 -0
smallest/__init__.py
ADDED
smallest/async_tts.py
ADDED
|
@@ -0,0 +1,151 @@
|
|
|
1
|
+
import os
|
|
2
|
+
import copy
|
|
3
|
+
import aiohttp
|
|
4
|
+
import aiofiles
|
|
5
|
+
from typing import Optional, Union, List
|
|
6
|
+
|
|
7
|
+
from .models import TTSModels, TTSVoices
|
|
8
|
+
from .exceptions import TTSError, APIError
|
|
9
|
+
from .utils import (TTSOptions, validate_input, preprocess_text, add_wav_header,
|
|
10
|
+
get_smallest_languages, get_smallest_voices, get_smallest_models, API_BASE_URL)
|
|
11
|
+
|
|
12
|
+
|
|
13
|
+
class AsyncSmallest:
    def __init__(
        self,
        api_key: Optional[str] = None,
        model: TTSModels = "lightning",
        sample_rate: int = 24000,
        voice: TTSVoices = "emily",
        speed: Optional[float] = 1.0,
        add_wav_header: Optional[bool] = True,
        transliterate: Optional[bool] = False,
        remove_extra_silence: Optional[bool] = False
    ) -> None:
        """
        AsyncSmallest Instance for asynchronous text-to-speech synthesis.

        This class provides an asynchronous implementation of the text-to-speech functionality.
        It allows for non-blocking synthesis of speech from text, making it suitable for applications
        that require async processing.

        Args:
        - api_key (str): The API key for authentication, export it as 'SMALLEST_API_KEY' in your environment variables.
        - model (TTSModels): The model to be used for synthesis.
        - sample_rate (int): The sample rate for the audio output.
        - voice (TTSVoices): The voice to be used for synthesis.
        - speed (float): The speed of the speech synthesis.
        - add_wav_header (bool): Whether to add a WAV header to the output audio.
        - transliterate (bool): Whether to transliterate the text.
        - remove_extra_silence (bool): Whether to remove extra silence from the synthesized audio.

        Methods:
        - get_languages: Returns a list of available languages for synthesis.
        - get_voices: Returns a list of available voices for synthesis.
        - get_models: Returns a list of available models for synthesis.
        - synthesize: Asynchronously converts the provided text into speech and returns the audio content.

        Raises:
        - TTSError: If no API key is provided and none is found in the environment.
        """
        self.api_key = api_key or os.environ.get("SMALLEST_API_KEY")
        if not self.api_key:
            raise TTSError("API key is required")

        self.opts = TTSOptions(
            model=model,
            sample_rate=sample_rate,
            voice=voice,
            api_key=self.api_key,
            add_wav_header=add_wav_header,
            speed=speed,
            transliterate=transliterate,
            remove_extra_silence=remove_extra_silence,
        )
        # Created lazily so the instance can be constructed outside a running event loop.
        self.session = None

    async def __aenter__(self):
        if self.session is None:
            self.session = aiohttp.ClientSession()
        return self

    async def __aexit__(self, exc_type, exc_val, exc_tb):
        if self.session:
            await self.session.close()

    def get_languages(self) -> List[str]:
        """Returns a list of available languages."""
        return get_smallest_languages()

    def get_voices(self) -> List[str]:
        """Returns a list of available voices."""
        return get_smallest_voices()

    def get_models(self) -> List[str]:
        """Returns a list of available models."""
        return get_smallest_models()

    async def synthesize(
        self,
        text: str,
        save_as: Optional[str] = None,
        **kwargs
    ) -> Union[bytes, None]:
        """
        Asynchronously synthesize speech from the provided text.

        Args:
        - text (str): The text to be converted to speech.
        - save_as (Optional[str]): If provided, the synthesized audio will be saved to this file path.
            The file must have a .wav extension.
        - kwargs: Additional optional parameters to override `__init__` options for this call.

        Returns:
        - Union[bytes, None]: The synthesized audio content in bytes if `save_as` is not specified;
            otherwise, returns None after saving the audio to the specified file.

        Raises:
        - TTSError: If the provided file name does not have a .wav extension when `save_as` is specified.
        - APIError: If the API request fails or returns an error.
        """
        # Work on a deep copy so per-call kwargs never mutate the instance defaults.
        opts = copy.deepcopy(self.opts)
        for key, value in kwargs.items():
            setattr(opts, key, value)

        validate_input(text, opts.voice, opts.model, opts.sample_rate, opts.speed)

        payload = {
            "text": preprocess_text(text),
            "sample_rate": opts.sample_rate,
            "voice_id": opts.voice,
            "add_wav_header": opts.add_wav_header,
            "speed": opts.speed,
            "model": opts.model,
            "transliterate": opts.transliterate,
            "remove_extra_silence": opts.remove_extra_silence
        }

        headers = {
            "Authorization": f"Bearer {self.api_key}",
            "Content-Type": "application/json",
        }

        # Fall back to an ad-hoc session when the instance is not used as a
        # context manager; __aexit__ will close it only if one was opened there.
        if not self.session:
            self.session = aiohttp.ClientSession()

        async with self.session.post(f"{API_BASE_URL}/{opts.model}/get_speech", json=payload, headers=headers) as res:
            if res.status != 200:
                raise APIError(f"Failed to synthesize speech: {await res.text()}. For more information, visit https://waves.smallest.ai/")

            audio_content = await res.read()

        if save_as:
            if not save_as.endswith(".wav"):
                raise TTSError("Invalid file name. Extension must be .wav")

            # BUGFIX: consult the per-call `opts` (which includes kwargs
            # overrides) rather than `self.opts`, so overriding
            # add_wav_header/sample_rate for a single call is honored here too.
            if opts.add_wav_header:
                # The API response already contains a WAV header.
                async with aiofiles.open(save_as, mode='wb') as f:
                    await f.write(audio_content)
            else:
                # Raw PCM from the API: prepend a WAV header before writing.
                async with aiofiles.open(save_as, mode='wb') as f:
                    await f.write(add_wav_header(audio_content, opts.sample_rate))
            return None

        return audio_content
|
smallest/exceptions.py
ADDED
|
@@ -0,0 +1,15 @@
|
|
|
1
|
+
class TTSError(Exception):
    """Base exception for all errors raised by the TTS SDK."""
|
|
4
|
+
|
|
5
|
+
class APIError(TTSError):
    """Raised when the Waves API responds with an error."""
|
|
8
|
+
|
|
9
|
+
class ValidationError(TTSError):
    """Raised when validation of user-supplied input fails."""
|
|
12
|
+
|
|
13
|
+
class AuthenticationError(TTSError):
    """Raised when API authentication fails."""
|
smallest/models.py
ADDED
|
@@ -0,0 +1,7 @@
|
|
|
1
|
+
from typing import Literal
|
|
2
|
+
|
|
3
|
+
# Model identifiers accepted by the synthesis API.
TTSModels = Literal["lightning"]
# Language codes the service can synthesize.
TTSLanguages = Literal["en", "hi"]
# Voice identifiers available for synthesis.
TTSVoices = Literal["emily", "jasmine", "arman", "james", "mithali", "aravind", "raj",
"arjun", "sanya", "saina", "pooja", "saurabh", "nisha", "mansi", "radhika", "kajal",
"raghav", "deepika", "niharika", "monika", "raman", "diya", "ananya", "william"]
|
smallest/stream_tts.py
ADDED
|
@@ -0,0 +1,135 @@
|
|
|
1
|
+
import asyncio
|
|
2
|
+
from threading import Thread
|
|
3
|
+
from queue import Queue, Empty
|
|
4
|
+
from typing import AsyncGenerator, Optional, Union
|
|
5
|
+
|
|
6
|
+
from .tts import Smallest
|
|
7
|
+
from .exceptions import APIError
|
|
8
|
+
from .async_tts import AsyncSmallest
|
|
9
|
+
from .utils import SENTENCE_END_REGEX
|
|
10
|
+
|
|
11
|
+
class TextToAudioStream:
    def __init__(
        self,
        tts_instance: Union[Smallest, AsyncSmallest],
        queue_timeout: float = 5.0,
        max_retries: int = 3
    ):
        """
        A real-time text-to-speech processor that converts streaming text into audio output.
        Useful for applications requiring immediate audio feedback from text generation,
        such as voice assistants, live captioning, or interactive chatbots.

        ⚠️ `add_wav_header` is disabled by default for streaming efficiency. Refer to the README for more information.

        Features:
        - Streams audio chunks as soon as text is available.
        - Handles both sync and async text-to-speech engines.
        - Automatically retries failed synthesis attempts.
        - Low latency between text generation and speech output.

        Args:
            tts_instance: The text-to-speech engine to use (Smallest or AsyncSmallest)
            queue_timeout: How long to wait for new text (seconds, default: 5.0)
            max_retries: Number of retry attempts for failed synthesis (default: 3)
        """
        self.tts_instance = tts_instance
        self.sentence_end_regex = SENTENCE_END_REGEX
        self.queue_timeout = queue_timeout
        self.max_retries = max_retries
        self.queue = Queue()
        # Maximum number of buffered characters before a forced flush.
        self.buffer_size = 250
        self.stop_flag = False
        # Streaming yields raw PCM chunks; a per-chunk WAV header would corrupt
        # the concatenated stream, so it is disabled on the engine.
        self.tts_instance.opts.add_wav_header = False


    async def _stream_llm_output(self, llm_output: AsyncGenerator[str, None]) -> None:
        """
        Streams the LLM output, splitting it into sentences and adding each to the queue.

        Parameters:
        - llm_output (AsyncGenerator[str, None]): An async generator yielding LLM output.
        """
        buffer = ""
        async for chunk in llm_output:
            buffer += chunk
            # Flush on sentence-ending punctuation, or when the buffer exceeds
            # the configured cap. BUGFIX: the original tested
            # `self.buffer_size > 600` (the constant 250 vs 600 — always
            # False), so overlong sentence-less text was never flushed.
            if self.sentence_end_regex.match(buffer) or len(buffer) > self.buffer_size:
                self.queue.put(buffer)
                buffer = ""

        if buffer:
            self.queue.put(buffer)

        self.stop_flag = True  # completion flag when LLM output ends


    async def _synthesize_async(self, sentence: str, retries: int = 0) -> Optional[bytes]:
        """Asynchronously synthesizes a given sentence, retrying on APIError."""
        try:
            return await self.tts_instance.synthesize(sentence)
        except APIError as e:
            if retries < self.max_retries:
                return await self._synthesize_async(sentence, retries + 1)
            else:
                print(f"Synthesis failed for sentence: {sentence} - Error: {e}. Retries Exhausted, for more information, visit https://waves.smallest.ai/")
                return None


    def _synthesize_sync(self, sentence: str, retries: int = 0) -> Optional[bytes]:
        """Synchronously synthesizes a given sentence, retrying on APIError."""
        try:
            return self.tts_instance.synthesize(sentence)
        except APIError as e:
            if retries < self.max_retries:
                return self._synthesize_sync(sentence, retries + 1)
            else:
                print(f"Synthesis failed for sentence: {sentence} - Error: {e}. Retries Exhausted, for more information, visit https://waves.smallest.ai/")
                return None


    async def _run_synthesis(self) -> AsyncGenerator[bytes, None]:
        """
        Continuously synthesizes sentences from the queue, yielding audio content.
        If no sentences are in the queue, it waits until new data is available or streaming is complete.
        """
        while not self.stop_flag or not self.queue.empty():
            try:
                # NOTE(review): Queue.get with a timeout blocks this coroutine's
                # thread (and the event loop) for up to queue_timeout seconds
                # when the queue is empty — confirm acceptable for callers.
                sentence = self.queue.get(timeout=self.queue_timeout)
                if isinstance(self.tts_instance, AsyncSmallest):
                    audio_content = await self._synthesize_async(sentence)
                else:
                    # Run the blocking sync engine off the event loop.
                    loop = asyncio.get_running_loop()
                    audio_content = await loop.run_in_executor(None, self._synthesize_sync, sentence)

                if audio_content:
                    yield audio_content
            except Empty:
                if self.stop_flag:
                    break
                await asyncio.sleep(0.1)  # avoid busy waiting if the queue is empty


    async def process(self, llm_output: AsyncGenerator[str, None]) -> AsyncGenerator[bytes, None]:
        """
        Convert streaming text into audio in real-time.

        Handles the entire pipeline from receiving text to producing audio,
        yielding audio chunks as soon as they're ready.

        Args:
            llm_output: An async generator that yields text chunks.

        Yields:
            Raw audio data chunks (without WAV headers) that can be:
            - Played directly through an audio device
            - Saved to a file
            - Streamed over a network
            - Further processed as needed
        """
        # The LLM consumer runs in its own thread with its own event loop so
        # text ingestion and synthesis can overlap.
        llm_thread = Thread(target=asyncio.run, args=(self._stream_llm_output(llm_output),))
        llm_thread.start()

        async for audio_content in self._run_synthesis():
            yield audio_content

        llm_thread.join()
|
smallest/tts.py
ADDED
|
@@ -0,0 +1,134 @@
|
|
|
1
|
+
import os
|
|
2
|
+
import copy
|
|
3
|
+
import requests
|
|
4
|
+
from typing import Optional, Union, List
|
|
5
|
+
|
|
6
|
+
from .models import TTSModels, TTSVoices
|
|
7
|
+
from .exceptions import TTSError, APIError
|
|
8
|
+
from .utils import (TTSOptions, validate_input, preprocess_text,
|
|
9
|
+
get_smallest_languages, get_smallest_voices, get_smallest_models, API_BASE_URL)
|
|
10
|
+
|
|
11
|
+
class Smallest:
    def __init__(
        self,
        api_key: Optional[str] = None,
        model: TTSModels = "lightning",
        sample_rate: int = 24000,
        voice: TTSVoices = "emily",
        speed: Optional[float] = 1.0,
        add_wav_header: Optional[bool] = True,
        transliterate: Optional[bool] = False,
        remove_extra_silence: Optional[bool] = True
    ) -> None:
        """
        Smallest Instance for text-to-speech synthesis.

        This is a synchronous implementation of the text-to-speech functionality.
        For an asynchronous version, please refer to the AsyncSmallest Instance.

        Args:
        - api_key (str): The API key for authentication, export it as 'SMALLEST_API_KEY' in your environment variables.
        - model (TTSModels): The model to be used for synthesis.
        - sample_rate (int): The sample rate for the audio output.
        - voice (TTSVoices): The voice to be used for synthesis.
        - speed (float): The speed of the speech synthesis.
        - add_wav_header (bool): Whether to add a WAV header to the output audio.
        - transliterate (bool): Whether to transliterate the text.
        - remove_extra_silence (bool): Whether to remove extra silence from the synthesized audio.
          NOTE(review): defaults to True here but False in AsyncSmallest — confirm this asymmetry is intended.

        Methods:
        - get_languages: Returns a list of available languages for synthesis.
        - get_voices: Returns a list of available voices for synthesis.
        - get_models: Returns a list of available models for synthesis.
        - synthesize: Converts the provided text into speech and returns the audio content.

        Raises:
        - TTSError: If no API key is provided and none is found in the environment.
        """
        self.api_key = api_key or os.environ.get("SMALLEST_API_KEY")
        if not self.api_key:
            raise TTSError("API key is required")

        self.opts = TTSOptions(
            model=model,
            sample_rate=sample_rate,
            voice=voice,
            api_key=self.api_key,
            add_wav_header=add_wav_header,
            speed=speed,
            transliterate=transliterate,
            remove_extra_silence=remove_extra_silence
        )

    def get_languages(self) -> List[str]:
        """Returns a list of available languages."""
        return get_smallest_languages()

    def get_voices(self) -> List[str]:
        """Returns a list of available voices."""
        return get_smallest_voices()

    def get_models(self) -> List[str]:
        """Returns a list of available models."""
        return get_smallest_models()

    def synthesize(
        self,
        text: str,
        save_as: Optional[str] = None,
        **kwargs
    ) -> Union[bytes, None]:
        """
        Synthesize speech from the provided text.

        Args:
        - text (str): The text to be converted to speech.
        - save_as (Optional[str]): If provided, the synthesized audio will be saved to this file path.
            The file must have a .wav extension.
        - kwargs: Additional optional parameters to override `__init__` options for this call.

        Returns:
        - Union[bytes, None]: The synthesized audio content in bytes if `save_as` is not specified;
            otherwise, returns None after saving the audio to the specified file.

        Raises:
        - TTSError: If the provided file name does not have a .wav extension when `save_as` is specified.
        - APIError: If the API request fails or returns an error.
        """
        # Work on a deep copy so per-call kwargs never mutate the instance defaults.
        opts = copy.deepcopy(self.opts)
        for key, value in kwargs.items():
            setattr(opts, key, value)

        validate_input(text, opts.voice, opts.model, opts.sample_rate, opts.speed)

        payload = {
            "text": preprocess_text(text),
            "sample_rate": opts.sample_rate,
            "voice_id": opts.voice,
            "add_wav_header": opts.add_wav_header,
            "speed": opts.speed,
            "model": opts.model,
            "transliterate": opts.transliterate,
            "remove_extra_silence": opts.remove_extra_silence,
        }

        headers = {
            "Authorization": f"Bearer {self.api_key}",
            "Content-Type": "application/json",
        }

        res = requests.post(f"{API_BASE_URL}/{opts.model}/get_speech", json=payload, headers=headers)
        if res.status_code != 200:
            raise APIError(f"Failed to synthesize speech: {res.text}. Please check if you have set the correct API key. For more information, visit https://waves.smallest.ai/")

        audio_content = res.content

        if save_as:
            if not save_as.endswith(".wav"):
                raise TTSError("Invalid file name. Extension must be .wav")

            # BUGFIX: consult the per-call `opts` (which includes kwargs
            # overrides) rather than `self.opts`, so a per-call
            # add_wav_header override is honored when saving.
            if opts.add_wav_header:
                with open(save_as, "wb") as wf:
                    wf.write(audio_content)
            else:
                raise TTSError("WAV header is required for saving audio. Set 'add_wav_header=True' to add a WAV header.")
            return None

        return audio_content
|
smallest/utils.py
ADDED
|
@@ -0,0 +1,69 @@
|
|
|
1
|
+
import re
|
|
2
|
+
import io
|
|
3
|
+
import unicodedata
|
|
4
|
+
from typing import List
|
|
5
|
+
from pydub import AudioSegment
|
|
6
|
+
from dataclasses import dataclass
|
|
7
|
+
from sacremoses import MosesPunctNormalizer
|
|
8
|
+
|
|
9
|
+
from .exceptions import ValidationError
|
|
10
|
+
from .models import TTSModels, TTSLanguages, TTSVoices
|
|
11
|
+
|
|
12
|
+
|
|
13
|
+
# Base URL for the Smallest AI Waves TTS REST API.
API_BASE_URL = "https://waves-api.smallest.ai/api/v1"
# Matches a buffer that ends with sentence-terminating punctuation; used by
# the streaming pipeline to decide when buffered LLM text can be flushed.
SENTENCE_END_REGEX = re.compile(r'.*[-.!?;:…\n]$')
# PCM sample width in bytes (16-bit audio).
SAMPLE_WIDTH = 2
# Mono audio.
CHANNELS = 1
17
|
+
|
|
18
|
+
|
|
19
|
+
@dataclass
class TTSOptions:
    """Container for all per-request text-to-speech synthesis settings."""
    model: TTSModels            # model identifier, e.g. "lightning"
    sample_rate: int            # output sample rate in Hz
    voice: TTSVoices            # voice identifier, e.g. "emily"
    api_key: str                # Waves API key used for authentication
    add_wav_header: bool        # ask the API to include a WAV header in the response
    speed: float                # speech speed multiplier
    transliterate: bool         # whether the API should transliterate the text
    remove_extra_silence: bool  # whether the API should trim extra silence
|
|
29
|
+
|
|
30
|
+
|
|
31
|
+
def validate_input(text: str, voice: TTSVoices, model: TTSModels, sample_rate: int, speed: float):
    """Validate synthesis parameters, raising ValidationError on the first failure.

    Args:
        text: Text to synthesize; must be non-empty.
        voice: Voice identifier; must be one of the TTSVoices literals.
        model: Model identifier; must be one of the TTSModels literals.
        sample_rate: Output sample rate in Hz; must lie in [8000, 48000].
        speed: Speech speed multiplier; must lie in [0.5, 2.0].

    Raises:
        ValidationError: If any parameter is invalid.
    """
    if not text:
        raise ValidationError("Text cannot be empty")
    if voice not in TTSVoices.__args__:
        raise ValidationError(f"Invalid voice: {voice}")
    # Derive the allowed set from the Literal alias (single source of truth)
    # instead of a hard-coded list, matching the voice check above.
    if model not in TTSModels.__args__:
        raise ValidationError(f"Invalid model: {model}")
    if not 8000 <= sample_rate <= 48000:
        raise ValidationError(f"Invalid sample rate: {sample_rate}. Must be between 8000 and 48000")
    if not 0.5 <= speed <= 2.0:
        raise ValidationError(f"Invalid speed: {speed}. Must be between 0.5 and 2.0")
|
|
42
|
+
|
|
43
|
+
|
|
44
|
+
def add_wav_header(frame_input: bytes, sample_rate: int = 24000, sample_width: int = 2, channels: int = 1) -> bytes:
    """Wrap raw PCM frames in a WAV container and return the encoded bytes."""
    segment = AudioSegment(
        data=frame_input,
        sample_width=sample_width,
        frame_rate=sample_rate,
        channels=channels,
    )
    container = io.BytesIO()
    segment.export(container, format="wav")
    return container.getvalue()
|
|
50
|
+
|
|
51
|
+
|
|
52
|
+
def preprocess_text(text: str) -> str:
    """Normalize text before it is sent to the synthesis API.

    Decomposes characters (NFKD) and drops anything not representable in
    ASCII, lowercases, then normalizes punctuation with the Moses
    punctuation normalizer.

    NOTE(review): the ASCII strip removes all non-Latin characters — confirm
    this is intended given the package also advertises Hindi support.
    """
    # Fold special characters to their closest ASCII form; non-ASCII is dropped.
    ascii_text = unicodedata.normalize('NFKD', text).encode('ASCII', 'ignore').decode('ASCII')
    normalizer = MosesPunctNormalizer()
    return normalizer.normalize(ascii_text.lower()).strip()
|
|
60
|
+
|
|
61
|
+
|
|
62
|
+
def get_smallest_languages() -> List[str]:
    """Return the language codes supported for synthesis."""
    return [*TTSLanguages.__args__]
|
|
64
|
+
|
|
65
|
+
def get_smallest_voices() -> List[str]:
    """Return the voice identifiers available for synthesis."""
    return [*TTSVoices.__args__]
|
|
67
|
+
|
|
68
|
+
def get_smallest_models() -> List[str]:
    """Return the model identifiers available for synthesis.

    Derived from the TTSModels Literal alias (like the language and voice
    getters) so this list cannot drift from the type used for validation.
    """
    return list(TTSModels.__args__)
|
|
@@ -0,0 +1,21 @@
|
|
|
1
|
+
MIT License
|
|
2
|
+
|
|
3
|
+
Copyright (c) 2021 smallest.ai
|
|
4
|
+
|
|
5
|
+
Permission is hereby granted, free of charge, to any person obtaining a copy
|
|
6
|
+
of this software and associated documentation files (the "Software"), to deal
|
|
7
|
+
in the Software without restriction, including without limitation the rights
|
|
8
|
+
to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
|
|
9
|
+
copies of the Software, and to permit persons to whom the Software is
|
|
10
|
+
furnished to do so, subject to the following conditions:
|
|
11
|
+
|
|
12
|
+
The above copyright notice and this permission notice shall be included in all
|
|
13
|
+
copies or substantial portions of the Software.
|
|
14
|
+
|
|
15
|
+
THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
|
|
16
|
+
IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
|
|
17
|
+
FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
|
|
18
|
+
AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
|
|
19
|
+
LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
|
|
20
|
+
OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
|
|
21
|
+
SOFTWARE.
|
|
@@ -0,0 +1,246 @@
|
|
|
1
|
+
Metadata-Version: 2.1
|
|
2
|
+
Name: smallestai
|
|
3
|
+
Version: 0.1.0
|
|
4
|
+
Summary: Official Python client for the Smallest AI API
|
|
5
|
+
Author-email: Smallest <info@smallest.ai>
|
|
6
|
+
License: MIT
|
|
7
|
+
Project-URL: Homepage, https://github.com/smallest-inc/smallest-python-sdk
|
|
8
|
+
Keywords: smallest,smallest.ai,tts,text-to-speech
|
|
9
|
+
Classifier: Intended Audience :: Developers
|
|
10
|
+
Classifier: License :: OSI Approved :: MIT License
|
|
11
|
+
Classifier: Programming Language :: Python :: 3
|
|
12
|
+
Requires-Python: >=3.9
|
|
13
|
+
Description-Content-Type: text/markdown
|
|
14
|
+
License-File: LICENSE
|
|
15
|
+
Requires-Dist: aiohttp
|
|
16
|
+
Requires-Dist: aiofiles
|
|
17
|
+
Requires-Dist: requests
|
|
18
|
+
Requires-Dist: sacremoses
|
|
19
|
+
Requires-Dist: pydub
|
|
20
|
+
Provides-Extra: test
|
|
21
|
+
Requires-Dist: jiwer ; extra == 'test'
|
|
22
|
+
Requires-Dist: httpx ; extra == 'test'
|
|
23
|
+
Requires-Dist: pytest ; extra == 'test'
|
|
24
|
+
Requires-Dist: pytest-asyncio ; extra == 'test'
|
|
25
|
+
Requires-Dist: deepgram-sdk ; extra == 'test'
|
|
26
|
+
Requires-Dist: python-dotenv ; extra == 'test'
|
|
27
|
+
|
|
28
|
+

|
|
29
|
+
|
|
30
|
+
|
|
31
|
+
<div align="center">
|
|
32
|
+
<a href="https://twitter.com/smallest_AI">
|
|
33
|
+
<img src="https://img.shields.io/twitter/url/https/twitter.com/smallest_AI.svg?style=social&label=Follow%20smallest_AI" alt="Twitter">
|
|
34
|
+
</a>
|
|
35
|
+
<a href="https://discord.gg/ywShEyXHBW">
|
|
36
|
+
<img src="https://dcbadge.vercel.app/api/server/ywShEyXHBW?style=flat" alt="Discord">
|
|
37
|
+
</a>
|
|
38
|
+
<a href="https://www.linkedin.com/company/smallest">
|
|
39
|
+
<img src="https://img.shields.io/badge/LinkedIn-Connect-blue" alt="Linkedin">
|
|
40
|
+
</a>
|
|
41
|
+
<a href="https://www.youtube.com/@smallest_ai">
|
|
42
|
+
<img src="https://img.shields.io/static/v1?message=smallest_ai&logo=youtube&label=&color=FF0000&logoColor=white&labelColor=&style=for-the-badge" height=20 alt="Youtube">
|
|
43
|
+
</a>
|
|
44
|
+
</div>
|
|
45
|
+
|
|
46
|
+
## Official Python Client for Smallest AI API
|
|
47
|
+
|
|
48
|
+
Smallest AI builds high-speed multi-lingual voice models tailored for real-time applications, achieving ultra-realistic audio generation in as fast as ~100 milliseconds for 10 seconds of audio. With this sdk, you can easily convert text into high-quality audio with humanlike expressiveness.
|
|
49
|
+
|
|
50
|
+
Currently, the library supports direct synthesis and the ability to synthesize streamed LLM output, both synchronously and asynchronously.
|
|
51
|
+
|
|
52
|
+
## Table of Contents
|
|
53
|
+
|
|
54
|
+
- [Installation](#installation)
|
|
55
|
+
- [Get the API Key](#get-the-api-key)
|
|
56
|
+
- [Examples](#examples)
|
|
57
|
+
- [Sync](#sync)
|
|
58
|
+
- [Async](#async)
|
|
59
|
+
- [LLM to Speech](#llm-to-speech)
|
|
60
|
+
- [Available Methods](#available-methods)
|
|
61
|
+
- [Technical Note: WAV Headers in Streaming Audio](#technical-note-wav-headers-in-streaming-audio)
|
|
62
|
+
|
|
63
|
+
## Installation
|
|
64
|
+
|
|
65
|
+
To install the package, follow these steps:
|
|
66
|
+
|
|
67
|
+
1. Clone the repository:
|
|
68
|
+
```bash
|
|
69
|
+
git clone https://github.com/smallest-inc/smallest-python-sdk.git
|
|
70
|
+
```
|
|
71
|
+
|
|
72
|
+
2. Navigate to the cloned directory and install the package:
|
|
73
|
+
```bash
|
|
74
|
+
cd smallest-python-sdk
|
|
75
|
+
pip install .
|
|
76
|
+
```
|
|
77
|
+
|
|
78
|
+
## Get the API Key
|
|
79
|
+
|
|
80
|
+
1. Visit [waves.smallest.ai](https://waves.smallest.ai/) and sign up for an account or log in if you already have an account.
|
|
81
|
+
2. Navigate to `API Key` tab in your account dashboard.
|
|
82
|
+
3. Create a new API Key and copy it.
|
|
83
|
+
4. Export the API Key in your environment with the name `SMALLEST_API_KEY`, ensuring that your application can access it securely for authentication.
|
|
84
|
+
|
|
85
|
+
## Examples
|
|
86
|
+
|
|
87
|
+
### Sync
|
|
88
|
+
A synchronous text-to-speech synthesis client.
|
|
89
|
+
|
|
90
|
+
**Basic Usage:**
|
|
91
|
+
```python
|
|
92
|
+
import os
|
|
93
|
+
from smallest import Smallest
|
|
94
|
+
|
|
95
|
+
def main():
|
|
96
|
+
client = Smallest(api_key=os.environ.get("SMALLEST_API_KEY"))
|
|
97
|
+
audio_data = client.synthesize("Hello, this is a test for sync synthesis function.")
|
|
98
|
+
with open("sync_synthesize.wav", "wb") as f:
|
|
99
|
+
f.write(audio_data)
|
|
100
|
+
|
|
101
|
+
if __name__ == "__main__":
|
|
102
|
+
main()
|
|
103
|
+
```
|
|
104
|
+
|
|
105
|
+
**Parameters:**
|
|
106
|
+
- `api_key`: Your API key (can be set via SMALLEST_API_KEY environment variable)
|
|
107
|
+
- `model`: TTS model to use (default: "lightning")
|
|
108
|
+
- `sample_rate`: Audio sample rate (default: 24000)
|
|
109
|
+
- `voice`: Voice ID (default: "emily")
|
|
110
|
+
- `speed`: Speech speed multiplier (default: 1.0)
|
|
111
|
+
- `add_wav_header`: Include WAV header in output (default: True)
|
|
112
|
+
- `transliterate`: Enable text transliteration (default: False)
|
|
113
|
+
- `remove_extra_silence`: Remove additional silence (default: True)
|
|
114
|
+
|
|
115
|
+
### Async
|
|
116
|
+
An asynchronous text-to-speech synthesis client.
|
|
117
|
+
|
|
118
|
+
**Basic Usage:**
|
|
119
|
+
```python
|
|
120
|
+
import os
|
|
121
|
+
import asyncio
|
|
122
|
+
import aiofiles
|
|
123
|
+
from smallest import AsyncSmallest
|
|
124
|
+
|
|
125
|
+
client = AsyncSmallest(api_key=os.environ.get("SMALLEST_API_KEY"))
|
|
126
|
+
|
|
127
|
+
async def main():
|
|
128
|
+
async with client as tts:
|
|
129
|
+
audio_bytes = await tts.synthesize("Hello, this is a test of the async synthesis function.")
|
|
130
|
+
async with aiofiles.open("async_synthesize.wav", "wb") as f:
|
|
131
|
+
await f.write(audio_bytes)
|
|
132
|
+
|
|
133
|
+
if __name__ == "__main__":
|
|
134
|
+
asyncio.run(main())
|
|
135
|
+
```
|
|
136
|
+
|
|
137
|
+
**Parameters:**
|
|
138
|
+
- `api_key`: Your API key (can be set via SMALLEST_API_KEY environment variable)
|
|
139
|
+
- `model`: TTS model to use (default: "lightning")
|
|
140
|
+
- `sample_rate`: Audio sample rate (default: 24000)
|
|
141
|
+
- `voice`: Voice ID (default: "emily")
|
|
142
|
+
- `speed`: Speech speed multiplier (default: 1.0)
|
|
143
|
+
- `add_wav_header`: Include WAV header in output (default: True)
|
|
144
|
+
- `transliterate`: Enable text transliteration (default: False)
|
|
145
|
+
- `remove_extra_silence`: Remove additional silence (default: False)
|
|
146
|
+
|
|
147
|
+
### LLM to Speech
|
|
148
|
+
|
|
149
|
+
The `TextToAudioStream` class provides real-time text-to-speech processing, converting streaming text into audio output with minimal latency. It's particularly useful for applications like voice assistants, live captioning, or interactive chatbots that require immediate audio feedback from text generation. Supports both synchronous and asynchronous TTS instance.
|
|
150
|
+
|
|
151
|
+
```python
|
|
152
|
+
import os
|
|
153
|
+
import wave
|
|
154
|
+
import asyncio
|
|
155
|
+
from groq import Groq
|
|
156
|
+
from smallest import Smallest
|
|
157
|
+
from smallest import TextToAudioStream
|
|
158
|
+
|
|
159
|
+
llm = Groq(api_key=os.environ.get("GROQ_API_KEY"))
|
|
160
|
+
tts = Smallest(api_key=os.environ.get("SMALLEST_API_KEY"))
|
|
161
|
+
|
|
162
|
+
async def generate_text(prompt):
|
|
163
|
+
"""Async generator for streaming text from Groq. You can use any LLM"""
|
|
164
|
+
completion = llm.chat.completions.create(
|
|
165
|
+
messages=[
|
|
166
|
+
{
|
|
167
|
+
"role": "user",
|
|
168
|
+
"content": prompt,
|
|
169
|
+
}
|
|
170
|
+
],
|
|
171
|
+
model="llama3-8b-8192",
|
|
172
|
+
stream=True,
|
|
173
|
+
)
|
|
174
|
+
|
|
175
|
+
for chunk in completion:
|
|
176
|
+
text = chunk.choices[0].delta.content
|
|
177
|
+
if text is not None:
|
|
178
|
+
yield text
|
|
179
|
+
|
|
180
|
+
async def save_audio_to_wav(file_path, processor, llm_output):
|
|
181
|
+
with wave.open(file_path, "wb") as wav_file:
|
|
182
|
+
wav_file.setnchannels(1)
|
|
183
|
+
wav_file.setsampwidth(2)
|
|
184
|
+
wav_file.setframerate(24000)
|
|
185
|
+
|
|
186
|
+
async for audio_chunk in processor.process(llm_output):
|
|
187
|
+
wav_file.writeframes(audio_chunk)
|
|
188
|
+
|
|
189
|
+
async def main():
|
|
190
|
+
# Initialize the TTS processor with the TTS instance
|
|
191
|
+
processor = TextToAudioStream(tts_instance=tts)
|
|
192
|
+
|
|
193
|
+
# Generate text asynchronously and process it
|
|
194
|
+
llm_output = generate_text("Explain text to speech like I am five in 5 sentences.")
|
|
195
|
+
|
|
196
|
+
# As an example, save the generated audio to a WAV file.
|
|
197
|
+
await save_audio_to_wav("llm_to_speech.wav", processor, llm_output)
|
|
198
|
+
|
|
199
|
+
if __name__ == "__main__":
|
|
200
|
+
asyncio.run(main())
|
|
201
|
+
```
|
|
202
|
+
|
|
203
|
+
**Parameters:**
|
|
204
|
+
|
|
205
|
+
- `tts_instance`: Text-to-speech engine (Smallest or AsyncSmallest)
|
|
206
|
+
- `queue_timeout`: Wait time for new text (seconds, default: 5.0)
|
|
207
|
+
- `max_retries`: Number of retry attempts for failed synthesis (default: 3)
|
|
208
|
+
|
|
209
|
+
**Output Format:**
|
|
210
|
+
The processor yields raw audio data chunks without WAV headers for streaming efficiency. These chunks can be:
|
|
211
|
+
|
|
212
|
+
- Played directly through an audio device
|
|
213
|
+
- Saved to a file
|
|
214
|
+
- Streamed over a network
|
|
215
|
+
- Further processed as needed
|
|
216
|
+
|
|
217
|
+
|
|
218
|
+
## Available Methods
|
|
219
|
+
|
|
220
|
+
```python
|
|
221
|
+
from smallest.tts import Smallest
|
|
222
|
+
|
|
223
|
+
client = Smallest()
|
|
224
|
+
|
|
225
|
+
print(f"Available Languages: {client.get_languages()}")
|
|
226
|
+
print(f"Available Voices: {client.get_voices()}")
|
|
227
|
+
print(f"Available Models: {client.get_models()}")
|
|
228
|
+
```
|
|
229
|
+
|
|
230
|
+
## Technical Note: WAV Headers in Streaming Audio
|
|
231
|
+
|
|
232
|
+
When implementing audio streaming with chunks of synthesized speech, WAV headers are omitted from individual chunks because:
|
|
233
|
+
|
|
234
|
+
#### Technical Issues
|
|
235
|
+
- Each WAV header contains metadata about the entire audio file.
|
|
236
|
+
- Multiple headers would make chunks appear as separate audio files and add redundancy.
|
|
237
|
+
- Headers contain file-specific data (like total size) that's invalid for chunks.
|
|
238
|
+
- Chunks with embedded headers cause audio artifacts (pop sounds) when concatenated or played back sequentially.
|
|
239
|
+
- Audio players would try to reinitialize audio settings for each chunk.
|
|
240
|
+
|
|
241
|
+
### Best Practices
|
|
242
|
+
1. Stream raw PCM audio data without headers
|
|
243
|
+
2. Add a single WAV header only when:
|
|
244
|
+
- Saving the complete stream to a file
|
|
245
|
+
- Initializing the audio playback system
|
|
246
|
+
- Converting the stream to a standard audio format
|
|
@@ -0,0 +1,12 @@
|
|
|
1
|
+
smallest/__init__.py,sha256=vaoIBml_IobavpVvFazB86iikg2iEy4h3ddxqv_0Fy4,190
|
|
2
|
+
smallest/async_tts.py,sha256=w_SY1Oetn5Zorq-8JXA7lGeRHR3kTtBzqotc_hF0hOQ,6010
|
|
3
|
+
smallest/exceptions.py,sha256=41GLVvNTfRQMQsPLGk0lHuhK2mak8_dVtiFLEtT23Dc,333
|
|
4
|
+
smallest/models.py,sha256=R5UZZA9SibrJ2DsWPi_mkKI13WfyC-MLd-7kptfjns4,390
|
|
5
|
+
smallest/stream_tts.py,sha256=1j4JpAwrAmwprC98mKQwuhXf0HFxFTlMcZ3_JAdcAK0,5416
|
|
6
|
+
smallest/tts.py,sha256=Gr13I-O0qH7EclnR_g29qcpiqITWjgfjCFxFwNxyZrA,5410
|
|
7
|
+
smallest/utils.py,sha256=hAgyEfZEnvayzu8qS4LXhpZR8qK7z4gatLWGVOkS3Yg,2183
|
|
8
|
+
smallestai-0.1.0.dist-info/LICENSE,sha256=kK3HNKhN7luQhkjkNWIvy9_gizbEDUM4mSv_HWq9uuM,1068
|
|
9
|
+
smallestai-0.1.0.dist-info/METADATA,sha256=SFv_JOl8POo4AdpYJV-tgqekj9jz24nf0usA5O1gRvg,8696
|
|
10
|
+
smallestai-0.1.0.dist-info/WHEEL,sha256=R06PA3UVYHThwHvxuRWMqaGcr-PuniXahwjmQRFMEkY,91
|
|
11
|
+
smallestai-0.1.0.dist-info/top_level.txt,sha256=i5ktbWkG-2aS28vrYTeuhKtA-tY2ZG7SHgLHi87QTLw,9
|
|
12
|
+
smallestai-0.1.0.dist-info/RECORD,,
|
|
@@ -0,0 +1 @@
|
|
|
1
|
+
smallest
|