smallestai-1.3.4-py3-none-any.whl → smallestai-2.1.0-py3-none-any.whl
This diff shows the changes between publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Potentially problematic release.
This version of smallestai might be problematic. Click here for more details.
- smallest/async_tts.py +213 -59
- smallest/models.py +4 -22
- smallest/stream_tts.py +49 -44
- smallest/tts.py +137 -34
- smallest/utils.py +32 -44
- {smallestai-1.3.4.dist-info → smallestai-2.1.0.dist-info}/METADATA +164 -37
- smallestai-2.1.0.dist-info/RECORD +12 -0
- {smallestai-1.3.4.dist-info → smallestai-2.1.0.dist-info}/WHEEL +1 -1
- smallestai-1.3.4.dist-info/RECORD +0 -12
- {smallestai-1.3.4.dist-info → smallestai-2.1.0.dist-info}/LICENSE +0 -0
- {smallestai-1.3.4.dist-info → smallestai-2.1.0.dist-info}/top_level.txt +0 -0
smallest/async_tts.py
CHANGED
|
@@ -1,26 +1,25 @@
|
|
|
1
1
|
import os
|
|
2
2
|
import copy
|
|
3
|
+
import json
|
|
3
4
|
import aiohttp
|
|
4
5
|
import aiofiles
|
|
6
|
+
import requests
|
|
5
7
|
from typing import Optional, Union, List
|
|
6
8
|
|
|
7
|
-
from smallest.models import TTSModels, TTSVoices
|
|
8
9
|
from smallest.exceptions import TTSError, APIError
|
|
9
|
-
from smallest.utils import (TTSOptions, validate_input, preprocess_text, add_wav_header,
|
|
10
|
-
get_smallest_languages,
|
|
10
|
+
from smallest.utils import (TTSOptions, validate_input, preprocess_text, add_wav_header, chunk_text,
|
|
11
|
+
get_smallest_languages, get_smallest_models, ALLOWED_AUDIO_EXTENSIONS, API_BASE_URL)
|
|
11
12
|
|
|
12
13
|
|
|
13
14
|
class AsyncSmallest:
|
|
14
15
|
def __init__(
|
|
15
16
|
self,
|
|
16
17
|
api_key: str = None,
|
|
17
|
-
model:
|
|
18
|
-
sample_rate: int = 24000,
|
|
19
|
-
|
|
18
|
+
model: Optional[str] = "lightning",
|
|
19
|
+
sample_rate: Optional[int] = 24000,
|
|
20
|
+
voice_id: Optional[str] = "emily",
|
|
20
21
|
speed: Optional[float] = 1.0,
|
|
21
|
-
add_wav_header: Optional[bool] = True
|
|
22
|
-
transliterate: Optional[bool] = False,
|
|
23
|
-
remove_extra_silence: Optional[bool] = False
|
|
22
|
+
add_wav_header: Optional[bool] = True
|
|
24
23
|
) -> None:
|
|
25
24
|
"""
|
|
26
25
|
AsyncSmallest Instance for asynchronous text-to-speech synthesis.
|
|
@@ -33,11 +32,9 @@ class AsyncSmallest:
|
|
|
33
32
|
- api_key (str): The API key for authentication, export it as 'SMALLEST_API_KEY' in your environment variables.
|
|
34
33
|
- model (TTSModels): The model to be used for synthesis.
|
|
35
34
|
- sample_rate (int): The sample rate for the audio output.
|
|
36
|
-
-
|
|
35
|
+
- voice_id (TTSVoices): The voice to be used for synthesis.
|
|
37
36
|
- speed (float): The speed of the speech synthesis.
|
|
38
37
|
- add_wav_header (bool): Whether to add a WAV header to the output audio.
|
|
39
|
-
- transliterate (bool): Whether to transliterate the text.
|
|
40
|
-
- remove_extra_silence (bool): Whether to remove extra silence from the synthesized audio.
|
|
41
38
|
|
|
42
39
|
Methods:
|
|
43
40
|
- get_languages: Returns a list of available languages for synthesis.
|
|
@@ -48,45 +45,85 @@ class AsyncSmallest:
|
|
|
48
45
|
self.api_key = api_key or os.environ.get("SMALLEST_API_KEY")
|
|
49
46
|
if not self.api_key:
|
|
50
47
|
raise TTSError()
|
|
48
|
+
if model == "lightning-large":
|
|
49
|
+
voice_id = "lakshya"
|
|
50
|
+
|
|
51
51
|
self.chunk_size = 250
|
|
52
52
|
|
|
53
53
|
self.opts = TTSOptions(
|
|
54
54
|
model=model,
|
|
55
55
|
sample_rate=sample_rate,
|
|
56
|
-
|
|
56
|
+
voice_id=voice_id,
|
|
57
57
|
api_key=self.api_key,
|
|
58
58
|
add_wav_header=add_wav_header,
|
|
59
|
-
speed=speed
|
|
60
|
-
transliterate=transliterate,
|
|
61
|
-
remove_extra_silence=remove_extra_silence,
|
|
59
|
+
speed=speed
|
|
62
60
|
)
|
|
63
61
|
self.session = None
|
|
64
62
|
|
|
63
|
+
|
|
65
64
|
async def __aenter__(self):
|
|
66
65
|
if self.session is None:
|
|
67
66
|
self.session = aiohttp.ClientSession()
|
|
68
67
|
return self
|
|
69
68
|
|
|
69
|
+
|
|
70
70
|
async def __aexit__(self, exc_type, exc_val, exc_tb):
|
|
71
71
|
if self.session:
|
|
72
72
|
await self.session.close()
|
|
73
73
|
|
|
74
74
|
|
|
75
|
+
async def _ensure_session(self):
|
|
76
|
+
"""Ensure session exists for direct calls"""
|
|
77
|
+
if not self.session:
|
|
78
|
+
self.session = aiohttp.ClientSession()
|
|
79
|
+
return True
|
|
80
|
+
return False
|
|
81
|
+
|
|
82
|
+
|
|
75
83
|
def get_languages(self) -> List[str]:
|
|
76
84
|
"""Returns a list of available languages."""
|
|
77
85
|
return get_smallest_languages()
|
|
78
86
|
|
|
79
|
-
def
|
|
87
|
+
def get_cloned_voices(self) -> str:
|
|
88
|
+
"""Returns a list of your cloned voices."""
|
|
89
|
+
headers = {
|
|
90
|
+
"Authorization": f"Bearer {self.api_key}",
|
|
91
|
+
}
|
|
92
|
+
|
|
93
|
+
res = requests.request("GET", f"{API_BASE_URL}/lightning-large/get_cloned_voices", headers=headers)
|
|
94
|
+
if res.status_code != 200:
|
|
95
|
+
raise APIError(f"Failed to get cloned voices: {res.text}. For more information, visit https://waves.smallest.ai/")
|
|
96
|
+
|
|
97
|
+
return json.dumps(res.json(), indent=4, ensure_ascii=False)
|
|
98
|
+
|
|
99
|
+
|
|
100
|
+
def get_voices(
|
|
101
|
+
self,
|
|
102
|
+
model: Optional[str] = "lightning"
|
|
103
|
+
) -> str:
|
|
80
104
|
"""Returns a list of available voices."""
|
|
81
|
-
|
|
105
|
+
headers = {
|
|
106
|
+
"Authorization": f"Bearer {self.api_key}",
|
|
107
|
+
}
|
|
108
|
+
|
|
109
|
+
res = requests.request("GET", f"{API_BASE_URL}/{model}/get_voices", headers=headers)
|
|
110
|
+
if res.status_code != 200:
|
|
111
|
+
raise APIError(f"Failed to get voices: {res.text}. For more information, visit https://waves.smallest.ai/")
|
|
112
|
+
|
|
113
|
+
return json.dumps(res.json(), indent=4, ensure_ascii=False)
|
|
114
|
+
|
|
82
115
|
|
|
83
116
|
def get_models(self) -> List[str]:
|
|
84
117
|
"""Returns a list of available models."""
|
|
85
118
|
return get_smallest_models()
|
|
86
119
|
|
|
120
|
+
|
|
87
121
|
async def synthesize(
|
|
88
122
|
self,
|
|
89
123
|
text: str,
|
|
124
|
+
consistency: Optional[float] = 0.5,
|
|
125
|
+
similarity: Optional[float] = 0,
|
|
126
|
+
enhancement: Optional[bool] = False,
|
|
90
127
|
save_as: Optional[str] = None,
|
|
91
128
|
**kwargs
|
|
92
129
|
) -> Union[bytes, None]:
|
|
@@ -97,6 +134,9 @@ class AsyncSmallest:
|
|
|
97
134
|
- text (str): The text to be converted to speech.
|
|
98
135
|
- save_as (Optional[str]): If provided, the synthesized audio will be saved to this file path.
|
|
99
136
|
The file must have a .wav extension.
|
|
137
|
+
- consistency (Optional[float]): This parameter controls word repetition and skipping. Decrease it to prevent skipped words, and increase it to prevent repetition. Only supported in `lightning-large` model.
|
|
138
|
+
- similarity (Optional[float]): This parameter controls the similarity between the synthesized audio and the reference audio. Increase it to make the speech more similar to the reference audio. Only supported in `lightning-large` model.
|
|
139
|
+
- enhancement (Optional[bool]): Enhances speech quality at the cost of increased latency. Only supported in `lightning-large` model.
|
|
100
140
|
- kwargs: Additional optional parameters to override `__init__` options for this call.
|
|
101
141
|
|
|
102
142
|
Returns:
|
|
@@ -106,52 +146,166 @@ class AsyncSmallest:
|
|
|
106
146
|
Raises:
|
|
107
147
|
- TTSError: If the provided file name does not have a .wav extension when `save_as` is specified.
|
|
108
148
|
- APIError: If the API request fails or returns an error.
|
|
149
|
+
- ValueError: If an unexpected parameter is passed in `kwargs`.
|
|
109
150
|
"""
|
|
110
|
-
|
|
111
|
-
|
|
112
|
-
|
|
113
|
-
|
|
114
|
-
|
|
115
|
-
|
|
116
|
-
|
|
117
|
-
|
|
118
|
-
|
|
119
|
-
|
|
120
|
-
|
|
121
|
-
|
|
122
|
-
"
|
|
123
|
-
|
|
124
|
-
|
|
125
|
-
|
|
126
|
-
|
|
127
|
-
|
|
128
|
-
|
|
129
|
-
|
|
130
|
-
|
|
131
|
-
|
|
132
|
-
|
|
133
|
-
|
|
134
|
-
|
|
135
|
-
|
|
136
|
-
|
|
137
|
-
|
|
138
|
-
|
|
139
|
-
|
|
151
|
+
should_cleanup = False
|
|
152
|
+
|
|
153
|
+
if self.session is None or self.session.closed:
|
|
154
|
+
self.session = aiohttp.ClientSession()
|
|
155
|
+
should_cleanup = True # Cleanup only if we created a new session
|
|
156
|
+
|
|
157
|
+
try:
|
|
158
|
+
opts = copy.deepcopy(self.opts)
|
|
159
|
+
valid_keys = set(vars(opts).keys())
|
|
160
|
+
|
|
161
|
+
invalid_keys = [key for key in kwargs if key not in valid_keys]
|
|
162
|
+
if invalid_keys:
|
|
163
|
+
raise ValueError(f"Invalid parameter(s) in kwargs: {', '.join(invalid_keys)}. Allowed parameters are: {', '.join(valid_keys)}")
|
|
164
|
+
|
|
165
|
+
for key, value in kwargs.items():
|
|
166
|
+
setattr(opts, key, value)
|
|
167
|
+
|
|
168
|
+
validate_input(preprocess_text(text), opts.model, opts.sample_rate, opts.speed, consistency, similarity, enhancement)
|
|
169
|
+
|
|
170
|
+
self.chunk_size = 250
|
|
171
|
+
if opts.model == 'lightning-large':
|
|
172
|
+
self.chunk_size = 140
|
|
173
|
+
|
|
174
|
+
chunks = chunk_text(text, self.chunk_size)
|
|
175
|
+
audio_content = b""
|
|
176
|
+
|
|
177
|
+
for chunk in chunks:
|
|
178
|
+
payload = {
|
|
179
|
+
"text": preprocess_text(chunk),
|
|
180
|
+
"sample_rate": opts.sample_rate,
|
|
181
|
+
"voice_id": opts.voice_id,
|
|
182
|
+
"add_wav_header": False,
|
|
183
|
+
"speed": opts.speed,
|
|
184
|
+
"model": opts.model
|
|
185
|
+
}
|
|
186
|
+
|
|
187
|
+
if opts.model == "lightning-large":
|
|
188
|
+
if consistency:
|
|
189
|
+
payload["consistency"] = consistency
|
|
190
|
+
if similarity:
|
|
191
|
+
payload["similarity"] = similarity
|
|
192
|
+
if enhancement:
|
|
193
|
+
payload["enhancement"] = enhancement
|
|
194
|
+
|
|
195
|
+
|
|
196
|
+
headers = {
|
|
197
|
+
"Authorization": f"Bearer {self.api_key}",
|
|
198
|
+
"Content-Type": "application/json",
|
|
199
|
+
}
|
|
200
|
+
|
|
201
|
+
async with self.session.post(f"{API_BASE_URL}/{opts.model}/get_speech", json=payload, headers=headers) as res:
|
|
202
|
+
if res.status != 200:
|
|
203
|
+
raise APIError(f"Failed to synthesize speech: {await res.text()}. For more information, visit https://waves.smallest.ai/")
|
|
204
|
+
|
|
205
|
+
audio_content += await res.read()
|
|
206
|
+
|
|
207
|
+
if save_as:
|
|
208
|
+
if not save_as.endswith(".wav"):
|
|
209
|
+
raise TTSError("Invalid file name. Extension must be .wav")
|
|
210
|
+
|
|
211
|
+
async with aiofiles.open(save_as, mode='wb') as f:
|
|
212
|
+
await f.write(add_wav_header(audio_content, opts.sample_rate))
|
|
213
|
+
|
|
214
|
+
return None
|
|
215
|
+
|
|
216
|
+
if opts.add_wav_header:
|
|
217
|
+
return add_wav_header(audio_content, opts.sample_rate)
|
|
218
|
+
|
|
219
|
+
return audio_content
|
|
220
|
+
|
|
221
|
+
finally:
|
|
222
|
+
if should_cleanup and self.session:
|
|
223
|
+
await self.session.close()
|
|
224
|
+
self.session = None
|
|
225
|
+
|
|
226
|
+
|
|
227
|
+
async def add_voice(self, display_name: str, file_path: str) -> str:
|
|
228
|
+
"""
|
|
229
|
+
Instantly clone your voice asynchronously.
|
|
230
|
+
|
|
231
|
+
Args:
|
|
232
|
+
- display_name (str): The display name for the new voice.
|
|
233
|
+
- file_path (str): The path to the reference audio file to be cloned.
|
|
234
|
+
|
|
235
|
+
Returns:
|
|
236
|
+
- str: The response from the API as a formatted JSON string.
|
|
237
|
+
|
|
238
|
+
Raises:
|
|
239
|
+
- TTSError: If the file does not exist or is not a valid audio file.
|
|
240
|
+
- APIError: If the API request fails or returns an error.
|
|
241
|
+
"""
|
|
242
|
+
url = f"{API_BASE_URL}/lightning-large/add_voice"
|
|
243
|
+
|
|
244
|
+
if not os.path.exists(file_path):
|
|
245
|
+
raise TTSError("Invalid file path. File does not exist.")
|
|
246
|
+
|
|
247
|
+
file_extension = os.path.splitext(file_path)[1].lower()
|
|
248
|
+
if file_extension not in ALLOWED_AUDIO_EXTENSIONS:
|
|
249
|
+
raise TTSError(f"Invalid file type. Supported formats are: {ALLOWED_AUDIO_EXTENSIONS}")
|
|
250
|
+
|
|
251
|
+
headers = {
|
|
252
|
+
'Authorization': f"Bearer {self.api_key}",
|
|
253
|
+
}
|
|
254
|
+
|
|
255
|
+
should_cleanup = await self._ensure_session()
|
|
256
|
+
|
|
257
|
+
try:
|
|
258
|
+
async with aiofiles.open(file_path, 'rb') as f:
|
|
259
|
+
file_data = await f.read()
|
|
260
|
+
|
|
261
|
+
data = aiohttp.FormData()
|
|
262
|
+
content_type = file_extension[1:]
|
|
263
|
+
|
|
264
|
+
data.add_field('displayName', display_name)
|
|
265
|
+
data.add_field('file', file_data, filename=file_path, content_type=f"audio/{content_type}")
|
|
266
|
+
|
|
267
|
+
async with self.session.post(url, headers=headers, data=data) as res:
|
|
140
268
|
if res.status != 200:
|
|
141
|
-
raise APIError(f"Failed to
|
|
269
|
+
raise APIError(f"Failed to add voice: {await res.text()}. For more information, visit https://waves.smallest.ai/")
|
|
270
|
+
|
|
271
|
+
return json.dumps(await res.json(), indent=4, ensure_ascii=False)
|
|
272
|
+
|
|
273
|
+
finally:
|
|
274
|
+
if should_cleanup and self.session:
|
|
275
|
+
await self.session.close()
|
|
276
|
+
self.session = None
|
|
277
|
+
|
|
278
|
+
|
|
279
|
+
async def delete_voice(self, voice_id: str) -> str:
|
|
280
|
+
"""
|
|
281
|
+
Delete a cloned voice asynchronously.
|
|
142
282
|
|
|
143
|
-
|
|
283
|
+
Args:
|
|
284
|
+
- voice_id (str): The ID of the voice to be deleted.
|
|
144
285
|
|
|
145
|
-
|
|
146
|
-
|
|
147
|
-
raise TTSError("Invalid file name. Extension must be .wav")
|
|
286
|
+
Returns:
|
|
287
|
+
- str: The response from the API.
|
|
148
288
|
|
|
149
|
-
|
|
150
|
-
|
|
289
|
+
Raises:
|
|
290
|
+
- APIError: If the API request fails or returns an error.
|
|
291
|
+
"""
|
|
292
|
+
url = f"{API_BASE_URL}/lightning-large"
|
|
293
|
+
payload = {'voiceId': voice_id}
|
|
294
|
+
|
|
295
|
+
headers = {
|
|
296
|
+
"Authorization": f"Bearer {self.api_key}",
|
|
297
|
+
}
|
|
151
298
|
|
|
152
|
-
|
|
299
|
+
should_cleanup = await self._ensure_session()
|
|
153
300
|
|
|
154
|
-
|
|
155
|
-
|
|
301
|
+
try:
|
|
302
|
+
async with self.session.delete(url, headers=headers, json=payload) as res:
|
|
303
|
+
if res.status != 200:
|
|
304
|
+
raise APIError(f"Failed to delete voice: {await res.text()}. For more information, visit https://waves.smallest.ai/")
|
|
156
305
|
|
|
157
|
-
|
|
306
|
+
return await res.text()
|
|
307
|
+
|
|
308
|
+
finally:
|
|
309
|
+
if should_cleanup and self.session:
|
|
310
|
+
await self.session.close()
|
|
311
|
+
self.session = None
|
smallest/models.py
CHANGED
|
@@ -1,23 +1,5 @@
|
|
|
1
|
-
from typing import Literal, List, Tuple, cast
|
|
2
|
-
import aiohttp
|
|
3
|
-
import asyncio
|
|
4
|
-
|
|
5
|
-
API_BASE_URL = "https://waves-api.smallest.ai/api/v1"
|
|
6
|
-
|
|
7
|
-
async def _fetch_voice_and_model() -> Tuple[List[str], List[str]]:
|
|
8
|
-
async with aiohttp.ClientSession() as session:
|
|
9
|
-
async with session.get(f"{API_BASE_URL}/voice/get-all-models") as response:
|
|
10
|
-
api_response = await response.json()
|
|
11
|
-
|
|
12
|
-
voices = []
|
|
13
|
-
for model in api_response:
|
|
14
|
-
for voice in model['voiceIds']:
|
|
15
|
-
voices.append(voice['voiceId'])
|
|
16
|
-
models = [model['modelName'] for model in api_response]
|
|
17
|
-
return models, voices
|
|
18
|
-
|
|
19
|
-
models, voices = asyncio.run(_fetch_voice_and_model())
|
|
20
|
-
|
|
21
1
|
TTSLanguages = ["en", "hi"]
|
|
22
|
-
TTSModels =
|
|
23
|
-
|
|
2
|
+
TTSModels = [
|
|
3
|
+
"lightning",
|
|
4
|
+
"lightning-large"
|
|
5
|
+
]
|
smallest/stream_tts.py
CHANGED
|
@@ -12,8 +12,8 @@ class TextToAudioStream:
|
|
|
12
12
|
def __init__(
|
|
13
13
|
self,
|
|
14
14
|
tts_instance: Union[Smallest, AsyncSmallest],
|
|
15
|
-
queue_timeout: float = 5.0,
|
|
16
|
-
max_retries: int = 3
|
|
15
|
+
queue_timeout: Optional[float] = 5.0,
|
|
16
|
+
max_retries: Optional[int] = 3
|
|
17
17
|
):
|
|
18
18
|
"""
|
|
19
19
|
A real-time text-to-speech processor that converts streaming text into audio output.
|
|
@@ -30,12 +30,11 @@ class TextToAudioStream:
|
|
|
30
30
|
|
|
31
31
|
Args:
|
|
32
32
|
tts_instance: The text-to-speech engine to use (Smallest or AsyncSmallest)
|
|
33
|
-
queue_timeout: How long to wait for new text (seconds, default:
|
|
33
|
+
queue_timeout: How long to wait for new text (seconds, default: 5.0)
|
|
34
34
|
max_retries: Number of retry attempts for failed synthesis (default: 3)
|
|
35
35
|
"""
|
|
36
36
|
self.tts_instance = tts_instance
|
|
37
37
|
self.tts_instance.opts.add_wav_header = False
|
|
38
|
-
|
|
39
38
|
self.sentence_end_regex = SENTENCE_END_REGEX
|
|
40
39
|
self.queue_timeout = queue_timeout
|
|
41
40
|
self.max_retries = max_retries
|
|
@@ -43,69 +42,72 @@ class TextToAudioStream:
|
|
|
43
42
|
self.buffer_size = 250
|
|
44
43
|
self.stop_flag = False
|
|
45
44
|
|
|
45
|
+
if self.tts_instance.opts.model == 'lightning-large':
|
|
46
|
+
self.buffer_size = 140
|
|
47
|
+
|
|
46
48
|
|
|
47
49
|
async def _stream_llm_output(self, llm_output: AsyncGenerator[str, None]) -> None:
|
|
48
50
|
"""
|
|
49
|
-
Streams the LLM output, splitting it into
|
|
50
|
-
|
|
51
|
+
Streams the LLM output, splitting it into chunks based on sentence boundaries
|
|
52
|
+
or space characters if no sentence boundary is found before reaching buffer_size.
|
|
51
53
|
|
|
52
54
|
Parameters:
|
|
53
55
|
- llm_output (AsyncGenerator[str, None]): An async generator yielding LLM output.
|
|
54
56
|
"""
|
|
55
57
|
buffer = ""
|
|
56
|
-
last_break_index = 0
|
|
57
58
|
|
|
58
59
|
async for chunk in llm_output:
|
|
59
60
|
buffer += chunk
|
|
60
|
-
i = 0
|
|
61
|
-
|
|
62
|
-
while i < len(buffer):
|
|
63
|
-
current_chunk = buffer[:i + 1]
|
|
64
|
-
if self.sentence_end_regex.match(current_chunk):
|
|
65
|
-
last_break_index = i
|
|
66
61
|
|
|
67
|
-
|
|
68
|
-
|
|
69
|
-
|
|
70
|
-
|
|
62
|
+
while len(buffer) > self.buffer_size:
|
|
63
|
+
chunk_text = buffer[:self.buffer_size]
|
|
64
|
+
last_break_index = -1
|
|
65
|
+
|
|
66
|
+
# Find last sentence boundary using regex
|
|
67
|
+
for i in range(len(chunk_text) - 1, -1, -1):
|
|
68
|
+
if self.sentence_end_regex.match(chunk_text[:i + 1]):
|
|
69
|
+
last_break_index = i
|
|
70
|
+
break
|
|
71
|
+
|
|
72
|
+
if last_break_index == -1:
|
|
73
|
+
# Fallback to space if no sentence boundary found
|
|
74
|
+
last_space = chunk_text.rfind(' ')
|
|
75
|
+
if last_space != -1:
|
|
76
|
+
last_break_index = last_space
|
|
71
77
|
else:
|
|
72
|
-
|
|
73
|
-
self.queue.put(buffer[:self.buffer_size].replace("—", " ").strip())
|
|
74
|
-
buffer = buffer[self.buffer_size:]
|
|
78
|
+
last_break_index = self.buffer_size - 1
|
|
75
79
|
|
|
76
|
-
|
|
77
|
-
|
|
78
|
-
|
|
79
|
-
i += 1
|
|
80
|
+
# Add chunk to queue and update buffer
|
|
81
|
+
self.queue.put(f'{buffer[:last_break_index + 1].replace("—", " ").strip()} ')
|
|
82
|
+
buffer = buffer[last_break_index + 1:].strip()
|
|
80
83
|
|
|
84
|
+
# Don't forget the remaining text
|
|
81
85
|
if buffer:
|
|
82
|
-
self.queue.put(buffer.replace("—", " ").strip())
|
|
86
|
+
self.queue.put(f'{buffer.replace("—", " ").strip()} ')
|
|
83
87
|
|
|
84
|
-
self.stop_flag = True
|
|
88
|
+
self.stop_flag = True
|
|
85
89
|
|
|
86
90
|
|
|
87
|
-
|
|
88
|
-
"""
|
|
91
|
+
def _synthesize_sync(self, sentence: str, retries: int = 0) -> Optional[bytes]:
|
|
92
|
+
"""Synchronously synthesizes a given sentence."""
|
|
89
93
|
try:
|
|
90
|
-
return
|
|
94
|
+
return self.tts_instance.synthesize(sentence)
|
|
91
95
|
except APIError as e:
|
|
92
96
|
if retries < self.max_retries:
|
|
93
|
-
return
|
|
97
|
+
return self._synthesize_sync(sentence, retries + 1)
|
|
94
98
|
else:
|
|
95
|
-
|
|
96
|
-
|
|
97
|
-
|
|
99
|
+
raise APIError(f"Error: {e}. Retries Exhausted, for more information, visit https://waves.smallest.ai/")
|
|
100
|
+
|
|
98
101
|
|
|
99
|
-
def
|
|
100
|
-
"""
|
|
102
|
+
async def _synthesize_async(self, sentence: str, retries: int = 0) -> Optional[bytes]:
|
|
103
|
+
"""Asynchronously synthesizes a given sentence."""
|
|
101
104
|
try:
|
|
102
|
-
return self.tts_instance.synthesize(sentence)
|
|
105
|
+
return await self.tts_instance.synthesize(sentence)
|
|
103
106
|
except APIError as e:
|
|
104
107
|
if retries < self.max_retries:
|
|
105
|
-
return self.
|
|
108
|
+
return await self._synthesize_async(sentence, retries + 1)
|
|
106
109
|
else:
|
|
107
|
-
|
|
108
|
-
return None
|
|
110
|
+
raise APIError(f"Error: {e}. Retries Exhausted, for more information, visit https://waves.smallest.ai/")
|
|
109
111
|
|
|
110
112
|
|
|
111
113
|
async def _run_synthesis(self) -> AsyncGenerator[bytes, None]:
|
|
@@ -147,10 +149,13 @@ class TextToAudioStream:
|
|
|
147
149
|
- Streamed over a network
|
|
148
150
|
- Further processed as needed
|
|
149
151
|
"""
|
|
150
|
-
|
|
151
|
-
llm_thread.start()
|
|
152
|
+
stream_task = asyncio.create_task(self._stream_llm_output(llm_output))
|
|
152
153
|
|
|
153
|
-
|
|
154
|
-
|
|
154
|
+
try:
|
|
155
|
+
async for audio_content in self._run_synthesis():
|
|
156
|
+
yield audio_content
|
|
157
|
+
except Exception as e:
|
|
158
|
+
raise APIError(f"Error during synthesis processing: {e}")
|
|
155
159
|
|
|
156
|
-
|
|
160
|
+
finally:
|
|
161
|
+
await stream_task
|
smallest/tts.py
CHANGED
|
@@ -1,25 +1,23 @@
|
|
|
1
1
|
import os
|
|
2
|
+
import json
|
|
2
3
|
import wave
|
|
3
4
|
import copy
|
|
4
5
|
import requests
|
|
5
6
|
from typing import Optional, Union, List
|
|
6
7
|
|
|
7
|
-
from smallest.models import TTSModels, TTSVoices
|
|
8
8
|
from smallest.exceptions import TTSError, APIError
|
|
9
|
-
from smallest.utils import (TTSOptions, validate_input, preprocess_text, add_wav_header,
|
|
10
|
-
get_smallest_languages,
|
|
9
|
+
from smallest.utils import (TTSOptions, validate_input, preprocess_text, add_wav_header, chunk_text,
|
|
10
|
+
get_smallest_languages, get_smallest_models, ALLOWED_AUDIO_EXTENSIONS, API_BASE_URL)
|
|
11
11
|
|
|
12
12
|
class Smallest:
|
|
13
13
|
def __init__(
|
|
14
14
|
self,
|
|
15
15
|
api_key: str = None,
|
|
16
|
-
model:
|
|
17
|
-
sample_rate: int = 24000,
|
|
18
|
-
|
|
16
|
+
model: Optional[str] = "lightning",
|
|
17
|
+
sample_rate: Optional[int] = 24000,
|
|
18
|
+
voice_id: Optional[str] = "emily",
|
|
19
19
|
speed: Optional[float] = 1.0,
|
|
20
|
-
add_wav_header: Optional[bool] = True
|
|
21
|
-
transliterate: Optional[bool] = False,
|
|
22
|
-
remove_extra_silence: Optional[bool] = True
|
|
20
|
+
add_wav_header: Optional[bool] = True
|
|
23
21
|
) -> None:
|
|
24
22
|
"""
|
|
25
23
|
Smallest Instance for text-to-speech synthesis.
|
|
@@ -31,11 +29,9 @@ class Smallest:
|
|
|
31
29
|
- api_key (str): The API key for authentication, export it as 'SMALLEST_API_KEY' in your environment variables.
|
|
32
30
|
- model (TTSModels): The model to be used for synthesis.
|
|
33
31
|
- sample_rate (int): The sample rate for the audio output.
|
|
34
|
-
-
|
|
32
|
+
- voice_id (TTSVoices): The voice to be used for synthesis.
|
|
35
33
|
- speed (float): The speed of the speech synthesis.
|
|
36
34
|
- add_wav_header (bool): Whether to add a WAV header to the output audio.
|
|
37
|
-
- transliterate (bool): Whether to transliterate the text.
|
|
38
|
-
- remove_extra_silence (bool): Whether to remove extra silence from the synthesized audio.
|
|
39
35
|
|
|
40
36
|
Methods:
|
|
41
37
|
- get_languages: Returns a list of available languages for synthesis.
|
|
@@ -46,18 +42,18 @@ class Smallest:
|
|
|
46
42
|
self.api_key = api_key or os.environ.get("SMALLEST_API_KEY")
|
|
47
43
|
if not self.api_key:
|
|
48
44
|
raise TTSError()
|
|
49
|
-
|
|
45
|
+
if model == "lightning-large":
|
|
46
|
+
voice_id = "lakshya"
|
|
47
|
+
|
|
50
48
|
self.chunk_size = 250
|
|
51
49
|
|
|
52
50
|
self.opts = TTSOptions(
|
|
53
51
|
model=model,
|
|
54
52
|
sample_rate=sample_rate,
|
|
55
|
-
|
|
53
|
+
voice_id=voice_id,
|
|
56
54
|
api_key=self.api_key,
|
|
57
55
|
add_wav_header=add_wav_header,
|
|
58
|
-
speed=speed
|
|
59
|
-
transliterate=transliterate,
|
|
60
|
-
remove_extra_silence=remove_extra_silence
|
|
56
|
+
speed=speed
|
|
61
57
|
)
|
|
62
58
|
|
|
63
59
|
|
|
@@ -65,17 +61,46 @@ class Smallest:
|
|
|
65
61
|
"""Returns a list of available languages."""
|
|
66
62
|
return get_smallest_languages()
|
|
67
63
|
|
|
68
|
-
def
|
|
64
|
+
def get_cloned_voices(self) -> str:
|
|
65
|
+
"""Returns a list of your cloned voices."""
|
|
66
|
+
headers = {
|
|
67
|
+
"Authorization": f"Bearer {self.api_key}",
|
|
68
|
+
}
|
|
69
|
+
|
|
70
|
+
res = requests.request("GET", f"{API_BASE_URL}/lightning-large/get_cloned_voices", headers=headers)
|
|
71
|
+
if res.status_code != 200:
|
|
72
|
+
raise APIError(f"Failed to get cloned voices: {res.text}. For more information, visit https://waves.smallest.ai/")
|
|
73
|
+
|
|
74
|
+
return json.dumps(res.json(), indent=4, ensure_ascii=False)
|
|
75
|
+
|
|
76
|
+
|
|
77
|
+
def get_voices(
|
|
78
|
+
self,
|
|
79
|
+
model: Optional[str] = "lightning"
|
|
80
|
+
) -> str:
|
|
69
81
|
"""Returns a list of available voices."""
|
|
70
|
-
|
|
82
|
+
headers = {
|
|
83
|
+
"Authorization": f"Bearer {self.api_key}",
|
|
84
|
+
}
|
|
85
|
+
|
|
86
|
+
res = requests.request("GET", f"{API_BASE_URL}/{model}/get_voices", headers=headers)
|
|
87
|
+
if res.status_code != 200:
|
|
88
|
+
raise APIError(f"Failed to get voices: {res.text}. For more information, visit https://waves.smallest.ai/")
|
|
89
|
+
|
|
90
|
+
return json.dumps(res.json(), indent=4, ensure_ascii=False)
|
|
91
|
+
|
|
71
92
|
|
|
72
93
|
def get_models(self) -> List[str]:
|
|
73
94
|
"""Returns a list of available models."""
|
|
74
95
|
return get_smallest_models()
|
|
75
96
|
|
|
97
|
+
|
|
76
98
|
def synthesize(
|
|
77
99
|
self,
|
|
78
100
|
text: str,
|
|
101
|
+
consistency: Optional[float] = 0.5,
|
|
102
|
+
similarity: Optional[float] = 0,
|
|
103
|
+
enhancement: Optional[bool] = False,
|
|
79
104
|
save_as: Optional[str] = None,
|
|
80
105
|
**kwargs
|
|
81
106
|
) -> Union[bytes, None]:
|
|
@@ -86,6 +111,9 @@ class Smallest:
|
|
|
86
111
|
- text (str): The text to be converted to speech.
|
|
87
112
|
- save_as (Optional[str]): If provided, the synthesized audio will be saved to this file path.
|
|
88
113
|
The file must have a .wav extension.
|
|
114
|
+
- consistency (Optional[float]): This parameter controls word repetition and skipping. Decrease it to prevent skipped words, and increase it to prevent repetition. Only supported in `lightning-large` model.
|
|
115
|
+
- similarity (Optional[float]): This parameter controls the similarity between the synthesized audio and the reference audio. Increase it to make the speech more similar to the reference audio. Only supported in `lightning-large` model.
|
|
116
|
+
- enhancement (Optional[bool]): Enhances speech quality at the cost of increased latency. Only supported in `lightning-large` model.
|
|
89
117
|
- kwargs: Additional optional parameters to override `__init__` options for this call.
|
|
90
118
|
|
|
91
119
|
Returns:
|
|
@@ -97,26 +125,41 @@ class Smallest:
|
|
|
97
125
|
- APIError: If the API request fails or returns an error.
|
|
98
126
|
"""
|
|
99
127
|
opts = copy.deepcopy(self.opts)
|
|
128
|
+
valid_keys = set(vars(opts).keys())
|
|
129
|
+
|
|
130
|
+
invalid_keys = [key for key in kwargs if key not in valid_keys]
|
|
131
|
+
if invalid_keys:
|
|
132
|
+
raise ValueError(f"Invalid parameter(s) in kwargs: {', '.join(invalid_keys)}. Allowed parameters are: {', '.join(valid_keys)}")
|
|
133
|
+
|
|
100
134
|
for key, value in kwargs.items():
|
|
101
135
|
setattr(opts, key, value)
|
|
102
136
|
|
|
103
|
-
validate_input(preprocess_text(text), opts.
|
|
137
|
+
validate_input(preprocess_text(text), opts.model, opts.sample_rate, opts.speed, consistency, similarity, enhancement)
|
|
138
|
+
|
|
139
|
+
self.chunk_size = 250
|
|
140
|
+
if opts.model == "lightning-large":
|
|
141
|
+
self.chunk_size = 140
|
|
104
142
|
|
|
105
|
-
chunks =
|
|
143
|
+
chunks = chunk_text(text, self.chunk_size)
|
|
106
144
|
audio_content = b""
|
|
107
145
|
|
|
108
146
|
for chunk in chunks:
|
|
109
147
|
payload = {
|
|
110
148
|
"text": preprocess_text(chunk),
|
|
111
149
|
"sample_rate": opts.sample_rate,
|
|
112
|
-
"voice_id": opts.
|
|
150
|
+
"voice_id": opts.voice_id,
|
|
113
151
|
"add_wav_header": False,
|
|
114
152
|
"speed": opts.speed,
|
|
115
|
-
"model": opts.model,
|
|
116
|
-
"transliterate": opts.transliterate,
|
|
117
|
-
"remove_extra_silence": opts.remove_extra_silence,
|
|
118
153
|
}
|
|
119
154
|
|
|
155
|
+
if opts.model == "lightning-large":
|
|
156
|
+
if consistency:
|
|
157
|
+
payload["consistency"] = consistency
|
|
158
|
+
if similarity:
|
|
159
|
+
payload["similarity"] = similarity
|
|
160
|
+
if enhancement:
|
|
161
|
+
payload["enhancement"] = enhancement
|
|
162
|
+
|
|
120
163
|
headers = {
|
|
121
164
|
"Authorization": f"Bearer {self.api_key}",
|
|
122
165
|
"Content-Type": "application/json",
|
|
@@ -124,15 +167,10 @@ class Smallest:
|
|
|
124
167
|
|
|
125
168
|
res = requests.post(f"{API_BASE_URL}/{opts.model}/get_speech", json=payload, headers=headers)
|
|
126
169
|
if res.status_code != 200:
|
|
127
|
-
raise APIError(f"Failed to synthesize speech: {res.text}.
|
|
170
|
+
raise APIError(f"Failed to synthesize speech: {res.text}. For more information, visit https://waves.smallest.ai/")
|
|
128
171
|
|
|
129
172
|
audio_content += res.content
|
|
130
173
|
|
|
131
|
-
|
|
132
|
-
res = requests.post(f"{API_BASE_URL}/{opts.model}/get_speech", json=payload, headers=headers)
|
|
133
|
-
if res.status_code != 200:
|
|
134
|
-
raise APIError(f"Failed to synthesize speech: {res.text}. Please check if you have set the correct API key. For more information, visit https://waves.smallest.ai/")
|
|
135
|
-
|
|
136
174
|
if save_as:
|
|
137
175
|
if not save_as.endswith(".wav"):
|
|
138
176
|
raise TTSError("Invalid file name. Extension must be .wav")
|
|
@@ -140,11 +178,76 @@ class Smallest:
|
|
|
140
178
|
with wave.open(save_as, "wb") as wf:
|
|
141
179
|
wf.setnchannels(1)
|
|
142
180
|
wf.setsampwidth(2)
|
|
143
|
-
wf.setframerate(
|
|
181
|
+
wf.setframerate(opts.sample_rate)
|
|
144
182
|
wf.writeframes(audio_content)
|
|
145
183
|
return None
|
|
146
184
|
|
|
147
|
-
if
|
|
148
|
-
return add_wav_header(audio_content,
|
|
185
|
+
if opts.add_wav_header:
|
|
186
|
+
return add_wav_header(audio_content, opts.sample_rate)
|
|
149
187
|
|
|
150
188
|
return audio_content
|
|
189
|
+
|
|
190
|
+
|
|
191
|
+
def add_voice(self, display_name: str, file_path: str) -> str:
|
|
192
|
+
"""
|
|
193
|
+
Instantly clone your voice synchronously.
|
|
194
|
+
|
|
195
|
+
Args:
|
|
196
|
+
- display_name (str): The display name for the new voice.
|
|
197
|
+
- file_path (str): The path to the reference audio file to be cloned.
|
|
198
|
+
|
|
199
|
+
Returns:
|
|
200
|
+
- str: The response from the API as a formatted JSON string.
|
|
201
|
+
|
|
202
|
+
Raises:
|
|
203
|
+
- TTSError: If the file does not exist or is not a valid audio file.
|
|
204
|
+
- APIError: If the API request fails or returns an error.
|
|
205
|
+
"""
|
|
206
|
+
if not os.path.isfile(file_path):
|
|
207
|
+
raise TTSError("Invalid file path. File does not exist.")
|
|
208
|
+
|
|
209
|
+
file_extension = os.path.splitext(file_path)[1].lower()
|
|
210
|
+
if file_extension not in ALLOWED_AUDIO_EXTENSIONS:
|
|
211
|
+
raise TTSError(f"Invalid file type. Supported formats are: {ALLOWED_AUDIO_EXTENSIONS}")
|
|
212
|
+
|
|
213
|
+
url = f"{API_BASE_URL}/lightning-large/add_voice"
|
|
214
|
+
payload = {'displayName': display_name}
|
|
215
|
+
|
|
216
|
+
files = [('file', (os.path.basename(file_path), open(file_path, 'rb'), 'audio/wav'))]
|
|
217
|
+
|
|
218
|
+
headers = {
|
|
219
|
+
'Authorization': f"Bearer {self.api_key}",
|
|
220
|
+
}
|
|
221
|
+
|
|
222
|
+
response = requests.post(url, headers=headers, data=payload, files=files)
|
|
223
|
+
if response.status_code != 200:
|
|
224
|
+
raise APIError(f"Failed to add voice: {response.text}. For more information, visit https://waves.smallest.ai/")
|
|
225
|
+
|
|
226
|
+
return json.dumps(response.json(), indent=4, ensure_ascii=False)
|
|
227
|
+
|
|
228
|
+
|
|
229
|
+
def delete_voice(self, voice_id: str) -> str:
|
|
230
|
+
"""
|
|
231
|
+
Delete a cloned voice synchronously.
|
|
232
|
+
|
|
233
|
+
Args:
|
|
234
|
+
- voice_id (str): The ID of the voice to be deleted.
|
|
235
|
+
|
|
236
|
+
Returns:
|
|
237
|
+
- str: The response from the API.
|
|
238
|
+
|
|
239
|
+
Raises:
|
|
240
|
+
- APIError: If the API request fails or returns an error.
|
|
241
|
+
"""
|
|
242
|
+
url = f"{API_BASE_URL}/lightning-large"
|
|
243
|
+
payload = {'voiceId': voice_id}
|
|
244
|
+
|
|
245
|
+
headers = {
|
|
246
|
+
'Authorization': f"Bearer {self.api_key}",
|
|
247
|
+
}
|
|
248
|
+
|
|
249
|
+
response = requests.delete(url, headers=headers, json=payload)
|
|
250
|
+
if response.status_code != 200:
|
|
251
|
+
raise APIError(f"Failed to delete voice: {response.text}. For more information, visit https://waves.smallest.ai/")
|
|
252
|
+
|
|
253
|
+
return json.dumps(response.json(), indent=4, ensure_ascii=False)
|
smallest/utils.py
CHANGED
|
@@ -1,109 +1,97 @@
|
|
|
1
1
|
import re
|
|
2
2
|
import io
|
|
3
|
-
import unicodedata
|
|
4
3
|
from typing import List
|
|
4
|
+
from typing import Optional
|
|
5
5
|
from pydub import AudioSegment
|
|
6
6
|
from dataclasses import dataclass
|
|
7
7
|
from sacremoses import MosesPunctNormalizer
|
|
8
8
|
|
|
9
9
|
from smallest.exceptions import ValidationError
|
|
10
|
-
from smallest.models import TTSModels, TTSLanguages
|
|
10
|
+
from smallest.models import TTSModels, TTSLanguages
|
|
11
11
|
|
|
12
12
|
|
|
13
13
|
API_BASE_URL = "https://waves-api.smallest.ai/api/v1"
|
|
14
|
-
SENTENCE_END_REGEX = re.compile(r'.*[
|
|
15
|
-
|
|
14
|
+
SENTENCE_END_REGEX = re.compile(r'.*[-.—!?,;:…।|]$')
|
|
15
|
+
mpn = MosesPunctNormalizer()
|
|
16
16
|
SAMPLE_WIDTH = 2
|
|
17
17
|
CHANNELS = 1
|
|
18
|
+
ALLOWED_AUDIO_EXTENSIONS = ['.mp3', '.wav']
|
|
18
19
|
|
|
19
20
|
|
|
20
21
|
@dataclass
|
|
21
22
|
class TTSOptions:
|
|
22
|
-
model:
|
|
23
|
+
model: str
|
|
23
24
|
sample_rate: int
|
|
24
|
-
|
|
25
|
+
voice_id: str
|
|
25
26
|
api_key: str
|
|
26
27
|
add_wav_header: bool
|
|
27
28
|
speed: float
|
|
28
|
-
transliterate: bool
|
|
29
|
-
remove_extra_silence: bool
|
|
30
29
|
|
|
31
30
|
|
|
32
|
-
def validate_input(text: str,
|
|
31
|
+
def validate_input(text: str, model: str, sample_rate: int, speed: float, consistency: Optional[float] = None, similarity: Optional[float] = None, enhancement: Optional[bool] = None):
|
|
33
32
|
if not text:
|
|
34
|
-
raise ValidationError("Text cannot be empty")
|
|
35
|
-
if voice not in TTSVoices:
|
|
36
|
-
raise ValidationError(f"Invalid voice: {voice}")
|
|
33
|
+
raise ValidationError("Text cannot be empty.")
|
|
37
34
|
if model not in TTSModels:
|
|
38
|
-
raise ValidationError(f"Invalid model: {model}")
|
|
35
|
+
raise ValidationError(f"Invalid model: {model}. Must be one of {TTSModels}")
|
|
39
36
|
if not 8000 <= sample_rate <= 24000:
|
|
40
37
|
raise ValidationError(f"Invalid sample rate: {sample_rate}. Must be between 8000 and 24000")
|
|
41
38
|
if not 0.5 <= speed <= 2.0:
|
|
42
39
|
raise ValidationError(f"Invalid speed: {speed}. Must be between 0.5 and 2.0")
|
|
40
|
+
if consistency is not None and not 0.0 <= consistency <= 1.0:
|
|
41
|
+
raise ValidationError(f"Invalid consistency: {consistency}. Must be between 0.0 and 1.0")
|
|
42
|
+
if similarity is not None and not 0.0 <= similarity <= 1.0:
|
|
43
|
+
raise ValidationError(f"Invalid similarity: {similarity}. Must be between 0.0 and 1.0")
|
|
44
|
+
if enhancement is not None and not isinstance(enhancement, bool):
|
|
45
|
+
raise ValidationError(f"Invalid enhancement: {enhancement}. Must be a boolean value.")
|
|
43
46
|
|
|
44
47
|
|
|
45
48
|
def add_wav_header(frame_input: bytes, sample_rate: int = 24000, sample_width: int = 2, channels: int = 1) -> bytes:
|
|
46
|
-
|
|
47
|
-
|
|
48
|
-
|
|
49
|
-
|
|
50
|
-
|
|
49
|
+
audio = AudioSegment(data=frame_input, sample_width=sample_width, frame_rate=sample_rate, channels=channels)
|
|
50
|
+
wav_buf = io.BytesIO()
|
|
51
|
+
audio.export(wav_buf, format="wav")
|
|
52
|
+
wav_buf.seek(0)
|
|
53
|
+
return wav_buf.read()
|
|
51
54
|
|
|
52
55
|
|
|
53
56
|
def preprocess_text(text: str) -> str:
|
|
54
|
-
text = text.replace("\n", " ").replace("\t", " ").replace("—", " ")
|
|
57
|
+
text = text.replace("\n", " ").replace("\t", " ").replace("—", " ").replace("-", " ").replace("–", " ")
|
|
55
58
|
text = re.sub(r'\s+', ' ', text)
|
|
56
|
-
mpn = MosesPunctNormalizer()
|
|
57
59
|
text = mpn.normalize(text)
|
|
58
60
|
return text.strip()
|
|
59
61
|
|
|
60
62
|
|
|
61
|
-
def
|
|
62
|
-
"""
|
|
63
|
-
Splits the input text into chunks based on sentence boundaries
|
|
64
|
-
defined by SENTENCE_END_REGEX and the maximum chunk size.
|
|
65
|
-
Only splits at valid sentence boundaries to avoid breaking words.
|
|
66
|
-
"""
|
|
63
|
+
def chunk_text(text: str, chunk_size: int = 250) -> List[str]:
|
|
67
64
|
chunks = []
|
|
68
65
|
while text:
|
|
69
|
-
|
|
70
|
-
if len(text) <= CHUNK_SIZE:
|
|
66
|
+
if len(text) <= chunk_size:
|
|
71
67
|
chunks.append(text.strip())
|
|
72
68
|
break
|
|
73
69
|
|
|
74
|
-
|
|
75
|
-
chunk_text = text[:CHUNK_SIZE]
|
|
70
|
+
chunk_text = text[:chunk_size]
|
|
76
71
|
last_break_index = -1
|
|
77
72
|
|
|
78
|
-
#
|
|
73
|
+
# Find last sentence boundary using regex
|
|
79
74
|
for i in range(len(chunk_text) - 1, -1, -1):
|
|
80
|
-
if chunk_text[i
|
|
75
|
+
if SENTENCE_END_REGEX.match(chunk_text[:i + 1]):
|
|
81
76
|
last_break_index = i
|
|
82
77
|
break
|
|
83
78
|
|
|
84
79
|
if last_break_index == -1:
|
|
85
|
-
#
|
|
86
|
-
# to avoid breaking words
|
|
80
|
+
# Fallback to space if no sentence boundary found
|
|
87
81
|
last_space = chunk_text.rfind(' ')
|
|
88
82
|
if last_space != -1:
|
|
89
|
-
last_break_index = last_space
|
|
83
|
+
last_break_index = last_space
|
|
90
84
|
else:
|
|
91
|
-
|
|
92
|
-
last_break_index = CHUNK_SIZE - 1
|
|
85
|
+
last_break_index = chunk_size - 1
|
|
93
86
|
|
|
94
|
-
# Add the chunk up to the break point
|
|
95
87
|
chunks.append(text[:last_break_index + 1].strip())
|
|
96
|
-
# Continue with remaining text
|
|
97
88
|
text = text[last_break_index + 1:].strip()
|
|
98
89
|
|
|
99
90
|
return chunks
|
|
100
91
|
|
|
101
92
|
|
|
102
93
|
def get_smallest_languages() -> List[str]:
|
|
103
|
-
return
|
|
104
|
-
|
|
105
|
-
def get_smallest_voices() -> List[str]:
|
|
106
|
-
return list(TTSVoices)
|
|
94
|
+
return TTSLanguages
|
|
107
95
|
|
|
108
96
|
def get_smallest_models() -> List[str]:
|
|
109
|
-
return
|
|
97
|
+
return TTSModels
|
|
@@ -1,6 +1,6 @@
|
|
|
1
|
-
Metadata-Version: 2.
|
|
1
|
+
Metadata-Version: 2.2
|
|
2
2
|
Name: smallestai
|
|
3
|
-
Version: 1.
|
|
3
|
+
Version: 2.1.0
|
|
4
4
|
Summary: Official Python client for the Smallest AI API
|
|
5
5
|
Author-email: Smallest <support@smallest.ai>
|
|
6
6
|
License: MIT
|
|
@@ -55,9 +55,15 @@ Currently, the library supports direct synthesis and the ability to synthesize s
|
|
|
55
55
|
- [Get the API Key](#get-the-api-key)
|
|
56
56
|
- [Best Practices for Input Text](#best-practices-for-input-text)
|
|
57
57
|
- [Examples](#examples)
|
|
58
|
-
- [
|
|
59
|
-
- [
|
|
58
|
+
- [Synchronous](#synchronous)
|
|
59
|
+
- [Asynchronous](#asynchronous)
|
|
60
60
|
- [LLM to Speech](#llm-to-speech)
|
|
61
|
+
- [Add your Voice](#add-your-voice)
|
|
62
|
+
- [Synchronously](#add-synchronously)
|
|
63
|
+
- [Asynchronously](#add-asynchronously)
|
|
64
|
+
- [Delete your Voice](#delete-your-voice)
|
|
65
|
+
- [Synchronously](#delete-synchronously)
|
|
66
|
+
- [Asynchronously](#delete-asynchronously)
|
|
61
67
|
- [Available Methods](#available-methods)
|
|
62
68
|
- [Technical Note: WAV Headers in Streaming Audio](#technical-note-wav-headers-in-streaming-audio)
|
|
63
69
|
|
|
@@ -77,28 +83,22 @@ When using an SDK in your application, make sure to pin to at least the major ve
|
|
|
77
83
|
3. Create a new API Key and copy it.
|
|
78
84
|
4. Export the API Key in your environment with the name `SMALLEST_API_KEY`, ensuring that your application can access it securely for authentication.
|
|
79
85
|
|
|
80
|
-
## Best Practices for Input Text
|
|
81
|
-
While the `transliterate` parameter is provided, please note that it is not fully supported and may not perform consistently across all cases. It is recommended to use the model without relying on this parameter.
|
|
82
|
-
|
|
83
|
-
For optimal voice generation results:
|
|
84
|
-
|
|
85
|
-
1. For English, provide the input in Latin script (e.g., "Hello, how are you?").
|
|
86
|
-
2. For Hindi, provide the input in Devanagari script (e.g., "नमस्ते, आप कैसे हैं?").
|
|
87
|
-
3. For code-mixed input, use Latin script for English and Devanagari script for Hindi (e.g., "Hello, आप कैसे हैं?").
|
|
88
86
|
|
|
89
87
|
## Examples
|
|
90
88
|
|
|
91
|
-
###
|
|
89
|
+
### Synchronous
|
|
92
90
|
A synchronous text-to-speech synthesis client.
|
|
93
91
|
|
|
94
92
|
**Basic Usage:**
|
|
95
93
|
```python
|
|
96
|
-
import os
|
|
97
94
|
from smallest import Smallest
|
|
98
95
|
|
|
99
96
|
def main():
|
|
100
|
-
client = Smallest(api_key=
|
|
101
|
-
client.synthesize(
|
|
97
|
+
client = Smallest(api_key="SMALLEST_API_KEY")
|
|
98
|
+
client.synthesize(
|
|
99
|
+
text="Hello, this is a test for sync synthesis function.",
|
|
100
|
+
save_as="sync_synthesize.wav"
|
|
101
|
+
)
|
|
102
102
|
|
|
103
103
|
if __name__ == "__main__":
|
|
104
104
|
main()
|
|
@@ -108,11 +108,12 @@ if __name__ == "__main__":
|
|
|
108
108
|
- `api_key`: Your API key (can be set via SMALLEST_API_KEY environment variable)
|
|
109
109
|
- `model`: TTS model to use (default: "lightning")
|
|
110
110
|
- `sample_rate`: Audio sample rate (default: 24000)
|
|
111
|
-
- `
|
|
111
|
+
- `voice_id`: Voice ID (default: "emily")
|
|
112
112
|
- `speed`: Speech speed multiplier (default: 1.0)
|
|
113
|
-
- `
|
|
114
|
-
- `
|
|
115
|
-
- `
|
|
113
|
+
- `consistency`: Controls word repetition and skipping. Decrease it to prevent skipped words, and increase it to prevent repetition. Only supported in `lightning-large` model. (default: 0.5)
|
|
114
|
+
- `similarity`: Controls the similarity between the synthesized audio and the reference audio. Increase it to make the speech more similar to the reference audio. Only supported in `lightning-large` model. (default: 0)
|
|
115
|
+
- `enhancement`: Enhances speech quality at the cost of increased latency. Only supported in `lightning-large` model. (default: False)
|
|
116
|
+
- `add_wav_header`: Whether to add a WAV header to the output audio.
|
|
116
117
|
|
|
117
118
|
These parameters are part of the `Smallest` instance. They can be set when creating the instance (as shown above). However, the `synthesize` function also accepts `kwargs`, allowing you to override these parameters for a specific synthesis request.
|
|
118
119
|
|
|
@@ -127,19 +128,17 @@ client.synthesize(
|
|
|
127
128
|
```
|
|
128
129
|
|
|
129
130
|
|
|
130
|
-
###
|
|
131
|
+
### Asynchronous
|
|
131
132
|
Asynchronous text-to-speech synthesis client.
|
|
132
133
|
|
|
133
134
|
**Basic Usage:**
|
|
134
135
|
```python
|
|
135
|
-
import os
|
|
136
136
|
import asyncio
|
|
137
137
|
import aiofiles
|
|
138
138
|
from smallest import AsyncSmallest
|
|
139
139
|
|
|
140
|
-
client = AsyncSmallest(api_key=os.environ.get("SMALLEST_API_KEY"))
|
|
141
|
-
|
|
142
140
|
async def main():
|
|
141
|
+
client = AsyncSmallest(api_key="SMALLEST_API_KEY")
|
|
143
142
|
async with client as tts:
|
|
144
143
|
audio_bytes = await tts.synthesize("Hello, this is a test of the async synthesis function.")
|
|
145
144
|
async with aiofiles.open("async_synthesize.wav", "wb") as f:
|
|
@@ -149,15 +148,33 @@ if __name__ == "__main__":
|
|
|
149
148
|
asyncio.run(main())
|
|
150
149
|
```
|
|
151
150
|
|
|
151
|
+
**Running Asynchronously in a Jupyter Notebook**
|
|
152
|
+
If you are using a Jupyter Notebook, use the following approach to execute the asynchronous function within an existing event loop:
|
|
153
|
+
```python
|
|
154
|
+
import asyncio
|
|
155
|
+
import aiofiles
|
|
156
|
+
from smallest import AsyncSmallest
|
|
157
|
+
|
|
158
|
+
async def main():
|
|
159
|
+
client = AsyncSmallest(api_key="SMALLEST_API_KEY")
|
|
160
|
+
async with client as tts:
|
|
161
|
+
audio_bytes = await tts.synthesize("Hello, this is a test of the async synthesis function.")
|
|
162
|
+
async with aiofiles.open("async_synthesize.wav", "wb") as f:
|
|
163
|
+
await f.write(audio_bytes) # alternatively you can use the `save_as` parameter.
|
|
164
|
+
|
|
165
|
+
await main()
|
|
166
|
+
```
|
|
167
|
+
|
|
152
168
|
**Parameters:**
|
|
153
169
|
- `api_key`: Your API key (can be set via SMALLEST_API_KEY environment variable)
|
|
154
170
|
- `model`: TTS model to use (default: "lightning")
|
|
155
171
|
- `sample_rate`: Audio sample rate (default: 24000)
|
|
156
|
-
- `
|
|
172
|
+
- `voice_id`: Voice ID (default: "emily")
|
|
157
173
|
- `speed`: Speech speed multiplier (default: 1.0)
|
|
158
|
-
- `
|
|
159
|
-
- `
|
|
160
|
-
- `
|
|
174
|
+
- `consistency`: Controls word repetition and skipping. Decrease it to prevent skipped words, and increase it to prevent repetition. Only supported in `lightning-large` model.
|
|
175
|
+
- `similarity`: Controls the similarity between the synthesized audio and the reference audio. Increase it to make the speech more similar to the reference audio. Only supported in `lightning-large` model.
|
|
176
|
+
- `enhancement`: Enhances speech quality at the cost of increased latency. Only supported in `lightning-large` model.
|
|
177
|
+
- `add_wav_header`: Whether to add a WAV header to the output audio.
|
|
161
178
|
|
|
162
179
|
These parameters are part of the `AsyncSmallest` instance. They can be set when creating the instance (as shown above). However, the `synthesize` function also accepts `kwargs`, allowing you to override any of these parameters on a per-request basis.
|
|
163
180
|
|
|
@@ -174,16 +191,66 @@ audio_bytes = await tts.synthesize(
|
|
|
174
191
|
|
|
175
192
|
The `TextToAudioStream` class provides real-time text-to-speech processing, converting streaming text into audio output. It's particularly useful for applications like voice assistants, live captioning, or interactive chatbots that require immediate audio feedback from text generation. It supports both synchronous and asynchronous TTS instances.
|
|
176
193
|
|
|
194
|
+
#### Stream through a WebSocket
|
|
195
|
+
|
|
196
|
+
```python
|
|
197
|
+
import asyncio
|
|
198
|
+
import websockets
|
|
199
|
+
from groq import Groq
|
|
200
|
+
from smallest import Smallest, TextToAudioStream
|
|
201
|
+
|
|
202
|
+
# Initialize Groq (LLM) and Smallest (TTS) instances
|
|
203
|
+
llm = Groq(api_key="GROQ_API_KEY")
|
|
204
|
+
tts = Smallest(api_key="SMALLEST_API_KEY")
|
|
205
|
+
WEBSOCKET_URL = "wss://echo.websocket.events" # Mock WebSocket server
|
|
206
|
+
|
|
207
|
+
# Async function to stream text generation from LLM
|
|
208
|
+
async def generate_text(prompt):
|
|
209
|
+
completion = llm.chat.completions.create(
|
|
210
|
+
messages=[{"role": "user", "content": prompt}],
|
|
211
|
+
model="llama3-8b-8192",
|
|
212
|
+
stream=True,
|
|
213
|
+
)
|
|
214
|
+
|
|
215
|
+
# Yield text as it is generated
|
|
216
|
+
for chunk in completion:
|
|
217
|
+
text = chunk.choices[0].delta.content
|
|
218
|
+
if text:
|
|
219
|
+
yield text
|
|
220
|
+
|
|
221
|
+
# Main function to run the process
|
|
222
|
+
async def main():
|
|
223
|
+
# Initialize the TTS processor
|
|
224
|
+
processor = TextToAudioStream(tts_instance=tts)
|
|
225
|
+
|
|
226
|
+
# Generate text from LLM
|
|
227
|
+
llm_output = generate_text("Explain text to speech like I am five in 5 sentences.")
|
|
228
|
+
|
|
229
|
+
# Stream the generated speech through a websocket
|
|
230
|
+
async with websockets.connect(WEBSOCKET_URL) as ws:
|
|
231
|
+
print("Connected to WebSocket server.")
|
|
232
|
+
|
|
233
|
+
# Stream the generated speech
|
|
234
|
+
async for audio_chunk in processor.process(llm_output):
|
|
235
|
+
await ws.send(audio_chunk) # Send audio chunk
|
|
236
|
+
echoed_data = await ws.recv() # Receive the echoed message
|
|
237
|
+
print("Received from server:", echoed_data[:20], "...") # Print first 20 bytes
|
|
238
|
+
|
|
239
|
+
print("WebSocket connection closed.")
|
|
240
|
+
|
|
241
|
+
if __name__ == "__main__":
|
|
242
|
+
asyncio.run(main())
|
|
243
|
+
```
|
|
244
|
+
|
|
245
|
+
#### Save to a File
|
|
177
246
|
```python
|
|
178
|
-
import os
|
|
179
247
|
import wave
|
|
180
248
|
import asyncio
|
|
181
249
|
from groq import Groq
|
|
182
|
-
from smallest import Smallest
|
|
183
|
-
from smallest import TextToAudioStream
|
|
250
|
+
from smallest import Smallest, TextToAudioStream
|
|
184
251
|
|
|
185
|
-
llm = Groq(api_key=
|
|
186
|
-
tts = Smallest(api_key=
|
|
252
|
+
llm = Groq(api_key="GROQ_API_KEY")
|
|
253
|
+
tts = Smallest(api_key="SMALLEST_API_KEY")
|
|
187
254
|
|
|
188
255
|
async def generate_text(prompt):
|
|
189
256
|
"""Async generator for streaming text from Groq. You can use any LLM"""
|
|
@@ -240,16 +307,76 @@ The processor yields raw audio data chunks without WAV headers for streaming eff
|
|
|
240
307
|
- Streamed over a network
|
|
241
308
|
- Further processed as needed
|
|
242
309
|
|
|
310
|
+
## Add your Voice
|
|
311
|
+
The Smallest AI SDK allows you to clone your voice by uploading an audio file. This feature is available both synchronously and asynchronously, making it flexible for different use cases. Below are examples of how to use this functionality.
|
|
312
|
+
|
|
313
|
+
### Add Synchronously
|
|
314
|
+
```python
|
|
315
|
+
from smallest import Smallest
|
|
316
|
+
|
|
317
|
+
def main():
|
|
318
|
+
client = Smallest(api_key="SMALLEST_API_KEY")
|
|
319
|
+
res = client.add_voice(display_name="My Voice", file_path="my_voice.wav")
|
|
320
|
+
print(res)
|
|
321
|
+
|
|
322
|
+
if __name__ == "__main__":
|
|
323
|
+
main()
|
|
324
|
+
```
|
|
325
|
+
|
|
326
|
+
### Add Asynchronously
|
|
327
|
+
```python
|
|
328
|
+
import asyncio
|
|
329
|
+
from smallest import AsyncSmallest
|
|
330
|
+
|
|
331
|
+
async def main():
|
|
332
|
+
client = AsyncSmallest(api_key="SMALLEST_API_KEY")
|
|
333
|
+
res = await client.add_voice(display_name="My Voice", file_path="my_voice.wav")
|
|
334
|
+
print(res)
|
|
335
|
+
|
|
336
|
+
if __name__ == "__main__":
|
|
337
|
+
asyncio.run(main())
|
|
338
|
+
```
|
|
339
|
+
|
|
340
|
+
## Delete your Voice
|
|
341
|
+
The Smallest AI SDK allows you to delete your cloned voice. This feature is available both synchronously and asynchronously, making it flexible for different use cases. Below are examples of how to use this functionality.
|
|
342
|
+
|
|
343
|
+
### Delete Synchronously
|
|
344
|
+
```python
|
|
345
|
+
from smallest import Smallest
|
|
346
|
+
|
|
347
|
+
def main():
|
|
348
|
+
client = Smallest(api_key="SMALLEST_API_KEY")
|
|
349
|
+
res = client.delete_voice(voice_id="voice_id")
|
|
350
|
+
print(res)
|
|
351
|
+
|
|
352
|
+
if __name__ == "__main__":
|
|
353
|
+
main()
|
|
354
|
+
```
|
|
355
|
+
|
|
356
|
+
### Delete Asynchronously
|
|
357
|
+
```python
|
|
358
|
+
import asyncio
|
|
359
|
+
from smallest import AsyncSmallest
|
|
360
|
+
|
|
361
|
+
async def main():
|
|
362
|
+
client = AsyncSmallest(api_key="SMALLEST_API_KEY")
|
|
363
|
+
res = await client.delete_voice(voice_id="voice_id")
|
|
364
|
+
print(res)
|
|
365
|
+
|
|
366
|
+
if __name__ == "__main__":
|
|
367
|
+
asyncio.run(main())
|
|
368
|
+
```
|
|
243
369
|
|
|
244
370
|
## Available Methods
|
|
245
371
|
|
|
246
372
|
```python
|
|
247
|
-
from smallest
|
|
373
|
+
from smallest import Smallest
|
|
248
374
|
|
|
249
|
-
client = Smallest(api_key=
|
|
375
|
+
client = Smallest(api_key="SMALLEST_API_KEY")
|
|
250
376
|
|
|
251
|
-
print(f"
|
|
252
|
-
print(f"Available Voices: {client.get_voices()}")
|
|
377
|
+
print(f"Available Languages: {client.get_languages()}")
|
|
378
|
+
print(f"Available Voices: {client.get_voices(model='lightning')}")
|
|
379
|
+
print(f"Available Cloned Voices: {client.get_cloned_voices()}")
|
|
253
380
|
print(f"Available Models: {client.get_models()}")
|
|
254
381
|
```
|
|
255
382
|
|
|
@@ -0,0 +1,12 @@
|
|
|
1
|
+
smallest/__init__.py,sha256=vaoIBml_IobavpVvFazB86iikg2iEy4h3ddxqv_0Fy4,190
|
|
2
|
+
smallest/async_tts.py,sha256=Jr7IID5tJrnMx_d2217foUJqfFvAFsddvy_0HG5tKGc,11905
|
|
3
|
+
smallest/exceptions.py,sha256=nY6I8fCXe2By54CytQ0-i3hFiYtt8TYAKj0g6OYsCjc,585
|
|
4
|
+
smallest/models.py,sha256=g2e_4nU5P48vyXZandKLWqZC1TkoEGeLvYKqJIqurSI,83
|
|
5
|
+
smallest/stream_tts.py,sha256=dUxoY0VkXecsMZ41QA8RkX4t_pD5-7mMIJhaB01tQrk,6512
|
|
6
|
+
smallest/tts.py,sha256=bSL7EYmLpd5yT42dbUXVb-IgZ_xIcXpyHvCu2-hHtMs,10024
|
|
7
|
+
smallest/utils.py,sha256=HDpDjPkUeeQLqDhrV-zPTLtOH9hJueae0q9SNq486GQ,3396
|
|
8
|
+
smallestai-2.1.0.dist-info/LICENSE,sha256=kK3HNKhN7luQhkjkNWIvy9_gizbEDUM4mSv_HWq9uuM,1068
|
|
9
|
+
smallestai-2.1.0.dist-info/METADATA,sha256=BwCUFiVZTRActimZBQcPJg8vHJy0M-6vYA_yHvaFpDk,14904
|
|
10
|
+
smallestai-2.1.0.dist-info/WHEEL,sha256=jB7zZ3N9hIM9adW7qlTAyycLYW9npaWKLRzaoVcLKcM,91
|
|
11
|
+
smallestai-2.1.0.dist-info/top_level.txt,sha256=i5ktbWkG-2aS28vrYTeuhKtA-tY2ZG7SHgLHi87QTLw,9
|
|
12
|
+
smallestai-2.1.0.dist-info/RECORD,,
|
|
@@ -1,12 +0,0 @@
|
|
|
1
|
-
smallest/__init__.py,sha256=vaoIBml_IobavpVvFazB86iikg2iEy4h3ddxqv_0Fy4,190
|
|
2
|
-
smallest/async_tts.py,sha256=2BrNMxq0PDtF7CCZqYPnrp9D0qxZndCgT31EbdrnV-E,6084
|
|
3
|
-
smallest/exceptions.py,sha256=nY6I8fCXe2By54CytQ0-i3hFiYtt8TYAKj0g6OYsCjc,585
|
|
4
|
-
smallest/models.py,sha256=Ndmek9f5VWDjxaNPfSmNk-xP55Y6uXzkzI5V54FnuvU,771
|
|
5
|
-
smallest/stream_tts.py,sha256=9sSGR9F_BiSSB1IsiUJP-How0t4-3qdYyTJ-H7ESkMk,6230
|
|
6
|
-
smallest/tts.py,sha256=Km3-rFf4D_-XXLi8CAVsiYrw5D-OQRLDHl-LTUh83ec,6030
|
|
7
|
-
smallest/utils.py,sha256=kIlS3wQaICT3R4B8R3HpywmXMABJUkCgbvFziStfno8,3527
|
|
8
|
-
smallestai-1.3.4.dist-info/LICENSE,sha256=kK3HNKhN7luQhkjkNWIvy9_gizbEDUM4mSv_HWq9uuM,1068
|
|
9
|
-
smallestai-1.3.4.dist-info/METADATA,sha256=yfREoK1kPNxKBTWcE_aRp8ByEF-m86nkdyiiBpF2Q4k,10584
|
|
10
|
-
smallestai-1.3.4.dist-info/WHEEL,sha256=PZUExdf71Ui_so67QXpySuHtCi3-J3wvF4ORK6k_S8U,91
|
|
11
|
-
smallestai-1.3.4.dist-info/top_level.txt,sha256=i5ktbWkG-2aS28vrYTeuhKtA-tY2ZG7SHgLHi87QTLw,9
|
|
12
|
-
smallestai-1.3.4.dist-info/RECORD,,
|
|
File without changes
|
|
File without changes
|