smallestai 2.0.0__py3-none-any.whl → 2.1.0__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Potentially problematic release.
This version of smallestai might be problematic. Click here for more details.
- smallest/async_tts.py +71 -20
- smallest/stream_tts.py +37 -29
- smallest/tts.py +55 -16
- smallest/utils.py +14 -13
- {smallestai-2.0.0.dist-info → smallestai-2.1.0.dist-info}/METADATA +118 -23
- smallestai-2.1.0.dist-info/RECORD +12 -0
- {smallestai-2.0.0.dist-info → smallestai-2.1.0.dist-info}/WHEEL +1 -1
- smallestai-2.0.0.dist-info/RECORD +0 -12
- {smallestai-2.0.0.dist-info → smallestai-2.1.0.dist-info}/LICENSE +0 -0
- {smallestai-2.0.0.dist-info → smallestai-2.1.0.dist-info}/top_level.txt +0 -0
smallest/async_tts.py
CHANGED
|
@@ -8,7 +8,7 @@ from typing import Optional, Union, List
|
|
|
8
8
|
|
|
9
9
|
from smallest.exceptions import TTSError, APIError
|
|
10
10
|
from smallest.utils import (TTSOptions, validate_input, preprocess_text, add_wav_header, chunk_text,
|
|
11
|
-
get_smallest_languages, get_smallest_models, API_BASE_URL)
|
|
11
|
+
get_smallest_languages, get_smallest_models, ALLOWED_AUDIO_EXTENSIONS, API_BASE_URL)
|
|
12
12
|
|
|
13
13
|
|
|
14
14
|
class AsyncSmallest:
|
|
@@ -19,9 +19,7 @@ class AsyncSmallest:
|
|
|
19
19
|
sample_rate: Optional[int] = 24000,
|
|
20
20
|
voice_id: Optional[str] = "emily",
|
|
21
21
|
speed: Optional[float] = 1.0,
|
|
22
|
-
add_wav_header: Optional[bool] = True
|
|
23
|
-
transliterate: Optional[bool] = False,
|
|
24
|
-
remove_extra_silence: Optional[bool] = False
|
|
22
|
+
add_wav_header: Optional[bool] = True
|
|
25
23
|
) -> None:
|
|
26
24
|
"""
|
|
27
25
|
AsyncSmallest Instance for asynchronous text-to-speech synthesis.
|
|
@@ -37,8 +35,6 @@ class AsyncSmallest:
|
|
|
37
35
|
- voice_id (TTSVoices): The voice to be used for synthesis.
|
|
38
36
|
- speed (float): The speed of the speech synthesis.
|
|
39
37
|
- add_wav_header (bool): Whether to add a WAV header to the output audio.
|
|
40
|
-
- transliterate (bool): Whether to transliterate the text.
|
|
41
|
-
- remove_extra_silence (bool): Whether to remove extra silence from the synthesized audio.
|
|
42
38
|
|
|
43
39
|
Methods:
|
|
44
40
|
- get_languages: Returns a list of available languages for synthesis.
|
|
@@ -49,6 +45,9 @@ class AsyncSmallest:
|
|
|
49
45
|
self.api_key = api_key or os.environ.get("SMALLEST_API_KEY")
|
|
50
46
|
if not self.api_key:
|
|
51
47
|
raise TTSError()
|
|
48
|
+
if model == "lightning-large":
|
|
49
|
+
voice_id = "lakshya"
|
|
50
|
+
|
|
52
51
|
self.chunk_size = 250
|
|
53
52
|
|
|
54
53
|
self.opts = TTSOptions(
|
|
@@ -57,9 +56,7 @@ class AsyncSmallest:
|
|
|
57
56
|
voice_id=voice_id,
|
|
58
57
|
api_key=self.api_key,
|
|
59
58
|
add_wav_header=add_wav_header,
|
|
60
|
-
speed=speed
|
|
61
|
-
transliterate=transliterate,
|
|
62
|
-
remove_extra_silence=remove_extra_silence,
|
|
59
|
+
speed=speed
|
|
63
60
|
)
|
|
64
61
|
self.session = None
|
|
65
62
|
|
|
@@ -124,6 +121,9 @@ class AsyncSmallest:
|
|
|
124
121
|
async def synthesize(
|
|
125
122
|
self,
|
|
126
123
|
text: str,
|
|
124
|
+
consistency: Optional[float] = 0.5,
|
|
125
|
+
similarity: Optional[float] = 0,
|
|
126
|
+
enhancement: Optional[bool] = False,
|
|
127
127
|
save_as: Optional[str] = None,
|
|
128
128
|
**kwargs
|
|
129
129
|
) -> Union[bytes, None]:
|
|
@@ -134,6 +134,9 @@ class AsyncSmallest:
|
|
|
134
134
|
- text (str): The text to be converted to speech.
|
|
135
135
|
- save_as (Optional[str]): If provided, the synthesized audio will be saved to this file path.
|
|
136
136
|
The file must have a .wav extension.
|
|
137
|
+
- consistency (Optional[float]): This parameter controls word repetition and skipping. Decrease it to prevent skipped words, and increase it to prevent repetition. Only supported in `lightning-large` model.
|
|
138
|
+
- similarity (Optional[float]): This parameter controls the similarity between the synthesized audio and the reference audio. Increase it to make the speech more similar to the reference audio. Only supported in `lightning-large` model.
|
|
139
|
+
- enhancement (Optional[bool]): Enhances speech quality at the cost of increased latency. Only supported in `lightning-large` model.
|
|
137
140
|
- kwargs: Additional optional parameters to override `__init__` options for this call.
|
|
138
141
|
|
|
139
142
|
Returns:
|
|
@@ -143,18 +146,29 @@ class AsyncSmallest:
|
|
|
143
146
|
Raises:
|
|
144
147
|
- TTSError: If the provided file name does not have a .wav extension when `save_as` is specified.
|
|
145
148
|
- APIError: If the API request fails or returns an error.
|
|
149
|
+
- ValueError: If an unexpected parameter is passed in `kwargs`.
|
|
146
150
|
"""
|
|
147
|
-
should_cleanup =
|
|
151
|
+
should_cleanup = False
|
|
152
|
+
|
|
153
|
+
if self.session is None or self.session.closed:
|
|
154
|
+
self.session = aiohttp.ClientSession()
|
|
155
|
+
should_cleanup = True # Cleanup only if we created a new session
|
|
148
156
|
|
|
149
157
|
try:
|
|
150
158
|
opts = copy.deepcopy(self.opts)
|
|
159
|
+
valid_keys = set(vars(opts).keys())
|
|
160
|
+
|
|
161
|
+
invalid_keys = [key for key in kwargs if key not in valid_keys]
|
|
162
|
+
if invalid_keys:
|
|
163
|
+
raise ValueError(f"Invalid parameter(s) in kwargs: {', '.join(invalid_keys)}. Allowed parameters are: {', '.join(valid_keys)}")
|
|
164
|
+
|
|
151
165
|
for key, value in kwargs.items():
|
|
152
166
|
setattr(opts, key, value)
|
|
153
167
|
|
|
154
|
-
validate_input(preprocess_text(text), opts.model, opts.sample_rate, opts.speed)
|
|
168
|
+
validate_input(preprocess_text(text), opts.model, opts.sample_rate, opts.speed, consistency, similarity, enhancement)
|
|
155
169
|
|
|
156
170
|
self.chunk_size = 250
|
|
157
|
-
if opts.model == '
|
|
171
|
+
if opts.model == 'lightning-large':
|
|
158
172
|
self.chunk_size = 140
|
|
159
173
|
|
|
160
174
|
chunks = chunk_text(text, self.chunk_size)
|
|
@@ -167,19 +181,23 @@ class AsyncSmallest:
|
|
|
167
181
|
"voice_id": opts.voice_id,
|
|
168
182
|
"add_wav_header": False,
|
|
169
183
|
"speed": opts.speed,
|
|
170
|
-
"model": opts.model
|
|
171
|
-
"transliterate": opts.transliterate,
|
|
172
|
-
"remove_extra_silence": opts.remove_extra_silence
|
|
184
|
+
"model": opts.model
|
|
173
185
|
}
|
|
186
|
+
|
|
187
|
+
if opts.model == "lightning-large":
|
|
188
|
+
if consistency:
|
|
189
|
+
payload["consistency"] = consistency
|
|
190
|
+
if similarity:
|
|
191
|
+
payload["similarity"] = similarity
|
|
192
|
+
if enhancement:
|
|
193
|
+
payload["enhancement"] = enhancement
|
|
194
|
+
|
|
174
195
|
|
|
175
196
|
headers = {
|
|
176
197
|
"Authorization": f"Bearer {self.api_key}",
|
|
177
198
|
"Content-Type": "application/json",
|
|
178
199
|
}
|
|
179
200
|
|
|
180
|
-
if not self.session:
|
|
181
|
-
self.session = aiohttp.ClientSession()
|
|
182
|
-
|
|
183
201
|
async with self.session.post(f"{API_BASE_URL}/{opts.model}/get_speech", json=payload, headers=headers) as res:
|
|
184
202
|
if res.status != 200:
|
|
185
203
|
raise APIError(f"Failed to synthesize speech: {await res.text()}. For more information, visit https://waves.smallest.ai/")
|
|
@@ -199,7 +217,7 @@ class AsyncSmallest:
|
|
|
199
217
|
return add_wav_header(audio_content, opts.sample_rate)
|
|
200
218
|
|
|
201
219
|
return audio_content
|
|
202
|
-
|
|
220
|
+
|
|
203
221
|
finally:
|
|
204
222
|
if should_cleanup and self.session:
|
|
205
223
|
await self.session.close()
|
|
@@ -226,7 +244,6 @@ class AsyncSmallest:
|
|
|
226
244
|
if not os.path.exists(file_path):
|
|
227
245
|
raise TTSError("Invalid file path. File does not exist.")
|
|
228
246
|
|
|
229
|
-
ALLOWED_AUDIO_EXTENSIONS = ['.mp3', '.wav']
|
|
230
247
|
file_extension = os.path.splitext(file_path)[1].lower()
|
|
231
248
|
if file_extension not in ALLOWED_AUDIO_EXTENSIONS:
|
|
232
249
|
raise TTSError(f"Invalid file type. Supported formats are: {ALLOWED_AUDIO_EXTENSIONS}")
|
|
@@ -257,4 +274,38 @@ class AsyncSmallest:
|
|
|
257
274
|
if should_cleanup and self.session:
|
|
258
275
|
await self.session.close()
|
|
259
276
|
self.session = None
|
|
277
|
+
|
|
278
|
+
|
|
279
|
+
async def delete_voice(self, voice_id: str) -> str:
|
|
280
|
+
"""
|
|
281
|
+
Delete a cloned voice asynchronously.
|
|
260
282
|
|
|
283
|
+
Args:
|
|
284
|
+
- voice_id (str): The ID of the voice to be deleted.
|
|
285
|
+
|
|
286
|
+
Returns:
|
|
287
|
+
- str: The response from the API.
|
|
288
|
+
|
|
289
|
+
Raises:
|
|
290
|
+
- APIError: If the API request fails or returns an error.
|
|
291
|
+
"""
|
|
292
|
+
url = f"{API_BASE_URL}/lightning-large"
|
|
293
|
+
payload = {'voiceId': voice_id}
|
|
294
|
+
|
|
295
|
+
headers = {
|
|
296
|
+
"Authorization": f"Bearer {self.api_key}",
|
|
297
|
+
}
|
|
298
|
+
|
|
299
|
+
should_cleanup = await self._ensure_session()
|
|
300
|
+
|
|
301
|
+
try:
|
|
302
|
+
async with self.session.delete(url, headers=headers, json=payload) as res:
|
|
303
|
+
if res.status != 200:
|
|
304
|
+
raise APIError(f"Failed to delete voice: {await res.text()}. For more information, visit https://waves.smallest.ai/")
|
|
305
|
+
|
|
306
|
+
return await res.text()
|
|
307
|
+
|
|
308
|
+
finally:
|
|
309
|
+
if should_cleanup and self.session:
|
|
310
|
+
await self.session.close()
|
|
311
|
+
self.session = None
|
smallest/stream_tts.py
CHANGED
|
@@ -30,7 +30,7 @@ class TextToAudioStream:
|
|
|
30
30
|
|
|
31
31
|
Args:
|
|
32
32
|
tts_instance: The text-to-speech engine to use (Smallest or AsyncSmallest)
|
|
33
|
-
queue_timeout: How long to wait for new text (seconds, default:
|
|
33
|
+
queue_timeout: How long to wait for new text (seconds, default: 5.0)
|
|
34
34
|
max_retries: Number of retry attempts for failed synthesis (default: 3)
|
|
35
35
|
"""
|
|
36
36
|
self.tts_instance = tts_instance
|
|
@@ -48,36 +48,43 @@ class TextToAudioStream:
|
|
|
48
48
|
|
|
49
49
|
async def _stream_llm_output(self, llm_output: AsyncGenerator[str, None]) -> None:
|
|
50
50
|
"""
|
|
51
|
-
Streams the LLM output, splitting it into
|
|
52
|
-
|
|
51
|
+
Streams the LLM output, splitting it into chunks based on sentence boundaries
|
|
52
|
+
or space characters if no sentence boundary is found before reaching buffer_size.
|
|
53
53
|
|
|
54
54
|
Parameters:
|
|
55
55
|
- llm_output (AsyncGenerator[str, None]): An async generator yielding LLM output.
|
|
56
56
|
"""
|
|
57
57
|
buffer = ""
|
|
58
|
-
last_break_index = 0
|
|
59
58
|
|
|
60
59
|
async for chunk in llm_output:
|
|
61
60
|
buffer += chunk
|
|
62
|
-
|
|
63
|
-
while
|
|
64
|
-
|
|
65
|
-
|
|
66
|
-
|
|
67
|
-
|
|
68
|
-
|
|
69
|
-
|
|
70
|
-
|
|
61
|
+
|
|
62
|
+
while len(buffer) > self.buffer_size:
|
|
63
|
+
chunk_text = buffer[:self.buffer_size]
|
|
64
|
+
last_break_index = -1
|
|
65
|
+
|
|
66
|
+
# Find last sentence boundary using regex
|
|
67
|
+
for i in range(len(chunk_text) - 1, -1, -1):
|
|
68
|
+
if self.sentence_end_regex.match(chunk_text[:i + 1]):
|
|
69
|
+
last_break_index = i
|
|
70
|
+
break
|
|
71
|
+
|
|
72
|
+
if last_break_index == -1:
|
|
73
|
+
# Fallback to space if no sentence boundary found
|
|
74
|
+
last_space = chunk_text.rfind(' ')
|
|
75
|
+
if last_space != -1:
|
|
76
|
+
last_break_index = last_space
|
|
71
77
|
else:
|
|
72
|
-
|
|
73
|
-
|
|
74
|
-
|
|
75
|
-
|
|
76
|
-
|
|
77
|
-
|
|
78
|
-
|
|
78
|
+
last_break_index = self.buffer_size - 1
|
|
79
|
+
|
|
80
|
+
# Add chunk to queue and update buffer
|
|
81
|
+
self.queue.put(f'{buffer[:last_break_index + 1].replace("—", " ").strip()} ')
|
|
82
|
+
buffer = buffer[last_break_index + 1:].strip()
|
|
83
|
+
|
|
84
|
+
# Don't forget the remaining text
|
|
79
85
|
if buffer:
|
|
80
86
|
self.queue.put(f'{buffer.replace("—", " ").strip()} ')
|
|
87
|
+
|
|
81
88
|
self.stop_flag = True
|
|
82
89
|
|
|
83
90
|
|
|
@@ -89,8 +96,7 @@ class TextToAudioStream:
|
|
|
89
96
|
if retries < self.max_retries:
|
|
90
97
|
return self._synthesize_sync(sentence, retries + 1)
|
|
91
98
|
else:
|
|
92
|
-
|
|
93
|
-
return None
|
|
99
|
+
raise APIError(f"Error: {e}. Retries Exhausted, for more information, visit https://waves.smallest.ai/")
|
|
94
100
|
|
|
95
101
|
|
|
96
102
|
async def _synthesize_async(self, sentence: str, retries: int = 0) -> Optional[bytes]:
|
|
@@ -101,8 +107,7 @@ class TextToAudioStream:
|
|
|
101
107
|
if retries < self.max_retries:
|
|
102
108
|
return await self._synthesize_async(sentence, retries + 1)
|
|
103
109
|
else:
|
|
104
|
-
|
|
105
|
-
return None
|
|
110
|
+
raise APIError(f"Error: {e}. Retries Exhausted, for more information, visit https://waves.smallest.ai/")
|
|
106
111
|
|
|
107
112
|
|
|
108
113
|
async def _run_synthesis(self) -> AsyncGenerator[bytes, None]:
|
|
@@ -144,10 +149,13 @@ class TextToAudioStream:
|
|
|
144
149
|
- Streamed over a network
|
|
145
150
|
- Further processed as needed
|
|
146
151
|
"""
|
|
147
|
-
|
|
148
|
-
llm_thread.start()
|
|
152
|
+
stream_task = asyncio.create_task(self._stream_llm_output(llm_output))
|
|
149
153
|
|
|
150
|
-
|
|
151
|
-
|
|
154
|
+
try:
|
|
155
|
+
async for audio_content in self._run_synthesis():
|
|
156
|
+
yield audio_content
|
|
157
|
+
except Exception as e:
|
|
158
|
+
raise APIError(f"Error during synthesis processing: {e}")
|
|
152
159
|
|
|
153
|
-
|
|
160
|
+
finally:
|
|
161
|
+
await stream_task
|
smallest/tts.py
CHANGED
|
@@ -7,7 +7,7 @@ from typing import Optional, Union, List
|
|
|
7
7
|
|
|
8
8
|
from smallest.exceptions import TTSError, APIError
|
|
9
9
|
from smallest.utils import (TTSOptions, validate_input, preprocess_text, add_wav_header, chunk_text,
|
|
10
|
-
get_smallest_languages, get_smallest_models, API_BASE_URL)
|
|
10
|
+
get_smallest_languages, get_smallest_models, ALLOWED_AUDIO_EXTENSIONS, API_BASE_URL)
|
|
11
11
|
|
|
12
12
|
class Smallest:
|
|
13
13
|
def __init__(
|
|
@@ -17,9 +17,7 @@ class Smallest:
|
|
|
17
17
|
sample_rate: Optional[int] = 24000,
|
|
18
18
|
voice_id: Optional[str] = "emily",
|
|
19
19
|
speed: Optional[float] = 1.0,
|
|
20
|
-
add_wav_header: Optional[bool] = True
|
|
21
|
-
transliterate: Optional[bool] = False,
|
|
22
|
-
remove_extra_silence: Optional[bool] = True
|
|
20
|
+
add_wav_header: Optional[bool] = True
|
|
23
21
|
) -> None:
|
|
24
22
|
"""
|
|
25
23
|
Smallest Instance for text-to-speech synthesis.
|
|
@@ -34,8 +32,6 @@ class Smallest:
|
|
|
34
32
|
- voice_id (TTSVoices): The voice to be used for synthesis.
|
|
35
33
|
- speed (float): The speed of the speech synthesis.
|
|
36
34
|
- add_wav_header (bool): Whether to add a WAV header to the output audio.
|
|
37
|
-
- transliterate (bool): Whether to transliterate the text.
|
|
38
|
-
- remove_extra_silence (bool): Whether to remove extra silence from the synthesized audio.
|
|
39
35
|
|
|
40
36
|
Methods:
|
|
41
37
|
- get_languages: Returns a list of available languages for synthesis.
|
|
@@ -46,7 +42,9 @@ class Smallest:
|
|
|
46
42
|
self.api_key = api_key or os.environ.get("SMALLEST_API_KEY")
|
|
47
43
|
if not self.api_key:
|
|
48
44
|
raise TTSError()
|
|
49
|
-
|
|
45
|
+
if model == "lightning-large":
|
|
46
|
+
voice_id = "lakshya"
|
|
47
|
+
|
|
50
48
|
self.chunk_size = 250
|
|
51
49
|
|
|
52
50
|
self.opts = TTSOptions(
|
|
@@ -55,9 +53,7 @@ class Smallest:
|
|
|
55
53
|
voice_id=voice_id,
|
|
56
54
|
api_key=self.api_key,
|
|
57
55
|
add_wav_header=add_wav_header,
|
|
58
|
-
speed=speed
|
|
59
|
-
transliterate=transliterate,
|
|
60
|
-
remove_extra_silence=remove_extra_silence
|
|
56
|
+
speed=speed
|
|
61
57
|
)
|
|
62
58
|
|
|
63
59
|
|
|
@@ -102,6 +98,9 @@ class Smallest:
|
|
|
102
98
|
def synthesize(
|
|
103
99
|
self,
|
|
104
100
|
text: str,
|
|
101
|
+
consistency: Optional[float] = 0.5,
|
|
102
|
+
similarity: Optional[float] = 0,
|
|
103
|
+
enhancement: Optional[bool] = False,
|
|
105
104
|
save_as: Optional[str] = None,
|
|
106
105
|
**kwargs
|
|
107
106
|
) -> Union[bytes, None]:
|
|
@@ -112,6 +111,9 @@ class Smallest:
|
|
|
112
111
|
- text (str): The text to be converted to speech.
|
|
113
112
|
- save_as (Optional[str]): If provided, the synthesized audio will be saved to this file path.
|
|
114
113
|
The file must have a .wav extension.
|
|
114
|
+
- consistency (Optional[float]): This parameter controls word repetition and skipping. Decrease it to prevent skipped words, and increase it to prevent repetition. Only supported in `lightning-large` model.
|
|
115
|
+
- similarity (Optional[float]): This parameter controls the similarity between the synthesized audio and the reference audio. Increase it to make the speech more similar to the reference audio. Only supported in `lightning-large` model.
|
|
116
|
+
- enhancement (Optional[bool]): Enhances speech quality at the cost of increased latency. Only supported in `lightning-large` model.
|
|
115
117
|
- kwargs: Additional optional parameters to override `__init__` options for this call.
|
|
116
118
|
|
|
117
119
|
Returns:
|
|
@@ -123,10 +125,16 @@ class Smallest:
|
|
|
123
125
|
- APIError: If the API request fails or returns an error.
|
|
124
126
|
"""
|
|
125
127
|
opts = copy.deepcopy(self.opts)
|
|
128
|
+
valid_keys = set(vars(opts).keys())
|
|
129
|
+
|
|
130
|
+
invalid_keys = [key for key in kwargs if key not in valid_keys]
|
|
131
|
+
if invalid_keys:
|
|
132
|
+
raise ValueError(f"Invalid parameter(s) in kwargs: {', '.join(invalid_keys)}. Allowed parameters are: {', '.join(valid_keys)}")
|
|
133
|
+
|
|
126
134
|
for key, value in kwargs.items():
|
|
127
135
|
setattr(opts, key, value)
|
|
128
136
|
|
|
129
|
-
validate_input(preprocess_text(text), opts.model, opts.sample_rate, opts.speed)
|
|
137
|
+
validate_input(preprocess_text(text), opts.model, opts.sample_rate, opts.speed, consistency, similarity, enhancement)
|
|
130
138
|
|
|
131
139
|
self.chunk_size = 250
|
|
132
140
|
if opts.model == "lightning-large":
|
|
@@ -142,11 +150,16 @@ class Smallest:
|
|
|
142
150
|
"voice_id": opts.voice_id,
|
|
143
151
|
"add_wav_header": False,
|
|
144
152
|
"speed": opts.speed,
|
|
145
|
-
"model": opts.model,
|
|
146
|
-
"transliterate": opts.transliterate,
|
|
147
|
-
"remove_extra_silence": opts.remove_extra_silence,
|
|
148
153
|
}
|
|
149
154
|
|
|
155
|
+
if opts.model == "lightning-large":
|
|
156
|
+
if consistency:
|
|
157
|
+
payload["consistency"] = consistency
|
|
158
|
+
if similarity:
|
|
159
|
+
payload["similarity"] = similarity
|
|
160
|
+
if enhancement:
|
|
161
|
+
payload["enhancement"] = enhancement
|
|
162
|
+
|
|
150
163
|
headers = {
|
|
151
164
|
"Authorization": f"Bearer {self.api_key}",
|
|
152
165
|
"Content-Type": "application/json",
|
|
@@ -154,7 +167,7 @@ class Smallest:
|
|
|
154
167
|
|
|
155
168
|
res = requests.post(f"{API_BASE_URL}/{opts.model}/get_speech", json=payload, headers=headers)
|
|
156
169
|
if res.status_code != 200:
|
|
157
|
-
raise APIError(f"Failed to synthesize speech: {res.text}.
|
|
170
|
+
raise APIError(f"Failed to synthesize speech: {res.text}. For more information, visit https://waves.smallest.ai/")
|
|
158
171
|
|
|
159
172
|
audio_content += res.content
|
|
160
173
|
|
|
@@ -193,7 +206,6 @@ class Smallest:
|
|
|
193
206
|
if not os.path.isfile(file_path):
|
|
194
207
|
raise TTSError("Invalid file path. File does not exist.")
|
|
195
208
|
|
|
196
|
-
ALLOWED_AUDIO_EXTENSIONS = ['.mp3', '.wav']
|
|
197
209
|
file_extension = os.path.splitext(file_path)[1].lower()
|
|
198
210
|
if file_extension not in ALLOWED_AUDIO_EXTENSIONS:
|
|
199
211
|
raise TTSError(f"Invalid file type. Supported formats are: {ALLOWED_AUDIO_EXTENSIONS}")
|
|
@@ -212,3 +224,30 @@ class Smallest:
|
|
|
212
224
|
raise APIError(f"Failed to add voice: {response.text}. For more information, visit https://waves.smallest.ai/")
|
|
213
225
|
|
|
214
226
|
return json.dumps(response.json(), indent=4, ensure_ascii=False)
|
|
227
|
+
|
|
228
|
+
|
|
229
|
+
def delete_voice(self, voice_id: str) -> str:
|
|
230
|
+
"""
|
|
231
|
+
Delete a cloned voice synchronously.
|
|
232
|
+
|
|
233
|
+
Args:
|
|
234
|
+
- voice_id (str): The ID of the voice to be deleted.
|
|
235
|
+
|
|
236
|
+
Returns:
|
|
237
|
+
- str: The response from the API.
|
|
238
|
+
|
|
239
|
+
Raises:
|
|
240
|
+
- APIError: If the API request fails or returns an error.
|
|
241
|
+
"""
|
|
242
|
+
url = f"{API_BASE_URL}/lightning-large"
|
|
243
|
+
payload = {'voiceId': voice_id}
|
|
244
|
+
|
|
245
|
+
headers = {
|
|
246
|
+
'Authorization': f"Bearer {self.api_key}",
|
|
247
|
+
}
|
|
248
|
+
|
|
249
|
+
response = requests.delete(url, headers=headers, json=payload)
|
|
250
|
+
if response.status_code != 200:
|
|
251
|
+
raise APIError(f"Failed to delete voice: {response.text}. For more information, visit https://waves.smallest.ai/")
|
|
252
|
+
|
|
253
|
+
return json.dumps(response.json(), indent=4, ensure_ascii=False)
|
smallest/utils.py
CHANGED
|
@@ -1,6 +1,7 @@
|
|
|
1
1
|
import re
|
|
2
2
|
import io
|
|
3
3
|
from typing import List
|
|
4
|
+
from typing import Optional
|
|
4
5
|
from pydub import AudioSegment
|
|
5
6
|
from dataclasses import dataclass
|
|
6
7
|
from sacremoses import MosesPunctNormalizer
|
|
@@ -14,6 +15,7 @@ SENTENCE_END_REGEX = re.compile(r'.*[-.—!?,;:…।|]$')
|
|
|
14
15
|
mpn = MosesPunctNormalizer()
|
|
15
16
|
SAMPLE_WIDTH = 2
|
|
16
17
|
CHANNELS = 1
|
|
18
|
+
ALLOWED_AUDIO_EXTENSIONS = ['.mp3', '.wav']
|
|
17
19
|
|
|
18
20
|
|
|
19
21
|
@dataclass
|
|
@@ -24,11 +26,9 @@ class TTSOptions:
|
|
|
24
26
|
api_key: str
|
|
25
27
|
add_wav_header: bool
|
|
26
28
|
speed: float
|
|
27
|
-
transliterate: bool
|
|
28
|
-
remove_extra_silence: bool
|
|
29
29
|
|
|
30
30
|
|
|
31
|
-
def validate_input(text: str, model: str, sample_rate: int, speed: float):
|
|
31
|
+
def validate_input(text: str, model: str, sample_rate: int, speed: float, consistency: Optional[float] = None, similarity: Optional[float] = None, enhancement: Optional[bool] = None):
|
|
32
32
|
if not text:
|
|
33
33
|
raise ValidationError("Text cannot be empty.")
|
|
34
34
|
if model not in TTSModels:
|
|
@@ -37,14 +37,20 @@ def validate_input(text: str, model: str, sample_rate: int, speed: float):
|
|
|
37
37
|
raise ValidationError(f"Invalid sample rate: {sample_rate}. Must be between 8000 and 24000")
|
|
38
38
|
if not 0.5 <= speed <= 2.0:
|
|
39
39
|
raise ValidationError(f"Invalid speed: {speed}. Must be between 0.5 and 2.0")
|
|
40
|
+
if consistency is not None and not 0.0 <= consistency <= 1.0:
|
|
41
|
+
raise ValidationError(f"Invalid consistency: {consistency}. Must be between 0.0 and 1.0")
|
|
42
|
+
if similarity is not None and not 0.0 <= similarity <= 1.0:
|
|
43
|
+
raise ValidationError(f"Invalid similarity: {similarity}. Must be between 0.0 and 1.0")
|
|
44
|
+
if enhancement is not None and not isinstance(enhancement, bool):
|
|
45
|
+
raise ValidationError(f"Invalid enhancement: {enhancement}. Must be a boolean value.")
|
|
40
46
|
|
|
41
47
|
|
|
42
48
|
def add_wav_header(frame_input: bytes, sample_rate: int = 24000, sample_width: int = 2, channels: int = 1) -> bytes:
|
|
43
|
-
|
|
44
|
-
|
|
45
|
-
|
|
46
|
-
|
|
47
|
-
|
|
49
|
+
audio = AudioSegment(data=frame_input, sample_width=sample_width, frame_rate=sample_rate, channels=channels)
|
|
50
|
+
wav_buf = io.BytesIO()
|
|
51
|
+
audio.export(wav_buf, format="wav")
|
|
52
|
+
wav_buf.seek(0)
|
|
53
|
+
return wav_buf.read()
|
|
48
54
|
|
|
49
55
|
|
|
50
56
|
def preprocess_text(text: str) -> str:
|
|
@@ -55,11 +61,6 @@ def preprocess_text(text: str) -> str:
|
|
|
55
61
|
|
|
56
62
|
|
|
57
63
|
def chunk_text(text: str, chunk_size: int = 250) -> List[str]:
|
|
58
|
-
"""
|
|
59
|
-
Splits the input text into chunks based on sentence boundaries
|
|
60
|
-
defined by SENTENCE_END_REGEX and the maximum chunk size.
|
|
61
|
-
Only splits at valid sentence boundaries to avoid breaking words.
|
|
62
|
-
"""
|
|
63
64
|
chunks = []
|
|
64
65
|
while text:
|
|
65
66
|
if len(text) <= chunk_size:
|
|
@@ -1,6 +1,6 @@
|
|
|
1
1
|
Metadata-Version: 2.2
|
|
2
2
|
Name: smallestai
|
|
3
|
-
Version: 2.
|
|
3
|
+
Version: 2.1.0
|
|
4
4
|
Summary: Official Python client for the Smallest AI API
|
|
5
5
|
Author-email: Smallest <support@smallest.ai>
|
|
6
6
|
License: MIT
|
|
@@ -59,8 +59,11 @@ Currently, the library supports direct synthesis and the ability to synthesize s
|
|
|
59
59
|
- [Asynchronous](#asynchronous)
|
|
60
60
|
- [LLM to Speech](#llm-to-speech)
|
|
61
61
|
- [Add your Voice](#add-your-voice)
|
|
62
|
-
- [Synchronously](#synchronously)
|
|
63
|
-
- [Asynchronously](#asynchronously)
|
|
62
|
+
- [Synchronously](#add-synchronously)
|
|
63
|
+
- [Asynchronously](#add-asynchronously)
|
|
64
|
+
- [Delete your Voice](#delete-your-voice)
|
|
65
|
+
- [Synchronously](#delete-synchronously)
|
|
66
|
+
- [Asynchronously](#delete-asynchronously)
|
|
64
67
|
- [Available Methods](#available-methods)
|
|
65
68
|
- [Technical Note: WAV Headers in Streaming Audio](#technical-note-wav-headers-in-streaming-audio)
|
|
66
69
|
|
|
@@ -80,14 +83,6 @@ When using an SDK in your application, make sure to pin to at least the major ve
|
|
|
80
83
|
3. Create a new API Key and copy it.
|
|
81
84
|
4. Export the API Key in your environment with the name `SMALLEST_API_KEY`, ensuring that your application can access it securely for authentication.
|
|
82
85
|
|
|
83
|
-
## Best Practices for Input Text
|
|
84
|
-
While the `transliterate` parameter is provided, please note that it is not fully supported and may not perform consistently across all cases. It is recommended to use the model without relying on this parameter.
|
|
85
|
-
|
|
86
|
-
For optimal voice generation results:
|
|
87
|
-
|
|
88
|
-
1. For English, provide the input in Latin script (e.g., "Hello, how are you?").
|
|
89
|
-
2. For Hindi, provide the input in Devanagari script (e.g., "नमस्ते, आप कैसे हैं?").
|
|
90
|
-
3. For code-mixed input, use Latin script for English and Devanagari script for Hindi (e.g., "Hello, आप कैसे हैं?").
|
|
91
86
|
|
|
92
87
|
## Examples
|
|
93
88
|
|
|
@@ -115,9 +110,10 @@ if __name__ == "__main__":
|
|
|
115
110
|
- `sample_rate`: Audio sample rate (default: 24000)
|
|
116
111
|
- `voice_id`: Voice ID (default: "emily")
|
|
117
112
|
- `speed`: Speech speed multiplier (default: 1.0)
|
|
118
|
-
- `
|
|
119
|
-
- `
|
|
120
|
-
- `
|
|
113
|
+
- `consistency`: Controls word repetition and skipping. Decrease it to prevent skipped words, and increase it to prevent repetition. Only supported in `lightning-large` model. (default: 0.5)
|
|
114
|
+
- `similarity`: Controls the similarity between the synthesized audio and the reference audio. Increase it to make the speech more similar to the reference audio. Only supported in `lightning-large` model. (default: 0)
|
|
115
|
+
- `enhancement`: Enhances speech quality at the cost of increased latency. Only supported in `lightning-large` model. (default: False)
|
|
116
|
+
- `add_wav_header`: Whether to add a WAV header to the output audio.
|
|
121
117
|
|
|
122
118
|
These parameters are part of the `Smallest` instance. They can be set when creating the instance (as shown above). However, the `synthesize` function also accepts `kwargs`, allowing you to override these parameters for a specific synthesis request.
|
|
123
119
|
|
|
@@ -141,9 +137,8 @@ import asyncio
|
|
|
141
137
|
import aiofiles
|
|
142
138
|
from smallest import AsyncSmallest
|
|
143
139
|
|
|
144
|
-
client = AsyncSmallest(api_key="SMALLEST_API_KEY")
|
|
145
|
-
|
|
146
140
|
async def main():
|
|
141
|
+
client = AsyncSmallest(api_key="SMALLEST_API_KEY")
|
|
147
142
|
async with client as tts:
|
|
148
143
|
audio_bytes = await tts.synthesize("Hello, this is a test of the async synthesis function.")
|
|
149
144
|
async with aiofiles.open("async_synthesize.wav", "wb") as f:
|
|
@@ -153,15 +148,33 @@ if __name__ == "__main__":
|
|
|
153
148
|
asyncio.run(main())
|
|
154
149
|
```
|
|
155
150
|
|
|
151
|
+
**Running Asynchronously in a Jupyter Notebook**
|
|
152
|
+
If you are using a Jupyter Notebook, use the following approach to execute the asynchronous function within an existing event loop:
|
|
153
|
+
```python
|
|
154
|
+
import asyncio
|
|
155
|
+
import aiofiles
|
|
156
|
+
from smallest import AsyncSmallest
|
|
157
|
+
|
|
158
|
+
async def main():
|
|
159
|
+
client = AsyncSmallest(api_key="SMALLEST_API_KEY")
|
|
160
|
+
async with client as tts:
|
|
161
|
+
audio_bytes = await tts.synthesize("Hello, this is a test of the async synthesis function.")
|
|
162
|
+
async with aiofiles.open("async_synthesize.wav", "wb") as f:
|
|
163
|
+
await f.write(audio_bytes) # alternatively you can use the `save_as` parameter.
|
|
164
|
+
|
|
165
|
+
await main()
|
|
166
|
+
```
|
|
167
|
+
|
|
156
168
|
**Parameters:**
|
|
157
169
|
- `api_key`: Your API key (can be set via SMALLEST_API_KEY environment variable)
|
|
158
170
|
- `model`: TTS model to use (default: "lightning")
|
|
159
171
|
- `sample_rate`: Audio sample rate (default: 24000)
|
|
160
172
|
- `voice_id`: Voice ID (default: "emily")
|
|
161
173
|
- `speed`: Speech speed multiplier (default: 1.0)
|
|
162
|
-
- `
|
|
163
|
-
- `
|
|
164
|
-
- `
|
|
174
|
+
- `consistency`: Controls word repetition and skipping. Decrease it to prevent skipped words, and increase it to prevent repetition. Only supported in `lightning-large` model.
|
|
175
|
+
- `similarity`: Controls the similarity between the synthesized audio and the reference audio. Increase it to make the speech more similar to the reference audio. Only supported in `lightning-large` model.
|
|
176
|
+
- `enhancement`: Enhances speech quality at the cost of increased latency. Only supported in `lightning-large` model.
|
|
177
|
+
- `add_wav_header`: Whether to add a WAV header to the output audio.
|
|
165
178
|
|
|
166
179
|
These parameters are part of the `AsyncSmallest` instance. They can be set when creating the instance (as shown above). However, the `synthesize` function also accepts `kwargs`, allowing you to override any of these parameters on a per-request basis.
|
|
167
180
|
|
|
@@ -178,6 +191,58 @@ audio_bytes = await tts.synthesize(
|
|
|
178
191
|
|
|
179
192
|
The `TextToAudioStream` class provides real-time text-to-speech processing, converting streaming text into audio output. It's particularly useful for applications like voice assistants, live captioning, or interactive chatbots that require immediate audio feedback from text generation. It supports both synchronous and asynchronous TTS instances.
|
|
180
193
|
|
|
194
|
+
#### Stream through a WebSocket
|
|
195
|
+
|
|
196
|
+
```python
|
|
197
|
+
import asyncio
|
|
198
|
+
import websockets
|
|
199
|
+
from groq import Groq
|
|
200
|
+
from smallest import Smallest, TextToAudioStream
|
|
201
|
+
|
|
202
|
+
# Initialize Groq (LLM) and Smallest (TTS) instances
|
|
203
|
+
llm = Groq(api_key="GROQ_API_KEY")
|
|
204
|
+
tts = Smallest(api_key="SMALLEST_API_KEY")
|
|
205
|
+
WEBSOCKET_URL = "wss://echo.websocket.events" # Mock WebSocket server
|
|
206
|
+
|
|
207
|
+
# Async function to stream text generation from LLM
|
|
208
|
+
async def generate_text(prompt):
|
|
209
|
+
completion = llm.chat.completions.create(
|
|
210
|
+
messages=[{"role": "user", "content": prompt}],
|
|
211
|
+
model="llama3-8b-8192",
|
|
212
|
+
stream=True,
|
|
213
|
+
)
|
|
214
|
+
|
|
215
|
+
# Yield text as it is generated
|
|
216
|
+
for chunk in completion:
|
|
217
|
+
text = chunk.choices[0].delta.content
|
|
218
|
+
if text:
|
|
219
|
+
yield text
|
|
220
|
+
|
|
221
|
+
# Main function to run the process
|
|
222
|
+
async def main():
|
|
223
|
+
# Initialize the TTS processor
|
|
224
|
+
processor = TextToAudioStream(tts_instance=tts)
|
|
225
|
+
|
|
226
|
+
# Generate text from LLM
|
|
227
|
+
llm_output = generate_text("Explain text to speech like I am five in 5 sentences.")
|
|
228
|
+
|
|
229
|
+
# Stream the generated speech through a websocket
|
|
230
|
+
async with websockets.connect(WEBSOCKET_URL) as ws:
|
|
231
|
+
print("Connected to WebSocket server.")
|
|
232
|
+
|
|
233
|
+
# Stream the generated speech
|
|
234
|
+
async for audio_chunk in processor.process(llm_output):
|
|
235
|
+
await ws.send(audio_chunk) # Send audio chunk
|
|
236
|
+
echoed_data = await ws.recv() # Receive the echoed message
|
|
237
|
+
print("Received from server:", echoed_data[:20], "...") # Print first 20 bytes
|
|
238
|
+
|
|
239
|
+
print("WebSocket connection closed.")
|
|
240
|
+
|
|
241
|
+
if __name__ == "__main__":
|
|
242
|
+
asyncio.run(main())
|
|
243
|
+
```
|
|
244
|
+
|
|
245
|
+
#### Save to a File
|
|
181
246
|
```python
|
|
182
247
|
import wave
|
|
183
248
|
import asyncio
|
|
@@ -245,12 +310,12 @@ The processor yields raw audio data chunks without WAV headers for streaming eff
|
|
|
245
310
|
## Add your Voice
|
|
246
311
|
The Smallest AI SDK allows you to clone your voice by uploading an audio file. This feature is available both synchronously and asynchronously, making it flexible for different use cases. Below are examples of how to use this functionality.
|
|
247
312
|
|
|
248
|
-
### Synchronously
|
|
313
|
+
### Add Synchronously
|
|
249
314
|
```python
|
|
250
315
|
from smallest import Smallest
|
|
251
316
|
|
|
252
317
|
def main():
|
|
253
|
-
client = Smallest(api_key="
|
|
318
|
+
client = Smallest(api_key="SMALLEST_API_KEY")
|
|
254
319
|
res = client.add_voice(display_name="My Voice", file_path="my_voice.wav")
|
|
255
320
|
print(res)
|
|
256
321
|
|
|
@@ -258,13 +323,13 @@ if __name__ == "__main__":
|
|
|
258
323
|
main()
|
|
259
324
|
```
|
|
260
325
|
|
|
261
|
-
### Asynchronously
|
|
326
|
+
### Add Asynchronously
|
|
262
327
|
```python
|
|
263
328
|
import asyncio
|
|
264
329
|
from smallest import AsyncSmallest
|
|
265
330
|
|
|
266
331
|
async def main():
|
|
267
|
-
client = AsyncSmallest(api_key="
|
|
332
|
+
client = AsyncSmallest(api_key="SMALLEST_API_KEY")
|
|
268
333
|
res = await client.add_voice(display_name="My Voice", file_path="my_voice.wav")
|
|
269
334
|
print(res)
|
|
270
335
|
|
|
@@ -272,6 +337,36 @@ if __name__ == "__main__":
|
|
|
272
337
|
asyncio.run(main())
|
|
273
338
|
```
|
|
274
339
|
|
|
340
|
+
## Delete your Voice
|
|
341
|
+
The Smallest AI SDK allows you to delete your cloned voice. This feature is available both synchronously and asynchronously, making it flexible for different use cases. Below are examples of how to use this functionality.
|
|
342
|
+
|
|
343
|
+
### Delete Synchronously
|
|
344
|
+
```python
|
|
345
|
+
from smallest import Smallest
|
|
346
|
+
|
|
347
|
+
def main():
|
|
348
|
+
client = Smallest(api_key="SMALLEST_API_KEY")
|
|
349
|
+
res = client.delete_voice(voice_id="voice_id")
|
|
350
|
+
print(res)
|
|
351
|
+
|
|
352
|
+
if __name__ == "__main__":
|
|
353
|
+
main()
|
|
354
|
+
```
|
|
355
|
+
|
|
356
|
+
### Delete Asynchronously
|
|
357
|
+
```python
|
|
358
|
+
import asyncio
|
|
359
|
+
from smallest import AsyncSmallest
|
|
360
|
+
|
|
361
|
+
async def main():
|
|
362
|
+
client = AsyncSmallest(api_key="SMALLEST_API_KEY")
|
|
363
|
+
res = await client.delete_voice(voice_id="voice_id")
|
|
364
|
+
print(res)
|
|
365
|
+
|
|
366
|
+
if __name__ == "__main__":
|
|
367
|
+
asyncio.run(main())
|
|
368
|
+
```
|
|
369
|
+
|
|
275
370
|
## Available Methods
|
|
276
371
|
|
|
277
372
|
```python
|
|
@@ -0,0 +1,12 @@
|
|
|
1
|
+
smallest/__init__.py,sha256=vaoIBml_IobavpVvFazB86iikg2iEy4h3ddxqv_0Fy4,190
|
|
2
|
+
smallest/async_tts.py,sha256=Jr7IID5tJrnMx_d2217foUJqfFvAFsddvy_0HG5tKGc,11905
|
|
3
|
+
smallest/exceptions.py,sha256=nY6I8fCXe2By54CytQ0-i3hFiYtt8TYAKj0g6OYsCjc,585
|
|
4
|
+
smallest/models.py,sha256=g2e_4nU5P48vyXZandKLWqZC1TkoEGeLvYKqJIqurSI,83
|
|
5
|
+
smallest/stream_tts.py,sha256=dUxoY0VkXecsMZ41QA8RkX4t_pD5-7mMIJhaB01tQrk,6512
|
|
6
|
+
smallest/tts.py,sha256=bSL7EYmLpd5yT42dbUXVb-IgZ_xIcXpyHvCu2-hHtMs,10024
|
|
7
|
+
smallest/utils.py,sha256=HDpDjPkUeeQLqDhrV-zPTLtOH9hJueae0q9SNq486GQ,3396
|
|
8
|
+
smallestai-2.1.0.dist-info/LICENSE,sha256=kK3HNKhN7luQhkjkNWIvy9_gizbEDUM4mSv_HWq9uuM,1068
|
|
9
|
+
smallestai-2.1.0.dist-info/METADATA,sha256=BwCUFiVZTRActimZBQcPJg8vHJy0M-6vYA_yHvaFpDk,14904
|
|
10
|
+
smallestai-2.1.0.dist-info/WHEEL,sha256=jB7zZ3N9hIM9adW7qlTAyycLYW9npaWKLRzaoVcLKcM,91
|
|
11
|
+
smallestai-2.1.0.dist-info/top_level.txt,sha256=i5ktbWkG-2aS28vrYTeuhKtA-tY2ZG7SHgLHi87QTLw,9
|
|
12
|
+
smallestai-2.1.0.dist-info/RECORD,,
|
|
@@ -1,12 +0,0 @@
|
|
|
1
|
-
smallest/__init__.py,sha256=vaoIBml_IobavpVvFazB86iikg2iEy4h3ddxqv_0Fy4,190
|
|
2
|
-
smallest/async_tts.py,sha256=5qW7owlMeSWFx0rpn9dYfbO76mmNY0DXcytNjLfbbz8,9727
|
|
3
|
-
smallest/exceptions.py,sha256=nY6I8fCXe2By54CytQ0-i3hFiYtt8TYAKj0g6OYsCjc,585
|
|
4
|
-
smallest/models.py,sha256=g2e_4nU5P48vyXZandKLWqZC1TkoEGeLvYKqJIqurSI,83
|
|
5
|
-
smallest/stream_tts.py,sha256=SeP9A9zXJWiV62Eezv0L1J5sRIR304Llc_mwVtOOSUI,6348
|
|
6
|
-
smallest/tts.py,sha256=xBBEk_byRPGT6SYkE6qvhfEupgHl6XBdAqtxmzw2rF8,8311
|
|
7
|
-
smallest/utils.py,sha256=FCZkvbbHJBoN0jpBSqmt1hJjvks56t8i82we4XnqjYk,3016
|
|
8
|
-
smallestai-2.0.0.dist-info/LICENSE,sha256=kK3HNKhN7luQhkjkNWIvy9_gizbEDUM4mSv_HWq9uuM,1068
|
|
9
|
-
smallestai-2.0.0.dist-info/METADATA,sha256=EIyZZqzAvHgQ7jfEs5x5LUx3HjzoCUhzJoXfkb3CuoI,11538
|
|
10
|
-
smallestai-2.0.0.dist-info/WHEEL,sha256=In9FTNxeP60KnTkGw7wk6mJPYd_dQSjEZmXdBdMCI-8,91
|
|
11
|
-
smallestai-2.0.0.dist-info/top_level.txt,sha256=i5ktbWkG-2aS28vrYTeuhKtA-tY2ZG7SHgLHi87QTLw,9
|
|
12
|
-
smallestai-2.0.0.dist-info/RECORD,,
|
|
File without changes
|
|
File without changes
|