smallestai 2.0.0__py3-none-any.whl → 2.2.0__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- smallest/async_tts.py +108 -44
- smallest/stream_tts.py +147 -28
- smallest/tts.py +91 -40
- smallest/utils.py +17 -16
- {smallestai-2.0.0.dist-info → smallestai-2.2.0.dist-info}/METADATA +118 -24
- smallestai-2.2.0.dist-info/RECORD +12 -0
- {smallestai-2.0.0.dist-info → smallestai-2.2.0.dist-info}/WHEEL +1 -1
- smallestai-2.0.0.dist-info/RECORD +0 -12
- {smallestai-2.0.0.dist-info → smallestai-2.2.0.dist-info}/LICENSE +0 -0
- {smallestai-2.0.0.dist-info → smallestai-2.2.0.dist-info}/top_level.txt +0 -0
smallest/async_tts.py
CHANGED
|
@@ -4,11 +4,11 @@ import json
|
|
|
4
4
|
import aiohttp
|
|
5
5
|
import aiofiles
|
|
6
6
|
import requests
|
|
7
|
-
from typing import Optional, Union, List
|
|
7
|
+
from typing import Optional, Union, List, AsyncIterator
|
|
8
8
|
|
|
9
9
|
from smallest.exceptions import TTSError, APIError
|
|
10
10
|
from smallest.utils import (TTSOptions, validate_input, preprocess_text, add_wav_header, chunk_text,
|
|
11
|
-
get_smallest_languages, get_smallest_models, API_BASE_URL)
|
|
11
|
+
get_smallest_languages, get_smallest_models, ALLOWED_AUDIO_EXTENSIONS, API_BASE_URL)
|
|
12
12
|
|
|
13
13
|
|
|
14
14
|
class AsyncSmallest:
|
|
@@ -19,9 +19,10 @@ class AsyncSmallest:
|
|
|
19
19
|
sample_rate: Optional[int] = 24000,
|
|
20
20
|
voice_id: Optional[str] = "emily",
|
|
21
21
|
speed: Optional[float] = 1.0,
|
|
22
|
-
|
|
23
|
-
|
|
24
|
-
|
|
22
|
+
consistency: Optional[float] = 0.5,
|
|
23
|
+
similarity: Optional[float] = 0.0,
|
|
24
|
+
enhancement: Optional[int] = 1,
|
|
25
|
+
add_wav_header: Optional[bool] = True
|
|
25
26
|
) -> None:
|
|
26
27
|
"""
|
|
27
28
|
AsyncSmallest Instance for asynchronous text-to-speech synthesis.
|
|
@@ -36,9 +37,10 @@ class AsyncSmallest:
|
|
|
36
37
|
- sample_rate (int): The sample rate for the audio output.
|
|
37
38
|
- voice_id (TTSVoices): The voice to be used for synthesis.
|
|
38
39
|
- speed (float): The speed of the speech synthesis.
|
|
40
|
+
- consistency (float): This parameter controls word repetition and skipping. Decrease it to prevent skipped words, and increase it to prevent repetition. Only supported in `lightning-large` model. Range - [0, 1]
|
|
41
|
+
- similarity (float): This parameter controls the similarity between the synthesized audio and the reference audio. Increase it to make the speech more similar to the reference audio. Only supported in `lightning-large` model. Range - [0, 1]
|
|
42
|
+
- enhancement (int): Enhances speech quality at the cost of increased latency. Only supported in `lightning-large` model. Range - [0, 2].
|
|
39
43
|
- add_wav_header (bool): Whether to add a WAV header to the output audio.
|
|
40
|
-
- transliterate (bool): Whether to transliterate the text.
|
|
41
|
-
- remove_extra_silence (bool): Whether to remove extra silence from the synthesized audio.
|
|
42
44
|
|
|
43
45
|
Methods:
|
|
44
46
|
- get_languages: Returns a list of available languages for synthesis.
|
|
@@ -49,6 +51,9 @@ class AsyncSmallest:
|
|
|
49
51
|
self.api_key = api_key or os.environ.get("SMALLEST_API_KEY")
|
|
50
52
|
if not self.api_key:
|
|
51
53
|
raise TTSError()
|
|
54
|
+
if model == "lightning-large" and voice_id is None:
|
|
55
|
+
voice_id = "lakshya"
|
|
56
|
+
|
|
52
57
|
self.chunk_size = 250
|
|
53
58
|
|
|
54
59
|
self.opts = TTSOptions(
|
|
@@ -58,8 +63,9 @@ class AsyncSmallest:
|
|
|
58
63
|
api_key=self.api_key,
|
|
59
64
|
add_wav_header=add_wav_header,
|
|
60
65
|
speed=speed,
|
|
61
|
-
|
|
62
|
-
|
|
66
|
+
consistency=consistency,
|
|
67
|
+
similarity=similarity,
|
|
68
|
+
enhancement=enhancement
|
|
63
69
|
)
|
|
64
70
|
self.session = None
|
|
65
71
|
|
|
@@ -124,67 +130,92 @@ class AsyncSmallest:
|
|
|
124
130
|
async def synthesize(
|
|
125
131
|
self,
|
|
126
132
|
text: str,
|
|
133
|
+
stream: Optional[bool] = False,
|
|
127
134
|
save_as: Optional[str] = None,
|
|
128
135
|
**kwargs
|
|
129
|
-
) -> Union[bytes, None]:
|
|
136
|
+
) -> Union[bytes, None, AsyncIterator[bytes]]:
|
|
130
137
|
"""
|
|
131
138
|
Asynchronously synthesize speech from the provided text.
|
|
132
139
|
|
|
133
140
|
Args:
|
|
134
141
|
- text (str): The text to be converted to speech.
|
|
142
|
+
- stream (Optional[bool]): If True, returns an iterator yielding audio chunks instead of a full byte array.
|
|
135
143
|
- save_as (Optional[str]): If provided, the synthesized audio will be saved to this file path.
|
|
136
144
|
The file must have a .wav extension.
|
|
137
145
|
- kwargs: Additional optional parameters to override `__init__` options for this call.
|
|
138
146
|
|
|
139
147
|
Returns:
|
|
140
|
-
- Union[bytes, None]:
|
|
141
|
-
|
|
148
|
+
- Union[bytes, None, AsyncIterator[bytes]]:
|
|
149
|
+
- If `stream=True`, returns an async iterator yielding audio chunks.
|
|
150
|
+
- If `save_as` is provided, saves the file and returns None.
|
|
151
|
+
- Otherwise, returns the synthesized audio content as bytes.
|
|
142
152
|
|
|
143
153
|
Raises:
|
|
144
154
|
- TTSError: If the provided file name does not have a .wav extension when `save_as` is specified.
|
|
145
155
|
- APIError: If the API request fails or returns an error.
|
|
156
|
+
- ValueError: If an unexpected parameter is passed in `kwargs`.
|
|
146
157
|
"""
|
|
147
|
-
should_cleanup =
|
|
158
|
+
should_cleanup = False
|
|
159
|
+
|
|
160
|
+
if self.session is None or self.session.closed:
|
|
161
|
+
self.session = aiohttp.ClientSession()
|
|
162
|
+
should_cleanup = True # Cleanup only if we created a new session
|
|
148
163
|
|
|
149
164
|
try:
|
|
150
165
|
opts = copy.deepcopy(self.opts)
|
|
166
|
+
valid_keys = set(vars(opts).keys())
|
|
167
|
+
|
|
168
|
+
invalid_keys = [key for key in kwargs if key not in valid_keys]
|
|
169
|
+
if invalid_keys:
|
|
170
|
+
raise ValueError(f"Invalid parameter(s) in kwargs: {', '.join(invalid_keys)}. Allowed parameters are: {', '.join(valid_keys)}")
|
|
171
|
+
|
|
151
172
|
for key, value in kwargs.items():
|
|
152
173
|
setattr(opts, key, value)
|
|
153
174
|
|
|
154
|
-
|
|
175
|
+
text = preprocess_text(text)
|
|
176
|
+
validate_input(text, opts.model, opts.sample_rate, opts.speed, opts.consistency, opts.similarity, opts.enhancement)
|
|
155
177
|
|
|
156
178
|
self.chunk_size = 250
|
|
157
|
-
if opts.model == '
|
|
179
|
+
if opts.model == 'lightning-large':
|
|
158
180
|
self.chunk_size = 140
|
|
159
181
|
|
|
160
182
|
chunks = chunk_text(text, self.chunk_size)
|
|
161
|
-
|
|
162
|
-
|
|
163
|
-
|
|
164
|
-
|
|
165
|
-
|
|
166
|
-
|
|
167
|
-
|
|
168
|
-
|
|
169
|
-
|
|
170
|
-
|
|
171
|
-
|
|
172
|
-
|
|
173
|
-
|
|
174
|
-
|
|
175
|
-
|
|
176
|
-
|
|
177
|
-
|
|
178
|
-
|
|
179
|
-
|
|
180
|
-
|
|
181
|
-
|
|
182
|
-
|
|
183
|
-
|
|
184
|
-
|
|
185
|
-
|
|
186
|
-
|
|
187
|
-
|
|
183
|
+
|
|
184
|
+
async def audio_stream():
|
|
185
|
+
for chunk in chunks:
|
|
186
|
+
payload = {
|
|
187
|
+
"text": chunk,
|
|
188
|
+
"sample_rate": opts.sample_rate,
|
|
189
|
+
"voice_id": opts.voice_id,
|
|
190
|
+
"add_wav_header": False,
|
|
191
|
+
"speed": opts.speed,
|
|
192
|
+
"model": opts.model
|
|
193
|
+
}
|
|
194
|
+
|
|
195
|
+
if opts.model == "lightning-large":
|
|
196
|
+
if opts.consistency is not None:
|
|
197
|
+
payload["consistency"] = opts.consistency
|
|
198
|
+
if opts.similarity is not None:
|
|
199
|
+
payload["similarity"] = opts.similarity
|
|
200
|
+
if opts.enhancement is not None:
|
|
201
|
+
payload["enhancement"] = opts.enhancement
|
|
202
|
+
|
|
203
|
+
|
|
204
|
+
headers = {
|
|
205
|
+
"Authorization": f"Bearer {self.api_key}",
|
|
206
|
+
"Content-Type": "application/json",
|
|
207
|
+
}
|
|
208
|
+
|
|
209
|
+
async with self.session.post(f"{API_BASE_URL}/{opts.model}/get_speech", json=payload, headers=headers) as res:
|
|
210
|
+
if res.status != 200:
|
|
211
|
+
raise APIError(f"Failed to synthesize speech: {await res.text()}. For more information, visit https://waves.smallest.ai/")
|
|
212
|
+
|
|
213
|
+
yield await res.read()
|
|
214
|
+
|
|
215
|
+
if stream:
|
|
216
|
+
return audio_stream()
|
|
217
|
+
|
|
218
|
+
audio_content = b"".join([chunk async for chunk in audio_stream()])
|
|
188
219
|
|
|
189
220
|
if save_as:
|
|
190
221
|
if not save_as.endswith(".wav"):
|
|
@@ -199,7 +230,7 @@ class AsyncSmallest:
|
|
|
199
230
|
return add_wav_header(audio_content, opts.sample_rate)
|
|
200
231
|
|
|
201
232
|
return audio_content
|
|
202
|
-
|
|
233
|
+
|
|
203
234
|
finally:
|
|
204
235
|
if should_cleanup and self.session:
|
|
205
236
|
await self.session.close()
|
|
@@ -226,7 +257,6 @@ class AsyncSmallest:
|
|
|
226
257
|
if not os.path.exists(file_path):
|
|
227
258
|
raise TTSError("Invalid file path. File does not exist.")
|
|
228
259
|
|
|
229
|
-
ALLOWED_AUDIO_EXTENSIONS = ['.mp3', '.wav']
|
|
230
260
|
file_extension = os.path.splitext(file_path)[1].lower()
|
|
231
261
|
if file_extension not in ALLOWED_AUDIO_EXTENSIONS:
|
|
232
262
|
raise TTSError(f"Invalid file type. Supported formats are: {ALLOWED_AUDIO_EXTENSIONS}")
|
|
@@ -257,4 +287,38 @@ class AsyncSmallest:
|
|
|
257
287
|
if should_cleanup and self.session:
|
|
258
288
|
await self.session.close()
|
|
259
289
|
self.session = None
|
|
290
|
+
|
|
291
|
+
|
|
292
|
+
async def delete_voice(self, voice_id: str) -> str:
|
|
293
|
+
"""
|
|
294
|
+
Delete a cloned voice asynchronously.
|
|
295
|
+
|
|
296
|
+
Args:
|
|
297
|
+
- voice_id (str): The ID of the voice to be deleted.
|
|
298
|
+
|
|
299
|
+
Returns:
|
|
300
|
+
- str: The response from the API.
|
|
260
301
|
|
|
302
|
+
Raises:
|
|
303
|
+
- APIError: If the API request fails or returns an error.
|
|
304
|
+
"""
|
|
305
|
+
url = f"{API_BASE_URL}/lightning-large"
|
|
306
|
+
payload = {'voiceId': voice_id}
|
|
307
|
+
|
|
308
|
+
headers = {
|
|
309
|
+
"Authorization": f"Bearer {self.api_key}",
|
|
310
|
+
}
|
|
311
|
+
|
|
312
|
+
should_cleanup = await self._ensure_session()
|
|
313
|
+
|
|
314
|
+
try:
|
|
315
|
+
async with self.session.delete(url, headers=headers, json=payload) as res:
|
|
316
|
+
if res.status != 200:
|
|
317
|
+
raise APIError(f"Failed to delete voice: {await res.text()}. For more information, visit https://waves.smallest.ai/")
|
|
318
|
+
|
|
319
|
+
return await res.text()
|
|
320
|
+
|
|
321
|
+
finally:
|
|
322
|
+
if should_cleanup and self.session:
|
|
323
|
+
await self.session.close()
|
|
324
|
+
self.session = None
|
smallest/stream_tts.py
CHANGED
|
@@ -1,7 +1,8 @@
|
|
|
1
1
|
import asyncio
|
|
2
|
+
import time
|
|
2
3
|
from threading import Thread
|
|
3
4
|
from queue import Queue, Empty
|
|
4
|
-
from typing import AsyncGenerator, Optional, Union
|
|
5
|
+
from typing import AsyncGenerator, Optional, Union, List, Dict, Any
|
|
5
6
|
|
|
6
7
|
from smallest.tts import Smallest
|
|
7
8
|
from smallest.exceptions import APIError
|
|
@@ -13,7 +14,8 @@ class TextToAudioStream:
|
|
|
13
14
|
self,
|
|
14
15
|
tts_instance: Union[Smallest, AsyncSmallest],
|
|
15
16
|
queue_timeout: Optional[float] = 5.0,
|
|
16
|
-
max_retries: Optional[int] = 3
|
|
17
|
+
max_retries: Optional[int] = 3,
|
|
18
|
+
verbose: bool = False
|
|
17
19
|
):
|
|
18
20
|
"""
|
|
19
21
|
A real-time text-to-speech processor that converts streaming text into audio output.
|
|
@@ -32,6 +34,7 @@ class TextToAudioStream:
|
|
|
32
34
|
tts_instance: The text-to-speech engine to use (Smallest or AsyncSmallest)
|
|
33
35
|
queue_timeout: How long to wait for new text (seconds, default: 5.0)
|
|
34
36
|
max_retries: Number of retry attempts for failed synthesis (default: 3)
|
|
37
|
+
verbose: Whether to log detailed metrics about TTS requests (default: False)
|
|
35
38
|
"""
|
|
36
39
|
self.tts_instance = tts_instance
|
|
37
40
|
self.tts_instance.opts.add_wav_header = False
|
|
@@ -41,6 +44,14 @@ class TextToAudioStream:
|
|
|
41
44
|
self.queue = Queue()
|
|
42
45
|
self.buffer_size = 250
|
|
43
46
|
self.stop_flag = False
|
|
47
|
+
self.verbose = verbose
|
|
48
|
+
|
|
49
|
+
# Metrics tracking
|
|
50
|
+
self.request_count = 0
|
|
51
|
+
self.request_logs: List[Dict[str, Any]] = []
|
|
52
|
+
self.start_time = 0
|
|
53
|
+
self.first_api_response_time = None
|
|
54
|
+
self.end_time = 0
|
|
44
55
|
|
|
45
56
|
if self.tts_instance.opts.model == 'lightning-large':
|
|
46
57
|
self.buffer_size = 140
|
|
@@ -48,60 +59,117 @@ class TextToAudioStream:
|
|
|
48
59
|
|
|
49
60
|
async def _stream_llm_output(self, llm_output: AsyncGenerator[str, None]) -> None:
|
|
50
61
|
"""
|
|
51
|
-
Streams the LLM output, splitting it into
|
|
52
|
-
|
|
62
|
+
Streams the LLM output, splitting it into chunks based on sentence boundaries
|
|
63
|
+
or space characters if no sentence boundary is found before reaching buffer_size.
|
|
53
64
|
|
|
54
65
|
Parameters:
|
|
55
66
|
- llm_output (AsyncGenerator[str, None]): An async generator yielding LLM output.
|
|
56
67
|
"""
|
|
57
68
|
buffer = ""
|
|
58
|
-
last_break_index = 0
|
|
59
69
|
|
|
60
70
|
async for chunk in llm_output:
|
|
61
71
|
buffer += chunk
|
|
62
|
-
|
|
63
|
-
while
|
|
64
|
-
|
|
65
|
-
|
|
66
|
-
|
|
67
|
-
|
|
68
|
-
|
|
69
|
-
|
|
70
|
-
|
|
72
|
+
|
|
73
|
+
while len(buffer) > self.buffer_size:
|
|
74
|
+
chunk_text = buffer[:self.buffer_size]
|
|
75
|
+
last_break_index = -1
|
|
76
|
+
|
|
77
|
+
# Find last sentence boundary using regex
|
|
78
|
+
for i in range(len(chunk_text) - 1, -1, -1):
|
|
79
|
+
if self.sentence_end_regex.match(chunk_text[:i + 1]):
|
|
80
|
+
last_break_index = i
|
|
81
|
+
break
|
|
82
|
+
|
|
83
|
+
if last_break_index == -1:
|
|
84
|
+
# Fallback to space if no sentence boundary found
|
|
85
|
+
last_space = chunk_text.rfind(' ')
|
|
86
|
+
if last_space != -1:
|
|
87
|
+
last_break_index = last_space
|
|
71
88
|
else:
|
|
72
|
-
|
|
73
|
-
|
|
74
|
-
|
|
75
|
-
|
|
76
|
-
|
|
77
|
-
|
|
78
|
-
|
|
89
|
+
last_break_index = self.buffer_size - 1
|
|
90
|
+
|
|
91
|
+
# Add chunk to queue and update buffer
|
|
92
|
+
self.queue.put(f'{buffer[:last_break_index + 1].replace("—", " ").strip()} ')
|
|
93
|
+
buffer = buffer[last_break_index + 1:].strip()
|
|
94
|
+
|
|
95
|
+
# Don't forget the remaining text
|
|
79
96
|
if buffer:
|
|
80
97
|
self.queue.put(f'{buffer.replace("—", " ").strip()} ')
|
|
98
|
+
|
|
81
99
|
self.stop_flag = True
|
|
82
100
|
|
|
83
101
|
|
|
84
102
|
def _synthesize_sync(self, sentence: str, retries: int = 0) -> Optional[bytes]:
|
|
85
103
|
"""Synchronously synthesizes a given sentence."""
|
|
104
|
+
request_start_time = time.time()
|
|
105
|
+
request_id = self.request_count + 1
|
|
106
|
+
|
|
86
107
|
try:
|
|
87
|
-
|
|
108
|
+
audio_content = self.tts_instance.synthesize(sentence)
|
|
109
|
+
self.request_count += 1
|
|
110
|
+
request_end_time = time.time()
|
|
111
|
+
|
|
112
|
+
if self.verbose:
|
|
113
|
+
request_duration = request_end_time - request_start_time
|
|
114
|
+
if self.first_api_response_time is None:
|
|
115
|
+
self.first_api_response_time = time.time() - self.start_time
|
|
116
|
+
|
|
117
|
+
self.request_logs.append({
|
|
118
|
+
"id": request_id,
|
|
119
|
+
"text": sentence,
|
|
120
|
+
"start_time": request_start_time - self.start_time,
|
|
121
|
+
"end_time": request_end_time - self.start_time,
|
|
122
|
+
"duration": request_duration,
|
|
123
|
+
"char_count": len(sentence),
|
|
124
|
+
"retries": retries
|
|
125
|
+
})
|
|
126
|
+
|
|
127
|
+
return audio_content
|
|
88
128
|
except APIError as e:
|
|
89
129
|
if retries < self.max_retries:
|
|
130
|
+
if self.verbose:
|
|
131
|
+
print(f"Retry {retries + 1}/{self.max_retries} for request: '{sentence[:30]}...'")
|
|
90
132
|
return self._synthesize_sync(sentence, retries + 1)
|
|
91
133
|
else:
|
|
92
|
-
|
|
134
|
+
if self.verbose:
|
|
135
|
+
print(f"Synthesis failed for sentence: {sentence} - Error: {e}. Retries Exhausted, for more information, visit https://waves.smallest.ai/")
|
|
93
136
|
return None
|
|
94
137
|
|
|
95
138
|
|
|
96
139
|
async def _synthesize_async(self, sentence: str, retries: int = 0) -> Optional[bytes]:
|
|
97
140
|
"""Asynchronously synthesizes a given sentence."""
|
|
141
|
+
request_start_time = time.time()
|
|
142
|
+
request_id = self.request_count + 1
|
|
143
|
+
|
|
98
144
|
try:
|
|
99
|
-
|
|
145
|
+
audio_content = await self.tts_instance.synthesize(sentence)
|
|
146
|
+
self.request_count += 1
|
|
147
|
+
request_end_time = time.time()
|
|
148
|
+
|
|
149
|
+
if self.verbose:
|
|
150
|
+
request_duration = request_end_time - request_start_time
|
|
151
|
+
if self.first_api_response_time is None:
|
|
152
|
+
self.first_api_response_time = time.time() - self.start_time
|
|
153
|
+
|
|
154
|
+
self.request_logs.append({
|
|
155
|
+
"id": request_id,
|
|
156
|
+
"text": sentence,
|
|
157
|
+
"start_time": request_start_time - self.start_time,
|
|
158
|
+
"end_time": request_end_time - self.start_time,
|
|
159
|
+
"duration": request_duration,
|
|
160
|
+
"char_count": len(sentence),
|
|
161
|
+
"retries": retries
|
|
162
|
+
})
|
|
163
|
+
|
|
164
|
+
return audio_content
|
|
100
165
|
except APIError as e:
|
|
101
166
|
if retries < self.max_retries:
|
|
167
|
+
if self.verbose:
|
|
168
|
+
print(f"Retry {retries + 1}/{self.max_retries} for request: '{sentence[:30]}...'")
|
|
102
169
|
return await self._synthesize_async(sentence, retries + 1)
|
|
103
170
|
else:
|
|
104
|
-
|
|
171
|
+
if self.verbose:
|
|
172
|
+
print(f"Synthesis failed for sentence: {sentence} - Error: {e}. Retries Exhausted, for more information, visit https://waves.smallest.ai/")
|
|
105
173
|
return None
|
|
106
174
|
|
|
107
175
|
|
|
@@ -112,7 +180,8 @@ class TextToAudioStream:
|
|
|
112
180
|
"""
|
|
113
181
|
while not self.stop_flag or not self.queue.empty():
|
|
114
182
|
try:
|
|
115
|
-
sentence = self.queue.
|
|
183
|
+
sentence = self.queue.get_nowait()
|
|
184
|
+
|
|
116
185
|
if isinstance(self.tts_instance, AsyncSmallest):
|
|
117
186
|
audio_content = await self._synthesize_async(sentence)
|
|
118
187
|
else:
|
|
@@ -121,10 +190,55 @@ class TextToAudioStream:
|
|
|
121
190
|
|
|
122
191
|
if audio_content:
|
|
123
192
|
yield audio_content
|
|
193
|
+
|
|
124
194
|
except Empty:
|
|
125
|
-
if
|
|
195
|
+
# Quick check if we should exit
|
|
196
|
+
if self.stop_flag and self.queue.empty():
|
|
126
197
|
break
|
|
127
|
-
|
|
198
|
+
|
|
199
|
+
# Short sleep to avoid busy-waiting
|
|
200
|
+
await asyncio.sleep(0.01) # Much shorter sleep time (10ms)
|
|
201
|
+
|
|
202
|
+
|
|
203
|
+
def _print_verbose_summary(self) -> None:
|
|
204
|
+
"""Print a summary of all metrics if verbose mode is enabled."""
|
|
205
|
+
if not self.verbose:
|
|
206
|
+
return
|
|
207
|
+
|
|
208
|
+
total_duration = self.end_time - self.start_time
|
|
209
|
+
|
|
210
|
+
print("\n" + "="*100)
|
|
211
|
+
print(f"TEXT-TO-AUDIO STREAM METRICS")
|
|
212
|
+
print("="*100)
|
|
213
|
+
|
|
214
|
+
print(f"\nOVERALL STATISTICS:")
|
|
215
|
+
print(f" Total requests made: {self.request_count}")
|
|
216
|
+
print(f" Time to first API response: {self.first_api_response_time:.3f}s")
|
|
217
|
+
print(f" Total processing time: {total_duration:.3f}s")
|
|
218
|
+
|
|
219
|
+
# Print table header
|
|
220
|
+
print("\nREQUEST DETAILS:")
|
|
221
|
+
header = f"{'#':4} {'Start (s)':10} {'End (s)':10} {'Duration (s)':12} {'Characters':15} {'Text'}"
|
|
222
|
+
print("\n" + header)
|
|
223
|
+
print("-" * 100)
|
|
224
|
+
|
|
225
|
+
# Print table rows
|
|
226
|
+
for log in self.request_logs:
|
|
227
|
+
row = (
|
|
228
|
+
f"{log['id']:4} "
|
|
229
|
+
f"{log['start_time']:10.3f} "
|
|
230
|
+
f"{log['end_time']:10.3f} "
|
|
231
|
+
f"{log['duration']:12.3f} "
|
|
232
|
+
f"{log['char_count']:15} "
|
|
233
|
+
f"{log['text'][:50]}{'...' if len(log['text']) > 50 else ''}"
|
|
234
|
+
)
|
|
235
|
+
print(row)
|
|
236
|
+
|
|
237
|
+
# Print retry information if any
|
|
238
|
+
if log['retries'] > 0:
|
|
239
|
+
print(f"{'':4} {'':10} {'':10} {'':12} {'':15} Retries: {log['retries']}")
|
|
240
|
+
|
|
241
|
+
print("\n" + "="*100)
|
|
128
242
|
|
|
129
243
|
|
|
130
244
|
async def process(self, llm_output: AsyncGenerator[str, None]) -> AsyncGenerator[bytes, None]:
|
|
@@ -144,6 +258,8 @@ class TextToAudioStream:
|
|
|
144
258
|
- Streamed over a network
|
|
145
259
|
- Further processed as needed
|
|
146
260
|
"""
|
|
261
|
+
self.start_time = time.time()
|
|
262
|
+
|
|
147
263
|
llm_thread = Thread(target=asyncio.run, args=(self._stream_llm_output(llm_output),))
|
|
148
264
|
llm_thread.start()
|
|
149
265
|
|
|
@@ -151,3 +267,6 @@ class TextToAudioStream:
|
|
|
151
267
|
yield audio_content
|
|
152
268
|
|
|
153
269
|
llm_thread.join()
|
|
270
|
+
|
|
271
|
+
self.end_time = time.time()
|
|
272
|
+
self._print_verbose_summary()
|
smallest/tts.py
CHANGED
|
@@ -3,11 +3,11 @@ import json
|
|
|
3
3
|
import wave
|
|
4
4
|
import copy
|
|
5
5
|
import requests
|
|
6
|
-
from typing import Optional, Union, List
|
|
6
|
+
from typing import Optional, Union, List, Iterator
|
|
7
7
|
|
|
8
8
|
from smallest.exceptions import TTSError, APIError
|
|
9
9
|
from smallest.utils import (TTSOptions, validate_input, preprocess_text, add_wav_header, chunk_text,
|
|
10
|
-
get_smallest_languages, get_smallest_models, API_BASE_URL)
|
|
10
|
+
get_smallest_languages, get_smallest_models, ALLOWED_AUDIO_EXTENSIONS, API_BASE_URL)
|
|
11
11
|
|
|
12
12
|
class Smallest:
|
|
13
13
|
def __init__(
|
|
@@ -17,9 +17,10 @@ class Smallest:
|
|
|
17
17
|
sample_rate: Optional[int] = 24000,
|
|
18
18
|
voice_id: Optional[str] = "emily",
|
|
19
19
|
speed: Optional[float] = 1.0,
|
|
20
|
-
|
|
21
|
-
|
|
22
|
-
|
|
20
|
+
consistency: Optional[float] = 0.5,
|
|
21
|
+
similarity: Optional[float] = 0.0,
|
|
22
|
+
enhancement: Optional[int] = 1,
|
|
23
|
+
add_wav_header: Optional[bool] = True
|
|
23
24
|
) -> None:
|
|
24
25
|
"""
|
|
25
26
|
Smallest Instance for text-to-speech synthesis.
|
|
@@ -33,9 +34,10 @@ class Smallest:
|
|
|
33
34
|
- sample_rate (int): The sample rate for the audio output.
|
|
34
35
|
- voice_id (TTSVoices): The voice to be used for synthesis.
|
|
35
36
|
- speed (float): The speed of the speech synthesis.
|
|
37
|
+
- consistency (float): This parameter controls word repetition and skipping. Decrease it to prevent skipped words, and increase it to prevent repetition. Only supported in `lightning-large` model. Range - [0, 1]
|
|
38
|
+
- similarity (float): This parameter controls the similarity between the synthesized audio and the reference audio. Increase it to make the speech more similar to the reference audio. Only supported in `lightning-large` model. Range - [0, 1]
|
|
39
|
+
- enhancement (int): Enhances speech quality at the cost of increased latency. Only supported in `lightning-large` model. Range - [0, 2].
|
|
36
40
|
- add_wav_header (bool): Whether to add a WAV header to the output audio.
|
|
37
|
-
- transliterate (bool): Whether to transliterate the text.
|
|
38
|
-
- remove_extra_silence (bool): Whether to remove extra silence from the synthesized audio.
|
|
39
41
|
|
|
40
42
|
Methods:
|
|
41
43
|
- get_languages: Returns a list of available languages for synthesis.
|
|
@@ -46,7 +48,9 @@ class Smallest:
|
|
|
46
48
|
self.api_key = api_key or os.environ.get("SMALLEST_API_KEY")
|
|
47
49
|
if not self.api_key:
|
|
48
50
|
raise TTSError()
|
|
49
|
-
|
|
51
|
+
if model == "lightning-large" and voice_id is None:
|
|
52
|
+
voice_id = "lakshya"
|
|
53
|
+
|
|
50
54
|
self.chunk_size = 250
|
|
51
55
|
|
|
52
56
|
self.opts = TTSOptions(
|
|
@@ -56,8 +60,9 @@ class Smallest:
|
|
|
56
60
|
api_key=self.api_key,
|
|
57
61
|
add_wav_header=add_wav_header,
|
|
58
62
|
speed=speed,
|
|
59
|
-
|
|
60
|
-
|
|
63
|
+
consistency=consistency,
|
|
64
|
+
similarity=similarity,
|
|
65
|
+
enhancement=enhancement
|
|
61
66
|
)
|
|
62
67
|
|
|
63
68
|
|
|
@@ -102,61 +107,81 @@ class Smallest:
|
|
|
102
107
|
def synthesize(
|
|
103
108
|
self,
|
|
104
109
|
text: str,
|
|
110
|
+
stream: Optional[bool] = False,
|
|
105
111
|
save_as: Optional[str] = None,
|
|
106
112
|
**kwargs
|
|
107
|
-
) -> Union[bytes, None]:
|
|
113
|
+
) -> Union[bytes, None, Iterator[bytes]]:
|
|
108
114
|
"""
|
|
109
115
|
Synthesize speech from the provided text.
|
|
110
116
|
|
|
111
|
-
Args:
|
|
112
117
|
- text (str): The text to be converted to speech.
|
|
113
|
-
-
|
|
118
|
+
- stream (Optional[bool]): If True, returns an iterator yielding audio chunks instead of a full byte array.
|
|
119
|
+
- save_as (Optional[str]): If provided, the synthesized audio will be saved to this file path.
|
|
114
120
|
The file must have a .wav extension.
|
|
115
121
|
- kwargs: Additional optional parameters to override `__init__` options for this call.
|
|
116
122
|
|
|
117
123
|
Returns:
|
|
118
|
-
- Union[bytes, None]:
|
|
119
|
-
|
|
124
|
+
- Union[bytes, None, Iterator[bytes]]:
|
|
125
|
+
- If `stream=True`, returns an iterator yielding audio chunks.
|
|
126
|
+
- If `save_as` is provided, saves the file and returns None.
|
|
127
|
+
- Otherwise, returns the synthesized audio content as bytes.
|
|
120
128
|
|
|
121
129
|
Raises:
|
|
122
130
|
- TTSError: If the provided file name does not have a .wav extension when `save_as` is specified.
|
|
123
131
|
- APIError: If the API request fails or returns an error.
|
|
124
132
|
"""
|
|
125
133
|
opts = copy.deepcopy(self.opts)
|
|
134
|
+
valid_keys = set(vars(opts).keys())
|
|
135
|
+
|
|
136
|
+
invalid_keys = [key for key in kwargs if key not in valid_keys]
|
|
137
|
+
if invalid_keys:
|
|
138
|
+
raise ValueError(f"Invalid parameter(s) in kwargs: {', '.join(invalid_keys)}. Allowed parameters are: {', '.join(valid_keys)}")
|
|
139
|
+
|
|
126
140
|
for key, value in kwargs.items():
|
|
127
141
|
setattr(opts, key, value)
|
|
128
142
|
|
|
129
|
-
|
|
143
|
+
text = preprocess_text(text)
|
|
144
|
+
validate_input(text, opts.model, opts.sample_rate, opts.speed, opts.consistency, opts.similarity, opts.enhancement)
|
|
130
145
|
|
|
131
146
|
self.chunk_size = 250
|
|
132
147
|
if opts.model == "lightning-large":
|
|
133
148
|
self.chunk_size = 140
|
|
134
149
|
|
|
135
150
|
chunks = chunk_text(text, self.chunk_size)
|
|
136
|
-
|
|
137
|
-
|
|
138
|
-
|
|
139
|
-
|
|
140
|
-
|
|
141
|
-
|
|
142
|
-
|
|
143
|
-
|
|
144
|
-
|
|
145
|
-
|
|
146
|
-
|
|
147
|
-
"
|
|
148
|
-
|
|
149
|
-
|
|
150
|
-
|
|
151
|
-
|
|
152
|
-
|
|
153
|
-
|
|
154
|
-
|
|
155
|
-
|
|
156
|
-
|
|
157
|
-
|
|
151
|
+
|
|
152
|
+
def audio_stream():
|
|
153
|
+
for chunk in chunks:
|
|
154
|
+
payload = {
|
|
155
|
+
"text": chunk,
|
|
156
|
+
"sample_rate": opts.sample_rate,
|
|
157
|
+
"voice_id": opts.voice_id,
|
|
158
|
+
"add_wav_header": False,
|
|
159
|
+
"speed": opts.speed,
|
|
160
|
+
}
|
|
161
|
+
|
|
162
|
+
if opts.model == "lightning-large":
|
|
163
|
+
if opts.consistency is not None:
|
|
164
|
+
payload["consistency"] = opts.consistency
|
|
165
|
+
if opts.similarity is not None:
|
|
166
|
+
payload["similarity"] = opts.similarity
|
|
167
|
+
if opts.enhancement is not None:
|
|
168
|
+
payload["enhancement"] = opts.enhancement
|
|
169
|
+
|
|
170
|
+
headers = {
|
|
171
|
+
"Authorization": f"Bearer {self.api_key}",
|
|
172
|
+
"Content-Type": "application/json",
|
|
173
|
+
}
|
|
174
|
+
|
|
175
|
+
res = requests.post(f"{API_BASE_URL}/{opts.model}/get_speech", json=payload, headers=headers)
|
|
176
|
+
if res.status_code != 200:
|
|
177
|
+
raise APIError(f"Failed to synthesize speech: {res.text}. Please check if you have set the correct API key. For more information, visit https://waves.smallest.ai/")
|
|
178
|
+
|
|
179
|
+
yield res.content
|
|
158
180
|
|
|
159
|
-
|
|
181
|
+
if stream:
|
|
182
|
+
return audio_stream()
|
|
183
|
+
|
|
184
|
+
audio_content = b"".join(audio_stream())
|
|
160
185
|
|
|
161
186
|
if save_as:
|
|
162
187
|
if not save_as.endswith(".wav"):
|
|
@@ -193,7 +218,6 @@ class Smallest:
|
|
|
193
218
|
if not os.path.isfile(file_path):
|
|
194
219
|
raise TTSError("Invalid file path. File does not exist.")
|
|
195
220
|
|
|
196
|
-
ALLOWED_AUDIO_EXTENSIONS = ['.mp3', '.wav']
|
|
197
221
|
file_extension = os.path.splitext(file_path)[1].lower()
|
|
198
222
|
if file_extension not in ALLOWED_AUDIO_EXTENSIONS:
|
|
199
223
|
raise TTSError(f"Invalid file type. Supported formats are: {ALLOWED_AUDIO_EXTENSIONS}")
|
|
@@ -212,3 +236,30 @@ class Smallest:
|
|
|
212
236
|
raise APIError(f"Failed to add voice: {response.text}. For more information, visit https://waves.smallest.ai/")
|
|
213
237
|
|
|
214
238
|
return json.dumps(response.json(), indent=4, ensure_ascii=False)
|
|
239
|
+
|
|
240
|
+
|
|
241
|
+
def delete_voice(self, voice_id: str) -> str:
|
|
242
|
+
"""
|
|
243
|
+
Delete a cloned voice synchronously.
|
|
244
|
+
|
|
245
|
+
Args:
|
|
246
|
+
- voice_id (str): The ID of the voice to be deleted.
|
|
247
|
+
|
|
248
|
+
Returns:
|
|
249
|
+
- str: The response from the API.
|
|
250
|
+
|
|
251
|
+
Raises:
|
|
252
|
+
- APIError: If the API request fails or returns an error.
|
|
253
|
+
"""
|
|
254
|
+
url = f"{API_BASE_URL}/lightning-large"
|
|
255
|
+
payload = {'voiceId': voice_id}
|
|
256
|
+
|
|
257
|
+
headers = {
|
|
258
|
+
'Authorization': f"Bearer {self.api_key}",
|
|
259
|
+
}
|
|
260
|
+
|
|
261
|
+
response = requests.delete(url, headers=headers, json=payload)
|
|
262
|
+
if response.status_code != 200:
|
|
263
|
+
raise APIError(f"Failed to delete voice: {response.text}. For more information, visit https://waves.smallest.ai/")
|
|
264
|
+
|
|
265
|
+
return json.dumps(response.json(), indent=4, ensure_ascii=False)
|
smallest/utils.py
CHANGED
|
@@ -1,9 +1,9 @@
|
|
|
1
1
|
import re
|
|
2
2
|
import io
|
|
3
3
|
from typing import List
|
|
4
|
+
from typing import Optional
|
|
4
5
|
from pydub import AudioSegment
|
|
5
6
|
from dataclasses import dataclass
|
|
6
|
-
from sacremoses import MosesPunctNormalizer
|
|
7
7
|
|
|
8
8
|
from smallest.exceptions import ValidationError
|
|
9
9
|
from smallest.models import TTSModels, TTSLanguages
|
|
@@ -11,9 +11,9 @@ from smallest.models import TTSModels, TTSLanguages
|
|
|
11
11
|
|
|
12
12
|
API_BASE_URL = "https://waves-api.smallest.ai/api/v1"
|
|
13
13
|
SENTENCE_END_REGEX = re.compile(r'.*[-.—!?,;:…।|]$')
|
|
14
|
-
mpn = MosesPunctNormalizer()
|
|
15
14
|
SAMPLE_WIDTH = 2
|
|
16
15
|
CHANNELS = 1
|
|
16
|
+
ALLOWED_AUDIO_EXTENSIONS = ['.mp3', '.wav']
|
|
17
17
|
|
|
18
18
|
|
|
19
19
|
@dataclass
|
|
@@ -24,11 +24,12 @@ class TTSOptions:
|
|
|
24
24
|
api_key: str
|
|
25
25
|
add_wav_header: bool
|
|
26
26
|
speed: float
|
|
27
|
-
|
|
28
|
-
|
|
27
|
+
consistency: float
|
|
28
|
+
similarity: float
|
|
29
|
+
enhancement: int
|
|
29
30
|
|
|
30
31
|
|
|
31
|
-
def validate_input(text: str, model: str, sample_rate: int, speed: float):
|
|
32
|
+
def validate_input(text: str, model: str, sample_rate: int, speed: float, consistency: Optional[float] = None, similarity: Optional[float] = None, enhancement: Optional[int] = None):
|
|
32
33
|
if not text:
|
|
33
34
|
raise ValidationError("Text cannot be empty.")
|
|
34
35
|
if model not in TTSModels:
|
|
@@ -37,29 +38,29 @@ def validate_input(text: str, model: str, sample_rate: int, speed: float):
|
|
|
37
38
|
raise ValidationError(f"Invalid sample rate: {sample_rate}. Must be between 8000 and 24000")
|
|
38
39
|
if not 0.5 <= speed <= 2.0:
|
|
39
40
|
raise ValidationError(f"Invalid speed: {speed}. Must be between 0.5 and 2.0")
|
|
41
|
+
if consistency is not None and not 0.0 <= consistency <= 1.0:
|
|
42
|
+
raise ValidationError(f"Invalid consistency: {consistency}. Must be between 0.0 and 1.0")
|
|
43
|
+
if similarity is not None and not 0.0 <= similarity <= 1.0:
|
|
44
|
+
raise ValidationError(f"Invalid similarity: {similarity}. Must be between 0.0 and 1.0")
|
|
45
|
+
if enhancement is not None and not 0 <= enhancement <= 2:
|
|
46
|
+
raise ValidationError(f"Invalid enhancement: {enhancement}. Must be between 0 and 2.")
|
|
40
47
|
|
|
41
48
|
|
|
42
49
|
def add_wav_header(frame_input: bytes, sample_rate: int = 24000, sample_width: int = 2, channels: int = 1) -> bytes:
|
|
43
|
-
|
|
44
|
-
|
|
45
|
-
|
|
46
|
-
|
|
47
|
-
|
|
50
|
+
audio = AudioSegment(data=frame_input, sample_width=sample_width, frame_rate=sample_rate, channels=channels)
|
|
51
|
+
wav_buf = io.BytesIO()
|
|
52
|
+
audio.export(wav_buf, format="wav")
|
|
53
|
+
wav_buf.seek(0)
|
|
54
|
+
return wav_buf.read()
|
|
48
55
|
|
|
49
56
|
|
|
50
57
|
def preprocess_text(text: str) -> str:
|
|
51
58
|
text = text.replace("\n", " ").replace("\t", " ").replace("—", " ").replace("-", " ").replace("–", " ")
|
|
52
59
|
text = re.sub(r'\s+', ' ', text)
|
|
53
|
-
text = mpn.normalize(text)
|
|
54
60
|
return text.strip()
|
|
55
61
|
|
|
56
62
|
|
|
57
63
|
def chunk_text(text: str, chunk_size: int = 250) -> List[str]:
|
|
58
|
-
"""
|
|
59
|
-
Splits the input text into chunks based on sentence boundaries
|
|
60
|
-
defined by SENTENCE_END_REGEX and the maximum chunk size.
|
|
61
|
-
Only splits at valid sentence boundaries to avoid breaking words.
|
|
62
|
-
"""
|
|
63
64
|
chunks = []
|
|
64
65
|
while text:
|
|
65
66
|
if len(text) <= chunk_size:
|
|
@@ -1,6 +1,6 @@
|
|
|
1
1
|
Metadata-Version: 2.2
|
|
2
2
|
Name: smallestai
|
|
3
|
-
Version: 2.0.0
|
|
3
|
+
Version: 2.2.0
|
|
4
4
|
Summary: Official Python client for the Smallest AI API
|
|
5
5
|
Author-email: Smallest <support@smallest.ai>
|
|
6
6
|
License: MIT
|
|
@@ -15,7 +15,6 @@ License-File: LICENSE
|
|
|
15
15
|
Requires-Dist: aiohttp
|
|
16
16
|
Requires-Dist: aiofiles
|
|
17
17
|
Requires-Dist: requests
|
|
18
|
-
Requires-Dist: sacremoses
|
|
19
18
|
Requires-Dist: pydub
|
|
20
19
|
Provides-Extra: test
|
|
21
20
|
Requires-Dist: jiwer; extra == "test"
|
|
@@ -59,8 +58,11 @@ Currently, the library supports direct synthesis and the ability to synthesize s
|
|
|
59
58
|
- [Asynchronous](#asynchronous)
|
|
60
59
|
- [LLM to Speech](#llm-to-speech)
|
|
61
60
|
- [Add your Voice](#add-your-voice)
|
|
62
|
-
- [Synchronously](#synchronously)
|
|
63
|
-
- [Asynchronously](#asynchronously)
|
|
61
|
+
- [Synchronously](#add-synchronously)
|
|
62
|
+
- [Asynchronously](#add-asynchronously)
|
|
63
|
+
- [Delete your Voice](#delete-your-voice)
|
|
64
|
+
- [Synchronously](#delete-synchronously)
|
|
65
|
+
- [Asynchronously](#delete-asynchronously)
|
|
64
66
|
- [Available Methods](#available-methods)
|
|
65
67
|
- [Technical Note: WAV Headers in Streaming Audio](#technical-note-wav-headers-in-streaming-audio)
|
|
66
68
|
|
|
@@ -80,14 +82,6 @@ When using an SDK in your application, make sure to pin to at least the major ve
|
|
|
80
82
|
3. Create a new API Key and copy it.
|
|
81
83
|
4. Export the API Key in your environment with the name `SMALLEST_API_KEY`, ensuring that your application can access it securely for authentication.
|
|
82
84
|
|
|
83
|
-
## Best Practices for Input Text
|
|
84
|
-
While the `transliterate` parameter is provided, please note that it is not fully supported and may not perform consistently across all cases. It is recommended to use the model without relying on this parameter.
|
|
85
|
-
|
|
86
|
-
For optimal voice generation results:
|
|
87
|
-
|
|
88
|
-
1. For English, provide the input in Latin script (e.g., "Hello, how are you?").
|
|
89
|
-
2. For Hindi, provide the input in Devanagari script (e.g., "नमस्ते, आप कैसे हैं?").
|
|
90
|
-
3. For code-mixed input, use Latin script for English and Devanagari script for Hindi (e.g., "Hello, आप कैसे हैं?").
|
|
91
85
|
|
|
92
86
|
## Examples
|
|
93
87
|
|
|
@@ -115,9 +109,10 @@ if __name__ == "__main__":
|
|
|
115
109
|
- `sample_rate`: Audio sample rate (default: 24000)
|
|
116
110
|
- `voice_id`: Voice ID (default: "emily")
|
|
117
111
|
- `speed`: Speech speed multiplier (default: 1.0)
|
|
118
|
-
- `
|
|
119
|
-
- `
|
|
120
|
-
- `
|
|
112
|
+
- `consistency`: Controls word repetition and skipping. Decrease it to prevent skipped words, and increase it to prevent repetition. Only supported in `lightning-large` model. (default: 0.5)
|
|
113
|
+
- `similarity`: Controls the similarity between the synthesized audio and the reference audio. Increase it to make the speech more similar to the reference audio. Only supported in `lightning-large` model. (default: 0)
|
|
114
|
+
- `enhancement`: Enhances speech quality at the cost of increased latency. Only supported in `lightning-large` model. (default: 1)
|
|
115
|
+
- `add_wav_header`: Whether to add a WAV header to the output audio.
|
|
121
116
|
|
|
122
117
|
These parameters are part of the `Smallest` instance. They can be set when creating the instance (as shown above). However, the `synthesize` function also accepts `kwargs`, allowing you to override these parameters for a specific synthesis request.
|
|
123
118
|
|
|
@@ -141,9 +136,8 @@ import asyncio
|
|
|
141
136
|
import aiofiles
|
|
142
137
|
from smallest import AsyncSmallest
|
|
143
138
|
|
|
144
|
-
client = AsyncSmallest(api_key="SMALLEST_API_KEY")
|
|
145
|
-
|
|
146
139
|
async def main():
|
|
140
|
+
client = AsyncSmallest(api_key="SMALLEST_API_KEY")
|
|
147
141
|
async with client as tts:
|
|
148
142
|
audio_bytes = await tts.synthesize("Hello, this is a test of the async synthesis function.")
|
|
149
143
|
async with aiofiles.open("async_synthesize.wav", "wb") as f:
|
|
@@ -153,15 +147,33 @@ if __name__ == "__main__":
|
|
|
153
147
|
asyncio.run(main())
|
|
154
148
|
```
|
|
155
149
|
|
|
150
|
+
**Running Asynchronously in a Jupyter Notebook**
|
|
151
|
+
If you are using a Jupyter Notebook, use the following approach to execute the asynchronous function within an existing event loop:
|
|
152
|
+
```python
|
|
153
|
+
import asyncio
|
|
154
|
+
import aiofiles
|
|
155
|
+
from smallest import AsyncSmallest
|
|
156
|
+
|
|
157
|
+
async def main():
|
|
158
|
+
client = AsyncSmallest(api_key="SMALLEST_API_KEY")
|
|
159
|
+
async with client as tts:
|
|
160
|
+
audio_bytes = await tts.synthesize("Hello, this is a test of the async synthesis function.")
|
|
161
|
+
async with aiofiles.open("async_synthesize.wav", "wb") as f:
|
|
162
|
+
await f.write(audio_bytes) # alternatively you can use the `save_as` parameter.
|
|
163
|
+
|
|
164
|
+
await main()
|
|
165
|
+
```
|
|
166
|
+
|
|
156
167
|
**Parameters:**
|
|
157
168
|
- `api_key`: Your API key (can be set via SMALLEST_API_KEY environment variable)
|
|
158
169
|
- `model`: TTS model to use (default: "lightning")
|
|
159
170
|
- `sample_rate`: Audio sample rate (default: 24000)
|
|
160
171
|
- `voice_id`: Voice ID (default: "emily")
|
|
161
172
|
- `speed`: Speech speed multiplier (default: 1.0)
|
|
162
|
-
- `
|
|
163
|
-
- `
|
|
164
|
-
- `
|
|
173
|
+
- `consistency`: Controls word repetition and skipping. Decrease it to prevent skipped words, and increase it to prevent repetition. Only supported in `lightning-large` model.
|
|
174
|
+
- `similarity`: Controls the similarity between the synthesized audio and the reference audio. Increase it to make the speech more similar to the reference audio. Only supported in `lightning-large` model.
|
|
175
|
+
- `enhancement`: Enhances speech quality at the cost of increased latency. Only supported in `lightning-large` model.
|
|
176
|
+
- `add_wav_header`: Whether to add a WAV header to the output audio.
|
|
165
177
|
|
|
166
178
|
These parameters are part of the `AsyncSmallest` instance. They can be set when creating the instance (as shown above). However, the `synthesize` function also accepts `kwargs`, allowing you to override any of these parameters on a per-request basis.
|
|
167
179
|
|
|
@@ -178,6 +190,58 @@ audio_bytes = await tts.synthesize(
|
|
|
178
190
|
|
|
179
191
|
The `TextToAudioStream` class provides real-time text-to-speech processing, converting streaming text into audio output. It's particularly useful for applications like voice assistants, live captioning, or interactive chatbots that require immediate audio feedback from text generation. It supports both synchronous and asynchronous TTS instances.
|
|
180
192
|
|
|
193
|
+
#### Stream through a WebSocket
|
|
194
|
+
|
|
195
|
+
```python
|
|
196
|
+
import asyncio
|
|
197
|
+
import websockets
|
|
198
|
+
from groq import Groq
|
|
199
|
+
from smallest import Smallest, TextToAudioStream
|
|
200
|
+
|
|
201
|
+
# Initialize Groq (LLM) and Smallest (TTS) instances
|
|
202
|
+
llm = Groq(api_key="GROQ_API_KEY")
|
|
203
|
+
tts = Smallest(api_key="SMALLEST_API_KEY")
|
|
204
|
+
WEBSOCKET_URL = "wss://echo.websocket.events" # Mock WebSocket server
|
|
205
|
+
|
|
206
|
+
# Async function to stream text generation from LLM
|
|
207
|
+
async def generate_text(prompt):
|
|
208
|
+
completion = llm.chat.completions.create(
|
|
209
|
+
messages=[{"role": "user", "content": prompt}],
|
|
210
|
+
model="llama3-8b-8192",
|
|
211
|
+
stream=True,
|
|
212
|
+
)
|
|
213
|
+
|
|
214
|
+
# Yield text as it is generated
|
|
215
|
+
for chunk in completion:
|
|
216
|
+
text = chunk.choices[0].delta.content
|
|
217
|
+
if text:
|
|
218
|
+
yield text
|
|
219
|
+
|
|
220
|
+
# Main function to run the process
|
|
221
|
+
async def main():
|
|
222
|
+
# Initialize the TTS processor
|
|
223
|
+
processor = TextToAudioStream(tts_instance=tts)
|
|
224
|
+
|
|
225
|
+
# Generate text from LLM
|
|
226
|
+
llm_output = generate_text("Explain text to speech like I am five in 5 sentences.")
|
|
227
|
+
|
|
228
|
+
# Stream the generated speech through a websocket
|
|
229
|
+
async with websockets.connect(WEBSOCKET_URL) as ws:
|
|
230
|
+
print("Connected to WebSocket server.")
|
|
231
|
+
|
|
232
|
+
# Stream the generated speech
|
|
233
|
+
async for audio_chunk in processor.process(llm_output):
|
|
234
|
+
await ws.send(audio_chunk) # Send audio chunk
|
|
235
|
+
echoed_data = await ws.recv() # Receive the echoed message
|
|
236
|
+
print("Received from server:", echoed_data[:20], "...") # Print first 20 bytes
|
|
237
|
+
|
|
238
|
+
print("WebSocket connection closed.")
|
|
239
|
+
|
|
240
|
+
if __name__ == "__main__":
|
|
241
|
+
asyncio.run(main())
|
|
242
|
+
```
|
|
243
|
+
|
|
244
|
+
#### Save to a File
|
|
181
245
|
```python
|
|
182
246
|
import wave
|
|
183
247
|
import asyncio
|
|
@@ -245,12 +309,12 @@ The processor yields raw audio data chunks without WAV headers for streaming eff
|
|
|
245
309
|
## Add your Voice
|
|
246
310
|
The Smallest AI SDK allows you to clone your voice by uploading an audio file. This feature is available both synchronously and asynchronously, making it flexible for different use cases. Below are examples of how to use this functionality.
|
|
247
311
|
|
|
248
|
-
### Synchronously
|
|
312
|
+
### Add Synchronously
|
|
249
313
|
```python
|
|
250
314
|
from smallest import Smallest
|
|
251
315
|
|
|
252
316
|
def main():
|
|
253
|
-
client = Smallest(api_key="
|
|
317
|
+
client = Smallest(api_key="SMALLEST_API_KEY")
|
|
254
318
|
res = client.add_voice(display_name="My Voice", file_path="my_voice.wav")
|
|
255
319
|
print(res)
|
|
256
320
|
|
|
@@ -258,13 +322,13 @@ if __name__ == "__main__":
|
|
|
258
322
|
main()
|
|
259
323
|
```
|
|
260
324
|
|
|
261
|
-
### Asynchronously
|
|
325
|
+
### Add Asynchronously
|
|
262
326
|
```python
|
|
263
327
|
import asyncio
|
|
264
328
|
from smallest import AsyncSmallest
|
|
265
329
|
|
|
266
330
|
async def main():
|
|
267
|
-
client = AsyncSmallest(api_key="
|
|
331
|
+
client = AsyncSmallest(api_key="SMALLEST_API_KEY")
|
|
268
332
|
res = await client.add_voice(display_name="My Voice", file_path="my_voice.wav")
|
|
269
333
|
print(res)
|
|
270
334
|
|
|
@@ -272,6 +336,36 @@ if __name__ == "__main__":
|
|
|
272
336
|
asyncio.run(main())
|
|
273
337
|
```
|
|
274
338
|
|
|
339
|
+
## Delete your Voice
|
|
340
|
+
The Smallest AI SDK allows you to delete your cloned voice. This feature is available both synchronously and asynchronously, making it flexible for different use cases. Below are examples of how to use this functionality.
|
|
341
|
+
|
|
342
|
+
### Delete Synchronously
|
|
343
|
+
```python
|
|
344
|
+
from smallest import Smallest
|
|
345
|
+
|
|
346
|
+
def main():
|
|
347
|
+
client = Smallest(api_key="SMALLEST_API_KEY")
|
|
348
|
+
res = client.delete_voice(voice_id="voice_id")
|
|
349
|
+
print(res)
|
|
350
|
+
|
|
351
|
+
if __name__ == "__main__":
|
|
352
|
+
main()
|
|
353
|
+
```
|
|
354
|
+
|
|
355
|
+
### Delete Asynchronously
|
|
356
|
+
```python
|
|
357
|
+
import asyncio
|
|
358
|
+
from smallest import AsyncSmallest
|
|
359
|
+
|
|
360
|
+
async def main():
|
|
361
|
+
client = AsyncSmallest(api_key="SMALLEST_API_KEY")
|
|
362
|
+
res = await client.delete_voice(voice_id="voice_id")
|
|
363
|
+
print(res)
|
|
364
|
+
|
|
365
|
+
if __name__ == "__main__":
|
|
366
|
+
asyncio.run(main())
|
|
367
|
+
```
|
|
368
|
+
|
|
275
369
|
## Available Methods
|
|
276
370
|
|
|
277
371
|
```python
|
|
@@ -0,0 +1,12 @@
|
|
|
1
|
+
smallest/__init__.py,sha256=vaoIBml_IobavpVvFazB86iikg2iEy4h3ddxqv_0Fy4,190
|
|
2
|
+
smallest/async_tts.py,sha256=fyl1yBd4uqD2KthZMdnfsiY9ZlQlMXDK2JCWmjR03I4,12639
|
|
3
|
+
smallest/exceptions.py,sha256=nY6I8fCXe2By54CytQ0-i3hFiYtt8TYAKj0g6OYsCjc,585
|
|
4
|
+
smallest/models.py,sha256=g2e_4nU5P48vyXZandKLWqZC1TkoEGeLvYKqJIqurSI,83
|
|
5
|
+
smallest/stream_tts.py,sha256=MuQSOgOsZEAYcy-Hbs-ZhCNmtn0u3v9tfOk1RbfAsvY,10893
|
|
6
|
+
smallest/tts.py,sha256=_0OG-1DU0Fx3ZeVlJpNGk3fz6ZceaMfvb5ktkEH3tMw,10721
|
|
7
|
+
smallest/utils.py,sha256=7N4Pghv-6FQENdvWArxGpAuUF5xvcEJm2OxejJTIYnM,3349
|
|
8
|
+
smallestai-2.2.0.dist-info/LICENSE,sha256=kK3HNKhN7luQhkjkNWIvy9_gizbEDUM4mSv_HWq9uuM,1068
|
|
9
|
+
smallestai-2.2.0.dist-info/METADATA,sha256=-02Yij0bHSd6l1PKjJkZyuj7D5Zc5-fgZCeb5cA8T5c,14878
|
|
10
|
+
smallestai-2.2.0.dist-info/WHEEL,sha256=52BFRY2Up02UkjOa29eZOS2VxUrpPORXg1pkohGGUS8,91
|
|
11
|
+
smallestai-2.2.0.dist-info/top_level.txt,sha256=i5ktbWkG-2aS28vrYTeuhKtA-tY2ZG7SHgLHi87QTLw,9
|
|
12
|
+
smallestai-2.2.0.dist-info/RECORD,,
|
|
@@ -1,12 +0,0 @@
|
|
|
1
|
-
smallest/__init__.py,sha256=vaoIBml_IobavpVvFazB86iikg2iEy4h3ddxqv_0Fy4,190
|
|
2
|
-
smallest/async_tts.py,sha256=5qW7owlMeSWFx0rpn9dYfbO76mmNY0DXcytNjLfbbz8,9727
|
|
3
|
-
smallest/exceptions.py,sha256=nY6I8fCXe2By54CytQ0-i3hFiYtt8TYAKj0g6OYsCjc,585
|
|
4
|
-
smallest/models.py,sha256=g2e_4nU5P48vyXZandKLWqZC1TkoEGeLvYKqJIqurSI,83
|
|
5
|
-
smallest/stream_tts.py,sha256=SeP9A9zXJWiV62Eezv0L1J5sRIR304Llc_mwVtOOSUI,6348
|
|
6
|
-
smallest/tts.py,sha256=xBBEk_byRPGT6SYkE6qvhfEupgHl6XBdAqtxmzw2rF8,8311
|
|
7
|
-
smallest/utils.py,sha256=FCZkvbbHJBoN0jpBSqmt1hJjvks56t8i82we4XnqjYk,3016
|
|
8
|
-
smallestai-2.0.0.dist-info/LICENSE,sha256=kK3HNKhN7luQhkjkNWIvy9_gizbEDUM4mSv_HWq9uuM,1068
|
|
9
|
-
smallestai-2.0.0.dist-info/METADATA,sha256=EIyZZqzAvHgQ7jfEs5x5LUx3HjzoCUhzJoXfkb3CuoI,11538
|
|
10
|
-
smallestai-2.0.0.dist-info/WHEEL,sha256=In9FTNxeP60KnTkGw7wk6mJPYd_dQSjEZmXdBdMCI-8,91
|
|
11
|
-
smallestai-2.0.0.dist-info/top_level.txt,sha256=i5ktbWkG-2aS28vrYTeuhKtA-tY2ZG7SHgLHi87QTLw,9
|
|
12
|
-
smallestai-2.0.0.dist-info/RECORD,,
|
|
File without changes
|
|
File without changes
|