smallestai 2.1.0__tar.gz → 2.2.0__tar.gz
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- {smallestai-2.1.0 → smallestai-2.2.0}/PKG-INFO +1 -2
- {smallestai-2.1.0 → smallestai-2.2.0}/pyproject.toml +1 -2
- {smallestai-2.1.0 → smallestai-2.2.0}/smallest/async_tts.py +57 -44
- {smallestai-2.1.0 → smallestai-2.2.0}/smallest/stream_tts.py +129 -18
- {smallestai-2.1.0 → smallestai-2.2.0}/smallest/tts.py +55 -43
- {smallestai-2.1.0 → smallestai-2.2.0}/smallest/utils.py +6 -6
- {smallestai-2.1.0 → smallestai-2.2.0}/smallestai.egg-info/PKG-INFO +1 -2
- {smallestai-2.1.0 → smallestai-2.2.0}/smallestai.egg-info/requires.txt +0 -1
- {smallestai-2.1.0 → smallestai-2.2.0}/LICENSE +0 -0
- {smallestai-2.1.0 → smallestai-2.2.0}/README.md +0 -0
- {smallestai-2.1.0 → smallestai-2.2.0}/setup.cfg +0 -0
- {smallestai-2.1.0 → smallestai-2.2.0}/smallest/__init__.py +0 -0
- {smallestai-2.1.0 → smallestai-2.2.0}/smallest/exceptions.py +0 -0
- {smallestai-2.1.0 → smallestai-2.2.0}/smallest/models.py +0 -0
- {smallestai-2.1.0 → smallestai-2.2.0}/smallestai.egg-info/SOURCES.txt +0 -0
- {smallestai-2.1.0 → smallestai-2.2.0}/smallestai.egg-info/dependency_links.txt +0 -0
- {smallestai-2.1.0 → smallestai-2.2.0}/smallestai.egg-info/top_level.txt +0 -0
- {smallestai-2.1.0 → smallestai-2.2.0}/tests/test_async.py +0 -0
- {smallestai-2.1.0 → smallestai-2.2.0}/tests/test_sync.py +0 -0
- {smallestai-2.1.0 → smallestai-2.2.0}/tests/test_utils.py +0 -0
{smallestai-2.1.0 → smallestai-2.2.0}/PKG-INFO

@@ -1,6 +1,6 @@
 Metadata-Version: 2.2
 Name: smallestai
-Version: 2.1.0
+Version: 2.2.0
 Summary: Official Python client for the Smallest AI API
 Author-email: Smallest <support@smallest.ai>
 License: MIT
@@ -15,7 +15,6 @@ License-File: LICENSE
 Requires-Dist: aiohttp
 Requires-Dist: aiofiles
 Requires-Dist: requests
-Requires-Dist: sacremoses
 Requires-Dist: pydub
 Provides-Extra: test
 Requires-Dist: jiwer; extra == "test"
{smallestai-2.1.0 → smallestai-2.2.0}/pyproject.toml

@@ -1,6 +1,6 @@
 [project]
 name = "smallestai"
-version = "2.1.0"
+version = "2.2.0"
 description = "Official Python client for the Smallest AI API"
 authors = [
     {name = "Smallest", email = "support@smallest.ai"},
@@ -18,7 +18,6 @@ dependencies = [
     "aiohttp",
     "aiofiles",
     "requests",
-    "sacremoses",
     "pydub"
 ]

{smallestai-2.1.0 → smallestai-2.2.0}/smallest/async_tts.py

@@ -4,7 +4,7 @@ import json
 import aiohttp
 import aiofiles
 import requests
-from typing import Optional, Union, List
+from typing import Optional, Union, List, AsyncIterator

 from smallest.exceptions import TTSError, APIError
 from smallest.utils import (TTSOptions, validate_input, preprocess_text, add_wav_header, chunk_text,
@@ -19,6 +19,9 @@ class AsyncSmallest:
         sample_rate: Optional[int] = 24000,
         voice_id: Optional[str] = "emily",
         speed: Optional[float] = 1.0,
+        consistency: Optional[float] = 0.5,
+        similarity: Optional[float] = 0.0,
+        enhancement: Optional[int] = 1,
         add_wav_header: Optional[bool] = True
     ) -> None:
         """
@@ -34,6 +37,9 @@ class AsyncSmallest:
         - sample_rate (int): The sample rate for the audio output.
         - voice_id (TTSVoices): The voice to be used for synthesis.
         - speed (float): The speed of the speech synthesis.
+        - consistency (float): This parameter controls word repetition and skipping. Decrease it to prevent skipped words, and increase it to prevent repetition. Only supported in `lightning-large` model. Range - [0, 1]
+        - similarity (float): This parameter controls the similarity between the synthesized audio and the reference audio. Increase it to make the speech more similar to the reference audio. Only supported in `lightning-large` model. Range - [0, 1]
+        - enhancement (int): Enhances speech quality at the cost of increased latency. Only supported in `lightning-large` model. Range - [0, 2].
         - add_wav_header (bool): Whether to add a WAV header to the output audio.

         Methods:
@@ -45,7 +51,7 @@ class AsyncSmallest:
         self.api_key = api_key or os.environ.get("SMALLEST_API_KEY")
         if not self.api_key:
             raise TTSError()
-        if model == "lightning-large":
+        if model == "lightning-large" and voice_id is None:
             voice_id = "lakshya"

         self.chunk_size = 250
@@ -56,7 +62,10 @@ class AsyncSmallest:
             voice_id=voice_id,
             api_key=self.api_key,
             add_wav_header=add_wav_header,
-            speed=speed
+            speed=speed,
+            consistency=consistency,
+            similarity=similarity,
+            enhancement=enhancement
         )
         self.session = None

@@ -121,27 +130,25 @@ class AsyncSmallest:
     async def synthesize(
         self,
         text: str,
-        … (removed line not shown in the diff view)
-        similarity: Optional[float] = 0,
-        enhancement: Optional[bool] = False,
+        stream: Optional[bool] = False,
         save_as: Optional[str] = None,
         **kwargs
-    ) -> Union[bytes, None]:
+    ) -> Union[bytes, None, AsyncIterator[bytes]]:
         """
         Asynchronously synthesize speech from the provided text.

         Args:
         - text (str): The text to be converted to speech.
+        - stream (Optional[bool]): If True, returns an iterator yielding audio chunks instead of a full byte array.
         - save_as (Optional[str]): If provided, the synthesized audio will be saved to this file path.
             The file must have a .wav extension.
-        - consistency (Optional[float]): This parameter controls word repetition and skipping. Decrease it to prevent skipped words, and increase it to prevent repetition. Only supported in `lightning-large` model.
-        - similarity (Optional[float]): This parameter controls the similarity between the synthesized audio and the reference audio. Increase it to make the speech more similar to the reference audio. Only supported in `lightning-large` model.
-        - enhancement (Optional[bool]): Enhances speech quality at the cost of increased latency. Only supported in `lightning-large` model.
         - kwargs: Additional optional parameters to override `__init__` options for this call.

         Returns:
-        - Union[bytes, None]:
-        … (removed line not shown in the diff view)
+        - Union[bytes, None, Iterator[bytes]]:
+            - If `stream=True`, returns an iterator yielding audio chunks.
+            - If `save_as` is provided, saves the file and returns None.
+            - Otherwise, returns the synthesized audio content as bytes.

         Raises:
         - TTSError: If the provided file name does not have a .wav extension when `save_as` is specified.
@@ -165,44 +172,50 @@ class AsyncSmallest:
         for key, value in kwargs.items():
             setattr(opts, key, value)

-        … (removed line not shown in the diff view)
+        text = preprocess_text(text)
+        validate_input(text, opts.model, opts.sample_rate, opts.speed, opts.consistency, opts.similarity, opts.enhancement)

         self.chunk_size = 250
         if opts.model == 'lightning-large':
             self.chunk_size = 140

         chunks = chunk_text(text, self.chunk_size)
-        … (31 removed lines not shown in the diff view)
+
+        async def audio_stream():
+            for chunk in chunks:
+                payload = {
+                    "text": chunk,
+                    "sample_rate": opts.sample_rate,
+                    "voice_id": opts.voice_id,
+                    "add_wav_header": False,
+                    "speed": opts.speed,
+                    "model": opts.model
+                }
+
+                if opts.model == "lightning-large":
+                    if opts.consistency is not None:
+                        payload["consistency"] = opts.consistency
+                    if opts.similarity is not None:
+                        payload["similarity"] = opts.similarity
+                    if opts.enhancement is not None:
+                        payload["enhancement"] = opts.enhancement
+
+                headers = {
+                    "Authorization": f"Bearer {self.api_key}",
+                    "Content-Type": "application/json",
+                }
+
+                async with self.session.post(f"{API_BASE_URL}/{opts.model}/get_speech", json=payload, headers=headers) as res:
+                    if res.status != 200:
+                        raise APIError(f"Failed to synthesize speech: {await res.text()}. For more information, visit https://waves.smallest.ai/")
+
+                    yield await res.read()
+
+        if stream:
+            return audio_stream()
+
+        audio_content = b"".join([chunk async for chunk in audio_stream()])

         if save_as:
             if not save_as.endswith(".wav"):
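The reworked AsyncSmallest.synthesize() above can now hand back an async iterator of raw audio chunks when stream=True, alongside the new consistency/similarity/enhancement options. A minimal usage sketch based on the 2.2.0 signatures shown in this diff; the import path, the async-context-manager usage, and the output file name are assumptions carried over from earlier releases, not something this diff confirms:

```python
import asyncio
from smallest import AsyncSmallest  # import path assumed from the smallest/ package layout above


async def main():
    # consistency, similarity and enhancement are the new 2.2.0 knobs; per the
    # diff they are only forwarded to the API for the lightning-large model.
    tts = AsyncSmallest(
        model="lightning-large",
        voice_id="lakshya",
        consistency=0.5,
        similarity=0.0,
        enhancement=1,
    )  # API key is read from SMALLEST_API_KEY if not passed explicitly

    async with tts:  # assumed: opens the aiohttp session that synthesize() posts through
        # stream=True returns an AsyncIterator[bytes] of raw, headerless audio chunks
        chunks = await tts.synthesize("Hello from smallestai 2.2.0.", stream=True)
        with open("hello.pcm", "wb") as f:
            async for chunk in chunks:
                f.write(chunk)


asyncio.run(main())
```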
{smallestai-2.1.0 → smallestai-2.2.0}/smallest/stream_tts.py

@@ -1,7 +1,8 @@
 import asyncio
+import time
 from threading import Thread
 from queue import Queue, Empty
-from typing import AsyncGenerator, Optional, Union
+from typing import AsyncGenerator, Optional, Union, List, Dict, Any

 from smallest.tts import Smallest
 from smallest.exceptions import APIError
@@ -13,7 +14,8 @@ class TextToAudioStream:
         self,
         tts_instance: Union[Smallest, AsyncSmallest],
         queue_timeout: Optional[float] = 5.0,
-        max_retries: Optional[int] = 3
+        max_retries: Optional[int] = 3,
+        verbose: bool = False
     ):
         """
         A real-time text-to-speech processor that converts streaming text into audio output.
@@ -30,8 +32,9 @@ class TextToAudioStream:

         Args:
             tts_instance: The text-to-speech engine to use (Smallest or AsyncSmallest)
-            queue_timeout: How long to wait for new text (seconds, default: …
+            queue_timeout: How long to wait for new text (seconds, default: 1.0)
             max_retries: Number of retry attempts for failed synthesis (default: 3)
+            verbose: Whether to log detailed metrics about TTS requests (default: False)
         """
         self.tts_instance = tts_instance
         self.tts_instance.opts.add_wav_header = False
@@ -41,6 +44,14 @@ class TextToAudioStream:
         self.queue = Queue()
         self.buffer_size = 250
         self.stop_flag = False
+        self.verbose = verbose
+
+        # Metrics tracking
+        self.request_count = 0
+        self.request_logs: List[Dict[str, Any]] = []
+        self.start_time = 0
+        self.first_api_response_time = None
+        self.end_time = 0

         if self.tts_instance.opts.model == 'lightning-large':
             self.buffer_size = 140
@@ -90,24 +101,76 @@ class TextToAudioStream:

     def _synthesize_sync(self, sentence: str, retries: int = 0) -> Optional[bytes]:
         """Synchronously synthesizes a given sentence."""
+        request_start_time = time.time()
+        request_id = self.request_count + 1
+
         try:
-            … (removed line not shown in the diff view)
+            audio_content = self.tts_instance.synthesize(sentence)
+            self.request_count += 1
+            request_end_time = time.time()
+
+            if self.verbose:
+                request_duration = request_end_time - request_start_time
+                if self.first_api_response_time is None:
+                    self.first_api_response_time = time.time() - self.start_time
+
+                self.request_logs.append({
+                    "id": request_id,
+                    "text": sentence,
+                    "start_time": request_start_time - self.start_time,
+                    "end_time": request_end_time - self.start_time,
+                    "duration": request_duration,
+                    "char_count": len(sentence),
+                    "retries": retries
+                })
+
+            return audio_content
         except APIError as e:
             if retries < self.max_retries:
+                if self.verbose:
+                    print(f"Retry {retries + 1}/{self.max_retries} for request: '{sentence[:30]}...'")
                 return self._synthesize_sync(sentence, retries + 1)
             else:
-                … (removed line not shown in the diff view)
+                if self.verbose:
+                    print(f"Synthesis failed for sentence: {sentence} - Error: {e}. Retries Exhausted, for more information, visit https://waves.smallest.ai/")
+                return None


     async def _synthesize_async(self, sentence: str, retries: int = 0) -> Optional[bytes]:
         """Asynchronously synthesizes a given sentence."""
+        request_start_time = time.time()
+        request_id = self.request_count + 1
+
         try:
-            … (removed line not shown in the diff view)
+            audio_content = await self.tts_instance.synthesize(sentence)
+            self.request_count += 1
+            request_end_time = time.time()
+
+            if self.verbose:
+                request_duration = request_end_time - request_start_time
+                if self.first_api_response_time is None:
+                    self.first_api_response_time = time.time() - self.start_time
+
+                self.request_logs.append({
+                    "id": request_id,
+                    "text": sentence,
+                    "start_time": request_start_time - self.start_time,
+                    "end_time": request_end_time - self.start_time,
+                    "duration": request_duration,
+                    "char_count": len(sentence),
+                    "retries": retries
+                })
+
+            return audio_content
         except APIError as e:
             if retries < self.max_retries:
+                if self.verbose:
+                    print(f"Retry {retries + 1}/{self.max_retries} for request: '{sentence[:30]}...'")
                 return await self._synthesize_async(sentence, retries + 1)
             else:
-                … (removed line not shown in the diff view)
+                if self.verbose:
+                    print(f"Synthesis failed for sentence: {sentence} - Error: {e}. Retries Exhausted, for more information, visit https://waves.smallest.ai/")
+                return None


     async def _run_synthesis(self) -> AsyncGenerator[bytes, None]:
@@ -117,7 +180,8 @@ class TextToAudioStream:
         """
         while not self.stop_flag or not self.queue.empty():
             try:
-                sentence = self.queue.…
+                sentence = self.queue.get_nowait()
+
                 if isinstance(self.tts_instance, AsyncSmallest):
                     audio_content = await self._synthesize_async(sentence)
                 else:
@@ -126,10 +190,55 @@ class TextToAudioStream:

                 if audio_content:
                     yield audio_content
+
             except Empty:
-                if …
+                # Quick check if we should exit
+                if self.stop_flag and self.queue.empty():
                     break
-                … (removed line not shown in the diff view)
+
+                # Short sleep to avoid busy-waiting
+                await asyncio.sleep(0.01)  # Much shorter sleep time (10ms)
+
+
+    def _print_verbose_summary(self) -> None:
+        """Print a summary of all metrics if verbose mode is enabled."""
+        if not self.verbose:
+            return
+
+        total_duration = self.end_time - self.start_time
+
+        print("\n" + "="*100)
+        print(f"TEXT-TO-AUDIO STREAM METRICS")
+        print("="*100)
+
+        print(f"\nOVERALL STATISTICS:")
+        print(f"  Total requests made: {self.request_count}")
+        print(f"  Time to first API response: {self.first_api_response_time:.3f}s")
+        print(f"  Total processing time: {total_duration:.3f}s")
+
+        # Print table header
+        print("\nREQUEST DETAILS:")
+        header = f"{'#':4} {'Start (s)':10} {'End (s)':10} {'Duration (s)':12} {'Characters':15} {'Text'}"
+        print("\n" + header)
+        print("-" * 100)
+
+        # Print table rows
+        for log in self.request_logs:
+            row = (
+                f"{log['id']:4} "
+                f"{log['start_time']:10.3f} "
+                f"{log['end_time']:10.3f} "
+                f"{log['duration']:12.3f} "
+                f"{log['char_count']:15} "
+                f"{log['text'][:50]}{'...' if len(log['text']) > 50 else ''}"
+            )
+            print(row)
+
+            # Print retry information if any
+            if log['retries'] > 0:
+                print(f"{'':4} {'':10} {'':10} {'':12} {'':15} Retries: {log['retries']}")
+
+        print("\n" + "="*100)


     async def process(self, llm_output: AsyncGenerator[str, None]) -> AsyncGenerator[bytes, None]:
@@ -149,13 +258,15 @@ class TextToAudioStream:
             - Streamed over a network
             - Further processed as needed
         """
-        … (removed line not shown in the diff view)
+        self.start_time = time.time()
+
+        llm_thread = Thread(target=asyncio.run, args=(self._stream_llm_output(llm_output),))
+        llm_thread.start()

-        … (removed line not shown in the diff view)
-        … (removed line not shown in the diff view)
-                yield audio_content
-        except Exception as e:
-            raise APIError(f"Error during synthesis processing: {e}")
+        async for audio_content in self._run_synthesis():
+            yield audio_content

-        … (removed line not shown in the diff view)
-        … (removed line not shown in the diff view)
+        llm_thread.join()
+
+        self.end_time = time.time()
+        self._print_verbose_summary()
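With the verbose flag and timing fields added above, TextToAudioStream.process() now records per-request metrics and prints the summary table from _print_verbose_summary() once the stream is exhausted. A hedged sketch of driving it; the fake LLM generator, import paths, and the choice to buffer audio in memory are illustrative, not taken from this diff:

```python
import asyncio
from smallest import Smallest  # import path assumed
from smallest.stream_tts import TextToAudioStream


async def fake_llm_output():
    # Stand-in for a streaming LLM response; yields text fragments.
    for piece in ["Version 2.2.0 adds streaming synthesis, ",
                  "per-request metrics, ",
                  "and new lightning-large controls."]:
        yield piece


async def main():
    tts = Smallest()  # defaults; reads SMALLEST_API_KEY from the environment
    processor = TextToAudioStream(tts_instance=tts, verbose=True)

    audio = bytearray()
    async for chunk in processor.process(fake_llm_output()):
        audio.extend(chunk)  # raw audio, since the processor forces add_wav_header=False

    # With verbose=True, the metrics table is printed after the stream completes.


asyncio.run(main())
```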
{smallestai-2.1.0 → smallestai-2.2.0}/smallest/tts.py

@@ -3,7 +3,7 @@ import json
 import wave
 import copy
 import requests
-from typing import Optional, Union, List
+from typing import Optional, Union, List, Iterator

 from smallest.exceptions import TTSError, APIError
 from smallest.utils import (TTSOptions, validate_input, preprocess_text, add_wav_header, chunk_text,
@@ -17,6 +17,9 @@ class Smallest:
         sample_rate: Optional[int] = 24000,
         voice_id: Optional[str] = "emily",
         speed: Optional[float] = 1.0,
+        consistency: Optional[float] = 0.5,
+        similarity: Optional[float] = 0.0,
+        enhancement: Optional[int] = 1,
         add_wav_header: Optional[bool] = True
     ) -> None:
         """
@@ -31,6 +34,9 @@ class Smallest:
         - sample_rate (int): The sample rate for the audio output.
         - voice_id (TTSVoices): The voice to be used for synthesis.
         - speed (float): The speed of the speech synthesis.
+        - consistency (float): This parameter controls word repetition and skipping. Decrease it to prevent skipped words, and increase it to prevent repetition. Only supported in `lightning-large` model. Range - [0, 1]
+        - similarity (float): This parameter controls the similarity between the synthesized audio and the reference audio. Increase it to make the speech more similar to the reference audio. Only supported in `lightning-large` model. Range - [0, 1]
+        - enhancement (int): Enhances speech quality at the cost of increased latency. Only supported in `lightning-large` model. Range - [0, 2].
         - add_wav_header (bool): Whether to add a WAV header to the output audio.

         Methods:
@@ -42,7 +48,7 @@ class Smallest:
         self.api_key = api_key or os.environ.get("SMALLEST_API_KEY")
         if not self.api_key:
             raise TTSError()
-        if model == "lightning-large":
+        if model == "lightning-large" and voice_id is None:
             voice_id = "lakshya"

         self.chunk_size = 250
@@ -53,7 +59,10 @@ class Smallest:
             voice_id=voice_id,
             api_key=self.api_key,
             add_wav_header=add_wav_header,
-            speed=speed
+            speed=speed,
+            consistency=consistency,
+            similarity=similarity,
+            enhancement=enhancement
         )


@@ -98,27 +107,24 @@ class Smallest:
     def synthesize(
         self,
         text: str,
-        … (removed line not shown in the diff view)
-        similarity: Optional[float] = 0,
-        enhancement: Optional[bool] = False,
+        stream: Optional[bool] = False,
         save_as: Optional[str] = None,
         **kwargs
-    ) -> Union[bytes, None]:
+    ) -> Union[bytes, None, Iterator[bytes]]:
         """
         Synthesize speech from the provided text.

-        Args:
         - text (str): The text to be converted to speech.
-        - …
+        - stream (Optional[bool]): If True, returns an iterator yielding audio chunks instead of a full byte array.
+        - save_as (Optional[str]): If provided, the synthesized audio will be saved to this file path.
             The file must have a .wav extension.
-        - consistency (Optional[float]): This parameter controls word repetition and skipping. Decrease it to prevent skipped words, and increase it to prevent repetition. Only supported in `lightning-large` model.
-        - similarity (Optional[float]): This parameter controls the similarity between the synthesized audio and the reference audio. Increase it to make the speech more similar to the reference audio. Only supported in `lightning-large` model.
-        - enhancement (Optional[bool]): Enhances speech quality at the cost of increased latency. Only supported in `lightning-large` model.
         - kwargs: Additional optional parameters to override `__init__` options for this call.

         Returns:
-        - Union[bytes, None]:
-        … (removed line not shown in the diff view)
+        - Union[bytes, None, Iterator[bytes]]:
+            - If `stream=True`, returns an iterator yielding audio chunks.
+            - If `save_as` is provided, saves the file and returns None.
+            - Otherwise, returns the synthesized audio content as bytes.

         Raises:
         - TTSError: If the provided file name does not have a .wav extension when `save_as` is specified.
@@ -134,42 +140,48 @@ class Smallest:
         for key, value in kwargs.items():
             setattr(opts, key, value)

-        … (removed line not shown in the diff view)
+        text = preprocess_text(text)
+        validate_input(text, opts.model, opts.sample_rate, opts.speed, opts.consistency, opts.similarity, opts.enhancement)

         self.chunk_size = 250
         if opts.model == "lightning-large":
             self.chunk_size = 140

         chunks = chunk_text(text, self.chunk_size)
-        … (27 removed lines not shown in the diff view)
+
+        def audio_stream():
+            for chunk in chunks:
+                payload = {
+                    "text": chunk,
+                    "sample_rate": opts.sample_rate,
+                    "voice_id": opts.voice_id,
+                    "add_wav_header": False,
+                    "speed": opts.speed,
+                }
+
+                if opts.model == "lightning-large":
+                    if opts.consistency is not None:
+                        payload["consistency"] = opts.consistency
+                    if opts.similarity is not None:
+                        payload["similarity"] = opts.similarity
+                    if opts.enhancement is not None:
+                        payload["enhancement"] = opts.enhancement
+
+                headers = {
+                    "Authorization": f"Bearer {self.api_key}",
+                    "Content-Type": "application/json",
+                }
+
+                res = requests.post(f"{API_BASE_URL}/{opts.model}/get_speech", json=payload, headers=headers)
+                if res.status_code != 200:
+                    raise APIError(f"Failed to synthesize speech: {res.text}. Please check if you have set the correct API key. For more information, visit https://waves.smallest.ai/")
+
+                yield res.content

-        … (removed line not shown in the diff view)
+        if stream:
+            return audio_stream()
+
+        audio_content = b"".join(audio_stream())

         if save_as:
             if not save_as.endswith(".wav"):
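The synchronous client mirrors the async changes: synthesize() can return bytes, save a .wav via save_as, or, with stream=True, return a plain Iterator[bytes]. Note that the new `voice_id is None` guard only picks the lightning-large default voice when voice_id is passed explicitly as None, since the constructor default is still "emily". A short sketch based on the signatures above; the import path and file names are assumptions:

```python
from smallest import Smallest  # import path assumed

client = Smallest(
    model="lightning-large",
    voice_id=None,        # None triggers the new lightning-large default ("lakshya")
    consistency=0.5,      # range [0, 1]
    similarity=0.0,       # range [0, 1]
    enhancement=1,        # range [0, 2]
)  # API key read from SMALLEST_API_KEY if not passed

# One-shot synthesis written to a WAV file (returns None).
client.synthesize("Hello from smallestai 2.2.0.", save_as="hello.wav")

# Streaming synthesis: each yielded chunk is the raw audio for one text chunk.
with open("hello.pcm", "wb") as f:
    for chunk in client.synthesize("A longer passage that gets chunked before synthesis.", stream=True):
        f.write(chunk)
```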
{smallestai-2.1.0 → smallestai-2.2.0}/smallest/utils.py

@@ -4,7 +4,6 @@ from typing import List
 from typing import Optional
 from pydub import AudioSegment
 from dataclasses import dataclass
-from sacremoses import MosesPunctNormalizer

 from smallest.exceptions import ValidationError
 from smallest.models import TTSModels, TTSLanguages
@@ -12,7 +11,6 @@ from smallest.models import TTSModels, TTSLanguages

 API_BASE_URL = "https://waves-api.smallest.ai/api/v1"
 SENTENCE_END_REGEX = re.compile(r'.*[-.—!?,;:…।|]$')
-mpn = MosesPunctNormalizer()
 SAMPLE_WIDTH = 2
 CHANNELS = 1
 ALLOWED_AUDIO_EXTENSIONS = ['.mp3', '.wav']
@@ -26,9 +24,12 @@ class TTSOptions:
     api_key: str
     add_wav_header: bool
     speed: float
+    consistency: float
+    similarity: float
+    enhancement: int


-def validate_input(text: str, model: str, sample_rate: int, speed: float, consistency: Optional[float] = None, similarity: Optional[float] = None, enhancement: Optional[…
+def validate_input(text: str, model: str, sample_rate: int, speed: float, consistency: Optional[float] = None, similarity: Optional[float] = None, enhancement: Optional[int] = None):
     if not text:
         raise ValidationError("Text cannot be empty.")
     if model not in TTSModels:
@@ -41,8 +42,8 @@ def validate_input(text: str, model: str, sample_rate: int, speed: float, consis
         raise ValidationError(f"Invalid consistency: {consistency}. Must be between 0.0 and 1.0")
     if similarity is not None and not 0.0 <= similarity <= 1.0:
         raise ValidationError(f"Invalid similarity: {similarity}. Must be between 0.0 and 1.0")
-    if enhancement is not None and not …
-        raise ValidationError(f"Invalid enhancement: {enhancement}. Must be …
+    if enhancement is not None and not 0 <= enhancement <= 2:
+        raise ValidationError(f"Invalid enhancement: {enhancement}. Must be between 0 and 2.")


 def add_wav_header(frame_input: bytes, sample_rate: int = 24000, sample_width: int = 2, channels: int = 1) -> bytes:
@@ -56,7 +57,6 @@ def add_wav_header(frame_input: bytes, sample_rate: int = 24000, sample_width: i
 def preprocess_text(text: str) -> str:
     text = text.replace("\n", " ").replace("\t", " ").replace("—", " ").replace("-", " ").replace("–", " ")
     text = re.sub(r'\s+', ' ', text)
-    text = mpn.normalize(text)
     return text.strip()

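The widened validate_input() signature above pins down the accepted ranges for the new options: consistency and similarity in [0, 1], enhancement in [0, 2]. A small illustrative check; it assumes "lightning-large" is among the models listed in TTSModels and that the default sample-rate and speed checks pass:

```python
from smallest.utils import validate_input
from smallest.exceptions import ValidationError

# Passes: all values inside the documented ranges.
validate_input(
    text="Hello",
    model="lightning-large",
    sample_rate=24000,
    speed=1.0,
    consistency=0.7,
    similarity=0.3,
    enhancement=2,
)

# Raises: enhancement outside [0, 2].
try:
    validate_input("Hello", "lightning-large", 24000, 1.0, enhancement=3)
except ValidationError as e:
    print(e)  # Invalid enhancement: 3. Must be between 0 and 2.
```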
{smallestai-2.1.0 → smallestai-2.2.0}/smallestai.egg-info/PKG-INFO

@@ -1,6 +1,6 @@
 Metadata-Version: 2.2
 Name: smallestai
-Version: 2.1.0
+Version: 2.2.0
 Summary: Official Python client for the Smallest AI API
 Author-email: Smallest <support@smallest.ai>
 License: MIT
@@ -15,7 +15,6 @@ License-File: LICENSE
 Requires-Dist: aiohttp
 Requires-Dist: aiofiles
 Requires-Dist: requests
-Requires-Dist: sacremoses
 Requires-Dist: pydub
 Provides-Extra: test
 Requires-Dist: jiwer; extra == "test"