smallestai 1.3.3__tar.gz → 1.3.4__tar.gz
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Potentially problematic release.
This version of smallestai might be problematic. Click here for more details.
- {smallestai-1.3.3 → smallestai-1.3.4}/PKG-INFO +15 -5
- {smallestai-1.3.3 → smallestai-1.3.4}/README.md +13 -3
- {smallestai-1.3.3 → smallestai-1.3.4}/pyproject.toml +2 -2
- {smallestai-1.3.3 → smallestai-1.3.4}/smallest/async_tts.py +18 -17
- smallestai-1.3.4/smallest/models.py +23 -0
- {smallestai-1.3.3 → smallestai-1.3.4}/smallest/stream_tts.py +4 -4
- {smallestai-1.3.3 → smallestai-1.3.4}/smallest/tts.py +5 -5
- smallestai-1.3.4/smallest/utils.py +109 -0
- {smallestai-1.3.3 → smallestai-1.3.4}/smallestai.egg-info/PKG-INFO +15 -5
- {smallestai-1.3.3 → smallestai-1.3.4}/tests/test_async.py +6 -4
- {smallestai-1.3.3 → smallestai-1.3.4}/tests/test_sync.py +3 -3
- smallestai-1.3.4/tests/test_utils.py +43 -0
- smallestai-1.3.3/smallest/models.py +0 -7
- smallestai-1.3.3/smallest/utils.py +0 -115
- smallestai-1.3.3/tests/test_utils.py +0 -40
- {smallestai-1.3.3 → smallestai-1.3.4}/LICENSE +0 -0
- {smallestai-1.3.3 → smallestai-1.3.4}/setup.cfg +0 -0
- {smallestai-1.3.3 → smallestai-1.3.4}/smallest/__init__.py +0 -0
- {smallestai-1.3.3 → smallestai-1.3.4}/smallest/exceptions.py +0 -0
- {smallestai-1.3.3 → smallestai-1.3.4}/smallestai.egg-info/SOURCES.txt +0 -0
- {smallestai-1.3.3 → smallestai-1.3.4}/smallestai.egg-info/dependency_links.txt +0 -0
- {smallestai-1.3.3 → smallestai-1.3.4}/smallestai.egg-info/requires.txt +0 -0
- {smallestai-1.3.3 → smallestai-1.3.4}/smallestai.egg-info/top_level.txt +0 -0
|
@@ -1,8 +1,8 @@
|
|
|
1
1
|
Metadata-Version: 2.1
|
|
2
2
|
Name: smallestai
|
|
3
|
-
Version: 1.3.
|
|
3
|
+
Version: 1.3.4
|
|
4
4
|
Summary: Official Python client for the Smallest AI API
|
|
5
|
-
Author-email: Smallest <
|
|
5
|
+
Author-email: Smallest <support@smallest.ai>
|
|
6
6
|
License: MIT
|
|
7
7
|
Project-URL: Homepage, https://github.com/smallest-inc/smallest-python-sdk
|
|
8
8
|
Keywords: smallest,smallest.ai,tts,text-to-speech
|
|
@@ -53,6 +53,7 @@ Currently, the library supports direct synthesis and the ability to synthesize s
|
|
|
53
53
|
|
|
54
54
|
- [Installation](#installation)
|
|
55
55
|
- [Get the API Key](#get-the-api-key)
|
|
56
|
+
- [Best Practices for Input Text](#best-practices-for-input-text)
|
|
56
57
|
- [Examples](#examples)
|
|
57
58
|
- [Sync](#sync)
|
|
58
59
|
- [Async](#async)
|
|
@@ -76,6 +77,15 @@ When using an SDK in your application, make sure to pin to at least the major ve
|
|
|
76
77
|
3. Create a new API Key and copy it.
|
|
77
78
|
4. Export the API Key in your environment with the name `SMALLEST_API_KEY`, ensuring that your application can access it securely for authentication.
|
|
78
79
|
|
|
80
|
+
## Best Practices for Input Text
|
|
81
|
+
While the `transliterate` parameter is provided, please note that it is not fully supported and may not perform consistently across all cases. It is recommended to use the model without relying on this parameter.
|
|
82
|
+
|
|
83
|
+
For optimal voice generation results:
|
|
84
|
+
|
|
85
|
+
1. For English, provide the input in Latin script (e.g., "Hello, how are you?").
|
|
86
|
+
2. For Hindi, provide the input in Devanagari script (e.g., "नमस्ते, आप कैसे हैं?").
|
|
87
|
+
3. For code-mixed input, use Latin script for English and Devanagari script for Hindi (e.g., "Hello, आप कैसे हैं?").
|
|
88
|
+
|
|
79
89
|
## Examples
|
|
80
90
|
|
|
81
91
|
### Sync
|
|
@@ -162,7 +172,7 @@ audio_bytes = await tts.synthesize(
|
|
|
162
172
|
|
|
163
173
|
### LLM to Speech
|
|
164
174
|
|
|
165
|
-
The `TextToAudioStream` class provides real-time text-to-speech processing, converting streaming text into audio output
|
|
175
|
+
The `TextToAudioStream` class provides real-time text-to-speech processing, converting streaming text into audio output. It's particularly useful for applications like voice assistants, live captioning, or interactive chatbots that require immediate audio feedback from text generation. Supports both synchronous and asynchronous TTS instance.
|
|
166
176
|
|
|
167
177
|
```python
|
|
168
178
|
import os
|
|
@@ -236,7 +246,7 @@ The processor yields raw audio data chunks without WAV headers for streaming eff
|
|
|
236
246
|
```python
|
|
237
247
|
from smallest.tts import Smallest
|
|
238
248
|
|
|
239
|
-
client = Smallest()
|
|
249
|
+
client = Smallest(api_key=os.environ.get("SMALLEST_API_KEY"))
|
|
240
250
|
|
|
241
251
|
print(f"Avalaible Languages: {client.get_languages()}")
|
|
242
252
|
print(f"Available Voices: {client.get_voices()}")
|
|
@@ -254,7 +264,7 @@ When implementing audio streaming with chunks of synthesized speech, WAV headers
|
|
|
254
264
|
- Sequential playback of chunks with headers causes audio artifacts (pop sounds) when concatenating or playing audio sequentially.
|
|
255
265
|
- Audio players would try to reinitialize audio settings for each chunk.
|
|
256
266
|
|
|
257
|
-
### Best Practices
|
|
267
|
+
### Best Practices for Audio Streaming
|
|
258
268
|
1. Stream raw PCM audio data without headers
|
|
259
269
|
2. Add a single WAV header only when:
|
|
260
270
|
- Saving the complete stream to a file
|
|
@@ -26,6 +26,7 @@ Currently, the library supports direct synthesis and the ability to synthesize s
|
|
|
26
26
|
|
|
27
27
|
- [Installation](#installation)
|
|
28
28
|
- [Get the API Key](#get-the-api-key)
|
|
29
|
+
- [Best Practices for Input Text](#best-practices-for-input-text)
|
|
29
30
|
- [Examples](#examples)
|
|
30
31
|
- [Sync](#sync)
|
|
31
32
|
- [Async](#async)
|
|
@@ -49,6 +50,15 @@ When using an SDK in your application, make sure to pin to at least the major ve
|
|
|
49
50
|
3. Create a new API Key and copy it.
|
|
50
51
|
4. Export the API Key in your environment with the name `SMALLEST_API_KEY`, ensuring that your application can access it securely for authentication.
|
|
51
52
|
|
|
53
|
+
## Best Practices for Input Text
|
|
54
|
+
While the `transliterate` parameter is provided, please note that it is not fully supported and may not perform consistently across all cases. It is recommended to use the model without relying on this parameter.
|
|
55
|
+
|
|
56
|
+
For optimal voice generation results:
|
|
57
|
+
|
|
58
|
+
1. For English, provide the input in Latin script (e.g., "Hello, how are you?").
|
|
59
|
+
2. For Hindi, provide the input in Devanagari script (e.g., "नमस्ते, आप कैसे हैं?").
|
|
60
|
+
3. For code-mixed input, use Latin script for English and Devanagari script for Hindi (e.g., "Hello, आप कैसे हैं?").
|
|
61
|
+
|
|
52
62
|
## Examples
|
|
53
63
|
|
|
54
64
|
### Sync
|
|
@@ -135,7 +145,7 @@ audio_bytes = await tts.synthesize(
|
|
|
135
145
|
|
|
136
146
|
### LLM to Speech
|
|
137
147
|
|
|
138
|
-
The `TextToAudioStream` class provides real-time text-to-speech processing, converting streaming text into audio output
|
|
148
|
+
The `TextToAudioStream` class provides real-time text-to-speech processing, converting streaming text into audio output. It's particularly useful for applications like voice assistants, live captioning, or interactive chatbots that require immediate audio feedback from text generation. Supports both synchronous and asynchronous TTS instance.
|
|
139
149
|
|
|
140
150
|
```python
|
|
141
151
|
import os
|
|
@@ -209,7 +219,7 @@ The processor yields raw audio data chunks without WAV headers for streaming eff
|
|
|
209
219
|
```python
|
|
210
220
|
from smallest.tts import Smallest
|
|
211
221
|
|
|
212
|
-
client = Smallest()
|
|
222
|
+
client = Smallest(api_key=os.environ.get("SMALLEST_API_KEY"))
|
|
213
223
|
|
|
214
224
|
print(f"Avalaible Languages: {client.get_languages()}")
|
|
215
225
|
print(f"Available Voices: {client.get_voices()}")
|
|
@@ -227,7 +237,7 @@ When implementing audio streaming with chunks of synthesized speech, WAV headers
|
|
|
227
237
|
- Sequential playback of chunks with headers causes audio artifacts (pop sounds) when concatenating or playing audio sequentially.
|
|
228
238
|
- Audio players would try to reinitialize audio settings for each chunk.
|
|
229
239
|
|
|
230
|
-
### Best Practices
|
|
240
|
+
### Best Practices for Audio Streaming
|
|
231
241
|
1. Stream raw PCM audio data without headers
|
|
232
242
|
2. Add a single WAV header only when:
|
|
233
243
|
- Saving the complete stream to a file
|
|
@@ -1,9 +1,9 @@
|
|
|
1
1
|
[project]
|
|
2
2
|
name = "smallestai"
|
|
3
|
-
version = "1.3.
|
|
3
|
+
version = "1.3.4"
|
|
4
4
|
description = "Official Python client for the Smallest AI API"
|
|
5
5
|
authors = [
|
|
6
|
-
{name = "Smallest", email = "
|
|
6
|
+
{name = "Smallest", email = "support@smallest.ai"},
|
|
7
7
|
]
|
|
8
8
|
readme = "README.md"
|
|
9
9
|
license = {text = "MIT"}
|
|
@@ -4,16 +4,16 @@ import aiohttp
|
|
|
4
4
|
import aiofiles
|
|
5
5
|
from typing import Optional, Union, List
|
|
6
6
|
|
|
7
|
-
from .models import TTSModels, TTSVoices
|
|
8
|
-
from .exceptions import TTSError, APIError
|
|
9
|
-
from .utils import (TTSOptions, validate_input, preprocess_text, add_wav_header, split_into_chunks,
|
|
7
|
+
from smallest.models import TTSModels, TTSVoices
|
|
8
|
+
from smallest.exceptions import TTSError, APIError
|
|
9
|
+
from smallest.utils import (TTSOptions, validate_input, preprocess_text, add_wav_header, split_into_chunks,
|
|
10
10
|
get_smallest_languages, get_smallest_voices, get_smallest_models, SENTENCE_END_REGEX, API_BASE_URL)
|
|
11
11
|
|
|
12
12
|
|
|
13
13
|
class AsyncSmallest:
|
|
14
14
|
def __init__(
|
|
15
15
|
self,
|
|
16
|
-
api_key:
|
|
16
|
+
api_key: str = None,
|
|
17
17
|
model: TTSModels = "lightning",
|
|
18
18
|
sample_rate: int = 24000,
|
|
19
19
|
voice: TTSVoices = "emily",
|
|
@@ -25,8 +25,8 @@ class AsyncSmallest:
|
|
|
25
25
|
"""
|
|
26
26
|
AsyncSmallest Instance for asynchronous text-to-speech synthesis.
|
|
27
27
|
|
|
28
|
-
This class provides an asynchronous implementation of the text-to-speech functionality.
|
|
29
|
-
It allows for non-blocking synthesis of speech from text, making it suitable for applications
|
|
28
|
+
This class provides an asynchronous implementation of the text-to-speech functionality.
|
|
29
|
+
It allows for non-blocking synthesis of speech from text, making it suitable for applications
|
|
30
30
|
that require async processing.
|
|
31
31
|
|
|
32
32
|
Args:
|
|
@@ -49,7 +49,7 @@ class AsyncSmallest:
|
|
|
49
49
|
if not self.api_key:
|
|
50
50
|
raise TTSError()
|
|
51
51
|
self.chunk_size = 250
|
|
52
|
-
|
|
52
|
+
|
|
53
53
|
self.opts = TTSOptions(
|
|
54
54
|
model=model,
|
|
55
55
|
sample_rate=sample_rate,
|
|
@@ -61,7 +61,7 @@ class AsyncSmallest:
|
|
|
61
61
|
remove_extra_silence=remove_extra_silence,
|
|
62
62
|
)
|
|
63
63
|
self.session = None
|
|
64
|
-
|
|
64
|
+
|
|
65
65
|
async def __aenter__(self):
|
|
66
66
|
if self.session is None:
|
|
67
67
|
self.session = aiohttp.ClientSession()
|
|
@@ -75,7 +75,7 @@ class AsyncSmallest:
|
|
|
75
75
|
def get_languages(self) -> List[str]:
|
|
76
76
|
"""Returns a list of available languages."""
|
|
77
77
|
return get_smallest_languages()
|
|
78
|
-
|
|
78
|
+
|
|
79
79
|
def get_voices(self) -> List[str]:
|
|
80
80
|
"""Returns a list of available voices."""
|
|
81
81
|
return get_smallest_voices()
|
|
@@ -83,7 +83,7 @@ class AsyncSmallest:
|
|
|
83
83
|
def get_models(self) -> List[str]:
|
|
84
84
|
"""Returns a list of available models."""
|
|
85
85
|
return get_smallest_models()
|
|
86
|
-
|
|
86
|
+
|
|
87
87
|
async def synthesize(
|
|
88
88
|
self,
|
|
89
89
|
text: str,
|
|
@@ -95,12 +95,12 @@ class AsyncSmallest:
|
|
|
95
95
|
|
|
96
96
|
Args:
|
|
97
97
|
- text (str): The text to be converted to speech.
|
|
98
|
-
- save_as (Optional[str]): If provided, the synthesized audio will be saved to this file path.
|
|
98
|
+
- save_as (Optional[str]): If provided, the synthesized audio will be saved to this file path.
|
|
99
99
|
The file must have a .wav extension.
|
|
100
100
|
- kwargs: Additional optional parameters to override `__init__` options for this call.
|
|
101
101
|
|
|
102
102
|
Returns:
|
|
103
|
-
- Union[bytes, None]: The synthesized audio content in bytes if `save_as` is not specified;
|
|
103
|
+
- Union[bytes, None]: The synthesized audio content in bytes if `save_as` is not specified;
|
|
104
104
|
otherwise, returns None after saving the audio to the specified file.
|
|
105
105
|
|
|
106
106
|
Raises:
|
|
@@ -111,7 +111,8 @@ class AsyncSmallest:
|
|
|
111
111
|
for key, value in kwargs.items():
|
|
112
112
|
setattr(opts, key, value)
|
|
113
113
|
|
|
114
|
-
validate_input(text, opts.voice, opts.model, opts.sample_rate, opts.speed)
|
|
114
|
+
validate_input(preprocess_text(text), opts.voice, opts.model, opts.sample_rate, opts.speed)
|
|
115
|
+
|
|
115
116
|
chunks = split_into_chunks(text)
|
|
116
117
|
audio_content = b""
|
|
117
118
|
|
|
@@ -134,17 +135,17 @@ class AsyncSmallest:
|
|
|
134
135
|
|
|
135
136
|
if not self.session:
|
|
136
137
|
self.session = aiohttp.ClientSession()
|
|
137
|
-
|
|
138
|
+
|
|
138
139
|
async with self.session.post(f"{API_BASE_URL}/{opts.model}/get_speech", json=payload, headers=headers) as res:
|
|
139
140
|
if res.status != 200:
|
|
140
141
|
raise APIError(f"Failed to synthesize speech: {await res.text()}. For more information, visit https://waves.smallest.ai/")
|
|
141
|
-
|
|
142
|
+
|
|
142
143
|
audio_content += await res.read()
|
|
143
144
|
|
|
144
145
|
if save_as:
|
|
145
146
|
if not save_as.endswith(".wav"):
|
|
146
147
|
raise TTSError("Invalid file name. Extension must be .wav")
|
|
147
|
-
|
|
148
|
+
|
|
148
149
|
async with aiofiles.open(save_as, mode='wb') as f:
|
|
149
150
|
await f.write(add_wav_header(audio_content, self.opts.sample_rate))
|
|
150
151
|
|
|
@@ -152,5 +153,5 @@ class AsyncSmallest:
|
|
|
152
153
|
|
|
153
154
|
if opts.add_wav_header:
|
|
154
155
|
return add_wav_header(audio_content, self.opts.sample_rate)
|
|
155
|
-
|
|
156
|
+
|
|
156
157
|
return audio_content
|
|
@@ -0,0 +1,23 @@
|
|
|
1
|
+
from typing import Literal, List, Tuple, cast
|
|
2
|
+
import aiohttp
|
|
3
|
+
import asyncio
|
|
4
|
+
|
|
5
|
+
API_BASE_URL = "https://waves-api.smallest.ai/api/v1"
|
|
6
|
+
|
|
7
|
+
async def _fetch_voice_and_model() -> Tuple[List[str], List[str]]:
|
|
8
|
+
async with aiohttp.ClientSession() as session:
|
|
9
|
+
async with session.get(f"{API_BASE_URL}/voice/get-all-models") as response:
|
|
10
|
+
api_response = await response.json()
|
|
11
|
+
|
|
12
|
+
voices = []
|
|
13
|
+
for model in api_response:
|
|
14
|
+
for voice in model['voiceIds']:
|
|
15
|
+
voices.append(voice['voiceId'])
|
|
16
|
+
models = [model['modelName'] for model in api_response]
|
|
17
|
+
return models, voices
|
|
18
|
+
|
|
19
|
+
models, voices = asyncio.run(_fetch_voice_and_model())
|
|
20
|
+
|
|
21
|
+
TTSLanguages = ["en", "hi"]
|
|
22
|
+
TTSModels = models
|
|
23
|
+
TTSVoices = voices
|
|
@@ -3,10 +3,10 @@ from threading import Thread
|
|
|
3
3
|
from queue import Queue, Empty
|
|
4
4
|
from typing import AsyncGenerator, Optional, Union
|
|
5
5
|
|
|
6
|
-
from .tts import Smallest
|
|
7
|
-
from .exceptions import APIError
|
|
8
|
-
from .async_tts import AsyncSmallest
|
|
9
|
-
from .utils import SENTENCE_END_REGEX
|
|
6
|
+
from smallest.tts import Smallest
|
|
7
|
+
from smallest.exceptions import APIError
|
|
8
|
+
from smallest.async_tts import AsyncSmallest
|
|
9
|
+
from smallest.utils import SENTENCE_END_REGEX
|
|
10
10
|
|
|
11
11
|
class TextToAudioStream:
|
|
12
12
|
def __init__(
|
|
@@ -4,15 +4,15 @@ import copy
|
|
|
4
4
|
import requests
|
|
5
5
|
from typing import Optional, Union, List
|
|
6
6
|
|
|
7
|
-
from .models import TTSModels, TTSVoices
|
|
8
|
-
from .exceptions import TTSError, APIError
|
|
9
|
-
from .utils import (TTSOptions, validate_input, preprocess_text, add_wav_header, split_into_chunks,
|
|
7
|
+
from smallest.models import TTSModels, TTSVoices
|
|
8
|
+
from smallest.exceptions import TTSError, APIError
|
|
9
|
+
from smallest.utils import (TTSOptions, validate_input, preprocess_text, add_wav_header, split_into_chunks,
|
|
10
10
|
get_smallest_languages, get_smallest_voices, get_smallest_models, SENTENCE_END_REGEX, API_BASE_URL)
|
|
11
11
|
|
|
12
12
|
class Smallest:
|
|
13
13
|
def __init__(
|
|
14
14
|
self,
|
|
15
|
-
api_key:
|
|
15
|
+
api_key: str = None,
|
|
16
16
|
model: TTSModels = "lightning",
|
|
17
17
|
sample_rate: int = 24000,
|
|
18
18
|
voice: TTSVoices = "emily",
|
|
@@ -100,7 +100,7 @@ class Smallest:
|
|
|
100
100
|
for key, value in kwargs.items():
|
|
101
101
|
setattr(opts, key, value)
|
|
102
102
|
|
|
103
|
-
validate_input(text, opts.voice, opts.model, opts.sample_rate, opts.speed)
|
|
103
|
+
validate_input(preprocess_text(text), opts.voice, opts.model, opts.sample_rate, opts.speed)
|
|
104
104
|
|
|
105
105
|
chunks = split_into_chunks(text)
|
|
106
106
|
audio_content = b""
|
|
@@ -0,0 +1,109 @@
|
|
|
1
|
+
import re
|
|
2
|
+
import io
|
|
3
|
+
import unicodedata
|
|
4
|
+
from typing import List
|
|
5
|
+
from pydub import AudioSegment
|
|
6
|
+
from dataclasses import dataclass
|
|
7
|
+
from sacremoses import MosesPunctNormalizer
|
|
8
|
+
|
|
9
|
+
from smallest.exceptions import ValidationError
|
|
10
|
+
from smallest.models import TTSModels, TTSLanguages, TTSVoices
|
|
11
|
+
|
|
12
|
+
|
|
13
|
+
API_BASE_URL = "https://waves-api.smallest.ai/api/v1"
|
|
14
|
+
SENTENCE_END_REGEX = re.compile(r'.*[-.—!?;:…\n]$')
|
|
15
|
+
CHUNK_SIZE = 250
|
|
16
|
+
SAMPLE_WIDTH = 2
|
|
17
|
+
CHANNELS = 1
|
|
18
|
+
|
|
19
|
+
|
|
20
|
+
@dataclass
|
|
21
|
+
class TTSOptions:
|
|
22
|
+
model: TTSModels
|
|
23
|
+
sample_rate: int
|
|
24
|
+
voice: TTSVoices
|
|
25
|
+
api_key: str
|
|
26
|
+
add_wav_header: bool
|
|
27
|
+
speed: float
|
|
28
|
+
transliterate: bool
|
|
29
|
+
remove_extra_silence: bool
|
|
30
|
+
|
|
31
|
+
|
|
32
|
+
def validate_input(text: str, voice: TTSVoices, model: TTSModels, sample_rate: int, speed: float):
|
|
33
|
+
if not text:
|
|
34
|
+
raise ValidationError("Text cannot be empty")
|
|
35
|
+
if voice not in TTSVoices:
|
|
36
|
+
raise ValidationError(f"Invalid voice: {voice}")
|
|
37
|
+
if model not in TTSModels:
|
|
38
|
+
raise ValidationError(f"Invalid model: {model}")
|
|
39
|
+
if not 8000 <= sample_rate <= 24000:
|
|
40
|
+
raise ValidationError(f"Invalid sample rate: {sample_rate}. Must be between 8000 and 24000")
|
|
41
|
+
if not 0.5 <= speed <= 2.0:
|
|
42
|
+
raise ValidationError(f"Invalid speed: {speed}. Must be between 0.5 and 2.0")
|
|
43
|
+
|
|
44
|
+
|
|
45
|
+
def add_wav_header(frame_input: bytes, sample_rate: int = 24000, sample_width: int = 2, channels: int = 1) -> bytes:
|
|
46
|
+
audio = AudioSegment(data=frame_input, sample_width=sample_width, frame_rate=sample_rate, channels=channels)
|
|
47
|
+
wav_buf = io.BytesIO()
|
|
48
|
+
audio.export(wav_buf, format="wav")
|
|
49
|
+
wav_buf.seek(0)
|
|
50
|
+
return wav_buf.read()
|
|
51
|
+
|
|
52
|
+
|
|
53
|
+
def preprocess_text(text: str) -> str:
|
|
54
|
+
text = text.replace("\n", " ").replace("\t", " ").replace("—", " ")
|
|
55
|
+
text = re.sub(r'\s+', ' ', text)
|
|
56
|
+
mpn = MosesPunctNormalizer()
|
|
57
|
+
text = mpn.normalize(text)
|
|
58
|
+
return text.strip()
|
|
59
|
+
|
|
60
|
+
|
|
61
|
+
def split_into_chunks(text: str) -> List[str]:
|
|
62
|
+
"""
|
|
63
|
+
Splits the input text into chunks based on sentence boundaries
|
|
64
|
+
defined by SENTENCE_END_REGEX and the maximum chunk size.
|
|
65
|
+
Only splits at valid sentence boundaries to avoid breaking words.
|
|
66
|
+
"""
|
|
67
|
+
chunks = []
|
|
68
|
+
while text:
|
|
69
|
+
# If the remaining text is shorter than chunk size, add it as final chunk
|
|
70
|
+
if len(text) <= CHUNK_SIZE:
|
|
71
|
+
chunks.append(text.strip())
|
|
72
|
+
break
|
|
73
|
+
|
|
74
|
+
# Find the last sentence boundary within CHUNK_SIZE
|
|
75
|
+
chunk_text = text[:CHUNK_SIZE]
|
|
76
|
+
last_break_index = -1
|
|
77
|
+
|
|
78
|
+
# Check each character in reverse order to find last punctuation
|
|
79
|
+
for i in range(len(chunk_text) - 1, -1, -1):
|
|
80
|
+
if chunk_text[i] in '-.—!?;:…\n':
|
|
81
|
+
last_break_index = i
|
|
82
|
+
break
|
|
83
|
+
|
|
84
|
+
if last_break_index == -1:
|
|
85
|
+
# If no punctuation found in chunk, look for the last space
|
|
86
|
+
# to avoid breaking words
|
|
87
|
+
last_space = chunk_text.rfind(' ')
|
|
88
|
+
if last_space != -1:
|
|
89
|
+
last_break_index = last_space
|
|
90
|
+
else:
|
|
91
|
+
# If no space found, use the full chunk size
|
|
92
|
+
last_break_index = CHUNK_SIZE - 1
|
|
93
|
+
|
|
94
|
+
# Add the chunk up to the break point
|
|
95
|
+
chunks.append(text[:last_break_index + 1].strip())
|
|
96
|
+
# Continue with remaining text
|
|
97
|
+
text = text[last_break_index + 1:].strip()
|
|
98
|
+
|
|
99
|
+
return chunks
|
|
100
|
+
|
|
101
|
+
|
|
102
|
+
def get_smallest_languages() -> List[str]:
|
|
103
|
+
return list(TTSLanguages)
|
|
104
|
+
|
|
105
|
+
def get_smallest_voices() -> List[str]:
|
|
106
|
+
return list(TTSVoices)
|
|
107
|
+
|
|
108
|
+
def get_smallest_models() -> List[str]:
|
|
109
|
+
return ["lightning"]
|
|
@@ -1,8 +1,8 @@
|
|
|
1
1
|
Metadata-Version: 2.1
|
|
2
2
|
Name: smallestai
|
|
3
|
-
Version: 1.3.
|
|
3
|
+
Version: 1.3.4
|
|
4
4
|
Summary: Official Python client for the Smallest AI API
|
|
5
|
-
Author-email: Smallest <
|
|
5
|
+
Author-email: Smallest <support@smallest.ai>
|
|
6
6
|
License: MIT
|
|
7
7
|
Project-URL: Homepage, https://github.com/smallest-inc/smallest-python-sdk
|
|
8
8
|
Keywords: smallest,smallest.ai,tts,text-to-speech
|
|
@@ -53,6 +53,7 @@ Currently, the library supports direct synthesis and the ability to synthesize s
|
|
|
53
53
|
|
|
54
54
|
- [Installation](#installation)
|
|
55
55
|
- [Get the API Key](#get-the-api-key)
|
|
56
|
+
- [Best Practices for Input Text](#best-practices-for-input-text)
|
|
56
57
|
- [Examples](#examples)
|
|
57
58
|
- [Sync](#sync)
|
|
58
59
|
- [Async](#async)
|
|
@@ -76,6 +77,15 @@ When using an SDK in your application, make sure to pin to at least the major ve
|
|
|
76
77
|
3. Create a new API Key and copy it.
|
|
77
78
|
4. Export the API Key in your environment with the name `SMALLEST_API_KEY`, ensuring that your application can access it securely for authentication.
|
|
78
79
|
|
|
80
|
+
## Best Practices for Input Text
|
|
81
|
+
While the `transliterate` parameter is provided, please note that it is not fully supported and may not perform consistently across all cases. It is recommended to use the model without relying on this parameter.
|
|
82
|
+
|
|
83
|
+
For optimal voice generation results:
|
|
84
|
+
|
|
85
|
+
1. For English, provide the input in Latin script (e.g., "Hello, how are you?").
|
|
86
|
+
2. For Hindi, provide the input in Devanagari script (e.g., "नमस्ते, आप कैसे हैं?").
|
|
87
|
+
3. For code-mixed input, use Latin script for English and Devanagari script for Hindi (e.g., "Hello, आप कैसे हैं?").
|
|
88
|
+
|
|
79
89
|
## Examples
|
|
80
90
|
|
|
81
91
|
### Sync
|
|
@@ -162,7 +172,7 @@ audio_bytes = await tts.synthesize(
|
|
|
162
172
|
|
|
163
173
|
### LLM to Speech
|
|
164
174
|
|
|
165
|
-
The `TextToAudioStream` class provides real-time text-to-speech processing, converting streaming text into audio output
|
|
175
|
+
The `TextToAudioStream` class provides real-time text-to-speech processing, converting streaming text into audio output. It's particularly useful for applications like voice assistants, live captioning, or interactive chatbots that require immediate audio feedback from text generation. Supports both synchronous and asynchronous TTS instance.
|
|
166
176
|
|
|
167
177
|
```python
|
|
168
178
|
import os
|
|
@@ -236,7 +246,7 @@ The processor yields raw audio data chunks without WAV headers for streaming eff
|
|
|
236
246
|
```python
|
|
237
247
|
from smallest.tts import Smallest
|
|
238
248
|
|
|
239
|
-
client = Smallest()
|
|
249
|
+
client = Smallest(api_key=os.environ.get("SMALLEST_API_KEY"))
|
|
240
250
|
|
|
241
251
|
print(f"Avalaible Languages: {client.get_languages()}")
|
|
242
252
|
print(f"Available Voices: {client.get_voices()}")
|
|
@@ -254,7 +264,7 @@ When implementing audio streaming with chunks of synthesized speech, WAV headers
|
|
|
254
264
|
- Sequential playback of chunks with headers causes audio artifacts (pop sounds) when concatenating or playing audio sequentially.
|
|
255
265
|
- Audio players would try to reinitialize audio settings for each chunk.
|
|
256
266
|
|
|
257
|
-
### Best Practices
|
|
267
|
+
### Best Practices for Audio Streaming
|
|
258
268
|
1. Stream raw PCM audio data without headers
|
|
259
269
|
2. Add a single WAV header only when:
|
|
260
270
|
- Saving the complete stream to a file
|
|
@@ -3,6 +3,7 @@ import jiwer
|
|
|
3
3
|
import httpx
|
|
4
4
|
import pytest
|
|
5
5
|
import wave
|
|
6
|
+
import re
|
|
6
7
|
from deepgram import DeepgramClient, DeepgramClientOptions, PrerecordedOptions, FileSource
|
|
7
8
|
|
|
8
9
|
from smallest.async_tts import AsyncSmallest
|
|
@@ -10,7 +11,8 @@ from smallest.async_tts import AsyncSmallest
|
|
|
10
11
|
from dotenv import load_dotenv
|
|
11
12
|
load_dotenv()
|
|
12
13
|
|
|
13
|
-
REFERENCE = "Wow! The jubilant child, bursting with glee, exclaimed, 'Look at those magnificent, vibrant balloons!' as they danced under the shimmering, rainbow-hued sky
|
|
14
|
+
REFERENCE = "Wow! The jubilant child, bursting with glee, exclaimed, 'Look at those magnificent, vibrant balloons!' as they danced under the shimmering, rainbow-hued sky."
|
|
15
|
+
|
|
14
16
|
|
|
15
17
|
transforms = jiwer.Compose(
|
|
16
18
|
[
|
|
@@ -32,7 +34,7 @@ config: DeepgramClientOptions = DeepgramClientOptions(api_key=os.environ.get("DE
|
|
|
32
34
|
deepgram: DeepgramClient = DeepgramClient("", config)
|
|
33
35
|
|
|
34
36
|
options: PrerecordedOptions = PrerecordedOptions(
|
|
35
|
-
model="nova-2",
|
|
37
|
+
model="nova-2-general",
|
|
36
38
|
smart_format=True,
|
|
37
39
|
utterances=True,
|
|
38
40
|
punctuate=True,
|
|
@@ -63,7 +65,7 @@ async def test_synthesize_save(reference_text):
|
|
|
63
65
|
wer = jiwer.wer(
|
|
64
66
|
reference_text,
|
|
65
67
|
hypothesis,
|
|
66
|
-
|
|
68
|
+
reference_transform=transforms,
|
|
67
69
|
hypothesis_transform=transforms,
|
|
68
70
|
)
|
|
69
71
|
assert wer <= 0.2
|
|
@@ -94,7 +96,7 @@ async def test_synthesize(reference_text):
|
|
|
94
96
|
wer = jiwer.wer(
|
|
95
97
|
reference_text,
|
|
96
98
|
hypothesis,
|
|
97
|
-
|
|
99
|
+
reference_transform=transforms,
|
|
98
100
|
hypothesis_transform=transforms,
|
|
99
101
|
)
|
|
100
102
|
assert wer <= 0.2
|
|
@@ -10,7 +10,7 @@ from smallest.tts import Smallest
|
|
|
10
10
|
from dotenv import load_dotenv
|
|
11
11
|
load_dotenv()
|
|
12
12
|
|
|
13
|
-
REFERENCE = "Wow! The jubilant child, bursting with glee, exclaimed, 'Look at those magnificent, vibrant balloons!' as they danced under the shimmering, rainbow-hued sky
|
|
13
|
+
REFERENCE = "Wow! The jubilant child, bursting with glee, exclaimed, 'Look at those magnificent, vibrant balloons!' as they danced under the shimmering, rainbow-hued sky."
|
|
14
14
|
|
|
15
15
|
transforms = jiwer.Compose(
|
|
16
16
|
[
|
|
@@ -59,7 +59,7 @@ def test_synthesize_save(reference_text):
|
|
|
59
59
|
wer = jiwer.wer(
|
|
60
60
|
reference_text,
|
|
61
61
|
hypothesis,
|
|
62
|
-
|
|
62
|
+
reference_transform=transforms,
|
|
63
63
|
hypothesis_transform=transforms,
|
|
64
64
|
)
|
|
65
65
|
logging.info(f"Word Error Rate: {wer}")
|
|
@@ -83,7 +83,7 @@ def test_synthesize(reference_text):
|
|
|
83
83
|
wer = jiwer.wer(
|
|
84
84
|
reference_text,
|
|
85
85
|
hypothesis,
|
|
86
|
-
|
|
86
|
+
reference_transform=transforms,
|
|
87
87
|
hypothesis_transform=transforms,
|
|
88
88
|
)
|
|
89
89
|
logging.info(f"Word Error Rate: {wer}")
|
|
@@ -0,0 +1,43 @@
|
|
|
1
|
+
import pytest
|
|
2
|
+
import re
|
|
3
|
+
import jiwer
|
|
4
|
+
|
|
5
|
+
from smallest.utils import (
|
|
6
|
+
preprocess_text,
|
|
7
|
+
split_into_chunks,
|
|
8
|
+
get_smallest_languages,
|
|
9
|
+
get_smallest_voices,
|
|
10
|
+
get_smallest_models
|
|
11
|
+
)
|
|
12
|
+
|
|
13
|
+
|
|
14
|
+
@pytest.mark.parametrize("input_text, expected_output", [
|
|
15
|
+
(
|
|
16
|
+
"Wow! The jubilant child, bursting with glee, $99.99 exclaimed, 'Look at those magnificent, vibrant balloons!' as they danced under the shimmering, rainbow-hued sky. \n\n\n वो रंग-बिरंगे गुब्बारे हवा में ऐसे झूल रहे थे जैसे एक खुशियों से \n\n 95 भरी दुनिया हो। सच में, यह एक अद्भुत और खुशी से भरा दृश्य था।",
|
|
17
|
+
"Wow! The jubilant child, bursting with glee, $99.99 exclaimed, 'Look at those magnificent, vibrant balloons!' as they danced under the shimmering, rainbow-hued sky. वो रंग-बिरंगे गुबबारे हवा में ऐसे झूल रहे थे जैसे एक खुशियों से 95 भरी दुनिया हो। सच में, यह एक अदभुत और खुशी से भरा दृशय था।"
|
|
18
|
+
),
|
|
19
|
+
# can add more tests here
|
|
20
|
+
])
|
|
21
|
+
def test_preprocess_text(input_text, expected_output):
|
|
22
|
+
assert preprocess_text(input_text) == expected_output
|
|
23
|
+
|
|
24
|
+
|
|
25
|
+
@pytest.mark.parametrize("input_text, expected_output", [
|
|
26
|
+
(
|
|
27
|
+
"Wow! The jubilant child, bursting with glee, exclaimed, 'Look at those magnificent, vibrant balloons!' as they danced under the shimmering, rainbow-hued sky. वो रंग बिरंगे गुब्बारे हवा में ऐसे झूल रहे थे जैसे एक खुशियों से भरी दुनिया हो। सच में, यह एक अद्भुत और खुशी से भरा दृश्य था।",
|
|
28
|
+
[
|
|
29
|
+
"Wow! The jubilant child, bursting with glee, exclaimed, 'Look at those magnificent, vibrant balloons!' as they danced under the shimmering, rainbow-hued sky.",
|
|
30
|
+
"वो रंग बिरंगे गुब्बारे हवा में ऐसे झूल रहे थे जैसे एक खुशियों से भरी दुनिया हो। सच में, यह एक अद्भुत और खुशी से भरा दृश्य था।"
|
|
31
|
+
]
|
|
32
|
+
),
|
|
33
|
+
# Add more test cases here as needed
|
|
34
|
+
])
|
|
35
|
+
def test_split_into_chunks(input_text, expected_output):
|
|
36
|
+
assert split_into_chunks(input_text) == expected_output
|
|
37
|
+
|
|
38
|
+
|
|
39
|
+
@pytest.mark.parametrize("expected_languages", [
|
|
40
|
+
['en', 'hi']
|
|
41
|
+
])
|
|
42
|
+
def test_get_smallest_languages(expected_languages):
|
|
43
|
+
assert get_smallest_languages() == expected_languages
|
|
@@ -1,7 +0,0 @@
|
|
|
1
|
-
from typing import Literal
|
|
2
|
-
|
|
3
|
-
TTSModels = Literal["lightning"]
|
|
4
|
-
TTSLanguages = Literal["en", "hi"]
|
|
5
|
-
TTSVoices = Literal["emily", "jasmine", "arman", "james", "mithali", "aravind", "raj",
|
|
6
|
-
"arjun", "sanya", "saina", "pooja", "saurabh", "nisha", "mansi", "radhika", "kajal",
|
|
7
|
-
"raghav", "deepika", "niharika", "monika", "raman", "diya", "ananya", "william"]
|
|
@@ -1,115 +0,0 @@
|
|
|
1
|
-
import re
|
|
2
|
-
import io
|
|
3
|
-
import unicodedata
|
|
4
|
-
from typing import List
|
|
5
|
-
from pydub import AudioSegment
|
|
6
|
-
from dataclasses import dataclass
|
|
7
|
-
from sacremoses import MosesPunctNormalizer
|
|
8
|
-
|
|
9
|
-
from .exceptions import ValidationError
|
|
10
|
-
from .models import TTSModels, TTSLanguages, TTSVoices
|
|
11
|
-
|
|
12
|
-
|
|
13
|
-
API_BASE_URL = "https://waves-api.smallest.ai/api/v1"
|
|
14
|
-
SENTENCE_END_REGEX = re.compile(r'.*[-.—!?;:…\n]$')
|
|
15
|
-
CHUNK_SIZE = 250
|
|
16
|
-
SAMPLE_WIDTH = 2
|
|
17
|
-
CHANNELS = 1
|
|
18
|
-
|
|
19
|
-
|
|
20
|
-
@dataclass
|
|
21
|
-
class TTSOptions:
|
|
22
|
-
model: TTSModels
|
|
23
|
-
sample_rate: int
|
|
24
|
-
voice: TTSVoices
|
|
25
|
-
api_key: str
|
|
26
|
-
add_wav_header: bool
|
|
27
|
-
speed: float
|
|
28
|
-
transliterate: bool
|
|
29
|
-
remove_extra_silence: bool
|
|
30
|
-
|
|
31
|
-
|
|
32
|
-
def validate_input(text: str, voice: TTSVoices, model: TTSModels, sample_rate: int, speed: float):
    """Validate synthesis parameters before a request is sent.

    Args:
        text: Text to synthesize; must be non-empty.
        voice: Voice name; must be one of ``TTSVoices``.
        model: Model name; must be one of ``TTSModels``.
        sample_rate: Output sample rate in Hz, 8000-48000 inclusive.
        speed: Speech speed multiplier, 0.5-2.0 inclusive.

    Raises:
        ValidationError: If any argument is empty, unsupported, or out of range.
    """
    if not text:
        raise ValidationError("Text cannot be empty")
    if voice not in TTSVoices.__args__:
        raise ValidationError(f"Invalid voice: {voice}")
    # Derive the allowed models from the TTSModels Literal (mirroring the
    # voice check above) instead of a hard-coded list, so this validation
    # cannot drift out of sync with models.py.
    if model not in TTSModels.__args__:
        raise ValidationError(f"Invalid model: {model}")
    if not 8000 <= sample_rate <= 48000:
        raise ValidationError(f"Invalid sample rate: {sample_rate}. Must be between 8000 and 48000")
    if not 0.5 <= speed <= 2.0:
        raise ValidationError(f"Invalid speed: {speed}. Must be between 0.5 and 2.0")
|
|
43
|
-
|
|
44
|
-
|
|
45
|
-
def add_wav_header(frame_input: bytes, sample_rate: int = 24000, sample_width: int = 2, channels: int = 1) -> bytes:
    """Wrap raw PCM frames in a standard RIFF/WAVE header.

    Uses the stdlib ``wave`` module rather than round-tripping through a
    pydub ``AudioSegment``: writing a WAV header needs no audio-processing
    dependency, and ``wave`` produces the same 44-byte canonical header.

    Args:
        frame_input: Raw little-endian PCM frames.
        sample_rate: Sample rate in Hz (default 24000).
        sample_width: Bytes per sample (default 2, i.e. 16-bit).
        channels: Number of interleaved channels (default 1, mono).

    Returns:
        Complete WAV file contents (header followed by the frames).
    """
    wav_buf = io.BytesIO()
    with wave.open(wav_buf, "wb") as wav_file:
        wav_file.setnchannels(channels)
        wav_file.setsampwidth(sample_width)
        wav_file.setframerate(sample_rate)
        wav_file.writeframes(frame_input)
    return wav_buf.getvalue()
|
|
51
|
-
|
|
52
|
-
|
|
53
|
-
@lru_cache(maxsize=1)
def _punct_normalizer() -> MosesPunctNormalizer:
    """Build the Moses punctuation normalizer once; constructing it compiles
    many regexes, so it is cached instead of rebuilt on every call."""
    return MosesPunctNormalizer()


def preprocess_text(text: str) -> str:
    """Normalize input text before synthesis.

    Folds characters to their closest ASCII form, lowercases, replaces em
    dashes with spaces, and normalizes punctuation with Moses.

    NOTE(review): the NFKD + ASCII encode/decode strips every non-ASCII
    character, which would empty Devanagari (Hindi) input entirely —
    confirm this is intended for the "hi" language path.

    Args:
        text: Raw user-supplied text.

    Returns:
        The normalized, stripped text.
    """
    # Replace special characters with their closest ASCII form.
    text = unicodedata.normalize('NFKD', text).encode('ASCII', 'ignore').decode('ASCII')
    text = text.lower()
    text = text.replace("—", " ")
    # Normalize punctuation using the cached Moses punct normalizer.
    text = _punct_normalizer().normalize(text)
    return text.strip()
|
|
62
|
-
|
|
63
|
-
def split_into_chunks(text: str) -> List[str]:
    """
    Splits the input text into chunks based on sentence boundaries
    defined by SENTENCE_END_REGEX and the maximum chunk size.

    Scans the text one character at a time, remembering the index of the
    most recent sentence-final character. Once CHUNK_SIZE characters have
    accumulated, the text is cut at that boundary — or hard-cut at
    CHUNK_SIZE if no boundary was seen — and scanning restarts on the
    remainder (note the repeated slicing makes this O(n^2) on very long
    input).
    """
    chunks = []
    current_chunk = ""
    last_break_index = 0  # index into `text` of the last sentence boundary seen

    i = 0
    while i < len(text):
        current_chunk += text[i]

        # Check for sentence boundary using regex
        if SENTENCE_END_REGEX.match(current_chunk):
            last_break_index = i

        if len(current_chunk) >= CHUNK_SIZE:
            if last_break_index > 0:
                # Split at the last valid sentence boundary
                chunk = text[:last_break_index + 1].strip()
                chunk = chunk.replace("—", " ")
                chunks.append(chunk)

                # Drop the emitted prefix and restart the scan from the
                # beginning of the remaining text.
                text = text[last_break_index + 1:]
                i = -1  # Reset index to process the remaining text
                current_chunk = ""
                last_break_index = 0
            else:
                # No sentence boundary found, split at max length
                current_chunk = current_chunk.replace("—", " ")
                chunks.append(current_chunk.strip())
                text = text[CHUNK_SIZE:]
                i = -1  # Reset index to process the remaining text
                current_chunk = ""

        i += 1

    # Whatever remains after the loop becomes the final (short) chunk.
    if text:
        text = text.replace("—", " ")
        chunks.append(text.strip())

    return chunks
|
|
106
|
-
|
|
107
|
-
|
|
108
|
-
def get_smallest_languages() -> List[str]:
    """Return the language codes supported by the TTS service."""
    return [*TTSLanguages.__args__]
|
|
110
|
-
|
|
111
|
-
def get_smallest_voices() -> List[str]:
    """Return every available voice name, in declaration order."""
    return [voice for voice in TTSVoices.__args__]
|
|
113
|
-
|
|
114
|
-
def get_smallest_models() -> List[str]:
    """Return the available TTS model names.

    Derived from the ``TTSModels`` Literal — mirroring
    ``get_smallest_languages`` / ``get_smallest_voices`` — instead of a
    hard-coded list, so it cannot drift out of sync with models.py.
    """
    return list(TTSModels.__args__)
|
|
@@ -1,40 +0,0 @@
|
|
|
1
|
-
import pytest
|
|
2
|
-
|
|
3
|
-
from smallest.utils import (
|
|
4
|
-
preprocess_text,
|
|
5
|
-
get_smallest_languages,
|
|
6
|
-
get_smallest_voices,
|
|
7
|
-
get_smallest_models
|
|
8
|
-
)
|
|
9
|
-
|
|
10
|
-
|
|
11
|
-
@pytest.mark.parametrize("input_text,expected_output", [
    (
        "Check out this amazing website: example.com! It has 10,000 unique visitors per day.\n\nAlso, the price is $99.99",
        "check out this amazing website: example dot com! it has ten thousand unique visitors per day. also, the price is $ninety-nine point nine nine"
    ),
    # can add more tests here
])
def test_preprocess_text(input_text, expected_output):
    # NOTE(review): the expected string implies number/URL verbalization
    # ("ten thousand", "dot com"), which preprocess_text does not appear
    # to perform — verify this fixture against the actual normalizer.
    result = preprocess_text(input_text)
    assert result == expected_output
|
|
20
|
-
|
|
21
|
-
|
|
22
|
-
@pytest.mark.parametrize("expected_languages", [
    ['en', 'hi']
])
def test_get_smallest_languages(expected_languages):
    # The helper must expose exactly the TTSLanguages Literal values.
    languages = get_smallest_languages()
    assert languages == expected_languages
|
|
27
|
-
|
|
28
|
-
|
|
29
|
-
@pytest.mark.parametrize("expected_voices", [
    ["emily", "jasmine", "arman", "james", "mithali", "aravind", "raj",
     "arjun", "sanya", "saina", "pooja", "saurabh", "nisha", "mansi",
     "radhika", "kajal", "raghav", "deepika", "niharika", "monika",
     "raman", "diya", "ananya", "william"]
])
def test_get_smallest_voices(expected_voices):
    # get_smallest_voices() returns list(TTSVoices.__args__). The fixture
    # previously listed only the first 7 of the 24 declared voices, so the
    # assertion could never pass against models.py; it now mirrors the
    # full TTSVoices Literal in declaration order.
    assert get_smallest_voices() == expected_voices
|
|
34
|
-
|
|
35
|
-
|
|
36
|
-
@pytest.mark.parametrize("expected_models", [
    ["lightning"]
])
def test_get_smallest_models(expected_models):
    # Only the "lightning" model is currently exposed.
    models = get_smallest_models()
    assert models == expected_models
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|