smallestai 3.0.3__py3-none-any.whl → 4.0.0__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Potentially problematic release.


This version of smallestai has been flagged as potentially problematic; consult the package registry's advisory page for details.

smallestai/__init__.py CHANGED
@@ -84,7 +84,7 @@ from smallestai.atoms import (
84
84
  from smallestai.waves import (
85
85
  WavesClient,
86
86
  AsyncWavesClient,
87
- TextToAudioStream
87
+ WavesStreamingTTS
88
88
  )
89
89
 
90
90
  from smallestai.atoms import __all__ as atoms_all
@@ -1,5 +1,5 @@
1
1
  from smallestai.waves.waves_client import WavesClient
2
2
  from smallestai.waves.async_waves_client import AsyncWavesClient
3
- from smallestai.waves.stream_tts import TextToAudioStream
3
+ from smallestai.waves.stream_tts import WavesStreamingTTS, TTSConfig
4
4
 
5
- __all__ = ["WavesClient", "AsyncWavesClient", "TextToAudioStream"]
5
+ __all__ = ["WavesClient", "AsyncWavesClient", "WavesStreamingTTS", "TTSConfig"]
@@ -4,10 +4,10 @@ import json
4
4
  import aiohttp
5
5
  import aiofiles
6
6
  import requests
7
- from typing import Optional, Union, List, AsyncIterator
7
+ from typing import Optional, Union, List
8
8
 
9
9
  from smallestai.waves.exceptions import TTSError, APIError
10
- from smallestai.waves.utils import (TTSOptions, validate_input, preprocess_text, add_wav_header, chunk_text,
10
+ from smallestai.waves.utils import (TTSOptions, validate_input,
11
11
  get_smallest_languages, get_smallest_models, ALLOWED_AUDIO_EXTENSIONS, API_BASE_URL)
12
12
 
13
13
 
@@ -22,7 +22,8 @@ class AsyncWavesClient:
22
22
  consistency: Optional[float] = 0.5,
23
23
  similarity: Optional[float] = 0.0,
24
24
  enhancement: Optional[int] = 1,
25
- add_wav_header: Optional[bool] = True
25
+ language: Optional[str] = "en",
26
+ output_format: Optional[str] = "wav"
26
27
  ) -> None:
27
28
  """
28
29
  AsyncSmallest Instance for asynchronous text-to-speech synthesis.
@@ -40,7 +41,8 @@ class AsyncWavesClient:
40
41
  - consistency (float): This parameter controls word repetition and skipping. Decrease it to prevent skipped words, and increase it to prevent repetition. Only supported in `lightning-large` model. Range - [0, 1]
41
42
  - similarity (float): This parameter controls the similarity between the synthesized audio and the reference audio. Increase it to make the speech more similar to the reference audio. Only supported in `lightning-large` model. Range - [0, 1]
42
43
  - enhancement (int): Enhances speech quality at the cost of increased latency. Only supported in `lightning-large` model. Range - [0, 2].
43
- - add_wav_header (bool): Whether to add a WAV header to the output audio.
44
+ - language (str): The language for synthesis. Default is "en".
45
+ - output_format (str): The output audio format. Options: "pcm", "mp3", "wav", "mulaw". Default is "pcm".
44
46
 
45
47
  Methods:
46
48
  - get_languages: Returns a list of available languages for synthesis.
@@ -61,11 +63,12 @@ class AsyncWavesClient:
61
63
  sample_rate=sample_rate,
62
64
  voice_id=voice_id,
63
65
  api_key=self.api_key,
64
- add_wav_header=add_wav_header,
65
66
  speed=speed,
66
67
  consistency=consistency,
67
68
  similarity=similarity,
68
- enhancement=enhancement
69
+ enhancement=enhancement,
70
+ language=language,
71
+ output_format=output_format
69
72
  )
70
73
  self.session = None
71
74
 
@@ -89,9 +92,9 @@ class AsyncWavesClient:
89
92
  return False
90
93
 
91
94
 
92
def get_languages(self, model="lightning") -> List[str]:
    """Return the list of language codes supported by *model*."""
    return get_smallest_languages(model)
95
98
 
96
99
  def get_cloned_voices(self) -> str:
97
100
  """Returns a list of your cloned voices."""
@@ -130,18 +133,14 @@ class AsyncWavesClient:
130
133
  async def synthesize(
131
134
  self,
132
135
  text: str,
133
- stream: Optional[bool] = False,
134
- save_as: Optional[str] = None,
135
136
  **kwargs
136
- ) -> Union[bytes, None, AsyncIterator[bytes]]:
137
+ ) -> Union[bytes]:
137
138
  """
138
139
  Asynchronously synthesize speech from the provided text.
139
140
 
140
141
  Args:
141
142
  - text (str): The text to be converted to speech.
142
143
  - stream (Optional[bool]): If True, returns an iterator yielding audio chunks instead of a full byte array.
143
- - save_as (Optional[str]): If provided, the synthesized audio will be saved to this file path.
144
- The file must have a .wav extension.
145
144
  - kwargs: Additional optional parameters to override `__init__` options for this call.
146
145
 
147
146
  Returns:
@@ -151,7 +150,7 @@ class AsyncWavesClient:
151
150
  - Otherwise, returns the synthesized audio content as bytes.
152
151
 
153
152
  Raises:
154
- - TTSError: If the provided file name does not have a .wav extension when `save_as` is specified.
153
+ - TTSError: If the provided file name does not have a .wav or .mp3 extension when `save_as` is specified.
155
154
  - APIError: If the API request fails or returns an error.
156
155
  - ValueError: If an unexpected parameter is passed in `kwargs`.
157
156
  """
@@ -172,65 +171,40 @@ class AsyncWavesClient:
172
171
  for key, value in kwargs.items():
173
172
  setattr(opts, key, value)
174
173
 
175
- text = preprocess_text(text)
176
174
  validate_input(text, opts.model, opts.sample_rate, opts.speed, opts.consistency, opts.similarity, opts.enhancement)
177
175
 
178
- self.chunk_size = 250
179
- if opts.model == 'lightning-large':
180
- self.chunk_size = 140
181
-
182
- chunks = chunk_text(text, self.chunk_size)
183
-
184
- async def audio_stream():
185
- for chunk in chunks:
186
- payload = {
187
- "text": chunk,
188
- "sample_rate": opts.sample_rate,
189
- "voice_id": opts.voice_id,
190
- "add_wav_header": False,
191
- "speed": opts.speed,
192
- "model": opts.model
193
- }
194
-
195
- if opts.model == "lightning-large":
196
- if opts.consistency is not None:
197
- payload["consistency"] = opts.consistency
198
- if opts.similarity is not None:
199
- payload["similarity"] = opts.similarity
200
- if opts.enhancement is not None:
201
- payload["enhancement"] = opts.enhancement
202
-
203
-
204
- headers = {
205
- "Authorization": f"Bearer {self.api_key}",
206
- "Content-Type": "application/json",
207
- }
208
-
209
- async with self.session.post(f"{API_BASE_URL}/{opts.model}/get_speech", json=payload, headers=headers) as res:
210
- if res.status != 200:
211
- raise APIError(f"Failed to synthesize speech: {await res.text()}. For more information, visit https://waves.smallest.ai/")
212
-
213
- yield await res.read()
176
+ payload = {
177
+ "text": text,
178
+ "voice_id": opts.voice_id,
179
+ "sample_rate": opts.sample_rate,
180
+ "speed": opts.speed,
181
+ "consistency": opts.consistency,
182
+ "similarity": opts.similarity,
183
+ "enhancement": opts.enhancement,
184
+ "language": opts.language,
185
+ "output_format": opts.output_format
186
+ }
214
187
 
215
- if stream:
216
- return audio_stream()
217
-
218
- audio_content = b"".join([chunk async for chunk in audio_stream()])
219
-
220
- if save_as:
221
- if not save_as.endswith(".wav"):
222
- raise TTSError("Invalid file name. Extension must be .wav")
223
-
224
- async with aiofiles.open(save_as, mode='wb') as f:
225
- await f.write(add_wav_header(audio_content, opts.sample_rate))
226
-
227
- return None
228
-
229
- if opts.add_wav_header:
230
- return add_wav_header(audio_content, opts.sample_rate)
188
+ if opts.model == "lightning-large" or opts.model == "lightning-v2":
189
+ if opts.consistency is not None:
190
+ payload["consistency"] = opts.consistency
191
+ if opts.similarity is not None:
192
+ payload["similarity"] = opts.similarity
193
+ if opts.enhancement is not None:
194
+ payload["enhancement"] = opts.enhancement
195
+
196
+ headers = {
197
+ "Authorization": f"Bearer {self.api_key}",
198
+ "Content-Type": "application/json",
199
+ }
231
200
 
232
- return audio_content
201
+ async with self.session.post(f"{API_BASE_URL}/{opts.model}/get_speech", json=payload, headers=headers) as res:
202
+ if res.status != 200:
203
+ raise APIError(f"Failed to synthesize speech: {await res.text()}. For more information, visit https://waves.smallest.ai/")
204
+
205
+ audio_bytes = await res.content.read()
233
206
 
207
+ return audio_bytes
234
208
  finally:
235
209
  if should_cleanup and self.session:
236
210
  await self.session.close()
@@ -316,9 +290,8 @@ class AsyncWavesClient:
316
290
  if res.status != 200:
317
291
  raise APIError(f"Failed to delete voice: {await res.text()}. For more information, visit https://waves.smallest.ai/")
318
292
 
319
- return await res.text()
320
-
293
+ return json.dumps(await res.json(), indent=4, ensure_ascii=False)
321
294
  finally:
322
295
  if should_cleanup and self.session:
323
296
  await self.session.close()
324
- self.session = None
297
+ self.session = None
@@ -1,5 +1,8 @@
1
# Language codes supported by each Waves TTS model.
TTSLanguages_lightning = ["en", "hi"]
TTSLanguages_lightning_large = ["en", "hi"]
TTSLanguages_lightning_v2 = [
    "en", "hi", "mr", "kn", "ta", "bn", "gu", "de",
    "fr", "es", "it", "pl", "nl", "ru", "ar", "he",
]

# Identifiers of the available TTS models.
TTSModels = [
    "lightning",
    "lightning-large",
    "lightning-v2",
]
@@ -1,272 +1,207 @@
1
- import asyncio
1
+ import json
2
+ import base64
2
3
  import time
3
- from threading import Thread
4
- from queue import Queue, Empty
5
- from typing import AsyncGenerator, Optional, Union, List, Dict, Any
6
-
7
- from smallestai.waves.waves_client import WavesClient
8
- from smallestai.waves.exceptions import APIError
9
- from smallestai.waves.async_waves_client import AsyncWavesClient
10
- from smallestai.waves.utils import SENTENCE_END_REGEX
11
-
12
- class TextToAudioStream:
13
- def __init__(
14
- self,
15
- tts_instance: Union[WavesClient, AsyncWavesClient],
16
- queue_timeout: Optional[float] = 5.0,
17
- max_retries: Optional[int] = 3,
18
- verbose: bool = False
19
- ):
20
- """
21
- A real-time text-to-speech processor that converts streaming text into audio output.
22
- Useful for applications requiring immediate audio feedback from text generation,
23
- such as voice assistants, live captioning, or interactive chatbots.
24
-
25
- ⚠️ `add_wav_header` is disabled by default for streaming efficiency. Refer to the README for more information.
26
-
27
- Features:
28
- - Streams audio chunks as soon as text is available.
29
- - Handles both sync and async text-to-speech engines.
30
- - Automatically retries failed synthesis attempts.
31
- - Low latency between text generation and speech output.
32
-
33
- Args:
34
- tts_instance: The text-to-speech engine to use (Smallest or AsyncSmallest)
35
- queue_timeout: How long to wait for new text (seconds, default: 1.0)
36
- max_retries: Number of retry attempts for failed synthesis (default: 3)
37
- verbose: Whether to log detailed metrics about TTS requests (default: False)
38
- """
39
- self.tts_instance = tts_instance
40
- self.tts_instance.opts.add_wav_header = False
41
- self.sentence_end_regex = SENTENCE_END_REGEX
42
- self.queue_timeout = queue_timeout
43
- self.max_retries = max_retries
44
- self.queue = Queue()
45
- self.buffer_size = 250
46
- self.stop_flag = False
47
- self.verbose = verbose
4
+ import threading
5
+ import queue
6
+ from typing import Generator
7
+ from dataclasses import dataclass
8
+ from websocket import WebSocketApp
9
+
10
@dataclass
class TTSConfig:
    """Synthesis options for the Waves streaming TTS websocket API."""
    voice_id: str
    api_key: str
    language: str = "en"
    sample_rate: int = 24000
    speed: float = 1.0
    consistency: float = 0.5
    enhancement: int = 1
    similarity: float = 0
    max_buffer_flush_ms: int = 0


class WavesStreamingTTS:
    """Streaming text-to-speech over the Waves `lightning-v2` websocket endpoint.

    The server pushes base64-encoded audio chunks; the public generator
    methods decode them and yield raw audio bytes as they arrive.
    """

    def __init__(self, config: TTSConfig):
        self.config = config
        self.ws_url = "wss://waves-api.smallest.ai/api/v1/lightning-v2/get_speech/stream"
        self.ws = None
        self.audio_queue = queue.Queue()   # decoded audio chunks; None = end-of-stream sentinel
        self.error_queue = queue.Queue()   # exceptions reported by websocket callbacks
        self.is_complete = False
        self.is_connected = False
        self.request_id = None

    def _get_headers(self):
        """Headers for the websocket handshake (bearer auth)."""
        return [f"Authorization: Bearer {self.config.api_key}"]

    def _create_payload(self, text: str, continue_stream: bool = False, flush: bool = False):
        """Build one websocket message from the configured synthesis options."""
        return {
            "voice_id": self.config.voice_id,
            "text": text,
            "language": self.config.language,
            "sample_rate": self.config.sample_rate,
            "speed": self.config.speed,
            "consistency": self.config.consistency,
            "similarity": self.config.similarity,
            "enhancement": self.config.enhancement,
            "max_buffer_flush_ms": self.config.max_buffer_flush_ms,
            "continue": continue_stream,
            "flush": flush
        }

    def _on_open(self, ws):
        self.is_connected = True

    def _on_message(self, ws, message):
        """Decode a server message into the audio queue; report errors."""
        try:
            data = json.loads(message)
            status = data.get("status", "")

            if status == "error":
                self.error_queue.put(Exception(data.get("message", "Unknown error")))
                return

            if not self.request_id:
                self.request_id = data.get("request_id")

            audio_b64 = data.get("data", {}).get("audio")
            if audio_b64:
                self.audio_queue.put(base64.b64decode(audio_b64))

            if status == "complete":
                self.is_complete = True
                self.audio_queue.put(None)  # end-of-stream sentinel

        except Exception as e:
            self.error_queue.put(e)

    def _on_error(self, ws, error):
        self.error_queue.put(error)

    def _on_close(self, ws, *args):
        self.is_connected = False
        if not self.is_complete:
            # Unblock any consumer still waiting on the audio queue.
            self.audio_queue.put(None)

    def _connect(self):
        """Open the websocket in a daemon thread and wait for the handshake.

        Raises:
            Exception: if the connection is not established within 5 seconds.
        """
        if self.ws:
            self.ws.close()

        self.ws = WebSocketApp(
            self.ws_url,
            header=self._get_headers(),
            on_open=self._on_open,
            on_message=self._on_message,
            on_error=self._on_error,
            on_close=self._on_close
        )

        ws_thread = threading.Thread(target=self.ws.run_forever)
        ws_thread.daemon = True
        ws_thread.start()

        timeout = 5.0
        start_time = time.time()
        while not self.is_connected and time.time() - start_time < timeout:
            time.sleep(0.1)

        if not self.is_connected:
            raise Exception("Failed to connect to WebSocket")

    def _drain_audio(self, poll_timeout: float):
        """Yield decoded audio chunks until the end-of-stream sentinel.

        Re-raises the first error reported by a websocket callback.
        """
        while True:
            if not self.error_queue.empty():
                raise self.error_queue.get()

            try:
                chunk = self.audio_queue.get(timeout=poll_timeout)
                if chunk is None:
                    break
                yield chunk
            except queue.Empty:
                if self.is_complete:
                    break
                continue

    def synthesize(self, text: str) -> Generator[bytes, None, None]:
        """Synthesize *text* in a single request, yielding audio chunks as bytes."""
        self._reset_state()
        self._connect()

        try:
            payload = self._create_payload(text)
            self.ws.send(json.dumps(payload))
            yield from self._drain_audio(1.0)
        finally:
            # Fix: always release the connection — previously an error raised
            # from the consume loop left the websocket open.
            self.ws.close()

    def synthesize_streaming(self, text_stream: Generator[str, None, None],
                             continue_stream: bool = True,
                             auto_flush: bool = True) -> Generator[bytes, None, None]:
        """Synthesize text arriving from *text_stream*, yielding audio bytes.

        Text chunks are forwarded on a background thread; when *auto_flush*
        is true a final flush message is sent once the stream is exhausted.
        """
        self._reset_state()
        self._connect()

        def send_text():
            try:
                for text_chunk in text_stream:
                    if text_chunk.strip():
                        payload = self._create_payload(text_chunk, continue_stream=continue_stream)
                        self.ws.send(json.dumps(payload))

                if auto_flush:
                    flush_payload = self._create_payload("", flush=True)
                    self.ws.send(json.dumps(flush_payload))
            except Exception as e:
                self.error_queue.put(e)

        sender_thread = threading.Thread(target=send_text)
        sender_thread.daemon = True
        sender_thread.start()

        try:
            yield from self._drain_audio(1.0)
        finally:
            # Fix: always release the connection, even on the error path.
            self.ws.close()

    def send_text_chunk(self, text: str, continue_stream: bool = True, flush: bool = False):
        """Send one text chunk on an already-open streaming session.

        Raises:
            Exception: if no websocket connection is open.
        """
        if not self.is_connected:
            raise Exception("WebSocket not connected")
        payload = self._create_payload(text, continue_stream=continue_stream, flush=flush)
        self.ws.send(json.dumps(payload))

    def flush_buffer(self):
        """Ask the server to flush any buffered text on the open session.

        Raises:
            Exception: if no websocket connection is open.
        """
        if not self.is_connected:
            raise Exception("WebSocket not connected")
        payload = self._create_payload("", flush=True)
        self.ws.send(json.dumps(payload))

    def start_streaming_session(self) -> Generator[bytes, None, None]:
        """Open a session and yield audio for chunks sent via send_text_chunk.

        NOTE(review): the connection is intentionally left open when this
        generator finishes so the caller can keep using the session.
        """
        self._reset_state()
        self._connect()
        yield from self._drain_audio(0.1)

    def _reset_state(self):
        """Discard per-request state before opening a new connection."""
        self.audio_queue = queue.Queue()
        self.error_queue = queue.Queue()
        self.is_complete = False
        self.is_connected = False
        self.request_id = None
smallestai/waves/utils.py CHANGED
@@ -1,16 +1,13 @@
1
- import re
2
- import io
3
1
  from typing import List
4
2
  from typing import Optional
5
- from pydub import AudioSegment
6
3
  from dataclasses import dataclass
7
4
 
8
5
  from smallestai.waves.exceptions import ValidationError
9
- from smallestai.waves.models import TTSModels, TTSLanguages
6
+ from smallestai.waves.models import TTSModels, TTSLanguages_lightning, TTSLanguages_lightning_large, TTSLanguages_lightning_v2
10
7
 
11
8
 
12
9
  API_BASE_URL = "https://waves-api.smallest.ai/api/v1"
13
- SENTENCE_END_REGEX = re.compile(r'.*[-.—!?,;:…।|]$')
10
+ WEBSOCKET_URL = "wss://waves-api.smallest.ai/api/v1/lightning-v2/get_speech/stream"
14
11
  SAMPLE_WIDTH = 2
15
12
  CHANNELS = 1
16
13
  ALLOWED_AUDIO_EXTENSIONS = ['.mp3', '.wav']
@@ -22,11 +19,12 @@ class TTSOptions:
22
19
  sample_rate: int
23
20
  voice_id: str
24
21
  api_key: str
25
- add_wav_header: bool
26
22
  speed: float
27
23
  consistency: float
28
24
  similarity: float
29
25
  enhancement: int
26
+ language: str
27
+ output_format: str
30
28
 
31
29
 
32
30
  def validate_input(text: str, model: str, sample_rate: int, speed: float, consistency: Optional[float] = None, similarity: Optional[float] = None, enhancement: Optional[int] = None):
@@ -46,52 +44,15 @@ def validate_input(text: str, model: str, sample_rate: int, speed: float, consis
46
44
  raise ValidationError(f"Invalid enhancement: {enhancement}. Must be between 0 and 2.")
47
45
 
48
46
 
49
- def add_wav_header(frame_input: bytes, sample_rate: int = 24000, sample_width: int = 2, channels: int = 1) -> bytes:
50
- audio = AudioSegment(data=frame_input, sample_width=sample_width, frame_rate=sample_rate, channels=channels)
51
- wav_buf = io.BytesIO()
52
- audio.export(wav_buf, format="wav")
53
- wav_buf.seek(0)
54
- return wav_buf.read()
55
-
56
-
57
- def preprocess_text(text: str) -> str:
58
- text = text.replace("\n", " ").replace("\t", " ")
59
- text = re.sub(r'\s+', ' ', text)
60
- return text.strip()
61
-
62
-
63
- def chunk_text(text: str, chunk_size: int = 250) -> List[str]:
64
- chunks = []
65
- while text:
66
- if len(text) <= chunk_size:
67
- chunks.append(text.strip())
68
- break
69
-
70
- chunk_text = text[:chunk_size]
71
- last_break_index = -1
72
-
73
- # Find last sentence boundary using regex
74
- for i in range(len(chunk_text) - 1, -1, -1):
75
- if SENTENCE_END_REGEX.match(chunk_text[:i + 1]):
76
- last_break_index = i
77
- break
78
-
79
- if last_break_index == -1:
80
- # Fallback to space if no sentence boundary found
81
- last_space = chunk_text.rfind(' ')
82
- if last_space != -1:
83
- last_break_index = last_space
84
- else:
85
- last_break_index = chunk_size - 1
86
-
87
- chunks.append(text[:last_break_index + 1].strip())
88
- text = text[last_break_index + 1:].strip()
89
-
90
- return chunks
91
-
92
-
93
def get_smallest_languages(model: str = 'lightning') -> List[str]:
    """Return the language codes supported by the given TTS model.

    Raises:
        ValidationError: if *model* is not one of the known TTS models.
    """
    supported = {
        'lightning': TTSLanguages_lightning,
        'lightning-large': TTSLanguages_lightning_large,
        'lightning-v2': TTSLanguages_lightning_v2,
    }
    if model not in supported:
        raise ValidationError(f"Invalid model: {model}. Must be one of {TTSModels}")
    return supported[model]
95
56
 
96
57
  def get_smallest_models() -> List[str]:
97
58
  return TTSModels
@@ -1,13 +1,12 @@
1
1
  import os
2
2
  import json
3
- import wave
4
3
  import copy
5
4
  import requests
6
- from typing import Optional, Union, List, Iterator
5
+ from typing import Optional, Union, List
7
6
 
8
7
  from smallestai.waves.exceptions import TTSError, APIError
9
- from smallestai.waves.utils import (TTSOptions, validate_input, preprocess_text, add_wav_header, chunk_text,
10
- get_smallest_languages, get_smallest_models, ALLOWED_AUDIO_EXTENSIONS, API_BASE_URL)
8
+ from smallestai.waves.utils import (TTSOptions, validate_input,
9
+ get_smallest_languages, get_smallest_models, ALLOWED_AUDIO_EXTENSIONS, API_BASE_URL)
11
10
 
12
11
  class WavesClient:
13
12
  def __init__(
@@ -20,7 +19,8 @@ class WavesClient:
20
19
  consistency: Optional[float] = 0.5,
21
20
  similarity: Optional[float] = 0.0,
22
21
  enhancement: Optional[int] = 1,
23
- add_wav_header: Optional[bool] = True
22
+ language: Optional[str] = "en",
23
+ output_format: Optional[str] = "wav"
24
24
  ) -> None:
25
25
  """
26
26
  Smallest Instance for text-to-speech synthesis.
@@ -37,7 +37,8 @@ class WavesClient:
37
37
  - consistency (float): This parameter controls word repetition and skipping. Decrease it to prevent skipped words, and increase it to prevent repetition. Only supported in `lightning-large` model. Range - [0, 1]
38
38
  - similarity (float): This parameter controls the similarity between the synthesized audio and the reference audio. Increase it to make the speech more similar to the reference audio. Only supported in `lightning-large` model. Range - [0, 1]
39
39
  - enhancement (int): Enhances speech quality at the cost of increased latency. Only supported in `lightning-large` model. Range - [0, 2].
40
- - add_wav_header (bool): Whether to add a WAV header to the output audio.
40
+ - language (str): The language for synthesis. Default is "en".
41
+ - output_format (str): The output audio format. Options: "pcm", "mp3", "wav", "mulaw". Default is "pcm".
41
42
 
42
43
  Methods:
43
44
  - get_languages: Returns a list of available languages for synthesis.
@@ -58,17 +59,18 @@ class WavesClient:
58
59
  sample_rate=sample_rate,
59
60
  voice_id=voice_id,
60
61
  api_key=self.api_key,
61
- add_wav_header=add_wav_header,
62
62
  speed=speed,
63
63
  consistency=consistency,
64
64
  similarity=similarity,
65
- enhancement=enhancement
65
+ enhancement=enhancement,
66
+ language=language,
67
+ output_format=output_format
66
68
  )
67
69
 
68
70
 
69
def get_languages(self, model: str = "lightning") -> List[str]:
    """Return the list of language codes supported by *model*."""
    return get_smallest_languages(model)
72
74
 
73
75
  def get_cloned_voices(self) -> str:
74
76
  """Returns a list of your cloned voices."""
@@ -107,17 +109,13 @@ class WavesClient:
107
109
  def synthesize(
108
110
  self,
109
111
  text: str,
110
- stream: Optional[bool] = False,
111
- save_as: Optional[str] = None,
112
112
  **kwargs
113
- ) -> Union[bytes, None, Iterator[bytes]]:
113
+ ) -> Union[bytes]:
114
114
  """
115
115
  Synthesize speech from the provided text.
116
116
 
117
117
  - text (str): The text to be converted to speech.
118
118
  - stream (Optional[bool]): If True, returns an iterator yielding audio chunks instead of a full byte array.
119
- - save_as (Optional[str]): If provided, the synthesized audio will be saved to this file path.
120
- The file must have a .wav extension.
121
119
  - kwargs: Additional optional parameters to override `__init__` options for this call.
122
120
 
123
121
  Returns:
@@ -127,7 +125,7 @@ class WavesClient:
127
125
  - Otherwise, returns the synthesized audio content as bytes.
128
126
 
129
127
  Raises:
130
- - TTSError: If the provided file name does not have a .wav extension when `save_as` is specified.
128
+ - TTSError: If the provided file name does not have a .wav or .mp3 extension when `save_as` is specified.
131
129
  - APIError: If the API request fails or returns an error.
132
130
  """
133
131
  opts = copy.deepcopy(self.opts)
@@ -140,64 +138,38 @@ class WavesClient:
140
138
  for key, value in kwargs.items():
141
139
  setattr(opts, key, value)
142
140
 
143
- text = preprocess_text(text)
144
141
  validate_input(text, opts.model, opts.sample_rate, opts.speed, opts.consistency, opts.similarity, opts.enhancement)
145
142
 
146
- self.chunk_size = 250
147
- if opts.model == "lightning-large":
148
- self.chunk_size = 140
149
-
150
- chunks = chunk_text(text, self.chunk_size)
151
-
152
- def audio_stream():
153
- for chunk in chunks:
154
- payload = {
155
- "text": chunk,
156
- "sample_rate": opts.sample_rate,
157
- "voice_id": opts.voice_id,
158
- "add_wav_header": False,
159
- "speed": opts.speed,
160
- }
161
-
162
- if opts.model == "lightning-large":
163
- if opts.consistency is not None:
164
- payload["consistency"] = opts.consistency
165
- if opts.similarity is not None:
166
- payload["similarity"] = opts.similarity
167
- if opts.enhancement is not None:
168
- payload["enhancement"] = opts.enhancement
169
-
170
- headers = {
171
- "Authorization": f"Bearer {self.api_key}",
172
- "Content-Type": "application/json",
173
- }
174
-
175
- res = requests.post(f"{API_BASE_URL}/{opts.model}/get_speech", json=payload, headers=headers)
176
- if res.status_code != 200:
177
- raise APIError(f"Failed to synthesize speech: {res.text}. Please check if you have set the correct API key. For more information, visit https://waves.smallest.ai/")
143
+ payload = {
144
+ "text": text,
145
+ "voice_id": opts.voice_id,
146
+ "sample_rate": opts.sample_rate,
147
+ "speed": opts.speed,
148
+ "consistency": opts.consistency,
149
+ "similarity": opts.similarity,
150
+ "enhancement": opts.enhancement,
151
+ "language": opts.language,
152
+ "output_format": opts.output_format
153
+ }
178
154
 
179
- yield res.content
180
-
181
- if stream:
182
- return audio_stream()
183
-
184
- audio_content = b"".join(audio_stream())
155
+ if opts.model == "lightning-large" or opts.model == "lightning-v2":
156
+ if opts.consistency is not None:
157
+ payload["consistency"] = opts.consistency
158
+ if opts.similarity is not None:
159
+ payload["similarity"] = opts.similarity
160
+ if opts.enhancement is not None:
161
+ payload["enhancement"] = opts.enhancement
162
+
163
+ headers = {
164
+ "Authorization": f"Bearer {self.api_key}",
165
+ "Content-Type": "application/json",
166
+ }
185
167
 
186
- if save_as:
187
- if not save_as.endswith(".wav"):
188
- raise TTSError("Invalid file name. Extension must be .wav")
189
-
190
- with wave.open(save_as, "wb") as wf:
191
- wf.setnchannels(1)
192
- wf.setsampwidth(2)
193
- wf.setframerate(opts.sample_rate)
194
- wf.writeframes(audio_content)
195
- return None
196
-
197
- if opts.add_wav_header:
198
- return add_wav_header(audio_content, opts.sample_rate)
199
-
200
- return audio_content
168
+ res = requests.post(f"{API_BASE_URL}/{opts.model}/get_speech", json=payload, headers=headers)
169
+ if res.status_code != 200:
170
+ raise APIError(f"Failed to synthesize speech: {res.text}. Please check if you have set the correct API key. For more information, visit https://waves.smallest.ai/")
171
+
172
+ return res.content
201
173
 
202
174
 
203
175
  def add_voice(self, display_name: str, file_path: str) -> str:
@@ -262,4 +234,4 @@ class WavesClient:
262
234
  if response.status_code != 200:
263
235
  raise APIError(f"Failed to delete voice: {response.text}. For more information, visit https://waves.smallest.ai/")
264
236
 
265
- return json.dumps(response.json(), indent=4, ensure_ascii=False)
237
+ return json.dumps(response.json(), indent=4, ensure_ascii=False)
@@ -1,6 +1,6 @@
1
1
  Metadata-Version: 2.4
2
2
  Name: smallestai
3
- Version: 3.0.3
3
+ Version: 4.0.0
4
4
  Summary: Official Python client for the Smallest AI API
5
5
  Author-email: Smallest <support@smallest.ai>
6
6
  License: MIT
@@ -16,6 +16,7 @@ Requires-Dist: aiohttp
16
16
  Requires-Dist: aiofiles
17
17
  Requires-Dist: requests
18
18
  Requires-Dist: pydub
19
+ Requires-Dist: websocket-client
19
20
  Requires-Dist: urllib3<3.0.0,>=1.25.3
20
21
  Requires-Dist: python-dateutil>=2.8.2
21
22
  Requires-Dist: pydantic>=2
@@ -1,4 +1,4 @@
1
- smallestai/__init__.py,sha256=zVO8iaNFVgNErxEt58AuB1npc7MR8x8Oi9A-Z2t8Q6w,2624
1
+ smallestai/__init__.py,sha256=lY4DcFTosH2W0KVVN7pSbJmZBu067wG_y1u3GwGNru8,2624
2
2
  smallestai/atoms/__init__.py,sha256=cn5_9tVsUwFQ_zdAZv263P4ow4N7dxRWCYAz82GjwuI,9342
3
3
  smallestai/atoms/api_client.py,sha256=EcyN6nFp9U4u8TPJx3a9ZvbM2T4a9xrHGopQGLZuJpw,27448
4
4
  smallestai/atoms/api_response.py,sha256=eMxw1mpmJcoGZ3gs9z6jM4oYoZ10Gjk333s9sKxGv7s,652
@@ -73,15 +73,15 @@ smallestai/atoms/models/update_agent_request_synthesizer_voice_config.py,sha256=
73
73
  smallestai/atoms/models/update_agent_request_synthesizer_voice_config_one_of.py,sha256=8nGPcJ_CRUlXXjy3vCjpmbHWVBwQo2ebFP1K0MZPAsk,3955
74
74
  smallestai/atoms/models/update_agent_request_synthesizer_voice_config_one_of1.py,sha256=9AJxgngoNSMvDbceajIqnG23PY4rw84coTh7yUTNS3c,3487
75
75
  smallestai/atoms/models/upload_text_to_knowledge_base_request.py,sha256=Sxg0vRv_naT15odE8fBUeyjwLpEYOmQwGcJuzRRr90A,2587
76
- smallestai/waves/__init__.py,sha256=Hkq7N2nuz_wS7pC6QeUnIU1MzQnX_nrhfXGpjGSvFhQ,244
77
- smallestai/waves/async_waves_client.py,sha256=hv9rQ8-ykWuHoAcmZPhwtX_-AAQT4H4G3H8c4BhO5-0,12658
76
+ smallestai/waves/__init__.py,sha256=hxyqisgFiKiroxupuZeNXpXFIbnivmdgPrid3CnLhh0,268
77
+ smallestai/waves/async_waves_client.py,sha256=BgiSqd2UjwECCPwuh2dyhLSBP0inIsbPUEbduWTJrmI,11704
78
78
  smallestai/waves/exceptions.py,sha256=nY6I8fCXe2By54CytQ0-i3hFiYtt8TYAKj0g6OYsCjc,585
79
- smallestai/waves/models.py,sha256=egN4V_HiWIQBLKQdXt1ax1W-1tLK42xqx4FALHyMxh8,108
80
- smallestai/waves/stream_tts.py,sha256=Ppjwp1jXpUSpyNkwCnesMYQbAdyzKLMj_1o1iTb3jaA,10958
81
- smallestai/waves/utils.py,sha256=0VqMA4apJ-9U7abOznVXqUYEEAxQ2JkpLGyFhcJ_Kbw,3307
82
- smallestai/waves/waves_client.py,sha256=XKdPVWs-HZDzlxzF1x3cMdJQ_q71ZFS1P5oltzj2KO4,10740
83
- smallestai-3.0.3.dist-info/licenses/LICENSE,sha256=kK3HNKhN7luQhkjkNWIvy9_gizbEDUM4mSv_HWq9uuM,1068
84
- smallestai-3.0.3.dist-info/METADATA,sha256=QG7FNlKA5cKXGGi1ay_bCRM4s4aWf64A70r1T2yM68I,20392
85
- smallestai-3.0.3.dist-info/WHEEL,sha256=0CuiUZ_p9E4cD6NyLD6UG80LBXYyiSYZOKDm5lp32xk,91
86
- smallestai-3.0.3.dist-info/top_level.txt,sha256=pdJzm1VC2J6RxoobATz45L9U3cki4AFLigsfvETz7Io,11
87
- smallestai-3.0.3.dist-info/RECORD,,
79
+ smallestai/waves/models.py,sha256=FaMVkOFyNCVpWvyMCmqkv3t1wmnfCs1HIULxLr1L8XE,283
80
+ smallestai/waves/stream_tts.py,sha256=c9r8mZuuFjbyWsUrlZ1jb0WNX7-lR39EXDUqyF-5g14,6792
81
+ smallestai/waves/utils.py,sha256=sqDpfa5SC60C_kJZo4MKxlDfkX7RRzO6aJ2hKpNMemE,2273
82
+ smallestai/waves/waves_client.py,sha256=U6aqClYL49cTtYisvpUVhas2miGZiCfqwTU0eDUY548,9770
83
+ smallestai-4.0.0.dist-info/licenses/LICENSE,sha256=kK3HNKhN7luQhkjkNWIvy9_gizbEDUM4mSv_HWq9uuM,1068
84
+ smallestai-4.0.0.dist-info/METADATA,sha256=bk0xBChPACeJiL8j6zxbHGnty60N5cjDH1sLPx32hLM,20424
85
+ smallestai-4.0.0.dist-info/WHEEL,sha256=_zCd3N1l69ArxyTb8rzEoP9TpbYXkqRFSNOD5OuxnTs,91
86
+ smallestai-4.0.0.dist-info/top_level.txt,sha256=pdJzm1VC2J6RxoobATz45L9U3cki4AFLigsfvETz7Io,11
87
+ smallestai-4.0.0.dist-info/RECORD,,
@@ -1,5 +1,5 @@
1
1
  Wheel-Version: 1.0
2
- Generator: setuptools (80.3.1)
2
+ Generator: setuptools (80.9.0)
3
3
  Root-Is-Purelib: true
4
4
  Tag: py3-none-any
5
5