pipecat-supertonic 0.1.0__tar.gz
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- pipecat_supertonic-0.1.0/PKG-INFO +87 -0
- pipecat_supertonic-0.1.0/README.md +64 -0
- pipecat_supertonic-0.1.0/pyproject.toml +41 -0
- pipecat_supertonic-0.1.0/src/pipecat_supertonic/__init__.py +2 -0
- pipecat_supertonic-0.1.0/src/pipecat_supertonic/py.typed +0 -0
- pipecat_supertonic-0.1.0/src/pipecat_supertonic/tts.py +347 -0
|
@@ -0,0 +1,87 @@
|
|
|
1
|
+
Metadata-Version: 2.4
|
|
2
|
+
Name: pipecat-supertonic
|
|
3
|
+
Version: 0.1.0
|
|
4
|
+
Summary: Supertonic TTS service integration for Pipecat
|
|
5
|
+
Keywords: pipecat,tts,supertonic,voice,speech-synthesis
|
|
6
|
+
Author: Archit498
|
|
7
|
+
Author-email: Archit498 <archit@voicing.ai>
|
|
8
|
+
License-Expression: BSD-2-Clause
|
|
9
|
+
Classifier: Development Status :: 4 - Beta
|
|
10
|
+
Classifier: Intended Audience :: Developers
|
|
11
|
+
Classifier: License :: OSI Approved :: BSD License
|
|
12
|
+
Classifier: Programming Language :: Python :: 3
|
|
13
|
+
Classifier: Programming Language :: Python :: 3.11
|
|
14
|
+
Classifier: Programming Language :: Python :: 3.12
|
|
15
|
+
Classifier: Programming Language :: Python :: 3.13
|
|
16
|
+
Classifier: Topic :: Multimedia :: Sound/Audio :: Speech
|
|
17
|
+
Classifier: Topic :: Scientific/Engineering :: Artificial Intelligence
|
|
18
|
+
Classifier: Typing :: Typed
|
|
19
|
+
Requires-Dist: pipecat-ai[websockets-base]>=1.2,<2
|
|
20
|
+
Requires-Dist: supertonic>=1.2.1,<2
|
|
21
|
+
Requires-Python: >=3.11
|
|
22
|
+
Description-Content-Type: text/markdown
|
|
23
|
+
|
|
24
|
+
# pipecat-supertonic
|
|
25
|
+
|
|
26
|
+
`pipecat-supertonic` provides a Pipecat-compatible `TTSService` wrapper for the
|
|
27
|
+
official [Supertonic](https://github.com/supertone-inc/supertonic) Python SDK.
|
|
28
|
+
|
|
29
|
+
The package is designed to feel like a native Pipecat service:
|
|
30
|
+
|
|
31
|
+
- import with `from pipecat_supertonic import SupertonicTTSService`
|
|
32
|
+
- configure with `SupertonicTTSService.Settings(...)`
|
|
33
|
+
- drop directly into an existing Pipecat pipeline
|
|
34
|
+
|
|
35
|
+
## Install
|
|
36
|
+
|
|
37
|
+
```bash
|
|
38
|
+
pip install pipecat-supertonic
|
|
39
|
+
```
|
|
40
|
+
|
|
41
|
+
Or with `uv`:
|
|
42
|
+
|
|
43
|
+
```bash
|
|
44
|
+
uv add pipecat-supertonic
|
|
45
|
+
```
|
|
46
|
+
|
|
47
|
+
## Usage
|
|
48
|
+
|
|
49
|
+
```python
|
|
50
|
+
from pipecat_supertonic import SupertonicTTSService
|
|
51
|
+
|
|
52
|
+
tts = SupertonicTTSService(
|
|
53
|
+
settings=SupertonicTTSService.Settings(
|
|
54
|
+
voice="M1",
|
|
55
|
+
language="en",
|
|
56
|
+
total_steps=5,
|
|
57
|
+
speed=1.05,
|
|
58
|
+
)
|
|
59
|
+
)
|
|
60
|
+
|
|
61
|
+
await tts.warmup()
|
|
62
|
+
```
|
|
63
|
+
|
|
64
|
+
`warmup()` is required before the service is used in a live Pipecat pipeline.
|
|
65
|
+
Call it during application startup so Supertonic can download and cache the
|
|
66
|
+
model before the first user request arrives.
|
|
67
|
+
|
|
68
|
+
## Warmup Contract
|
|
69
|
+
|
|
70
|
+
This package intentionally does not lazy-load Supertonic during active TTS
|
|
71
|
+
requests. If the service is used before `warmup()`, it fails fast with a clear
|
|
72
|
+
error telling the caller to warm the service up first.
|
|
73
|
+
|
|
74
|
+
This avoids first-request cold-start delays and keeps Pipecat TTS frame ordering
|
|
75
|
+
stable.
|
|
76
|
+
|
|
77
|
+
## Example
|
|
78
|
+
|
|
79
|
+
See `examples/voice-supertonic.py` for a minimal package-level example.
|
|
80
|
+
|
|
81
|
+
## Development
|
|
82
|
+
|
|
83
|
+
```bash
|
|
84
|
+
uv sync --group dev
|
|
85
|
+
uv run pytest
|
|
86
|
+
uv run ruff check .
|
|
87
|
+
```
|
|
@@ -0,0 +1,64 @@
|
|
|
1
|
+
# pipecat-supertonic
|
|
2
|
+
|
|
3
|
+
`pipecat-supertonic` provides a Pipecat-compatible `TTSService` wrapper for the
|
|
4
|
+
official [Supertonic](https://github.com/supertone-inc/supertonic) Python SDK.
|
|
5
|
+
|
|
6
|
+
The package is designed to feel like a native Pipecat service:
|
|
7
|
+
|
|
8
|
+
- import with `from pipecat_supertonic import SupertonicTTSService`
|
|
9
|
+
- configure with `SupertonicTTSService.Settings(...)`
|
|
10
|
+
- drop directly into an existing Pipecat pipeline
|
|
11
|
+
|
|
12
|
+
## Install
|
|
13
|
+
|
|
14
|
+
```bash
|
|
15
|
+
pip install pipecat-supertonic
|
|
16
|
+
```
|
|
17
|
+
|
|
18
|
+
Or with `uv`:
|
|
19
|
+
|
|
20
|
+
```bash
|
|
21
|
+
uv add pipecat-supertonic
|
|
22
|
+
```
|
|
23
|
+
|
|
24
|
+
## Usage
|
|
25
|
+
|
|
26
|
+
```python
|
|
27
|
+
from pipecat_supertonic import SupertonicTTSService
|
|
28
|
+
|
|
29
|
+
tts = SupertonicTTSService(
|
|
30
|
+
settings=SupertonicTTSService.Settings(
|
|
31
|
+
voice="M1",
|
|
32
|
+
language="en",
|
|
33
|
+
total_steps=5,
|
|
34
|
+
speed=1.05,
|
|
35
|
+
)
|
|
36
|
+
)
|
|
37
|
+
|
|
38
|
+
await tts.warmup()
|
|
39
|
+
```
|
|
40
|
+
|
|
41
|
+
`warmup()` is required before the service is used in a live Pipecat pipeline.
|
|
42
|
+
Call it during application startup so Supertonic can download and cache the
|
|
43
|
+
model before the first user request arrives.
|
|
44
|
+
|
|
45
|
+
## Warmup Contract
|
|
46
|
+
|
|
47
|
+
This package intentionally does not lazy-load Supertonic during active TTS
|
|
48
|
+
requests. If the service is used before `warmup()`, it fails fast with a clear
|
|
49
|
+
error telling the caller to warm the service up first.
|
|
50
|
+
|
|
51
|
+
This avoids first-request cold-start delays and keeps Pipecat TTS frame ordering
|
|
52
|
+
stable.
|
|
53
|
+
|
|
54
|
+
## Example
|
|
55
|
+
|
|
56
|
+
See `examples/voice-supertonic.py` for a minimal package-level example.
|
|
57
|
+
|
|
58
|
+
## Development
|
|
59
|
+
|
|
60
|
+
```bash
|
|
61
|
+
uv sync --group dev
|
|
62
|
+
uv run pytest
|
|
63
|
+
uv run ruff check .
|
|
64
|
+
```
|
|
@@ -0,0 +1,41 @@
|
|
|
1
|
+
[project]
|
|
2
|
+
name = "pipecat-supertonic"
|
|
3
|
+
version = "0.1.0"
|
|
4
|
+
description = "Supertonic TTS service integration for Pipecat"
|
|
5
|
+
readme = "README.md"
|
|
6
|
+
license = "BSD-2-Clause"
|
|
7
|
+
authors = [
|
|
8
|
+
{ name = "Archit498", email = "archit@voicing.ai" }
|
|
9
|
+
]
|
|
10
|
+
requires-python = ">=3.11"
|
|
11
|
+
keywords = ["pipecat", "tts", "supertonic", "voice", "speech-synthesis"]
|
|
12
|
+
classifiers = [
|
|
13
|
+
"Development Status :: 4 - Beta",
|
|
14
|
+
"Intended Audience :: Developers",
|
|
15
|
+
"License :: OSI Approved :: BSD License",
|
|
16
|
+
"Programming Language :: Python :: 3",
|
|
17
|
+
"Programming Language :: Python :: 3.11",
|
|
18
|
+
"Programming Language :: Python :: 3.12",
|
|
19
|
+
"Programming Language :: Python :: 3.13",
|
|
20
|
+
"Topic :: Multimedia :: Sound/Audio :: Speech",
|
|
21
|
+
"Topic :: Scientific/Engineering :: Artificial Intelligence",
|
|
22
|
+
"Typing :: Typed",
|
|
23
|
+
]
|
|
24
|
+
dependencies = [
|
|
25
|
+
"pipecat-ai[websockets-base]>=1.2,<2",
|
|
26
|
+
"supertonic>=1.2.1,<2",
|
|
27
|
+
]
|
|
28
|
+
|
|
29
|
+
[dependency-groups]
|
|
30
|
+
dev = [
|
|
31
|
+
"pytest>=9,<10",
|
|
32
|
+
"pytest-asyncio>=1,<2",
|
|
33
|
+
"ruff>=0.12,<1",
|
|
34
|
+
]
|
|
35
|
+
|
|
36
|
+
[tool.ruff]
|
|
37
|
+
line-length = 100
|
|
38
|
+
|
|
39
|
+
[build-system]
|
|
40
|
+
requires = ["uv_build>=0.10.0,<0.11.0"]
|
|
41
|
+
build-backend = "uv_build"
|
|
File without changes
|
|
@@ -0,0 +1,347 @@
|
|
|
1
|
+
#
|
|
2
|
+
# Copyright (c) 2026
|
|
3
|
+
#
|
|
4
|
+
# SPDX-License-Identifier: BSD-2-Clause
|
|
5
|
+
#
|
|
6
|
+
|
|
7
|
+
"""Supertonic TTS service integration for Pipecat."""
|
|
8
|
+
|
|
9
|
+
import asyncio
|
|
10
|
+
from collections.abc import AsyncGenerator
|
|
11
|
+
from dataclasses import dataclass, field
|
|
12
|
+
from typing import Any
|
|
13
|
+
|
|
14
|
+
import numpy as np
|
|
15
|
+
from loguru import logger
|
|
16
|
+
|
|
17
|
+
from pipecat.audio.utils import create_stream_resampler
|
|
18
|
+
from pipecat.frames.frames import ErrorFrame, Frame, TTSAudioRawFrame
|
|
19
|
+
from pipecat.services.settings import NOT_GIVEN, TTSSettings, _NotGiven, assert_given
|
|
20
|
+
from pipecat.services.tts_service import TTSService
|
|
21
|
+
from pipecat.transcriptions.language import Language
|
|
22
|
+
from pipecat.utils.tracing.service_decorators import traced_tts
|
|
23
|
+
|
|
24
|
+
try:
|
|
25
|
+
from supertonic import TTS as SupertonicSDK
|
|
26
|
+
except ModuleNotFoundError as e:
|
|
27
|
+
logger.error(f"Exception: {e}")
|
|
28
|
+
logger.error("In order to use Supertonic, you need to `pip install supertonic`.")
|
|
29
|
+
raise Exception(f"Missing module: {e}")
|
|
30
|
+
|
|
31
|
+
|
|
32
|
+
# Two-letter language codes accepted by `language_to_supertonic_language`.
# Kept as an immutable set so membership checks are constant-time.
SUPPORTED_LANGUAGES = frozenset(
    (
        "ar", "bg", "cs", "da", "de", "el", "en", "es",
        "et", "fi", "fr", "hi", "hr", "hu", "id", "it",
        "ja", "ko", "lt", "lv", "nl", "pl", "pt", "ro",
        "ru", "sk", "sl", "sv", "tr", "uk", "vi",
    )
)

# Fallback code returned when a language is not in SUPPORTED_LANGUAGES.
UNKNOWN_LANGUAGE = "na"
|
|
68
|
+
|
|
69
|
+
|
|
70
|
+
def language_to_supertonic_language(language: Language) -> str:
    """Map a Pipecat language to the code passed to Supertonic.

    Only the part of the language tag before the first ``-`` is considered
    (so a region-qualified tag is reduced to its base language), compared
    case-insensitively against ``SUPPORTED_LANGUAGES``.

    Args:
        language: The language to convert.

    Returns:
        The lower-cased base language code when it is supported; otherwise
        ``UNKNOWN_LANGUAGE`` (``"na"``), after logging a warning.
    """
    primary_subtag, _, _ = str(language).partition("-")
    candidate = primary_subtag.lower()

    if candidate not in SUPPORTED_LANGUAGES:
        logger.warning(
            f"Language {language} is not supported by Supertonic. Using fallback "
            f"language '{UNKNOWN_LANGUAGE}'."
        )
        return UNKNOWN_LANGUAGE

    return candidate
|
|
89
|
+
|
|
90
|
+
|
|
91
|
+
@dataclass
class SupertonicTTSSettings(TTSSettings):
    """Settings for SupertonicTTSService.

    Extends ``TTSSettings`` with the Supertonic-specific synthesis options
    below. Every field defaults to the ``NOT_GIVEN`` sentinel, and each field
    also accepts ``None``.

    Parameters:
        speed: Speech speed multiplier.
        total_steps: Number of synthesis steps.
        max_chunk_length: Maximum characters per synthesized chunk.
        silence_duration: Silence inserted between synthesized chunks
            (presumably in seconds -- confirm against the Supertonic SDK).
    """

    # Each default is produced by a default_factory that returns NOT_GIVEN.
    speed: float | None | _NotGiven = field(default_factory=lambda: NOT_GIVEN)
    total_steps: int | None | _NotGiven = field(default_factory=lambda: NOT_GIVEN)
    max_chunk_length: int | None | _NotGiven = field(default_factory=lambda: NOT_GIVEN)
    silence_duration: float | None | _NotGiven = field(default_factory=lambda: NOT_GIVEN)
|
|
106
|
+
|
|
107
|
+
|
|
108
|
+
class SupertonicTTSService(TTSService):
    """Supertonic text-to-speech service for Pipecat.

    The Supertonic SDK object is built in a worker thread by :meth:`warmup`.
    Synthesis requests never load the SDK lazily: if the service has not been
    warmed up, :meth:`run_tts` yields an ``ErrorFrame`` telling the caller to
    call ``warmup()`` first.
    """

    Settings = SupertonicTTSSettings
    _settings: Settings

    def __init__(
        self,
        *,
        model: str | None = None,
        voice: str | None = None,
        language: Language | str | None = None,
        speed: float | None = None,
        total_steps: int | None = None,
        max_chunk_length: int | None = None,
        silence_duration: float | None = None,
        auto_download: bool = True,
        intra_op_num_threads: int | None = None,
        inter_op_num_threads: int | None = None,
        sample_rate: int | None = None,
        settings: Settings | None = None,
        **kwargs,
    ):
        """Initialize the Supertonic TTS service.

        Built-in defaults are ``model="supertonic-3"``, ``voice="M1"``,
        ``language=Language.EN``, ``speed=1.05``, ``total_steps=5``,
        ``max_chunk_length=None`` and ``silence_duration=0.3``.

        Args:
            model: Supertonic model name.
            voice: Supertonic voice name.
            language: Language for synthesis.
            speed: Speech speed multiplier.
            total_steps: Number of synthesis steps.
            max_chunk_length: Maximum characters per synthesized chunk.
            silence_duration: Silence inserted between synthesized chunks.
            auto_download: Whether to download model assets automatically.
            intra_op_num_threads: ONNX intra-op thread count.
            inter_op_num_threads: ONNX inter-op thread count.
            sample_rate: Output sample rate for generated audio.
            settings: Runtime-updatable settings. When provided alongside direct
                parameters, ``settings`` values take precedence.
            **kwargs: Additional keyword arguments passed to ``TTSService``.
        """
        default_settings = self.Settings(
            model="supertonic-3",
            voice="M1",
            language=Language.EN,
            speed=1.05,
            total_steps=5,
            max_chunk_length=None,
            silence_duration=0.3,
        )

        # Direct keyword arguments override the built-in defaults; ``None``
        # means "not supplied" and leaves the default in place.
        direct_overrides = {
            "model": model,
            "voice": voice,
            "language": language,
            "speed": speed,
            "total_steps": total_steps,
            "max_chunk_length": max_chunk_length,
            "silence_duration": silence_duration,
        }
        for name, value in direct_overrides.items():
            if value is not None:
                setattr(default_settings, name, value)

        # An explicit settings object is applied last, so it wins over both
        # the defaults and the direct keyword arguments.
        if settings is not None:
            default_settings.apply_update(settings)

        super().__init__(
            sample_rate=sample_rate,
            push_start_frame=True,
            push_stop_frames=True,
            settings=default_settings,
            **kwargs,
        )

        self._auto_download = auto_download
        self._intra_op_num_threads = intra_op_num_threads
        self._inter_op_num_threads = inter_op_num_threads

        self._resampler = create_stream_resampler()
        # SDK instance; stays None until warmup() has loaded it.
        self._tts: Any | None = None
        # Voice-name -> voice-style cache for the currently loaded SDK.
        self._voice_styles: dict[str, object] = {}
        self._available_voice_names: tuple[str, ...] = ()
        # Serializes SDK construction and teardown.
        self._tts_lock = asyncio.Lock()

    async def warmup(self) -> None:
        """Download and initialize Supertonic assets for this service instance.

        Call this during application startup before the service is used in a
        live Pipecat pipeline. This avoids first-request cold-start delays and
        keeps TTS frame ordering stable during active calls.
        """
        await self._ensure_tts()

    def can_generate_metrics(self) -> bool:
        """Indicate that this service supports TTFB and usage metrics."""
        return True

    def language_to_service_language(self, language: Language) -> str:
        """Convert a Pipecat language enum to Supertonic's language format."""
        return language_to_supertonic_language(language)

    async def _update_settings(self, delta: Settings) -> dict[str, object]:
        """Apply a settings delta.

        When the model changes, the cached SDK instance, the voice-style cache
        and the voice-name list are discarded. If the service had already been
        warmed up, the SDK is rebuilt for the new model right away: synthesis
        never loads the SDK lazily, so leaving it unloaded would make every
        later request fail with the "not warmed up" error.

        A failed reload is logged and leaves the service un-warmed; callers
        then see the fail-fast warmup error until ``warmup()`` succeeds.

        Args:
            delta: The settings changes to apply.

        Returns:
            The mapping of changed settings returned by the base class.
        """
        changed = await super()._update_settings(delta)
        if "model" in changed:
            async with self._tts_lock:
                was_warm = self._tts is not None
                self._tts = None
                self._voice_styles.clear()
                self._available_voice_names = ()
                if was_warm:
                    try:
                        await self._load_sdk()
                    except Exception as e:
                        logger.error(
                            f"{self}: failed to reload Supertonic after model change: {e}"
                        )
        return changed

    async def _load_sdk(self) -> Any:
        """Build the SDK in a worker thread and publish it on ``self``.

        The caller must hold ``self._tts_lock``. The SDK and its voice-name
        list are stored only after both were obtained, so a failure never
        leaves a half-initialized instance behind.

        Returns:
            The newly constructed SDK instance.
        """
        model = assert_given(self._settings.model)
        sdk = await asyncio.to_thread(
            SupertonicSDK,
            model=model,
            auto_download=self._auto_download,
            intra_op_num_threads=self._intra_op_num_threads,
            inter_op_num_threads=self._inter_op_num_threads,
        )
        voice_names = tuple(sdk.voice_style_names)
        self._available_voice_names = voice_names
        self._tts = sdk
        return sdk

    async def _ensure_tts(self) -> Any:
        """Return the SDK instance, constructing it first if necessary."""
        if self._tts is not None:
            return self._tts

        async with self._tts_lock:
            # Re-check under the lock: another task may have loaded the SDK
            # while this one was waiting.
            if self._tts is None:
                await self._load_sdk()
            return self._tts

    def _require_warmup(self) -> Any:
        """Return the loaded SDK instance.

        Raises:
            RuntimeError: If the SDK has not been loaded via ``warmup()``.
        """
        if self._tts is None:
            raise RuntimeError(
                "SupertonicTTSService is not warmed up. Call `await tts.warmup()` "
                "during application startup before using the service."
            )
        return self._tts

    async def _get_voice_style(self, voice_name: str) -> object:
        """Return the voice style for ``voice_name``, caching it after first use.

        Args:
            voice_name: Name of the voice to look up.

        Returns:
            The voice-style object produced by the SDK's ``get_voice_style``.

        Raises:
            RuntimeError: If the service has not been warmed up.
            ValueError: If ``voice_name`` is not one of the loaded SDK's voices.
        """
        tts = self._require_warmup()

        if voice_name not in self._available_voice_names:
            valid_voices = ", ".join(sorted(self._available_voice_names)) or "none"
            raise ValueError(
                f"Supertonic TTS voice {voice_name!r} is not supported "
                f"(must be one of: {valid_voices})"
            )

        cached = self._voice_styles.get(voice_name)
        if cached is not None:
            return cached

        style = await asyncio.to_thread(tts.get_voice_style, voice_name)
        self._voice_styles[voice_name] = style
        return style

    def _resolve_synthesis_language(self, tts: Any, language: Any) -> str:
        """Pick the language code handed to the SDK's ``synthesize``.

        Args:
            tts: The loaded SDK instance.
            language: The configured language (enum, string, or ``None``).

        Returns:
            ``"en"`` when the SDK is not multilingual; ``UNKNOWN_LANGUAGE`` when
            no language is configured; otherwise the configured language.
        """
        if not tts.is_multilingual:
            return "en"
        if not language:
            return UNKNOWN_LANGUAGE
        # Defensive: a Language enum may be region-qualified, so it is reduced
        # to a Supertonic code here. Plain strings are passed through as-is.
        if isinstance(language, Language):
            return self.language_to_service_language(language)
        return language

    def _waveform_to_pcm16(self, waveform: np.ndarray) -> bytes:
        """Convert a Supertonic waveform array to mono PCM16 bytes.

        Floating-point input is treated as normalized to [-1.0, 1.0]; integer
        input is clipped to the int16 range without rescaling.

        Args:
            waveform: 1-D audio, or 2-D audio with a single channel on either
                axis.

        Returns:
            The samples as native-endian int16 bytes.

        Raises:
            ValueError: If the array is not mono or contains no samples.
            TypeError: If the dtype is neither floating-point nor integer.
        """
        audio = np.asarray(waveform)

        if audio.ndim == 2:
            if audio.shape[0] == 1:
                audio = audio[0]
            elif audio.shape[1] == 1:
                audio = audio[:, 0]
            else:
                raise ValueError(f"Expected mono audio from Supertonic, got shape {audio.shape}")
        elif audio.ndim != 1:
            raise ValueError(f"Expected 1-D or mono 2-D audio from Supertonic, got {audio.shape}")

        if audio.size == 0:
            raise ValueError("Supertonic returned empty audio")

        if np.issubdtype(audio.dtype, np.floating):
            # Casting NaN/inf to int16 is undefined, so map them to silence or
            # full scale before clipping.
            audio = np.nan_to_num(audio, nan=0.0, posinf=1.0, neginf=-1.0)
            audio = np.clip(audio, -1.0, 1.0)
            audio = (audio * np.iinfo(np.int16).max).astype(np.int16)
        elif np.issubdtype(audio.dtype, np.integer):
            audio = np.clip(audio, np.iinfo(np.int16).min, np.iinfo(np.int16).max).astype(np.int16)
        else:
            raise TypeError(f"Unsupported Supertonic waveform dtype: {audio.dtype}")

        return audio.tobytes()

    @traced_tts
    async def run_tts(self, text: str, context_id: str) -> AsyncGenerator[Frame, None]:
        """Generate speech from text using Supertonic.

        The whole utterance is synthesized in a worker thread and emitted as
        one audio frame, resampled when the SDK's sample rate differs from
        this service's. Any failure is reported as an ``ErrorFrame`` instead
        of being raised.

        Args:
            text: The text to synthesize.
            context_id: Identifier attached to the emitted audio frame.

        Yields:
            A ``TTSAudioRawFrame`` with mono PCM16 audio, or an ``ErrorFrame``.
        """
        logger.debug(f"{self}: Generating TTS [{text}]")

        try:
            await self.start_tts_usage_metrics(text)

            voice = assert_given(self._settings.voice)
            if voice is None:
                raise ValueError("Supertonic TTS voice must be specified")

            language = assert_given(self._settings.language)
            speed = assert_given(self._settings.speed)
            total_steps = assert_given(self._settings.total_steps)
            max_chunk_length = assert_given(self._settings.max_chunk_length)
            silence_duration = assert_given(self._settings.silence_duration)

            tts = self._require_warmup()
            voice_style = await self._get_voice_style(voice)
            synthesis_language = self._resolve_synthesis_language(tts, language)

            waveform, _ = await asyncio.to_thread(
                tts.synthesize,
                text,
                voice_style,
                total_steps=total_steps,
                speed=speed,
                max_chunk_length=max_chunk_length,
                silence_duration=silence_duration,
                lang=synthesis_language,
            )

            await self.stop_ttfb_metrics()

            audio = self._waveform_to_pcm16(waveform)
            if tts.sample_rate != self.sample_rate:
                audio = await self._resampler.resample(audio, tts.sample_rate, self.sample_rate)

            yield TTSAudioRawFrame(
                audio=audio,
                sample_rate=self.sample_rate,
                num_channels=1,
                context_id=context_id,
            )
        except Exception as e:
            yield ErrorFrame(error=f"Unknown error occurred: {e}")
        finally:
            await self.stop_ttfb_metrics()
|