pipecat-supertonic 0.1.0__tar.gz

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -0,0 +1,87 @@
1
+ Metadata-Version: 2.4
2
+ Name: pipecat-supertonic
3
+ Version: 0.1.0
4
+ Summary: Supertonic TTS service integration for Pipecat
5
+ Keywords: pipecat,tts,supertonic,voice,speech-synthesis
6
+ Author: Archit498
7
+ Author-email: Archit498 <archit@voicing.ai>
8
+ License-Expression: BSD-2-Clause
9
+ Classifier: Development Status :: 4 - Beta
10
+ Classifier: Intended Audience :: Developers
11
+ Classifier: License :: OSI Approved :: BSD License
12
+ Classifier: Programming Language :: Python :: 3
13
+ Classifier: Programming Language :: Python :: 3.11
14
+ Classifier: Programming Language :: Python :: 3.12
15
+ Classifier: Programming Language :: Python :: 3.13
16
+ Classifier: Topic :: Multimedia :: Sound/Audio :: Speech
17
+ Classifier: Topic :: Scientific/Engineering :: Artificial Intelligence
18
+ Classifier: Typing :: Typed
19
+ Requires-Dist: pipecat-ai[websockets-base]>=1.2,<2
20
+ Requires-Dist: supertonic>=1.2.1,<2
21
+ Requires-Python: >=3.11
22
+ Description-Content-Type: text/markdown
23
+
24
+ # pipecat-supertonic
25
+
26
+ `pipecat-supertonic` provides a Pipecat-compatible `TTSService` wrapper for the
27
+ official [Supertonic](https://github.com/supertone-inc/supertonic) Python SDK.
28
+
29
+ The package is designed to feel like a native Pipecat service:
30
+
31
+ - import with `from pipecat_supertonic import SupertonicTTSService`
32
+ - configure with `SupertonicTTSService.Settings(...)`
33
+ - drop directly into an existing Pipecat pipeline
34
+
35
+ ## Install
36
+
37
+ ```bash
38
+ pip install pipecat-supertonic
39
+ ```
40
+
41
+ Or with `uv`:
42
+
43
+ ```bash
44
+ uv add pipecat-supertonic
45
+ ```
46
+
47
+ ## Usage
48
+
49
+ ```python
50
+ from pipecat_supertonic import SupertonicTTSService
51
+
52
+ tts = SupertonicTTSService(
53
+ settings=SupertonicTTSService.Settings(
54
+ voice="M1",
55
+ language="en",
56
+ total_steps=5,
57
+ speed=1.05,
58
+ )
59
+ )
60
+
61
+ await tts.warmup()
62
+ ```
63
+
64
+ `warmup()` is required before the service is used in a live Pipecat pipeline.
65
+ Call it during application startup so Supertonic can download and cache the
66
+ model before the first user request arrives.
67
+
68
+ ## Warmup Contract
69
+
70
+ This package intentionally does not lazy-load Supertonic during active TTS
71
+ requests. If the service is used before `warmup()`, it fails fast with a clear
72
+ error telling the caller to warm the service up first.
73
+
74
+ This avoids first-request cold-start delays and keeps Pipecat TTS frame ordering
75
+ stable.
76
+
77
+ ## Example
78
+
79
+ See `examples/voice-supertonic.py` for a minimal package-level example.
80
+
81
+ ## Development
82
+
83
+ ```bash
84
+ uv sync --group dev
85
+ uv run pytest
86
+ uv run ruff check .
87
+ ```
@@ -0,0 +1,64 @@
1
+ # pipecat-supertonic
2
+
3
+ `pipecat-supertonic` provides a Pipecat-compatible `TTSService` wrapper for the
4
+ official [Supertonic](https://github.com/supertone-inc/supertonic) Python SDK.
5
+
6
+ The package is designed to feel like a native Pipecat service:
7
+
8
+ - import with `from pipecat_supertonic import SupertonicTTSService`
9
+ - configure with `SupertonicTTSService.Settings(...)`
10
+ - drop directly into an existing Pipecat pipeline
11
+
12
+ ## Install
13
+
14
+ ```bash
15
+ pip install pipecat-supertonic
16
+ ```
17
+
18
+ Or with `uv`:
19
+
20
+ ```bash
21
+ uv add pipecat-supertonic
22
+ ```
23
+
24
+ ## Usage
25
+
26
+ ```python
27
+ from pipecat_supertonic import SupertonicTTSService
28
+
29
+ tts = SupertonicTTSService(
30
+ settings=SupertonicTTSService.Settings(
31
+ voice="M1",
32
+ language="en",
33
+ total_steps=5,
34
+ speed=1.05,
35
+ )
36
+ )
37
+
38
+ await tts.warmup()
39
+ ```
40
+
41
+ `warmup()` is required before the service is used in a live Pipecat pipeline.
42
+ Call it during application startup so Supertonic can download and cache the
43
+ model before the first user request arrives.
44
+
45
+ ## Warmup Contract
46
+
47
+ This package intentionally does not lazy-load Supertonic during active TTS
48
+ requests. If the service is used before `warmup()`, it fails fast with a clear
49
+ error telling the caller to warm the service up first.
50
+
51
+ This avoids first-request cold-start delays and keeps Pipecat TTS frame ordering
52
+ stable.
53
+
54
+ ## Example
55
+
56
+ See `examples/voice-supertonic.py` for a minimal package-level example.
57
+
58
+ ## Development
59
+
60
+ ```bash
61
+ uv sync --group dev
62
+ uv run pytest
63
+ uv run ruff check .
64
+ ```
@@ -0,0 +1,41 @@
1
+ [project]
2
+ name = "pipecat-supertonic"
3
+ version = "0.1.0"
4
+ description = "Supertonic TTS service integration for Pipecat"
5
+ readme = "README.md"
6
+ license = "BSD-2-Clause"
7
+ authors = [
8
+ { name = "Archit498", email = "archit@voicing.ai" }
9
+ ]
10
+ requires-python = ">=3.11"
11
+ keywords = ["pipecat", "tts", "supertonic", "voice", "speech-synthesis"]
12
+ classifiers = [
13
+ "Development Status :: 4 - Beta",
14
+ "Intended Audience :: Developers",
15
+ "License :: OSI Approved :: BSD License",
16
+ "Programming Language :: Python :: 3",
17
+ "Programming Language :: Python :: 3.11",
18
+ "Programming Language :: Python :: 3.12",
19
+ "Programming Language :: Python :: 3.13",
20
+ "Topic :: Multimedia :: Sound/Audio :: Speech",
21
+ "Topic :: Scientific/Engineering :: Artificial Intelligence",
22
+ "Typing :: Typed",
23
+ ]
24
+ dependencies = [
25
+ "pipecat-ai[websockets-base]>=1.2,<2",
26
+ "supertonic>=1.2.1,<2",
27
+ ]
28
+
29
+ [dependency-groups]
30
+ dev = [
31
+ "pytest>=9,<10",
32
+ "pytest-asyncio>=1,<2",
33
+ "ruff>=0.12,<1",
34
+ ]
35
+
36
+ [tool.ruff]
37
+ line-length = 100
38
+
39
+ [build-system]
40
+ requires = ["uv_build>=0.10.0,<0.11.0"]
41
+ build-backend = "uv_build"
@@ -0,0 +1,2 @@
1
+ from .tts import SupertonicTTSService, SupertonicTTSSettings
2
+ __all__ = ["SupertonicTTSService", "SupertonicTTSSettings"]
@@ -0,0 +1,347 @@
1
+ #
2
+ # Copyright (c) 2026
3
+ #
4
+ # SPDX-License-Identifier: BSD-2-Clause
5
+ #
6
+
7
+ """Supertonic TTS service integration for Pipecat."""
8
+
9
+ import asyncio
10
+ from collections.abc import AsyncGenerator
11
+ from dataclasses import dataclass, field
12
+ from typing import Any
13
+
14
+ import numpy as np
15
+ from loguru import logger
16
+
17
+ from pipecat.audio.utils import create_stream_resampler
18
+ from pipecat.frames.frames import ErrorFrame, Frame, TTSAudioRawFrame
19
+ from pipecat.services.settings import NOT_GIVEN, TTSSettings, _NotGiven, assert_given
20
+ from pipecat.services.tts_service import TTSService
21
+ from pipecat.transcriptions.language import Language
22
+ from pipecat.utils.tracing.service_decorators import traced_tts
23
+
24
# The Supertonic SDK is a hard requirement of this module; fail at import time
# with an actionable message when it is missing.
try:
    from supertonic import TTS as SupertonicSDK
except ModuleNotFoundError as e:
    logger.error(f"Exception: {e}")
    logger.error("In order to use Supertonic, you need to `pip install supertonic`.")
    # Chain explicitly so the traceback marks the missing module as the direct cause.
    raise Exception(f"Missing module: {e}") from e
30
+
31
+
32
# Two-letter language codes that are passed through to Supertonic unchanged.
SUPPORTED_LANGUAGES = frozenset(
    (
        "ar bg cs da de el en es et fi fr hi hr hu id it "
        "ja ko lt lv nl pl pt ro ru sk sl sv tr uk vi"
    ).split()
)
# Fallback code used for any language outside SUPPORTED_LANGUAGES.
UNKNOWN_LANGUAGE = "na"
68
+
69
+
70
def language_to_supertonic_language(language: Language) -> str:
    """Map a Pipecat language to the code handed to Supertonic.

    Only the part of the language string before the first ``-`` is considered,
    lower-cased, so regional variants collapse onto their base language.

    Args:
        language: The language to convert.

    Returns:
        The base language code when it is in ``SUPPORTED_LANGUAGES``; otherwise
        ``UNKNOWN_LANGUAGE`` (``"na"``), after logging a warning.
    """
    primary_subtag, _, _ = str(language).partition("-")
    candidate = primary_subtag.lower()

    if candidate not in SUPPORTED_LANGUAGES:
        logger.warning(
            f"Language {language} is not supported by Supertonic. Using fallback "
            f"language '{UNKNOWN_LANGUAGE}'."
        )
        return UNKNOWN_LANGUAGE

    return candidate
89
+
90
+
91
def _not_given_field() -> Any:
    """Build a dataclass field whose default value is the ``NOT_GIVEN`` sentinel."""
    return field(default_factory=lambda: NOT_GIVEN)


@dataclass
class SupertonicTTSSettings(TTSSettings):
    """Supertonic-specific settings layered on top of ``TTSSettings``.

    Every field defaults to ``NOT_GIVEN``.

    Parameters:
        speed: Speech speed multiplier.
        total_steps: Number of synthesis steps.
        max_chunk_length: Maximum characters per synthesized chunk.
        silence_duration: Silence inserted between synthesized chunks.
    """

    speed: float | None | _NotGiven = _not_given_field()
    total_steps: int | None | _NotGiven = _not_given_field()
    max_chunk_length: int | None | _NotGiven = _not_given_field()
    silence_duration: float | None | _NotGiven = _not_given_field()
106
+
107
+
108
class SupertonicTTSService(TTSService):
    """Supertonic text-to-speech service for Pipecat.

    The Supertonic SDK is loaded by :meth:`warmup`, never by ``run_tts``. If
    synthesis is attempted before the SDK is loaded, ``run_tts`` yields an
    ``ErrorFrame`` telling the caller to warm the service up first.
    """

    Settings = SupertonicTTSSettings
    _settings: Settings

    def __init__(
        self,
        *,
        model: str | None = None,
        voice: str | None = None,
        language: Language | str | None = None,
        speed: float | None = None,
        total_steps: int | None = None,
        max_chunk_length: int | None = None,
        silence_duration: float | None = None,
        auto_download: bool = True,
        intra_op_num_threads: int | None = None,
        inter_op_num_threads: int | None = None,
        sample_rate: int | None = None,
        settings: Settings | None = None,
        **kwargs,
    ):
        """Initialize the Supertonic TTS service.

        Settings are resolved in three layers: built-in defaults, then any
        non-``None`` direct keyword argument, then ``settings``.

        Args:
            model: Supertonic model name.
            voice: Supertonic voice name.
            language: Language for synthesis.
            speed: Speech speed multiplier.
            total_steps: Number of synthesis steps.
            max_chunk_length: Maximum characters per synthesized chunk.
            silence_duration: Silence inserted between synthesized chunks.
            auto_download: Whether to download model assets automatically.
            intra_op_num_threads: ONNX intra-op thread count.
            inter_op_num_threads: ONNX inter-op thread count.
            sample_rate: Output sample rate for generated audio.
            settings: Runtime-updatable settings. When provided alongside direct
                parameters, ``settings`` values take precedence.
            **kwargs: Additional keyword arguments passed to ``TTSService``.
        """
        default_settings = self.Settings(
            model="supertonic-3",
            voice="M1",
            language=Language.EN,
            speed=1.05,
            total_steps=5,
            max_chunk_length=None,
            silence_duration=0.3,
        )

        # Direct keyword arguments override the defaults only when actually supplied.
        direct_overrides = {
            "model": model,
            "voice": voice,
            "language": language,
            "speed": speed,
            "total_steps": total_steps,
            "max_chunk_length": max_chunk_length,
            "silence_duration": silence_duration,
        }
        for name, value in direct_overrides.items():
            if value is not None:
                setattr(default_settings, name, value)

        # An explicit settings object is applied last, so it wins over everything above.
        if settings is not None:
            default_settings.apply_update(settings)

        super().__init__(
            sample_rate=sample_rate,
            push_start_frame=True,
            push_stop_frames=True,
            settings=default_settings,
            **kwargs,
        )

        self._auto_download = auto_download
        self._intra_op_num_threads = intra_op_num_threads
        self._inter_op_num_threads = inter_op_num_threads

        self._resampler = create_stream_resampler()
        # SDK handle; stays None until warmup() (via _ensure_tts) has loaded the model.
        self._tts: Any | None = None
        # Voice styles already fetched from the SDK, keyed by voice name.
        self._voice_styles: dict[str, object] = {}
        # Voice names reported by the loaded SDK; empty while no SDK is loaded.
        self._available_voice_names: tuple[str, ...] = ()
        # Serializes SDK creation and invalidation.
        self._tts_lock = asyncio.Lock()

    async def warmup(self) -> None:
        """Download and initialize Supertonic assets for this service instance.

        Call this during application startup before the service is used in a
        live Pipecat pipeline. This avoids first-request cold-start delays and
        keeps TTS frame ordering stable during active calls.
        """
        await self._ensure_tts()

    def can_generate_metrics(self) -> bool:
        """Indicate that this service supports TTFB and usage metrics."""
        return True

    def language_to_service_language(self, language: Language) -> str:
        """Convert a Pipecat language enum to Supertonic's language format."""
        return language_to_supertonic_language(language)

    async def _update_settings(self, delta: Settings) -> dict[str, object]:
        """Apply a settings delta and keep the loaded SDK in sync with the model.

        When the model changes, the cached SDK handle, voice styles and voice
        names are discarded. If the service had already been warmed up, the new
        model is loaded immediately: ``run_tts`` never loads the SDK itself, so
        leaving the handle empty would make every later synthesis fail with the
        "not warmed up" error. A service that was never warmed up stays cold.
        Any exception raised while loading the new model propagates to the caller.

        Args:
            delta: The settings changes to apply.

        Returns:
            The changes reported by the base ``_update_settings`` implementation.
        """
        changed = await super()._update_settings(delta)
        if "model" in changed:
            async with self._tts_lock:
                was_warm = self._tts is not None
                self._tts = None
                self._voice_styles.clear()
                self._available_voice_names = ()
            # Reload outside the lock: _ensure_tts() acquires it and asyncio.Lock
            # is not reentrant.
            if was_warm:
                await self._ensure_tts()
        return changed

    async def _ensure_tts(self) -> Any:
        """Return the Supertonic SDK handle, creating it on first use.

        The SDK is constructed in a worker thread so the event loop is not
        blocked. Creation is serialized by ``_tts_lock`` and re-checked inside
        the lock so concurrent callers build it only once.

        Returns:
            The loaded SDK instance.
        """
        if self._tts is not None:
            return self._tts

        async with self._tts_lock:
            if self._tts is None:
                model = assert_given(self._settings.model)
                tts = await asyncio.to_thread(
                    SupertonicSDK,
                    model=model,
                    auto_download=self._auto_download,
                    intra_op_num_threads=self._intra_op_num_threads,
                    inter_op_num_threads=self._inter_op_num_threads,
                )
                # Publish the voice names before the handle, so the service never
                # looks warm while its voice list is still empty (including when
                # reading the names fails).
                self._available_voice_names = tuple(tts.voice_style_names)
                self._tts = tts
            return self._tts

    def _require_warmup(self) -> Any:
        """Return the loaded SDK handle without ever loading it.

        Returns:
            The loaded SDK instance.

        Raises:
            RuntimeError: If the SDK has not been loaded yet.
        """
        if self._tts is None:
            raise RuntimeError(
                "SupertonicTTSService is not warmed up. Call `await tts.warmup()` "
                "during application startup before using the service."
            )
        return self._tts

    async def _get_voice_style(self, voice_name: str) -> object:
        """Return the SDK voice style for ``voice_name``, caching it per instance.

        Args:
            voice_name: Name of the voice to look up.

        Returns:
            The voice style object returned by the SDK's ``get_voice_style``.

        Raises:
            RuntimeError: If the SDK has not been loaded yet.
            ValueError: If ``voice_name`` is not among the loaded SDK's voice names.
        """
        tts = self._require_warmup()

        if voice_name not in self._available_voice_names:
            valid_voices = ", ".join(sorted(self._available_voice_names)) or "none"
            raise ValueError(
                f"Supertonic TTS voice {voice_name!r} is not supported "
                f"(must be one of: {valid_voices})"
            )

        cached = self._voice_styles.get(voice_name)
        if cached is not None:
            return cached

        # The lookup runs in a worker thread so the event loop is not blocked.
        style = await asyncio.to_thread(tts.get_voice_style, voice_name)
        self._voice_styles[voice_name] = style
        return style

    def _waveform_to_pcm16(self, waveform: np.ndarray) -> bytes:
        """Convert a Supertonic waveform array to mono PCM16 bytes.

        Floating-point samples are clipped to ``[-1.0, 1.0]`` and scaled by the
        int16 maximum; NaN samples become silence. Integer samples are clipped
        to the int16 range.

        Args:
            waveform: 1-D audio, or 2-D audio in which one axis has length 1.

        Returns:
            The samples as int16 bytes.

        Raises:
            ValueError: If the array is not mono, has an unsupported rank, or is empty.
            TypeError: If the dtype is neither floating-point nor integer.
        """
        audio = np.asarray(waveform)

        if audio.ndim == 2:
            # Accept both (1, n) and (n, 1) layouts; anything else is multi-channel.
            if audio.shape[0] == 1:
                audio = audio[0]
            elif audio.shape[1] == 1:
                audio = audio[:, 0]
            else:
                raise ValueError(f"Expected mono audio from Supertonic, got shape {audio.shape}")
        elif audio.ndim != 1:
            raise ValueError(f"Expected 1-D or mono 2-D audio from Supertonic, got {audio.shape}")

        if audio.size == 0:
            raise ValueError("Supertonic returned empty audio")

        int16_info = np.iinfo(np.int16)
        if np.issubdtype(audio.dtype, np.floating):
            # Casting NaN to an integer is undefined, so map it to silence first.
            audio = np.nan_to_num(audio, nan=0.0, posinf=1.0, neginf=-1.0)
            audio = np.clip(audio, -1.0, 1.0)
            audio = (audio * int16_info.max).astype(np.int16)
        elif np.issubdtype(audio.dtype, np.integer):
            audio = np.clip(audio, int16_info.min, int16_info.max).astype(np.int16)
        else:
            raise TypeError(f"Unsupported Supertonic waveform dtype: {audio.dtype}")

        return audio.tobytes()

    @traced_tts
    async def run_tts(self, text: str, context_id: str) -> AsyncGenerator[Frame, None]:
        """Generate speech from text using Supertonic.

        The whole text is synthesized in a worker thread and emitted as a single
        audio frame, resampled when the SDK's sample rate differs from the
        service's.

        Args:
            text: The text to synthesize.
            context_id: Identifier attached to the produced audio frame.

        Yields:
            One ``TTSAudioRawFrame`` with mono audio on success, or an
            ``ErrorFrame`` describing the failure. This includes the case where
            the service was not warmed up.
        """
        logger.debug(f"{self}: Generating TTS [{text}]")

        try:
            await self.start_tts_usage_metrics(text)

            voice = assert_given(self._settings.voice)
            if voice is None:
                raise ValueError("Supertonic TTS voice must be specified")

            language = assert_given(self._settings.language)
            speed = assert_given(self._settings.speed)
            total_steps = assert_given(self._settings.total_steps)
            max_chunk_length = assert_given(self._settings.max_chunk_length)
            silence_duration = assert_given(self._settings.silence_duration)

            tts = self._require_warmup()
            voice_style = await self._get_voice_style(voice)

            if not tts.is_multilingual:
                synthesis_language = "en"
            else:
                # NOTE(review): the stored language may still be a raw Language enum
                # (e.g. a regional variant) if the base class has not converted it;
                # confirm. Converting here is a no-op for already-converted plain strings.
                if isinstance(language, Language):
                    language = self.language_to_service_language(language)
                synthesis_language = language or UNKNOWN_LANGUAGE

            waveform, _ = await asyncio.to_thread(
                tts.synthesize,
                text,
                voice_style,
                total_steps=total_steps,
                speed=speed,
                max_chunk_length=max_chunk_length,
                silence_duration=silence_duration,
                lang=synthesis_language,
            )

            await self.stop_ttfb_metrics()

            audio = self._waveform_to_pcm16(waveform)
            if tts.sample_rate != self.sample_rate:
                audio = await self._resampler.resample(audio, tts.sample_rate, self.sample_rate)

            yield TTSAudioRawFrame(
                audio=audio,
                sample_rate=self.sample_rate,
                num_channels=1,
                context_id=context_id,
            )
        except Exception as e:
            # Log before converting to a frame so the failure is visible in the logs too.
            logger.error(f"{self} exception: {e}")
            yield ErrorFrame(error=f"Unknown error occurred: {e}")
        finally:
            # Also covers the failure paths that never reached the call above.
            await self.stop_ttfb_metrics()