openvoiceui 1.0.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/.env.example +104 -0
- package/Dockerfile +30 -0
- package/LICENSE +21 -0
- package/README.md +638 -0
- package/SETUP.md +360 -0
- package/app.py +232 -0
- package/auto-approve-devices.js +111 -0
- package/cli/index.js +372 -0
- package/config/__init__.py +4 -0
- package/config/default.yaml +43 -0
- package/config/flags.yaml +67 -0
- package/config/loader.py +203 -0
- package/config/providers.yaml +71 -0
- package/config/speech_normalization.yaml +182 -0
- package/config/theme.json +4 -0
- package/data/greetings.json +25 -0
- package/default-pages/ai-image-creator.html +915 -0
- package/default-pages/bulk-image-uploader.html +492 -0
- package/default-pages/desktop.html +2865 -0
- package/default-pages/file-explorer.html +854 -0
- package/default-pages/interactive-map.html +655 -0
- package/default-pages/style-guide.html +1005 -0
- package/default-pages/website-setup.html +1623 -0
- package/deploy/openclaw/Dockerfile +46 -0
- package/deploy/openvoiceui.service +30 -0
- package/deploy/setup-nginx.sh +50 -0
- package/deploy/setup-sudo.sh +306 -0
- package/deploy/skill-runner/Dockerfile +19 -0
- package/deploy/skill-runner/requirements.txt +14 -0
- package/deploy/skill-runner/server.py +269 -0
- package/deploy/supertonic/Dockerfile +22 -0
- package/deploy/supertonic/server.py +79 -0
- package/docker-compose.pinokio.yml +11 -0
- package/docker-compose.yml +59 -0
- package/greetings.json +25 -0
- package/index.html +65 -0
- package/inject-device-identity.js +142 -0
- package/package.json +82 -0
- package/profiles/default.json +114 -0
- package/profiles/manager.py +354 -0
- package/profiles/schema.json +337 -0
- package/prompts/voice-system-prompt.md +149 -0
- package/providers/__init__.py +39 -0
- package/providers/base.py +63 -0
- package/providers/llm/__init__.py +12 -0
- package/providers/llm/base.py +71 -0
- package/providers/llm/clawdbot_provider.py +112 -0
- package/providers/llm/zai_provider.py +115 -0
- package/providers/registry.py +320 -0
- package/providers/stt/__init__.py +12 -0
- package/providers/stt/base.py +58 -0
- package/providers/stt/webspeech_provider.py +49 -0
- package/providers/stt/whisper_provider.py +100 -0
- package/providers/tts/__init__.py +20 -0
- package/providers/tts/base.py +91 -0
- package/providers/tts/groq_provider.py +74 -0
- package/providers/tts/supertonic_provider.py +72 -0
- package/requirements.txt +38 -0
- package/routes/__init__.py +10 -0
- package/routes/admin.py +515 -0
- package/routes/canvas.py +1315 -0
- package/routes/chat.py +51 -0
- package/routes/conversation.py +2158 -0
- package/routes/elevenlabs_hybrid.py +306 -0
- package/routes/greetings.py +98 -0
- package/routes/icons.py +279 -0
- package/routes/image_gen.py +364 -0
- package/routes/instructions.py +190 -0
- package/routes/music.py +838 -0
- package/routes/onboarding.py +43 -0
- package/routes/pi.py +62 -0
- package/routes/profiles.py +215 -0
- package/routes/report_issue.py +68 -0
- package/routes/static_files.py +533 -0
- package/routes/suno.py +664 -0
- package/routes/theme.py +81 -0
- package/routes/transcripts.py +199 -0
- package/routes/vision.py +348 -0
- package/routes/workspace.py +288 -0
- package/server.py +1510 -0
- package/services/__init__.py +1 -0
- package/services/auth.py +143 -0
- package/services/canvas_versioning.py +239 -0
- package/services/db_pool.py +107 -0
- package/services/gateway.py +16 -0
- package/services/gateway_manager.py +333 -0
- package/services/gateways/__init__.py +12 -0
- package/services/gateways/base.py +110 -0
- package/services/gateways/compat.py +264 -0
- package/services/gateways/openclaw.py +1134 -0
- package/services/health.py +100 -0
- package/services/memory_client.py +455 -0
- package/services/paths.py +26 -0
- package/services/speech_normalizer.py +285 -0
- package/services/tts.py +270 -0
- package/setup-config.js +262 -0
- package/sounds/air_horn.mp3 +0 -0
- package/sounds/bruh.mp3 +0 -0
- package/sounds/crowd_cheer.mp3 +0 -0
- package/sounds/gunshot.mp3 +0 -0
- package/sounds/impact.mp3 +0 -0
- package/sounds/lets_go.mp3 +0 -0
- package/sounds/record_stop.mp3 +0 -0
- package/sounds/rewind.mp3 +0 -0
- package/sounds/sad_trombone.mp3 +0 -0
- package/sounds/scratch_long.mp3 +0 -0
- package/sounds/yeah.mp3 +0 -0
- package/src/adapters/ClawdBotAdapter.js +264 -0
- package/src/adapters/_template.js +133 -0
- package/src/adapters/elevenlabs-classic.js +841 -0
- package/src/adapters/elevenlabs-hybrid.js +812 -0
- package/src/adapters/hume-evi.js +676 -0
- package/src/admin.html +1339 -0
- package/src/app.js +8802 -0
- package/src/core/Config.js +173 -0
- package/src/core/EmotionEngine.js +307 -0
- package/src/core/EventBridge.js +180 -0
- package/src/core/EventBus.js +117 -0
- package/src/core/VoiceSession.js +607 -0
- package/src/face/BaseFace.js +259 -0
- package/src/face/EyeFace.js +208 -0
- package/src/face/HaloSmokeFace.js +509 -0
- package/src/face/manifest.json +27 -0
- package/src/face/previews/eyes.svg +16 -0
- package/src/face/previews/orb.svg +29 -0
- package/src/features/MusicPlayer.js +620 -0
- package/src/features/Soundboard.js +128 -0
- package/src/providers/DeepgramSTT.js +472 -0
- package/src/providers/DeepgramStreamingSTT.js +766 -0
- package/src/providers/GroqSTT.js +559 -0
- package/src/providers/TTSPlayer.js +323 -0
- package/src/providers/WebSpeechSTT.js +479 -0
- package/src/providers/tts/BaseTTSProvider.js +81 -0
- package/src/providers/tts/HumeProvider.js +77 -0
- package/src/providers/tts/SupertonicProvider.js +174 -0
- package/src/providers/tts/index.js +140 -0
- package/src/shell/adapter-registry.js +154 -0
- package/src/shell/caller-bridge.js +35 -0
- package/src/shell/camera-bridge.js +28 -0
- package/src/shell/canvas-bridge.js +32 -0
- package/src/shell/commercial-bridge.js +44 -0
- package/src/shell/face-bridge.js +44 -0
- package/src/shell/music-bridge.js +60 -0
- package/src/shell/orchestrator.js +233 -0
- package/src/shell/profile-discovery.js +303 -0
- package/src/shell/sounds-bridge.js +28 -0
- package/src/shell/transcript-bridge.js +61 -0
- package/src/shell/waveform-bridge.js +33 -0
- package/src/styles/base.css +2862 -0
- package/src/styles/face.css +417 -0
- package/src/styles/pi-overrides.css +89 -0
- package/src/styles/theme-dark.css +67 -0
- package/src/test-tts.html +175 -0
- package/src/ui/AppShell.js +544 -0
- package/src/ui/ProfileSwitcher.js +228 -0
- package/src/ui/SessionControl.js +240 -0
- package/src/ui/face/FacePicker.js +195 -0
- package/src/ui/face/FaceRenderer.js +309 -0
- package/src/ui/settings/PlaylistEditor.js +366 -0
- package/src/ui/settings/SettingsPanel.css +684 -0
- package/src/ui/settings/SettingsPanel.js +419 -0
- package/src/ui/settings/TTSVoicePreview.js +210 -0
- package/src/ui/themes/ThemeManager.js +213 -0
- package/src/ui/visualizers/BaseVisualizer.js +29 -0
- package/src/ui/visualizers/PartyFXVisualizer.css +291 -0
- package/src/ui/visualizers/PartyFXVisualizer.js +637 -0
- package/static/emulators/jsdos/js-dos.css +1 -0
- package/static/emulators/jsdos/js-dos.js +22 -0
- package/static/favicon.svg +55 -0
- package/static/icons/apple-touch-icon.png +0 -0
- package/static/icons/favicon-32.png +0 -0
- package/static/icons/icon-192.png +0 -0
- package/static/icons/icon-512.png +0 -0
- package/static/install.html +449 -0
- package/static/manifest.json +26 -0
- package/static/sw.js +21 -0
- package/tts_providers/__init__.py +136 -0
- package/tts_providers/base_provider.py +319 -0
- package/tts_providers/groq_provider.py +155 -0
- package/tts_providers/hume_provider.py +226 -0
- package/tts_providers/providers_config.json +119 -0
- package/tts_providers/qwen3_provider.py +371 -0
- package/tts_providers/resemble_provider.py +315 -0
- package/tts_providers/supertonic_provider.py +557 -0
- package/tts_providers/supertonic_tts.py +399 -0
|
@@ -0,0 +1,557 @@
|
|
|
1
|
+
#!/usr/bin/env python3
|
|
2
|
+
"""
|
|
3
|
+
Supertonic TTS Provider for OpenVoiceUI.
|
|
4
|
+
|
|
5
|
+
This provider wraps the existing supertonic_tts.py module, implementing
|
|
6
|
+
the TTSProvider interface for seamless integration with the TTS provider system.
|
|
7
|
+
|
|
8
|
+
Supertonic is a local ONNX-based Text-to-Speech engine that supports multiple
|
|
9
|
+
voice styles (M1-M5 for male, F1-F5 for female voices) and multiple languages.
|
|
10
|
+
|
|
11
|
+
Author: OpenVoiceUI
|
|
12
|
+
Date: 2026-02-11
|
|
13
|
+
"""
|
|
14
|
+
|
|
15
|
+
import logging
|
|
16
|
+
import os
|
|
17
|
+
from typing import Dict, List, Any, Optional
|
|
18
|
+
|
|
19
|
+
from .base_provider import TTSProvider
|
|
20
|
+
|
|
21
|
+
# Configure logging
|
|
22
|
+
logger = logging.getLogger(__name__)
|
|
23
|
+
|
|
24
|
+
# ── API mode (preferred) ───────────────────────────────────────────────────────
# When SUPERTONIC_API_URL is set, all synthesis calls go to the shared
# supertonic-tts microservice (loaded once, serves all users).
# Falls back to local ONNX loading if the env var is not set.
# Trailing slashes are stripped so endpoint paths can be appended as "/name".
_API_URL = os.environ.get("SUPERTONIC_API_URL", "").rstrip("/")

# ── Local mode (fallback) ──────────────────────────────────────────────────────
import sys
from pathlib import Path

# Put the directory that contains this package on sys.path so that a top-level
# ``supertonic_tts`` module located there can be imported below.
_project_root = Path(__file__).parent.parent
if str(_project_root) not in sys.path:
    sys.path.insert(0, str(_project_root))

# Always bound (None when the import succeeds) so that reading it can never
# raise NameError, whichever branch below ran.
_import_error: Optional[str] = None

try:
    from supertonic_tts import SupertonicTTS
except ImportError as e:
    # Local mode is optional: remember why it is unavailable and let the
    # provider report the problem instead of failing at import time.
    # NOTE(review): the provider's error message mentions SUPERTONIC_HELPER_PATH,
    # but this module never reads that variable — presumably another module
    # adds it to sys.path; confirm.
    SupertonicTTS = None
    _import_error = str(e)
|
|
43
|
+
|
|
44
|
+
|
|
45
|
+
class SupertonicProvider(TTSProvider):
    """
    TTS Provider for Supertonic ONNX-based Text-to-Speech engine.

    The provider works in one of two modes, chosen once in ``__init__``:

    - API mode: used when SUPERTONIC_API_URL is set and ``GET /health`` on
      that URL succeeds. Synthesis requests are POSTed to ``/tts``.
    - Local mode: used otherwise. Speech is generated in-process through
      the ``supertonic_tts.SupertonicTTS`` class.

    Key Behavior:
        In local mode a SupertonicTTS instance is constructed the first time
        a voice is requested and then cached per voice name, so later calls
        with the same voice reuse the same instance. Use clear_cache() to
        drop the cached instances.

        Construction never raises for configuration problems; they are
        recorded and exposed through is_available() and get_info().

    Voice Styles:
        - M1-M5: Male voices with different characteristics
        - F1-F5: Female voices with different characteristics

    Languages:
        - en (English)
        - ko (Korean)
        - es (Spanish)
        - pt (Portuguese)
        - fr (French)

    Example:
        >>> from tts_providers import SupertonicProvider
        >>> provider = SupertonicProvider()
        >>> audio = provider.generate_speech("Hello world", voice='M1')
        >>> with open('output.wav', 'wb') as f:
        ...     f.write(audio)

    Configuration:
        >>> provider = SupertonicProvider(
        ...     onnx_dir="/path/to/onnx",
        ...     voice_styles_dir="/path/to/voice_styles",
        ...     default_voice="M1"
        ... )
    """

    # Default paths — override via SUPERTONIC_MODEL_PATH env var
    DEFAULT_ONNX_DIR = os.getenv('SUPERTONIC_MODEL_PATH', '/opt/supertonic/assets/onnx')
    DEFAULT_VOICE_STYLES_DIR = None  # derived from onnx_dir if not set

    # Available voice styles (expanded from base implementation)
    AVAILABLE_VOICES = [
        'M1', 'M2', 'M3', 'M4', 'M5',  # Male voices
        'F1', 'F2', 'F3', 'F4', 'F5'   # Female voices
    ]

    # Supported languages
    SUPPORTED_LANGUAGES = ['en', 'ko', 'es', 'pt', 'fr']

    # Provider metadata
    PROVIDER_NAME = "Supertonic"
    PROVIDER_VERSION = "1.0.0"
    PROVIDER_DESCRIPTION = "Local ONNX-based TTS with multiple voice styles"

    def __init__(
        self,
        onnx_dir: Optional[str] = None,
        voice_styles_dir: Optional[str] = None,
        default_voice: str = 'F3',
        use_gpu: bool = False
    ):
        """
        Initialize the Supertonic TTS Provider.

        Tries API mode first (when SUPERTONIC_API_URL is set), then local
        mode. Failures are not raised: the provider ends up with status
        'error' and the reason is available as ``get_info()['error']``.

        Args:
            onnx_dir: Path to ONNX models directory. If None, uses DEFAULT_ONNX_DIR.
            voice_styles_dir: Path to voice styles JSON files directory.
                If None, uses DEFAULT_VOICE_STYLES_DIR, or a 'voice_styles'
                directory next to onnx_dir when that is not set either.
            default_voice: Default voice to use (M1-M5, F1-F5). Default is 'F3'.
            use_gpu: Passed to SupertonicTTS as ``use_gpu``. Default is False.

        Example:
            >>> provider = SupertonicProvider(
            ...     onnx_dir="/custom/path/to/onnx",
            ...     default_voice="F1"
            ... )
        """
        super().__init__()

        self._status = 'inactive'
        self._init_error = None
        self._tts_cache: Dict[str, Any] = {}
        self.default_voice = default_voice
        self.use_gpu = use_gpu

        # Set up front so these attributes exist on every exit path below.
        self._use_api = False
        self._api_url = _API_URL
        self.onnx_dir = onnx_dir or self.DEFAULT_ONNX_DIR

        # ── API mode ──────────────────────────────────────────────────────────
        # Preferred: call the shared supertonic-tts microservice.
        # Models are loaded once system-wide; no per-process ONNX loading.
        if _API_URL:
            try:
                import requests
                resp = requests.get(f"{_API_URL}/health", timeout=3)
            except Exception as e:
                logger.warning(f"SupertonicProvider: API at {_API_URL} unreachable ({e}), trying local")
            else:
                if resp.ok:
                    self._use_api = True
                    self.voice_styles_dir = ""
                    self._status = 'active'
                    logger.info(f"SupertonicProvider: API mode → {_API_URL}")
                    return
                # A reachable but unhealthy service must not be skipped silently.
                logger.warning(
                    f"SupertonicProvider: API at {_API_URL} unhealthy "
                    f"(HTTP {resp.status_code}), trying local"
                )

        # ── Local mode (fallback) ─────────────────────────────────────────────
        # Derived from the effective onnx_dir, so a caller-supplied onnx_dir is
        # honoured on every path, including the error paths below.
        self.voice_styles_dir = (
            voice_styles_dir
            or self.DEFAULT_VOICE_STYLES_DIR
            or os.path.join(os.path.dirname(self.onnx_dir), 'voice_styles')
        )

        if SupertonicTTS is None:
            self._status = 'error'
            self._init_error = "supertonic_tts module not found. Set SUPERTONIC_API_URL or SUPERTONIC_HELPER_PATH."
            return

        if not os.path.exists(self.onnx_dir):
            self._status = 'error'
            self._init_error = f"ONNX directory not found: {self.onnx_dir}. Set SUPERTONIC_MODEL_PATH in .env."
            logger.warning(f"SupertonicProvider: {self._init_error}")
            return

        if not os.path.exists(self.voice_styles_dir):
            self._status = 'error'
            self._init_error = f"Voice styles directory not found: {self.voice_styles_dir}"
            logger.warning(f"SupertonicProvider: {self._init_error}")
            return

        try:
            # Building the default voice doubles as a check that local mode works.
            self._create_tts_instance(self.default_voice)
            self._status = 'active'
            logger.info(f"SupertonicProvider: local mode, voice '{default_voice}'")
        except Exception as e:
            self._status = 'error'
            self._init_error = str(e)
            logger.error(f"SupertonicProvider initialization failed: {e}")

    def _get_voice_style_path(self, voice: str) -> str:
        """
        Get the full path to a voice style JSON file.

        Args:
            voice: Voice identifier (e.g., 'M1', 'F2').

        Returns:
            Path ``<voice_styles_dir>/<voice>.json``.

        Raises:
            ValueError: If voice is not in AVAILABLE_VOICES.
            FileNotFoundError: If the voice style file doesn't exist.
        """
        if voice not in self.AVAILABLE_VOICES:
            raise ValueError(
                f"Invalid voice: {voice}. Available: {self.AVAILABLE_VOICES}"
            )

        voice_path = os.path.join(self.voice_styles_dir, f"{voice}.json")

        if not os.path.exists(voice_path):
            raise FileNotFoundError(
                f"Voice style file not found: {voice_path}"
            )

        return voice_path

    def _create_tts_instance(self, voice: str) -> SupertonicTTS:
        """
        Get or create a local TTS instance for the specified voice.

        Instances are cached by voice name and reused on later calls.

        Args:
            voice: Voice identifier.

        Returns:
            SupertonicTTS instance for the specified voice.

        Raises:
            ValueError: If voice is not in AVAILABLE_VOICES.
            FileNotFoundError: If the voice style file doesn't exist.
            RuntimeError: If the supertonic_tts module is unavailable or the
                instance cannot be constructed.
        """
        # Check cache first
        if voice in self._tts_cache:
            logger.debug(f"Reusing cached TTS instance for voice '{voice}'")
            return self._tts_cache[voice]

        voice_style_path = self._get_voice_style_path(voice)

        # Without this guard the call below fails with the opaque
        # "'NoneType' object is not callable".
        if SupertonicTTS is None:
            raise RuntimeError(
                "supertonic_tts module is not available; cannot create a local TTS instance"
            )

        try:
            tts_instance = SupertonicTTS(
                onnx_dir=self.onnx_dir,
                voice_style_path=voice_style_path,
                voice_style_name=voice,
                use_gpu=self.use_gpu
            )
        except Exception as e:
            logger.error(f"Failed to create TTS instance for voice '{voice}': {e}")
            raise RuntimeError(f"TTS instance creation failed: {e}") from e

        # Cache the instance for reuse
        self._tts_cache[voice] = tts_instance
        logger.debug(f"Created and cached new TTS instance for voice '{voice}'")
        return tts_instance

    def generate_speech(
        self,
        text: str,
        voice: Optional[str] = None,
        lang: str = 'en',
        speed: float = 1.0,
        total_step: int = 15,
        **options
    ) -> bytes:
        """
        Generate speech from text using Supertonic TTS.

        Args:
            text: The text to synthesize; checked with ``validate_text``.
            voice: Voice identifier (M1-M5, F1-F5). If None, uses default_voice.
            lang: Language code ('en', 'ko', 'es', 'pt', 'fr'). Default is 'en'.
            speed: Speech speed multiplier. Must be greater than 0 and at
                most 3. Default is 1.0.
            total_step: Number of synthesis steps, sent as ``steps`` in API
                mode and ``total_step`` in local mode. Must be between 1 and
                50. Default is 15.
            **options: Additional options (currently not used, reserved for future).

        Returns:
            bytes: Audio data — the HTTP response body in API mode, or the
            result of ``SupertonicTTS.generate_speech`` in local mode.
            Presumably WAV; confirm against the backend.

        Raises:
            ValueError: If voice/lang/speed/total_step is invalid (and
                whatever ``validate_text`` raises for bad text).
            RuntimeError: If speech generation fails; the original exception
                is attached as ``__cause__``.

        Example:
            >>> audio = provider.generate_speech(
            ...     text="Hello world!",
            ...     voice="M1",
            ...     lang="en",
            ...     speed=1.1,
            ...     total_step=6
            ... )
        """
        # Use default voice if not specified
        if voice is None:
            voice = self.default_voice

        # Validate inputs
        self.validate_text(text)

        if voice not in self.AVAILABLE_VOICES:
            raise ValueError(
                f"Invalid voice: {voice}. Available: {self.AVAILABLE_VOICES}"
            )

        if lang not in self.SUPPORTED_LANGUAGES:
            raise ValueError(
                f"Unsupported language: {lang}. Supported: {self.SUPPORTED_LANGUAGES}"
            )

        if speed <= 0 or speed > 3:
            raise ValueError(f"Invalid speed: {speed}. Must be between 0 and 3")

        if total_step < 1 or total_step > 50:
            raise ValueError(f"Invalid total_step: {total_step}. Must be between 1 and 50")

        logger.info(
            f"Generating speech: '{text[:50]}...' "
            f"(voice={voice}, lang={lang}, speed={speed}, steps={total_step})"
        )

        try:
            # ── API mode: call shared supertonic-tts service ──────────────────
            if getattr(self, '_use_api', False):
                import requests
                resp = requests.post(
                    f"{self._api_url}/tts",
                    json={"text": text, "voice": voice, "speed": speed,
                          "steps": total_step, "lang": lang},
                    timeout=60,
                )
                if not resp.ok:
                    raise RuntimeError(f"Supertonic API error {resp.status_code}: {resp.text[:200]}")
                audio_bytes = resp.content
                logger.info(f"API: {len(audio_bytes)} bytes for voice '{voice}'")
                return audio_bytes

            # ── Local mode: load ONNX in-process ─────────────────────────────
            tts = self._create_tts_instance(voice)
            audio_bytes = tts.generate_speech(
                text=text, lang=lang, speed=speed, total_step=total_step
            )
            logger.info(f"Local: {len(audio_bytes)} bytes for voice '{voice}'")
            return audio_bytes

        except Exception as e:
            logger.error(f"Speech generation failed: {e}")
            # Chain the cause so the underlying traceback is not lost.
            raise RuntimeError(f"Failed to generate speech: {e}") from e

    def list_voices(self) -> List[str]:
        """
        List all available voice styles.

        Returns:
            A new list of voice identifiers that can be used with
            generate_speech().

        Example:
            >>> provider.list_voices()
            ['M1', 'M2', 'M3', 'M4', 'M5', 'F1', 'F2', 'F3', 'F4', 'F5']
        """
        return self.AVAILABLE_VOICES.copy()

    def get_info(self) -> Dict[str, Any]:
        """
        Get provider metadata and status matching providers_config.json format.

        Returns:
            Dict containing complete metadata including:
                - 'name': Provider name
                - 'provider_id': Unique provider identifier
                - 'status': 'active', 'inactive', or 'error'
                - 'description': Human-readable description
                - 'quality': Audio quality rating
                - 'latency': Expected latency category
                - 'cost_per_minute': Cost per minute of audio
                - 'voices': List of all available voice identifiers
                - 'features': List of provider features
                - 'requires_api_key': Whether API key is required
                - 'languages': List of supported language codes
                - 'max_characters': Max text length per request
                - 'notes': Additional notes about the provider
                - 'documentation_url': Link to documentation
                - 'default_voice': Default voice identifier
                - 'capabilities': Dict of feature flags
                - 'onnx_dir': ONNX models directory path
                - 'voice_styles_dir': Voice styles directory path
                  ("" in API mode)
                - 'error': Error message if status is 'error', else None
                - 'requires_microphone', 'requires_websocket', 'mode'

        Example:
            >>> info = provider.get_info()
            >>> print(f"{info['name']}: {info['status']}")
            Supertonic TTS: active
            >>> info['languages']
            ['en', 'ko', 'es', 'pt', 'fr']
        """
        # Read defensively and use the same values everywhere below.
        onnx_dir = getattr(self, 'onnx_dir', self.DEFAULT_ONNX_DIR)
        voice_styles_dir = getattr(self, 'voice_styles_dir', None)
        return {
            'name': 'Supertonic TTS',
            'provider_id': 'supertonic',
            'status': self._status,
            'description': 'Local ONNX-based TTS engine with multiple voice styles',
            'quality': 'high',
            'latency': 'very-fast',
            'cost_per_minute': 0.0,
            'voices': self.AVAILABLE_VOICES.copy(),
            'features': [
                'multi-language',
                'local-processing',
                'open-source',
                'no-api-key-required',
                'onnx-based',
                'voice-style-switching',
                'offline-capable',
            ],
            'requires_api_key': False,
            'languages': self.SUPPORTED_LANGUAGES.copy(),
            'max_characters': 10000,
            'notes': (
                'Free, fast, local inference. Requires local ONNX models. '
                f'Set SUPERTONIC_MODEL_PATH in .env. Current path: {onnx_dir}'
            ),
            'documentation_url': 'https://github.com/playht/supertonic',
            'default_voice': self.default_voice,
            'capabilities': {
                'streaming': False,
                'ssml': False,
                'custom_voices': True,
                'offline': True,
                'gpu_support': True,
            },
            'onnx_dir': onnx_dir,
            'voice_styles_dir': voice_styles_dir,
            'error': self._init_error if self._status == 'error' else None,
            'requires_microphone': False,
            'requires_websocket': False,
            'mode': 'tts-only',
        }

    def is_available(self) -> bool:
        """
        Check if the provider is ready to generate speech.

        Returns:
            True if the provider status is 'active', False otherwise.
        """
        return self._status == 'active'

    def get_default_voice(self) -> str:
        """
        Get the default voice identifier.

        Returns:
            The default voice identifier (e.g., 'M1').
        """
        return self.default_voice

    def set_default_voice(self, voice: str) -> None:
        """
        Change the default voice.

        In local mode the voice is instantiated (and cached) first, so the
        default only changes if the voice can actually be loaded. In API mode
        there is nothing to load locally and only the name is validated.

        Args:
            voice: New default voice identifier (must be in AVAILABLE_VOICES).

        Raises:
            ValueError: If voice is not available.
            RuntimeError: If voice initialization fails (local mode).
        """
        if voice not in self.AVAILABLE_VOICES:
            raise ValueError(
                f"Invalid voice: {voice}. Available: {self.AVAILABLE_VOICES}"
            )

        if not getattr(self, '_use_api', False):
            try:
                # Test initialization with the new default voice
                self._create_tts_instance(voice)
            except Exception as e:
                raise RuntimeError(f"Failed to set default voice: {e}") from e

        self.default_voice = voice
        logger.info(f"Default voice changed to '{voice}'")

    def get_supported_languages(self) -> List[str]:
        """
        Get list of supported language codes.

        Returns:
            A new list of supported language codes.
        """
        return self.SUPPORTED_LANGUAGES.copy()

    def clear_cache(self) -> None:
        """
        Clear the TTS instance cache.

        Removes all cached TTS instances, forcing new instances to be
        created the next time a voice is needed in local mode.
        """
        self._tts_cache.clear()
        logger.debug("TTS instance cache cleared")

    def preload_voice(self, voice: str) -> None:
        """
        Preload a TTS instance for a specific voice.

        In local mode this creates and caches the instance so it is ready
        for immediate use. In API mode voices are served remotely, so this
        only validates the name.

        Args:
            voice: Voice identifier to preload.

        Raises:
            ValueError: If voice is not available.
            RuntimeError: If preloading fails (local mode).
        """
        if voice not in self.AVAILABLE_VOICES:
            raise ValueError(
                f"Invalid voice: {voice}. Available: {self.AVAILABLE_VOICES}"
            )

        if getattr(self, '_use_api', False):
            logger.debug(f"API mode: nothing to preload for voice '{voice}'")
            return

        try:
            # Create and cache the TTS instance
            self._create_tts_instance(voice)
        except Exception as e:
            raise RuntimeError(f"Failed to preload voice '{voice}': {e}") from e
        logger.info(f"Voice '{voice}' preloaded and cached")

    def preload_all_voices(self) -> Dict[str, bool]:
        """
        Preload every voice in AVAILABLE_VOICES and report which succeeded.

        Calls preload_voice() for each voice. A failure for one voice is
        logged and recorded as False; it does not stop the remaining voices.

        Returns:
            Dict mapping voice identifiers to success status.

        Example:
            >>> results = provider.preload_all_voices()
            >>> print(results)
            {'M1': True, 'M2': True, 'F1': True, 'F2': False}
        """
        results = {}
        for voice in self.AVAILABLE_VOICES:
            try:
                self.preload_voice(voice)
                results[voice] = True
            except Exception as e:
                logger.error(f"Failed to preload voice '{voice}': {e}")
                results[voice] = False
        return results
|
|
555
|
+
|
|
556
|
+
|
|
557
|
+
# Public API: only the provider class is exported by a star-import of this module.
__all__ = ['SupertonicProvider']
|