openvoiceui 1.0.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/.env.example +104 -0
- package/Dockerfile +30 -0
- package/LICENSE +21 -0
- package/README.md +638 -0
- package/SETUP.md +360 -0
- package/app.py +232 -0
- package/auto-approve-devices.js +111 -0
- package/cli/index.js +372 -0
- package/config/__init__.py +4 -0
- package/config/default.yaml +43 -0
- package/config/flags.yaml +67 -0
- package/config/loader.py +203 -0
- package/config/providers.yaml +71 -0
- package/config/speech_normalization.yaml +182 -0
- package/config/theme.json +4 -0
- package/data/greetings.json +25 -0
- package/default-pages/ai-image-creator.html +915 -0
- package/default-pages/bulk-image-uploader.html +492 -0
- package/default-pages/desktop.html +2865 -0
- package/default-pages/file-explorer.html +854 -0
- package/default-pages/interactive-map.html +655 -0
- package/default-pages/style-guide.html +1005 -0
- package/default-pages/website-setup.html +1623 -0
- package/deploy/openclaw/Dockerfile +46 -0
- package/deploy/openvoiceui.service +30 -0
- package/deploy/setup-nginx.sh +50 -0
- package/deploy/setup-sudo.sh +306 -0
- package/deploy/skill-runner/Dockerfile +19 -0
- package/deploy/skill-runner/requirements.txt +14 -0
- package/deploy/skill-runner/server.py +269 -0
- package/deploy/supertonic/Dockerfile +22 -0
- package/deploy/supertonic/server.py +79 -0
- package/docker-compose.pinokio.yml +11 -0
- package/docker-compose.yml +59 -0
- package/greetings.json +25 -0
- package/index.html +65 -0
- package/inject-device-identity.js +142 -0
- package/package.json +82 -0
- package/profiles/default.json +114 -0
- package/profiles/manager.py +354 -0
- package/profiles/schema.json +337 -0
- package/prompts/voice-system-prompt.md +149 -0
- package/providers/__init__.py +39 -0
- package/providers/base.py +63 -0
- package/providers/llm/__init__.py +12 -0
- package/providers/llm/base.py +71 -0
- package/providers/llm/clawdbot_provider.py +112 -0
- package/providers/llm/zai_provider.py +115 -0
- package/providers/registry.py +320 -0
- package/providers/stt/__init__.py +12 -0
- package/providers/stt/base.py +58 -0
- package/providers/stt/webspeech_provider.py +49 -0
- package/providers/stt/whisper_provider.py +100 -0
- package/providers/tts/__init__.py +20 -0
- package/providers/tts/base.py +91 -0
- package/providers/tts/groq_provider.py +74 -0
- package/providers/tts/supertonic_provider.py +72 -0
- package/requirements.txt +38 -0
- package/routes/__init__.py +10 -0
- package/routes/admin.py +515 -0
- package/routes/canvas.py +1315 -0
- package/routes/chat.py +51 -0
- package/routes/conversation.py +2158 -0
- package/routes/elevenlabs_hybrid.py +306 -0
- package/routes/greetings.py +98 -0
- package/routes/icons.py +279 -0
- package/routes/image_gen.py +364 -0
- package/routes/instructions.py +190 -0
- package/routes/music.py +838 -0
- package/routes/onboarding.py +43 -0
- package/routes/pi.py +62 -0
- package/routes/profiles.py +215 -0
- package/routes/report_issue.py +68 -0
- package/routes/static_files.py +533 -0
- package/routes/suno.py +664 -0
- package/routes/theme.py +81 -0
- package/routes/transcripts.py +199 -0
- package/routes/vision.py +348 -0
- package/routes/workspace.py +288 -0
- package/server.py +1510 -0
- package/services/__init__.py +1 -0
- package/services/auth.py +143 -0
- package/services/canvas_versioning.py +239 -0
- package/services/db_pool.py +107 -0
- package/services/gateway.py +16 -0
- package/services/gateway_manager.py +333 -0
- package/services/gateways/__init__.py +12 -0
- package/services/gateways/base.py +110 -0
- package/services/gateways/compat.py +264 -0
- package/services/gateways/openclaw.py +1134 -0
- package/services/health.py +100 -0
- package/services/memory_client.py +455 -0
- package/services/paths.py +26 -0
- package/services/speech_normalizer.py +285 -0
- package/services/tts.py +270 -0
- package/setup-config.js +262 -0
- package/sounds/air_horn.mp3 +0 -0
- package/sounds/bruh.mp3 +0 -0
- package/sounds/crowd_cheer.mp3 +0 -0
- package/sounds/gunshot.mp3 +0 -0
- package/sounds/impact.mp3 +0 -0
- package/sounds/lets_go.mp3 +0 -0
- package/sounds/record_stop.mp3 +0 -0
- package/sounds/rewind.mp3 +0 -0
- package/sounds/sad_trombone.mp3 +0 -0
- package/sounds/scratch_long.mp3 +0 -0
- package/sounds/yeah.mp3 +0 -0
- package/src/adapters/ClawdBotAdapter.js +264 -0
- package/src/adapters/_template.js +133 -0
- package/src/adapters/elevenlabs-classic.js +841 -0
- package/src/adapters/elevenlabs-hybrid.js +812 -0
- package/src/adapters/hume-evi.js +676 -0
- package/src/admin.html +1339 -0
- package/src/app.js +8802 -0
- package/src/core/Config.js +173 -0
- package/src/core/EmotionEngine.js +307 -0
- package/src/core/EventBridge.js +180 -0
- package/src/core/EventBus.js +117 -0
- package/src/core/VoiceSession.js +607 -0
- package/src/face/BaseFace.js +259 -0
- package/src/face/EyeFace.js +208 -0
- package/src/face/HaloSmokeFace.js +509 -0
- package/src/face/manifest.json +27 -0
- package/src/face/previews/eyes.svg +16 -0
- package/src/face/previews/orb.svg +29 -0
- package/src/features/MusicPlayer.js +620 -0
- package/src/features/Soundboard.js +128 -0
- package/src/providers/DeepgramSTT.js +472 -0
- package/src/providers/DeepgramStreamingSTT.js +766 -0
- package/src/providers/GroqSTT.js +559 -0
- package/src/providers/TTSPlayer.js +323 -0
- package/src/providers/WebSpeechSTT.js +479 -0
- package/src/providers/tts/BaseTTSProvider.js +81 -0
- package/src/providers/tts/HumeProvider.js +77 -0
- package/src/providers/tts/SupertonicProvider.js +174 -0
- package/src/providers/tts/index.js +140 -0
- package/src/shell/adapter-registry.js +154 -0
- package/src/shell/caller-bridge.js +35 -0
- package/src/shell/camera-bridge.js +28 -0
- package/src/shell/canvas-bridge.js +32 -0
- package/src/shell/commercial-bridge.js +44 -0
- package/src/shell/face-bridge.js +44 -0
- package/src/shell/music-bridge.js +60 -0
- package/src/shell/orchestrator.js +233 -0
- package/src/shell/profile-discovery.js +303 -0
- package/src/shell/sounds-bridge.js +28 -0
- package/src/shell/transcript-bridge.js +61 -0
- package/src/shell/waveform-bridge.js +33 -0
- package/src/styles/base.css +2862 -0
- package/src/styles/face.css +417 -0
- package/src/styles/pi-overrides.css +89 -0
- package/src/styles/theme-dark.css +67 -0
- package/src/test-tts.html +175 -0
- package/src/ui/AppShell.js +544 -0
- package/src/ui/ProfileSwitcher.js +228 -0
- package/src/ui/SessionControl.js +240 -0
- package/src/ui/face/FacePicker.js +195 -0
- package/src/ui/face/FaceRenderer.js +309 -0
- package/src/ui/settings/PlaylistEditor.js +366 -0
- package/src/ui/settings/SettingsPanel.css +684 -0
- package/src/ui/settings/SettingsPanel.js +419 -0
- package/src/ui/settings/TTSVoicePreview.js +210 -0
- package/src/ui/themes/ThemeManager.js +213 -0
- package/src/ui/visualizers/BaseVisualizer.js +29 -0
- package/src/ui/visualizers/PartyFXVisualizer.css +291 -0
- package/src/ui/visualizers/PartyFXVisualizer.js +637 -0
- package/static/emulators/jsdos/js-dos.css +1 -0
- package/static/emulators/jsdos/js-dos.js +22 -0
- package/static/favicon.svg +55 -0
- package/static/icons/apple-touch-icon.png +0 -0
- package/static/icons/favicon-32.png +0 -0
- package/static/icons/icon-192.png +0 -0
- package/static/icons/icon-512.png +0 -0
- package/static/install.html +449 -0
- package/static/manifest.json +26 -0
- package/static/sw.js +21 -0
- package/tts_providers/__init__.py +136 -0
- package/tts_providers/base_provider.py +319 -0
- package/tts_providers/groq_provider.py +155 -0
- package/tts_providers/hume_provider.py +226 -0
- package/tts_providers/providers_config.json +119 -0
- package/tts_providers/qwen3_provider.py +371 -0
- package/tts_providers/resemble_provider.py +315 -0
- package/tts_providers/supertonic_provider.py +557 -0
- package/tts_providers/supertonic_tts.py +399 -0
|
@@ -0,0 +1,399 @@
|
|
|
1
|
+
#!/usr/bin/env python3
|
|
2
|
+
"""
|
|
3
|
+
Supertonic TTS wrapper for OpenVoiceUI.
|
|
4
|
+
|
|
5
|
+
This module provides a clean interface to the Supertonic Text-to-Speech engine,
|
|
6
|
+
wrapping the helper.py functionality for use in Flask applications.
|
|
7
|
+
|
|
8
|
+
Author: OpenVoiceUI
|
|
9
|
+
Date: 2026-02-11
|
|
10
|
+
"""
|
|
11
|
+
|
|
12
|
+
import os
|
|
13
|
+
import sys
|
|
14
|
+
import logging
|
|
15
|
+
from io import BytesIO
|
|
16
|
+
from typing import Optional
|
|
17
|
+
|
|
18
|
+
import numpy as np
|
|
19
|
+
import soundfile as sf
|
|
20
|
+
|
|
21
|
+
# Make the Supertonic helper module importable. Its directory is taken from the
# SUPERTONIC_HELPER_PATH environment variable, falling back to ~/supertonic/py.
SUPERTONIC_HELPER_PATH = os.environ.get(
    "SUPERTONIC_HELPER_PATH",
    os.path.expanduser("~/supertonic/py"),
)
if SUPERTONIC_HELPER_PATH not in sys.path:
    # Prepend so this helper.py wins over any other module named "helper".
    sys.path.insert(0, SUPERTONIC_HELPER_PATH)

try:
    from helper import load_text_to_speech, load_voice_style, Style
except ImportError as e:
    # Log where the helper was expected, then let the import failure propagate:
    # nothing in this module can work without it.
    logging.error(f"Failed to import Supertonic helper: {e}")
    logging.error(f"Make sure {SUPERTONIC_HELPER_PATH}/helper.py exists")
    raise


# Configure logging (root configuration at INFO level) and create the
# module-level logger used by everything below.
logging.basicConfig(
    format='%(asctime)s - %(name)s - %(levelname)s - %(message)s',
    level=logging.INFO,
)
logger = logging.getLogger(__name__)
|
|
44
|
+
|
|
45
|
+
|
|
46
|
+
class SupertonicTTS:
    """
    Wrapper class for Supertonic Text-to-Speech engine.

    This class provides a simple interface for generating speech from text
    using the Supertonic ONNX models. It handles model loading, voice style
    management, and audio generation.

    Example:
        >>> tts = SupertonicTTS(
        ...     onnx_dir="~/supertonic/assets/onnx",
        ...     voice_style_path="~/supertonic/assets/voice_styles/M1.json"
        ... )
        >>> audio_bytes = tts.generate_speech("Hello world, this is a test")
        >>> # audio_bytes contains WAV format audio data
    """

    # Default paths, overridable through the SUPERTONIC_ONNX_DIR and
    # SUPERTONIC_VOICE_STYLES_DIR environment variables.
    DEFAULT_ONNX_DIR = os.environ.get("SUPERTONIC_ONNX_DIR", os.path.expanduser("~/supertonic/assets/onnx"))
    DEFAULT_VOICE_STYLES_DIR = os.environ.get("SUPERTONIC_VOICE_STYLES_DIR", os.path.expanduser("~/supertonic/assets/voice_styles"))

    # Available voice styles
    AVAILABLE_VOICE_STYLES = {
        'M1': 'M1.json',  # Male voice 1
        'M2': 'M2.json',  # Male voice 2
        'F1': 'F1.json',  # Female voice 1
        'F2': 'F2.json',  # Female voice 2
    }

    # Language codes accepted by generate_speech()
    SUPPORTED_LANGS = ('en', 'ko', 'es', 'pt', 'fr')

    # Maximum character length per chunk to stay under ONNX token limit (~1000 tokens)
    MAX_CHUNK_LENGTH = 500

    def __init__(
        self,
        onnx_dir: Optional[str] = None,
        voice_style_path: Optional[str] = None,
        voice_style_name: str = 'M1',
        use_gpu: bool = False
    ):
        """
        Initialize the Supertonic TTS engine.

        Args:
            onnx_dir: Path to the ONNX models directory. If None, uses DEFAULT_ONNX_DIR.
                A leading "~" is expanded to the user's home directory.
            voice_style_path: Full path to the voice style JSON file. If None,
                constructs path from voice_style_name. A leading "~" is expanded.
            voice_style_name: Name of the voice style (M1, M2, F1, F2). Used only
                if voice_style_path is None.
            use_gpu: Whether to use GPU for inference. Default is False (CPU only).

        Raises:
            FileNotFoundError: If onnx_dir or voice_style file doesn't exist.
            ValueError: If voice_style_path is None and voice_style_name is unknown.
            RuntimeError: If model loading fails.
        """
        # Expand "~" so that user-relative paths (as in the class example)
        # pass the existence checks below.
        self.onnx_dir = os.path.expanduser(onnx_dir or self.DEFAULT_ONNX_DIR)
        self.voice_style_name = voice_style_name

        # Validate onnx directory
        if not os.path.exists(self.onnx_dir):
            raise FileNotFoundError(
                f"ONNX models directory not found: {self.onnx_dir}"
            )
        logger.info(f"Using ONNX models from: {self.onnx_dir}")

        # An explicit path wins; otherwise resolve the named built-in style.
        if voice_style_path:
            self.voice_style_path = os.path.expanduser(voice_style_path)
        else:
            self.voice_style_path = self._builtin_style_path(voice_style_name)

        # Validate voice style file
        if not os.path.exists(self.voice_style_path):
            raise FileNotFoundError(
                f"Voice style file not found: {self.voice_style_path}"
            )
        logger.info(f"Using voice style: {self.voice_style_path}")

        # Initialize models
        try:
            logger.info("Loading Supertonic TTS models...")
            self.text_to_speech = load_text_to_speech(self.onnx_dir, use_gpu=use_gpu)
            self.style = load_voice_style([self.voice_style_path], verbose=True)
            self.sample_rate = self.text_to_speech.sample_rate
            logger.info(f"TTS models loaded successfully (sample rate: {self.sample_rate}Hz)")
        except Exception as e:
            logger.error(f"Failed to load TTS models: {e}")
            # Chain the cause so the original traceback is not lost.
            raise RuntimeError(f"TTS model loading failed: {e}") from e

    @classmethod
    def _builtin_style_path(cls, voice_style_name: str) -> str:
        """Return the JSON path of a built-in voice style, or raise ValueError if the name is unknown."""
        if voice_style_name not in cls.AVAILABLE_VOICE_STYLES:
            raise ValueError(
                f"Invalid voice_style_name: {voice_style_name}. "
                f"Available: {list(cls.AVAILABLE_VOICE_STYLES.keys())}"
            )
        return os.path.join(
            cls.DEFAULT_VOICE_STYLES_DIR, cls.AVAILABLE_VOICE_STYLES[voice_style_name]
        )

    @staticmethod
    def _split_text_into_chunks(text: str, max_length: int) -> list:
        """
        Split text into chunks at sentence boundaries.

        Text no longer than max_length is returned as a single chunk. Otherwise
        it is cut after '. ', '! ', '? ' and newlines, and consecutive sentences
        are joined with a single space while the result fits in max_length.
        A single sentence longer than max_length is emitted as its own
        (oversized) chunk rather than being cut mid-sentence.
        """
        if len(text) <= max_length:
            return [text]

        # Split by sentences first, one delimiter at a time.
        sentences = [text]
        for ending in ('. ', '! ', '? ', '\n'):
            new_sentences = []
            for s in sentences:
                parts = s.split(ending)
                for i, part in enumerate(parts):
                    if i < len(parts) - 1:
                        # Re-attach the punctuation that split() removed.
                        new_sentences.append(part + ending.strip())
                    elif part.strip():
                        new_sentences.append(part)
            sentences = new_sentences if new_sentences else sentences

        # Combine sentences into chunks up to max_length
        chunks = []
        current_chunk = ""
        for sentence in sentences:
            sentence = sentence.strip()
            if not sentence:
                continue

            # +1 accounts for the joining space.
            if len(current_chunk) + len(sentence) + 1 <= max_length:
                current_chunk += (" " if current_chunk else "") + sentence
            else:
                if current_chunk:
                    chunks.append(current_chunk.strip())
                if len(sentence) > max_length:
                    # Oversized sentence: pass it through on its own.
                    chunks.append(sentence)
                    current_chunk = ""
                else:
                    current_chunk = sentence

        if current_chunk.strip():
            chunks.append(current_chunk.strip())

        return chunks

    def generate_speech(
        self,
        text: str,
        lang: str = 'en',
        speed: float = 1.0,
        total_step: int = 15
    ) -> bytes:
        """
        Generate speech from text.

        Long text is split into chunks of up to MAX_CHUNK_LENGTH characters,
        each chunk is synthesized separately and the audio is concatenated.

        Args:
            text: The text to synthesize into speech.
            lang: Language code ('en', 'ko', 'es', 'pt', 'fr'). Default is 'en'.
            speed: Speech speed multiplier. Must be greater than 0 and at
                most 3. Default is 1.0.
            total_step: Number of denoising steps for generation. Must be
                between 1 and 50. Default is 15.

        Returns:
            bytes: Raw WAV audio data (can be written directly to file or sent
                   via HTTP with Content-Type: audio/wav).

        Raises:
            ValueError: If text is empty, lang is not supported or parameters
                are out of range.
            RuntimeError: If speech generation fails.

        Example:
            >>> audio = tts.generate_speech("Hello world", lang='en', speed=1.05)
            >>> with open('output.wav', 'wb') as f:
            ...     f.write(audio)
        """
        # Validate inputs
        if not text or not text.strip():
            raise ValueError("Text cannot be empty")

        supported_langs = list(self.SUPPORTED_LANGS)
        if lang not in supported_langs:
            raise ValueError(
                f"Unsupported language: {lang}. Supported: {supported_langs}"
            )

        if speed <= 0 or speed > 3:
            raise ValueError(f"Invalid speed: {speed}. Must be between 0 and 3")

        if total_step < 1 or total_step > 50:
            raise ValueError(f"Invalid total_step: {total_step}. Must be between 1 and 50")

        logger.info(f"Generating speech: '{text[:50]}...' (lang={lang}, speed={speed}, steps={total_step})")

        try:
            chunks = self._split_text_into_chunks(text, self.MAX_CHUNK_LENGTH)
            logger.info(f"Text split into {len(chunks)} chunk(s)")

            all_audio_chunks = []

            for i, chunk in enumerate(chunks):
                logger.info(f"Processing chunk {i+1}/{len(chunks)}: '{chunk[:30]}...'")

                # Generate speech using the Supertonic TextToSpeech instance
                wav, duration = self.text_to_speech(
                    text=chunk,
                    lang=lang,
                    style=self.style,
                    total_step=total_step,
                    speed=speed
                )

                # Extract the audio data (first batch item, trim to actual duration)
                audio_data = wav[0, :int(self.sample_rate * duration[0].item())]
                all_audio_chunks.append(audio_data)

            # Concatenate all audio chunks
            if len(all_audio_chunks) == 1:
                final_audio = all_audio_chunks[0]
            else:
                final_audio = np.concatenate(all_audio_chunks)

            # Write to BytesIO buffer to get raw bytes
            buffer = BytesIO()
            sf.write(buffer, final_audio, self.sample_rate, format='WAV')
            audio_bytes = buffer.getvalue()

            total_duration = len(final_audio) / self.sample_rate
            logger.info(f"Generated {len(audio_bytes)} bytes of audio ({total_duration:.2f}s)")
            return audio_bytes

        except Exception as e:
            logger.error(f"Speech generation failed: {e}")
            # Chain the cause so the original traceback is not lost.
            raise RuntimeError(f"Failed to generate speech: {e}") from e

    def set_voice_style(self, voice_style_name: str) -> None:
        """
        Change the voice style to one of the built-in styles.

        Args:
            voice_style_name: Name of the new voice style (M1, M2, F1, F2).

        Raises:
            ValueError: If voice_style_name is not available.
            FileNotFoundError: If the voice style file doesn't exist.
            RuntimeError: If loading the new style fails.
        """
        new_voice_style_path = self._builtin_style_path(voice_style_name)

        # Compare the loaded path as well as the name: an instance created with
        # a custom voice_style_path still carries the default name, and must be
        # able to switch to the built-in style of that name.
        if (
            voice_style_name == self.voice_style_name
            and new_voice_style_path == self.voice_style_path
        ):
            logger.info(f"Already using voice style: {voice_style_name}")
            return

        if not os.path.exists(new_voice_style_path):
            raise FileNotFoundError(
                f"Voice style file not found: {new_voice_style_path}"
            )

        try:
            self.style = load_voice_style([new_voice_style_path], verbose=True)
            # Only record the new name/path once the style actually loaded.
            self.voice_style_name = voice_style_name
            self.voice_style_path = new_voice_style_path
            logger.info(f"Voice style changed to: {voice_style_name}")
        except Exception as e:
            logger.error(f"Failed to load voice style: {e}")
            raise RuntimeError(f"Failed to load voice style: {e}") from e
|
|
321
|
+
|
|
322
|
+
|
|
323
|
+
# Process-wide TTS engine shared by the Flask app; stays None until
# initialize_tts() succeeds.
_tts_instance: Optional[SupertonicTTS] = None


def get_tts_instance() -> Optional[SupertonicTTS]:
    """
    Return the shared (singleton) TTS engine.

    Intended for Flask apps that build the engine once at startup and reuse
    it across requests.

    Returns:
        The global SupertonicTTS instance, or None if not initialized.
    """
    # Reading a module-level name needs no ``global`` declaration.
    return _tts_instance
|
|
339
|
+
|
|
340
|
+
|
|
341
|
+
def initialize_tts(
    onnx_dir: Optional[str] = None,
    voice_style_name: str = 'M1',
    use_gpu: bool = False
) -> Optional[SupertonicTTS]:
    """
    Create the shared TTS engine and store it as the module-level singleton.

    Any failure is logged and swallowed; in that case the singleton is reset
    to None (even if an earlier call had succeeded).

    Args:
        onnx_dir: Path to ONNX models directory.
        voice_style_name: Default voice style to use.
        use_gpu: Whether to use GPU for inference.

    Returns:
        The initialized SupertonicTTS instance, or None if initialization fails.
    """
    global _tts_instance

    engine_options = {
        "onnx_dir": onnx_dir,
        "voice_style_name": voice_style_name,
        "use_gpu": use_gpu,
    }
    try:
        _tts_instance = SupertonicTTS(**engine_options)
        logger.info("Global TTS instance initialized")
    except Exception as e:
        logger.error(f"Failed to initialize TTS: {e}")
        _tts_instance = None

    # Either the fresh engine or None after a failure.
    return _tts_instance
|
|
370
|
+
|
|
371
|
+
|
|
372
|
+
if __name__ == "__main__":
    # Simple smoke test when run directly: synthesize one sentence and write
    # it to a WAV file in the system temporary directory.
    import tempfile
    import traceback

    print("Supertonic TTS Wrapper - Direct Test")
    print("=" * 50)

    try:
        # Initialize TTS
        tts = SupertonicTTS(
            onnx_dir=os.environ.get("SUPERTONIC_ONNX_DIR", os.path.expanduser("~/supertonic/assets/onnx")),
            voice_style_name="M1"
        )

        # Generate speech
        test_text = "Hello world, this is a test of the Supertonic TTS system."
        audio = tts.generate_speech(test_text, lang='en', speed=1.05)

        # Save to file. gettempdir() instead of a hard-coded /tmp keeps the
        # script usable on platforms without that directory.
        output_path = os.path.join(tempfile.gettempdir(), "supertonic_test_output.wav")
        with open(output_path, 'wb') as f:
            f.write(audio)

        print(f"Success! Audio saved to: {output_path}")
        print(f"Generated {len(audio)} bytes of audio data")

    except Exception as e:
        print(f"Error: {e}")
        traceback.print_exc()
        # Report the failure to the calling shell instead of exiting with 0.
        sys.exit(1)
|