openvoiceui 1.0.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/.env.example +104 -0
- package/Dockerfile +30 -0
- package/LICENSE +21 -0
- package/README.md +638 -0
- package/SETUP.md +360 -0
- package/app.py +232 -0
- package/auto-approve-devices.js +111 -0
- package/cli/index.js +372 -0
- package/config/__init__.py +4 -0
- package/config/default.yaml +43 -0
- package/config/flags.yaml +67 -0
- package/config/loader.py +203 -0
- package/config/providers.yaml +71 -0
- package/config/speech_normalization.yaml +182 -0
- package/config/theme.json +4 -0
- package/data/greetings.json +25 -0
- package/default-pages/ai-image-creator.html +915 -0
- package/default-pages/bulk-image-uploader.html +492 -0
- package/default-pages/desktop.html +2865 -0
- package/default-pages/file-explorer.html +854 -0
- package/default-pages/interactive-map.html +655 -0
- package/default-pages/style-guide.html +1005 -0
- package/default-pages/website-setup.html +1623 -0
- package/deploy/openclaw/Dockerfile +46 -0
- package/deploy/openvoiceui.service +30 -0
- package/deploy/setup-nginx.sh +50 -0
- package/deploy/setup-sudo.sh +306 -0
- package/deploy/skill-runner/Dockerfile +19 -0
- package/deploy/skill-runner/requirements.txt +14 -0
- package/deploy/skill-runner/server.py +269 -0
- package/deploy/supertonic/Dockerfile +22 -0
- package/deploy/supertonic/server.py +79 -0
- package/docker-compose.pinokio.yml +11 -0
- package/docker-compose.yml +59 -0
- package/greetings.json +25 -0
- package/index.html +65 -0
- package/inject-device-identity.js +142 -0
- package/package.json +82 -0
- package/profiles/default.json +114 -0
- package/profiles/manager.py +354 -0
- package/profiles/schema.json +337 -0
- package/prompts/voice-system-prompt.md +149 -0
- package/providers/__init__.py +39 -0
- package/providers/base.py +63 -0
- package/providers/llm/__init__.py +12 -0
- package/providers/llm/base.py +71 -0
- package/providers/llm/clawdbot_provider.py +112 -0
- package/providers/llm/zai_provider.py +115 -0
- package/providers/registry.py +320 -0
- package/providers/stt/__init__.py +12 -0
- package/providers/stt/base.py +58 -0
- package/providers/stt/webspeech_provider.py +49 -0
- package/providers/stt/whisper_provider.py +100 -0
- package/providers/tts/__init__.py +20 -0
- package/providers/tts/base.py +91 -0
- package/providers/tts/groq_provider.py +74 -0
- package/providers/tts/supertonic_provider.py +72 -0
- package/requirements.txt +38 -0
- package/routes/__init__.py +10 -0
- package/routes/admin.py +515 -0
- package/routes/canvas.py +1315 -0
- package/routes/chat.py +51 -0
- package/routes/conversation.py +2158 -0
- package/routes/elevenlabs_hybrid.py +306 -0
- package/routes/greetings.py +98 -0
- package/routes/icons.py +279 -0
- package/routes/image_gen.py +364 -0
- package/routes/instructions.py +190 -0
- package/routes/music.py +838 -0
- package/routes/onboarding.py +43 -0
- package/routes/pi.py +62 -0
- package/routes/profiles.py +215 -0
- package/routes/report_issue.py +68 -0
- package/routes/static_files.py +533 -0
- package/routes/suno.py +664 -0
- package/routes/theme.py +81 -0
- package/routes/transcripts.py +199 -0
- package/routes/vision.py +348 -0
- package/routes/workspace.py +288 -0
- package/server.py +1510 -0
- package/services/__init__.py +1 -0
- package/services/auth.py +143 -0
- package/services/canvas_versioning.py +239 -0
- package/services/db_pool.py +107 -0
- package/services/gateway.py +16 -0
- package/services/gateway_manager.py +333 -0
- package/services/gateways/__init__.py +12 -0
- package/services/gateways/base.py +110 -0
- package/services/gateways/compat.py +264 -0
- package/services/gateways/openclaw.py +1134 -0
- package/services/health.py +100 -0
- package/services/memory_client.py +455 -0
- package/services/paths.py +26 -0
- package/services/speech_normalizer.py +285 -0
- package/services/tts.py +270 -0
- package/setup-config.js +262 -0
- package/sounds/air_horn.mp3 +0 -0
- package/sounds/bruh.mp3 +0 -0
- package/sounds/crowd_cheer.mp3 +0 -0
- package/sounds/gunshot.mp3 +0 -0
- package/sounds/impact.mp3 +0 -0
- package/sounds/lets_go.mp3 +0 -0
- package/sounds/record_stop.mp3 +0 -0
- package/sounds/rewind.mp3 +0 -0
- package/sounds/sad_trombone.mp3 +0 -0
- package/sounds/scratch_long.mp3 +0 -0
- package/sounds/yeah.mp3 +0 -0
- package/src/adapters/ClawdBotAdapter.js +264 -0
- package/src/adapters/_template.js +133 -0
- package/src/adapters/elevenlabs-classic.js +841 -0
- package/src/adapters/elevenlabs-hybrid.js +812 -0
- package/src/adapters/hume-evi.js +676 -0
- package/src/admin.html +1339 -0
- package/src/app.js +8802 -0
- package/src/core/Config.js +173 -0
- package/src/core/EmotionEngine.js +307 -0
- package/src/core/EventBridge.js +180 -0
- package/src/core/EventBus.js +117 -0
- package/src/core/VoiceSession.js +607 -0
- package/src/face/BaseFace.js +259 -0
- package/src/face/EyeFace.js +208 -0
- package/src/face/HaloSmokeFace.js +509 -0
- package/src/face/manifest.json +27 -0
- package/src/face/previews/eyes.svg +16 -0
- package/src/face/previews/orb.svg +29 -0
- package/src/features/MusicPlayer.js +620 -0
- package/src/features/Soundboard.js +128 -0
- package/src/providers/DeepgramSTT.js +472 -0
- package/src/providers/DeepgramStreamingSTT.js +766 -0
- package/src/providers/GroqSTT.js +559 -0
- package/src/providers/TTSPlayer.js +323 -0
- package/src/providers/WebSpeechSTT.js +479 -0
- package/src/providers/tts/BaseTTSProvider.js +81 -0
- package/src/providers/tts/HumeProvider.js +77 -0
- package/src/providers/tts/SupertonicProvider.js +174 -0
- package/src/providers/tts/index.js +140 -0
- package/src/shell/adapter-registry.js +154 -0
- package/src/shell/caller-bridge.js +35 -0
- package/src/shell/camera-bridge.js +28 -0
- package/src/shell/canvas-bridge.js +32 -0
- package/src/shell/commercial-bridge.js +44 -0
- package/src/shell/face-bridge.js +44 -0
- package/src/shell/music-bridge.js +60 -0
- package/src/shell/orchestrator.js +233 -0
- package/src/shell/profile-discovery.js +303 -0
- package/src/shell/sounds-bridge.js +28 -0
- package/src/shell/transcript-bridge.js +61 -0
- package/src/shell/waveform-bridge.js +33 -0
- package/src/styles/base.css +2862 -0
- package/src/styles/face.css +417 -0
- package/src/styles/pi-overrides.css +89 -0
- package/src/styles/theme-dark.css +67 -0
- package/src/test-tts.html +175 -0
- package/src/ui/AppShell.js +544 -0
- package/src/ui/ProfileSwitcher.js +228 -0
- package/src/ui/SessionControl.js +240 -0
- package/src/ui/face/FacePicker.js +195 -0
- package/src/ui/face/FaceRenderer.js +309 -0
- package/src/ui/settings/PlaylistEditor.js +366 -0
- package/src/ui/settings/SettingsPanel.css +684 -0
- package/src/ui/settings/SettingsPanel.js +419 -0
- package/src/ui/settings/TTSVoicePreview.js +210 -0
- package/src/ui/themes/ThemeManager.js +213 -0
- package/src/ui/visualizers/BaseVisualizer.js +29 -0
- package/src/ui/visualizers/PartyFXVisualizer.css +291 -0
- package/src/ui/visualizers/PartyFXVisualizer.js +637 -0
- package/static/emulators/jsdos/js-dos.css +1 -0
- package/static/emulators/jsdos/js-dos.js +22 -0
- package/static/favicon.svg +55 -0
- package/static/icons/apple-touch-icon.png +0 -0
- package/static/icons/favicon-32.png +0 -0
- package/static/icons/icon-192.png +0 -0
- package/static/icons/icon-512.png +0 -0
- package/static/install.html +449 -0
- package/static/manifest.json +26 -0
- package/static/sw.js +21 -0
- package/tts_providers/__init__.py +136 -0
- package/tts_providers/base_provider.py +319 -0
- package/tts_providers/groq_provider.py +155 -0
- package/tts_providers/hume_provider.py +226 -0
- package/tts_providers/providers_config.json +119 -0
- package/tts_providers/qwen3_provider.py +371 -0
- package/tts_providers/resemble_provider.py +315 -0
- package/tts_providers/supertonic_provider.py +557 -0
- package/tts_providers/supertonic_tts.py +399 -0
|
@@ -0,0 +1,371 @@
|
|
|
1
|
+
"""
|
|
2
|
+
Qwen3-TTS Provider — fal.ai hosted Qwen3-TTS models.
|
|
3
|
+
|
|
4
|
+
Supports:
|
|
5
|
+
- Named speaker TTS (0.6B and 1.7B)
|
|
6
|
+
- Voice cloning from audio samples via clone-voice endpoint
|
|
7
|
+
- Emotion/style control via prompt (1.7B)
|
|
8
|
+
- Cloned voice embeddings stored locally for reuse
|
|
9
|
+
|
|
10
|
+
API key: FAL_KEY env var
|
|
11
|
+
"""
|
|
12
|
+
|
|
13
|
+
import json
|
|
14
|
+
import os
|
|
15
|
+
import time
|
|
16
|
+
import logging
|
|
17
|
+
from pathlib import Path
|
|
18
|
+
from typing import Optional
|
|
19
|
+
|
|
20
|
+
import httpx
|
|
21
|
+
|
|
22
|
+
from .base_provider import TTSProvider
|
|
23
|
+
|
|
24
|
+
logger = logging.getLogger(__name__)
|
|
25
|
+
|
|
26
|
+
# fal.ai endpoints
|
|
27
|
+
FAL_TTS_1_7B = "https://fal.run/fal-ai/qwen-3-tts/text-to-speech/1.7b"
|
|
28
|
+
FAL_TTS_0_6B = "https://fal.run/fal-ai/qwen-3-tts/text-to-speech/0.6b"
|
|
29
|
+
FAL_CLONE_1_7B = "https://fal.run/fal-ai/qwen-3-tts/clone-voice/1.7b"
|
|
30
|
+
FAL_CLONE_0_6B = "https://fal.run/fal-ai/qwen-3-tts/clone-voice/0.6b"
|
|
31
|
+
|
|
32
|
+
BUILTIN_VOICES = [
|
|
33
|
+
"Vivian", # Female, warm
|
|
34
|
+
"Serena", # Female, clear
|
|
35
|
+
"Dylan", # Male, casual
|
|
36
|
+
"Eric", # Male, professional
|
|
37
|
+
"Ryan", # Male, energetic
|
|
38
|
+
"Aiden", # Male, deep
|
|
39
|
+
"Uncle_Fu", # Male, character
|
|
40
|
+
"Ono_Anna", # Female, Japanese accent
|
|
41
|
+
"Sohee", # Female, Korean accent
|
|
42
|
+
]
|
|
43
|
+
|
|
44
|
+
|
|
45
|
+
def _get_clones_dir() -> Path:
|
|
46
|
+
"""Resolve voice clones directory from paths module or fallback."""
|
|
47
|
+
try:
|
|
48
|
+
from services.paths import VOICE_CLONES_DIR
|
|
49
|
+
return VOICE_CLONES_DIR
|
|
50
|
+
except ImportError:
|
|
51
|
+
return Path(os.getenv("VOICE_CLONES_DIR", "./runtime/voice-clones"))
|
|
52
|
+
|
|
53
|
+
|
|
54
|
+
def _fal_request(api_key: str, endpoint: str, payload: dict,
                 timeout: float = 90.0) -> dict:
    """POST *payload* as JSON to a fal.ai *endpoint* and return the parsed body.

    Uses a 10s connect timeout and *timeout* for the overall request.
    Raises httpx.HTTPStatusError on a non-2xx response.
    """
    request_headers = {
        'Authorization': f'Key {api_key}',
        'Content-Type': 'application/json',
    }
    timeout_cfg = httpx.Timeout(timeout, connect=10.0)
    with httpx.Client(timeout=timeout_cfg) as session:
        response = session.post(endpoint, json=payload, headers=request_headers)
        response.raise_for_status()
        return response.json()
|
|
65
|
+
|
|
66
|
+
|
|
67
|
+
def _fal_download(url: str, timeout: float = 30.0) -> bytes:
    """Fetch and return the raw bytes behind a fal.ai result URL.

    Raises httpx.HTTPStatusError on a non-2xx response.
    """
    with httpx.Client(timeout=httpx.Timeout(timeout)) as session:
        response = session.get(url)
        response.raise_for_status()
        return response.content
|
|
73
|
+
|
|
74
|
+
|
|
75
|
+
class Qwen3Provider(TTSProvider):
    """
    TTS Provider using Qwen3-TTS via fal.ai.

    Built-in voices: Vivian, Serena, Dylan, Eric, Ryan, Aiden, Uncle_Fu, Ono_Anna, Sohee
    Cloned voices: stored locally as .safetensors embeddings, referenced by voice_id
    Output: MP3 audio bytes
    """

    def __init__(self):
        super().__init__()
        # A missing FAL_KEY is recorded rather than raised so the provider
        # can still be enumerated (with an error status) by the registry.
        self.api_key = os.getenv('FAL_KEY', '')
        self._status = 'active' if self.api_key else 'error'
        self._init_error = None if self.api_key else 'FAL_KEY not set in environment'

    # ------------------------------------------------------------------
    # Voice cloning
    # ------------------------------------------------------------------

    def clone_voice(self, audio_url: str, name: str,
                    reference_text: Optional[str] = None) -> dict:
        """
        Clone a voice from a reference audio sample.

        Args:
            audio_url: Public URL to reference audio (WAV/MP3, 3+ seconds).
            name: Human-readable name for this cloned voice.
            reference_text: Optional transcript of what's said in the audio
                (improves quality).

        Returns:
            dict with: voice_id, name, embedding_url, created_at, metadata

        Raises:
            RuntimeError: if FAL_KEY is unset, the API call fails, or the
                response contains no usable embedding URL.
        """
        if not self.api_key:
            raise RuntimeError("FAL_KEY not set — cannot clone voice")

        t = time.time()
        logger.info(f"[Qwen3] Cloning voice '{name}' from {audio_url[:80]}")

        payload = {"audio_url": audio_url}
        if reference_text:
            payload["reference_text"] = reference_text

        try:
            # Cloning is slower than plain synthesis — allow a longer timeout.
            result = _fal_request(self.api_key, FAL_CLONE_1_7B, payload,
                                  timeout=120.0)
        except httpx.HTTPStatusError as e:
            raise RuntimeError(
                f"fal.ai clone error {e.response.status_code}: {e.response.text}"
            ) from e

        # Extract embedding URL from response. `or {}` guards against the
        # API returning an explicit JSON null for either key — a plain
        # .get(key, {}) default only applies when the key is absent, so
        # a null value would raise AttributeError on NoneType.
        embedding_url = (result.get('speaker_embedding') or {}).get('url')
        if not embedding_url:
            # Try alternate response shapes
            embedding_url = (result.get('audio') or {}).get('url')
        if not embedding_url:
            raise RuntimeError(f"No embedding URL in fal.ai response: {result}")

        elapsed_ms = int((time.time() - t) * 1000)

        # Download and persist the embedding locally
        embedding_bytes = _fal_download(embedding_url)

        clones_dir = _get_clones_dir()
        # voice_id = sanitized name: lowercased, spaces → underscores,
        # alphanumerics/underscore only, capped at 40 chars.
        voice_id = "clone_" + "".join(
            c for c in name.lower().replace(" ", "_")
            if c.isalnum() or c == "_"
        )[:40]
        voice_dir = clones_dir / voice_id
        voice_dir.mkdir(parents=True, exist_ok=True)

        embedding_path = voice_dir / "embedding.safetensors"
        with open(embedding_path, 'wb') as f:
            f.write(embedding_bytes)

        metadata = {
            "voice_id": voice_id,
            "name": name,
            "embedding_url": embedding_url,
            "embedding_size": len(embedding_bytes),
            "reference_text": reference_text,
            "source_audio_url": audio_url,
            "created_at": time.strftime("%Y-%m-%dT%H:%M:%SZ", time.gmtime()),
            "clone_time_ms": elapsed_ms,
            "provider": "qwen3",
            "fal_response": result,
        }
        with open(voice_dir / "metadata.json", 'w') as f:
            json.dump(metadata, f, indent=2)

        logger.info(
            f"[Qwen3] Voice cloned: {voice_id} ({len(embedding_bytes)} bytes) "
            f"in {elapsed_ms}ms"
        )
        return metadata

    def list_cloned_voices(self) -> list:
        """List all locally stored cloned voice embeddings.

        Returns:
            List of metadata dicts (one per clone directory that has a
            readable metadata.json), each augmented with a ``has_embedding``
            flag indicating whether the .safetensors file is present.
        """
        clones_dir = _get_clones_dir()
        voices = []
        if not clones_dir.exists():
            return voices
        for voice_dir in sorted(clones_dir.iterdir()):
            meta_path = voice_dir / "metadata.json"
            if meta_path.exists():
                try:
                    with open(meta_path) as f:
                        meta = json.load(f)
                    meta["has_embedding"] = (voice_dir / "embedding.safetensors").exists()
                    voices.append(meta)
                except Exception as e:
                    # Skip corrupt entries rather than failing the whole listing.
                    logger.warning(f"Bad voice metadata in {voice_dir}: {e}")
        return voices

    def get_clone_embedding_url(self, voice_id: str) -> Optional[str]:
        """Get the fal.ai embedding URL for a cloned voice.

        Returns the cached remote URL from metadata. The embedding is also
        stored locally as a fallback, but fal.ai needs the URL for generation.
        Returns None when the voice is unknown or its metadata is unreadable.
        """
        clones_dir = _get_clones_dir()
        meta_path = clones_dir / voice_id / "metadata.json"
        if not meta_path.exists():
            return None
        try:
            with open(meta_path) as f:
                meta = json.load(f)
            return meta.get("embedding_url")
        except Exception:
            return None

    # ------------------------------------------------------------------
    # Speech generation
    # ------------------------------------------------------------------

    def generate_speech(self, text: str, voice: str = 'Vivian', **kwargs) -> bytes:
        """
        Generate speech via fal.ai Qwen3-TTS.

        Args:
            text: Text to synthesize.
            voice: Built-in voice name OR cloned voice_id (clone_xxx).
            **kwargs:
                language: Language name (default 'English').
                prompt: Style/emotion instruction for 1.7B model.
                speaker_embedding_url: Direct embedding URL override.
                reference_text: Reference text for cloned voice quality.
                model: '0.6b' or '1.7b' (default '1.7b').

        Returns:
            MP3 audio bytes.

        Raises:
            RuntimeError: if FAL_KEY is unset, a cloned voice cannot be
                resolved, or the fal.ai call fails or returns no audio URL.
        """
        if not self.api_key:
            raise RuntimeError("FAL_KEY not set — cannot call fal.ai API")

        self.validate_text(text)

        language = kwargs.get('language', 'English')
        prompt = kwargs.get('prompt', '')
        embedding_url = kwargs.get('speaker_embedding_url')
        reference_text = kwargs.get('reference_text', '')
        model = kwargs.get('model', '1.7b')

        # Any value other than '1.7b' falls back to the 0.6b endpoint.
        endpoint = FAL_TTS_1_7B if model == '1.7b' else FAL_TTS_0_6B

        # Resolve cloned voice → embedding URL
        is_cloned = voice.startswith("clone_") if voice else False
        if is_cloned and not embedding_url:
            embedding_url = self.get_clone_embedding_url(voice)
            if not embedding_url:
                raise RuntimeError(
                    f"Cloned voice '{voice}' not found or missing embedding"
                )
            # Load reference_text from metadata if not provided
            if not reference_text:
                clones_dir = _get_clones_dir()
                meta_path = clones_dir / voice / "metadata.json"
                if meta_path.exists():
                    try:
                        with open(meta_path) as f:
                            meta = json.load(f)
                        reference_text = meta.get("reference_text", "")
                    except Exception:
                        # Best-effort: quality hint only, never fatal.
                        pass

        payload = {
            "text": text,
            "language": language,
        }

        if embedding_url:
            # Cloned voice — use embedding, skip built-in voice
            payload["speaker_voice_embedding_file_url"] = embedding_url
            if reference_text:
                payload["reference_text"] = reference_text
            if prompt:
                payload["prompt"] = prompt
        else:
            # Built-in voice
            if voice not in BUILTIN_VOICES:
                logger.warning(f"Unknown voice '{voice}', falling back to Vivian")
                voice = 'Vivian'
            payload["voice"] = voice
            if prompt:
                payload["prompt"] = prompt

        t = time.time()
        voice_label = voice if not is_cloned else f"{voice} (cloned)"
        logger.info(f"[Qwen3] TTS: '{text[:60]}...' voice={voice_label}")

        try:
            result = _fal_request(self.api_key, endpoint, payload)
        except httpx.HTTPStatusError as e:
            raise RuntimeError(
                f"fal.ai API error {e.response.status_code}: {e.response.text}"
            ) from e
        except Exception as e:
            raise RuntimeError(f"fal.ai request failed: {e}") from e

        # `or {}` guards against an explicit JSON null 'audio' value
        # (a .get default only covers a missing key, not a null one).
        audio_url = (result.get('audio') or {}).get('url')
        if not audio_url:
            raise RuntimeError(f"No audio URL in fal.ai response: {result}")

        audio_bytes = _fal_download(audio_url)

        elapsed = int((time.time() - t) * 1000)
        logger.info(f"[Qwen3] Generated {len(audio_bytes)} bytes in {elapsed}ms")
        return audio_bytes

    # ------------------------------------------------------------------
    # Provider interface
    # ------------------------------------------------------------------

    def health_check(self) -> dict:
        """Probe fal.ai reachability; returns {ok, latency_ms, detail}.

        NOTE: the HTTP status of the probe is deliberately not checked —
        receiving any response at all counts as "reachable".
        """
        if not self.api_key:
            return {"ok": False, "latency_ms": 0, "detail": "FAL_KEY not set"}
        t = time.time()
        try:
            with httpx.Client(timeout=httpx.Timeout(8.0)) as client:
                resp = client.get(
                    "https://fal.run/",
                    headers={"Authorization": f"Key {self.api_key}"},
                )
            latency_ms = int((time.time() - t) * 1000)
            return {
                "ok": True, "latency_ms": latency_ms,
                "detail": "fal.ai reachable — Qwen3-TTS ready",
            }
        except Exception as e:
            latency_ms = int((time.time() - t) * 1000)
            return {"ok": False, "latency_ms": latency_ms, "detail": str(e)}

    def list_voices(self) -> list:
        """Return built-in voice names plus locally stored clone voice_ids."""
        voices = BUILTIN_VOICES.copy()
        for clone in self.list_cloned_voices():
            voices.append(clone["voice_id"])
        return voices

    def get_default_voice(self) -> str:
        """Default built-in voice used when none is specified."""
        return 'Vivian'

    def is_available(self) -> bool:
        """True when FAL_KEY is configured (does not probe the network)."""
        return bool(self.api_key)

    def get_info(self) -> dict:
        """Return the static provider descriptor plus current cloned voices."""
        cloned = self.list_cloned_voices()
        return {
            'name': 'Qwen3-TTS (fal.ai)',
            'provider_id': 'qwen3',
            'status': self._status,
            'description': (
                'Qwen3-TTS via fal.ai — expressive, multilingual, '
                'voice cloning, emotion control'
            ),
            'quality': 'very-high',
            'latency': 'fast',
            'cost_per_minute': 0.003,
            'voices': BUILTIN_VOICES.copy(),
            'cloned_voices': [
                {"voice_id": c["voice_id"], "name": c["name"]}
                for c in cloned
            ],
            'features': [
                'multilingual', 'expressive', 'voice-cloning',
                'emotion-control', 'cloud', 'mp3-output',
            ],
            'requires_api_key': True,
            'languages': ['en', 'zh', 'es', 'fr', 'de', 'it', 'ja', 'ko', 'pt', 'ru'],
            'max_characters': 5000,
            'notes': 'Qwen3-TTS 1.7B + 0.6B. Voice cloning via clone-voice endpoint. FAL_KEY required.',
            'default_voice': 'Vivian',
            'audio_format': 'mp3',
            'sample_rate': 24000,
            'error': self._init_error,
        }
|
|
@@ -0,0 +1,315 @@
|
|
|
1
|
+
"""
|
|
2
|
+
Resemble AI TTS Provider — Chatterbox models via Resemble API.
|
|
3
|
+
|
|
4
|
+
Supports:
|
|
5
|
+
- HTTP streaming TTS (chunked WAV, progressive playback)
|
|
6
|
+
- Multiple models: chatterbox (original), chatterbox-turbo, chatterbox-multilingual
|
|
7
|
+
- Voice cloning via Resemble dashboard (voice_uuid per clone)
|
|
8
|
+
- SSML support (prosody, emphasis, breaks, prompts)
|
|
9
|
+
- Emotion/exaggeration control
|
|
10
|
+
- 90+ languages (multilingual model)
|
|
11
|
+
- 8-48kHz sample rate, PCM_16/24/32/MULAW
|
|
12
|
+
|
|
13
|
+
API key: RESEMBLE_API_KEY env var
|
|
14
|
+
Synthesis server: https://f.cluster.resemble.ai
|
|
15
|
+
API server: https://app.resemble.ai/api/v2
|
|
16
|
+
"""
|
|
17
|
+
|
|
18
|
+
import os
|
|
19
|
+
import io
|
|
20
|
+
import time
|
|
21
|
+
import logging
|
|
22
|
+
import threading
|
|
23
|
+
|
|
24
|
+
import httpx
|
|
25
|
+
|
|
26
|
+
from .base_provider import TTSProvider
|
|
27
|
+
|
|
28
|
+
logger = logging.getLogger(__name__)
|
|
29
|
+
|
|
30
|
+
# Resemble API endpoints
|
|
31
|
+
SYNTHESIS_URL = "https://f.cluster.resemble.ai/stream"
|
|
32
|
+
API_BASE_URL = "https://app.resemble.ai/api/v2"
|
|
33
|
+
|
|
34
|
+
# Models available via Resemble API
|
|
35
|
+
MODELS = {
|
|
36
|
+
"chatterbox": "Default Chatterbox — emotion exaggeration + CFG control",
|
|
37
|
+
"chatterbox-turbo": "Chatterbox Turbo — lowest latency, paralinguistic tags",
|
|
38
|
+
"chatterbox-multilingual": "Chatterbox Multilingual — 23+ languages",
|
|
39
|
+
}
|
|
40
|
+
|
|
41
|
+
DEFAULT_MODEL = "chatterbox-turbo"
|
|
42
|
+
|
|
43
|
+
# Timeouts
|
|
44
|
+
STREAM_TIMEOUT = 30.0 # Max wait for full streaming response
|
|
45
|
+
CONNECT_TIMEOUT = 10.0 # TCP connect timeout
|
|
46
|
+
API_TIMEOUT = 15.0 # For voice listing / non-synthesis calls
|
|
47
|
+
|
|
48
|
+
# Module-level voice cache — shared across all ResembleProvider instances.
|
|
49
|
+
# list_providers() creates new instances each call, so instance-level cache
|
|
50
|
+
# is lost. This persists across the process lifetime.
|
|
51
|
+
_voices_cache_global = None
|
|
52
|
+
_voices_cache_time_global = 0
|
|
53
|
+
_voices_loading_global = False
|
|
54
|
+
|
|
55
|
+
|
|
56
|
+
class ResembleProvider(TTSProvider):
|
|
57
|
+
"""
|
|
58
|
+
TTS Provider using Resemble AI's Chatterbox API.
|
|
59
|
+
|
|
60
|
+
Uses HTTP streaming endpoint for progressive audio delivery.
|
|
61
|
+
Voices are managed via Resemble dashboard — each voice has a UUID.
|
|
62
|
+
|
|
63
|
+
Output: WAV audio bytes (PCM_16, configurable sample rate)
|
|
64
|
+
Latency: sub-200ms time-to-first-byte (streaming)
|
|
65
|
+
Cost: pay-as-you-go, character-based
|
|
66
|
+
"""
|
|
67
|
+
|
|
68
|
+
def __init__(self):
    """Read RESEMBLE_API_KEY and kick off a background voice-cache warm-up."""
    super().__init__()
    self.api_key = os.getenv('RESEMBLE_API_KEY', '')
    if self.api_key:
        self._status = 'active'
        self._init_error = None
    else:
        self._status = 'error'
        self._init_error = 'RESEMBLE_API_KEY not set'

    # Warm the module-level voice cache once per process. A daemon thread
    # is used so a slow API call never blocks startup or shutdown.
    global _voices_loading_global
    should_warm = (
        self.api_key
        and not _voices_cache_global
        and not _voices_loading_global
    )
    if should_warm:
        _voices_loading_global = True
        warmer = threading.Thread(target=self._fetch_voices_from_api, daemon=True)
        warmer.start()
|
|
80
|
+
|
|
81
|
+
def _auth_headers(self):
    """Bearer-token + JSON content headers used on every Resemble API call."""
    headers = {'Authorization': f'Bearer {self.api_key}'}
    headers['Content-Type'] = 'application/json'
    return headers
|
|
86
|
+
|
|
87
|
+
# ------------------------------------------------------------------
|
|
88
|
+
# Voice listing (cached from Resemble API)
|
|
89
|
+
# ------------------------------------------------------------------
|
|
90
|
+
|
|
91
|
+
def _fetch_voices_from_api(self) -> list:
    """Fetch available voices from Resemble API. Cached globally for 5 minutes."""
    global _voices_cache_global, _voices_cache_time_global, _voices_loading_global
    now = time.time()
    cache_fresh = _voices_cache_global and (now - _voices_cache_time_global) < 300
    if cache_fresh:
        return _voices_cache_global

    try:
        collected = []
        with httpx.Client(timeout=httpx.Timeout(API_TIMEOUT)) as client:
            page = 1
            while True:
                resp = client.get(
                    f"{API_BASE_URL}/voices",
                    params={"page": page, "page_size": 50},
                    headers=self._auth_headers(),
                )
                resp.raise_for_status()
                data = resp.json()

                # Only surface voices the API marks as usable.
                for entry in data.get('items', []):
                    if entry.get('voice_status') != 'Ready':
                        continue
                    support = entry.get('api_support', {})
                    collected.append({
                        'id': entry.get('uuid', ''),
                        'name': entry.get('name', 'Unknown'),
                        'language': entry.get('default_language', 'en'),
                        'streaming': support.get('streaming', False),
                    })

                if page >= data.get('num_pages', 1):
                    break
                page += 1

        _voices_cache_global = collected
        _voices_cache_time_global = now
        _voices_loading_global = False
        logger.info(f"[Resemble] Fetched {len(collected)} voices from API")
        return collected

    except Exception as e:
        # On failure, clear the loading flag and serve whatever we had.
        _voices_loading_global = False
        logger.warning(f"[Resemble] Failed to fetch voices: {e}")
        return _voices_cache_global or []
|
|
134
|
+
|
|
135
|
+
# ------------------------------------------------------------------
|
|
136
|
+
# Speech generation (HTTP streaming)
|
|
137
|
+
# ------------------------------------------------------------------
|
|
138
|
+
|
|
139
|
+
def generate_speech(self, text: str, voice: str = '', **kwargs) -> bytes:
    """
    Generate speech via Resemble streaming API.

    Args:
        text: Text or SSML to synthesize (max 2000 chars).
        voice: Resemble voice UUID. If empty, uses RESEMBLE_VOICE_UUID env var.
        **kwargs:
            model: 'chatterbox', 'chatterbox-turbo', or 'chatterbox-multilingual'
            sample_rate: 8000-48000 (default 24000)
            precision: 'PCM_16', 'PCM_24', 'PCM_32', 'MULAW' (default PCM_16)
            exaggeration: 0.0-1.0 emotion intensity (via SSML prompt attr)

    Returns:
        WAV audio bytes.

    Raises:
        RuntimeError: on missing credentials/voice, API error, timeout,
            or an implausibly small response body.
    """
    if not self.api_key:
        raise RuntimeError("RESEMBLE_API_KEY not set")

    self.validate_text(text)

    # Resolve voice — accept UUID or display name
    voice_uuid = voice or os.getenv('RESEMBLE_VOICE_UUID', '')
    if not voice_uuid:
        raise RuntimeError(
            "No voice_uuid provided and RESEMBLE_VOICE_UUID not set. "
            "Create a voice at app.resemble.ai and set the UUID."
        )

    # A value containing anything beyond lowercase hex is treated as a
    # display name and resolved against the cached voice list.
    looks_like_uuid = all(c in '0123456789abcdef' for c in voice_uuid)
    if not looks_like_uuid:
        cache = _voices_cache_global or self._fetch_voices_from_api()
        resolved = next((v for v in cache if v['name'] == voice_uuid), None)
        if resolved is not None:
            logger.info(f"[Resemble] Resolved voice name '{voice_uuid}' → {resolved['id']}")
            voice_uuid = resolved['id']
        else:
            logger.warning(f"[Resemble] Voice name '{voice_uuid}' not found in {len(cache)} voices")

    model = kwargs.get('model', '')
    sample_rate = kwargs.get('sample_rate', 24000)
    precision = kwargs.get('precision', 'PCM_16')
    exaggeration = kwargs.get('exaggeration')

    # Wrap plain text in SSML when an exaggeration level was requested.
    if exaggeration is not None and not text.strip().startswith('<speak'):
        text = f'<speak exaggeration="{exaggeration}">{text}</speak>'

    payload = {
        'voice_uuid': voice_uuid,
        'data': text[:2000],  # API limit
        'precision': precision,
        'sample_rate': sample_rate,
    }

    # Only include model if explicitly requested — API defaults to
    # the correct model for each voice. Forcing chatterbox-turbo on
    # voices that don't support it returns 500.
    if model:
        payload['model'] = model

    t = time.time()
    logger.info(
        f"[Resemble] TTS: '{text[:60]}...' model={model} "
        f"voice={voice_uuid[:12]}..."
    )

    try:
        timeout_cfg = httpx.Timeout(STREAM_TIMEOUT, connect=CONNECT_TIMEOUT)
        with httpx.Client(timeout=timeout_cfg) as client:
            resp = client.post(
                SYNTHESIS_URL,
                json=payload,
                headers=self._auth_headers(),
            )
            resp.raise_for_status()
            audio_bytes = resp.content
    except httpx.HTTPStatusError as e:
        status = e.response.status_code
        body = e.response.text[:200]
        raise RuntimeError(f"Resemble API error {status}: {body}")
    except httpx.TimeoutException:
        raise RuntimeError(
            f"Resemble API timeout after {STREAM_TIMEOUT}s"
        )
    except Exception as e:
        raise RuntimeError(f"Resemble request failed: {e}")

    elapsed = int((time.time() - t) * 1000)
    logger.info(f"[Resemble] Generated {len(audio_bytes)} bytes in {elapsed}ms")

    # A tiny body is almost certainly an error payload, not audio.
    if len(audio_bytes) < 100:
        raise RuntimeError(
            f"Resemble returned suspiciously small response ({len(audio_bytes)} bytes)"
        )

    return audio_bytes
|
|
239
|
+
|
|
240
|
+
# ------------------------------------------------------------------
|
|
241
|
+
# Provider interface
|
|
242
|
+
# ------------------------------------------------------------------
|
|
243
|
+
|
|
244
|
+
def health_check(self) -> dict:
    """Probe the Resemble API and report reachability.

    Returns a dict with ``ok`` (bool), ``latency_ms`` (int), and a
    human-readable ``detail`` string. A missing API key short-circuits
    with latency 0 and no network traffic; any network/HTTP failure is
    reported in ``detail`` rather than raised.
    """
    if not self.api_key:
        return {"ok": False, "latency_ms": 0, "detail": "RESEMBLE_API_KEY not set"}

    started = time.time()
    try:
        # A single-item voices listing is the cheapest authenticated call.
        with httpx.Client(timeout=httpx.Timeout(API_TIMEOUT)) as client:
            response = client.get(
                f"{API_BASE_URL}/voices",
                params={"page": 1, "page_size": 1},
                headers=self._auth_headers(),
            )
            response.raise_for_status()
            return {
                "ok": True,
                "latency_ms": int((time.time() - started) * 1000),
                "detail": "Resemble API reachable — Chatterbox ready",
            }
    except Exception as exc:  # boundary: report any failure, never raise
        return {
            "ok": False,
            "latency_ms": int((time.time() - started) * 1000),
            "detail": str(exc),
        }
|
|
264
|
+
|
|
265
|
+
def list_voices(self) -> list:
    """Return the known voice ids.

    Prefers the module-level voices cache (filled by a background
    thread); falls back to a synchronous API fetch when the cache is
    empty. Returns an empty list when no voices are known at all.
    """
    catalog = _voices_cache_global
    if not catalog:
        catalog = self._fetch_voices_from_api()
    if not catalog:
        return []
    return [record['id'] for record in catalog]
|
|
268
|
+
|
|
269
|
+
def get_default_voice(self) -> str:
    """Return the configured voice UUID from the environment ('' when unset)."""
    return os.environ.get('RESEMBLE_VOICE_UUID', '')
|
|
271
|
+
|
|
272
|
+
def is_available(self) -> bool:
    """Whether this provider can be used: True iff an API key is configured."""
    return True if self.api_key else False
|
|
274
|
+
|
|
275
|
+
def get_info(self) -> dict:
    """Describe this provider for the settings panel.

    Voice names come exclusively from the module-level cache, which is
    populated by a background thread on first init; nothing is fetched
    synchronously here, so opening the settings panel never blocks on
    the network.
    """
    voice_names = [entry['name'] for entry in (_voices_cache_global or [])]

    description = (
        'Resemble AI Chatterbox — streaming TTS, voice cloning, '
        'emotion control, SSML, 90+ languages'
    )
    notes = (
        'Streaming HTTP TTS via f.cluster.resemble.ai. '
        'Models: chatterbox-turbo (fastest), chatterbox (emotion), '
        'chatterbox-multilingual (23 langs). '
        'Voice cloning via Resemble dashboard. '
        'RESEMBLE_API_KEY + RESEMBLE_VOICE_UUID required.'
    )
    features = [
        'streaming', 'voice-cloning', 'emotion-control',
        'ssml', 'multilingual', 'cloud', 'wav-output',
        'paralinguistic-tags',
    ]
    languages = [
        'en', 'es', 'fr', 'de', 'it', 'pt', 'ja', 'ko', 'zh',
        'ar', 'ru', 'hi', 'nl', 'pl', 'sv', 'da', 'fi', 'el',
        'cs', 'hu', 'ro', 'tr', 'uk', 'vi', 'th', 'id',
    ]

    return {
        'name': 'Resemble AI (Chatterbox)',
        'provider_id': 'resemble',
        'status': self._status,
        'description': description,
        'quality': 'very-high',
        'latency': 'very-fast',
        'cost_per_minute': 0.10,
        'voices': voice_names,
        'features': features,
        'requires_api_key': True,
        'languages': languages,
        'max_characters': 2000,
        'notes': notes,
        'default_voice': os.getenv('RESEMBLE_VOICE_UUID', ''),
        'audio_format': 'wav',
        'sample_rate': 24000,
        'models': list(MODELS.keys()),
        'error': self._init_error,
    }
|