openvoiceui 1.0.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (185) hide show
  1. package/.env.example +104 -0
  2. package/Dockerfile +30 -0
  3. package/LICENSE +21 -0
  4. package/README.md +638 -0
  5. package/SETUP.md +360 -0
  6. package/app.py +232 -0
  7. package/auto-approve-devices.js +111 -0
  8. package/cli/index.js +372 -0
  9. package/config/__init__.py +4 -0
  10. package/config/default.yaml +43 -0
  11. package/config/flags.yaml +67 -0
  12. package/config/loader.py +203 -0
  13. package/config/providers.yaml +71 -0
  14. package/config/speech_normalization.yaml +182 -0
  15. package/config/theme.json +4 -0
  16. package/data/greetings.json +25 -0
  17. package/default-pages/ai-image-creator.html +915 -0
  18. package/default-pages/bulk-image-uploader.html +492 -0
  19. package/default-pages/desktop.html +2865 -0
  20. package/default-pages/file-explorer.html +854 -0
  21. package/default-pages/interactive-map.html +655 -0
  22. package/default-pages/style-guide.html +1005 -0
  23. package/default-pages/website-setup.html +1623 -0
  24. package/deploy/openclaw/Dockerfile +46 -0
  25. package/deploy/openvoiceui.service +30 -0
  26. package/deploy/setup-nginx.sh +50 -0
  27. package/deploy/setup-sudo.sh +306 -0
  28. package/deploy/skill-runner/Dockerfile +19 -0
  29. package/deploy/skill-runner/requirements.txt +14 -0
  30. package/deploy/skill-runner/server.py +269 -0
  31. package/deploy/supertonic/Dockerfile +22 -0
  32. package/deploy/supertonic/server.py +79 -0
  33. package/docker-compose.pinokio.yml +11 -0
  34. package/docker-compose.yml +59 -0
  35. package/greetings.json +25 -0
  36. package/index.html +65 -0
  37. package/inject-device-identity.js +142 -0
  38. package/package.json +82 -0
  39. package/profiles/default.json +114 -0
  40. package/profiles/manager.py +354 -0
  41. package/profiles/schema.json +337 -0
  42. package/prompts/voice-system-prompt.md +149 -0
  43. package/providers/__init__.py +39 -0
  44. package/providers/base.py +63 -0
  45. package/providers/llm/__init__.py +12 -0
  46. package/providers/llm/base.py +71 -0
  47. package/providers/llm/clawdbot_provider.py +112 -0
  48. package/providers/llm/zai_provider.py +115 -0
  49. package/providers/registry.py +320 -0
  50. package/providers/stt/__init__.py +12 -0
  51. package/providers/stt/base.py +58 -0
  52. package/providers/stt/webspeech_provider.py +49 -0
  53. package/providers/stt/whisper_provider.py +100 -0
  54. package/providers/tts/__init__.py +20 -0
  55. package/providers/tts/base.py +91 -0
  56. package/providers/tts/groq_provider.py +74 -0
  57. package/providers/tts/supertonic_provider.py +72 -0
  58. package/requirements.txt +38 -0
  59. package/routes/__init__.py +10 -0
  60. package/routes/admin.py +515 -0
  61. package/routes/canvas.py +1315 -0
  62. package/routes/chat.py +51 -0
  63. package/routes/conversation.py +2158 -0
  64. package/routes/elevenlabs_hybrid.py +306 -0
  65. package/routes/greetings.py +98 -0
  66. package/routes/icons.py +279 -0
  67. package/routes/image_gen.py +364 -0
  68. package/routes/instructions.py +190 -0
  69. package/routes/music.py +838 -0
  70. package/routes/onboarding.py +43 -0
  71. package/routes/pi.py +62 -0
  72. package/routes/profiles.py +215 -0
  73. package/routes/report_issue.py +68 -0
  74. package/routes/static_files.py +533 -0
  75. package/routes/suno.py +664 -0
  76. package/routes/theme.py +81 -0
  77. package/routes/transcripts.py +199 -0
  78. package/routes/vision.py +348 -0
  79. package/routes/workspace.py +288 -0
  80. package/server.py +1510 -0
  81. package/services/__init__.py +1 -0
  82. package/services/auth.py +143 -0
  83. package/services/canvas_versioning.py +239 -0
  84. package/services/db_pool.py +107 -0
  85. package/services/gateway.py +16 -0
  86. package/services/gateway_manager.py +333 -0
  87. package/services/gateways/__init__.py +12 -0
  88. package/services/gateways/base.py +110 -0
  89. package/services/gateways/compat.py +264 -0
  90. package/services/gateways/openclaw.py +1134 -0
  91. package/services/health.py +100 -0
  92. package/services/memory_client.py +455 -0
  93. package/services/paths.py +26 -0
  94. package/services/speech_normalizer.py +285 -0
  95. package/services/tts.py +270 -0
  96. package/setup-config.js +262 -0
  97. package/sounds/air_horn.mp3 +0 -0
  98. package/sounds/bruh.mp3 +0 -0
  99. package/sounds/crowd_cheer.mp3 +0 -0
  100. package/sounds/gunshot.mp3 +0 -0
  101. package/sounds/impact.mp3 +0 -0
  102. package/sounds/lets_go.mp3 +0 -0
  103. package/sounds/record_stop.mp3 +0 -0
  104. package/sounds/rewind.mp3 +0 -0
  105. package/sounds/sad_trombone.mp3 +0 -0
  106. package/sounds/scratch_long.mp3 +0 -0
  107. package/sounds/yeah.mp3 +0 -0
  108. package/src/adapters/ClawdBotAdapter.js +264 -0
  109. package/src/adapters/_template.js +133 -0
  110. package/src/adapters/elevenlabs-classic.js +841 -0
  111. package/src/adapters/elevenlabs-hybrid.js +812 -0
  112. package/src/adapters/hume-evi.js +676 -0
  113. package/src/admin.html +1339 -0
  114. package/src/app.js +8802 -0
  115. package/src/core/Config.js +173 -0
  116. package/src/core/EmotionEngine.js +307 -0
  117. package/src/core/EventBridge.js +180 -0
  118. package/src/core/EventBus.js +117 -0
  119. package/src/core/VoiceSession.js +607 -0
  120. package/src/face/BaseFace.js +259 -0
  121. package/src/face/EyeFace.js +208 -0
  122. package/src/face/HaloSmokeFace.js +509 -0
  123. package/src/face/manifest.json +27 -0
  124. package/src/face/previews/eyes.svg +16 -0
  125. package/src/face/previews/orb.svg +29 -0
  126. package/src/features/MusicPlayer.js +620 -0
  127. package/src/features/Soundboard.js +128 -0
  128. package/src/providers/DeepgramSTT.js +472 -0
  129. package/src/providers/DeepgramStreamingSTT.js +766 -0
  130. package/src/providers/GroqSTT.js +559 -0
  131. package/src/providers/TTSPlayer.js +323 -0
  132. package/src/providers/WebSpeechSTT.js +479 -0
  133. package/src/providers/tts/BaseTTSProvider.js +81 -0
  134. package/src/providers/tts/HumeProvider.js +77 -0
  135. package/src/providers/tts/SupertonicProvider.js +174 -0
  136. package/src/providers/tts/index.js +140 -0
  137. package/src/shell/adapter-registry.js +154 -0
  138. package/src/shell/caller-bridge.js +35 -0
  139. package/src/shell/camera-bridge.js +28 -0
  140. package/src/shell/canvas-bridge.js +32 -0
  141. package/src/shell/commercial-bridge.js +44 -0
  142. package/src/shell/face-bridge.js +44 -0
  143. package/src/shell/music-bridge.js +60 -0
  144. package/src/shell/orchestrator.js +233 -0
  145. package/src/shell/profile-discovery.js +303 -0
  146. package/src/shell/sounds-bridge.js +28 -0
  147. package/src/shell/transcript-bridge.js +61 -0
  148. package/src/shell/waveform-bridge.js +33 -0
  149. package/src/styles/base.css +2862 -0
  150. package/src/styles/face.css +417 -0
  151. package/src/styles/pi-overrides.css +89 -0
  152. package/src/styles/theme-dark.css +67 -0
  153. package/src/test-tts.html +175 -0
  154. package/src/ui/AppShell.js +544 -0
  155. package/src/ui/ProfileSwitcher.js +228 -0
  156. package/src/ui/SessionControl.js +240 -0
  157. package/src/ui/face/FacePicker.js +195 -0
  158. package/src/ui/face/FaceRenderer.js +309 -0
  159. package/src/ui/settings/PlaylistEditor.js +366 -0
  160. package/src/ui/settings/SettingsPanel.css +684 -0
  161. package/src/ui/settings/SettingsPanel.js +419 -0
  162. package/src/ui/settings/TTSVoicePreview.js +210 -0
  163. package/src/ui/themes/ThemeManager.js +213 -0
  164. package/src/ui/visualizers/BaseVisualizer.js +29 -0
  165. package/src/ui/visualizers/PartyFXVisualizer.css +291 -0
  166. package/src/ui/visualizers/PartyFXVisualizer.js +637 -0
  167. package/static/emulators/jsdos/js-dos.css +1 -0
  168. package/static/emulators/jsdos/js-dos.js +22 -0
  169. package/static/favicon.svg +55 -0
  170. package/static/icons/apple-touch-icon.png +0 -0
  171. package/static/icons/favicon-32.png +0 -0
  172. package/static/icons/icon-192.png +0 -0
  173. package/static/icons/icon-512.png +0 -0
  174. package/static/install.html +449 -0
  175. package/static/manifest.json +26 -0
  176. package/static/sw.js +21 -0
  177. package/tts_providers/__init__.py +136 -0
  178. package/tts_providers/base_provider.py +319 -0
  179. package/tts_providers/groq_provider.py +155 -0
  180. package/tts_providers/hume_provider.py +226 -0
  181. package/tts_providers/providers_config.json +119 -0
  182. package/tts_providers/qwen3_provider.py +371 -0
  183. package/tts_providers/resemble_provider.py +315 -0
  184. package/tts_providers/supertonic_provider.py +557 -0
  185. package/tts_providers/supertonic_tts.py +399 -0
@@ -0,0 +1,371 @@
1
+ """
2
+ Qwen3-TTS Provider — fal.ai hosted Qwen3-TTS models.
3
+
4
+ Supports:
5
+ - Named speaker TTS (0.6B and 1.7B)
6
+ - Voice cloning from audio samples via clone-voice endpoint
7
+ - Emotion/style control via prompt (1.7B)
8
+ - Cloned voice embeddings stored locally for reuse
9
+
10
+ API key: FAL_KEY env var
11
+ """
12
+
13
+ import json
14
+ import os
15
+ import time
16
+ import logging
17
+ from pathlib import Path
18
+ from typing import Optional
19
+
20
+ import httpx
21
+
22
+ from .base_provider import TTSProvider
23
+
24
+ logger = logging.getLogger(__name__)
25
+
26
# fal.ai REST endpoints for the hosted Qwen3-TTS models.
# Two model sizes (1.7b / 0.6b), each with a synthesis and a clone-voice route.
FAL_TTS_1_7B = "https://fal.run/fal-ai/qwen-3-tts/text-to-speech/1.7b"
FAL_TTS_0_6B = "https://fal.run/fal-ai/qwen-3-tts/text-to-speech/0.6b"
FAL_CLONE_1_7B = "https://fal.run/fal-ai/qwen-3-tts/clone-voice/1.7b"
FAL_CLONE_0_6B = "https://fal.run/fal-ai/qwen-3-tts/clone-voice/0.6b"

# Speaker names accepted as the built-in "voice" value.
# Kept as a list (not a tuple) because callers take ``.copy()`` of it.
BUILTIN_VOICES = [
    # Female voices
    "Vivian",    # warm
    "Serena",    # clear
    # Male voices
    "Dylan",     # casual
    "Eric",      # professional
    "Ryan",      # energetic
    "Aiden",     # deep
    "Uncle_Fu",  # character
    # Accented female voices
    "Ono_Anna",  # Japanese accent
    "Sohee",     # Korean accent
]
43
+
44
+
45
def _get_clones_dir() -> Path:
    """Return the directory under which cloned-voice folders are stored.

    The project-wide ``services.paths.VOICE_CLONES_DIR`` is used when that
    module can be imported. Otherwise the ``VOICE_CLONES_DIR`` environment
    variable is consulted, defaulting to ``./runtime/voice-clones``.
    """
    try:
        from services.paths import VOICE_CLONES_DIR
    except ImportError:
        # Paths module unavailable: fall back to env var / relative default.
        fallback = os.getenv("VOICE_CLONES_DIR", "./runtime/voice-clones")
        return Path(fallback)
    return VOICE_CLONES_DIR
52
+
53
+
54
def _fal_request(api_key: str, endpoint: str, payload: dict,
                 timeout: float = 90.0) -> dict:
    """POST ``payload`` as JSON to a fal.ai endpoint and return the decoded body.

    Args:
        api_key: fal.ai key, sent as ``Authorization: Key <api_key>``.
        endpoint: Full URL of the fal.ai route to call.
        payload: JSON-serializable request body.
        timeout: Overall timeout in seconds; the connect phase is capped at 10s.

    Returns:
        The response body parsed from JSON.

    Raises:
        httpx.HTTPStatusError: on a 4xx/5xx response (via ``raise_for_status``).
    """
    request_timeout = httpx.Timeout(timeout, connect=10.0)
    auth_headers = {
        'Authorization': f'Key {api_key}',
        'Content-Type': 'application/json',
    }
    with httpx.Client(timeout=request_timeout) as http:
        response = http.post(endpoint, json=payload, headers=auth_headers)
        response.raise_for_status()
        return response.json()
65
+
66
+
67
def _fal_download(url: str, timeout: float = 30.0) -> bytes:
    """Fetch ``url`` with a GET request and return the raw response body.

    Args:
        url: Result URL handed back by fal.ai (audio file or embedding).
        timeout: Timeout in seconds applied to the request.

    Raises:
        httpx.HTTPStatusError: on a 4xx/5xx response (via ``raise_for_status``).
    """
    download_timeout = httpx.Timeout(timeout)
    with httpx.Client(timeout=download_timeout) as http:
        response = http.get(url)
        response.raise_for_status()
        return response.content
73
+
74
+
75
class Qwen3Provider(TTSProvider):
    """
    TTS Provider using Qwen3-TTS via fal.ai.

    Built-in voices: Vivian, Serena, Dylan, Eric, Ryan, Aiden, Uncle_Fu, Ono_Anna, Sohee
    Cloned voices: one sub-directory per voice_id inside the clones directory,
    holding ``embedding.safetensors`` and ``metadata.json``
    Output: audio bytes downloaded from the URL fal.ai returns (reported as
    MP3 by ``get_info``)
    """

    def __init__(self):
        """Read ``FAL_KEY`` from the environment and record provider status."""
        super().__init__()
        self.api_key = os.getenv('FAL_KEY', '')
        self._status = 'active' if self.api_key else 'error'
        self._init_error = None if self.api_key else 'FAL_KEY not set in environment'

    # ------------------------------------------------------------------
    # Internal helpers
    # ------------------------------------------------------------------

    @staticmethod
    def _result_url(result, key: str) -> Optional[str]:
        """Return ``result[key]['url']``, or None if the entry is absent/null/not a dict."""
        entry = result.get(key) if isinstance(result, dict) else None
        if isinstance(entry, dict):
            return entry.get('url')
        return None

    @staticmethod
    def _is_safe_voice_id(voice_id) -> bool:
        """Tell whether ``voice_id`` can be used as a single path component.

        Rejects empty values, ``.``/``..`` and anything containing a path
        separator or NUL, so a caller-supplied id cannot escape the clones
        directory when it is joined into a filesystem path.
        """
        if not isinstance(voice_id, str) or voice_id in ("", ".", ".."):
            return False
        return not any(ch in voice_id for ch in ("/", "\\", "\x00"))

    @staticmethod
    def _make_clone_voice_id(name: str) -> str:
        """Derive the on-disk voice_id (``clone_<slug>``) from a display name.

        The slug is the lower-cased name with spaces turned into underscores,
        stripped of everything that is not alphanumeric or ``_``, and capped at
        40 characters. If nothing usable remains, a timestamp-based slug is
        used so unrelated clones do not all land in the same ``clone_`` folder.
        """
        slug = "".join(
            ch for ch in name.lower().replace(" ", "_")
            if ch.isalnum() or ch == "_"
        )[:40]
        if not slug.strip("_"):
            slug = f"voice_{int(time.time())}"
        return "clone_" + slug

    def _load_clone_metadata(self, voice_id: str) -> Optional[dict]:
        """Load ``metadata.json`` for a cloned voice.

        Returns:
            The metadata dict, or None when the id is unsafe, the file is
            missing or unreadable, or its content is not a JSON object.
        """
        if not self._is_safe_voice_id(voice_id):
            logger.warning(f"[Qwen3] Rejected unsafe voice id {voice_id!r}")
            return None
        meta_path = _get_clones_dir() / voice_id / "metadata.json"
        try:
            with open(meta_path, encoding="utf-8") as f:
                meta = json.load(f)
        except FileNotFoundError:
            return None
        except (OSError, ValueError) as e:
            # ValueError covers both JSON decode and text decode failures.
            logger.warning(f"[Qwen3] Unreadable metadata for {voice_id!r}: {e}")
            return None
        return meta if isinstance(meta, dict) else None

    # ------------------------------------------------------------------
    # Voice cloning
    # ------------------------------------------------------------------

    def clone_voice(self, audio_url: str, name: str,
                    reference_text: Optional[str] = None) -> dict:
        """
        Clone a voice from a reference audio sample.

        Calls the 1.7B clone-voice endpoint, downloads the returned embedding,
        and writes it together with a ``metadata.json`` into
        ``<clones dir>/<voice_id>/``. Cloning again under a name that maps to
        the same voice_id overwrites the existing files.

        Args:
            audio_url: Public URL to reference audio (WAV/MP3, 3+ seconds).
            name: Human-readable name for this cloned voice.
            reference_text: Optional transcript of what's said in the audio
                (improves quality).

        Returns:
            dict with: voice_id, name, embedding_url, embedding_size,
            reference_text, source_audio_url, created_at, clone_time_ms,
            provider, fal_response

        Raises:
            RuntimeError: if FAL_KEY is missing, fal.ai answers with an HTTP
                error status, or the response carries no embedding URL.
        """
        if not self.api_key:
            raise RuntimeError("FAL_KEY not set — cannot clone voice")

        t = time.time()
        logger.info(f"[Qwen3] Cloning voice '{name}' from {audio_url[:80]}")

        payload = {"audio_url": audio_url}
        if reference_text:
            payload["reference_text"] = reference_text

        try:
            result = _fal_request(self.api_key, FAL_CLONE_1_7B, payload,
                                  timeout=120.0)
        except httpx.HTTPStatusError as e:
            raise RuntimeError(
                f"fal.ai clone error {e.response.status_code}: {e.response.text}"
            ) from e

        # Primary response shape first, then the alternate one.
        embedding_url = (
            self._result_url(result, 'speaker_embedding')
            or self._result_url(result, 'audio')
        )
        if not embedding_url:
            raise RuntimeError(f"No embedding URL in fal.ai response: {result}")

        # Measured before the download, so it reflects the clone call only.
        elapsed_ms = int((time.time() - t) * 1000)

        # Download and persist the embedding locally
        embedding_bytes = _fal_download(embedding_url)

        voice_id = self._make_clone_voice_id(name)
        voice_dir = _get_clones_dir() / voice_id
        voice_dir.mkdir(parents=True, exist_ok=True)
        (voice_dir / "embedding.safetensors").write_bytes(embedding_bytes)

        metadata = {
            "voice_id": voice_id,
            "name": name,
            "embedding_url": embedding_url,
            "embedding_size": len(embedding_bytes),
            "reference_text": reference_text,
            "source_audio_url": audio_url,
            "created_at": time.strftime("%Y-%m-%dT%H:%M:%SZ", time.gmtime()),
            "clone_time_ms": elapsed_ms,
            "provider": "qwen3",
            "fal_response": result,
        }
        with open(voice_dir / "metadata.json", 'w', encoding="utf-8") as f:
            json.dump(metadata, f, indent=2)

        logger.info(
            f"[Qwen3] Voice cloned: {voice_id} ({len(embedding_bytes)} bytes) "
            f"in {elapsed_ms}ms"
        )
        return metadata

    def list_cloned_voices(self) -> list:
        """List all locally stored cloned voice embeddings.

        Every returned dict is guaranteed to have ``voice_id`` and ``name``
        (falling back to the directory name / voice_id when the stored
        metadata lacks them) plus a ``has_embedding`` flag. Entries whose
        metadata cannot be read are logged and skipped.
        """
        clones_dir = _get_clones_dir()
        if not clones_dir.exists():
            return []

        voices = []
        for voice_dir in sorted(clones_dir.iterdir()):
            meta_path = voice_dir / "metadata.json"
            if not meta_path.exists():
                continue
            try:
                with open(meta_path, encoding="utf-8") as f:
                    meta = json.load(f)
                if not isinstance(meta, dict):
                    raise ValueError("metadata is not a JSON object")
            except (OSError, ValueError) as e:
                logger.warning(f"Bad voice metadata in {voice_dir}: {e}")
                continue
            # Hand-edited or partial metadata must not break list_voices/get_info.
            if not meta.get("voice_id"):
                meta["voice_id"] = voice_dir.name
            if not meta.get("name"):
                meta["name"] = meta["voice_id"]
            meta["has_embedding"] = (voice_dir / "embedding.safetensors").exists()
            voices.append(meta)
        return voices

    def get_clone_embedding_url(self, voice_id: str) -> Optional[str]:
        """Get the fal.ai embedding URL for a cloned voice.

        Returns the cached remote URL from metadata. The embedding is also
        stored locally as a fallback, but fal.ai needs the URL for generation.
        Returns None when the voice is unknown, its metadata is unreadable, or
        ``voice_id`` is not a safe single path component.
        """
        meta = self._load_clone_metadata(voice_id)
        return meta.get("embedding_url") if meta else None

    # ------------------------------------------------------------------
    # Speech generation
    # ------------------------------------------------------------------

    def generate_speech(self, text: str, voice: str = 'Vivian', **kwargs) -> bytes:
        """
        Generate speech via fal.ai Qwen3-TTS.

        Args:
            text: Text to synthesize.
            voice: Built-in voice name OR cloned voice_id (clone_xxx).
                An unknown built-in name falls back to 'Vivian'.
            **kwargs:
                language: Language name (default 'English').
                prompt: Style/emotion instruction for 1.7B model.
                speaker_embedding_url: Direct embedding URL override.
                reference_text: Reference text for cloned voice quality.
                model: '1.7b' (default); any other value selects the 0.6b
                    endpoint.

        Returns:
            Audio bytes downloaded from the URL in the fal.ai response.

        Raises:
            RuntimeError: if FAL_KEY is missing, a cloned voice cannot be
                resolved, the fal.ai request fails, or the response has no
                audio URL.
        """
        if not self.api_key:
            raise RuntimeError("FAL_KEY not set — cannot call fal.ai API")

        # validate_text is inherited from TTSProvider.
        self.validate_text(text)

        language = kwargs.get('language', 'English')
        prompt = kwargs.get('prompt', '')
        embedding_url = kwargs.get('speaker_embedding_url')
        reference_text = kwargs.get('reference_text', '')
        model = kwargs.get('model', '1.7b')

        endpoint = FAL_TTS_1_7B if model == '1.7b' else FAL_TTS_0_6B

        # Resolve cloned voice → embedding URL (single metadata read)
        is_cloned = bool(voice) and voice.startswith("clone_")
        if is_cloned and not embedding_url:
            meta = self._load_clone_metadata(voice) or {}
            embedding_url = meta.get("embedding_url")
            if not embedding_url:
                raise RuntimeError(
                    f"Cloned voice '{voice}' not found or missing embedding"
                )
            # Load reference_text from metadata if not provided
            if not reference_text:
                reference_text = meta.get("reference_text", "")

        payload = {
            "text": text,
            "language": language,
        }

        if embedding_url:
            # Cloned voice — use embedding, skip built-in voice
            payload["speaker_voice_embedding_file_url"] = embedding_url
            if reference_text:
                payload["reference_text"] = reference_text
        else:
            # Built-in voice
            if voice not in BUILTIN_VOICES:
                logger.warning(f"Unknown voice '{voice}', falling back to Vivian")
                voice = 'Vivian'
            payload["voice"] = voice
        if prompt:
            payload["prompt"] = prompt

        t = time.time()
        voice_label = f"{voice} (cloned)" if is_cloned else voice
        logger.info(f"[Qwen3] TTS: '{text[:60]}...' voice={voice_label}")

        try:
            result = _fal_request(self.api_key, endpoint, payload)
        except httpx.HTTPStatusError as e:
            raise RuntimeError(
                f"fal.ai API error {e.response.status_code}: {e.response.text}"
            ) from e
        except Exception as e:
            raise RuntimeError(f"fal.ai request failed: {e}") from e

        audio_url = self._result_url(result, 'audio')
        if not audio_url:
            raise RuntimeError(f"No audio URL in fal.ai response: {result}")

        audio_bytes = _fal_download(audio_url)

        elapsed = int((time.time() - t) * 1000)
        logger.info(f"[Qwen3] Generated {len(audio_bytes)} bytes in {elapsed}ms")
        return audio_bytes

    # ------------------------------------------------------------------
    # Provider interface
    # ------------------------------------------------------------------

    def health_check(self) -> dict:
        """Probe ``https://fal.run/`` and report reachability.

        Any HTTP response counts as reachable; the status code is not
        inspected. Returns a dict with ``ok``, ``latency_ms`` and ``detail``.
        """
        if not self.api_key:
            return {"ok": False, "latency_ms": 0, "detail": "FAL_KEY not set"}
        t = time.time()
        try:
            with httpx.Client(timeout=httpx.Timeout(8.0)) as client:
                client.get(
                    "https://fal.run/",
                    headers={"Authorization": f"Key {self.api_key}"},
                )
            latency_ms = int((time.time() - t) * 1000)
            return {
                "ok": True, "latency_ms": latency_ms,
                "detail": "fal.ai reachable — Qwen3-TTS ready",
            }
        except Exception as e:
            latency_ms = int((time.time() - t) * 1000)
            return {"ok": False, "latency_ms": latency_ms, "detail": str(e)}

    def list_voices(self) -> list:
        """Return built-in voice names followed by the ids of local clones."""
        voices = BUILTIN_VOICES.copy()
        voices.extend(clone["voice_id"] for clone in self.list_cloned_voices())
        return voices

    def get_default_voice(self) -> str:
        """Return the voice used when the caller does not pick one."""
        return 'Vivian'

    def is_available(self) -> bool:
        """Return True when an API key is configured."""
        return bool(self.api_key)

    def get_info(self) -> dict:
        """Return static provider metadata plus the current list of local clones."""
        cloned = self.list_cloned_voices()
        return {
            'name': 'Qwen3-TTS (fal.ai)',
            'provider_id': 'qwen3',
            'status': self._status,
            'description': (
                'Qwen3-TTS via fal.ai — expressive, multilingual, '
                'voice cloning, emotion control'
            ),
            'quality': 'very-high',
            'latency': 'fast',
            'cost_per_minute': 0.003,
            'voices': BUILTIN_VOICES.copy(),
            'cloned_voices': [
                {"voice_id": c["voice_id"], "name": c["name"]}
                for c in cloned
            ],
            'features': [
                'multilingual', 'expressive', 'voice-cloning',
                'emotion-control', 'cloud', 'mp3-output',
            ],
            'requires_api_key': True,
            'languages': ['en', 'zh', 'es', 'fr', 'de', 'it', 'ja', 'ko', 'pt', 'ru'],
            'max_characters': 5000,
            'notes': 'Qwen3-TTS 1.7B + 0.6B. Voice cloning via clone-voice endpoint. FAL_KEY required.',
            'default_voice': 'Vivian',
            'audio_format': 'mp3',
            'sample_rate': 24000,
            'error': self._init_error,
        }
@@ -0,0 +1,315 @@
1
+ """
2
+ Resemble AI TTS Provider — Chatterbox models via Resemble API.
3
+
4
+ Supports:
5
+ - HTTP streaming TTS (chunked WAV, progressive playback)
6
+ - Multiple models: chatterbox (original), chatterbox-turbo, chatterbox-multilingual
7
+ - Voice cloning via Resemble dashboard (voice_uuid per clone)
8
+ - SSML support (prosody, emphasis, breaks, prompts)
9
+ - Emotion/exaggeration control
10
+ - 90+ languages (multilingual model)
11
+ - 8-48kHz sample rate, PCM_16/24/32/MULAW
12
+
13
+ API key: RESEMBLE_API_KEY env var
14
+ Synthesis server: https://f.cluster.resemble.ai
15
+ API server: https://app.resemble.ai/api/v2
16
+ """
17
+
18
+ import os
19
+ import io
20
+ import time
21
+ import logging
22
+ import threading
23
+
24
+ import httpx
25
+
26
+ from .base_provider import TTSProvider
27
+
28
+ logger = logging.getLogger(__name__)
29
+
30
# Resemble hosts: one for synthesis (streaming), one for the management API.
SYNTHESIS_URL = "https://f.cluster.resemble.ai/stream"
API_BASE_URL = "https://app.resemble.ai/api/v2"

# Model id -> short human-readable description.
MODELS = {
    "chatterbox": "Default Chatterbox — emotion exaggeration + CFG control",
    "chatterbox-turbo": "Chatterbox Turbo — lowest latency, paralinguistic tags",
    "chatterbox-multilingual": "Chatterbox Multilingual — 23+ languages",
}

DEFAULT_MODEL = "chatterbox-turbo"

# Timeouts, in seconds.
STREAM_TIMEOUT = 30.0   # upper bound for a synthesis response
CONNECT_TIMEOUT = 10.0  # TCP connect
API_TIMEOUT = 15.0      # voice listing and other non-synthesis calls

# Voice cache kept at module level so it is shared by every ResembleProvider
# instance: list_providers() builds new instances on each call, so a cache
# stored on the instance would be thrown away. Lives for the whole process.
_voices_cache_global = None       # list of voice dicts once fetched
_voices_cache_time_global = 0     # time.time() of the last successful fetch
_voices_loading_global = False    # True while a background fetch is running
54
+
55
+
56
class ResembleProvider(TTSProvider):
    """
    TTS Provider using Resemble AI's Chatterbox API.

    Posts to the Resemble streaming synthesis endpoint and returns the full
    response body once it has been received.
    Voices are managed via Resemble dashboard — each voice has a UUID.

    Output: WAV audio bytes (PCM_16, configurable sample rate)
    Cost: pay-as-you-go, character-based
    """

    # Seconds a fetched voice list is considered current.
    _VOICE_CACHE_TTL_SECONDS = 300
    # Maximum length of the ``data`` field sent to the synthesis endpoint.
    _MAX_CHARS = 2000

    def __init__(self):
        """Read ``RESEMBLE_API_KEY`` and start a background voice-list refresh if needed."""
        super().__init__()
        self.api_key = os.getenv('RESEMBLE_API_KEY', '')
        self._status = 'active' if self.api_key else 'error'
        self._init_error = None if self.api_key else 'RESEMBLE_API_KEY not set'

        # Warm (or refresh) the global voice cache in the background. Instances
        # are created often, so this is what keeps the cache within its TTL
        # without ever blocking a caller.
        global _voices_loading_global
        if (self.api_key and not _voices_loading_global
                and not self._voice_cache_is_fresh()):
            _voices_loading_global = True
            t = threading.Thread(target=self._fetch_voices_from_api, daemon=True)
            t.start()

    def _auth_headers(self):
        """Return the bearer-token JSON headers used for every Resemble call."""
        return {
            'Authorization': f'Bearer {self.api_key}',
            'Content-Type': 'application/json',
        }

    # ------------------------------------------------------------------
    # Voice listing (cached from Resemble API)
    # ------------------------------------------------------------------

    @classmethod
    def _voice_cache_is_fresh(cls) -> bool:
        """Return True when a voice list has been fetched within the TTL.

        An empty list counts as a valid cached result, so an account with no
        ready voices is not re-queried on every call.
        """
        if _voices_cache_global is None:
            return False
        return (time.time() - _voices_cache_time_global) < cls._VOICE_CACHE_TTL_SECONDS

    def _fetch_voices_from_api(self) -> list:
        """Fetch available voices from Resemble API. Cached globally for 5 minutes.

        Walks every page of ``/voices`` and keeps voices whose ``voice_status``
        is ``'Ready'``. On any failure the previous cache (or an empty list)
        is returned and the cache is left untouched.

        Returns:
            List of dicts with keys ``id``, ``name``, ``language``, ``streaming``.
        """
        global _voices_cache_global, _voices_cache_time_global, _voices_loading_global
        try:
            if self._voice_cache_is_fresh():
                return _voices_cache_global

            now = time.time()
            voices = []
            page = 1
            with httpx.Client(timeout=httpx.Timeout(API_TIMEOUT)) as client:
                while True:
                    resp = client.get(
                        f"{API_BASE_URL}/voices",
                        params={"page": page, "page_size": 50},
                        headers=self._auth_headers(),
                    )
                    resp.raise_for_status()
                    data = resp.json()

                    for v in data.get('items', []):
                        if v.get('voice_status') != 'Ready':
                            continue
                        # api_support may be present but null; treat as "no info".
                        api_support = v.get('api_support') or {}
                        voices.append({
                            'id': v.get('uuid', ''),
                            'name': v.get('name', 'Unknown'),
                            'language': v.get('default_language', 'en'),
                            'streaming': api_support.get('streaming', False),
                        })

                    if page >= data.get('num_pages', 1):
                        break
                    page += 1

            _voices_cache_global = voices
            _voices_cache_time_global = now
            logger.info(f"[Resemble] Fetched {len(voices)} voices from API")
            return voices

        except Exception as e:
            logger.warning(f"[Resemble] Failed to fetch voices: {e}")
            return _voices_cache_global or []
        finally:
            # Always clear the flag, including on the cached early return,
            # so a later instance can start a refresh.
            _voices_loading_global = False

    # ------------------------------------------------------------------
    # Speech generation (HTTP streaming)
    # ------------------------------------------------------------------

    def generate_speech(self, text: str, voice: str = '', **kwargs) -> bytes:
        """
        Generate speech via Resemble streaming API.

        Args:
            text: Text or SSML to synthesize (max 2000 chars; longer input is
                truncated).
            voice: Resemble voice UUID or display name. If empty, uses
                RESEMBLE_VOICE_UUID env var.
            **kwargs:
                model: 'chatterbox', 'chatterbox-turbo', or 'chatterbox-multilingual'
                    (omitted from the request unless given)
                sample_rate: 8000-48000 (default 24000)
                precision: 'PCM_16', 'PCM_24', 'PCM_32', 'MULAW' (default PCM_16)
                exaggeration: 0.0-1.0 emotion intensity (via SSML prompt attr)

        Returns:
            WAV audio bytes.

        Raises:
            RuntimeError: if the API key or voice is missing, the request
                fails or times out, or the response is under 100 bytes.
        """
        if not self.api_key:
            raise RuntimeError("RESEMBLE_API_KEY not set")

        # validate_text is inherited from TTSProvider.
        self.validate_text(text)

        # Resolve voice — accept UUID or display name
        voice_uuid = voice or os.getenv('RESEMBLE_VOICE_UUID', '')
        if not voice_uuid:
            raise RuntimeError(
                "No voice_uuid provided and RESEMBLE_VOICE_UUID not set. "
                "Create a voice at app.resemble.ai and set the UUID."
            )

        # If the voice looks like a name (not a short hex UUID), resolve it
        if not all(c in '0123456789abcdef' for c in voice_uuid):
            cache = _voices_cache_global or self._fetch_voices_from_api()
            for v in cache:
                if v['name'] == voice_uuid:
                    logger.info(f"[Resemble] Resolved voice name '{voice_uuid}' → {v['id']}")
                    voice_uuid = v['id']
                    break
            else:
                # Not found: fall through and let the API judge the value as given.
                logger.warning(f"[Resemble] Voice name '{voice_uuid}' not found in {len(cache)} voices")

        model = kwargs.get('model', '')
        sample_rate = kwargs.get('sample_rate', 24000)
        precision = kwargs.get('precision', 'PCM_16')
        exaggeration = kwargs.get('exaggeration')

        # Wrap in SSML if exaggeration is set
        if exaggeration is not None and not text.strip().startswith('<speak'):
            open_tag = f'<speak exaggeration="{exaggeration}">'
            close_tag = '</speak>'
            # Trim the inner text BEFORE wrapping so the length cap below can
            # never cut off the closing tag and produce malformed SSML.
            room = max(0, self._MAX_CHARS - len(open_tag) - len(close_tag))
            text = f'{open_tag}{text[:room]}{close_tag}'

        payload = {
            'voice_uuid': voice_uuid,
            'data': text[:self._MAX_CHARS],  # API limit
            'precision': precision,
            'sample_rate': sample_rate,
        }

        # Only include model if explicitly requested — API defaults to
        # the correct model for each voice. Forcing chatterbox-turbo on
        # voices that don't support it returns 500.
        if model:
            payload['model'] = model

        t = time.time()
        logger.info(
            f"[Resemble] TTS: '{text[:60]}...' model={model} "
            f"voice={voice_uuid[:12]}..."
        )

        try:
            with httpx.Client(
                timeout=httpx.Timeout(STREAM_TIMEOUT, connect=CONNECT_TIMEOUT)
            ) as client:
                resp = client.post(
                    SYNTHESIS_URL,
                    json=payload,
                    headers=self._auth_headers(),
                )
                resp.raise_for_status()
                audio_bytes = resp.content

        except httpx.HTTPStatusError as e:
            status = e.response.status_code
            body = e.response.text[:200]
            raise RuntimeError(f"Resemble API error {status}: {body}") from e
        except httpx.TimeoutException as e:
            raise RuntimeError(
                f"Resemble API timeout after {STREAM_TIMEOUT}s"
            ) from e
        except Exception as e:
            raise RuntimeError(f"Resemble request failed: {e}") from e

        elapsed = int((time.time() - t) * 1000)
        logger.info(f"[Resemble] Generated {len(audio_bytes)} bytes in {elapsed}ms")

        # A near-empty body cannot be usable audio even after a 2xx status.
        if len(audio_bytes) < 100:
            raise RuntimeError(
                f"Resemble returned suspiciously small response ({len(audio_bytes)} bytes)"
            )

        return audio_bytes

    # ------------------------------------------------------------------
    # Provider interface
    # ------------------------------------------------------------------

    def health_check(self) -> dict:
        """Request one voice from the API and report success and latency.

        Returns a dict with ``ok``, ``latency_ms`` and ``detail``; a non-2xx
        status or any request error yields ``ok: False``.
        """
        if not self.api_key:
            return {"ok": False, "latency_ms": 0, "detail": "RESEMBLE_API_KEY not set"}
        t = time.time()
        try:
            with httpx.Client(timeout=httpx.Timeout(API_TIMEOUT)) as client:
                resp = client.get(
                    f"{API_BASE_URL}/voices",
                    params={"page": 1, "page_size": 1},
                    headers=self._auth_headers(),
                )
                resp.raise_for_status()
            latency_ms = int((time.time() - t) * 1000)
            return {
                "ok": True, "latency_ms": latency_ms,
                "detail": "Resemble API reachable — Chatterbox ready",
            }
        except Exception as e:
            latency_ms = int((time.time() - t) * 1000)
            return {"ok": False, "latency_ms": latency_ms, "detail": str(e)}

    def list_voices(self) -> list:
        """Return the UUIDs of all ready voices, fetching the list if not cached."""
        voices = _voices_cache_global or self._fetch_voices_from_api()
        return [v['id'] for v in voices] if voices else []

    def get_default_voice(self) -> str:
        """Return the ``RESEMBLE_VOICE_UUID`` env value, or '' when unset."""
        return os.getenv('RESEMBLE_VOICE_UUID', '')

    def is_available(self) -> bool:
        """Return True when an API key is configured."""
        return bool(self.api_key)

    def get_info(self) -> dict:
        """Return provider metadata; voice names come from the cache only."""
        # Use global cache — populated by background thread on first init.
        # Never fetch synchronously here; that blocks the settings panel.
        cached_names = [v['name'] for v in _voices_cache_global] if _voices_cache_global else []
        return {
            'name': 'Resemble AI (Chatterbox)',
            'provider_id': 'resemble',
            'status': self._status,
            'description': (
                'Resemble AI Chatterbox — streaming TTS, voice cloning, '
                'emotion control, SSML, 90+ languages'
            ),
            'quality': 'very-high',
            'latency': 'very-fast',
            'cost_per_minute': 0.10,
            'voices': cached_names,
            'features': [
                'streaming', 'voice-cloning', 'emotion-control',
                'ssml', 'multilingual', 'cloud', 'wav-output',
                'paralinguistic-tags',
            ],
            'requires_api_key': True,
            'languages': [
                'en', 'es', 'fr', 'de', 'it', 'pt', 'ja', 'ko', 'zh',
                'ar', 'ru', 'hi', 'nl', 'pl', 'sv', 'da', 'fi', 'el',
                'cs', 'hu', 'ro', 'tr', 'uk', 'vi', 'th', 'id',
            ],
            'max_characters': 2000,
            'notes': (
                'Streaming HTTP TTS via f.cluster.resemble.ai. '
                'Models: chatterbox-turbo (fastest), chatterbox (emotion), '
                'chatterbox-multilingual (23 langs). '
                'Voice cloning via Resemble dashboard. '
                'RESEMBLE_API_KEY + RESEMBLE_VOICE_UUID required.'
            ),
            'default_voice': os.getenv('RESEMBLE_VOICE_UUID', ''),
            'audio_format': 'wav',
            'sample_rate': 24000,
            'models': list(MODELS.keys()),
            'error': self._init_error,
        }