openvoiceui 1.0.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (185) hide show
  1. package/.env.example +104 -0
  2. package/Dockerfile +30 -0
  3. package/LICENSE +21 -0
  4. package/README.md +638 -0
  5. package/SETUP.md +360 -0
  6. package/app.py +232 -0
  7. package/auto-approve-devices.js +111 -0
  8. package/cli/index.js +372 -0
  9. package/config/__init__.py +4 -0
  10. package/config/default.yaml +43 -0
  11. package/config/flags.yaml +67 -0
  12. package/config/loader.py +203 -0
  13. package/config/providers.yaml +71 -0
  14. package/config/speech_normalization.yaml +182 -0
  15. package/config/theme.json +4 -0
  16. package/data/greetings.json +25 -0
  17. package/default-pages/ai-image-creator.html +915 -0
  18. package/default-pages/bulk-image-uploader.html +492 -0
  19. package/default-pages/desktop.html +2865 -0
  20. package/default-pages/file-explorer.html +854 -0
  21. package/default-pages/interactive-map.html +655 -0
  22. package/default-pages/style-guide.html +1005 -0
  23. package/default-pages/website-setup.html +1623 -0
  24. package/deploy/openclaw/Dockerfile +46 -0
  25. package/deploy/openvoiceui.service +30 -0
  26. package/deploy/setup-nginx.sh +50 -0
  27. package/deploy/setup-sudo.sh +306 -0
  28. package/deploy/skill-runner/Dockerfile +19 -0
  29. package/deploy/skill-runner/requirements.txt +14 -0
  30. package/deploy/skill-runner/server.py +269 -0
  31. package/deploy/supertonic/Dockerfile +22 -0
  32. package/deploy/supertonic/server.py +79 -0
  33. package/docker-compose.pinokio.yml +11 -0
  34. package/docker-compose.yml +59 -0
  35. package/greetings.json +25 -0
  36. package/index.html +65 -0
  37. package/inject-device-identity.js +142 -0
  38. package/package.json +82 -0
  39. package/profiles/default.json +114 -0
  40. package/profiles/manager.py +354 -0
  41. package/profiles/schema.json +337 -0
  42. package/prompts/voice-system-prompt.md +149 -0
  43. package/providers/__init__.py +39 -0
  44. package/providers/base.py +63 -0
  45. package/providers/llm/__init__.py +12 -0
  46. package/providers/llm/base.py +71 -0
  47. package/providers/llm/clawdbot_provider.py +112 -0
  48. package/providers/llm/zai_provider.py +115 -0
  49. package/providers/registry.py +320 -0
  50. package/providers/stt/__init__.py +12 -0
  51. package/providers/stt/base.py +58 -0
  52. package/providers/stt/webspeech_provider.py +49 -0
  53. package/providers/stt/whisper_provider.py +100 -0
  54. package/providers/tts/__init__.py +20 -0
  55. package/providers/tts/base.py +91 -0
  56. package/providers/tts/groq_provider.py +74 -0
  57. package/providers/tts/supertonic_provider.py +72 -0
  58. package/requirements.txt +38 -0
  59. package/routes/__init__.py +10 -0
  60. package/routes/admin.py +515 -0
  61. package/routes/canvas.py +1315 -0
  62. package/routes/chat.py +51 -0
  63. package/routes/conversation.py +2158 -0
  64. package/routes/elevenlabs_hybrid.py +306 -0
  65. package/routes/greetings.py +98 -0
  66. package/routes/icons.py +279 -0
  67. package/routes/image_gen.py +364 -0
  68. package/routes/instructions.py +190 -0
  69. package/routes/music.py +838 -0
  70. package/routes/onboarding.py +43 -0
  71. package/routes/pi.py +62 -0
  72. package/routes/profiles.py +215 -0
  73. package/routes/report_issue.py +68 -0
  74. package/routes/static_files.py +533 -0
  75. package/routes/suno.py +664 -0
  76. package/routes/theme.py +81 -0
  77. package/routes/transcripts.py +199 -0
  78. package/routes/vision.py +348 -0
  79. package/routes/workspace.py +288 -0
  80. package/server.py +1510 -0
  81. package/services/__init__.py +1 -0
  82. package/services/auth.py +143 -0
  83. package/services/canvas_versioning.py +239 -0
  84. package/services/db_pool.py +107 -0
  85. package/services/gateway.py +16 -0
  86. package/services/gateway_manager.py +333 -0
  87. package/services/gateways/__init__.py +12 -0
  88. package/services/gateways/base.py +110 -0
  89. package/services/gateways/compat.py +264 -0
  90. package/services/gateways/openclaw.py +1134 -0
  91. package/services/health.py +100 -0
  92. package/services/memory_client.py +455 -0
  93. package/services/paths.py +26 -0
  94. package/services/speech_normalizer.py +285 -0
  95. package/services/tts.py +270 -0
  96. package/setup-config.js +262 -0
  97. package/sounds/air_horn.mp3 +0 -0
  98. package/sounds/bruh.mp3 +0 -0
  99. package/sounds/crowd_cheer.mp3 +0 -0
  100. package/sounds/gunshot.mp3 +0 -0
  101. package/sounds/impact.mp3 +0 -0
  102. package/sounds/lets_go.mp3 +0 -0
  103. package/sounds/record_stop.mp3 +0 -0
  104. package/sounds/rewind.mp3 +0 -0
  105. package/sounds/sad_trombone.mp3 +0 -0
  106. package/sounds/scratch_long.mp3 +0 -0
  107. package/sounds/yeah.mp3 +0 -0
  108. package/src/adapters/ClawdBotAdapter.js +264 -0
  109. package/src/adapters/_template.js +133 -0
  110. package/src/adapters/elevenlabs-classic.js +841 -0
  111. package/src/adapters/elevenlabs-hybrid.js +812 -0
  112. package/src/adapters/hume-evi.js +676 -0
  113. package/src/admin.html +1339 -0
  114. package/src/app.js +8802 -0
  115. package/src/core/Config.js +173 -0
  116. package/src/core/EmotionEngine.js +307 -0
  117. package/src/core/EventBridge.js +180 -0
  118. package/src/core/EventBus.js +117 -0
  119. package/src/core/VoiceSession.js +607 -0
  120. package/src/face/BaseFace.js +259 -0
  121. package/src/face/EyeFace.js +208 -0
  122. package/src/face/HaloSmokeFace.js +509 -0
  123. package/src/face/manifest.json +27 -0
  124. package/src/face/previews/eyes.svg +16 -0
  125. package/src/face/previews/orb.svg +29 -0
  126. package/src/features/MusicPlayer.js +620 -0
  127. package/src/features/Soundboard.js +128 -0
  128. package/src/providers/DeepgramSTT.js +472 -0
  129. package/src/providers/DeepgramStreamingSTT.js +766 -0
  130. package/src/providers/GroqSTT.js +559 -0
  131. package/src/providers/TTSPlayer.js +323 -0
  132. package/src/providers/WebSpeechSTT.js +479 -0
  133. package/src/providers/tts/BaseTTSProvider.js +81 -0
  134. package/src/providers/tts/HumeProvider.js +77 -0
  135. package/src/providers/tts/SupertonicProvider.js +174 -0
  136. package/src/providers/tts/index.js +140 -0
  137. package/src/shell/adapter-registry.js +154 -0
  138. package/src/shell/caller-bridge.js +35 -0
  139. package/src/shell/camera-bridge.js +28 -0
  140. package/src/shell/canvas-bridge.js +32 -0
  141. package/src/shell/commercial-bridge.js +44 -0
  142. package/src/shell/face-bridge.js +44 -0
  143. package/src/shell/music-bridge.js +60 -0
  144. package/src/shell/orchestrator.js +233 -0
  145. package/src/shell/profile-discovery.js +303 -0
  146. package/src/shell/sounds-bridge.js +28 -0
  147. package/src/shell/transcript-bridge.js +61 -0
  148. package/src/shell/waveform-bridge.js +33 -0
  149. package/src/styles/base.css +2862 -0
  150. package/src/styles/face.css +417 -0
  151. package/src/styles/pi-overrides.css +89 -0
  152. package/src/styles/theme-dark.css +67 -0
  153. package/src/test-tts.html +175 -0
  154. package/src/ui/AppShell.js +544 -0
  155. package/src/ui/ProfileSwitcher.js +228 -0
  156. package/src/ui/SessionControl.js +240 -0
  157. package/src/ui/face/FacePicker.js +195 -0
  158. package/src/ui/face/FaceRenderer.js +309 -0
  159. package/src/ui/settings/PlaylistEditor.js +366 -0
  160. package/src/ui/settings/SettingsPanel.css +684 -0
  161. package/src/ui/settings/SettingsPanel.js +419 -0
  162. package/src/ui/settings/TTSVoicePreview.js +210 -0
  163. package/src/ui/themes/ThemeManager.js +213 -0
  164. package/src/ui/visualizers/BaseVisualizer.js +29 -0
  165. package/src/ui/visualizers/PartyFXVisualizer.css +291 -0
  166. package/src/ui/visualizers/PartyFXVisualizer.js +637 -0
  167. package/static/emulators/jsdos/js-dos.css +1 -0
  168. package/static/emulators/jsdos/js-dos.js +22 -0
  169. package/static/favicon.svg +55 -0
  170. package/static/icons/apple-touch-icon.png +0 -0
  171. package/static/icons/favicon-32.png +0 -0
  172. package/static/icons/icon-192.png +0 -0
  173. package/static/icons/icon-512.png +0 -0
  174. package/static/install.html +449 -0
  175. package/static/manifest.json +26 -0
  176. package/static/sw.js +21 -0
  177. package/tts_providers/__init__.py +136 -0
  178. package/tts_providers/base_provider.py +319 -0
  179. package/tts_providers/groq_provider.py +155 -0
  180. package/tts_providers/hume_provider.py +226 -0
  181. package/tts_providers/providers_config.json +119 -0
  182. package/tts_providers/qwen3_provider.py +371 -0
  183. package/tts_providers/resemble_provider.py +315 -0
  184. package/tts_providers/supertonic_provider.py +557 -0
  185. package/tts_providers/supertonic_tts.py +399 -0
@@ -0,0 +1,285 @@
1
+ """
2
+ services/speech_normalizer.py — Speech Normalization Service
3
+
4
+ Cleans and normalizes LLM response text before it is sent to TTS providers.
5
+ Rules are loaded from config/speech_normalization.yaml (ADR-001).
6
+
7
+ Usage:
8
+ from services.speech_normalizer import SpeechNormalizer
9
+
10
+ normalizer = SpeechNormalizer()
11
+ clean_text = normalizer.normalize("Hello **world**! Check https://example.com", profile_id="default")
12
+
13
+ The normalizer supports per-profile rule overrides defined in
14
+ config/speech_normalization.yaml under the `profiles:` key.
15
+ """
16
+
17
+ from __future__ import annotations
18
+
19
+ import logging
20
+ import os
21
+ import re
22
+ from pathlib import Path
23
+ from typing import Any, Dict, Optional
24
+
25
# Module-level logger, named after this module.
logger = logging.getLogger(__name__)

# Default config file location: two directory levels up from this file,
# then config/speech_normalization.yaml. Used when SpeechNormalizer is
# constructed without an explicit config_path.
_DEFAULT_CONFIG_PATH = Path(__file__).parent.parent / "config" / "speech_normalization.yaml"
29
+
30
+
31
class SpeechNormalizer:
    """
    Normalizes text for TTS by applying a configurable pipeline of rules:

      1. Strip markdown formatting (headers, bold, code blocks, etc.)
      2. Strip URLs
      3. Strip emoji
      4. Expand abbreviations (API → A P I, etc.)
      5. Collapse whitespace
      6. Trim leading/trailing whitespace
      7. Enforce max_length (hard cap)

    Rules are loaded from speech_normalization.yaml.
    Per-profile overrides are merged on top of global defaults.
    """

    # URL regex used when the config supplies none (or an unusable one).
    _FALLBACK_URL_PATTERN = r"https?://[^\s]+"

    # Suffix appended when text is cut mid-sentence.
    _ELLIPSIS = "..."

    # Sentence terminators considered when looking for a clean cut point.
    _SENTENCE_ENDS = (". ", "! ", "? ")

    # Compiled once for the class. The ranges are deliberately narrow: a single
    # span from U+24C2 up to U+1F251 would also cover CJK, Hangul and many
    # other scripts and silently delete ordinary non-Latin text.
    _EMOJI_RE = re.compile(
        "["
        "\U0001F000-\U0001F2FF"  # Mahjong/domino/cards, enclosed supplements, flags
        "\U0001F300-\U0001F9FF"  # Misc symbols and pictographs, emoticons, transport
        "\U0001FA00-\U0001FAFF"  # Chess, medical etc.
        "\U00002600-\U000027BF"  # Misc symbols and dingbats
        "\U00002B00-\U00002BFF"  # Misc symbols and arrows (stars, squares)
        "\U000025A0-\U000025FF"  # Geometric shapes (play/stop style glyphs)
        "\U000024C2"             # Circled M
        "\U0000FE0F"             # Variation selector-16 left behind by emoji
        "]+",
        flags=re.UNICODE,
    )

    # Built-in markdown rules as (pattern, replacement, flags), applied in order
    # when the config has no `markdown_patterns`. Order matters:
    #   * images before links, otherwise "![alt](url)" degrades to "!alt";
    #   * horizontal rules and line-prefix markers (headers, quotes, bullets,
    #     numbers) before emphasis, otherwise a leading "* " bullet can be
    #     paired with a later "*" and treated as italics.
    _BUILTIN_MARKDOWN_RULES = (
        (r"```[\s\S]*?```", "", 0),
        (r"`[^`]+`", "", 0),
        (r"!\[[^\]]*\]\([^)]+\)", "", 0),
        (r"\[([^\]]+)\]\([^)]+\)", r"\1", 0),
        (r"^[-*_]{3,}$", "", re.MULTILINE),
        (r"^#{1,6}\s+", "", re.MULTILINE),
        (r"^>\s*", "", re.MULTILINE),
        (r"^[\-\*\+]\s+", "", re.MULTILINE),
        (r"^\d+\.\s+", "", re.MULTILINE),
        (r"\*\*(.+?)\*\*", r"\1", 0),
        (r"__(.+?)__", r"\1", 0),
        (r"\*(.+?)\*", r"\1", 0),
        (r"_(.+?)_", r"\1", 0),
        (r"~~(.+?)~~", r"\1", 0),
    )

    def __init__(self, config_path: Optional[str] = None) -> None:
        """
        Create a normalizer and load its rules immediately.

        Args:
            config_path: Optional path to a YAML rules file. When omitted,
                         the module-level default location is used.
        """
        self._config_path = Path(config_path) if config_path else _DEFAULT_CONFIG_PATH
        self._raw_config: Dict[str, Any] = {}
        self._global: Dict[str, Any] = {}
        self._profile_overrides: Dict[str, Dict[str, Any]] = {}
        self._load_config()

    # ── Config loading ─────────────────────────────────────────────────────────

    @staticmethod
    def _as_mapping(value: Any) -> Dict[str, Any]:
        """Return *value* if it is a dict, otherwise an empty dict (YAML `null`, lists, ...)."""
        return value if isinstance(value, dict) else {}

    def _use_builtin_config(self) -> None:
        """Reset every piece of loaded state to the built-in defaults."""
        self._raw_config = {}
        self._global = self._builtin_defaults()
        self._profile_overrides = {}

    def _load_config(self) -> None:
        """
        Load and parse speech_normalization.yaml.

        On a missing file or any load/parse error the instance falls back to
        the built-in defaults. All state is replaced together, so a failed
        reload cannot leave profile overrides or markdown patterns from a
        previous successful load behind.
        """
        if not self._config_path.exists():
            logger.warning(
                "Speech normalization config not found at %s — using built-in defaults",
                self._config_path,
            )
            self._use_builtin_config()
            return

        try:
            import yaml  # type: ignore

            # Explicit encoding: the platform default is not always UTF-8.
            with open(self._config_path, "r", encoding="utf-8") as f:
                raw = yaml.safe_load(f) or {}
            if not isinstance(raw, dict):
                raise ValueError("top-level YAML value must be a mapping")

            merged_global = {**self._builtin_defaults(), **self._as_mapping(raw.get("global"))}
            # Top-level abbreviations are kept under a private key so they do
            # not collide with per-profile `abbreviations`.
            merged_global["_abbreviations"] = self._as_mapping(raw.get("abbreviations"))
            profiles = self._as_mapping(raw.get("profiles"))
        except Exception as exc:
            logger.error("Failed to load speech normalization config: %s — using defaults", exc)
            self._use_builtin_config()
            return

        self._raw_config = raw
        self._global = merged_global
        self._profile_overrides = profiles
        logger.info("Speech normalization config loaded from %s", self._config_path)

    def _builtin_defaults(self) -> Dict[str, Any]:
        """Minimal built-in defaults used when config file is absent."""
        return {
            "strip_markdown": True,
            "strip_urls": True,
            "strip_emoji": True,
            "collapse_whitespace": True,
            "trim": True,
            "max_length": 800,
            "_abbreviations": {},
        }

    def reload(self) -> None:
        """Reload config from disk (e.g. after hot-edit)."""
        self._load_config()
        logger.info("Speech normalization config reloaded")

    # ── Public API ─────────────────────────────────────────────────────────────

    def normalize(self, text: str, profile_id: Optional[str] = None) -> str:
        """
        Apply the full normalization pipeline to *text*.

        Args:
            text:       Raw LLM response text.
            profile_id: Optional agent profile ID. If provided, per-profile
                        overrides from speech_normalization.yaml are merged on
                        top of the global settings.

        Returns:
            Cleaned string ready for TTS input. Falsy input (empty string,
            None) is returned unchanged.
        """
        if not text:
            return text

        cfg = self._merged_config(profile_id)

        # 1. Strip markdown
        if cfg.get("strip_markdown", True):
            text = self._strip_markdown(text)

        # 2. Strip URLs
        if cfg.get("strip_urls", True):
            text = self._strip_urls(text)

        # 3. Strip emoji
        if cfg.get("strip_emoji", True):
            text = self._strip_emoji(text)

        # 4. Expand abbreviations (global + profile-specific; profile wins on clashes)
        abbreviations = {
            **self._as_mapping(self._global.get("_abbreviations")),
            **self._as_mapping(cfg.get("abbreviations")),
        }
        if abbreviations:
            text = self._expand_abbreviations(text, abbreviations)

        # 5. Collapse whitespace. One pass over every whitespace run, so that
        #    "a \n b" becomes "a b" rather than keeping the spaces on both
        #    sides of the replaced newline.
        if cfg.get("collapse_whitespace", True):
            text = re.sub(r"\s+", " ", text)

        # 6. Trim
        if cfg.get("trim", True):
            text = text.strip()

        # 7. Enforce max length (hard cap). A missing, non-integer or
        #    non-positive max_length disables the cap instead of raising.
        max_len = cfg.get("max_length", 800)
        if (
            isinstance(max_len, int)
            and not isinstance(max_len, bool)
            and 0 < max_len < len(text)
        ):
            text = self._truncate(text, max_len)
            logger.debug("Speech normalizer truncated text to %d chars", len(text))

        return text

    def get_config_for_profile(self, profile_id: Optional[str] = None) -> Dict[str, Any]:
        """Return the effective normalized config for a given profile (for inspection/debugging)."""
        return self._merged_config(profile_id)

    # ── Internal helpers ───────────────────────────────────────────────────────

    def _merged_config(self, profile_id: Optional[str]) -> Dict[str, Any]:
        """
        Merge global settings with per-profile overrides.

        The stored override dict is only read, never modified, so repeated or
        overlapping calls cannot observe a half-edited profile. Overrides that
        are not mappings are ignored.

        Returns:
            A new dict. When the profile has an override, its abbreviations
            (possibly empty) are exposed under the "abbreviations" key.
        """
        merged = dict(self._global)
        if not profile_id:
            return merged

        override = self._profile_overrides.get(profile_id)
        if not isinstance(override, dict):
            return merged

        for key, value in override.items():
            if key != "abbreviations":
                merged[key] = value
        # Abbreviations are additive, so they are carried separately and
        # combined with the global ones by normalize().
        merged["abbreviations"] = dict(self._as_mapping(override.get("abbreviations")))
        return merged

    def _truncate(self, text: str, max_len: int) -> str:
        """
        Shorten *text* to at most *max_len* characters.

        Prefers cutting just after the last sentence terminator found in the
        second half of the allowed window; otherwise cuts hard and appends an
        ellipsis that is counted against the limit.
        """
        window = text[:max_len]
        cut = max(window.rfind(mark) for mark in self._SENTENCE_ENDS)
        if cut > max_len // 2:
            return text[: cut + 1]
        if max_len <= len(self._ELLIPSIS):
            # No room for an ellipsis without breaking the cap.
            return window
        return window[: max_len - len(self._ELLIPSIS)].rstrip() + self._ELLIPSIS

    def _strip_markdown(self, text: str) -> str:
        """
        Remove common markdown syntax from text.

        Uses `markdown_patterns` from the loaded config when present (a list
        of mappings with `pattern`, optional `replacement` and `flags`);
        otherwise applies the built-in rule set.
        """
        patterns = self._raw_config.get("markdown_patterns") or []
        if not patterns:
            for pattern, replacement, flags in self._BUILTIN_MARKDOWN_RULES:
                text = re.sub(pattern, replacement, text, flags=flags)
            return text

        for entry in patterns:
            if not isinstance(entry, dict):
                logger.warning("Ignoring non-mapping markdown pattern entry: %r", entry)
                continue
            raw_pattern = entry.get("pattern") or ""
            if not raw_pattern:
                # An empty pattern matches between every character.
                continue
            replacement = entry.get("replacement") or ""
            flags_str = str(entry.get("flags") or "")
            flags = re.MULTILINE if "multiline" in flags_str else 0
            try:
                text = re.sub(raw_pattern, str(replacement), text, flags=flags)
            except (re.error, TypeError) as exc:
                logger.warning("Invalid markdown pattern %r: %s", raw_pattern, exc)
        return text

    def _strip_urls(self, text: str) -> str:
        """Remove HTTP/HTTPS URLs from text, using the configured `url_pattern` if it is usable."""
        url_pattern = self._raw_config.get("url_pattern") or self._FALLBACK_URL_PATTERN
        try:
            return re.sub(url_pattern, "", text)
        except (re.error, TypeError):
            return re.sub(self._FALLBACK_URL_PATTERN, "", text)

    def _strip_emoji(self, text: str) -> str:
        """Remove emoji characters from text while leaving non-Latin scripts intact."""
        return self._EMOJI_RE.sub("", text)

    def _expand_abbreviations(self, text: str, abbreviations: Dict[str, str]) -> str:
        """
        Replace abbreviations with their spoken forms.

        A key only matches when it is not directly adjacent to a word
        character, so "API" inside "RAPID" is not replaced, while keys that
        end in punctuation (e.g. "e.g.") still match before a space.
        Matching is case-sensitive. Longer abbreviations are tried first to
        prevent partial matches. Empty and non-string keys are skipped.
        """
        candidates = [
            (key, spoken)
            for key, spoken in abbreviations.items()
            if isinstance(key, str) and key
        ]
        # Sort by length descending so longer keys match first
        candidates.sort(key=lambda pair: -len(pair[0]))

        for abbrev, spoken in candidates:
            pattern = r"(?<!\w)" + re.escape(abbrev) + r"(?!\w)"
            replacement = str(spoken)
            # A callable replacement inserts the text literally; a plain
            # string would be parsed as a template (backslashes, group refs).
            text = re.sub(pattern, lambda _m, _r=replacement: _r, text)
        return text
252
+
253
+
254
+ # ── Module-level singleton ─────────────────────────────────────────────────────
255
+
256
# Process-wide shared instance; stays None until the first get_normalizer() call.
_normalizer_instance: Optional[SpeechNormalizer] = None


def get_normalizer() -> SpeechNormalizer:
    """Return the shared SpeechNormalizer, constructing it on first use."""
    global _normalizer_instance
    if _normalizer_instance is not None:
        return _normalizer_instance
    _normalizer_instance = SpeechNormalizer()
    return _normalizer_instance
265
+
266
+
267
def normalize_for_tts(text: str, profile_id: Optional[str] = None) -> str:
    """
    Normalize *text* for speech using the module's shared normalizer.

    Args:
        text:       Raw text to normalize.
        profile_id: Optional profile ID selecting per-profile rule overrides.

    Returns:
        Cleaned text ready for TTS.
    """
    shared = get_normalizer()
    return shared.normalize(text, profile_id=profile_id)
279
+
280
+
281
# Public names of this module: the normalizer class, the accessor for the
# shared instance, and the one-call convenience wrapper.
__all__ = [
    "SpeechNormalizer",
    "get_normalizer",
    "normalize_for_tts",
]
@@ -0,0 +1,270 @@
1
+ """
2
+ services/tts.py — Unified TTS Service
3
+
4
+ Consolidates all TTS generation logic from server.py and tts_providers/.
5
+ Provides a single entry point for generating speech audio.
6
+
7
+ Providers:
8
+ - Groq Orpheus TTS (primary, cloud-based)
9
+ - Supertonic TTS (local ONNX, fallback)
10
+
11
+ Usage:
12
+ from services.tts import generate_tts_b64, generate_tts_chunked
13
+
14
+ audio_b64 = generate_tts_b64(text, voice='M1')
15
+ """
16
+
17
+ import base64
18
+ import logging
19
+ import os
20
+ import re
21
+ import struct
22
+ import time
23
+ from typing import Optional
24
+
25
+ logger = logging.getLogger(__name__)
26
+
27
+ # ===== GROQ TTS =====
28
+
29
# Cached Groq SDK client; None until one has been created successfully.
_groq_client = None


def get_groq_client():
    """
    Return the cached Groq client, creating it on first successful call.

    Returns None (after logging a warning) when GROQ_API_KEY is unset or the
    `groq` package cannot be imported; a later call will try again.
    """
    global _groq_client
    if _groq_client is not None:
        return _groq_client

    key = os.getenv('GROQ_API_KEY')
    if not key:
        logger.warning("GROQ_API_KEY not set — Groq TTS unavailable")
        return _groq_client

    try:
        from groq import Groq
        _groq_client = Groq(api_key=key)
        logger.info("Groq TTS client initialized")
    except ImportError:
        logger.warning("groq package not installed — Groq TTS unavailable")
    return _groq_client
47
+
48
+
49
def generate_groq_tts(text: str, voice: str = 'autumn') -> bytes:
    """
    Synthesize *text* with Groq Orpheus (canopylabs/orpheus-v1-english).

    Args:
        text:  Text to synthesize.
        voice: Orpheus voice name (default 'autumn').

    Returns:
        Audio bytes; MP3 is the format requested from the API.

    Raises:
        RuntimeError: If no Groq client is available. Errors raised by the
                      API call itself are not caught here and propagate.
    """
    client = get_groq_client()
    if not client:
        raise RuntimeError("Groq client not available")

    response = client.audio.speech.create(
        model="canopylabs/orpheus-v1-english",
        input=text,
        voice=voice,
        response_format="mp3",
    )
    # The response object may expose the payload as `.content` or via `.read()`.
    if hasattr(response, 'content'):
        payload = response.content
    else:
        payload = response.read()
    logger.info(f"Groq Orpheus TTS generated: {len(payload)} bytes")
    return payload
75
+
76
+
77
+ # ===== SUPERTONIC TTS =====
78
+
79
+ from tts_providers import get_provider, list_providers # noqa: E402 — after stdlib imports
80
+
81
+
82
def _wrap_long_segment(segment: str, max_chars: int) -> list:
    """Break one over-long segment into pieces of at most max_chars, preferring word gaps."""
    pieces = []
    current = ""
    for word in segment.split():
        # A single token longer than the limit has to be sliced mid-word.
        while len(word) > max_chars:
            if current:
                pieces.append(current)
                current = ""
            pieces.append(word[:max_chars])
            word = word[max_chars:]
        if not word:
            continue
        if current and len(current) + 1 + len(word) > max_chars:
            pieces.append(current)
            current = word
        else:
            current = f"{current} {word}" if current else word
    if current:
        pieces.append(current)
    return pieces


def _split_text_for_tts(text: str, max_chars: int) -> list:
    """Split text into chunks of at most max_chars, packing whole sentences where possible."""
    units = []
    for sentence in re.split(r'(?<=[.!?])\s+', text):
        if len(sentence) > max_chars:
            # Sentence splitting alone cannot bring this under the limit.
            units.extend(_wrap_long_segment(sentence, max_chars))
        else:
            units.append(sentence)

    chunks = []
    current = ""
    for unit in units:
        if current and len(current) + len(unit) + 1 > max_chars:
            chunks.append(current.strip())
            current = unit
        else:
            current = (current + " " + unit).strip()
    if current:
        chunks.append(current.strip())
    return chunks


def _parse_wav_chunks(wav: bytes):
    """
    Walk the RIFF sub-chunks of a WAVE blob.

    Returns:
        (fmt, data) where fmt is (num_channels, sample_rate, bits_per_sample)
        or None if no 'fmt ' chunk precedes the data, and data is the payload
        of the first 'data' chunk or None if there is none.
    """
    fmt = None
    pos = 12  # skip 'RIFF' <size> 'WAVE'
    while pos + 8 <= len(wav):
        chunk_id = wav[pos:pos + 4]
        chunk_size = struct.unpack_from('<I', wav, pos + 4)[0]
        body = pos + 8
        if chunk_id == b'fmt ':
            num_channels, sample_rate = struct.unpack_from('<HI', wav, body + 2)
            bits_per_sample = struct.unpack_from('<H', wav, body + 14)[0]
            fmt = (num_channels, sample_rate, bits_per_sample)
        elif chunk_id == b'data':
            return fmt, wav[body:body + chunk_size]
        # RIFF sub-chunks are word-aligned: an odd-sized chunk is followed by one pad byte.
        pos = body + chunk_size + (chunk_size & 1)
    return fmt, None


def generate_tts_chunked(provider, text: str, voice: str, max_chars: int = 800) -> bytes:
    """
    Generate TTS audio with chunking to avoid Supertonic ONNX overflow.

    Text no longer than *max_chars* is synthesized in a single call. Longer
    text is split on sentence boundaries (and, for a sentence that alone
    exceeds the limit, on word boundaries), each chunk is synthesized
    separately, and the sample data of all successful chunks is concatenated
    into a single WAV file.

    Args:
        provider:  TTS provider object exposing
                   generate_speech(text=, voice=, speed=, total_step=).
        text:      Text to synthesize.
        voice:     Voice identifier (e.g. 'M1').
        max_chars: Max characters per chunk. Default 800.

    Returns:
        WAV audio bytes (concatenated from all chunks). If the first chunk's
        audio is not RIFF/WAVE it is returned unchanged. If no chunk yields
        usable audio, the result of synthesizing the first *max_chars*
        characters of *text* is returned.
    """
    # Short text — no chunking needed
    if len(text) <= max_chars:
        return provider.generate_speech(text=text, voice=voice, speed=1.05, total_step=40)

    # Clamp so a non-positive limit cannot stall the splitter.
    chunks = _split_text_for_tts(text, max(max_chars, 1))

    logger.info(f"TTS chunking: {len(text)} chars -> {len(chunks)} chunks (max {max_chars})")

    pcm_parts = []      # collected and joined once at the end
    wav_format = None   # taken from the first chunk that parses, not only chunk 0

    for i, chunk in enumerate(chunks):
        if not chunk.strip():
            continue
        try:
            chunk_audio = provider.generate_speech(text=chunk, voice=voice, speed=1.05, total_step=40)
            if chunk_audio[:4] == b'RIFF' and chunk_audio[8:12] == b'WAVE':
                fmt, pcm = _parse_wav_chunks(chunk_audio)
                if wav_format is None:
                    wav_format = fmt
                if wav_format is not None and pcm:
                    pcm_parts.append(pcm)
            elif i == 0:
                # Not a WAV container: nothing to stitch, hand back what we got.
                return chunk_audio
            logger.info(f"  Chunk {i + 1}/{len(chunks)}: {len(chunk)} chars OK")
        except Exception as e:
            # Best effort: a failed chunk is dropped and the rest still play.
            logger.error(f"  Chunk {i + 1}/{len(chunks)} FAILED: {e}")

    all_audio_data = b"".join(pcm_parts)
    if not all_audio_data or wav_format is None:
        logger.warning("All TTS chunks failed, trying truncated text")
        return provider.generate_speech(text=text[:max_chars], voice=voice, speed=1.05, total_step=40)

    # Rebuild WAV with concatenated PCM data
    num_channels, sample_rate, bits_per_sample = wav_format
    byte_rate = sample_rate * num_channels * (bits_per_sample // 8)
    block_align = num_channels * (bits_per_sample // 8)
    data_size = len(all_audio_data)
    file_size = 36 + data_size

    wav_header = struct.pack('<4sI4s', b'RIFF', file_size, b'WAVE')
    fmt_chunk = struct.pack('<4sIHHIIHH', b'fmt ', 16, 1,
                            num_channels, sample_rate, byte_rate, block_align, bits_per_sample)
    data_header = struct.pack('<4sI', b'data', data_size)

    return wav_header + fmt_chunk + data_header + all_audio_data
176
+
177
+
178
+ # ===== UNIFIED GENERATE FUNCTION =====
179
+
180
# Fallback order when a provider fails (provider_id → fallback_id).
# A provider that is not a key here has no fallback.
_FALLBACK_CHAIN = {
    'groq': 'supertonic',
    'qwen3': 'supertonic',
    'resemble': 'supertonic',
}

# Number of retries after the first attempt (so up to _MAX_RETRIES + 1 tries).
_MAX_RETRIES = 2
_RETRY_DELAYS = (0.5, 1.5)  # seconds between retries
189
+
190
+
191
def _generate_with_provider(tts_provider: str, text: str, voice: str) -> bytes:
    """
    Produce audio bytes from exactly one provider, with no retry or fallback.

    Providers reporting an 'mp3' audio format, and 'resemble', are called
    directly with the full text. Every other provider goes through
    generate_tts_chunked so long input is split before synthesis.
    """
    engine = get_provider(tts_provider)
    reported_format = engine.get_info().get('audio_format', 'wav')

    # These providers are expected to deal with long input on their own.
    handles_own_limits = reported_format == 'mp3' or tts_provider in ('resemble',)
    if not handles_own_limits:
        # Local WAV providers (supertonic) need ONNX overflow chunking
        return generate_tts_chunked(engine, text, voice)
    return engine.generate_speech(text=text, voice=voice)
202
+
203
+
204
def generate_tts_b64(
    text: str,
    voice: Optional[str] = None,
    tts_provider: str = 'groq',
    **kwargs,
) -> Optional[str]:
    """
    Generate TTS audio and return as a base64-encoded string.

    Cloud providers ('groq', 'qwen3', 'resemble') get a single attempt;
    other providers are retried up to _MAX_RETRIES times. If the primary
    provider still fails, the provider named in _FALLBACK_CHAIN (if any) is
    tried once with that provider's own default voice.

    Args:
        text: Text to synthesize.
        voice: Voice ID (provider-specific). When omitted, the selected
            provider's default voice is used; 'M1' is the last resort if
            that lookup fails.
        tts_provider: Provider ID ('supertonic', 'groq', 'qwen3', etc.).
        **kwargs: Accepted for call compatibility; not used.

    Returns:
        Base64-encoded audio string, or None on failure.
    """
    if not voice:
        # Ask the chosen provider for its default rather than assuming one
        # voice id fits every provider ('M1' looks like a Supertonic-style id
        # and is kept only as the historical last resort).
        try:
            voice = get_provider(tts_provider).get_default_voice()
        except Exception as e:
            logger.debug(f"Could not resolve default voice for provider={tts_provider}: {e}")
        voice = voice or 'M1'

    # ── Try primary provider (single attempt for cloud, retries for local) ──
    # Cloud providers (groq, qwen3) have their own timeout — don't retry
    # on timeout, fall back immediately. Only retry local providers.
    is_cloud = tts_provider in ('groq', 'qwen3', 'resemble')
    max_attempts = 1 if is_cloud else _MAX_RETRIES + 1
    for attempt in range(max_attempts):
        try:
            audio_bytes = _generate_with_provider(tts_provider, text, voice)
            logger.info(f"TTS generated: provider={tts_provider}, voice={voice}, attempt={attempt + 1}")
            return base64.b64encode(audio_bytes).decode('utf-8')
        except Exception as e:
            if attempt < max_attempts - 1:
                # Clamp the index so raising _MAX_RETRIES without extending
                # _RETRY_DELAYS reuses the last delay instead of raising
                # IndexError from inside this handler.
                if _RETRY_DELAYS:
                    delay = _RETRY_DELAYS[min(attempt, len(_RETRY_DELAYS) - 1)]
                else:
                    delay = 0
                logger.warning(f"TTS attempt {attempt + 1} failed (provider={tts_provider}): {e} — retrying in {delay}s")
                time.sleep(delay)
            else:
                logger.warning(f"TTS failed (provider={tts_provider}): {e} — trying fallback")

    # ── Fallback to alternate provider ───────────────────────────────
    fallback_id = _FALLBACK_CHAIN.get(tts_provider)
    if fallback_id:
        logger.info(f"TTS falling back: {tts_provider} → {fallback_id}")
        try:
            fallback_provider = get_provider(fallback_id)
            # The caller's voice id belongs to the primary provider, so the
            # fallback uses its own default.
            fallback_voice = fallback_provider.get_default_voice()
            audio_bytes = _generate_with_provider(fallback_id, text, fallback_voice)
            logger.info(f"TTS fallback OK: provider={fallback_id}, voice={fallback_voice}")
            return base64.b64encode(audio_bytes).decode('utf-8')
        except Exception as fb_err:
            logger.error(f"TTS fallback also failed (provider={fallback_id}): {fb_err}")

    logger.error(f"TTS generation failed — all providers exhausted for: '{text[:60]}'")
    return None
261
+
262
+
263
# Public names of this module. get_provider and list_providers are imported
# from tts_providers above and re-exported here.
__all__ = [
    'get_groq_client',
    'generate_groq_tts',
    'generate_tts_chunked',
    'generate_tts_b64',
    'get_provider',
    'list_providers',
]