openvoiceui 1.0.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/.env.example +104 -0
- package/Dockerfile +30 -0
- package/LICENSE +21 -0
- package/README.md +638 -0
- package/SETUP.md +360 -0
- package/app.py +232 -0
- package/auto-approve-devices.js +111 -0
- package/cli/index.js +372 -0
- package/config/__init__.py +4 -0
- package/config/default.yaml +43 -0
- package/config/flags.yaml +67 -0
- package/config/loader.py +203 -0
- package/config/providers.yaml +71 -0
- package/config/speech_normalization.yaml +182 -0
- package/config/theme.json +4 -0
- package/data/greetings.json +25 -0
- package/default-pages/ai-image-creator.html +915 -0
- package/default-pages/bulk-image-uploader.html +492 -0
- package/default-pages/desktop.html +2865 -0
- package/default-pages/file-explorer.html +854 -0
- package/default-pages/interactive-map.html +655 -0
- package/default-pages/style-guide.html +1005 -0
- package/default-pages/website-setup.html +1623 -0
- package/deploy/openclaw/Dockerfile +46 -0
- package/deploy/openvoiceui.service +30 -0
- package/deploy/setup-nginx.sh +50 -0
- package/deploy/setup-sudo.sh +306 -0
- package/deploy/skill-runner/Dockerfile +19 -0
- package/deploy/skill-runner/requirements.txt +14 -0
- package/deploy/skill-runner/server.py +269 -0
- package/deploy/supertonic/Dockerfile +22 -0
- package/deploy/supertonic/server.py +79 -0
- package/docker-compose.pinokio.yml +11 -0
- package/docker-compose.yml +59 -0
- package/greetings.json +25 -0
- package/index.html +65 -0
- package/inject-device-identity.js +142 -0
- package/package.json +82 -0
- package/profiles/default.json +114 -0
- package/profiles/manager.py +354 -0
- package/profiles/schema.json +337 -0
- package/prompts/voice-system-prompt.md +149 -0
- package/providers/__init__.py +39 -0
- package/providers/base.py +63 -0
- package/providers/llm/__init__.py +12 -0
- package/providers/llm/base.py +71 -0
- package/providers/llm/clawdbot_provider.py +112 -0
- package/providers/llm/zai_provider.py +115 -0
- package/providers/registry.py +320 -0
- package/providers/stt/__init__.py +12 -0
- package/providers/stt/base.py +58 -0
- package/providers/stt/webspeech_provider.py +49 -0
- package/providers/stt/whisper_provider.py +100 -0
- package/providers/tts/__init__.py +20 -0
- package/providers/tts/base.py +91 -0
- package/providers/tts/groq_provider.py +74 -0
- package/providers/tts/supertonic_provider.py +72 -0
- package/requirements.txt +38 -0
- package/routes/__init__.py +10 -0
- package/routes/admin.py +515 -0
- package/routes/canvas.py +1315 -0
- package/routes/chat.py +51 -0
- package/routes/conversation.py +2158 -0
- package/routes/elevenlabs_hybrid.py +306 -0
- package/routes/greetings.py +98 -0
- package/routes/icons.py +279 -0
- package/routes/image_gen.py +364 -0
- package/routes/instructions.py +190 -0
- package/routes/music.py +838 -0
- package/routes/onboarding.py +43 -0
- package/routes/pi.py +62 -0
- package/routes/profiles.py +215 -0
- package/routes/report_issue.py +68 -0
- package/routes/static_files.py +533 -0
- package/routes/suno.py +664 -0
- package/routes/theme.py +81 -0
- package/routes/transcripts.py +199 -0
- package/routes/vision.py +348 -0
- package/routes/workspace.py +288 -0
- package/server.py +1510 -0
- package/services/__init__.py +1 -0
- package/services/auth.py +143 -0
- package/services/canvas_versioning.py +239 -0
- package/services/db_pool.py +107 -0
- package/services/gateway.py +16 -0
- package/services/gateway_manager.py +333 -0
- package/services/gateways/__init__.py +12 -0
- package/services/gateways/base.py +110 -0
- package/services/gateways/compat.py +264 -0
- package/services/gateways/openclaw.py +1134 -0
- package/services/health.py +100 -0
- package/services/memory_client.py +455 -0
- package/services/paths.py +26 -0
- package/services/speech_normalizer.py +285 -0
- package/services/tts.py +270 -0
- package/setup-config.js +262 -0
- package/sounds/air_horn.mp3 +0 -0
- package/sounds/bruh.mp3 +0 -0
- package/sounds/crowd_cheer.mp3 +0 -0
- package/sounds/gunshot.mp3 +0 -0
- package/sounds/impact.mp3 +0 -0
- package/sounds/lets_go.mp3 +0 -0
- package/sounds/record_stop.mp3 +0 -0
- package/sounds/rewind.mp3 +0 -0
- package/sounds/sad_trombone.mp3 +0 -0
- package/sounds/scratch_long.mp3 +0 -0
- package/sounds/yeah.mp3 +0 -0
- package/src/adapters/ClawdBotAdapter.js +264 -0
- package/src/adapters/_template.js +133 -0
- package/src/adapters/elevenlabs-classic.js +841 -0
- package/src/adapters/elevenlabs-hybrid.js +812 -0
- package/src/adapters/hume-evi.js +676 -0
- package/src/admin.html +1339 -0
- package/src/app.js +8802 -0
- package/src/core/Config.js +173 -0
- package/src/core/EmotionEngine.js +307 -0
- package/src/core/EventBridge.js +180 -0
- package/src/core/EventBus.js +117 -0
- package/src/core/VoiceSession.js +607 -0
- package/src/face/BaseFace.js +259 -0
- package/src/face/EyeFace.js +208 -0
- package/src/face/HaloSmokeFace.js +509 -0
- package/src/face/manifest.json +27 -0
- package/src/face/previews/eyes.svg +16 -0
- package/src/face/previews/orb.svg +29 -0
- package/src/features/MusicPlayer.js +620 -0
- package/src/features/Soundboard.js +128 -0
- package/src/providers/DeepgramSTT.js +472 -0
- package/src/providers/DeepgramStreamingSTT.js +766 -0
- package/src/providers/GroqSTT.js +559 -0
- package/src/providers/TTSPlayer.js +323 -0
- package/src/providers/WebSpeechSTT.js +479 -0
- package/src/providers/tts/BaseTTSProvider.js +81 -0
- package/src/providers/tts/HumeProvider.js +77 -0
- package/src/providers/tts/SupertonicProvider.js +174 -0
- package/src/providers/tts/index.js +140 -0
- package/src/shell/adapter-registry.js +154 -0
- package/src/shell/caller-bridge.js +35 -0
- package/src/shell/camera-bridge.js +28 -0
- package/src/shell/canvas-bridge.js +32 -0
- package/src/shell/commercial-bridge.js +44 -0
- package/src/shell/face-bridge.js +44 -0
- package/src/shell/music-bridge.js +60 -0
- package/src/shell/orchestrator.js +233 -0
- package/src/shell/profile-discovery.js +303 -0
- package/src/shell/sounds-bridge.js +28 -0
- package/src/shell/transcript-bridge.js +61 -0
- package/src/shell/waveform-bridge.js +33 -0
- package/src/styles/base.css +2862 -0
- package/src/styles/face.css +417 -0
- package/src/styles/pi-overrides.css +89 -0
- package/src/styles/theme-dark.css +67 -0
- package/src/test-tts.html +175 -0
- package/src/ui/AppShell.js +544 -0
- package/src/ui/ProfileSwitcher.js +228 -0
- package/src/ui/SessionControl.js +240 -0
- package/src/ui/face/FacePicker.js +195 -0
- package/src/ui/face/FaceRenderer.js +309 -0
- package/src/ui/settings/PlaylistEditor.js +366 -0
- package/src/ui/settings/SettingsPanel.css +684 -0
- package/src/ui/settings/SettingsPanel.js +419 -0
- package/src/ui/settings/TTSVoicePreview.js +210 -0
- package/src/ui/themes/ThemeManager.js +213 -0
- package/src/ui/visualizers/BaseVisualizer.js +29 -0
- package/src/ui/visualizers/PartyFXVisualizer.css +291 -0
- package/src/ui/visualizers/PartyFXVisualizer.js +637 -0
- package/static/emulators/jsdos/js-dos.css +1 -0
- package/static/emulators/jsdos/js-dos.js +22 -0
- package/static/favicon.svg +55 -0
- package/static/icons/apple-touch-icon.png +0 -0
- package/static/icons/favicon-32.png +0 -0
- package/static/icons/icon-192.png +0 -0
- package/static/icons/icon-512.png +0 -0
- package/static/install.html +449 -0
- package/static/manifest.json +26 -0
- package/static/sw.js +21 -0
- package/tts_providers/__init__.py +136 -0
- package/tts_providers/base_provider.py +319 -0
- package/tts_providers/groq_provider.py +155 -0
- package/tts_providers/hume_provider.py +226 -0
- package/tts_providers/providers_config.json +119 -0
- package/tts_providers/qwen3_provider.py +371 -0
- package/tts_providers/resemble_provider.py +315 -0
- package/tts_providers/supertonic_provider.py +557 -0
- package/tts_providers/supertonic_tts.py +399 -0
|
@@ -0,0 +1,285 @@
|
|
|
1
|
+
"""
|
|
2
|
+
services/speech_normalizer.py — Speech Normalization Service
|
|
3
|
+
|
|
4
|
+
Cleans and normalizes LLM response text before it is sent to TTS providers.
|
|
5
|
+
Rules are loaded from config/speech_normalization.yaml (ADR-001).
|
|
6
|
+
|
|
7
|
+
Usage:
|
|
8
|
+
from services.speech_normalizer import SpeechNormalizer
|
|
9
|
+
|
|
10
|
+
normalizer = SpeechNormalizer()
|
|
11
|
+
clean_text = normalizer.normalize("Hello **world**! Check https://example.com", profile_id="default")
|
|
12
|
+
|
|
13
|
+
The normalizer supports per-profile rule overrides defined in
|
|
14
|
+
config/speech_normalization.yaml under the `profiles:` key.
|
|
15
|
+
"""
|
|
16
|
+
|
|
17
|
+
from __future__ import annotations
|
|
18
|
+
|
|
19
|
+
import logging
|
|
20
|
+
import os
|
|
21
|
+
import re
|
|
22
|
+
from pathlib import Path
|
|
23
|
+
from typing import Any, Dict, Optional
|
|
24
|
+
|
|
25
|
+
logger = logging.getLogger(__name__)
|
|
26
|
+
|
|
27
|
+
# Config file location (relative to project root)
|
|
28
|
+
_DEFAULT_CONFIG_PATH = Path(__file__).parent.parent / "config" / "speech_normalization.yaml"
|
|
29
|
+
|
|
30
|
+
|
|
31
|
+
class SpeechNormalizer:
    """
    Normalizes text for TTS by applying a configurable pipeline of rules:

    1. Strip markdown formatting (headers, bold, code blocks, etc.)
    2. Strip URLs
    3. Strip emoji
    4. Expand abbreviations (API → A P I, etc.)
    5. Collapse whitespace
    6. Trim surrounding whitespace
    7. Enforce max_length

    Rules are loaded from speech_normalization.yaml.
    Per-profile overrides are merged on top of global defaults.
    """

    # Used when the config supplies no (or an unusable) ``url_pattern``.
    _DEFAULT_URL_PATTERN = r"https?://[^\s]+"

    # Suffix appended when text has to be cut in the middle of a sentence.
    _ELLIPSIS = "..."

    # Built-in markdown rules, applied in order when the config defines no
    # ``markdown_patterns``. Block-level syntax is removed before inline
    # emphasis so that "***" (a rule) or "* item *x*" (a bullet) is not
    # mis-read as emphasis, and images are removed before links so that
    # "![alt](src)" does not degrade into "!alt".
    _FALLBACK_MARKDOWN_RULES = (
        (re.compile(r"```[\s\S]*?```"), ""),                # fenced code blocks
        (re.compile(r"`[^`]+`"), ""),                       # inline code
        (re.compile(r"^#{1,6}\s+", re.MULTILINE), ""),      # headers
        (re.compile(r"^[-*_]{3,}$", re.MULTILINE), ""),     # horizontal rules
        (re.compile(r"^>\s*", re.MULTILINE), ""),           # blockquotes
        (re.compile(r"^[\-\*\+]\s+", re.MULTILINE), ""),    # bullet markers
        (re.compile(r"^\d+\.\s+", re.MULTILINE), ""),       # numbered-list markers
        (re.compile(r"!\[[^\]]*\]\([^)]+\)"), ""),          # images
        (re.compile(r"\[([^\]]+)\]\([^)]+\)"), r"\1"),      # links -> link text
        (re.compile(r"\*\*(.+?)\*\*"), r"\1"),              # **bold**
        (re.compile(r"__(.+?)__"), r"\1"),                  # __bold__
        (re.compile(r"\*(.+?)\*"), r"\1"),                  # *italic*
        (re.compile(r"_(.+?)_"), r"\1"),                    # _italic_
        (re.compile(r"~~(.+?)~~"), r"\1"),                  # ~~strikethrough~~
    )

    # Explicit emoji ranges. A single span from U+24C2 to U+1F251 must NOT be
    # used here: it covers CJK ideographs, kana, Hangul and most of the BMP,
    # and would silently delete non-Latin text.
    _EMOJI_RE = re.compile(
        "["
        "\U0001F300-\U0001FAFF"  # pictographs, emoticons, transport, extended-A
        "\U0001F000-\U0001F251"  # mahjong/cards, enclosed alphanumerics, flags
        "\U00002600-\U000027BF"  # miscellaneous symbols and dingbats
        "\U000025A0-\U000025FF"  # geometric shapes
        "\U00002B00-\U00002BFF"  # miscellaneous symbols and arrows
        "\U00002934\U00002935"   # curved arrows
        "\U00003030\U0000303D\U00003297\U00003299"  # emoji inside CJK blocks
        "\U000024C2"             # circled M
        "\U0000FE0F"             # emoji variation selector
        "]+",
        flags=re.UNICODE,
    )

    def __init__(self, config_path: Optional[str] = None) -> None:
        """
        Create a normalizer and load its configuration.

        Args:
            config_path: Path to the YAML config. Falls back to the module's
                         default config path when omitted or empty.
        """
        self._config_path = Path(config_path) if config_path else _DEFAULT_CONFIG_PATH
        self._raw_config: Dict[str, Any] = {}
        self._global: Dict[str, Any] = {}
        self._profile_overrides: Dict[str, Dict[str, Any]] = {}
        self._load_config()

    # ── Config loading ─────────────────────────────────────────────────────────

    @staticmethod
    def _as_mapping(value: Any) -> Dict[str, Any]:
        """Return *value* if it is a dict, otherwise an empty dict.

        YAML sections that are present but empty load as ``None``; treating
        them as empty mappings keeps the merge logic free of type checks.
        """
        return value if isinstance(value, dict) else {}

    def _load_config(self) -> None:
        """Load and parse speech_normalization.yaml.

        State is computed into locals and assigned at the end, so a failed
        (re)load always leaves the instance on clean built-in defaults rather
        than a mix of old and new settings.
        """
        defaults = self._builtin_defaults()
        raw: Dict[str, Any] = {}
        global_cfg: Dict[str, Any] = defaults
        overrides: Dict[str, Any] = {}

        if not self._config_path.exists():
            logger.warning(
                "Speech normalization config not found at %s — using built-in defaults",
                self._config_path,
            )
        else:
            try:
                import yaml  # type: ignore

                # Explicit encoding: the locale default is not UTF-8 everywhere.
                with open(self._config_path, "r", encoding="utf-8") as f:
                    loaded = yaml.safe_load(f) or {}
                if not isinstance(loaded, dict):
                    raise ValueError("top-level YAML node must be a mapping")

                global_cfg = {**defaults, **self._as_mapping(loaded.get("global"))}
                # Global abbreviations live in their own top-level section.
                global_cfg["_abbreviations"] = self._as_mapping(loaded.get("abbreviations"))
                # Per-profile overrides
                overrides = self._as_mapping(loaded.get("profiles"))
                raw = loaded
                logger.info("Speech normalization config loaded from %s", self._config_path)
            except Exception as exc:
                logger.error("Failed to load speech normalization config: %s — using defaults", exc)
                raw, global_cfg, overrides = {}, defaults, {}

        self._raw_config = raw
        self._global = global_cfg
        self._profile_overrides = overrides

    def _builtin_defaults(self) -> Dict[str, Any]:
        """Minimal built-in defaults used when config file is absent."""
        return {
            "strip_markdown": True,
            "strip_urls": True,
            "strip_emoji": True,
            "collapse_whitespace": True,
            "trim": True,
            "max_length": 800,
            "_abbreviations": {},
        }

    def reload(self) -> None:
        """Reload config from disk (e.g. after hot-edit)."""
        self._load_config()
        logger.info("Speech normalization config reloaded")

    # ── Public API ─────────────────────────────────────────────────────────────

    def normalize(self, text: str, profile_id: Optional[str] = None) -> str:
        """
        Apply the full normalization pipeline to *text*.

        Args:
            text:       Raw LLM response text.
            profile_id: Optional agent profile ID. If provided, per-profile
                        overrides from speech_normalization.yaml are merged on
                        top of the global settings.

        Returns:
            Cleaned string ready for TTS input. Falsy input is returned
            unchanged. When ``max_length`` is a positive integer the result
            never exceeds it (the ellipsis is counted); any other value
            disables the cap.
        """
        if not text:
            return text

        cfg = self._merged_config(profile_id)

        # 1. Strip markdown
        if cfg.get("strip_markdown", True):
            text = self._strip_markdown(text)

        # 2. Strip URLs
        if cfg.get("strip_urls", True):
            text = self._strip_urls(text)

        # 3. Strip emoji
        if cfg.get("strip_emoji", True):
            text = self._strip_emoji(text)

        # 4. Expand abbreviations (global + profile-specific; profile wins)
        abbreviations = {
            **self._as_mapping(self._global.get("_abbreviations")),
            **self._as_mapping(cfg.get("abbreviations")),
        }
        if abbreviations:
            text = self._expand_abbreviations(text, abbreviations)

        # 5. Collapse whitespace
        if cfg.get("collapse_whitespace", True):
            text = re.sub(r"[ \t]+", " ", text)
            text = re.sub(r"\n{2,}", " ", text)
            text = re.sub(r"\n", " ", text)

        # 6. Trim
        if cfg.get("trim", True):
            text = text.strip()

        # 7. Enforce max length (hard cap)
        max_len = cfg.get("max_length", 800)
        if isinstance(max_len, int) and max_len > 0 and len(text) > max_len:
            window = text[:max_len]
            # Prefer ending on a sentence boundary, as long as that keeps
            # more than half of the allowed length.
            cut = window.rfind(". ")
            if cut > max_len // 2:
                text = window[: cut + 1]
            else:
                # Reserve room for the ellipsis so the cap really is hard.
                keep = max_len - len(self._ELLIPSIS)
                text = window[:keep].rstrip() + self._ELLIPSIS if keep > 0 else window
            logger.debug("Speech normalizer truncated text to %d chars", len(text))

        return text

    def get_config_for_profile(self, profile_id: Optional[str] = None) -> Dict[str, Any]:
        """Return the effective normalized config for a given profile (for inspection/debugging)."""
        return self._merged_config(profile_id)

    # ── Internal helpers ───────────────────────────────────────────────────────

    def _merged_config(self, profile_id: Optional[str]) -> Dict[str, Any]:
        """Merge global settings with per-profile overrides.

        The stored override mapping is only read, never modified. Overrides
        that are not mappings (e.g. an empty YAML entry) are ignored. When a
        mapping override exists, the result carries its abbreviations under
        the ``abbreviations`` key (empty dict if it defines none).
        """
        merged = dict(self._global)
        override = self._profile_overrides.get(profile_id) if profile_id else None
        if isinstance(override, dict):
            # Abbreviations are additive and merged separately by normalize(),
            # so they must not overwrite anything via the plain update.
            merged.update(
                (key, value) for key, value in override.items() if key != "abbreviations"
            )
            merged["abbreviations"] = dict(self._as_mapping(override.get("abbreviations")))
        return merged

    def _strip_markdown(self, text: str) -> str:
        """Remove common markdown syntax from text.

        Uses the ``markdown_patterns`` list from the config when present
        (entries with ``pattern``, ``replacement`` and optional ``flags``
        containing "multiline"); otherwise applies the built-in rules.
        Malformed entries are skipped with a warning.
        """
        patterns = self._raw_config.get("markdown_patterns") or []
        if not patterns:
            for compiled, replacement in self._FALLBACK_MARKDOWN_RULES:
                text = compiled.sub(replacement, text)
            return text

        for entry in patterns:
            if not isinstance(entry, dict):
                continue
            raw_pattern = entry.get("pattern") or ""
            replacement = entry.get("replacement") or ""
            flags_spec = entry.get("flags") or ""
            flags = 0
            if isinstance(flags_spec, (str, list, tuple)) and "multiline" in flags_spec:
                flags |= re.MULTILINE
            try:
                text = re.sub(raw_pattern, replacement, text, flags=flags)
            except (re.error, TypeError) as exc:
                logger.warning("Invalid markdown pattern %r: %s", raw_pattern, exc)
        return text

    def _strip_urls(self, text: str) -> str:
        """Remove HTTP/HTTPS URLs from text.

        A configured ``url_pattern`` takes precedence; if it is missing,
        empty or not a valid regex the built-in pattern is used instead.
        """
        url_pattern = self._raw_config.get("url_pattern") or self._DEFAULT_URL_PATTERN
        try:
            return re.sub(url_pattern, "", text)
        except (re.error, TypeError):
            return re.sub(self._DEFAULT_URL_PATTERN, "", text)

    def _strip_emoji(self, text: str) -> str:
        """Remove emoji characters from text, leaving all other scripts intact."""
        return self._EMOJI_RE.sub("", text)

    def _expand_abbreviations(self, text: str, abbreviations: Dict[str, str]) -> str:
        """
        Replace abbreviations with their spoken forms.

        Matching is case-sensitive and only succeeds when the abbreviation is
        not glued to a word character on either side, so "API" inside "RAPID"
        is not replaced while "e.g." followed by a space is. All abbreviations
        are substituted in a single pass with longer keys tried first, so an
        expansion is never re-expanded by a shorter key, and expansions are
        inserted literally (backslashes are not interpreted).
        """
        table: Dict[str, str] = {}
        for abbrev, expansion in abbreviations.items():
            if not isinstance(abbrev, str):
                # YAML 1.1 turns bare keys such as NO / ON into booleans.
                logger.warning("Ignoring non-string abbreviation key %r", abbrev)
                continue
            if not abbrev or expansion is None:
                continue
            table[abbrev] = str(expansion)
        if not table:
            return text

        # Sort by length descending so longer keys win inside the alternation.
        alternation = "|".join(re.escape(key) for key in sorted(table, key=len, reverse=True))
        pattern = re.compile(r"(?<!\w)(?:" + alternation + r")(?!\w)")
        return pattern.sub(lambda match: table[match.group(0)], text)
|
|
252
|
+
|
|
253
|
+
|
|
254
|
+
# ── Module-level singleton ─────────────────────────────────────────────────────
|
|
255
|
+
|
|
256
|
+
_normalizer_instance: Optional[SpeechNormalizer] = None
|
|
257
|
+
|
|
258
|
+
|
|
259
|
+
def get_normalizer() -> SpeechNormalizer:
    """Return the process-wide SpeechNormalizer, creating it on first use."""
    global _normalizer_instance
    existing = _normalizer_instance
    if existing is not None:
        return existing
    # First call: build the instance and remember it for later callers.
    _normalizer_instance = SpeechNormalizer()
    return _normalizer_instance
|
|
265
|
+
|
|
266
|
+
|
|
267
|
+
def normalize_for_tts(text: str, profile_id: Optional[str] = None) -> str:
    """
    Normalize *text* for TTS via the shared module-level normalizer.

    Args:
        text:       Raw text to normalize.
        profile_id: Optional profile ID selecting per-profile rule overrides.

    Returns:
        Cleaned text ready for TTS.
    """
    shared = get_normalizer()
    cleaned = shared.normalize(text, profile_id=profile_id)
    return cleaned
|
|
279
|
+
|
|
280
|
+
|
|
281
|
+
__all__ = [
|
|
282
|
+
"SpeechNormalizer",
|
|
283
|
+
"get_normalizer",
|
|
284
|
+
"normalize_for_tts",
|
|
285
|
+
]
|
package/services/tts.py
ADDED
|
@@ -0,0 +1,270 @@
|
|
|
1
|
+
"""
|
|
2
|
+
services/tts.py — Unified TTS Service
|
|
3
|
+
|
|
4
|
+
Consolidates all TTS generation logic from server.py and tts_providers/.
|
|
5
|
+
Provides a single entry point for generating speech audio.
|
|
6
|
+
|
|
7
|
+
Providers:
|
|
8
|
+
- Groq Orpheus TTS (primary, cloud-based)
|
|
9
|
+
- Supertonic TTS (local ONNX, fallback)
|
|
10
|
+
|
|
11
|
+
Usage:
|
|
12
|
+
from services.tts import generate_tts_b64, generate_tts_chunked
|
|
13
|
+
|
|
14
|
+
audio_b64 = generate_tts_b64(text, voice='M1')
|
|
15
|
+
"""
|
|
16
|
+
|
|
17
|
+
import base64
|
|
18
|
+
import logging
|
|
19
|
+
import os
|
|
20
|
+
import re
|
|
21
|
+
import struct
|
|
22
|
+
import time
|
|
23
|
+
from typing import Optional
|
|
24
|
+
|
|
25
|
+
logger = logging.getLogger(__name__)
|
|
26
|
+
|
|
27
|
+
# ===== GROQ TTS =====
|
|
28
|
+
|
|
29
|
+
_groq_client = None
|
|
30
|
+
|
|
31
|
+
|
|
32
|
+
def get_groq_client():
    """Return the cached Groq client, creating it on first successful call.

    Returns None (after logging a warning) when GROQ_API_KEY is unset or the
    ``groq`` package cannot be imported; nothing is cached in that case, so
    a later call tries again.
    """
    global _groq_client
    if _groq_client is not None:
        return _groq_client

    api_key = os.environ.get('GROQ_API_KEY')
    if not api_key:
        logger.warning("GROQ_API_KEY not set — Groq TTS unavailable")
        return _groq_client

    try:
        # Imported lazily so the module loads without the optional dependency.
        from groq import Groq
        _groq_client = Groq(api_key=api_key)
        logger.info("Groq TTS client initialized")
    except ImportError:
        logger.warning("groq package not installed — Groq TTS unavailable")
    return _groq_client
|
|
47
|
+
|
|
48
|
+
|
|
49
|
+
def generate_groq_tts(text: str, voice: str = 'autumn') -> bytes:
    """
    Synthesize *text* with Groq Orpheus (canopylabs/orpheus-v1-english).

    Args:
        text:  Text to synthesize.
        voice: Orpheus voice name (default 'autumn').

    Returns:
        Audio bytes in the requested "mp3" response format.

    Raises:
        RuntimeError: If no Groq client is available. Errors raised by the
            Groq SDK call itself are not caught here and propagate as-is.
    """
    client = get_groq_client()
    if not client:
        raise RuntimeError("Groq client not available")

    request = {
        "model": "canopylabs/orpheus-v1-english",
        "input": text,
        "voice": voice,
        "response_format": "mp3",
    }
    tts_response = client.audio.speech.create(**request)

    # Some response objects expose the payload as ``.content``, others only
    # offer a file-like ``.read()``.
    if hasattr(tts_response, 'content'):
        audio_bytes = tts_response.content
    else:
        audio_bytes = tts_response.read()
    logger.info(f"Groq Orpheus TTS generated: {len(audio_bytes)} bytes")
    return audio_bytes
|
|
75
|
+
|
|
76
|
+
|
|
77
|
+
# ===== SUPERTONIC TTS =====
|
|
78
|
+
|
|
79
|
+
from tts_providers import get_provider, list_providers # noqa: E402 — after stdlib imports
|
|
80
|
+
|
|
81
|
+
|
|
82
|
+
def generate_tts_chunked(provider, text: str, voice: str, max_chars: int = 800) -> bytes:
    """
    Generate TTS audio with chunking to avoid Supertonic ONNX overflow.

    Supertonic ONNX crashes with RUNTIME_EXCEPTION when text exceeds ~1000 tokens.
    Long text is split into chunks of at most *max_chars* characters
    (on sentence boundaries where possible, on word boundaries for a single
    oversized sentence), each chunk is synthesized separately, and the raw
    sample data of all successful chunks is concatenated into one WAV file.

    Chunk failures are logged and skipped (best effort). The WAV format is
    taken from the first chunk that succeeds, so a failure of the first chunk
    no longer discards the rest. If no chunk yields audio, the text truncated
    to *max_chars* is synthesized in one call as a last resort.

    Args:
        provider:  TTSProvider instance (e.g. SupertonicProvider).
        text:      Text to synthesize.
        voice:     Voice identifier (e.g. 'M1').
        max_chars: Max characters per chunk. Default 800.

    Returns:
        WAV audio bytes (concatenated from all chunks). If the very first
        chunk comes back as something other than RIFF/WAVE, that audio is
        returned unchanged because it cannot be concatenated.

    Raises:
        Whatever ``provider.generate_speech`` raises on the short-text path
        or on the final truncated-text fallback.
    """
    # Short text — no chunking needed
    if len(text) <= max_chars:
        return provider.generate_speech(text=text, voice=voice, speed=1.05, total_step=40)

    chunks = _split_text_for_tts(text, max_chars)
    logger.info(f"TTS chunking: {len(text)} chars -> {len(chunks)} chunks (max {max_chars})")

    pcm_parts = []
    wav_format = None  # (audio_format, num_channels, sample_rate, bits_per_sample)

    for index, chunk in enumerate(chunks):
        label = f"{index + 1}/{len(chunks)}"
        try:
            chunk_audio = provider.generate_speech(text=chunk, voice=voice, speed=1.05, total_step=40)
            parsed = _extract_wav_pcm(chunk_audio)
            if parsed is None:
                if index == 0:
                    # Not a WAV container: nothing to splice, hand it back.
                    return chunk_audio
                raise ValueError("provider returned non-WAV audio")
            chunk_format, pcm = parsed
            if chunk_format is None:
                raise ValueError("WAV data has no usable fmt chunk")
            if wav_format is None:
                wav_format = chunk_format
            elif chunk_format != wav_format:
                # Splicing samples of a different layout would produce noise.
                raise ValueError(f"WAV format {chunk_format} differs from {wav_format}")
            pcm_parts.append(pcm)
            logger.info(f" Chunk {label}: {len(chunk)} chars OK")
        except Exception as e:
            logger.error(f" Chunk {label} FAILED: {e}")

    all_audio_data = b"".join(pcm_parts)
    if not all_audio_data or wav_format is None:
        logger.warning("All TTS chunks failed, trying truncated text")
        return provider.generate_speech(text=text[:max_chars], voice=voice, speed=1.05, total_step=40)

    # Rebuild WAV with concatenated sample data
    audio_format, num_channels, sample_rate, bits_per_sample = wav_format
    block_align = num_channels * (bits_per_sample // 8)
    byte_rate = sample_rate * block_align
    data_size = len(all_audio_data)
    # RIFF chunks are word-aligned: odd-sized data gets one pad byte.
    pad = b"\x00" if data_size % 2 else b""

    header = struct.pack(
        '<4sI4s4sIHHIIHH4sI',
        b'RIFF', 36 + data_size + len(pad), b'WAVE',
        b'fmt ', 16, audio_format,
        num_channels, sample_rate, byte_rate, block_align, bits_per_sample,
        b'data', data_size,
    )
    return header + all_audio_data + pad


def _split_text_for_tts(text, max_chars):
    """Split *text* into stripped chunks of at most *max_chars* characters."""
    limit = max(1, int(max_chars))
    chunks = []
    current = ""
    for sentence in re.split(r'(?<=[.!?])\s+', text):
        sentence = sentence.strip()
        # A single sentence over the limit is broken on the last space that
        # fits (or mid-word when it contains no space at all).
        while len(sentence) > limit:
            if current:
                chunks.append(current)
                current = ""
            cut = sentence.rfind(" ", 0, limit + 1)
            if cut <= 0:
                cut = limit
            chunks.append(sentence[:cut].rstrip())
            sentence = sentence[cut:].lstrip()
        if not sentence:
            continue
        if current and len(current) + 1 + len(sentence) > limit:
            chunks.append(current)
            current = sentence
        else:
            current = f"{current} {sentence}" if current else sentence
    if current:
        chunks.append(current)
    return chunks


def _extract_wav_pcm(wav_bytes):
    """Return ``(format, sample_bytes)`` from a RIFF/WAVE blob, or None if it is not one.

    ``format`` is ``(audio_format, num_channels, sample_rate, bits_per_sample)``
    or None when no fmt chunk precedes the data chunk.
    """
    if wav_bytes[:4] != b'RIFF' or wav_bytes[8:12] != b'WAVE':
        return None
    wav_format = None
    end = len(wav_bytes)
    pos = 12
    while pos + 8 <= end:
        chunk_id, chunk_size = struct.unpack_from('<4sI', wav_bytes, pos)
        body = pos + 8
        if chunk_id == b'fmt ' and chunk_size >= 16 and body + 16 <= end:
            audio_format, channels, rate, _, _, bits = struct.unpack_from('<HHIIHH', wav_bytes, body)
            if audio_format == 0xFFFE and chunk_size >= 26 and body + 26 <= end:
                # WAVE_FORMAT_EXTENSIBLE: the real format code is the first
                # two bytes of the SubFormat GUID at offset 24.
                audio_format = struct.unpack_from('<H', wav_bytes, body + 24)[0]
            wav_format = (audio_format, channels, rate, bits)
        elif chunk_id == b'data':
            return wav_format, wav_bytes[body:body + chunk_size]
        # Chunks are word-aligned; skip the pad byte after odd sizes.
        pos = body + chunk_size + (chunk_size & 1)
    return wav_format, b""
|
|
176
|
+
|
|
177
|
+
|
|
178
|
+
# ===== UNIFIED GENERATE FUNCTION =====
|
|
179
|
+
|
|
180
|
+
# Provider to try next when the requested one fails (provider_id → fallback_id).
# Every listed provider currently falls back to the same one.
_FALLBACK_CHAIN = dict.fromkeys(('groq', 'qwen3', 'resemble'), 'supertonic')

# Extra attempts after the first failure, and the pause before each of them.
_MAX_RETRIES = 2
_RETRY_DELAYS = (0.5, 1.5)  # seconds between retries
|
|
189
|
+
|
|
190
|
+
|
|
191
|
+
def _generate_with_provider(tts_provider: str, text: str, voice: str) -> bytes:
    """Produce audio bytes from exactly one provider, without retry or fallback.

    Providers whose reported ``audio_format`` is 'mp3', and 'resemble', are
    called directly with the full text. Every other provider is routed
    through generate_tts_chunked.
    """
    provider = get_provider(tts_provider)
    audio_format = provider.get_info().get('audio_format', 'wav')

    # Local WAV providers (supertonic) need ONNX overflow chunking
    needs_chunking = audio_format != 'mp3' and tts_provider not in ('resemble',)
    if needs_chunking:
        return generate_tts_chunked(provider, text, voice)

    # Cloud providers handle their own chunking/limits
    return provider.generate_speech(text=text, voice=voice)
|
|
202
|
+
|
|
203
|
+
|
|
204
|
+
def generate_tts_b64(
    text: str,
    voice: Optional[str] = None,
    tts_provider: str = 'groq',
    **kwargs,
) -> Optional[str]:
    """
    Generate TTS audio and return as a base64-encoded string.

    Local providers are retried up to _MAX_RETRIES times on failure; cloud
    providers ('groq', 'qwen3', 'resemble') get a single attempt. After the
    primary provider is exhausted, the provider named in _FALLBACK_CHAIN
    (if any) is tried once with its own default voice.

    Args:
        text:         Text to synthesize.
        voice:        Voice ID (provider-specific). When omitted, the selected
                      provider's default voice is used; 'M1' is kept as a last
                      resort if that default cannot be determined.
        tts_provider: Provider ID ('supertonic', 'groq', 'qwen3', etc.).
        **kwargs:     Accepted for call-site compatibility; currently unused.

    Returns:
        Base64-encoded audio string, or None on failure.
    """
    # A hard-coded 'M1' is a Supertonic voice and is not valid for every
    # provider, so resolve the default per provider.
    voice = voice or _default_voice_for(tts_provider)

    # ── Try primary provider (single attempt for cloud, retries for local) ──
    # Cloud providers (groq, qwen3) have their own timeout — don't retry
    # on timeout, fall back immediately. Only retry local providers.
    is_cloud = tts_provider in ('groq', 'qwen3', 'resemble')
    max_attempts = 1 if is_cloud else _MAX_RETRIES + 1
    for attempt in range(max_attempts):
        try:
            audio_bytes = _generate_with_provider(tts_provider, text, voice)
            logger.info(f"TTS generated: provider={tts_provider}, voice={voice}, attempt={attempt + 1}")
            return base64.b64encode(audio_bytes).decode('utf-8')
        except Exception as e:
            if attempt < max_attempts - 1:
                # Clamp the index so raising _MAX_RETRIES beyond the number of
                # configured delays reuses the last delay instead of failing.
                delay = _RETRY_DELAYS[min(attempt, len(_RETRY_DELAYS) - 1)]
                logger.warning(f"TTS attempt {attempt + 1} failed (provider={tts_provider}): {e} — retrying in {delay}s")
                time.sleep(delay)
            else:
                logger.warning(f"TTS failed (provider={tts_provider}): {e} — trying fallback")

    # ── Fallback to alternate provider ───────────────────────────────
    fallback_id = _FALLBACK_CHAIN.get(tts_provider)
    if fallback_id:
        logger.info(f"TTS falling back: {tts_provider} → {fallback_id}")
        try:
            fallback_provider = get_provider(fallback_id)
            fallback_voice = fallback_provider.get_default_voice()
            audio_bytes = _generate_with_provider(fallback_id, text, fallback_voice)
            logger.info(f"TTS fallback OK: provider={fallback_id}, voice={fallback_voice}")
            return base64.b64encode(audio_bytes).decode('utf-8')
        except Exception as fb_err:
            logger.error(f"TTS fallback also failed (provider={fallback_id}): {fb_err}")

    logger.error(f"TTS generation failed — all providers exhausted for: '{text[:60]}'")
    return None


def _default_voice_for(tts_provider):
    """Return the provider's default voice, or 'M1' if it cannot be looked up.

    Lookup problems are deliberately not raised here: the same provider is
    resolved again inside the retry/fallback flow, which reports them.
    """
    try:
        return get_provider(tts_provider).get_default_voice() or 'M1'
    except Exception:
        return 'M1'
|
|
261
|
+
|
|
262
|
+
|
|
263
|
+
__all__ = [
|
|
264
|
+
'get_groq_client',
|
|
265
|
+
'generate_groq_tts',
|
|
266
|
+
'generate_tts_chunked',
|
|
267
|
+
'generate_tts_b64',
|
|
268
|
+
'get_provider',
|
|
269
|
+
'list_providers',
|
|
270
|
+
]
|