openvoiceui 1.0.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/.env.example +104 -0
- package/Dockerfile +30 -0
- package/LICENSE +21 -0
- package/README.md +638 -0
- package/SETUP.md +360 -0
- package/app.py +232 -0
- package/auto-approve-devices.js +111 -0
- package/cli/index.js +372 -0
- package/config/__init__.py +4 -0
- package/config/default.yaml +43 -0
- package/config/flags.yaml +67 -0
- package/config/loader.py +203 -0
- package/config/providers.yaml +71 -0
- package/config/speech_normalization.yaml +182 -0
- package/config/theme.json +4 -0
- package/data/greetings.json +25 -0
- package/default-pages/ai-image-creator.html +915 -0
- package/default-pages/bulk-image-uploader.html +492 -0
- package/default-pages/desktop.html +2865 -0
- package/default-pages/file-explorer.html +854 -0
- package/default-pages/interactive-map.html +655 -0
- package/default-pages/style-guide.html +1005 -0
- package/default-pages/website-setup.html +1623 -0
- package/deploy/openclaw/Dockerfile +46 -0
- package/deploy/openvoiceui.service +30 -0
- package/deploy/setup-nginx.sh +50 -0
- package/deploy/setup-sudo.sh +306 -0
- package/deploy/skill-runner/Dockerfile +19 -0
- package/deploy/skill-runner/requirements.txt +14 -0
- package/deploy/skill-runner/server.py +269 -0
- package/deploy/supertonic/Dockerfile +22 -0
- package/deploy/supertonic/server.py +79 -0
- package/docker-compose.pinokio.yml +11 -0
- package/docker-compose.yml +59 -0
- package/greetings.json +25 -0
- package/index.html +65 -0
- package/inject-device-identity.js +142 -0
- package/package.json +82 -0
- package/profiles/default.json +114 -0
- package/profiles/manager.py +354 -0
- package/profiles/schema.json +337 -0
- package/prompts/voice-system-prompt.md +149 -0
- package/providers/__init__.py +39 -0
- package/providers/base.py +63 -0
- package/providers/llm/__init__.py +12 -0
- package/providers/llm/base.py +71 -0
- package/providers/llm/clawdbot_provider.py +112 -0
- package/providers/llm/zai_provider.py +115 -0
- package/providers/registry.py +320 -0
- package/providers/stt/__init__.py +12 -0
- package/providers/stt/base.py +58 -0
- package/providers/stt/webspeech_provider.py +49 -0
- package/providers/stt/whisper_provider.py +100 -0
- package/providers/tts/__init__.py +20 -0
- package/providers/tts/base.py +91 -0
- package/providers/tts/groq_provider.py +74 -0
- package/providers/tts/supertonic_provider.py +72 -0
- package/requirements.txt +38 -0
- package/routes/__init__.py +10 -0
- package/routes/admin.py +515 -0
- package/routes/canvas.py +1315 -0
- package/routes/chat.py +51 -0
- package/routes/conversation.py +2158 -0
- package/routes/elevenlabs_hybrid.py +306 -0
- package/routes/greetings.py +98 -0
- package/routes/icons.py +279 -0
- package/routes/image_gen.py +364 -0
- package/routes/instructions.py +190 -0
- package/routes/music.py +838 -0
- package/routes/onboarding.py +43 -0
- package/routes/pi.py +62 -0
- package/routes/profiles.py +215 -0
- package/routes/report_issue.py +68 -0
- package/routes/static_files.py +533 -0
- package/routes/suno.py +664 -0
- package/routes/theme.py +81 -0
- package/routes/transcripts.py +199 -0
- package/routes/vision.py +348 -0
- package/routes/workspace.py +288 -0
- package/server.py +1510 -0
- package/services/__init__.py +1 -0
- package/services/auth.py +143 -0
- package/services/canvas_versioning.py +239 -0
- package/services/db_pool.py +107 -0
- package/services/gateway.py +16 -0
- package/services/gateway_manager.py +333 -0
- package/services/gateways/__init__.py +12 -0
- package/services/gateways/base.py +110 -0
- package/services/gateways/compat.py +264 -0
- package/services/gateways/openclaw.py +1134 -0
- package/services/health.py +100 -0
- package/services/memory_client.py +455 -0
- package/services/paths.py +26 -0
- package/services/speech_normalizer.py +285 -0
- package/services/tts.py +270 -0
- package/setup-config.js +262 -0
- package/sounds/air_horn.mp3 +0 -0
- package/sounds/bruh.mp3 +0 -0
- package/sounds/crowd_cheer.mp3 +0 -0
- package/sounds/gunshot.mp3 +0 -0
- package/sounds/impact.mp3 +0 -0
- package/sounds/lets_go.mp3 +0 -0
- package/sounds/record_stop.mp3 +0 -0
- package/sounds/rewind.mp3 +0 -0
- package/sounds/sad_trombone.mp3 +0 -0
- package/sounds/scratch_long.mp3 +0 -0
- package/sounds/yeah.mp3 +0 -0
- package/src/adapters/ClawdBotAdapter.js +264 -0
- package/src/adapters/_template.js +133 -0
- package/src/adapters/elevenlabs-classic.js +841 -0
- package/src/adapters/elevenlabs-hybrid.js +812 -0
- package/src/adapters/hume-evi.js +676 -0
- package/src/admin.html +1339 -0
- package/src/app.js +8802 -0
- package/src/core/Config.js +173 -0
- package/src/core/EmotionEngine.js +307 -0
- package/src/core/EventBridge.js +180 -0
- package/src/core/EventBus.js +117 -0
- package/src/core/VoiceSession.js +607 -0
- package/src/face/BaseFace.js +259 -0
- package/src/face/EyeFace.js +208 -0
- package/src/face/HaloSmokeFace.js +509 -0
- package/src/face/manifest.json +27 -0
- package/src/face/previews/eyes.svg +16 -0
- package/src/face/previews/orb.svg +29 -0
- package/src/features/MusicPlayer.js +620 -0
- package/src/features/Soundboard.js +128 -0
- package/src/providers/DeepgramSTT.js +472 -0
- package/src/providers/DeepgramStreamingSTT.js +766 -0
- package/src/providers/GroqSTT.js +559 -0
- package/src/providers/TTSPlayer.js +323 -0
- package/src/providers/WebSpeechSTT.js +479 -0
- package/src/providers/tts/BaseTTSProvider.js +81 -0
- package/src/providers/tts/HumeProvider.js +77 -0
- package/src/providers/tts/SupertonicProvider.js +174 -0
- package/src/providers/tts/index.js +140 -0
- package/src/shell/adapter-registry.js +154 -0
- package/src/shell/caller-bridge.js +35 -0
- package/src/shell/camera-bridge.js +28 -0
- package/src/shell/canvas-bridge.js +32 -0
- package/src/shell/commercial-bridge.js +44 -0
- package/src/shell/face-bridge.js +44 -0
- package/src/shell/music-bridge.js +60 -0
- package/src/shell/orchestrator.js +233 -0
- package/src/shell/profile-discovery.js +303 -0
- package/src/shell/sounds-bridge.js +28 -0
- package/src/shell/transcript-bridge.js +61 -0
- package/src/shell/waveform-bridge.js +33 -0
- package/src/styles/base.css +2862 -0
- package/src/styles/face.css +417 -0
- package/src/styles/pi-overrides.css +89 -0
- package/src/styles/theme-dark.css +67 -0
- package/src/test-tts.html +175 -0
- package/src/ui/AppShell.js +544 -0
- package/src/ui/ProfileSwitcher.js +228 -0
- package/src/ui/SessionControl.js +240 -0
- package/src/ui/face/FacePicker.js +195 -0
- package/src/ui/face/FaceRenderer.js +309 -0
- package/src/ui/settings/PlaylistEditor.js +366 -0
- package/src/ui/settings/SettingsPanel.css +684 -0
- package/src/ui/settings/SettingsPanel.js +419 -0
- package/src/ui/settings/TTSVoicePreview.js +210 -0
- package/src/ui/themes/ThemeManager.js +213 -0
- package/src/ui/visualizers/BaseVisualizer.js +29 -0
- package/src/ui/visualizers/PartyFXVisualizer.css +291 -0
- package/src/ui/visualizers/PartyFXVisualizer.js +637 -0
- package/static/emulators/jsdos/js-dos.css +1 -0
- package/static/emulators/jsdos/js-dos.js +22 -0
- package/static/favicon.svg +55 -0
- package/static/icons/apple-touch-icon.png +0 -0
- package/static/icons/favicon-32.png +0 -0
- package/static/icons/icon-192.png +0 -0
- package/static/icons/icon-512.png +0 -0
- package/static/install.html +449 -0
- package/static/manifest.json +26 -0
- package/static/sw.js +21 -0
- package/tts_providers/__init__.py +136 -0
- package/tts_providers/base_provider.py +319 -0
- package/tts_providers/groq_provider.py +155 -0
- package/tts_providers/hume_provider.py +226 -0
- package/tts_providers/providers_config.json +119 -0
- package/tts_providers/qwen3_provider.py +371 -0
- package/tts_providers/resemble_provider.py +315 -0
- package/tts_providers/supertonic_provider.py +557 -0
- package/tts_providers/supertonic_tts.py +399 -0
|
@@ -0,0 +1,479 @@
|
|
|
1
|
+
/**
|
|
2
|
+
* WebSpeechSTT — Browser-native speech recognition provider (Web Speech API)
|
|
3
|
+
* Free, no API keys needed.
|
|
4
|
+
*
|
|
5
|
+
* Usage:
|
|
6
|
+
* import { WebSpeechSTT, WakeWordDetector } from './WebSpeechSTT.js';
|
|
7
|
+
*
|
|
8
|
+
* const stt = new WebSpeechSTT();
|
|
9
|
+
* stt.onResult = (text) => console.log('Heard:', text);
|
|
10
|
+
* await stt.start();
|
|
11
|
+
*/
|
|
12
|
+
|
|
13
|
+
// Detect iOS — affects mic stream lifetime and recognition restart timing.
// The `!window.MSStream` guard excludes legacy IE/Edge, whose user agents
// could spoof iPad/iPhone strings.
const _isIOS = /iPad|iPhone|iPod/.test(navigator.userAgent) && !window.MSStream;
|
15
|
+
|
|
16
|
+
// Report genuine STT failures to the backend so session monitoring can track
// them. Routine Chrome events (no-speech, aborted) are never sent here —
// callers filter those out before invoking this helper.
function _reportSTTError(error, message, source = 'stt') {
  const payload = { error, message, provider: 'webspeech', source };
  try {
    const request = fetch('/api/stt-events', {
      method: 'POST',
      headers: { 'Content-Type': 'application/json' },
      body: JSON.stringify(payload),
    });
    request.catch(() => {}); // fire-and-forget, never block STT
  } catch (_) {}
}
|
27
|
+
|
|
28
|
+
// ===== WEB SPEECH STT =====
// Browser-native speech recognition (free, no API keys needed).
//
// State machine overview (grounded in the handlers below):
//   - isListening: user-facing on/off; onend auto-restarts while true.
//   - isProcessing: true while the AI turn is in flight OR while muted for
//     TTS; onresult drops all audio and onend will not restart.
//   - _micMuted / _pttHolding: push-to-talk gating flags.
// Final transcripts are ACCUMULATED across Chrome's multiple isFinal events
// and only dispatched to onResult after silenceDelayMs of no results.
class WebSpeechSTT {
  constructor() {
    this.recognition = null;
    this.isListening = false;
    this.onResult = null;   // called with the full accumulated utterance
    this.onError = null;    // called with an error code string
    this.onListenFinal = null; // Listen panel hook — called with each final transcript
    this.onInterim = null; // Listen panel hook — interim text (not wired in this file; assigned externally — TODO confirm caller)

    // Silence detection for continuous listening
    this.silenceTimer = null;
    this.silenceDelayMs = 3500; // 3.5s — 3s was cutting people off mid-sentence
    this.accumulatedText = '';
    this.isProcessing = false;

    // PTT support
    this._micMuted = false;
    this._pttHolding = false;

    // Keep mic stream alive during active listening (critical on iOS —
    // releasing and re-acquiring the stream can re-trigger permission prompts)
    this._micStream = null;

    // Store constructor ref — recognition instance is created on first start(),
    // NOT in constructor. Having two SpeechRecognition instances (even if only
    // one is started) causes Chrome to route audio incorrectly, breaking wake word.
    this._SpeechRecognition = window.SpeechRecognition || window.webkitSpeechRecognition;
    if (!this._SpeechRecognition) {
      console.warn('Web Speech API not supported in this browser');
    }
  }

  // Create the recognition instance on first use and wire up all handlers.
  // Called once from start(), then the instance persists forever.
  // Monkey-patches in app.js poll for stt.recognition and apply within 200ms.
  // Returns true when an instance exists, false when the API is unsupported.
  _ensureRecognition() {
    if (this.recognition) return true;
    if (!this._SpeechRecognition) return false;

    this.recognition = new this._SpeechRecognition();
    this.recognition.continuous = true;
    this.recognition.interimResults = true;
    this.recognition.lang = 'en-US';
    this.recognition.maxAlternatives = 1;

    this.recognition.onresult = (event) => {
      if (this.isProcessing) return;
      if (this._micMuted) return; // PTT mode — mic should be silent

      // ANY result (interim or final) means the user is still speaking.
      // Reset the silence timer on every event so we never cut off mid-speech.
      if (this.silenceTimer) {
        clearTimeout(this.silenceTimer);
        this.silenceTimer = null;
      }

      // Collect only the finalized portions of this event batch.
      let finalTranscript = '';
      for (let i = event.resultIndex; i < event.results.length; i++) {
        if (event.results[i].isFinal) {
          finalTranscript += event.results[i][0].transcript;
        }
      }

      if (finalTranscript.trim()) {
        // APPEND — user can speak across multiple Chrome final results
        this.accumulatedText = this.accumulatedText
          ? this.accumulatedText + ' ' + finalTranscript.trim()
          : finalTranscript.trim();
        console.log('STT Final:', finalTranscript, '| Accumulated:', this.accumulatedText);
        // Listen panel hook
        if (this.onListenFinal) this.onListenFinal(finalTranscript.trim());
      }

      // Start/restart silence timer — only fires when Chrome stops sending ANY results
      if (this.accumulatedText) {
        this.silenceTimer = setTimeout(() => {
          const text = this.accumulatedText.trim();
          // Filter out garbage: require at least 2 alphanumeric characters
          // (drops punctuation-only and single-letter transcripts).
          const meaningful = text.replace(/[^a-zA-Z0-9]/g, '');
          if (text && meaningful.length >= 2 && !this.isProcessing) {
            console.log('Sending to AI:', text);
            this.isProcessing = true;
            if (this.onResult) this.onResult(text);
            this.accumulatedText = '';
          } else if (text) {
            console.log('STT filtered garbage:', text);
            this.accumulatedText = '';
          }
        }, this.silenceDelayMs);
      }
    };

    this.recognition.onerror = (event) => {
      // no-speech / aborted are routine Chrome behaviour — never reported.
      if (event.error === 'no-speech' || event.error === 'aborted') {
        console.log('STT:', event.error, '(normal, will auto-restart)');
        return;
      }
      if (event.error === 'audio-capture') {
        console.error('STT: audio-capture — microphone hardware unavailable');
        _reportSTTError('audio-capture', 'Microphone hardware unavailable', 'stt');
        if (this.onError) this.onError('audio-capture');
        return;
      }
      console.error('STT Error:', event.error);
      _reportSTTError(event.error, `STT recognition error: ${event.error}`, 'stt');
      if (this.onError) this.onError(event.error);
    };

    this.recognition.onend = () => {
      // Auto-restart continuous listening unless paused/muted. iOS needs a
      // longer settle delay before the engine will accept start() again.
      if (this.isListening && !this.isProcessing && !this._micMuted) {
        const restartDelay = _isIOS ? 500 : 300;
        setTimeout(() => {
          if (this.isListening && !this.isProcessing && !this._micMuted) {
            try {
              this.recognition.start();
            } catch (e) {
              // Already started
            }
          }
        }, restartDelay);
      }
    };

    console.log('STT: SpeechRecognition instance created');
    return true;
  }

  // True when the browser exposes (webkit)SpeechRecognition.
  isSupported() {
    return !!this._SpeechRecognition;
  }

  // Begin continuous listening. Acquires (and holds) the mic stream first —
  // see _micStream note in the constructor. Resolves to true on success,
  // false on unsupported API, mic failure, or engine start failure.
  // Maps getUserMedia failures to 'no-device' / 'not-allowed' for onError.
  async start() {
    if (this._micMuted) return false;
    if (!this._ensureRecognition()) {
      console.error('Speech recognition not supported');
      return false;
    }

    // Request mic permission and keep the stream alive.
    try {
      if (!this._micStream) {
        this._micStream = await navigator.mediaDevices.getUserMedia({ audio: true });
      }
    } catch (e) {
      console.error('Mic access failed:', e.name, e.message);
      if (e.name === 'NotFoundError' || e.name === 'DevicesNotFoundError') {
        if (this.onError) this.onError('no-device');
      } else {
        if (this.onError) this.onError('not-allowed');
      }
      return false;
    }

    try {
      this.isListening = true;
      this.recognition.start();
      console.log('STT started');
      return true;
    } catch (e) {
      console.error('Failed to start STT:', e);
      this.isListening = false;
      return false;
    }
  }

  // Fully stop listening: clears timers, resets all mode flags, stops the
  // engine, and releases the held mic stream.
  stop() {
    if (this.silenceTimer) {
      clearTimeout(this.silenceTimer);
      this.silenceTimer = null;
    }
    if (this.recognition) {
      this.isListening = false;
      this.isProcessing = false;
      this._micMuted = false;
      this._pttHolding = false;
      this.recognition.stop();
      console.log('STT stopped');
    }
    // Release the mic stream when fully stopped
    if (this._micStream) {
      this._micStream.getTracks().forEach(t => t.stop());
      this._micStream = null;
    }
  }

  // Clear the processing flag and any buffered text so the next utterance
  // starts fresh (called after the AI turn completes — TODO confirm caller).
  resetProcessing() {
    this.isProcessing = false;
    this.accumulatedText = '';
  }

  /** Alias for mute() — VoiceConversation calls pause() during greeting. */
  pause() {
    this.mute();
  }

  /**
   * Mute STT immediately — called when TTS starts speaking.
   * Sets isProcessing=true so onresult ignores all incoming audio,
   * and clears any pending silence timer so queued echo text is discarded.
   * onend will not restart the engine while muted, stopping the abort loop.
   */
  mute() {
    this.isProcessing = true;
    if (this.silenceTimer) {
      clearTimeout(this.silenceTimer);
      this.silenceTimer = null;
    }
    this.accumulatedText = '';
  }

  /**
   * Resume STT after TTS finishes — clears mute flag and explicitly
   * restarts the recognition engine (which may have stopped during mute).
   * Called by VoiceSession._resumeListening() after the settling delay.
   */
  resume() {
    this.isProcessing = false;
    this.accumulatedText = '';
    if (this.silenceTimer) {
      clearTimeout(this.silenceTimer);
      this.silenceTimer = null;
    }
    if (this.isListening && !this._micMuted) {
      try {
        this.recognition.start();
      } catch (e) {
        // Already running — fine
      }
    }
  }

  // --- PTT helpers (called from PTT code in app.js) ---

  /**
   * PTT activate — start listening for push-to-talk.
   * Called when user presses the PTT button. Resets all buffering state so
   * the hold captures a clean utterance.
   */
  pttActivate() {
    this._pttHolding = true;
    this._micMuted = false;
    this.isProcessing = false;
    this.accumulatedText = '';
    if (this.silenceTimer) { clearTimeout(this.silenceTimer); this.silenceTimer = null; }

    // Start recognition fresh
    if (!this._ensureRecognition()) return;
    try {
      this.recognition.start();
    } catch (e) {
      // Already running — fine
    }
  }

  /**
   * PTT release — stop listening and force-send transcript.
   * Called when user releases the PTT button. Sends immediately if Chrome
   * already finalized text during the hold; otherwise waits 400ms for the
   * post-stop final result before sending.
   */
  pttRelease() {
    this._pttHolding = false;
    this._micMuted = true;
    if (this.silenceTimer) { clearTimeout(this.silenceTimer); this.silenceTimer = null; }

    // Check if Chrome already finalized text during the hold
    const immediate = this.accumulatedText.trim();
    if (immediate && this.onResult) {
      console.log('PTT release — sending:', immediate);
      this.isProcessing = true;
      this.onResult(immediate);
      this.accumulatedText = '';
    }

    // Stop recognition — Chrome finalizes any pending speech as isFinal
    // (muted state prevents onend restart)
    if (this.recognition) {
      try { this.recognition.stop(); } catch (e) {}
    }

    // If nothing was finalized during hold, wait for Chrome's post-stop results.
    // Chrome fires onresult with isFinal=true when recognition.stop() is called,
    // but the event is async. Give it time to arrive, then send.
    if (!immediate) {
      setTimeout(() => {
        if (this.silenceTimer) { clearTimeout(this.silenceTimer); this.silenceTimer = null; }
        const text = this.accumulatedText.trim();
        if (text && this.onResult) {
          console.log('PTT release (delayed) — sending:', text);
          this.isProcessing = true;
          this.onResult(text);
        }
        this.accumulatedText = '';
      }, 400);
    }
  }

  /**
   * PTT mute — stop recognition and discard buffered text.
   * Called when PTT mode is toggled ON (mic off by default).
   */
  pttMute() {
    this._pttHolding = false;
    this._micMuted = true;
    this.isProcessing = true;
    this.accumulatedText = '';
    if (this.silenceTimer) { clearTimeout(this.silenceTimer); this.silenceTimer = null; }

    if (this.recognition) {
      try { this.recognition.stop(); } catch (e) {}
    }
  }

  /**
   * PTT unmute — resume continuous listening.
   * Called when PTT mode is toggled OFF.
   */
  pttUnmute() {
    this._micMuted = false;
    this._pttHolding = false;
    this.isProcessing = false;
    this.accumulatedText = '';

    if (this.isListening && this.recognition) {
      try { this.recognition.start(); } catch (e) {}
    }
  }
}
|
355
|
+
|
|
356
|
+
// ===== WAKE WORD DETECTOR =====
// Passive listener that fires a callback when a configured wake phrase is
// heard. getUserMedia() is invoked once before recognition.start() — without
// a prior mic permission grant, Chrome's SpeechRecognition aborts every
// cycle and never captures speech.
class WakeWordDetector {
  constructor() {
    this.recognition = null;
    this.isListening = false;
    this.onWakeWordDetected = null;
    this._micPermissionGranted = false;

    // Phrases that trigger the callback (overridden per-profile via applyProfile)
    this.wakeWords = ['wake up'];

    // Bail out cleanly on browsers without the Web Speech API
    const Recognition = window.SpeechRecognition || window.webkitSpeechRecognition;
    if (!Recognition) {
      console.warn('Web Speech API not supported in this browser - wake word detection unavailable');
      return;
    }

    const rec = new Recognition();
    rec.continuous = true;
    rec.interimResults = true; // Must be true — Chrome produces nothing without it
    rec.lang = 'en-US';

    rec.onresult = (event) => {
      // Scan every new result — interim AND final — for a wake phrase
      for (let idx = event.resultIndex; idx < event.results.length; idx++) {
        const heard = event.results[idx][0].transcript.toLowerCase();
        const stage = event.results[idx].isFinal ? 'final' : 'interim';
        console.log(`Wake word detector heard (${stage}):`, heard);

        const matched = this.wakeWords.some((phrase) => heard.includes(phrase));
        if (!matched) continue;

        console.log('Wake word detected!');
        if (this.onWakeWordDetected) {
          this.onWakeWordDetected();
        }
        return; // First hit wins — stop scanning
      }
    };

    rec.onerror = (event) => {
      // no-speech / aborted are routine during passive listening — ignore
      if (event.error === 'no-speech' || event.error === 'aborted') return;
      console.warn('Wake word detector error:', event.error);
      _reportSTTError(event.error, `Wake word error: ${event.error}`, 'wake_word');
    };

    rec.onend = () => {
      // Keep listening: restart after 300ms so Chrome has time to release
      // its speech-service connection before we spin it up again.
      if (!this.isListening) return;
      setTimeout(() => {
        if (!this.isListening) return;
        try {
          rec.start();
        } catch (e) {
          // Already started
        }
      }, 300);
    };

    this.recognition = rec;
  }

  // True when the constructor successfully created a recognition instance.
  isSupported() {
    return this.recognition !== null;
  }

  // Start passive listening. Resolves to true on success, false when the API
  // is unsupported, mic permission is denied, or the engine fails to start.
  async start() {
    if (!this.recognition) {
      console.error('Speech recognition not supported');
      return false;
    }

    // One-time permission grab. The stream is released immediately — we only
    // need the grant; holding it can starve SpeechRecognition of mic access.
    if (!this._micPermissionGranted) {
      try {
        const stream = await navigator.mediaDevices.getUserMedia({ audio: true });
        stream.getTracks().forEach((track) => track.stop()); // Release immediately
        this._micPermissionGranted = true;
        console.log('Wake word: mic permission granted');
      } catch (e) {
        console.error('Wake word: mic access failed:', e.name, e.message);
        return false;
      }
    }

    try {
      this.isListening = true;
      this.recognition.start();
      console.log('Wake word detector started');
      return true;
    } catch (e) {
      console.error('Failed to start wake word detector:', e);
      this.isListening = false;
      return false;
    }
  }

  // Stop passive listening; onend sees isListening=false and will not restart.
  stop() {
    if (!this.recognition) return;
    this.isListening = false;
    this.recognition.stop();
    console.log('Wake word detector stopped');
  }

  // Flip listening state. Resolves to the new state (true = now listening).
  async toggle() {
    if (this.isListening) {
      this.stop();
      return false;
    }
    return await this.start();
  }
}
|
478
|
+
|
|
479
|
+
export { WebSpeechSTT, WakeWordDetector };
|
|
@@ -0,0 +1,81 @@
|
|
|
1
|
+
/**
 * Base TTS Provider Interface
 *
 * Abstract contract shared by every TTS provider. Concrete providers must
 * override init() and speak(); the remaining methods supply default
 * behaviour (voice bookkeeping, playback flag, cost reporting, cleanup).
 */
export class BaseTTSProvider {
  constructor(config = {}) {
    this.config = config;
    this.name = 'base';
    this.voices = [];
    this.currentVoice = null;
    this.isPlaying = false;
  }

  /**
   * Initialize the provider. Abstract — subclasses must override.
   * @returns {Promise<boolean>} Success
   * @throws {Error} always, on the base class
   */
  async init() {
    throw new Error('init() must be implemented');
  }

  /**
   * List the voices this provider offers.
   * @returns {string[]} Array of voice names
   */
  getVoices() {
    return this.voices;
  }

  /**
   * Select the active voice. Names not present in this.voices are
   * silently ignored, leaving the current selection untouched.
   * @param {string} voiceName
   */
  setVoice(voiceName) {
    const known = this.voices.includes(voiceName);
    if (known) {
      this.currentVoice = voiceName;
    }
  }

  /**
   * Synthesize speech from text. Abstract — subclasses must override.
   * @param {string} text - Text to speak
   * @param {object} options - Optional parameters
   * @returns {Promise<AudioBuffer|HTMLAudioElement|null>}
   * @throws {Error} always, on the base class
   */
  async speak(text, options = {}) {
    throw new Error('speak() must be implemented');
  }

  /** Stop current playback (clears the playing flag). */
  stop() {
    this.isPlaying = false;
  }

  /**
   * Whether the provider is initialized and usable.
   * @returns {boolean} false on the base class
   */
  isReady() {
    return false;
  }

  /**
   * Price of synthesis per minute of audio (0 = free).
   * @returns {number}
   */
  getCostPerMinute() {
    return 0;
  }

  /** Release resources; the default implementation just stops playback. */
  destroy() {
    this.stop();
  }
}
|
80
|
+
|
|
81
|
+
export default BaseTTSProvider;
|
|
@@ -0,0 +1,77 @@
|
|
|
1
|
+
/**
|
|
2
|
+
* Hume TTS Provider
|
|
3
|
+
* Hume EVI handles TTS internally - this is a passthrough provider
|
|
4
|
+
* that works with HumeAdapter
|
|
5
|
+
*/
|
|
6
|
+
import { BaseTTSProvider } from './BaseTTSProvider.js';
|
|
7
|
+
|
|
8
|
+
/**
 * Hume TTS Provider
 *
 * Hume EVI performs speech synthesis inside its own WebSocket session, so
 * this class is a thin passthrough kept for interface compatibility with the
 * other TTS providers. It pairs with HumeAdapter, which owns the audio path.
 */
export class HumeProvider extends BaseTTSProvider {
  constructor(config = {}) {
    super(config);
    this.name = 'hume';
    this.serverUrl = config.serverUrl || '';

    const humeCfg = config.hume || {};
    this.configId = humeCfg.configId || '';
    this.voiceId = humeCfg.voiceId || '';
    this.voiceName = humeCfg.voiceName || 'Default';

    // Hume exposes only the single voice configured on the backend
    this.voices = [this.voiceName];
    this.currentVoice = this.voiceName;

    // Audio is produced by the EVI socket — these hooks let callers observe it
    this.onSpeaking = null;
    this.onListening = null;
  }

  /**
   * Fetch the EVI config id from the backend. Failure is non-fatal:
   * the provider still reports ready with whatever configId it has.
   * @returns {Promise<boolean>} always true
   */
  async init() {
    console.log('[Hume] Initializing...');
    try {
      const response = await fetch(`${this.serverUrl}/api/hume/token`);
      if (response.ok) {
        const data = await response.json();
        if (data.config_id) {
          this.configId = data.config_id;
        }
      }
    } catch (error) {
      console.warn('[Hume] Could not fetch config:', error);
    }

    console.log('[Hume] Ready');
    return true;
  }

  /**
   * Interface-compatibility stub. Real synthesis happens over the EVI
   * WebSocket connection; a direct call only warns and returns false.
   */
  async speak(text, options = {}) {
    console.warn('[Hume] speak() called - Hume normally handles TTS internally via EVI');
    return false;
  }

  /** @returns {string[]} the single configured voice name */
  getVoices() {
    return this.voices;
  }

  /**
   * Record the selection locally; the effective voice lives in the
   * backend EVI configuration.
   */
  setVoice(voiceName) {
    console.log('[Hume] Voice configured on backend:', voiceName);
    this.currentVoice = voiceName;
  }

  /** Always ready — connection setup is the adapter's job. */
  isReady() {
    return true;
  }

  /** @returns {number} Hume EVI price per minute */
  getCostPerMinute() {
    return 0.032; // $0.032/minute
  }

  /** No resources held — the EVI socket lifecycle belongs to the adapter. */
  destroy() {
  }
}
|
76
|
+
|
|
77
|
+
export default HumeProvider;
|