openvoiceui 1.0.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/.env.example +104 -0
- package/Dockerfile +30 -0
- package/LICENSE +21 -0
- package/README.md +638 -0
- package/SETUP.md +360 -0
- package/app.py +232 -0
- package/auto-approve-devices.js +111 -0
- package/cli/index.js +372 -0
- package/config/__init__.py +4 -0
- package/config/default.yaml +43 -0
- package/config/flags.yaml +67 -0
- package/config/loader.py +203 -0
- package/config/providers.yaml +71 -0
- package/config/speech_normalization.yaml +182 -0
- package/config/theme.json +4 -0
- package/data/greetings.json +25 -0
- package/default-pages/ai-image-creator.html +915 -0
- package/default-pages/bulk-image-uploader.html +492 -0
- package/default-pages/desktop.html +2865 -0
- package/default-pages/file-explorer.html +854 -0
- package/default-pages/interactive-map.html +655 -0
- package/default-pages/style-guide.html +1005 -0
- package/default-pages/website-setup.html +1623 -0
- package/deploy/openclaw/Dockerfile +46 -0
- package/deploy/openvoiceui.service +30 -0
- package/deploy/setup-nginx.sh +50 -0
- package/deploy/setup-sudo.sh +306 -0
- package/deploy/skill-runner/Dockerfile +19 -0
- package/deploy/skill-runner/requirements.txt +14 -0
- package/deploy/skill-runner/server.py +269 -0
- package/deploy/supertonic/Dockerfile +22 -0
- package/deploy/supertonic/server.py +79 -0
- package/docker-compose.pinokio.yml +11 -0
- package/docker-compose.yml +59 -0
- package/greetings.json +25 -0
- package/index.html +65 -0
- package/inject-device-identity.js +142 -0
- package/package.json +82 -0
- package/profiles/default.json +114 -0
- package/profiles/manager.py +354 -0
- package/profiles/schema.json +337 -0
- package/prompts/voice-system-prompt.md +149 -0
- package/providers/__init__.py +39 -0
- package/providers/base.py +63 -0
- package/providers/llm/__init__.py +12 -0
- package/providers/llm/base.py +71 -0
- package/providers/llm/clawdbot_provider.py +112 -0
- package/providers/llm/zai_provider.py +115 -0
- package/providers/registry.py +320 -0
- package/providers/stt/__init__.py +12 -0
- package/providers/stt/base.py +58 -0
- package/providers/stt/webspeech_provider.py +49 -0
- package/providers/stt/whisper_provider.py +100 -0
- package/providers/tts/__init__.py +20 -0
- package/providers/tts/base.py +91 -0
- package/providers/tts/groq_provider.py +74 -0
- package/providers/tts/supertonic_provider.py +72 -0
- package/requirements.txt +38 -0
- package/routes/__init__.py +10 -0
- package/routes/admin.py +515 -0
- package/routes/canvas.py +1315 -0
- package/routes/chat.py +51 -0
- package/routes/conversation.py +2158 -0
- package/routes/elevenlabs_hybrid.py +306 -0
- package/routes/greetings.py +98 -0
- package/routes/icons.py +279 -0
- package/routes/image_gen.py +364 -0
- package/routes/instructions.py +190 -0
- package/routes/music.py +838 -0
- package/routes/onboarding.py +43 -0
- package/routes/pi.py +62 -0
- package/routes/profiles.py +215 -0
- package/routes/report_issue.py +68 -0
- package/routes/static_files.py +533 -0
- package/routes/suno.py +664 -0
- package/routes/theme.py +81 -0
- package/routes/transcripts.py +199 -0
- package/routes/vision.py +348 -0
- package/routes/workspace.py +288 -0
- package/server.py +1510 -0
- package/services/__init__.py +1 -0
- package/services/auth.py +143 -0
- package/services/canvas_versioning.py +239 -0
- package/services/db_pool.py +107 -0
- package/services/gateway.py +16 -0
- package/services/gateway_manager.py +333 -0
- package/services/gateways/__init__.py +12 -0
- package/services/gateways/base.py +110 -0
- package/services/gateways/compat.py +264 -0
- package/services/gateways/openclaw.py +1134 -0
- package/services/health.py +100 -0
- package/services/memory_client.py +455 -0
- package/services/paths.py +26 -0
- package/services/speech_normalizer.py +285 -0
- package/services/tts.py +270 -0
- package/setup-config.js +262 -0
- package/sounds/air_horn.mp3 +0 -0
- package/sounds/bruh.mp3 +0 -0
- package/sounds/crowd_cheer.mp3 +0 -0
- package/sounds/gunshot.mp3 +0 -0
- package/sounds/impact.mp3 +0 -0
- package/sounds/lets_go.mp3 +0 -0
- package/sounds/record_stop.mp3 +0 -0
- package/sounds/rewind.mp3 +0 -0
- package/sounds/sad_trombone.mp3 +0 -0
- package/sounds/scratch_long.mp3 +0 -0
- package/sounds/yeah.mp3 +0 -0
- package/src/adapters/ClawdBotAdapter.js +264 -0
- package/src/adapters/_template.js +133 -0
- package/src/adapters/elevenlabs-classic.js +841 -0
- package/src/adapters/elevenlabs-hybrid.js +812 -0
- package/src/adapters/hume-evi.js +676 -0
- package/src/admin.html +1339 -0
- package/src/app.js +8802 -0
- package/src/core/Config.js +173 -0
- package/src/core/EmotionEngine.js +307 -0
- package/src/core/EventBridge.js +180 -0
- package/src/core/EventBus.js +117 -0
- package/src/core/VoiceSession.js +607 -0
- package/src/face/BaseFace.js +259 -0
- package/src/face/EyeFace.js +208 -0
- package/src/face/HaloSmokeFace.js +509 -0
- package/src/face/manifest.json +27 -0
- package/src/face/previews/eyes.svg +16 -0
- package/src/face/previews/orb.svg +29 -0
- package/src/features/MusicPlayer.js +620 -0
- package/src/features/Soundboard.js +128 -0
- package/src/providers/DeepgramSTT.js +472 -0
- package/src/providers/DeepgramStreamingSTT.js +766 -0
- package/src/providers/GroqSTT.js +559 -0
- package/src/providers/TTSPlayer.js +323 -0
- package/src/providers/WebSpeechSTT.js +479 -0
- package/src/providers/tts/BaseTTSProvider.js +81 -0
- package/src/providers/tts/HumeProvider.js +77 -0
- package/src/providers/tts/SupertonicProvider.js +174 -0
- package/src/providers/tts/index.js +140 -0
- package/src/shell/adapter-registry.js +154 -0
- package/src/shell/caller-bridge.js +35 -0
- package/src/shell/camera-bridge.js +28 -0
- package/src/shell/canvas-bridge.js +32 -0
- package/src/shell/commercial-bridge.js +44 -0
- package/src/shell/face-bridge.js +44 -0
- package/src/shell/music-bridge.js +60 -0
- package/src/shell/orchestrator.js +233 -0
- package/src/shell/profile-discovery.js +303 -0
- package/src/shell/sounds-bridge.js +28 -0
- package/src/shell/transcript-bridge.js +61 -0
- package/src/shell/waveform-bridge.js +33 -0
- package/src/styles/base.css +2862 -0
- package/src/styles/face.css +417 -0
- package/src/styles/pi-overrides.css +89 -0
- package/src/styles/theme-dark.css +67 -0
- package/src/test-tts.html +175 -0
- package/src/ui/AppShell.js +544 -0
- package/src/ui/ProfileSwitcher.js +228 -0
- package/src/ui/SessionControl.js +240 -0
- package/src/ui/face/FacePicker.js +195 -0
- package/src/ui/face/FaceRenderer.js +309 -0
- package/src/ui/settings/PlaylistEditor.js +366 -0
- package/src/ui/settings/SettingsPanel.css +684 -0
- package/src/ui/settings/SettingsPanel.js +419 -0
- package/src/ui/settings/TTSVoicePreview.js +210 -0
- package/src/ui/themes/ThemeManager.js +213 -0
- package/src/ui/visualizers/BaseVisualizer.js +29 -0
- package/src/ui/visualizers/PartyFXVisualizer.css +291 -0
- package/src/ui/visualizers/PartyFXVisualizer.js +637 -0
- package/static/emulators/jsdos/js-dos.css +1 -0
- package/static/emulators/jsdos/js-dos.js +22 -0
- package/static/favicon.svg +55 -0
- package/static/icons/apple-touch-icon.png +0 -0
- package/static/icons/favicon-32.png +0 -0
- package/static/icons/icon-192.png +0 -0
- package/static/icons/icon-512.png +0 -0
- package/static/install.html +449 -0
- package/static/manifest.json +26 -0
- package/static/sw.js +21 -0
- package/tts_providers/__init__.py +136 -0
- package/tts_providers/base_provider.py +319 -0
- package/tts_providers/groq_provider.py +155 -0
- package/tts_providers/hume_provider.py +226 -0
- package/tts_providers/providers_config.json +119 -0
- package/tts_providers/qwen3_provider.py +371 -0
- package/tts_providers/resemble_provider.py +315 -0
- package/tts_providers/supertonic_provider.py +557 -0
- package/tts_providers/supertonic_tts.py +399 -0
|
@@ -0,0 +1,766 @@
|
|
|
1
|
+
/**
|
|
2
|
+
* DeepgramStreamingSTT — Real-time streaming speech recognition via Deepgram WebSocket.
|
|
3
|
+
*
|
|
4
|
+
* Instead of recording a blob, stopping, uploading, and waiting (like DeepgramSTT),
|
|
5
|
+
* this streams raw audio directly to Deepgram's WebSocket API as the user speaks.
|
|
6
|
+
* Transcripts come back in real-time — no batch upload delay.
|
|
7
|
+
*
|
|
8
|
+
* Falls back to WebSpeechSTT automatically if Deepgram is unavailable (bad key,
|
|
9
|
+
* network issue, outage). The fallback is transparent — all callbacks and PTT
|
|
10
|
+
* methods are proxied through, so callers don't need to know which engine is active.
|
|
11
|
+
*
|
|
12
|
+
* Drop-in replacement for DeepgramSTT / GroqSTT / WebSpeechSTT.
|
|
13
|
+
*
|
|
14
|
+
* Usage:
|
|
15
|
+
* import { DeepgramStreamingSTT } from './DeepgramStreamingSTT.js';
|
|
16
|
+
*
|
|
17
|
+
* const stt = new DeepgramStreamingSTT();
|
|
18
|
+
* stt.onResult = (text) => console.log('Heard:', text);
|
|
19
|
+
* await stt.start();
|
|
20
|
+
*/
|
|
21
|
+
|
|
22
|
+
import { WebSpeechSTT } from './WebSpeechSTT.js';
|
|
23
|
+
|
|
24
|
+
class DeepgramStreamingSTT {
|
|
25
|
+
/**
 * @param {object} [config] Optional overrides.
 * @param {string} [config.serverUrl] Base URL of our backend (used for the Deepgram token endpoint).
 * @param {number} [config.accumulationDelayMs] Extra wait after speech_final before flushing accumulated text.
 * @param {string} [config.model] Deepgram model name (default 'nova-2').
 * @param {string} [config.language] Language code passed to Deepgram (default 'en').
 */
constructor(config = {}) {
  // Backend origin for the token endpoint; trailing slash stripped so path joins are clean.
  this.serverUrl = (config.serverUrl || window.AGENT_CONFIG?.serverUrl || window.location.origin).replace(/\/$/, '');
  this.isListening = false;
  this.onResult = null; // Called with each flushed final utterance: (text) => void
  this.onError = null; // Called with an error code string, e.g. 'no-device', 'not-allowed'
  this.onListenFinal = null; // Listen panel hook — called with each final transcript
  this.onInterim = null; // Called with interim text as user speaks
  this.isProcessing = false; // When true (and not PTT-holding), incoming transcripts are ignored
  this.accumulatedText = ''; // Final transcript fragments joined until flushed

  // PTT support
  this._micMuted = false; // Hard mic gate (PTT released/muted) — audio frames are dropped
  this._pttHolding = false; // True while the PTT button is held; finals accumulate until release
  this._muteActive = false; // Soft mute while TTS plays; socket stays open via KeepAlive

  // Profile-overridable settings (same interface as DeepgramSTT)
  this.silenceDelayMs = 800; // Not used for VAD (Deepgram handles it), but kept for profile compat
  this.accumulationDelayMs = config.accumulationDelayMs || 0;
  this.vadThreshold = 25; // Not used (Deepgram server-side VAD), kept for profile compat
  this.minSpeechMs = 300; // Not used (Deepgram server-side VAD), kept for profile compat
  this.maxRecordingMs = 45000; // Not used (streaming is continuous), kept for profile compat

  // Deepgram WebSocket state
  this._ws = null; // Active WebSocket to wss://api.deepgram.com/v1/listen
  this._stream = null; // getUserMedia mic stream
  this._audioCtx = null; // AudioContext driving the PCM capture pipeline
  this._processorNode = null; // ScriptProcessorNode converting Float32 → Int16 PCM
  this._sourceNode = null; // MediaStreamSource feeding the processor
  this._accumulationTimer = null; // Pending flush timer after speech_final
  this._keepAliveInterval = null; // Interval sending KeepAlive frames every 8s
  this._reconnecting = false; // Guards against overlapping reconnect attempts
  this._intentionalClose = false; // Set before deliberate closes so onclose skips reconnect
  this._reconnectFailures = 0; // Consecutive failures; at 3 we fall back to WebSpeech

  // Deepgram model config
  this._model = config.model || 'nova-2';
  this._language = config.language || 'en';

  // Fallback: WebSpeechSTT when Deepgram is unavailable
  this._fallback = null; // lazily created WebSpeechSTT
  this._usingFallback = false; // true when actively using fallback

  // Hallucination filtering (same set as server-side).
  // Exact-match phrases (compared after trim/lowercase — presumably in _isHallucination;
  // NOTE(review): _isHallucination is defined below this view — confirm matching rules there).
  this._hallucinations = new Set([
    'thank you', 'thanks for watching', 'thanks for listening',
    'subscribe', 'please subscribe', 'like and subscribe',
    'the end', 'subtitles by', 'translated by', 'closed captioning',
    'voice command for ai assistant', 'voice command for ai',
    'thanks', 'thank you so much',
  ]);
  // Substring matches — catch the same phrases embedded in longer transcripts.
  this._hallucinationSubstrings = [
    'voice command for ai', 'thanks for watching', 'thanks for listening',
    'like and subscribe', 'please subscribe',
    'subtitles by', 'translated by', 'closed captioning',
  ];
}
|
|
81
|
+
|
|
82
|
+
isSupported() {
|
|
83
|
+
return !!(navigator.mediaDevices && navigator.mediaDevices.getUserMedia);
|
|
84
|
+
}
|
|
85
|
+
|
|
86
|
+
/**
 * Begin streaming recognition.
 *
 * Acquires a 16 kHz mono mic stream, connects the Deepgram WebSocket, and
 * starts the audio pipeline. Device/permission errors are reported via
 * onError and return false WITHOUT falling back (WebSpeech would fail on the
 * same mic). Any other failure (token fetch, socket, network) transparently
 * switches to the WebSpeech fallback and returns its start() result.
 *
 * @returns {Promise<boolean>} true when listening (via Deepgram or fallback)
 */
async start() {
  if (this.isListening) return true;
  if (this._micMuted) return false;

  // If already in fallback mode, delegate
  if (this._usingFallback && this._fallback) {
    return this._fallback.start();
  }

  try {
    // Get mic stream (reuse the existing one if it is still live)
    if (!this._stream || !this._stream.active) {
      this._stream = await navigator.mediaDevices.getUserMedia({
        audio: {
          echoCancellation: true,
          noiseSuppression: true,
          sampleRate: 16000,
        }
      });
    }

    // Connect to Deepgram WebSocket
    const connected = await this._connectWebSocket();
    if (!connected) {
      console.warn('Deepgram unavailable — falling back to WebSpeech');
      return this._activateFallback();
    }

    // Start streaming audio
    this._startAudioPipeline();

    this.isListening = true;
    this._reconnectFailures = 0;
    console.log('Deepgram Streaming STT started');
    return true;
  } catch (error) {
    console.error('Failed to start Deepgram Streaming STT:', error);
    // Mic errors should not trigger fallback — they'd fail on WebSpeech too
    if (error.name === 'NotFoundError' || error.name === 'DevicesNotFoundError') {
      if (this.onError) this.onError('no-device');
      return false;
    } else if (error.name === 'NotAllowedError') {
      if (this.onError) this.onError('not-allowed');
      return false;
    }
    // Network / Deepgram error — try fallback
    console.warn('Deepgram error — falling back to WebSpeech');
    return this._activateFallback();
  }
}
|
|
136
|
+
|
|
137
|
+
stop() {
|
|
138
|
+
if (this._usingFallback && this._fallback) {
|
|
139
|
+
this._fallback.stop();
|
|
140
|
+
this.isListening = false;
|
|
141
|
+
return;
|
|
142
|
+
}
|
|
143
|
+
|
|
144
|
+
this.isListening = false;
|
|
145
|
+
this._micMuted = false;
|
|
146
|
+
this._muteActive = false;
|
|
147
|
+
this._intentionalClose = true;
|
|
148
|
+
|
|
149
|
+
this._stopAudioPipeline();
|
|
150
|
+
this._closeWebSocket();
|
|
151
|
+
this._clearTimers();
|
|
152
|
+
|
|
153
|
+
// Release mic stream
|
|
154
|
+
if (this._stream) {
|
|
155
|
+
this._stream.getTracks().forEach(t => t.stop());
|
|
156
|
+
this._stream = null;
|
|
157
|
+
}
|
|
158
|
+
|
|
159
|
+
console.log('Deepgram Streaming STT stopped');
|
|
160
|
+
}
|
|
161
|
+
|
|
162
|
+
resetProcessing() {
|
|
163
|
+
if (this._usingFallback && this._fallback) {
|
|
164
|
+
this._fallback.resetProcessing();
|
|
165
|
+
return;
|
|
166
|
+
}
|
|
167
|
+
this.isProcessing = false;
|
|
168
|
+
this.accumulatedText = '';
|
|
169
|
+
}
|
|
170
|
+
|
|
171
|
+
/** Alias for mute() — VoiceConversation calls pause() during greeting. */
|
|
172
|
+
pause() {
|
|
173
|
+
this.mute();
|
|
174
|
+
}
|
|
175
|
+
|
|
176
|
+
/**
|
|
177
|
+
* Mute STT — called when TTS starts speaking.
|
|
178
|
+
* Sends KeepAlive to Deepgram to pause without disconnecting,
|
|
179
|
+
* and ignores any incoming transcripts.
|
|
180
|
+
*/
|
|
181
|
+
mute() {
|
|
182
|
+
if (this._usingFallback && this._fallback) {
|
|
183
|
+
this._fallback.mute();
|
|
184
|
+
return;
|
|
185
|
+
}
|
|
186
|
+
this._muteActive = true;
|
|
187
|
+
this.isProcessing = true;
|
|
188
|
+
this.accumulatedText = '';
|
|
189
|
+
if (this._accumulationTimer) {
|
|
190
|
+
clearTimeout(this._accumulationTimer);
|
|
191
|
+
this._accumulationTimer = null;
|
|
192
|
+
}
|
|
193
|
+
// Don't close the WebSocket — just stop sending audio.
|
|
194
|
+
// Deepgram's KeepAlive keeps the connection alive without audio.
|
|
195
|
+
this._sendKeepAlive();
|
|
196
|
+
}
|
|
197
|
+
|
|
198
|
+
/**
|
|
199
|
+
* Resume STT after TTS finishes.
|
|
200
|
+
* Audio pipeline is still running, just start paying attention again.
|
|
201
|
+
*/
|
|
202
|
+
resume() {
|
|
203
|
+
if (this._usingFallback && this._fallback) {
|
|
204
|
+
this._fallback.resume();
|
|
205
|
+
return;
|
|
206
|
+
}
|
|
207
|
+
this._muteActive = false;
|
|
208
|
+
this.isProcessing = false;
|
|
209
|
+
this.accumulatedText = '';
|
|
210
|
+
|
|
211
|
+
// If WebSocket died during mute, reconnect
|
|
212
|
+
if (this.isListening && !this._micMuted && (!this._ws || this._ws.readyState !== WebSocket.OPEN)) {
|
|
213
|
+
this._connectWebSocket().then(ok => {
|
|
214
|
+
if (ok) {
|
|
215
|
+
this._startAudioPipeline();
|
|
216
|
+
} else {
|
|
217
|
+
// Reconnect failed — fall back
|
|
218
|
+
console.warn('Deepgram reconnect failed on resume — falling back to WebSpeech');
|
|
219
|
+
this._activateFallback();
|
|
220
|
+
}
|
|
221
|
+
}).catch(err => {
|
|
222
|
+
console.error('Deepgram Streaming STT: reconnect on resume failed:', err);
|
|
223
|
+
this._activateFallback();
|
|
224
|
+
});
|
|
225
|
+
}
|
|
226
|
+
}
|
|
227
|
+
|
|
228
|
+
// --- PTT helpers (proxy to fallback when active) ---
|
|
229
|
+
|
|
230
|
+
pttActivate() {
|
|
231
|
+
if (this._usingFallback && this._fallback) { this._fallback.pttActivate(); return; }
|
|
232
|
+
this._pttHolding = true;
|
|
233
|
+
this._micMuted = false;
|
|
234
|
+
this._muteActive = false;
|
|
235
|
+
this.isProcessing = false;
|
|
236
|
+
this.accumulatedText = '';
|
|
237
|
+
if (this._accumulationTimer) { clearTimeout(this._accumulationTimer); this._accumulationTimer = null; }
|
|
238
|
+
|
|
239
|
+
// Ensure WebSocket and audio pipeline are active
|
|
240
|
+
if (!this._ws || this._ws.readyState !== WebSocket.OPEN) {
|
|
241
|
+
this._connectWebSocket().then(ok => {
|
|
242
|
+
if (ok) this._startAudioPipeline();
|
|
243
|
+
});
|
|
244
|
+
}
|
|
245
|
+
}
|
|
246
|
+
|
|
247
|
+
pttRelease() {
|
|
248
|
+
if (this._usingFallback && this._fallback) { this._fallback.pttRelease(); return; }
|
|
249
|
+
this._pttHolding = false;
|
|
250
|
+
this._micMuted = true;
|
|
251
|
+
|
|
252
|
+
// Tell Deepgram we're done speaking — triggers final transcript
|
|
253
|
+
if (this._ws && this._ws.readyState === WebSocket.OPEN) {
|
|
254
|
+
this._ws.send(JSON.stringify({ type: 'CloseStream' }));
|
|
255
|
+
}
|
|
256
|
+
|
|
257
|
+
// Wait briefly for final transcript, then send accumulated
|
|
258
|
+
setTimeout(() => {
|
|
259
|
+
const text = this.accumulatedText.trim();
|
|
260
|
+
if (text && this.onResult) {
|
|
261
|
+
console.log('PTT release — sending:', text);
|
|
262
|
+
this.isProcessing = true;
|
|
263
|
+
this.onResult(text);
|
|
264
|
+
}
|
|
265
|
+
this.accumulatedText = '';
|
|
266
|
+
}, 300);
|
|
267
|
+
}
|
|
268
|
+
|
|
269
|
+
pttMute() {
|
|
270
|
+
if (this._usingFallback && this._fallback) { this._fallback.pttMute(); return; }
|
|
271
|
+
this._pttHolding = false;
|
|
272
|
+
this._micMuted = true;
|
|
273
|
+
this.isProcessing = true;
|
|
274
|
+
this.accumulatedText = '';
|
|
275
|
+
if (this._accumulationTimer) { clearTimeout(this._accumulationTimer); this._accumulationTimer = null; }
|
|
276
|
+
}
|
|
277
|
+
|
|
278
|
+
pttUnmute() {
|
|
279
|
+
if (this._usingFallback && this._fallback) { this._fallback.pttUnmute(); return; }
|
|
280
|
+
this._micMuted = false;
|
|
281
|
+
this._pttHolding = false;
|
|
282
|
+
this.isProcessing = false;
|
|
283
|
+
this.accumulatedText = '';
|
|
284
|
+
|
|
285
|
+
if (this.isListening && (!this._ws || this._ws.readyState !== WebSocket.OPEN)) {
|
|
286
|
+
this._connectWebSocket().then(ok => {
|
|
287
|
+
if (ok) this._startAudioPipeline();
|
|
288
|
+
});
|
|
289
|
+
}
|
|
290
|
+
}
|
|
291
|
+
|
|
292
|
+
// ---- Fallback ----
|
|
293
|
+
|
|
294
|
+
/**
 * Activate the WebSpeech fallback. Tears down all Deepgram state (pipeline,
 * socket, timers, mic stream), lazily creates a WebSpeechSTT, wires every
 * callback through so callers see the same interface, fires a best-effort
 * telemetry POST, and starts the fallback.
 *
 * @returns {Promise<boolean>} the fallback's start() result; also mirrored
 *   into this.isListening so wrapper state tracks the active engine.
 */
_activateFallback() {
  // Clean up Deepgram state
  this._stopAudioPipeline();
  this._closeWebSocket();
  this._clearTimers();
  // Release mic stream — WebSpeech manages its own
  if (this._stream) {
    this._stream.getTracks().forEach(t => t.stop());
    this._stream = null;
  }

  this._usingFallback = true;

  if (!this._fallback) {
    this._fallback = new WebSpeechSTT();
  }

  // Wire callbacks through so callers see the same interface
  this._syncFallbackCallbacks();

  console.warn('[STT] Now using WebSpeech fallback');
  // Report so the UI can show a notice if desired. Fire-and-forget:
  // both the sync throw (no fetch in env) and the async rejection are swallowed.
  try {
    fetch('/api/stt-events', {
      method: 'POST',
      headers: { 'Content-Type': 'application/json' },
      body: JSON.stringify({
        error: 'deepgram-fallback',
        message: 'Deepgram unavailable — using WebSpeech fallback',
        provider: 'deepgram-streaming',
        source: 'stt',
      }),
    }).catch(() => {});
  } catch (_) {}

  return this._fallback.start().then(ok => {
    this.isListening = ok;
    return ok;
  });
}
|
|
338
|
+
|
|
339
|
+
/**
|
|
340
|
+
* Sync current callback references to the fallback instance.
|
|
341
|
+
* Called when fallback activates and whenever callbacks might have changed.
|
|
342
|
+
*/
|
|
343
|
+
_syncFallbackCallbacks() {
|
|
344
|
+
if (!this._fallback) return;
|
|
345
|
+
this._fallback.onResult = (...args) => { if (this.onResult) this.onResult(...args); };
|
|
346
|
+
this._fallback.onError = (...args) => { if (this.onError) this.onError(...args); };
|
|
347
|
+
this._fallback.onListenFinal = (...args) => { if (this.onListenFinal) this.onListenFinal(...args); };
|
|
348
|
+
// WebSpeechSTT has onInterim — proxy it
|
|
349
|
+
this._fallback.onInterim = (...args) => { if (this.onInterim) this.onInterim(...args); };
|
|
350
|
+
}
|
|
351
|
+
|
|
352
|
+
// ---- WebSocket Connection ----
|
|
353
|
+
|
|
354
|
+
async _connectWebSocket() {
|
|
355
|
+
// Get a temporary API key from our server (don't expose the real key to the browser)
|
|
356
|
+
let apiKey;
|
|
357
|
+
try {
|
|
358
|
+
const resp = await fetch(`${this.serverUrl}/api/stt/deepgram/token`);
|
|
359
|
+
if (!resp.ok) {
|
|
360
|
+
console.error('Deepgram token endpoint failed:', resp.status);
|
|
361
|
+
return false;
|
|
362
|
+
}
|
|
363
|
+
const data = await resp.json();
|
|
364
|
+
apiKey = data.token;
|
|
365
|
+
if (!apiKey) {
|
|
366
|
+
console.error('Deepgram token endpoint returned no token');
|
|
367
|
+
return false;
|
|
368
|
+
}
|
|
369
|
+
} catch (err) {
|
|
370
|
+
console.error('Failed to get Deepgram token:', err);
|
|
371
|
+
return false;
|
|
372
|
+
}
|
|
373
|
+
|
|
374
|
+
return new Promise((resolve) => {
|
|
375
|
+
const params = new URLSearchParams({
|
|
376
|
+
model: this._model,
|
|
377
|
+
language: this._language,
|
|
378
|
+
smart_format: 'true',
|
|
379
|
+
punctuate: 'true',
|
|
380
|
+
interim_results: 'true',
|
|
381
|
+
utterance_end_ms: '1000',
|
|
382
|
+
vad_events: 'true',
|
|
383
|
+
endpointing: '300',
|
|
384
|
+
encoding: 'linear16',
|
|
385
|
+
sample_rate: '16000',
|
|
386
|
+
channels: '1',
|
|
387
|
+
});
|
|
388
|
+
|
|
389
|
+
const url = `wss://api.deepgram.com/v1/listen?${params}`;
|
|
390
|
+
this._intentionalClose = false;
|
|
391
|
+
|
|
392
|
+
try {
|
|
393
|
+
this._ws = new WebSocket(url, ['token', apiKey]);
|
|
394
|
+
} catch (err) {
|
|
395
|
+
console.error('Deepgram WebSocket creation failed:', err);
|
|
396
|
+
resolve(false);
|
|
397
|
+
return;
|
|
398
|
+
}
|
|
399
|
+
|
|
400
|
+
const timeout = setTimeout(() => {
|
|
401
|
+
if (this._ws && this._ws.readyState === WebSocket.CONNECTING) {
|
|
402
|
+
console.error('Deepgram WebSocket connection timeout');
|
|
403
|
+
this._ws.close();
|
|
404
|
+
resolve(false);
|
|
405
|
+
}
|
|
406
|
+
}, 5000);
|
|
407
|
+
|
|
408
|
+
this._ws.onopen = () => {
|
|
409
|
+
clearTimeout(timeout);
|
|
410
|
+
console.log('Deepgram WebSocket connected');
|
|
411
|
+
this._reconnectFailures = 0;
|
|
412
|
+
this._startKeepAlive();
|
|
413
|
+
resolve(true);
|
|
414
|
+
};
|
|
415
|
+
|
|
416
|
+
this._ws.onmessage = (event) => {
|
|
417
|
+
this._handleMessage(event);
|
|
418
|
+
};
|
|
419
|
+
|
|
420
|
+
this._ws.onerror = (event) => {
|
|
421
|
+
clearTimeout(timeout);
|
|
422
|
+
console.error('Deepgram WebSocket error:', event);
|
|
423
|
+
};
|
|
424
|
+
|
|
425
|
+
this._ws.onclose = (event) => {
|
|
426
|
+
clearTimeout(timeout);
|
|
427
|
+
this._stopKeepAlive();
|
|
428
|
+
console.log(`Deepgram WebSocket closed (code: ${event.code})`);
|
|
429
|
+
|
|
430
|
+
// Auto-reconnect if not intentional and still supposed to be listening
|
|
431
|
+
if (!this._intentionalClose && this.isListening && !this._micMuted && !this._reconnecting) {
|
|
432
|
+
this._reconnectFailures++;
|
|
433
|
+
|
|
434
|
+
// After 3 failed reconnects, give up and fall back to WebSpeech
|
|
435
|
+
if (this._reconnectFailures >= 3) {
|
|
436
|
+
console.warn(`Deepgram: ${this._reconnectFailures} reconnect failures — falling back to WebSpeech`);
|
|
437
|
+
this._activateFallback();
|
|
438
|
+
return;
|
|
439
|
+
}
|
|
440
|
+
|
|
441
|
+
this._reconnecting = true;
|
|
442
|
+
const delay = Math.min(1000 * Math.pow(2, this._reconnectFailures - 1), 5000);
|
|
443
|
+
console.log(`Deepgram: reconnecting in ${delay}ms (attempt ${this._reconnectFailures}/3)...`);
|
|
444
|
+
setTimeout(() => {
|
|
445
|
+
this._reconnecting = false;
|
|
446
|
+
if (this.isListening && !this._intentionalClose) {
|
|
447
|
+
this._connectWebSocket().then(ok => {
|
|
448
|
+
if (ok) {
|
|
449
|
+
this._startAudioPipeline();
|
|
450
|
+
} else {
|
|
451
|
+
// Connection failed — count as another failure and maybe fallback
|
|
452
|
+
this._reconnectFailures++;
|
|
453
|
+
if (this._reconnectFailures >= 3) {
|
|
454
|
+
console.warn('Deepgram: reconnect failed — falling back to WebSpeech');
|
|
455
|
+
this._activateFallback();
|
|
456
|
+
}
|
|
457
|
+
}
|
|
458
|
+
});
|
|
459
|
+
}
|
|
460
|
+
}, delay);
|
|
461
|
+
}
|
|
462
|
+
|
|
463
|
+
if (this._ws === null) return; // already cleaned up
|
|
464
|
+
resolve(false);
|
|
465
|
+
};
|
|
466
|
+
});
|
|
467
|
+
}
|
|
468
|
+
|
|
469
|
+
_closeWebSocket() {
|
|
470
|
+
this._stopKeepAlive();
|
|
471
|
+
if (this._ws) {
|
|
472
|
+
this._intentionalClose = true;
|
|
473
|
+
// Send CloseStream to get final transcript before closing
|
|
474
|
+
if (this._ws.readyState === WebSocket.OPEN) {
|
|
475
|
+
try {
|
|
476
|
+
this._ws.send(JSON.stringify({ type: 'CloseStream' }));
|
|
477
|
+
} catch (_) {}
|
|
478
|
+
}
|
|
479
|
+
this._ws.close();
|
|
480
|
+
this._ws = null;
|
|
481
|
+
}
|
|
482
|
+
}
|
|
483
|
+
|
|
484
|
+
_sendKeepAlive() {
|
|
485
|
+
if (this._ws && this._ws.readyState === WebSocket.OPEN) {
|
|
486
|
+
try {
|
|
487
|
+
this._ws.send(JSON.stringify({ type: 'KeepAlive' }));
|
|
488
|
+
} catch (_) {}
|
|
489
|
+
}
|
|
490
|
+
}
|
|
491
|
+
|
|
492
|
+
_startKeepAlive() {
|
|
493
|
+
this._stopKeepAlive();
|
|
494
|
+
// Send KeepAlive every 8 seconds to prevent timeout
|
|
495
|
+
this._keepAliveInterval = setInterval(() => {
|
|
496
|
+
this._sendKeepAlive();
|
|
497
|
+
}, 8000);
|
|
498
|
+
}
|
|
499
|
+
|
|
500
|
+
_stopKeepAlive() {
|
|
501
|
+
if (this._keepAliveInterval) {
|
|
502
|
+
clearInterval(this._keepAliveInterval);
|
|
503
|
+
this._keepAliveInterval = null;
|
|
504
|
+
}
|
|
505
|
+
}
|
|
506
|
+
|
|
507
|
+
// ---- Audio Pipeline ----
|
|
508
|
+
|
|
509
|
+
_startAudioPipeline() {
|
|
510
|
+
// Clean up existing pipeline
|
|
511
|
+
this._stopAudioPipeline();
|
|
512
|
+
|
|
513
|
+
if (!this._stream || !this._stream.active) return;
|
|
514
|
+
|
|
515
|
+
this._audioCtx = new AudioContext({ sampleRate: 16000 });
|
|
516
|
+
this._sourceNode = this._audioCtx.createMediaStreamSource(this._stream);
|
|
517
|
+
|
|
518
|
+
// ScriptProcessorNode for raw PCM access (AudioWorklet would be better
|
|
519
|
+
// but requires a separate file and HTTPS — this works everywhere)
|
|
520
|
+
const bufferSize = 4096;
|
|
521
|
+
this._processorNode = this._audioCtx.createScriptProcessor(bufferSize, 1, 1);
|
|
522
|
+
|
|
523
|
+
this._processorNode.onaudioprocess = (event) => {
|
|
524
|
+
if (this._muteActive || this._micMuted) return;
|
|
525
|
+
if (!this._ws || this._ws.readyState !== WebSocket.OPEN) return;
|
|
526
|
+
|
|
527
|
+
const inputData = event.inputBuffer.getChannelData(0);
|
|
528
|
+
|
|
529
|
+
// Convert Float32 [-1, 1] to Int16 PCM
|
|
530
|
+
const pcm16 = new Int16Array(inputData.length);
|
|
531
|
+
for (let i = 0; i < inputData.length; i++) {
|
|
532
|
+
const s = Math.max(-1, Math.min(1, inputData[i]));
|
|
533
|
+
pcm16[i] = s < 0 ? s * 0x8000 : s * 0x7FFF;
|
|
534
|
+
}
|
|
535
|
+
|
|
536
|
+
// Send raw PCM bytes to Deepgram
|
|
537
|
+
this._ws.send(pcm16.buffer);
|
|
538
|
+
};
|
|
539
|
+
|
|
540
|
+
this._sourceNode.connect(this._processorNode);
|
|
541
|
+
this._processorNode.connect(this._audioCtx.destination);
|
|
542
|
+
}
|
|
543
|
+
|
|
544
|
+
_stopAudioPipeline() {
|
|
545
|
+
if (this._processorNode) {
|
|
546
|
+
this._processorNode.disconnect();
|
|
547
|
+
this._processorNode = null;
|
|
548
|
+
}
|
|
549
|
+
if (this._sourceNode) {
|
|
550
|
+
this._sourceNode.disconnect();
|
|
551
|
+
this._sourceNode = null;
|
|
552
|
+
}
|
|
553
|
+
if (this._audioCtx) {
|
|
554
|
+
this._audioCtx.close().catch(() => {});
|
|
555
|
+
this._audioCtx = null;
|
|
556
|
+
}
|
|
557
|
+
}
|
|
558
|
+
|
|
559
|
+
// ---- Message Handling ----
|
|
560
|
+
|
|
561
|
+
/**
 * Dispatch one incoming Deepgram WebSocket message.
 *
 * Message types handled:
 *  - SpeechStarted: ignored (hook point for UI feedback).
 *  - UtteranceEnd: server-side silence detection — flush accumulated text.
 *  - Results: interim results preview via onInterim; finals are
 *    hallucination-filtered, reported to onListenFinal, and accumulated.
 *    In PTT mode finals wait for pttRelease; otherwise speech_final
 *    schedules a flush after accumulationDelayMs.
 *
 * Non-JSON payloads and empty transcripts are silently dropped, as is
 * everything received while muted or mid-processing (unless PTT is held).
 *
 * @param {MessageEvent} event raw WebSocket message from Deepgram
 */
_handleMessage(event) {
  let data;
  try {
    data = JSON.parse(event.data);
  } catch (_) {
    // Non-JSON frame — ignore.
    return;
  }

  // Speech started event (Deepgram VAD)
  if (data.type === 'SpeechStarted') {
    // Could emit event for UI feedback
    return;
  }

  // UtteranceEnd — Deepgram detected end of utterance (silence after speech)
  if (data.type === 'UtteranceEnd') {
    this._flushAccumulated();
    return;
  }

  // Transcript results
  if (data.type === 'Results') {
    const channel = data.channel;
    if (!channel || !channel.alternatives || !channel.alternatives.length) return;

    // Best hypothesis only — alternatives[0].
    const transcript = channel.alternatives[0].transcript || '';
    const isFinal = data.is_final;            // segment will not change further
    const speechFinal = data.speech_final;    // Deepgram endpointing: utterance complete

    if (!transcript.trim()) return;

    // Ignore during mute (TTS playing), and while a prior result is being
    // processed — except in PTT mode, where the hold overrides the latch.
    if (this._muteActive || (this.isProcessing && !this._pttHolding)) return;

    if (isFinal) {
      // Filter hallucinations
      if (this._isHallucination(transcript)) {
        console.log('Deepgram Streaming: filtered hallucination:', transcript);
        return;
      }

      console.log('Deepgram Streaming final:', transcript);
      if (this.onListenFinal) this.onListenFinal(transcript.trim());

      // PTT mode: accumulate and wait for pttRelease to send
      if (this._pttHolding) {
        this.accumulatedText = this.accumulatedText
          ? this.accumulatedText + ' ' + transcript.trim()
          : transcript.trim();
        return;
      }

      // Accumulate finals (space-joined) until a flush is triggered.
      this.accumulatedText = this.accumulatedText
        ? this.accumulatedText + ' ' + transcript.trim()
        : transcript.trim();

      // If speech_final (Deepgram's endpointing), flush after short accumulation window
      if (speechFinal) {
        // Restart the window so back-to-back speech_finals coalesce.
        if (this._accumulationTimer) {
          clearTimeout(this._accumulationTimer);
        }
        this._accumulationTimer = setTimeout(() => {
          this._accumulationTimer = null;
          this._flushAccumulated();
        }, this.accumulationDelayMs);
      }
    } else {
      // Interim result — show live feedback (accumulated text + current partial).
      if (this.onInterim) {
        const preview = this.accumulatedText
          ? this.accumulatedText + ' ' + transcript.trim()
          : transcript.trim();
        this.onInterim(preview);
      }
    }
  }
}
|
|
639
|
+
|
|
640
|
+
_flushAccumulated() {
|
|
641
|
+
if (this._accumulationTimer) {
|
|
642
|
+
clearTimeout(this._accumulationTimer);
|
|
643
|
+
this._accumulationTimer = null;
|
|
644
|
+
}
|
|
645
|
+
|
|
646
|
+
const text = this.accumulatedText.trim();
|
|
647
|
+
if (!text) return;
|
|
648
|
+
|
|
649
|
+
// Filter garbage
|
|
650
|
+
const meaningful = text.replace(/[^a-zA-Z0-9]/g, '');
|
|
651
|
+
if (meaningful.length < 2) {
|
|
652
|
+
console.log('Deepgram Streaming: filtered too short:', text);
|
|
653
|
+
this.accumulatedText = '';
|
|
654
|
+
return;
|
|
655
|
+
}
|
|
656
|
+
|
|
657
|
+
if (this._isHallucination(text)) {
|
|
658
|
+
console.log('Deepgram Streaming: filtered hallucination:', text);
|
|
659
|
+
this.accumulatedText = '';
|
|
660
|
+
return;
|
|
661
|
+
}
|
|
662
|
+
|
|
663
|
+
console.log('Deepgram Streaming result:', text);
|
|
664
|
+
this.isProcessing = true;
|
|
665
|
+
if (this.onResult) this.onResult(text);
|
|
666
|
+
this.accumulatedText = '';
|
|
667
|
+
}
|
|
668
|
+
|
|
669
|
+
_isHallucination(text) {
|
|
670
|
+
const lower = text.toLowerCase().replace(/[.!?,;:]+$/, '');
|
|
671
|
+
if (this._hallucinations.has(lower)) return true;
|
|
672
|
+
|
|
673
|
+
const meaningful = text.replace(/[^a-zA-Z0-9]/g, '');
|
|
674
|
+
if (meaningful.length < 3) return true;
|
|
675
|
+
|
|
676
|
+
for (const sub of this._hallucinationSubstrings) {
|
|
677
|
+
if (lower.includes(sub)) return true;
|
|
678
|
+
}
|
|
679
|
+
|
|
680
|
+
// Repetitive pattern check
|
|
681
|
+
const words = text.match(/[a-zA-Z]+/g);
|
|
682
|
+
if (words && words.length >= 4) {
|
|
683
|
+
const counts = {};
|
|
684
|
+
for (const w of words) {
|
|
685
|
+
const wl = w.toLowerCase();
|
|
686
|
+
counts[wl] = (counts[wl] || 0) + 1;
|
|
687
|
+
}
|
|
688
|
+
const max = Math.max(...Object.values(counts));
|
|
689
|
+
if (max / words.length >= 0.5) return true;
|
|
690
|
+
}
|
|
691
|
+
|
|
692
|
+
return false;
|
|
693
|
+
}
|
|
694
|
+
|
|
695
|
+
_clearTimers() {
|
|
696
|
+
if (this._accumulationTimer) {
|
|
697
|
+
clearTimeout(this._accumulationTimer);
|
|
698
|
+
this._accumulationTimer = null;
|
|
699
|
+
}
|
|
700
|
+
this._stopKeepAlive();
|
|
701
|
+
}
|
|
702
|
+
}
|
|
703
|
+
|
|
704
|
+
|
|
705
|
+
// ===== DEEPGRAM STREAMING WAKE WORD DETECTOR =====
|
|
706
|
+
class DeepgramStreamingWakeWordDetector {
  constructor() {
    this.isListening = false;       // true while the detector is active
    this.onWakeWordDetected = null; // callback fired on a wake-word match
    this.wakeWords = ['wake up'];   // phrases that trigger the callback
    this._stt = null;               // underlying DeepgramStreamingSTT session
  }

  /** Whether this environment exposes microphone capture. */
  isSupported() {
    const media = navigator.mediaDevices;
    return !!(media && media.getUserMedia);
  }

  /**
   * Begin listening for wake words.
   * @returns {Promise<boolean>} true when the streaming STT session started.
   */
  async start() {
    if (this.isListening) return true;

    const stt = new DeepgramStreamingSTT();
    this._stt = stt;

    // Match each recognized utterance against the wake-word list.
    stt.onResult = (transcript) => {
      const heard = transcript.toLowerCase();
      console.log(`Wake word detector heard: "${transcript}"`);
      if (this.wakeWords.some((phrase) => heard.includes(phrase))) {
        console.log('Wake word detected!');
        if (this.onWakeWordDetected) this.onWakeWordDetected();
      }
    };

    stt.onError = (error) => {
      console.warn('Wake word detector error:', error);
    };

    // Mark listening before the async start so a concurrent start() call
    // short-circuits; roll back if the session fails to open.
    this.isListening = true;
    const started = await stt.start();
    if (!started) {
      this.isListening = false;
      return false;
    }

    console.log('Deepgram Streaming wake word detector started');
    return true;
  }

  /** Stop listening and tear down the STT session. */
  stop() {
    this.isListening = false;
    if (this._stt) {
      this._stt.stop();
      this._stt = null;
    }
    console.log('Deepgram Streaming wake word detector stopped');
  }

  /**
   * Flip the listening state.
   * @returns {Promise<boolean>} the new listening state.
   */
  async toggle() {
    if (!this.isListening) {
      return this.start();
    }
    this.stop();
    return false;
  }
}
|
|
765
|
+
|
|
766
|
+
export { DeepgramStreamingSTT, DeepgramStreamingWakeWordDetector };
|