openvoiceui 1.0.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (185) hide show
  1. package/.env.example +104 -0
  2. package/Dockerfile +30 -0
  3. package/LICENSE +21 -0
  4. package/README.md +638 -0
  5. package/SETUP.md +360 -0
  6. package/app.py +232 -0
  7. package/auto-approve-devices.js +111 -0
  8. package/cli/index.js +372 -0
  9. package/config/__init__.py +4 -0
  10. package/config/default.yaml +43 -0
  11. package/config/flags.yaml +67 -0
  12. package/config/loader.py +203 -0
  13. package/config/providers.yaml +71 -0
  14. package/config/speech_normalization.yaml +182 -0
  15. package/config/theme.json +4 -0
  16. package/data/greetings.json +25 -0
  17. package/default-pages/ai-image-creator.html +915 -0
  18. package/default-pages/bulk-image-uploader.html +492 -0
  19. package/default-pages/desktop.html +2865 -0
  20. package/default-pages/file-explorer.html +854 -0
  21. package/default-pages/interactive-map.html +655 -0
  22. package/default-pages/style-guide.html +1005 -0
  23. package/default-pages/website-setup.html +1623 -0
  24. package/deploy/openclaw/Dockerfile +46 -0
  25. package/deploy/openvoiceui.service +30 -0
  26. package/deploy/setup-nginx.sh +50 -0
  27. package/deploy/setup-sudo.sh +306 -0
  28. package/deploy/skill-runner/Dockerfile +19 -0
  29. package/deploy/skill-runner/requirements.txt +14 -0
  30. package/deploy/skill-runner/server.py +269 -0
  31. package/deploy/supertonic/Dockerfile +22 -0
  32. package/deploy/supertonic/server.py +79 -0
  33. package/docker-compose.pinokio.yml +11 -0
  34. package/docker-compose.yml +59 -0
  35. package/greetings.json +25 -0
  36. package/index.html +65 -0
  37. package/inject-device-identity.js +142 -0
  38. package/package.json +82 -0
  39. package/profiles/default.json +114 -0
  40. package/profiles/manager.py +354 -0
  41. package/profiles/schema.json +337 -0
  42. package/prompts/voice-system-prompt.md +149 -0
  43. package/providers/__init__.py +39 -0
  44. package/providers/base.py +63 -0
  45. package/providers/llm/__init__.py +12 -0
  46. package/providers/llm/base.py +71 -0
  47. package/providers/llm/clawdbot_provider.py +112 -0
  48. package/providers/llm/zai_provider.py +115 -0
  49. package/providers/registry.py +320 -0
  50. package/providers/stt/__init__.py +12 -0
  51. package/providers/stt/base.py +58 -0
  52. package/providers/stt/webspeech_provider.py +49 -0
  53. package/providers/stt/whisper_provider.py +100 -0
  54. package/providers/tts/__init__.py +20 -0
  55. package/providers/tts/base.py +91 -0
  56. package/providers/tts/groq_provider.py +74 -0
  57. package/providers/tts/supertonic_provider.py +72 -0
  58. package/requirements.txt +38 -0
  59. package/routes/__init__.py +10 -0
  60. package/routes/admin.py +515 -0
  61. package/routes/canvas.py +1315 -0
  62. package/routes/chat.py +51 -0
  63. package/routes/conversation.py +2158 -0
  64. package/routes/elevenlabs_hybrid.py +306 -0
  65. package/routes/greetings.py +98 -0
  66. package/routes/icons.py +279 -0
  67. package/routes/image_gen.py +364 -0
  68. package/routes/instructions.py +190 -0
  69. package/routes/music.py +838 -0
  70. package/routes/onboarding.py +43 -0
  71. package/routes/pi.py +62 -0
  72. package/routes/profiles.py +215 -0
  73. package/routes/report_issue.py +68 -0
  74. package/routes/static_files.py +533 -0
  75. package/routes/suno.py +664 -0
  76. package/routes/theme.py +81 -0
  77. package/routes/transcripts.py +199 -0
  78. package/routes/vision.py +348 -0
  79. package/routes/workspace.py +288 -0
  80. package/server.py +1510 -0
  81. package/services/__init__.py +1 -0
  82. package/services/auth.py +143 -0
  83. package/services/canvas_versioning.py +239 -0
  84. package/services/db_pool.py +107 -0
  85. package/services/gateway.py +16 -0
  86. package/services/gateway_manager.py +333 -0
  87. package/services/gateways/__init__.py +12 -0
  88. package/services/gateways/base.py +110 -0
  89. package/services/gateways/compat.py +264 -0
  90. package/services/gateways/openclaw.py +1134 -0
  91. package/services/health.py +100 -0
  92. package/services/memory_client.py +455 -0
  93. package/services/paths.py +26 -0
  94. package/services/speech_normalizer.py +285 -0
  95. package/services/tts.py +270 -0
  96. package/setup-config.js +262 -0
  97. package/sounds/air_horn.mp3 +0 -0
  98. package/sounds/bruh.mp3 +0 -0
  99. package/sounds/crowd_cheer.mp3 +0 -0
  100. package/sounds/gunshot.mp3 +0 -0
  101. package/sounds/impact.mp3 +0 -0
  102. package/sounds/lets_go.mp3 +0 -0
  103. package/sounds/record_stop.mp3 +0 -0
  104. package/sounds/rewind.mp3 +0 -0
  105. package/sounds/sad_trombone.mp3 +0 -0
  106. package/sounds/scratch_long.mp3 +0 -0
  107. package/sounds/yeah.mp3 +0 -0
  108. package/src/adapters/ClawdBotAdapter.js +264 -0
  109. package/src/adapters/_template.js +133 -0
  110. package/src/adapters/elevenlabs-classic.js +841 -0
  111. package/src/adapters/elevenlabs-hybrid.js +812 -0
  112. package/src/adapters/hume-evi.js +676 -0
  113. package/src/admin.html +1339 -0
  114. package/src/app.js +8802 -0
  115. package/src/core/Config.js +173 -0
  116. package/src/core/EmotionEngine.js +307 -0
  117. package/src/core/EventBridge.js +180 -0
  118. package/src/core/EventBus.js +117 -0
  119. package/src/core/VoiceSession.js +607 -0
  120. package/src/face/BaseFace.js +259 -0
  121. package/src/face/EyeFace.js +208 -0
  122. package/src/face/HaloSmokeFace.js +509 -0
  123. package/src/face/manifest.json +27 -0
  124. package/src/face/previews/eyes.svg +16 -0
  125. package/src/face/previews/orb.svg +29 -0
  126. package/src/features/MusicPlayer.js +620 -0
  127. package/src/features/Soundboard.js +128 -0
  128. package/src/providers/DeepgramSTT.js +472 -0
  129. package/src/providers/DeepgramStreamingSTT.js +766 -0
  130. package/src/providers/GroqSTT.js +559 -0
  131. package/src/providers/TTSPlayer.js +323 -0
  132. package/src/providers/WebSpeechSTT.js +479 -0
  133. package/src/providers/tts/BaseTTSProvider.js +81 -0
  134. package/src/providers/tts/HumeProvider.js +77 -0
  135. package/src/providers/tts/SupertonicProvider.js +174 -0
  136. package/src/providers/tts/index.js +140 -0
  137. package/src/shell/adapter-registry.js +154 -0
  138. package/src/shell/caller-bridge.js +35 -0
  139. package/src/shell/camera-bridge.js +28 -0
  140. package/src/shell/canvas-bridge.js +32 -0
  141. package/src/shell/commercial-bridge.js +44 -0
  142. package/src/shell/face-bridge.js +44 -0
  143. package/src/shell/music-bridge.js +60 -0
  144. package/src/shell/orchestrator.js +233 -0
  145. package/src/shell/profile-discovery.js +303 -0
  146. package/src/shell/sounds-bridge.js +28 -0
  147. package/src/shell/transcript-bridge.js +61 -0
  148. package/src/shell/waveform-bridge.js +33 -0
  149. package/src/styles/base.css +2862 -0
  150. package/src/styles/face.css +417 -0
  151. package/src/styles/pi-overrides.css +89 -0
  152. package/src/styles/theme-dark.css +67 -0
  153. package/src/test-tts.html +175 -0
  154. package/src/ui/AppShell.js +544 -0
  155. package/src/ui/ProfileSwitcher.js +228 -0
  156. package/src/ui/SessionControl.js +240 -0
  157. package/src/ui/face/FacePicker.js +195 -0
  158. package/src/ui/face/FaceRenderer.js +309 -0
  159. package/src/ui/settings/PlaylistEditor.js +366 -0
  160. package/src/ui/settings/SettingsPanel.css +684 -0
  161. package/src/ui/settings/SettingsPanel.js +419 -0
  162. package/src/ui/settings/TTSVoicePreview.js +210 -0
  163. package/src/ui/themes/ThemeManager.js +213 -0
  164. package/src/ui/visualizers/BaseVisualizer.js +29 -0
  165. package/src/ui/visualizers/PartyFXVisualizer.css +291 -0
  166. package/src/ui/visualizers/PartyFXVisualizer.js +637 -0
  167. package/static/emulators/jsdos/js-dos.css +1 -0
  168. package/static/emulators/jsdos/js-dos.js +22 -0
  169. package/static/favicon.svg +55 -0
  170. package/static/icons/apple-touch-icon.png +0 -0
  171. package/static/icons/favicon-32.png +0 -0
  172. package/static/icons/icon-192.png +0 -0
  173. package/static/icons/icon-512.png +0 -0
  174. package/static/install.html +449 -0
  175. package/static/manifest.json +26 -0
  176. package/static/sw.js +21 -0
  177. package/tts_providers/__init__.py +136 -0
  178. package/tts_providers/base_provider.py +319 -0
  179. package/tts_providers/groq_provider.py +155 -0
  180. package/tts_providers/hume_provider.py +226 -0
  181. package/tts_providers/providers_config.json +119 -0
  182. package/tts_providers/qwen3_provider.py +371 -0
  183. package/tts_providers/resemble_provider.py +315 -0
  184. package/tts_providers/supertonic_provider.py +557 -0
  185. package/tts_providers/supertonic_tts.py +399 -0
@@ -0,0 +1,766 @@
1
+ /**
2
+ * DeepgramStreamingSTT — Real-time streaming speech recognition via Deepgram WebSocket.
3
+ *
4
+ * Instead of recording a blob, stopping, uploading, and waiting (like DeepgramSTT),
5
+ * this streams raw audio directly to Deepgram's WebSocket API as the user speaks.
6
+ * Transcripts come back in real-time — no batch upload delay.
7
+ *
8
+ * Falls back to WebSpeechSTT automatically if Deepgram is unavailable (bad key,
9
+ * network issue, outage). The fallback is transparent — all callbacks and PTT
10
+ * methods are proxied through, so callers don't need to know which engine is active.
11
+ *
12
+ * Drop-in replacement for DeepgramSTT / GroqSTT / WebSpeechSTT.
13
+ *
14
+ * Usage:
15
+ * import { DeepgramStreamingSTT } from './DeepgramStreamingSTT.js';
16
+ *
17
+ * const stt = new DeepgramStreamingSTT();
18
+ * stt.onResult = (text) => console.log('Heard:', text);
19
+ * await stt.start();
20
+ */
21
+
22
+ import { WebSpeechSTT } from './WebSpeechSTT.js';
23
+
24
class DeepgramStreamingSTT {
  /**
   * @param {Object} [config]
   * @param {string} [config.serverUrl] - Base URL of the app server (used for the token endpoint).
   * @param {number} [config.accumulationDelayMs] - Extra wait after speech_final before flushing accumulated finals.
   * @param {string} [config.model] - Deepgram model name (default 'nova-2').
   * @param {string} [config.language] - BCP-47-ish language code (default 'en').
   */
  constructor(config = {}) {
    // Server base URL, trailing slash stripped. Falls back to a global config, then the page origin.
    this.serverUrl = (config.serverUrl || window.AGENT_CONFIG?.serverUrl || window.location.origin).replace(/\/$/, '');
    this.isListening = false;
    this.onResult = null;             // called with a complete utterance when it is ready to send
    this.onError = null;              // called with an error code string ('no-device', 'not-allowed', ...)
    this.onListenFinal = null; // Listen panel hook — called with each final transcript
    this.onInterim = null; // Called with interim text as user speaks
    this.isProcessing = false;        // true while downstream handles a result (transcripts ignored)
    this.accumulatedText = '';        // finals accumulated since the last flush

    // PTT support
    this._micMuted = false;           // hard mute (PTT released) — audio frames are dropped
    this._pttHolding = false;         // true while the PTT button is held
    this._muteActive = false;         // soft mute while TTS plays — audio frames are dropped

    // Profile-overridable settings (same interface as DeepgramSTT)
    this.silenceDelayMs = 800; // Not used for VAD (Deepgram handles it), but kept for profile compat
    this.accumulationDelayMs = config.accumulationDelayMs || 0;
    this.vadThreshold = 25; // Not used (Deepgram server-side VAD), kept for profile compat
    this.minSpeechMs = 300; // Not used (Deepgram server-side VAD), kept for profile compat
    this.maxRecordingMs = 45000; // Not used (streaming is continuous), kept for profile compat

    // Deepgram WebSocket state
    this._ws = null;                  // active WebSocket to Deepgram, or null
    this._stream = null;              // MediaStream from getUserMedia
    this._audioCtx = null;            // AudioContext for the capture pipeline
    this._processorNode = null;       // ScriptProcessorNode producing PCM frames
    this._sourceNode = null;          // MediaStreamAudioSourceNode
    this._accumulationTimer = null;   // pending flush timer after speech_final
    this._keepAliveInterval = null;   // periodic KeepAlive sender
    this._reconnecting = false;       // guards against overlapping reconnect attempts
    this._intentionalClose = false;   // suppresses auto-reconnect on deliberate close
    this._reconnectFailures = 0;      // consecutive failures; >= 3 triggers fallback

    // Deepgram model config
    this._model = config.model || 'nova-2';
    this._language = config.language || 'en';

    // Fallback: WebSpeechSTT when Deepgram is unavailable
    this._fallback = null; // lazily created WebSpeechSTT
    this._usingFallback = false; // true when actively using fallback

    // Hallucination filtering (same set as server-side)
    // Exact-match phrases (after lowercasing and stripping trailing punctuation).
    this._hallucinations = new Set([
      'thank you', 'thanks for watching', 'thanks for listening',
      'subscribe', 'please subscribe', 'like and subscribe',
      'the end', 'subtitles by', 'translated by', 'closed captioning',
      'voice command for ai assistant', 'voice command for ai',
      'thanks', 'thank you so much',
    ]);
    // Substring matches — catch the same phrases embedded in longer transcripts.
    this._hallucinationSubstrings = [
      'voice command for ai', 'thanks for watching', 'thanks for listening',
      'like and subscribe', 'please subscribe',
      'subtitles by', 'translated by', 'closed captioning',
    ];
  }

  /** True when the browser exposes microphone capture (getUserMedia). */
  isSupported() {
    return !!(navigator.mediaDevices && navigator.mediaDevices.getUserMedia);
  }

  /**
   * Acquire the mic, connect to Deepgram, and begin streaming.
   * Falls back to WebSpeech on Deepgram/network failure; mic permission or
   * missing-device errors are reported via onError and do NOT trigger fallback.
   * @returns {Promise<boolean>} true if some engine (Deepgram or fallback) started.
   */
  async start() {
    if (this.isListening) return true;
    if (this._micMuted) return false;

    // If already in fallback mode, delegate
    // NOTE(review): this delegated path does not update this.isListening —
    // callers reading it right after start() may see a stale false; confirm intent.
    if (this._usingFallback && this._fallback) {
      return this._fallback.start();
    }

    try {
      // Get mic stream (reuse an existing live stream if we have one)
      if (!this._stream || !this._stream.active) {
        this._stream = await navigator.mediaDevices.getUserMedia({
          audio: {
            echoCancellation: true,
            noiseSuppression: true,
            sampleRate: 16000,
          }
        });
      }

      // Connect to Deepgram WebSocket
      const connected = await this._connectWebSocket();
      if (!connected) {
        console.warn('Deepgram unavailable — falling back to WebSpeech');
        return this._activateFallback();
      }

      // Start streaming audio
      this._startAudioPipeline();

      this.isListening = true;
      this._reconnectFailures = 0;
      console.log('Deepgram Streaming STT started');
      return true;
    } catch (error) {
      console.error('Failed to start Deepgram Streaming STT:', error);
      // Mic errors should not trigger fallback — they'd fail on WebSpeech too
      if (error.name === 'NotFoundError' || error.name === 'DevicesNotFoundError') {
        if (this.onError) this.onError('no-device');
        return false;
      } else if (error.name === 'NotAllowedError') {
        if (this.onError) this.onError('not-allowed');
        return false;
      }
      // Network / Deepgram error — try fallback
      console.warn('Deepgram error — falling back to WebSpeech');
      return this._activateFallback();
    }
  }

  /** Stop listening entirely: tear down socket, audio pipeline, timers, and mic. */
  stop() {
    if (this._usingFallback && this._fallback) {
      this._fallback.stop();
      this.isListening = false;
      return;
    }

    this.isListening = false;
    this._micMuted = false;
    this._muteActive = false;
    this._intentionalClose = true;     // suppress the auto-reconnect in onclose

    this._stopAudioPipeline();
    this._closeWebSocket();
    this._clearTimers();

    // Release mic stream
    if (this._stream) {
      this._stream.getTracks().forEach(t => t.stop());
      this._stream = null;
    }

    console.log('Deepgram Streaming STT stopped');
  }

  /** Clear the processing flag and any partially accumulated transcript. */
  resetProcessing() {
    if (this._usingFallback && this._fallback) {
      this._fallback.resetProcessing();
      return;
    }
    this.isProcessing = false;
    this.accumulatedText = '';
  }

  /** Alias for mute() — VoiceConversation calls pause() during greeting. */
  pause() {
    this.mute();
  }

  /**
   * Mute STT — called when TTS starts speaking.
   * Sends KeepAlive to Deepgram to pause without disconnecting,
   * and ignores any incoming transcripts.
   */
  mute() {
    if (this._usingFallback && this._fallback) {
      this._fallback.mute();
      return;
    }
    this._muteActive = true;
    this.isProcessing = true;
    this.accumulatedText = '';
    if (this._accumulationTimer) {
      clearTimeout(this._accumulationTimer);
      this._accumulationTimer = null;
    }
    // Don't close the WebSocket — just stop sending audio.
    // Deepgram's KeepAlive keeps the connection alive without audio.
    this._sendKeepAlive();
  }

  /**
   * Resume STT after TTS finishes.
   * Audio pipeline is still running, just start paying attention again.
   */
  resume() {
    if (this._usingFallback && this._fallback) {
      this._fallback.resume();
      return;
    }
    this._muteActive = false;
    this.isProcessing = false;
    this.accumulatedText = '';

    // If WebSocket died during mute, reconnect
    if (this.isListening && !this._micMuted && (!this._ws || this._ws.readyState !== WebSocket.OPEN)) {
      this._connectWebSocket().then(ok => {
        if (ok) {
          this._startAudioPipeline();
        } else {
          // Reconnect failed — fall back
          console.warn('Deepgram reconnect failed on resume — falling back to WebSpeech');
          this._activateFallback();
        }
      }).catch(err => {
        console.error('Deepgram Streaming STT: reconnect on resume failed:', err);
        this._activateFallback();
      });
    }
  }

  // --- PTT helpers (proxy to fallback when active) ---

  /** Push-to-talk press: unmute, reset state, and make sure the socket/pipeline are live. */
  pttActivate() {
    if (this._usingFallback && this._fallback) { this._fallback.pttActivate(); return; }
    this._pttHolding = true;
    this._micMuted = false;
    this._muteActive = false;
    this.isProcessing = false;
    this.accumulatedText = '';
    if (this._accumulationTimer) { clearTimeout(this._accumulationTimer); this._accumulationTimer = null; }

    // Ensure WebSocket and audio pipeline are active
    if (!this._ws || this._ws.readyState !== WebSocket.OPEN) {
      this._connectWebSocket().then(ok => {
        if (ok) this._startAudioPipeline();
      });
    }
  }

  /**
   * Push-to-talk release: ask Deepgram to finalize, then after a short grace
   * period deliver whatever accumulated to onResult.
   * NOTE(review): CloseStream ends the Deepgram stream; a later pttActivate
   * reconnects. The 300 ms wait is a heuristic for the final transcript — confirm
   * it is long enough in practice.
   */
  pttRelease() {
    if (this._usingFallback && this._fallback) { this._fallback.pttRelease(); return; }
    this._pttHolding = false;
    this._micMuted = true;

    // Tell Deepgram we're done speaking — triggers final transcript
    if (this._ws && this._ws.readyState === WebSocket.OPEN) {
      this._ws.send(JSON.stringify({ type: 'CloseStream' }));
    }

    // Wait briefly for final transcript, then send accumulated
    setTimeout(() => {
      const text = this.accumulatedText.trim();
      if (text && this.onResult) {
        console.log('PTT release — sending:', text);
        this.isProcessing = true;
        this.onResult(text);
      }
      this.accumulatedText = '';
    }, 300);
  }

  /** Hard mute for PTT mode: drop frames and discard anything accumulated. */
  pttMute() {
    if (this._usingFallback && this._fallback) { this._fallback.pttMute(); return; }
    this._pttHolding = false;
    this._micMuted = true;
    this.isProcessing = true;
    this.accumulatedText = '';
    if (this._accumulationTimer) { clearTimeout(this._accumulationTimer); this._accumulationTimer = null; }
  }

  /** Leave PTT mute: clear flags and reconnect the socket if it went down. */
  pttUnmute() {
    if (this._usingFallback && this._fallback) { this._fallback.pttUnmute(); return; }
    this._micMuted = false;
    this._pttHolding = false;
    this.isProcessing = false;
    this.accumulatedText = '';

    if (this.isListening && (!this._ws || this._ws.readyState !== WebSocket.OPEN)) {
      this._connectWebSocket().then(ok => {
        if (ok) this._startAudioPipeline();
      });
    }
  }

  // ---- Fallback ----

  /**
   * Activate WebSpeech fallback. Tears down any Deepgram state, creates a
   * WebSpeechSTT instance, wires all callbacks through, and starts it.
   * @returns {Promise<boolean>} resolves with the fallback's start() result.
   */
  _activateFallback() {
    // Clean up Deepgram state
    this._stopAudioPipeline();
    this._closeWebSocket();
    this._clearTimers();
    // Release mic stream — WebSpeech manages its own
    if (this._stream) {
      this._stream.getTracks().forEach(t => t.stop());
      this._stream = null;
    }

    this._usingFallback = true;

    if (!this._fallback) {
      this._fallback = new WebSpeechSTT();
    }

    // Wire callbacks through so callers see the same interface
    this._syncFallbackCallbacks();

    console.warn('[STT] Now using WebSpeech fallback');
    // Report so the UI can show a notice if desired
    // (best-effort telemetry POST; failures are deliberately ignored)
    try {
      fetch('/api/stt-events', {
        method: 'POST',
        headers: { 'Content-Type': 'application/json' },
        body: JSON.stringify({
          error: 'deepgram-fallback',
          message: 'Deepgram unavailable — using WebSpeech fallback',
          provider: 'deepgram-streaming',
          source: 'stt',
        }),
      }).catch(() => {});
    } catch (_) {}

    return this._fallback.start().then(ok => {
      this.isListening = ok;
      return ok;
    });
  }

  /**
   * Sync current callback references to the fallback instance.
   * Called when fallback activates and whenever callbacks might have changed.
   * Proxies read this.on* at call time, so later reassignment of the outer
   * callbacks still takes effect.
   */
  _syncFallbackCallbacks() {
    if (!this._fallback) return;
    this._fallback.onResult = (...args) => { if (this.onResult) this.onResult(...args); };
    this._fallback.onError = (...args) => { if (this.onError) this.onError(...args); };
    this._fallback.onListenFinal = (...args) => { if (this.onListenFinal) this.onListenFinal(...args); };
    // WebSpeechSTT has onInterim — proxy it
    this._fallback.onInterim = (...args) => { if (this.onInterim) this.onInterim(...args); };
  }

  // ---- WebSocket Connection ----

  /**
   * Fetch a short-lived token from our server, then open the Deepgram
   * /v1/listen WebSocket. Installs onopen/onmessage/onerror/onclose handlers;
   * onclose carries the auto-reconnect/backoff logic.
   * @returns {Promise<boolean>} true once the socket is OPEN, false on any failure.
   */
  async _connectWebSocket() {
    // Get a temporary API key from our server (don't expose the real key to the browser)
    let apiKey;
    try {
      const resp = await fetch(`${this.serverUrl}/api/stt/deepgram/token`);
      if (!resp.ok) {
        console.error('Deepgram token endpoint failed:', resp.status);
        return false;
      }
      const data = await resp.json();
      apiKey = data.token;
      if (!apiKey) {
        console.error('Deepgram token endpoint returned no token');
        return false;
      }
    } catch (err) {
      console.error('Failed to get Deepgram token:', err);
      return false;
    }

    return new Promise((resolve) => {
      // Query parameters match the raw PCM we stream: 16 kHz, mono, linear16.
      const params = new URLSearchParams({
        model: this._model,
        language: this._language,
        smart_format: 'true',
        punctuate: 'true',
        interim_results: 'true',
        utterance_end_ms: '1000',
        vad_events: 'true',
        endpointing: '300',
        encoding: 'linear16',
        sample_rate: '16000',
        channels: '1',
      });

      const url = `wss://api.deepgram.com/v1/listen?${params}`;
      this._intentionalClose = false;

      try {
        // Auth is passed via the WebSocket subprotocol pair ['token', key]
        // since browsers cannot set an Authorization header on WebSockets.
        this._ws = new WebSocket(url, ['token', apiKey]);
      } catch (err) {
        console.error('Deepgram WebSocket creation failed:', err);
        resolve(false);
        return;
      }

      // Give the handshake 5 s; a stuck CONNECTING socket is closed and reported as failure.
      const timeout = setTimeout(() => {
        if (this._ws && this._ws.readyState === WebSocket.CONNECTING) {
          console.error('Deepgram WebSocket connection timeout');
          this._ws.close();
          resolve(false);
        }
      }, 5000);

      this._ws.onopen = () => {
        clearTimeout(timeout);
        console.log('Deepgram WebSocket connected');
        this._reconnectFailures = 0;
        this._startKeepAlive();
        resolve(true);
      };

      this._ws.onmessage = (event) => {
        this._handleMessage(event);
      };

      this._ws.onerror = (event) => {
        clearTimeout(timeout);
        console.error('Deepgram WebSocket error:', event);
      };

      this._ws.onclose = (event) => {
        clearTimeout(timeout);
        this._stopKeepAlive();
        console.log(`Deepgram WebSocket closed (code: ${event.code})`);

        // Auto-reconnect if not intentional and still supposed to be listening
        if (!this._intentionalClose && this.isListening && !this._micMuted && !this._reconnecting) {
          this._reconnectFailures++;

          // After 3 failed reconnects, give up and fall back to WebSpeech
          if (this._reconnectFailures >= 3) {
            console.warn(`Deepgram: ${this._reconnectFailures} reconnect failures — falling back to WebSpeech`);
            this._activateFallback();
            return;
          }

          this._reconnecting = true;
          // Exponential backoff: 1s, 2s, 4s... capped at 5s.
          const delay = Math.min(1000 * Math.pow(2, this._reconnectFailures - 1), 5000);
          console.log(`Deepgram: reconnecting in ${delay}ms (attempt ${this._reconnectFailures}/3)...`);
          setTimeout(() => {
            this._reconnecting = false;
            if (this.isListening && !this._intentionalClose) {
              this._connectWebSocket().then(ok => {
                if (ok) {
                  this._startAudioPipeline();
                } else {
                  // Connection failed — count as another failure and maybe fallback
                  this._reconnectFailures++;
                  if (this._reconnectFailures >= 3) {
                    console.warn('Deepgram: reconnect failed — falling back to WebSpeech');
                    this._activateFallback();
                  }
                }
              });
            }
          }, delay);
        }

        if (this._ws === null) return; // already cleaned up
        // Resolve false for a close during the handshake; after a successful
        // onopen this resolve is a no-op (promises settle once).
        resolve(false);
      };
    });
  }

  /** Close the Deepgram socket cleanly, requesting a final transcript first. */
  _closeWebSocket() {
    this._stopKeepAlive();
    if (this._ws) {
      this._intentionalClose = true;
      // Send CloseStream to get final transcript before closing
      if (this._ws.readyState === WebSocket.OPEN) {
        try {
          this._ws.send(JSON.stringify({ type: 'CloseStream' }));
        } catch (_) {}
      }
      this._ws.close();
      this._ws = null;
    }
  }

  /** Send one KeepAlive control message if the socket is open (best effort). */
  _sendKeepAlive() {
    if (this._ws && this._ws.readyState === WebSocket.OPEN) {
      try {
        this._ws.send(JSON.stringify({ type: 'KeepAlive' }));
      } catch (_) {}
    }
  }

  /** Start (or restart) the periodic KeepAlive sender. */
  _startKeepAlive() {
    this._stopKeepAlive();
    // Send KeepAlive every 8 seconds to prevent timeout
    this._keepAliveInterval = setInterval(() => {
      this._sendKeepAlive();
    }, 8000);
  }

  /** Cancel the periodic KeepAlive sender if running. */
  _stopKeepAlive() {
    if (this._keepAliveInterval) {
      clearInterval(this._keepAliveInterval);
      this._keepAliveInterval = null;
    }
  }

  // ---- Audio Pipeline ----

  /**
   * Build the capture graph: mic stream → ScriptProcessor → (Int16 PCM) → WebSocket.
   * Frames are dropped (not buffered) while muted or while the socket is down.
   */
  _startAudioPipeline() {
    // Clean up existing pipeline
    this._stopAudioPipeline();

    if (!this._stream || !this._stream.active) return;

    this._audioCtx = new AudioContext({ sampleRate: 16000 });
    this._sourceNode = this._audioCtx.createMediaStreamSource(this._stream);

    // ScriptProcessorNode for raw PCM access (AudioWorklet would be better
    // but requires a separate file and HTTPS — this works everywhere)
    const bufferSize = 4096;
    this._processorNode = this._audioCtx.createScriptProcessor(bufferSize, 1, 1);

    this._processorNode.onaudioprocess = (event) => {
      if (this._muteActive || this._micMuted) return;
      if (!this._ws || this._ws.readyState !== WebSocket.OPEN) return;

      const inputData = event.inputBuffer.getChannelData(0);

      // Convert Float32 [-1, 1] to Int16 PCM
      const pcm16 = new Int16Array(inputData.length);
      for (let i = 0; i < inputData.length; i++) {
        const s = Math.max(-1, Math.min(1, inputData[i]));
        // Asymmetric scaling: int16 range is -32768..32767.
        pcm16[i] = s < 0 ? s * 0x8000 : s * 0x7FFF;
      }

      // Send raw PCM bytes to Deepgram
      this._ws.send(pcm16.buffer);
    };

    this._sourceNode.connect(this._processorNode);
    // ScriptProcessor must be connected to a destination to fire onaudioprocess.
    this._processorNode.connect(this._audioCtx.destination);
  }

  /** Tear down the capture graph and close the AudioContext. */
  _stopAudioPipeline() {
    if (this._processorNode) {
      this._processorNode.disconnect();
      this._processorNode = null;
    }
    if (this._sourceNode) {
      this._sourceNode.disconnect();
      this._sourceNode = null;
    }
    if (this._audioCtx) {
      this._audioCtx.close().catch(() => {});
      this._audioCtx = null;
    }
  }

  // ---- Message Handling ----

  /**
   * Dispatch one Deepgram WebSocket message: VAD events, UtteranceEnd (flush),
   * and Results (interim/final transcripts with hallucination filtering and
   * accumulation). Non-JSON frames are ignored.
   */
  _handleMessage(event) {
    let data;
    try {
      data = JSON.parse(event.data);
    } catch (_) {
      return;
    }

    // Speech started event (Deepgram VAD)
    if (data.type === 'SpeechStarted') {
      // Could emit event for UI feedback
      return;
    }

    // UtteranceEnd — Deepgram detected end of utterance (silence after speech)
    if (data.type === 'UtteranceEnd') {
      this._flushAccumulated();
      return;
    }

    // Transcript results
    if (data.type === 'Results') {
      const channel = data.channel;
      if (!channel || !channel.alternatives || !channel.alternatives.length) return;

      const transcript = channel.alternatives[0].transcript || '';
      const isFinal = data.is_final;
      const speechFinal = data.speech_final;

      if (!transcript.trim()) return;

      // Ignore during mute (TTS playing)
      if (this._muteActive || (this.isProcessing && !this._pttHolding)) return;

      if (isFinal) {
        // Filter hallucinations
        if (this._isHallucination(transcript)) {
          console.log('Deepgram Streaming: filtered hallucination:', transcript);
          return;
        }

        console.log('Deepgram Streaming final:', transcript);
        if (this.onListenFinal) this.onListenFinal(transcript.trim());

        // PTT mode: accumulate and wait for pttRelease to send
        if (this._pttHolding) {
          this.accumulatedText = this.accumulatedText
            ? this.accumulatedText + ' ' + transcript.trim()
            : transcript.trim();
          return;
        }

        // Accumulate finals
        this.accumulatedText = this.accumulatedText
          ? this.accumulatedText + ' ' + transcript.trim()
          : transcript.trim();

        // If speech_final (Deepgram's endpointing), flush after short accumulation window
        if (speechFinal) {
          if (this._accumulationTimer) {
            clearTimeout(this._accumulationTimer);
          }
          this._accumulationTimer = setTimeout(() => {
            this._accumulationTimer = null;
            this._flushAccumulated();
          }, this.accumulationDelayMs);
        }
      } else {
        // Interim result — show live feedback
        if (this.onInterim) {
          const preview = this.accumulatedText
            ? this.accumulatedText + ' ' + transcript.trim()
            : transcript.trim();
          this.onInterim(preview);
        }
      }
    }
  }

  /**
   * Deliver the accumulated finals to onResult, unless the text is too short
   * (fewer than 2 alphanumerics) or matches a known hallucination. Always
   * clears the accumulation buffer and any pending flush timer.
   */
  _flushAccumulated() {
    if (this._accumulationTimer) {
      clearTimeout(this._accumulationTimer);
      this._accumulationTimer = null;
    }

    const text = this.accumulatedText.trim();
    if (!text) return;

    // Filter garbage
    const meaningful = text.replace(/[^a-zA-Z0-9]/g, '');
    if (meaningful.length < 2) {
      console.log('Deepgram Streaming: filtered too short:', text);
      this.accumulatedText = '';
      return;
    }

    if (this._isHallucination(text)) {
      console.log('Deepgram Streaming: filtered hallucination:', text);
      this.accumulatedText = '';
      return;
    }

    console.log('Deepgram Streaming result:', text);
    this.isProcessing = true;
    if (this.onResult) this.onResult(text);
    this.accumulatedText = '';
  }

  /**
   * Heuristic filter for STT hallucinations: exact phrase matches, known
   * substrings, fewer than 3 alphanumerics, or text where one word makes up
   * at least half of 4+ words (repetition loop).
   * @param {string} text
   * @returns {boolean} true when the text should be discarded.
   */
  _isHallucination(text) {
    const lower = text.toLowerCase().replace(/[.!?,;:]+$/, '');
    if (this._hallucinations.has(lower)) return true;

    const meaningful = text.replace(/[^a-zA-Z0-9]/g, '');
    if (meaningful.length < 3) return true;

    for (const sub of this._hallucinationSubstrings) {
      if (lower.includes(sub)) return true;
    }

    // Repetitive pattern check
    const words = text.match(/[a-zA-Z]+/g);
    if (words && words.length >= 4) {
      const counts = {};
      for (const w of words) {
        const wl = w.toLowerCase();
        counts[wl] = (counts[wl] || 0) + 1;
      }
      const max = Math.max(...Object.values(counts));
      if (max / words.length >= 0.5) return true;
    }

    return false;
  }

  /** Cancel the accumulation flush timer and the KeepAlive interval. */
  _clearTimers() {
    if (this._accumulationTimer) {
      clearTimeout(this._accumulationTimer);
      this._accumulationTimer = null;
    }
    this._stopKeepAlive();
  }
}
+
704
+
705
+ // ===== DEEPGRAM STREAMING WAKE WORD DETECTOR =====
706
+ class DeepgramStreamingWakeWordDetector {
707
+ constructor() {
708
+ this.isListening = false;
709
+ this.onWakeWordDetected = null;
710
+ this.wakeWords = ['wake up'];
711
+ this._stt = null;
712
+ }
713
+
714
+ isSupported() {
715
+ return !!(navigator.mediaDevices && navigator.mediaDevices.getUserMedia);
716
+ }
717
+
718
+ async start() {
719
+ if (this.isListening) return true;
720
+
721
+ this._stt = new DeepgramStreamingSTT();
722
+
723
+ this._stt.onResult = (transcript) => {
724
+ const lower = transcript.toLowerCase();
725
+ console.log(`Wake word detector heard: "${transcript}"`);
726
+ if (this.wakeWords.some(ww => lower.includes(ww))) {
727
+ console.log('Wake word detected!');
728
+ if (this.onWakeWordDetected) this.onWakeWordDetected();
729
+ }
730
+ };
731
+
732
+ this._stt.onError = (error) => {
733
+ console.warn('Wake word detector error:', error);
734
+ };
735
+
736
+ this.isListening = true;
737
+ const ok = await this._stt.start();
738
+ if (!ok) {
739
+ this.isListening = false;
740
+ return false;
741
+ }
742
+
743
+ console.log('Deepgram Streaming wake word detector started');
744
+ return true;
745
+ }
746
+
747
+ stop() {
748
+ this.isListening = false;
749
+ if (this._stt) {
750
+ this._stt.stop();
751
+ this._stt = null;
752
+ }
753
+ console.log('Deepgram Streaming wake word detector stopped');
754
+ }
755
+
756
+ async toggle() {
757
+ if (this.isListening) {
758
+ this.stop();
759
+ return false;
760
+ } else {
761
+ return await this.start();
762
+ }
763
+ }
764
+ }
765
+
766
+ export { DeepgramStreamingSTT, DeepgramStreamingWakeWordDetector };