openvoiceui 1.0.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (185)
  1. package/.env.example +104 -0
  2. package/Dockerfile +30 -0
  3. package/LICENSE +21 -0
  4. package/README.md +638 -0
  5. package/SETUP.md +360 -0
  6. package/app.py +232 -0
  7. package/auto-approve-devices.js +111 -0
  8. package/cli/index.js +372 -0
  9. package/config/__init__.py +4 -0
  10. package/config/default.yaml +43 -0
  11. package/config/flags.yaml +67 -0
  12. package/config/loader.py +203 -0
  13. package/config/providers.yaml +71 -0
  14. package/config/speech_normalization.yaml +182 -0
  15. package/config/theme.json +4 -0
  16. package/data/greetings.json +25 -0
  17. package/default-pages/ai-image-creator.html +915 -0
  18. package/default-pages/bulk-image-uploader.html +492 -0
  19. package/default-pages/desktop.html +2865 -0
  20. package/default-pages/file-explorer.html +854 -0
  21. package/default-pages/interactive-map.html +655 -0
  22. package/default-pages/style-guide.html +1005 -0
  23. package/default-pages/website-setup.html +1623 -0
  24. package/deploy/openclaw/Dockerfile +46 -0
  25. package/deploy/openvoiceui.service +30 -0
  26. package/deploy/setup-nginx.sh +50 -0
  27. package/deploy/setup-sudo.sh +306 -0
  28. package/deploy/skill-runner/Dockerfile +19 -0
  29. package/deploy/skill-runner/requirements.txt +14 -0
  30. package/deploy/skill-runner/server.py +269 -0
  31. package/deploy/supertonic/Dockerfile +22 -0
  32. package/deploy/supertonic/server.py +79 -0
  33. package/docker-compose.pinokio.yml +11 -0
  34. package/docker-compose.yml +59 -0
  35. package/greetings.json +25 -0
  36. package/index.html +65 -0
  37. package/inject-device-identity.js +142 -0
  38. package/package.json +82 -0
  39. package/profiles/default.json +114 -0
  40. package/profiles/manager.py +354 -0
  41. package/profiles/schema.json +337 -0
  42. package/prompts/voice-system-prompt.md +149 -0
  43. package/providers/__init__.py +39 -0
  44. package/providers/base.py +63 -0
  45. package/providers/llm/__init__.py +12 -0
  46. package/providers/llm/base.py +71 -0
  47. package/providers/llm/clawdbot_provider.py +112 -0
  48. package/providers/llm/zai_provider.py +115 -0
  49. package/providers/registry.py +320 -0
  50. package/providers/stt/__init__.py +12 -0
  51. package/providers/stt/base.py +58 -0
  52. package/providers/stt/webspeech_provider.py +49 -0
  53. package/providers/stt/whisper_provider.py +100 -0
  54. package/providers/tts/__init__.py +20 -0
  55. package/providers/tts/base.py +91 -0
  56. package/providers/tts/groq_provider.py +74 -0
  57. package/providers/tts/supertonic_provider.py +72 -0
  58. package/requirements.txt +38 -0
  59. package/routes/__init__.py +10 -0
  60. package/routes/admin.py +515 -0
  61. package/routes/canvas.py +1315 -0
  62. package/routes/chat.py +51 -0
  63. package/routes/conversation.py +2158 -0
  64. package/routes/elevenlabs_hybrid.py +306 -0
  65. package/routes/greetings.py +98 -0
  66. package/routes/icons.py +279 -0
  67. package/routes/image_gen.py +364 -0
  68. package/routes/instructions.py +190 -0
  69. package/routes/music.py +838 -0
  70. package/routes/onboarding.py +43 -0
  71. package/routes/pi.py +62 -0
  72. package/routes/profiles.py +215 -0
  73. package/routes/report_issue.py +68 -0
  74. package/routes/static_files.py +533 -0
  75. package/routes/suno.py +664 -0
  76. package/routes/theme.py +81 -0
  77. package/routes/transcripts.py +199 -0
  78. package/routes/vision.py +348 -0
  79. package/routes/workspace.py +288 -0
  80. package/server.py +1510 -0
  81. package/services/__init__.py +1 -0
  82. package/services/auth.py +143 -0
  83. package/services/canvas_versioning.py +239 -0
  84. package/services/db_pool.py +107 -0
  85. package/services/gateway.py +16 -0
  86. package/services/gateway_manager.py +333 -0
  87. package/services/gateways/__init__.py +12 -0
  88. package/services/gateways/base.py +110 -0
  89. package/services/gateways/compat.py +264 -0
  90. package/services/gateways/openclaw.py +1134 -0
  91. package/services/health.py +100 -0
  92. package/services/memory_client.py +455 -0
  93. package/services/paths.py +26 -0
  94. package/services/speech_normalizer.py +285 -0
  95. package/services/tts.py +270 -0
  96. package/setup-config.js +262 -0
  97. package/sounds/air_horn.mp3 +0 -0
  98. package/sounds/bruh.mp3 +0 -0
  99. package/sounds/crowd_cheer.mp3 +0 -0
  100. package/sounds/gunshot.mp3 +0 -0
  101. package/sounds/impact.mp3 +0 -0
  102. package/sounds/lets_go.mp3 +0 -0
  103. package/sounds/record_stop.mp3 +0 -0
  104. package/sounds/rewind.mp3 +0 -0
  105. package/sounds/sad_trombone.mp3 +0 -0
  106. package/sounds/scratch_long.mp3 +0 -0
  107. package/sounds/yeah.mp3 +0 -0
  108. package/src/adapters/ClawdBotAdapter.js +264 -0
  109. package/src/adapters/_template.js +133 -0
  110. package/src/adapters/elevenlabs-classic.js +841 -0
  111. package/src/adapters/elevenlabs-hybrid.js +812 -0
  112. package/src/adapters/hume-evi.js +676 -0
  113. package/src/admin.html +1339 -0
  114. package/src/app.js +8802 -0
  115. package/src/core/Config.js +173 -0
  116. package/src/core/EmotionEngine.js +307 -0
  117. package/src/core/EventBridge.js +180 -0
  118. package/src/core/EventBus.js +117 -0
  119. package/src/core/VoiceSession.js +607 -0
  120. package/src/face/BaseFace.js +259 -0
  121. package/src/face/EyeFace.js +208 -0
  122. package/src/face/HaloSmokeFace.js +509 -0
  123. package/src/face/manifest.json +27 -0
  124. package/src/face/previews/eyes.svg +16 -0
  125. package/src/face/previews/orb.svg +29 -0
  126. package/src/features/MusicPlayer.js +620 -0
  127. package/src/features/Soundboard.js +128 -0
  128. package/src/providers/DeepgramSTT.js +472 -0
  129. package/src/providers/DeepgramStreamingSTT.js +766 -0
  130. package/src/providers/GroqSTT.js +559 -0
  131. package/src/providers/TTSPlayer.js +323 -0
  132. package/src/providers/WebSpeechSTT.js +479 -0
  133. package/src/providers/tts/BaseTTSProvider.js +81 -0
  134. package/src/providers/tts/HumeProvider.js +77 -0
  135. package/src/providers/tts/SupertonicProvider.js +174 -0
  136. package/src/providers/tts/index.js +140 -0
  137. package/src/shell/adapter-registry.js +154 -0
  138. package/src/shell/caller-bridge.js +35 -0
  139. package/src/shell/camera-bridge.js +28 -0
  140. package/src/shell/canvas-bridge.js +32 -0
  141. package/src/shell/commercial-bridge.js +44 -0
  142. package/src/shell/face-bridge.js +44 -0
  143. package/src/shell/music-bridge.js +60 -0
  144. package/src/shell/orchestrator.js +233 -0
  145. package/src/shell/profile-discovery.js +303 -0
  146. package/src/shell/sounds-bridge.js +28 -0
  147. package/src/shell/transcript-bridge.js +61 -0
  148. package/src/shell/waveform-bridge.js +33 -0
  149. package/src/styles/base.css +2862 -0
  150. package/src/styles/face.css +417 -0
  151. package/src/styles/pi-overrides.css +89 -0
  152. package/src/styles/theme-dark.css +67 -0
  153. package/src/test-tts.html +175 -0
  154. package/src/ui/AppShell.js +544 -0
  155. package/src/ui/ProfileSwitcher.js +228 -0
  156. package/src/ui/SessionControl.js +240 -0
  157. package/src/ui/face/FacePicker.js +195 -0
  158. package/src/ui/face/FaceRenderer.js +309 -0
  159. package/src/ui/settings/PlaylistEditor.js +366 -0
  160. package/src/ui/settings/SettingsPanel.css +684 -0
  161. package/src/ui/settings/SettingsPanel.js +419 -0
  162. package/src/ui/settings/TTSVoicePreview.js +210 -0
  163. package/src/ui/themes/ThemeManager.js +213 -0
  164. package/src/ui/visualizers/BaseVisualizer.js +29 -0
  165. package/src/ui/visualizers/PartyFXVisualizer.css +291 -0
  166. package/src/ui/visualizers/PartyFXVisualizer.js +637 -0
  167. package/static/emulators/jsdos/js-dos.css +1 -0
  168. package/static/emulators/jsdos/js-dos.js +22 -0
  169. package/static/favicon.svg +55 -0
  170. package/static/icons/apple-touch-icon.png +0 -0
  171. package/static/icons/favicon-32.png +0 -0
  172. package/static/icons/icon-192.png +0 -0
  173. package/static/icons/icon-512.png +0 -0
  174. package/static/install.html +449 -0
  175. package/static/manifest.json +26 -0
  176. package/static/sw.js +21 -0
  177. package/tts_providers/__init__.py +136 -0
  178. package/tts_providers/base_provider.py +319 -0
  179. package/tts_providers/groq_provider.py +155 -0
  180. package/tts_providers/hume_provider.py +226 -0
  181. package/tts_providers/providers_config.json +119 -0
  182. package/tts_providers/qwen3_provider.py +371 -0
  183. package/tts_providers/resemble_provider.py +315 -0
  184. package/tts_providers/supertonic_provider.py +557 -0
  185. package/tts_providers/supertonic_tts.py +399 -0
@@ -0,0 +1,607 @@
1
+ /**
2
+ * VoiceSession — slim orchestrator (replaces monolithic ClawdBotMode)
3
+ *
4
+ * Wires together the extracted modules:
5
+ * - WebSpeechSTT (P3-T3) — speech recognition
6
+ * - TTSPlayer (P3-T4) — audio playback + waveform
7
+ * - FaceManager (P3-T5) — face mood / amplitude
8
+ * - MusicPlayer (P3-T6) — music + ducking (auto via EventBus tts:start/tts:stop)
9
+ * - EmotionEngine (P3-T8) — emotion inference → face mood
10
+ * - EventBus (P3-T1) — pub/sub glue
11
+ *
12
+ * VoiceSession is NOT responsible for UI updates (transcript panel, action console,
13
+ * canvas/music commands from AI text). It emits events on eventBus and callers
14
+ * subscribe. The one exception is the canvas/music command parser which is
15
+ * included here because it is pure logic with no DOM dependency.
16
+ *
17
+ * Usage:
18
+ * import { VoiceSession } from './core/VoiceSession.js';
19
+ *
20
+ * const session = new VoiceSession({ serverUrl: 'https://your-server' });
21
+ * await session.start();
22
+ *
23
+ * // Subscribe via EventBus:
24
+ * import { eventBus } from './core/EventBus.js';
25
+ * eventBus.on('session:message', ({ role, text }) => { ... });
26
+ * eventBus.on('session:streaming', ({ text }) => { ... });
27
+ * eventBus.on('session:thinking', () => { ... });
28
+ * eventBus.on('session:listening', () => { ... });
29
+ * eventBus.on('session:error', ({ message }) => { ... });
30
+ * eventBus.on('session:tool', ({ name }) => { ... });
31
+ * eventBus.on('tts:start', () => { ... });
32
+ * eventBus.on('tts:stop', () => { ... });
33
+ *
34
+ * EventBus events emitted (inbound — modules listen):
35
+ * 'tts:start' consumed by MusicPlayer.duck(true)
36
+ * 'tts:stop' consumed by MusicPlayer.duck(false)
37
+ *
38
+ * ADR-009: simple manager pattern (no framework)
39
+ */
40
+
41
+ import { eventBus } from './EventBus.js';
42
+ import { WebSpeechSTT, WakeWordDetector } from '../providers/WebSpeechSTT.js';
43
+ import { DeepgramStreamingSTT } from '../providers/DeepgramStreamingSTT.js';
44
+ import { TTSPlayer } from '../providers/TTSPlayer.js';
45
+ import { faceManager } from '../face/BaseFace.js';
46
+ import { emotionEngine } from './EmotionEngine.js';
47
+
48
/**
 * Build the speech-to-text engine selected by the server profile.
 * A missing profile or an unrecognized provider falls back to the
 * browser's Web Speech engine.
 * @returns {DeepgramStreamingSTT|WebSpeechSTT} the STT instance to use
 */
function _createSTT() {
  const configured = window._serverProfile?.stt?.provider || 'webspeech';
  const wantsDeepgram = configured === 'deepgram-streaming' || configured === 'deepgram';

  if (wantsDeepgram) {
    console.log('[VoiceSession] STT provider: Deepgram Streaming');
    return new DeepgramStreamingSTT();
  }

  // Anything else (webspeech / groq / …) is handled by the Web Speech engine.
  console.log('[VoiceSession] STT provider: Chrome Web Speech');
  return new WebSpeechSTT();
}
62
+
63
+ export class VoiceSession {
64
+ /**
65
+ * @param {object} opts
66
+ * @param {string} opts.serverUrl — base URL of the Flask server
67
+ * @param {WakeWordDetector} [opts.wakeDetector] — shared wake detector (optional)
68
+ * @param {MusicPlayer} [opts.musicPlayer] — shared music player (optional)
69
+ */
70
+ constructor({ serverUrl = '', wakeDetector = null, musicPlayer = null } = {}) {
71
+ this.serverUrl = serverUrl;
72
+ this.musicPlayer = musicPlayer;
73
+
74
+ // Sub-modules
75
+ this.stt = _createSTT();
76
+ this.tts = new TTSPlayer();
77
+ this.wakeDetector = wakeDetector;
78
+
79
+ // Session state
80
+ this.sessionId = null;
81
+ this._ttsPlaying = false;
82
+ this._sessionGreeted = false;
83
+ this._pendingGreeting = null;
84
+ this._lastResponse = null;
85
+ this._restartWakeAfter = false;
86
+ this._active = false;
87
+ }
88
+
89
+ // ── Lifecycle ────────────────────────────────────────────────────────────
90
+
91
  /**
   * Initialize audio, wire up callbacks, send greeting, start STT.
   *
   * Must run under user activation: tts.init() creates an AudioContext,
   * which browsers gate behind a user gesture. Re-entry while a session
   * is already active is a no-op. Emits 'session:listening' (or
   * 'session:error' if the mic fails), then 'session:start'.
   */
  async start() {
    if (this._active) return;
    this._active = true;

    // Start emotion engine (wires session:message → faceManager.setMood)
    emotionEngine.start();

    // Init TTS audio context (requires user gesture — caller must ensure this)
    await this.tts.init();

    // Wire TTS → face amplitude
    this.tts.onAmplitude = (value) => faceManager.setAmplitude(value);

    // Wire TTS speaking state → EventBus (MusicPlayer auto-ducks on these)
    this.tts.onSpeakingChange = (isSpeaking) => {
      this._ttsPlaying = isSpeaking;
      if (isSpeaking) {
        eventBus.emit('tts:start', {});
        // Mute STT immediately when TTS starts — clears any queued echo text
        // and blocks onresult until TTS finishes. PTT/text interrupts bypass this.
        if (this.stt.mute) this.stt.mute();
      } else {
        eventBus.emit('tts:stop', {});
        // After TTS ends, signal STT can resume
        this._resumeListening();
      }
    };

    // Wire STT results → sendMessage
    this.stt.onResult = (transcript) => {
      if (this._ttsPlaying) {
        // Anything heard while we are speaking is our own echo — drop it.
        console.log('[VoiceSession] Ignoring transcript during TTS:', transcript);
        return;
      }
      if (transcript && transcript.trim()) {
        this.sendMessage(transcript.trim());
      }
    };

    this.stt.onError = (error) => {
      console.error('[VoiceSession] STT error:', error);
      eventBus.emit('session:error', { message: `Microphone: ${error}` });
    };

    // Stop wake detector before starting STT (both use Web Speech API)
    if (this.wakeDetector?.isListening) {
      this.wakeDetector.stop();
      this._restartWakeAfter = true; // stop() restores it when the session ends
    }

    // Generate session ID
    this.sessionId = `session_${Date.now()}_${Math.random().toString(36).slice(2, 8)}`;
    console.log('[VoiceSession] Session started:', this.sessionId);

    // Send greeting first (awaits TTS playback before starting STT)
    if (!this._sessionGreeted) {
      this._sessionGreeted = true;
      await this._sendGreeting();
    }

    // Start listening
    const started = await this.stt.start();
    if (started) {
      console.log('[VoiceSession] Listening started');
      eventBus.emit('session:listening', {});
    } else {
      eventBus.emit('session:error', { message: 'Failed to start microphone' });
    }

    eventBus.emit('session:start', { sessionId: this.sessionId });
  }
165
+
166
+ /**
167
+ * Stop STT, stop TTS, restore wake detector.
168
+ */
169
+ stop() {
170
+ if (!this._active) return;
171
+ this._active = false;
172
+
173
+ this.stt.stop();
174
+ this.tts.stop();
175
+ emotionEngine.stop();
176
+
177
+ this._sessionGreeted = false;
178
+ this._pendingGreeting = null;
179
+
180
+ // Restore wake detector
181
+ if (this._restartWakeAfter && this.wakeDetector?.isSupported()) {
182
+ this.wakeDetector.start();
183
+ this._restartWakeAfter = false;
184
+ }
185
+
186
+ console.log('[VoiceSession] Session stopped');
187
+ eventBus.emit('session:stop', {});
188
+ }
189
+
190
+ /**
191
+ * Destroy resources (TTSPlayer AudioContext).
192
+ */
193
+ destroy() {
194
+ this.stop();
195
+ this.tts.destroy();
196
+ }
197
+
198
+ // ── Message sending ──────────────────────────────────────────────────────
199
+
200
+ /**
201
+ * Send a user message to the server, stream the response.
202
+ * @param {string} text
203
+ */
204
+ async sendMessage(text) {
205
+ if (!text?.trim()) return;
206
+
207
+ // Prepend pending greeting context if first user reply
208
+ let messageToSend = text.trim();
209
+ if (this._pendingGreeting) {
210
+ messageToSend = `[You just greeted with: "${this._pendingGreeting}"] User replied: ${messageToSend}`;
211
+ this._pendingGreeting = null;
212
+ }
213
+
214
+ eventBus.emit('session:message', { role: 'user', text: text.trim() });
215
+ eventBus.emit('session:thinking', {});
216
+ faceManager.setMood('thinking');
217
+
218
+ const provider = localStorage.getItem('voice_provider') || 'groq';
219
+ const voice = localStorage.getItem('voice_voice') || 'M1';
220
+
221
+ try {
222
+ const response = await fetch(`${this.serverUrl}/api/conversation?stream=1`, {
223
+ method: 'POST',
224
+ headers: { 'Content-Type': 'application/json' },
225
+ body: JSON.stringify({
226
+ message: messageToSend,
227
+ tts_provider: provider,
228
+ voice: voice,
229
+ session_id: this.sessionId,
230
+ ui_context: this._getUIContext()
231
+ })
232
+ });
233
+
234
+ if (!response.ok) {
235
+ throw new Error(`API error: ${response.status}`);
236
+ }
237
+
238
+ await this._processStream(response.body);
239
+
240
+ } catch (error) {
241
+ console.error('[VoiceSession] sendMessage error:', error);
242
+ faceManager.setMood('sad');
243
+ eventBus.emit('session:error', { message: error.message });
244
+ setTimeout(() => faceManager.setMood('neutral'), 2000);
245
+ } finally {
246
+ // Safety net: if no TTS played, re-enable STT after a delay
247
+ setTimeout(() => {
248
+ if (!this._ttsPlaying && this._active && !this.stt.isListening) {
249
+ console.log('[VoiceSession] Safety net: restarting STT');
250
+ if (this.stt.resetProcessing) this.stt.resetProcessing();
251
+ this.stt.start();
252
+ eventBus.emit('session:listening', {});
253
+ }
254
+ }, 2000);
255
+ }
256
+ }
257
+
258
+ // ── Stream processing ────────────────────────────────────────────────────
259
+
260
  /**
   * Read and process NDJSON stream from /api/conversation.
   *
   * Event types handled, one JSON object per line:
   *   delta         — incremental text; emits 'session:streaming' and scans for command tags
   *   action        — tool/phase updates; emits 'session:action' / 'session:tool'
   *   text_done     — final text; dedupes vs. last response, strips reasoning and
   *                   command tags, emits 'session:message' and runs the final command pass
   *   audio         — base64 TTS audio; mutes STT and queues playback
   *   session_reset — server rotated the session id; emits 'session:reset'
   *   error         — server-side failure; emits 'session:error'
   *
   * The reader is always cancelled in the finally block, including on the
   * duplicate-response early return.
   * @param {ReadableStream} body — fetch response body carrying NDJSON lines
   */
  async _processStream(body) {
    const reader = body.getReader();
    const decoder = new TextDecoder();
    let buffer = '';          // undelivered partial line from the previous chunk
    let streamingText = '';   // assistant text accumulated so far
    let firstDelta = false;   // set once the first delta arrives
    const processedCmds = new Set(); // command tags already fired for this response

    try {
      while (true) {
        const { done, value } = await reader.read();
        if (done) break;
        buffer += decoder.decode(value, { stream: true });

        // Consume every complete newline-terminated JSON line in the buffer.
        let newlineIdx;
        while ((newlineIdx = buffer.indexOf('\n')) !== -1) {
          const line = buffer.slice(0, newlineIdx).trim();
          buffer = buffer.slice(newlineIdx + 1);
          if (!line) continue;

          let data;
          try {
            data = JSON.parse(line);
          } catch (_) {
            // Malformed line: log and keep going — don't kill the stream.
            console.warn('[VoiceSession] Failed to parse stream line:', line);
            continue;
          }

          if (data.type === 'delta') {
            streamingText += data.text;
            if (!firstDelta) {
              firstDelta = true;
              faceManager.setMood('neutral');
              eventBus.emit('session:streaming', { text: this._stripCmdTags(streamingText), started: true });
            } else {
              eventBus.emit('session:streaming', { text: this._stripCmdTags(streamingText) });
            }
            // Fire command tags as soon as they appear mid-stream.
            this._checkCmdsInStream(streamingText, processedCmds);
          }

          if (data.type === 'action') {
            eventBus.emit('session:action', { action: data.action });
            if (data.action?.type === 'tool' && data.action?.phase === 'start') {
              eventBus.emit('session:tool', { name: data.action.name });
            }
          }

          if (data.type === 'text_done') {
            const fullText = data.response || streamingText;
            const cleanText = this._stripReasoningTokens(fullText);
            const displayText = this._stripCmdTags(cleanText);

            // Identical to the previous reply — treat as a server echo and bail.
            if (displayText === this._lastResponse) {
              console.log('[VoiceSession] Skipping duplicate response');
              reader.cancel();
              return;
            }
            this._lastResponse = displayText;

            // Forward server-provided emotion state to EmotionEngine (ADR-004)
            if (data.emotion_state) {
              eventBus.emit('session:emotion', data.emotion_state);
            }

            eventBus.emit('session:message', { role: 'assistant', text: displayText });
            this._handleCmds(cleanText, processedCmds);

            if (data.actions) {
              eventBus.emit('session:actions', { actions: data.actions });
            }
          }

          if (data.type === 'audio') {
            if (data.audio) {
              console.log(`[VoiceSession] TTS ready (tts:${data.timing?.tts_ms}ms)`);
              // Mute STT before queuing audio — onSpeakingChange(true) will
              // also mute, but muting here ensures no echo from audio buffering lag
              if (this.stt.mute) this.stt.mute();
              this.tts.queue(data.audio);
            } else {
              console.warn('[VoiceSession] Audio event had no audio data');
            }
          }

          if (data.type === 'session_reset') {
            console.warn('[VoiceSession] Server session reset:', data.old, '→', data.new);
            eventBus.emit('session:reset', { old: data.old, new: data.new, reason: data.reason });
          }

          if (data.type === 'error') {
            console.error('[VoiceSession] Stream error:', data.error);
            eventBus.emit('session:error', { message: data.error });
          }
        }
      }
    } finally {
      // Always release the stream, even after an early return above.
      try { reader.cancel(); } catch (_) {}
    }
  }
363
+
364
+ // ── Greeting ─────────────────────────────────────────────────────────────
365
+
366
+ /**
367
+ * Play a random greeting via TTS before starting STT.
368
+ * Returns a Promise that resolves when TTS finishes.
369
+ */
370
+ async _sendGreeting() {
371
+ const greetings = [
372
+ "Hey there! What can I help you with?",
373
+ "Ready when you are. What's up?",
374
+ "Voice assistant online. What do you need?",
375
+ "I'm listening. Go ahead.",
376
+ "Hello! What would you like to do?",
377
+ "Standing by. What can I do for you?",
378
+ "At your service. What's on your mind?",
379
+ "Hey! What are we working on?",
380
+ "Online and ready. Fire away.",
381
+ "What's up? I'm all ears."
382
+ ];
383
+
384
+ const greeting = greetings[Math.floor(Math.random() * greetings.length)];
385
+ this._pendingGreeting = greeting;
386
+
387
+ eventBus.emit('session:message', { role: 'assistant', text: greeting });
388
+
389
+ return new Promise(async (resolve) => {
390
+ try {
391
+ const provider = localStorage.getItem('voice_provider') || 'groq';
392
+ const voice = localStorage.getItem('voice_voice') || 'M1';
393
+
394
+ const response = await fetch(`${this.serverUrl}/api/tts/generate`, {
395
+ method: 'POST',
396
+ headers: { 'Content-Type': 'application/json' },
397
+ body: JSON.stringify({ text: greeting, provider, voice })
398
+ });
399
+
400
+ if (!response.ok) {
401
+ resolve();
402
+ return;
403
+ }
404
+
405
+ const blob = await response.blob();
406
+ const base64 = await this._blobToBase64(blob);
407
+ await this.tts.play(base64);
408
+ resolve();
409
+
410
+ } catch (error) {
411
+ console.error('[VoiceSession] Greeting TTS error:', error);
412
+ resolve();
413
+ }
414
+ });
415
+ }
416
+
417
+ // ── Command parsing ──────────────────────────────────────────────────────
418
+
419
+ /**
420
+ * Strip [CANVAS:...], [MUSIC_PLAY], etc. tags from display text.
421
+ * @param {string} text
422
+ * @returns {string}
423
+ */
424
+ _stripCmdTags(text) {
425
+ if (!text) return '';
426
+ return text
427
+ .replace(/```html[\s\S]*?```/gi, '')
428
+ .replace(/```[\s\S]*?```/g, '')
429
+ .replace(/```html[\s\S]*/gi, '')
430
+ .replace(/```[\s\S]*/g, '')
431
+ .replace(/\[CANVAS_MENU\]/gi, '')
432
+ .replace(/\[CANVAS:[^\]]*\]/gi, '')
433
+ .replace(/\[MUSIC_PLAY(?::[^\]]*)?\]/gi, '')
434
+ .replace(/\[MUSIC_STOP\]/gi, '')
435
+ .replace(/\[MUSIC_NEXT\]/gi, '')
436
+ .replace(/\[SESSION_RESET\]/gi, '')
437
+ .replace(/\[SUNO_GENERATE:[^\]]*\]/gi, '')
438
+ .replace(/\[SPOTIFY:[^\]]*\]/gi, '')
439
+ .replace(/\[SLEEP\]/gi, '')
440
+ .replace(/\[REGISTER_FACE:[^\]]*\]/gi, '')
441
+ .replace(/\[SOUND:[^\]]*\]/gi, '')
442
+ .trim();
443
+ }
444
+
445
+ /**
446
+ * Check for commands while stream is in progress — emit events, don't block.
447
+ * Callers (UI) handle actual DOM/control actions.
448
+ */
449
+ _checkCmdsInStream(text, seen) {
450
+ if (!text) return;
451
+
452
+ if (/\[CANVAS_MENU\]/i.test(text) && !seen.has('CANVAS_MENU')) {
453
+ seen.add('CANVAS_MENU');
454
+ eventBus.emit('cmd:canvas_menu', {});
455
+ }
456
+
457
+ const canvasMatch = text.match(/\[CANVAS:([^\]]+)\]/i);
458
+ if (canvasMatch && !seen.has('CANVAS_PAGE')) {
459
+ seen.add('CANVAS_PAGE');
460
+ eventBus.emit('cmd:canvas_page', { page: canvasMatch[1].trim() });
461
+ }
462
+
463
+ const musicPlay = text.match(/\[MUSIC_PLAY(?::([^\]]+))?\]/i);
464
+ if (musicPlay && !seen.has('MUSIC_PLAY')) {
465
+ seen.add('MUSIC_PLAY');
466
+ const track = musicPlay[1]?.trim() || null;
467
+ eventBus.emit('cmd:music_play', { track });
468
+ if (this.musicPlayer) {
469
+ track ? this.musicPlayer.play(track) : this.musicPlayer.play();
470
+ }
471
+ }
472
+
473
+ if (/\[MUSIC_STOP\]/i.test(text) && !seen.has('MUSIC_STOP')) {
474
+ seen.add('MUSIC_STOP');
475
+ eventBus.emit('cmd:music_stop', {});
476
+ if (this.musicPlayer) this.musicPlayer.stop();
477
+ }
478
+
479
+ if (/\[MUSIC_NEXT\]/i.test(text) && !seen.has('MUSIC_NEXT')) {
480
+ seen.add('MUSIC_NEXT');
481
+ eventBus.emit('cmd:music_next', {});
482
+ if (this.musicPlayer) this.musicPlayer.next();
483
+ }
484
+
485
+ const sunoMatch = text.match(/\[SUNO_GENERATE:([^\]]+)\]/i);
486
+ if (sunoMatch && !seen.has('SUNO_GENERATE')) {
487
+ seen.add('SUNO_GENERATE');
488
+ eventBus.emit('cmd:suno_generate', { prompt: sunoMatch[1].trim() });
489
+ }
490
+
491
+ const spotifyMatch = text.match(/\[SPOTIFY:([^|\]]+)(?:\|([^\]]+))?\]/i);
492
+ if (spotifyMatch && !seen.has('SPOTIFY')) {
493
+ seen.add('SPOTIFY');
494
+ eventBus.emit('cmd:spotify', { track: spotifyMatch[1].trim(), artist: spotifyMatch[2]?.trim() || '' });
495
+ }
496
+
497
+ const soundMatch = text.match(/\[SOUND:([^\]]+)\]/i);
498
+ if (soundMatch && !seen.has('SOUND')) {
499
+ seen.add('SOUND');
500
+ eventBus.emit('cmd:sound', { sound: soundMatch[1].trim() });
501
+ }
502
+
503
+ const faceMatch = text.match(/\[REGISTER_FACE:([^\]]+)\]/i);
504
+ if (faceMatch && !seen.has('REGISTER_FACE')) {
505
+ seen.add('REGISTER_FACE');
506
+ eventBus.emit('cmd:register_face', { name: faceMatch[1].trim() });
507
+ }
508
+
509
+ if (/\[SLEEP\]/i.test(text) && !seen.has('SLEEP')) {
510
+ seen.add('SLEEP');
511
+ eventBus.emit('cmd:sleep', {});
512
+ }
513
+ }
514
+
515
+ /**
516
+ * Final command pass after text_done (catches anything missed during stream).
517
+ */
518
+ _handleCmds(text, seen) {
519
+ this._checkCmdsInStream(text, seen);
520
+ // AI music trigger scanning (phrase-based, not tag-based)
521
+ if (this.musicPlayer?.checkTriggers) {
522
+ this.musicPlayer.checkTriggers(text);
523
+ }
524
+ }
525
+
526
+ // ── Helpers ──────────────────────────────────────────────────────────────
527
+
528
+ /**
529
+ * Resume STT after TTS playback ends.
530
+ * Called from tts.onSpeakingChange(false).
531
+ */
532
+ _resumeListening() {
533
+ if (!this._active) return;
534
+ // 250ms settling delay: lets audio tail-off clear the mic before re-enabling
535
+ // STT so the last fragment of TTS audio isn't captured as user speech.
536
+ // (Reduced from 600ms — DeepgramStreamingSTT mutes its audio pipeline during
537
+ // TTS so echo is already suppressed; 250ms is enough for speaker decay.)
538
+ setTimeout(() => {
539
+ if (!this._active) return;
540
+ // resume() clears the mute flag AND restarts the engine — the engine
541
+ // may have stopped during TTS because onend no longer auto-restarts
542
+ // while muted. resetProcessing() alone won't restart a dead engine.
543
+ if (this.stt.resume) {
544
+ this.stt.resume();
545
+ } else if (this.stt.resetProcessing) {
546
+ this.stt.resetProcessing();
547
+ }
548
+ eventBus.emit('session:listening', {});
549
+ }, 250);
550
+ }
551
+
552
+ /**
553
+ * Strip LLM reasoning tokens (e.g., chain-of-thought preamble).
554
+ * @param {string} text
555
+ * @returns {string}
556
+ */
557
+ _stripReasoningTokens(text) {
558
+ if (!text) return text;
559
+ const patterns = [
560
+ /^.*?I should.*?\./s,
561
+ /^.*?NO_REPLY.*?\./s,
562
+ /^.*?The user.*?\./s,
563
+ /^.*?They say.*?\./s
564
+ ];
565
+ let cleaned = text;
566
+ for (const p of patterns) {
567
+ cleaned = cleaned.replace(p, '');
568
+ }
569
+ return cleaned.trim();
570
+ }
571
+
572
+ /**
573
+ * Gather UI context for the conversation API.
574
+ * @returns {object}
575
+ */
576
+ _getUIContext() {
577
+ const ctx = {
578
+ canvasDisplayed: window.canvasContext?.current_page ?? null,
579
+ canvasVisible: false,
580
+ musicPlaying: this.musicPlayer?.isPlaying ?? false,
581
+ musicTrack: this.musicPlayer?.currentMetadata?.title ?? null,
582
+ musicPanelOpen: this.musicPlayer ? this.musicPlayer.panelState !== 'closed' : false
583
+ };
584
+ return ctx;
585
+ }
586
+
587
+ /**
588
+ * Convert a Blob to a base64 string.
589
+ * @param {Blob} blob
590
+ * @returns {Promise<string>}
591
+ */
592
+ _blobToBase64(blob) {
593
+ return new Promise((resolve, reject) => {
594
+ const reader = new FileReader();
595
+ reader.onloadend = () => {
596
+ const dataUrl = reader.result;
597
+ // Strip the data URL prefix (e.g., "data:audio/wav;base64,")
598
+ const base64 = dataUrl.split(',')[1];
599
+ resolve(base64);
600
+ };
601
+ reader.onerror = reject;
602
+ reader.readAsDataURL(blob);
603
+ });
604
+ }
605
+ }
606
+
607
+ export default VoiceSession;