openvoiceui 1.0.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/.env.example +104 -0
- package/Dockerfile +30 -0
- package/LICENSE +21 -0
- package/README.md +638 -0
- package/SETUP.md +360 -0
- package/app.py +232 -0
- package/auto-approve-devices.js +111 -0
- package/cli/index.js +372 -0
- package/config/__init__.py +4 -0
- package/config/default.yaml +43 -0
- package/config/flags.yaml +67 -0
- package/config/loader.py +203 -0
- package/config/providers.yaml +71 -0
- package/config/speech_normalization.yaml +182 -0
- package/config/theme.json +4 -0
- package/data/greetings.json +25 -0
- package/default-pages/ai-image-creator.html +915 -0
- package/default-pages/bulk-image-uploader.html +492 -0
- package/default-pages/desktop.html +2865 -0
- package/default-pages/file-explorer.html +854 -0
- package/default-pages/interactive-map.html +655 -0
- package/default-pages/style-guide.html +1005 -0
- package/default-pages/website-setup.html +1623 -0
- package/deploy/openclaw/Dockerfile +46 -0
- package/deploy/openvoiceui.service +30 -0
- package/deploy/setup-nginx.sh +50 -0
- package/deploy/setup-sudo.sh +306 -0
- package/deploy/skill-runner/Dockerfile +19 -0
- package/deploy/skill-runner/requirements.txt +14 -0
- package/deploy/skill-runner/server.py +269 -0
- package/deploy/supertonic/Dockerfile +22 -0
- package/deploy/supertonic/server.py +79 -0
- package/docker-compose.pinokio.yml +11 -0
- package/docker-compose.yml +59 -0
- package/greetings.json +25 -0
- package/index.html +65 -0
- package/inject-device-identity.js +142 -0
- package/package.json +82 -0
- package/profiles/default.json +114 -0
- package/profiles/manager.py +354 -0
- package/profiles/schema.json +337 -0
- package/prompts/voice-system-prompt.md +149 -0
- package/providers/__init__.py +39 -0
- package/providers/base.py +63 -0
- package/providers/llm/__init__.py +12 -0
- package/providers/llm/base.py +71 -0
- package/providers/llm/clawdbot_provider.py +112 -0
- package/providers/llm/zai_provider.py +115 -0
- package/providers/registry.py +320 -0
- package/providers/stt/__init__.py +12 -0
- package/providers/stt/base.py +58 -0
- package/providers/stt/webspeech_provider.py +49 -0
- package/providers/stt/whisper_provider.py +100 -0
- package/providers/tts/__init__.py +20 -0
- package/providers/tts/base.py +91 -0
- package/providers/tts/groq_provider.py +74 -0
- package/providers/tts/supertonic_provider.py +72 -0
- package/requirements.txt +38 -0
- package/routes/__init__.py +10 -0
- package/routes/admin.py +515 -0
- package/routes/canvas.py +1315 -0
- package/routes/chat.py +51 -0
- package/routes/conversation.py +2158 -0
- package/routes/elevenlabs_hybrid.py +306 -0
- package/routes/greetings.py +98 -0
- package/routes/icons.py +279 -0
- package/routes/image_gen.py +364 -0
- package/routes/instructions.py +190 -0
- package/routes/music.py +838 -0
- package/routes/onboarding.py +43 -0
- package/routes/pi.py +62 -0
- package/routes/profiles.py +215 -0
- package/routes/report_issue.py +68 -0
- package/routes/static_files.py +533 -0
- package/routes/suno.py +664 -0
- package/routes/theme.py +81 -0
- package/routes/transcripts.py +199 -0
- package/routes/vision.py +348 -0
- package/routes/workspace.py +288 -0
- package/server.py +1510 -0
- package/services/__init__.py +1 -0
- package/services/auth.py +143 -0
- package/services/canvas_versioning.py +239 -0
- package/services/db_pool.py +107 -0
- package/services/gateway.py +16 -0
- package/services/gateway_manager.py +333 -0
- package/services/gateways/__init__.py +12 -0
- package/services/gateways/base.py +110 -0
- package/services/gateways/compat.py +264 -0
- package/services/gateways/openclaw.py +1134 -0
- package/services/health.py +100 -0
- package/services/memory_client.py +455 -0
- package/services/paths.py +26 -0
- package/services/speech_normalizer.py +285 -0
- package/services/tts.py +270 -0
- package/setup-config.js +262 -0
- package/sounds/air_horn.mp3 +0 -0
- package/sounds/bruh.mp3 +0 -0
- package/sounds/crowd_cheer.mp3 +0 -0
- package/sounds/gunshot.mp3 +0 -0
- package/sounds/impact.mp3 +0 -0
- package/sounds/lets_go.mp3 +0 -0
- package/sounds/record_stop.mp3 +0 -0
- package/sounds/rewind.mp3 +0 -0
- package/sounds/sad_trombone.mp3 +0 -0
- package/sounds/scratch_long.mp3 +0 -0
- package/sounds/yeah.mp3 +0 -0
- package/src/adapters/ClawdBotAdapter.js +264 -0
- package/src/adapters/_template.js +133 -0
- package/src/adapters/elevenlabs-classic.js +841 -0
- package/src/adapters/elevenlabs-hybrid.js +812 -0
- package/src/adapters/hume-evi.js +676 -0
- package/src/admin.html +1339 -0
- package/src/app.js +8802 -0
- package/src/core/Config.js +173 -0
- package/src/core/EmotionEngine.js +307 -0
- package/src/core/EventBridge.js +180 -0
- package/src/core/EventBus.js +117 -0
- package/src/core/VoiceSession.js +607 -0
- package/src/face/BaseFace.js +259 -0
- package/src/face/EyeFace.js +208 -0
- package/src/face/HaloSmokeFace.js +509 -0
- package/src/face/manifest.json +27 -0
- package/src/face/previews/eyes.svg +16 -0
- package/src/face/previews/orb.svg +29 -0
- package/src/features/MusicPlayer.js +620 -0
- package/src/features/Soundboard.js +128 -0
- package/src/providers/DeepgramSTT.js +472 -0
- package/src/providers/DeepgramStreamingSTT.js +766 -0
- package/src/providers/GroqSTT.js +559 -0
- package/src/providers/TTSPlayer.js +323 -0
- package/src/providers/WebSpeechSTT.js +479 -0
- package/src/providers/tts/BaseTTSProvider.js +81 -0
- package/src/providers/tts/HumeProvider.js +77 -0
- package/src/providers/tts/SupertonicProvider.js +174 -0
- package/src/providers/tts/index.js +140 -0
- package/src/shell/adapter-registry.js +154 -0
- package/src/shell/caller-bridge.js +35 -0
- package/src/shell/camera-bridge.js +28 -0
- package/src/shell/canvas-bridge.js +32 -0
- package/src/shell/commercial-bridge.js +44 -0
- package/src/shell/face-bridge.js +44 -0
- package/src/shell/music-bridge.js +60 -0
- package/src/shell/orchestrator.js +233 -0
- package/src/shell/profile-discovery.js +303 -0
- package/src/shell/sounds-bridge.js +28 -0
- package/src/shell/transcript-bridge.js +61 -0
- package/src/shell/waveform-bridge.js +33 -0
- package/src/styles/base.css +2862 -0
- package/src/styles/face.css +417 -0
- package/src/styles/pi-overrides.css +89 -0
- package/src/styles/theme-dark.css +67 -0
- package/src/test-tts.html +175 -0
- package/src/ui/AppShell.js +544 -0
- package/src/ui/ProfileSwitcher.js +228 -0
- package/src/ui/SessionControl.js +240 -0
- package/src/ui/face/FacePicker.js +195 -0
- package/src/ui/face/FaceRenderer.js +309 -0
- package/src/ui/settings/PlaylistEditor.js +366 -0
- package/src/ui/settings/SettingsPanel.css +684 -0
- package/src/ui/settings/SettingsPanel.js +419 -0
- package/src/ui/settings/TTSVoicePreview.js +210 -0
- package/src/ui/themes/ThemeManager.js +213 -0
- package/src/ui/visualizers/BaseVisualizer.js +29 -0
- package/src/ui/visualizers/PartyFXVisualizer.css +291 -0
- package/src/ui/visualizers/PartyFXVisualizer.js +637 -0
- package/static/emulators/jsdos/js-dos.css +1 -0
- package/static/emulators/jsdos/js-dos.js +22 -0
- package/static/favicon.svg +55 -0
- package/static/icons/apple-touch-icon.png +0 -0
- package/static/icons/favicon-32.png +0 -0
- package/static/icons/icon-192.png +0 -0
- package/static/icons/icon-512.png +0 -0
- package/static/install.html +449 -0
- package/static/manifest.json +26 -0
- package/static/sw.js +21 -0
- package/tts_providers/__init__.py +136 -0
- package/tts_providers/base_provider.py +319 -0
- package/tts_providers/groq_provider.py +155 -0
- package/tts_providers/hume_provider.py +226 -0
- package/tts_providers/providers_config.json +119 -0
- package/tts_providers/qwen3_provider.py +371 -0
- package/tts_providers/resemble_provider.py +315 -0
- package/tts_providers/supertonic_provider.py +557 -0
- package/tts_providers/supertonic_tts.py +399 -0
|
@@ -0,0 +1,607 @@
|
|
|
1
|
+
/**
 * VoiceSession — slim orchestrator (replaces monolithic ClawdBotMode)
 *
 * Wires together the extracted modules:
 *   - WebSpeechSTT  (P3-T3) — speech recognition (DeepgramStreamingSTT is used
 *                             instead when the server profile selects Deepgram)
 *   - TTSPlayer     (P3-T4) — audio playback + waveform
 *   - FaceManager   (P3-T5) — face mood / amplitude
 *   - MusicPlayer   (P3-T6) — music + ducking (auto via EventBus tts:start/tts:stop)
 *   - EmotionEngine (P3-T8) — emotion inference → face mood
 *   - EventBus      (P3-T1) — pub/sub glue
 *
 * VoiceSession is NOT responsible for UI updates (transcript panel, action console,
 * canvas/music commands from AI text). It emits events on eventBus and callers
 * subscribe. The one exception is the canvas/music command parser, which is
 * included here because it is pure logic with no DOM dependency.
 *
 * Usage:
 *   import { VoiceSession } from './core/VoiceSession.js';
 *
 *   const session = new VoiceSession({ serverUrl: 'https://your-server' });
 *   await session.start();
 *
 *   // Subscribe via EventBus:
 *   import { eventBus } from './core/EventBus.js';
 *   eventBus.on('session:message',   ({ role, text }) => { ... });
 *   eventBus.on('session:streaming', ({ text }) => { ... });
 *   eventBus.on('session:thinking',  () => { ... });
 *   eventBus.on('session:listening', () => { ... });
 *   eventBus.on('session:error',     ({ message }) => { ... });
 *   eventBus.on('session:tool',      ({ name }) => { ... });
 *   eventBus.on('tts:start',         () => { ... });
 *   eventBus.on('tts:stop',          () => { ... });
 *
 * EventBus events emitted (inbound — modules listen):
 *   'tts:start'   consumed by MusicPlayer.duck(true)
 *   'tts:stop'    consumed by MusicPlayer.duck(false)
 *
 * Other events emitted by VoiceSession:
 *   'session:start', 'session:stop', 'session:action', 'session:actions',
 *   'session:emotion', 'session:reset', and one 'cmd:*' event per command tag
 *   found in the AI text ('cmd:canvas_menu', 'cmd:canvas_page', 'cmd:music_play',
 *   'cmd:music_stop', 'cmd:music_next', 'cmd:suno_generate', 'cmd:spotify',
 *   'cmd:sound', 'cmd:register_face', 'cmd:sleep').
 *
 * ADR-009: simple manager pattern (no framework)
 */
|
|
40
|
+
|
|
41
|
+
import { eventBus } from './EventBus.js';
|
|
42
|
+
import { WebSpeechSTT, WakeWordDetector } from '../providers/WebSpeechSTT.js';
|
|
43
|
+
import { DeepgramStreamingSTT } from '../providers/DeepgramStreamingSTT.js';
|
|
44
|
+
import { TTSPlayer } from '../providers/TTSPlayer.js';
|
|
45
|
+
import { faceManager } from '../face/BaseFace.js';
|
|
46
|
+
import { emotionEngine } from './EmotionEngine.js';
|
|
47
|
+
|
|
48
|
+
/**
 * Build the speech-to-text engine for this session.
 *
 * The provider name is read from `window._serverProfile.stt.provider`; when
 * the profile (or any part of that path) is missing or empty, 'webspeech' is
 * assumed. Only the two Deepgram names select DeepgramStreamingSTT — every
 * other value (webspeech, groq, anything unknown) gets WebSpeechSTT.
 *
 * @returns {DeepgramStreamingSTT|WebSpeechSTT} a freshly constructed STT instance
 */
function _createSTT() {
    const configured = window._serverProfile?.stt?.provider || 'webspeech';
    const wantsDeepgram = ['deepgram-streaming', 'deepgram'].includes(configured);

    if (!wantsDeepgram) {
        // webspeech / groq / other — use WebSpeechSTT as before
        console.log('[VoiceSession] STT provider: Chrome Web Speech');
        return new WebSpeechSTT();
    }

    console.log('[VoiceSession] STT provider: Deepgram Streaming');
    return new DeepgramStreamingSTT();
}
|
|
62
|
+
|
|
63
|
+
export class VoiceSession {
    /**
     * @param {object} [opts]
     * @param {string} [opts.serverUrl=''] — base URL of the Flask server
     * @param {WakeWordDetector|null} [opts.wakeDetector] — shared wake detector (optional)
     * @param {MusicPlayer|null} [opts.musicPlayer] — shared music player (optional)
     */
    constructor({ serverUrl = '', wakeDetector = null, musicPlayer = null } = {}) {
        this.serverUrl = serverUrl;
        this.musicPlayer = musicPlayer;

        // Sub-modules
        this.stt = _createSTT();
        this.tts = new TTSPlayer();
        this.wakeDetector = wakeDetector;

        // Session state
        this.sessionId = null;
        this._ttsPlaying = false;
        this._sessionGreeted = false;
        this._pendingGreeting = null;
        this._lastResponse = null;
        this._restartWakeAfter = false;
        this._active = false;
        // Identifies the most recent start() call so an older, still-awaiting
        // start() can tell it has been overtaken by stop() or a newer start().
        this._startToken = null;
    }

    // ── Lifecycle ────────────────────────────────────────────────────────────

    /**
     * Initialize audio, wire up callbacks, send greeting, start STT.
     *
     * No-op when the session is already active. If stop() (or a newer start())
     * happens while this call is awaiting TTS init, the greeting, or STT
     * start-up, the call bails out instead of opening the microphone on a
     * session that is no longer current.
     *
     * @returns {Promise<void>}
     * @throws whatever `tts.init()` rejects with; in that case the session is
     *         rolled back to inactive so start() can be retried.
     */
    async start() {
        if (this._active) return;
        this._active = true;

        const token = Symbol('VoiceSession.start');
        this._startToken = token;
        const superseded = () => !this._active || this._startToken !== token;

        // Start emotion engine (wires session:message → faceManager.setMood)
        emotionEngine.start();

        // Init TTS audio context (requires user gesture — caller must ensure this)
        try {
            await this.tts.init();
        } catch (error) {
            // Undo the half-finished start so the session is not stuck "active".
            // Skip the rollback if stop() already ran (it did the same cleanup).
            if (!superseded()) {
                this._active = false;
                emotionEngine.stop();
            }
            throw error;
        }
        if (superseded()) return;

        // Wire TTS → face amplitude
        this.tts.onAmplitude = (value) => faceManager.setAmplitude(value);

        // Wire TTS speaking state → EventBus (MusicPlayer auto-ducks on these)
        this.tts.onSpeakingChange = (isSpeaking) => {
            this._ttsPlaying = isSpeaking;
            if (isSpeaking) {
                eventBus.emit('tts:start', {});
                // Mute STT immediately when TTS starts — clears any queued echo text
                // and blocks onresult until TTS finishes. PTT/text interrupts bypass this.
                if (this.stt.mute) this.stt.mute();
            } else {
                eventBus.emit('tts:stop', {});
                // After TTS ends, signal STT can resume
                this._resumeListening();
            }
        };

        // Wire STT results → sendMessage
        this.stt.onResult = (transcript) => {
            if (this._ttsPlaying) {
                console.log('[VoiceSession] Ignoring transcript during TTS:', transcript);
                return;
            }
            if (transcript && transcript.trim()) {
                this.sendMessage(transcript.trim());
            }
        };

        this.stt.onError = (error) => {
            console.error('[VoiceSession] STT error:', error);
            eventBus.emit('session:error', { message: `Microphone: ${error}` });
        };

        // Stop wake detector before starting STT (both use Web Speech API)
        if (this.wakeDetector?.isListening) {
            this.wakeDetector.stop();
            this._restartWakeAfter = true;
        }

        // Generate session ID
        this.sessionId = `session_${Date.now()}_${Math.random().toString(36).slice(2, 8)}`;
        console.log('[VoiceSession] Session started:', this.sessionId);

        // Send greeting first (awaits TTS playback before starting STT)
        if (!this._sessionGreeted) {
            this._sessionGreeted = true;
            await this._sendGreeting();
            // The user may have stopped the session while the greeting played.
            if (superseded()) return;
        }

        // Start listening
        const started = await this.stt.start();
        if (superseded()) {
            // Stopped while the microphone was starting: make sure it is off.
            // (If a newer start() took over instead, leave its STT alone.)
            if (!this._active) this.stt.stop();
            return;
        }

        if (started) {
            console.log('[VoiceSession] Listening started');
            eventBus.emit('session:listening', {});
        } else {
            eventBus.emit('session:error', { message: 'Failed to start microphone' });
        }

        eventBus.emit('session:start', { sessionId: this.sessionId });
    }

    /**
     * Stop STT, stop TTS, restore wake detector.
     * No-op when the session is not active. Emits 'session:stop'.
     */
    stop() {
        if (!this._active) return;
        this._active = false;

        this.stt.stop();
        this.tts.stop();
        // Do not carry a stale "speaking" flag into the next session, where it
        // would cause onResult to discard transcripts.
        this._ttsPlaying = false;
        emotionEngine.stop();

        this._sessionGreeted = false;
        this._pendingGreeting = null;

        // Restore wake detector
        if (this._restartWakeAfter && this.wakeDetector?.isSupported()) {
            this.wakeDetector.start();
            this._restartWakeAfter = false;
        }

        console.log('[VoiceSession] Session stopped');
        eventBus.emit('session:stop', {});
    }

    /**
     * Destroy resources (TTSPlayer AudioContext).
     */
    destroy() {
        this.stop();
        this.tts.destroy();
    }

    // ── Message sending ──────────────────────────────────────────────────────

    /**
     * Send a user message to the server, stream the response.
     *
     * Emits 'session:message' (role 'user') and 'session:thinking' before the
     * request. Failures are reported via 'session:error' rather than thrown.
     * Blank or missing text is ignored.
     *
     * @param {string} text
     * @returns {Promise<void>}
     */
    async sendMessage(text) {
        const userText = text?.trim();
        if (!userText) return;

        // Prepend pending greeting context if first user reply
        let messageToSend = userText;
        if (this._pendingGreeting) {
            messageToSend = `[You just greeted with: "${this._pendingGreeting}"] User replied: ${messageToSend}`;
            this._pendingGreeting = null;
        }

        eventBus.emit('session:message', { role: 'user', text: userText });
        eventBus.emit('session:thinking', {});
        faceManager.setMood('thinking');

        const provider = localStorage.getItem('voice_provider') || 'groq';
        const voice = localStorage.getItem('voice_voice') || 'M1';

        try {
            const response = await fetch(`${this.serverUrl}/api/conversation?stream=1`, {
                method: 'POST',
                headers: { 'Content-Type': 'application/json' },
                body: JSON.stringify({
                    message: messageToSend,
                    tts_provider: provider,
                    voice: voice,
                    session_id: this.sessionId,
                    ui_context: this._getUIContext()
                })
            });

            if (!response.ok) {
                throw new Error(`API error: ${response.status}`);
            }

            await this._processStream(response.body);

        } catch (error) {
            console.error('[VoiceSession] sendMessage error:', error);
            faceManager.setMood('sad');
            eventBus.emit('session:error', { message: error.message });
            setTimeout(() => faceManager.setMood('neutral'), 2000);
        } finally {
            // Safety net: if no TTS played, re-enable STT after a delay
            setTimeout(() => {
                if (!this._ttsPlaying && this._active && !this.stt.isListening) {
                    console.log('[VoiceSession] Safety net: restarting STT');
                    if (this.stt.resetProcessing) this.stt.resetProcessing();
                    this.stt.start();
                    eventBus.emit('session:listening', {});
                }
            }, 2000);
        }
    }

    // ── Stream processing ────────────────────────────────────────────────────

    /**
     * Read and process NDJSON stream from /api/conversation.
     *
     * Records are split on '\n'. A final record that is not newline-terminated
     * is still processed once the stream ends. Reading stops early when a
     * 'text_done' record repeats the previous response.
     *
     * @param {ReadableStream} body
     * @returns {Promise<void>}
     */
    async _processStream(body) {
        const reader = body.getReader();
        const decoder = new TextDecoder();
        const state = {
            streamingText: '',
            firstDelta: false,
            processedCmds: new Set()
        };
        let buffer = '';

        try {
            while (true) {
                const { done, value } = await reader.read();
                // At end of stream, decode() with no input flushes any bytes the
                // decoder was holding back from a split multi-byte character.
                buffer += done ? decoder.decode() : decoder.decode(value, { stream: true });

                let newlineIdx;
                while ((newlineIdx = buffer.indexOf('\n')) !== -1) {
                    const line = buffer.slice(0, newlineIdx);
                    buffer = buffer.slice(newlineIdx + 1);
                    if (this._handleStreamLine(line, state)) return;
                }

                if (done) {
                    // Whatever is left is a last record without a trailing newline.
                    this._handleStreamLine(buffer, state);
                    break;
                }
            }
        } finally {
            // cancel() reports failure by rejecting, so a plain try/catch is not
            // enough; there is nothing useful to do with a failed cancel.
            try {
                reader.cancel().catch(() => {});
            } catch (_) {
                // reader implementation threw synchronously — equally ignorable
            }
        }
    }

    /**
     * Parse one NDJSON line and dispatch it by its `type` field.
     *
     * @param {string} rawLine — one line of the stream (may be blank)
     * @param {{streamingText: string, firstDelta: boolean, processedCmds: Set<string>}} state
     *        per-response accumulator shared across lines
     * @returns {boolean} true when the caller should stop reading the stream
     */
    _handleStreamLine(rawLine, state) {
        const line = rawLine.trim();
        if (!line) return false;

        let data;
        try {
            data = JSON.parse(line);
        } catch (_) {
            console.warn('[VoiceSession] Failed to parse stream line:', line);
            return false;
        }
        // Valid JSON that is not an object (null, a number, ...) has no `type`.
        if (data === null || typeof data !== 'object') {
            console.warn('[VoiceSession] Failed to parse stream line:', line);
            return false;
        }

        switch (data.type) {
            case 'delta': {
                // A delta without text must not append the string "undefined".
                state.streamingText += data.text ?? '';
                const payload = { text: this._stripCmdTags(state.streamingText) };
                if (!state.firstDelta) {
                    state.firstDelta = true;
                    faceManager.setMood('neutral');
                    payload.started = true;
                }
                eventBus.emit('session:streaming', payload);
                this._checkCmdsInStream(state.streamingText, state.processedCmds);
                return false;
            }

            case 'action':
                eventBus.emit('session:action', { action: data.action });
                if (data.action?.type === 'tool' && data.action?.phase === 'start') {
                    eventBus.emit('session:tool', { name: data.action.name });
                }
                return false;

            case 'text_done':
                return this._handleTextDone(data, state);

            case 'audio':
                if (data.audio) {
                    console.log(`[VoiceSession] TTS ready (tts:${data.timing?.tts_ms}ms)`);
                    // Mute STT before queuing audio — onSpeakingChange(true) will
                    // also mute, but muting here ensures no echo from audio buffering lag
                    if (this.stt.mute) this.stt.mute();
                    this.tts.queue(data.audio);
                } else {
                    console.warn('[VoiceSession] Audio event had no audio data');
                }
                return false;

            case 'session_reset':
                console.warn('[VoiceSession] Server session reset:', data.old, '→', data.new);
                eventBus.emit('session:reset', { old: data.old, new: data.new, reason: data.reason });
                return false;

            case 'error':
                console.error('[VoiceSession] Stream error:', data.error);
                eventBus.emit('session:error', { message: data.error });
                return false;

            default:
                // Unknown record types are ignored.
                return false;
        }
    }

    /**
     * Handle the 'text_done' record: publish the final assistant message and
     * run the final command pass.
     *
     * @param {object} data — parsed 'text_done' record
     * @param {{streamingText: string, processedCmds: Set<string>}} state
     * @returns {boolean} true when the response duplicates the previous one and
     *          the rest of the stream should be skipped
     */
    _handleTextDone(data, state) {
        const fullText = data.response || state.streamingText;
        const cleanText = this._stripReasoningTokens(fullText);
        const displayText = this._stripCmdTags(cleanText);

        if (displayText === this._lastResponse) {
            console.log('[VoiceSession] Skipping duplicate response');
            return true;
        }
        this._lastResponse = displayText;

        // Forward server-provided emotion state to EmotionEngine (ADR-004)
        if (data.emotion_state) {
            eventBus.emit('session:emotion', data.emotion_state);
        }

        eventBus.emit('session:message', { role: 'assistant', text: displayText });
        this._handleCmds(cleanText, state.processedCmds);

        if (data.actions) {
            eventBus.emit('session:actions', { actions: data.actions });
        }
        return false;
    }

    // ── Greeting ─────────────────────────────────────────────────────────────

    /**
     * Play a random greeting via TTS before starting STT.
     *
     * The greeting is published as an assistant 'session:message' and stored in
     * `_pendingGreeting` so the first user reply can carry it as context.
     * Resolves when TTS finishes; TTS failures are logged and never reject.
     *
     * @returns {Promise<void>}
     */
    async _sendGreeting() {
        const greetings = [
            "Hey there! What can I help you with?",
            "Ready when you are. What's up?",
            "Voice assistant online. What do you need?",
            "I'm listening. Go ahead.",
            "Hello! What would you like to do?",
            "Standing by. What can I do for you?",
            "At your service. What's on your mind?",
            "Hey! What are we working on?",
            "Online and ready. Fire away.",
            "What's up? I'm all ears."
        ];

        const greeting = greetings[Math.floor(Math.random() * greetings.length)];
        this._pendingGreeting = greeting;

        eventBus.emit('session:message', { role: 'assistant', text: greeting });

        try {
            const provider = localStorage.getItem('voice_provider') || 'groq';
            const voice = localStorage.getItem('voice_voice') || 'M1';

            const response = await fetch(`${this.serverUrl}/api/tts/generate`, {
                method: 'POST',
                headers: { 'Content-Type': 'application/json' },
                body: JSON.stringify({ text: greeting, provider, voice })
            });

            if (!response.ok) {
                // The greeting text was already shown; only the audio is skipped.
                console.warn('[VoiceSession] Greeting TTS request failed:', response.status);
                return;
            }

            const blob = await response.blob();
            const base64 = await this._blobToBase64(blob);
            await this.tts.play(base64);
        } catch (error) {
            console.error('[VoiceSession] Greeting TTS error:', error);
        }
    }

    // ── Command parsing ──────────────────────────────────────────────────────

    /**
     * Strip [CANVAS:...], [MUSIC_PLAY], etc. tags from display text.
     * Fenced code blocks are removed too, including a fence that has been
     * opened but not yet closed (as happens mid-stream).
     *
     * @param {string} text
     * @returns {string} trimmed text without tags; '' for empty/missing input
     */
    _stripCmdTags(text) {
        if (!text) return '';
        return text
            .replace(/```html[\s\S]*?```/gi, '')
            .replace(/```[\s\S]*?```/g, '')
            .replace(/```html[\s\S]*/gi, '')
            .replace(/```[\s\S]*/g, '')
            .replace(/\[CANVAS_MENU\]/gi, '')
            .replace(/\[CANVAS:[^\]]*\]/gi, '')
            .replace(/\[MUSIC_PLAY(?::[^\]]*)?\]/gi, '')
            .replace(/\[MUSIC_STOP\]/gi, '')
            .replace(/\[MUSIC_NEXT\]/gi, '')
            .replace(/\[SESSION_RESET\]/gi, '')
            .replace(/\[SUNO_GENERATE:[^\]]*\]/gi, '')
            .replace(/\[SPOTIFY:[^\]]*\]/gi, '')
            .replace(/\[SLEEP\]/gi, '')
            .replace(/\[REGISTER_FACE:[^\]]*\]/gi, '')
            .replace(/\[SOUND:[^\]]*\]/gi, '')
            .trim();
    }

    /**
     * Check for commands while stream is in progress — emit events, don't block.
     * Callers (UI) handle actual DOM/control actions.
     *
     * Each command kind fires at most once per response: `seen` records the
     * kinds already handled, so re-scanning the growing text does not repeat
     * them. Music commands are also forwarded to `this.musicPlayer` when set.
     *
     * @param {string} text — accumulated response text so far
     * @param {Set<string>} seen — command kinds already emitted (mutated)
     */
    _checkCmdsInStream(text, seen) {
        if (!text) return;

        if (/\[CANVAS_MENU\]/i.test(text) && !seen.has('CANVAS_MENU')) {
            seen.add('CANVAS_MENU');
            eventBus.emit('cmd:canvas_menu', {});
        }

        const canvasMatch = text.match(/\[CANVAS:([^\]]+)\]/i);
        if (canvasMatch && !seen.has('CANVAS_PAGE')) {
            seen.add('CANVAS_PAGE');
            eventBus.emit('cmd:canvas_page', { page: canvasMatch[1].trim() });
        }

        const musicPlay = text.match(/\[MUSIC_PLAY(?::([^\]]+))?\]/i);
        if (musicPlay && !seen.has('MUSIC_PLAY')) {
            seen.add('MUSIC_PLAY');
            const track = musicPlay[1]?.trim() || null;
            eventBus.emit('cmd:music_play', { track });
            if (this.musicPlayer) {
                if (track) {
                    this.musicPlayer.play(track);
                } else {
                    this.musicPlayer.play();
                }
            }
        }

        if (/\[MUSIC_STOP\]/i.test(text) && !seen.has('MUSIC_STOP')) {
            seen.add('MUSIC_STOP');
            eventBus.emit('cmd:music_stop', {});
            if (this.musicPlayer) this.musicPlayer.stop();
        }

        if (/\[MUSIC_NEXT\]/i.test(text) && !seen.has('MUSIC_NEXT')) {
            seen.add('MUSIC_NEXT');
            eventBus.emit('cmd:music_next', {});
            if (this.musicPlayer) this.musicPlayer.next();
        }

        const sunoMatch = text.match(/\[SUNO_GENERATE:([^\]]+)\]/i);
        if (sunoMatch && !seen.has('SUNO_GENERATE')) {
            seen.add('SUNO_GENERATE');
            eventBus.emit('cmd:suno_generate', { prompt: sunoMatch[1].trim() });
        }

        // [SPOTIFY:track] or [SPOTIFY:track|artist]
        const spotifyMatch = text.match(/\[SPOTIFY:([^|\]]+)(?:\|([^\]]+))?\]/i);
        if (spotifyMatch && !seen.has('SPOTIFY')) {
            seen.add('SPOTIFY');
            eventBus.emit('cmd:spotify', { track: spotifyMatch[1].trim(), artist: spotifyMatch[2]?.trim() || '' });
        }

        const soundMatch = text.match(/\[SOUND:([^\]]+)\]/i);
        if (soundMatch && !seen.has('SOUND')) {
            seen.add('SOUND');
            eventBus.emit('cmd:sound', { sound: soundMatch[1].trim() });
        }

        const faceMatch = text.match(/\[REGISTER_FACE:([^\]]+)\]/i);
        if (faceMatch && !seen.has('REGISTER_FACE')) {
            seen.add('REGISTER_FACE');
            eventBus.emit('cmd:register_face', { name: faceMatch[1].trim() });
        }

        if (/\[SLEEP\]/i.test(text) && !seen.has('SLEEP')) {
            seen.add('SLEEP');
            eventBus.emit('cmd:sleep', {});
        }
    }

    /**
     * Final command pass after text_done (catches anything missed during stream).
     *
     * @param {string} text — final response text (command tags still present)
     * @param {Set<string>} seen — command kinds already emitted (mutated)
     */
    _handleCmds(text, seen) {
        this._checkCmdsInStream(text, seen);
        // AI music trigger scanning (phrase-based, not tag-based)
        if (this.musicPlayer?.checkTriggers) {
            this.musicPlayer.checkTriggers(text);
        }
    }

    // ── Helpers ──────────────────────────────────────────────────────────────

    /**
     * Resume STT after TTS playback ends.
     * Called from tts.onSpeakingChange(false).
     */
    _resumeListening() {
        if (!this._active) return;
        // 250ms settling delay: lets audio tail-off clear the mic before re-enabling
        // STT so the last fragment of TTS audio isn't captured as user speech.
        // (Reduced from 600ms — DeepgramStreamingSTT mutes its audio pipeline during
        // TTS so echo is already suppressed; 250ms is enough for speaker decay.)
        setTimeout(() => {
            if (!this._active) return;
            // resume() clears the mute flag AND restarts the engine — the engine
            // may have stopped during TTS because onend no longer auto-restarts
            // while muted. resetProcessing() alone won't restart a dead engine.
            if (this.stt.resume) {
                this.stt.resume();
            } else if (this.stt.resetProcessing) {
                this.stt.resetProcessing();
            }
            eventBus.emit('session:listening', {});
        }, 250);
    }

    /**
     * Strip LLM reasoning tokens (e.g., chain-of-thought preamble).
     *
     * Each pattern removes everything from the start of the text up to the
     * first period following its marker phrase, and the patterns are applied
     * one after another.
     * NOTE(review): the marker may occur anywhere in the text, not only at the
     * start, so a genuine answer containing e.g. "The user" loses its opening
     * sentences too — confirm this is acceptable.
     *
     * @param {string} text
     * @returns {string} trimmed text; falsy input is returned unchanged
     */
    _stripReasoningTokens(text) {
        if (!text) return text;
        const patterns = [
            /^.*?I should.*?\./s,
            /^.*?NO_REPLY.*?\./s,
            /^.*?The user.*?\./s,
            /^.*?They say.*?\./s
        ];
        let cleaned = text;
        for (const p of patterns) {
            cleaned = cleaned.replace(p, '');
        }
        return cleaned.trim();
    }

    /**
     * Gather UI context for the conversation API.
     *
     * @returns {{canvasDisplayed: *, canvasVisible: boolean, musicPlaying: *,
     *            musicTrack: *, musicPanelOpen: boolean}}
     *          `canvasVisible` is always reported as false here.
     */
    _getUIContext() {
        const ctx = {
            canvasDisplayed: window.canvasContext?.current_page ?? null,
            canvasVisible: false,
            musicPlaying: this.musicPlayer?.isPlaying ?? false,
            musicTrack: this.musicPlayer?.currentMetadata?.title ?? null,
            musicPanelOpen: this.musicPlayer ? this.musicPlayer.panelState !== 'closed' : false
        };
        return ctx;
    }

    /**
     * Convert a Blob to a base64 string.
     *
     * @param {Blob} blob
     * @returns {Promise<string>} the base64 payload without the data-URL prefix;
     *          rejects with the FileReader's error if the read fails
     */
    _blobToBase64(blob) {
        return new Promise((resolve, reject) => {
            const reader = new FileReader();
            // 'load' fires only on success; 'loadend' also fires after a failed
            // read, when `result` is null and cannot be split.
            reader.onload = () => {
                const dataUrl = reader.result;
                // Strip the data URL prefix (e.g., "data:audio/wav;base64,")
                const base64 = dataUrl.split(',')[1];
                resolve(base64);
            };
            reader.onerror = () => reject(reader.error);
            reader.readAsDataURL(blob);
        });
    }
}
|
|
606
|
+
|
|
607
|
+
export default VoiceSession;
|