openvoiceui 1.0.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/.env.example +104 -0
- package/Dockerfile +30 -0
- package/LICENSE +21 -0
- package/README.md +638 -0
- package/SETUP.md +360 -0
- package/app.py +232 -0
- package/auto-approve-devices.js +111 -0
- package/cli/index.js +372 -0
- package/config/__init__.py +4 -0
- package/config/default.yaml +43 -0
- package/config/flags.yaml +67 -0
- package/config/loader.py +203 -0
- package/config/providers.yaml +71 -0
- package/config/speech_normalization.yaml +182 -0
- package/config/theme.json +4 -0
- package/data/greetings.json +25 -0
- package/default-pages/ai-image-creator.html +915 -0
- package/default-pages/bulk-image-uploader.html +492 -0
- package/default-pages/desktop.html +2865 -0
- package/default-pages/file-explorer.html +854 -0
- package/default-pages/interactive-map.html +655 -0
- package/default-pages/style-guide.html +1005 -0
- package/default-pages/website-setup.html +1623 -0
- package/deploy/openclaw/Dockerfile +46 -0
- package/deploy/openvoiceui.service +30 -0
- package/deploy/setup-nginx.sh +50 -0
- package/deploy/setup-sudo.sh +306 -0
- package/deploy/skill-runner/Dockerfile +19 -0
- package/deploy/skill-runner/requirements.txt +14 -0
- package/deploy/skill-runner/server.py +269 -0
- package/deploy/supertonic/Dockerfile +22 -0
- package/deploy/supertonic/server.py +79 -0
- package/docker-compose.pinokio.yml +11 -0
- package/docker-compose.yml +59 -0
- package/greetings.json +25 -0
- package/index.html +65 -0
- package/inject-device-identity.js +142 -0
- package/package.json +82 -0
- package/profiles/default.json +114 -0
- package/profiles/manager.py +354 -0
- package/profiles/schema.json +337 -0
- package/prompts/voice-system-prompt.md +149 -0
- package/providers/__init__.py +39 -0
- package/providers/base.py +63 -0
- package/providers/llm/__init__.py +12 -0
- package/providers/llm/base.py +71 -0
- package/providers/llm/clawdbot_provider.py +112 -0
- package/providers/llm/zai_provider.py +115 -0
- package/providers/registry.py +320 -0
- package/providers/stt/__init__.py +12 -0
- package/providers/stt/base.py +58 -0
- package/providers/stt/webspeech_provider.py +49 -0
- package/providers/stt/whisper_provider.py +100 -0
- package/providers/tts/__init__.py +20 -0
- package/providers/tts/base.py +91 -0
- package/providers/tts/groq_provider.py +74 -0
- package/providers/tts/supertonic_provider.py +72 -0
- package/requirements.txt +38 -0
- package/routes/__init__.py +10 -0
- package/routes/admin.py +515 -0
- package/routes/canvas.py +1315 -0
- package/routes/chat.py +51 -0
- package/routes/conversation.py +2158 -0
- package/routes/elevenlabs_hybrid.py +306 -0
- package/routes/greetings.py +98 -0
- package/routes/icons.py +279 -0
- package/routes/image_gen.py +364 -0
- package/routes/instructions.py +190 -0
- package/routes/music.py +838 -0
- package/routes/onboarding.py +43 -0
- package/routes/pi.py +62 -0
- package/routes/profiles.py +215 -0
- package/routes/report_issue.py +68 -0
- package/routes/static_files.py +533 -0
- package/routes/suno.py +664 -0
- package/routes/theme.py +81 -0
- package/routes/transcripts.py +199 -0
- package/routes/vision.py +348 -0
- package/routes/workspace.py +288 -0
- package/server.py +1510 -0
- package/services/__init__.py +1 -0
- package/services/auth.py +143 -0
- package/services/canvas_versioning.py +239 -0
- package/services/db_pool.py +107 -0
- package/services/gateway.py +16 -0
- package/services/gateway_manager.py +333 -0
- package/services/gateways/__init__.py +12 -0
- package/services/gateways/base.py +110 -0
- package/services/gateways/compat.py +264 -0
- package/services/gateways/openclaw.py +1134 -0
- package/services/health.py +100 -0
- package/services/memory_client.py +455 -0
- package/services/paths.py +26 -0
- package/services/speech_normalizer.py +285 -0
- package/services/tts.py +270 -0
- package/setup-config.js +262 -0
- package/sounds/air_horn.mp3 +0 -0
- package/sounds/bruh.mp3 +0 -0
- package/sounds/crowd_cheer.mp3 +0 -0
- package/sounds/gunshot.mp3 +0 -0
- package/sounds/impact.mp3 +0 -0
- package/sounds/lets_go.mp3 +0 -0
- package/sounds/record_stop.mp3 +0 -0
- package/sounds/rewind.mp3 +0 -0
- package/sounds/sad_trombone.mp3 +0 -0
- package/sounds/scratch_long.mp3 +0 -0
- package/sounds/yeah.mp3 +0 -0
- package/src/adapters/ClawdBotAdapter.js +264 -0
- package/src/adapters/_template.js +133 -0
- package/src/adapters/elevenlabs-classic.js +841 -0
- package/src/adapters/elevenlabs-hybrid.js +812 -0
- package/src/adapters/hume-evi.js +676 -0
- package/src/admin.html +1339 -0
- package/src/app.js +8802 -0
- package/src/core/Config.js +173 -0
- package/src/core/EmotionEngine.js +307 -0
- package/src/core/EventBridge.js +180 -0
- package/src/core/EventBus.js +117 -0
- package/src/core/VoiceSession.js +607 -0
- package/src/face/BaseFace.js +259 -0
- package/src/face/EyeFace.js +208 -0
- package/src/face/HaloSmokeFace.js +509 -0
- package/src/face/manifest.json +27 -0
- package/src/face/previews/eyes.svg +16 -0
- package/src/face/previews/orb.svg +29 -0
- package/src/features/MusicPlayer.js +620 -0
- package/src/features/Soundboard.js +128 -0
- package/src/providers/DeepgramSTT.js +472 -0
- package/src/providers/DeepgramStreamingSTT.js +766 -0
- package/src/providers/GroqSTT.js +559 -0
- package/src/providers/TTSPlayer.js +323 -0
- package/src/providers/WebSpeechSTT.js +479 -0
- package/src/providers/tts/BaseTTSProvider.js +81 -0
- package/src/providers/tts/HumeProvider.js +77 -0
- package/src/providers/tts/SupertonicProvider.js +174 -0
- package/src/providers/tts/index.js +140 -0
- package/src/shell/adapter-registry.js +154 -0
- package/src/shell/caller-bridge.js +35 -0
- package/src/shell/camera-bridge.js +28 -0
- package/src/shell/canvas-bridge.js +32 -0
- package/src/shell/commercial-bridge.js +44 -0
- package/src/shell/face-bridge.js +44 -0
- package/src/shell/music-bridge.js +60 -0
- package/src/shell/orchestrator.js +233 -0
- package/src/shell/profile-discovery.js +303 -0
- package/src/shell/sounds-bridge.js +28 -0
- package/src/shell/transcript-bridge.js +61 -0
- package/src/shell/waveform-bridge.js +33 -0
- package/src/styles/base.css +2862 -0
- package/src/styles/face.css +417 -0
- package/src/styles/pi-overrides.css +89 -0
- package/src/styles/theme-dark.css +67 -0
- package/src/test-tts.html +175 -0
- package/src/ui/AppShell.js +544 -0
- package/src/ui/ProfileSwitcher.js +228 -0
- package/src/ui/SessionControl.js +240 -0
- package/src/ui/face/FacePicker.js +195 -0
- package/src/ui/face/FaceRenderer.js +309 -0
- package/src/ui/settings/PlaylistEditor.js +366 -0
- package/src/ui/settings/SettingsPanel.css +684 -0
- package/src/ui/settings/SettingsPanel.js +419 -0
- package/src/ui/settings/TTSVoicePreview.js +210 -0
- package/src/ui/themes/ThemeManager.js +213 -0
- package/src/ui/visualizers/BaseVisualizer.js +29 -0
- package/src/ui/visualizers/PartyFXVisualizer.css +291 -0
- package/src/ui/visualizers/PartyFXVisualizer.js +637 -0
- package/static/emulators/jsdos/js-dos.css +1 -0
- package/static/emulators/jsdos/js-dos.js +22 -0
- package/static/favicon.svg +55 -0
- package/static/icons/apple-touch-icon.png +0 -0
- package/static/icons/favicon-32.png +0 -0
- package/static/icons/icon-192.png +0 -0
- package/static/icons/icon-512.png +0 -0
- package/static/install.html +449 -0
- package/static/manifest.json +26 -0
- package/static/sw.js +21 -0
- package/tts_providers/__init__.py +136 -0
- package/tts_providers/base_provider.py +319 -0
- package/tts_providers/groq_provider.py +155 -0
- package/tts_providers/hume_provider.py +226 -0
- package/tts_providers/providers_config.json +119 -0
- package/tts_providers/qwen3_provider.py +371 -0
- package/tts_providers/resemble_provider.py +315 -0
- package/tts_providers/supertonic_provider.py +557 -0
- package/tts_providers/supertonic_tts.py +399 -0
|
@@ -0,0 +1,559 @@
|
|
|
1
|
+
/**
|
|
2
|
+
* GroqSTT — Server-side speech recognition via Groq Whisper API.
|
|
3
|
+
* Captures audio with MediaRecorder, uses VAD to detect speech/silence,
|
|
4
|
+
* sends audio chunks to /api/stt/groq for transcription.
|
|
5
|
+
*
|
|
6
|
+
* Drop-in replacement for WebSpeechSTT with built-in PTT support.
|
|
7
|
+
*
|
|
8
|
+
* Usage:
|
|
9
|
+
* import { GroqSTT, GroqWakeWordDetector } from './GroqSTT.js';
|
|
10
|
+
*
|
|
11
|
+
* const stt = new GroqSTT();
|
|
12
|
+
* stt.onResult = (text) => console.log('Heard:', text);
|
|
13
|
+
* await stt.start();
|
|
14
|
+
*/
|
|
15
|
+
|
|
16
|
+
// ===== GROQ STT =====
|
|
17
|
+
// Server-side speech recognition via Groq Whisper API
|
|
18
|
+
class GroqSTT {
    /**
     * @param {Object} [config]
     * @param {string} [config.serverUrl] - Base URL of the server exposing
     *   /api/stt/groq. Falls back to window.AGENT_CONFIG.serverUrl, then to
     *   window.location.origin. One trailing slash is stripped.
     * @param {number} [config.accumulationDelayMs=0] - Listen mode only: how long
     *   to wait for further chunks before delivering the merged text to onResult.
     */
    constructor(config = {}) {
        this.serverUrl = (config.serverUrl || window.AGENT_CONFIG?.serverUrl || window.location.origin).replace(/\/$/, '');
        this.isListening = false;
        this.onResult = null;       // (text) => void — transcript ready to act on
        this.onError = null;        // ('no-device' | 'not-allowed' | Error) => void
        this.onListenFinal = null;  // Listen panel hook — called with each transcript
        this.onInterim = null;      // Not used (Groq has no interim results)
        this.mediaRecorder = null;
        this.audioChunks = [];
        this.stream = null;
        this.isProcessing = false;
        this.accumulatedText = '';  // PTT compatibility — last transcript

        // PTT support (built-in, no monkey-patching needed)
        this._micMuted = false;
        this._pttHolding = false;
        this._muteActive = false;   // Set by mute(), cleared by resume() — survives API call finally blocks

        // VAD (Voice Activity Detection) settings
        this.silenceTimer = null;
        this.silenceDelayMs = 800;      // 0.8s silence = end of speech (profile can override)
        this.accumulationDelayMs = config.accumulationDelayMs || 0; // No accumulation delay — send immediately (profile can override)
        this.vadThreshold = 25;         // FFT average amplitude threshold (profile can override)
        this.minSpeechMs = 300;         // Must sustain above threshold for this long before counting as speech
        this.maxRecordingMs = 45000;    // 45s max before auto-chunk (profile can override)
        this.maxRecordingTimer = null;
        this.isSpeaking = false;
        this.stoppingRecorder = false;
        this.hadSpeechInChunk = false;
        this._speechStartTime = 0;      // When sustained speech started
        this._resumedSpeechStart = 0;   // When resumed speech started (for clearing silence timer)

        // Audio analysis for VAD
        this._audioCtx = null;
        this._analyser = null;
        this._vadAnimFrame = null;
        this._accumulationTimer = null; // Accumulate transcripts across chunks before sending
    }

    /** @returns {boolean} true when the browser exposes getUserMedia. */
    isSupported() {
        return !!(navigator.mediaDevices && navigator.mediaDevices.getUserMedia);
    }

    /**
     * Acquire the microphone (reusing an active stream if one exists), start the
     * recorder and the VAD loop.
     *
     * @returns {Promise<boolean>} true when listening (or already listening);
     *   false when the mic is PTT-muted or startup failed. On failure onError is
     *   called with 'no-device', 'not-allowed', or the raw error.
     */
    async start() {
        if (this.isListening) return true;
        if (this._micMuted) return false;

        // Tracks whether THIS call opened the mic, so a failure further down
        // can release it instead of leaving the device open.
        let acquiredStream = false;

        try {
            // Get mic stream (reuse existing if available)
            if (!this.stream || !this.stream.active) {
                this.stream = await navigator.mediaDevices.getUserMedia({ audio: true });
                acquiredStream = true;
            }

            this._setupRecorder();
            this._startVAD();

            this.mediaRecorder.start();
            this.isListening = true;
            console.log('Groq STT started');
            return true;
        } catch (error) {
            console.error('Failed to start Groq STT:', error);

            if (acquiredStream) {
                // MediaRecorder construction/start can throw after the mic was
                // granted; release everything tied to the stream we just opened.
                this._stopVADLoop();
                this._releaseStream();
                this._closeAudioContext();
            }

            if (error.name === 'NotFoundError' || error.name === 'DevicesNotFoundError') {
                if (this.onError) this.onError('no-device');
            } else if (error.name === 'NotAllowedError') {
                if (this.onError) this.onError('not-allowed');
            } else {
                if (this.onError) this.onError(error);
            }
            return false;
        }
    }

    /**
     * Create a MediaRecorder on the current stream and wire its data/stop
     * handlers. Resets the pending chunk list.
     */
    _setupRecorder() {
        const options = { mimeType: 'audio/webm;codecs=opus' };
        this.mediaRecorder = new MediaRecorder(this.stream, options);
        this.audioChunks = [];

        this.mediaRecorder.ondataavailable = (event) => {
            if (event.data.size > 0) {
                this.audioChunks.push(event.data);
            }
        };

        this.mediaRecorder.onstop = () => this._handleRecorderStop();
    }

    /**
     * Recorder "stop" handler: restarts recording when in continuous listen mode,
     * then (unless muted/busy/silent) uploads the finished chunk for transcription.
     *
     * @returns {Promise<void>}
     */
    async _handleRecorderStop() {
        // Snapshot and clear chunks immediately
        const chunks = this.audioChunks;
        const hadSpeech = this.hadSpeechInChunk;
        this.audioChunks = [];
        this.hadSpeechInChunk = false;
        this.stoppingRecorder = false;

        // Restart recording IMMEDIATELY to minimize the gap where audio is lost.
        // The API call below runs in parallel with the new recording.
        if (this.isListening && !this._micMuted && !this._muteActive && !this._pttHolding) {
            this.isSpeaking = false;
            this.mediaRecorder.start();
        }

        if (chunks.length === 0) return;

        // If muted (TTS playing) or a previous chunk is still in flight, discard audio
        if ((this.isProcessing || this._muteActive) && !this._pttHolding) {
            return;
        }

        this.isProcessing = true;
        this._clearSilenceTimer();
        this._clearMaxRecordingTimer();

        const audioBlob = new Blob(chunks, { type: 'audio/webm' });

        // Skip if no VAD speech detected AND audio is small — prevents Whisper
        // hallucinations on silence. But if VAD missed speech (quiet mic), still
        // send larger chunks and let Whisper decide.
        if (!hadSpeech && audioBlob.size < 50000) {
            console.log(`Groq STT: skipping - no speech detected (${audioBlob.size} bytes)`);
            this.isProcessing = false;
            return;
        }

        try {
            console.log(`Groq STT: sending audio (${audioBlob.size} bytes)`);
            const formData = new FormData();
            formData.append('audio', audioBlob, 'audio.webm');

            const response = await fetch(`${this.serverUrl}/api/stt/groq`, {
                method: 'POST',
                body: formData
            });

            // Surface server-side failures instead of silently dropping the chunk.
            if (!response.ok) {
                throw new Error(`Groq STT request failed with HTTP ${response.status}`);
            }

            const data = await response.json();
            // Anything that is not a string is treated as "no transcript".
            const rawTranscript = typeof data?.transcript === 'string' ? data.transcript : '';
            const transcript = rawTranscript.trim();

            if (transcript) {
                console.log('Groq STT transcript:', rawTranscript);
                if (this.onListenFinal) this.onListenFinal(rawTranscript);
                this._deliverTranscript(transcript);
            }
        } catch (error) {
            console.error('Groq STT error:', error);
            if (this.onError) this.onError(error);
        } finally {
            this.isProcessing = false;
        }
    }

    /**
     * Hand a trimmed transcript to onResult: immediately in PTT mode, or after
     * the accumulation window in listen mode (merging consecutive chunks).
     *
     * @param {string} text - Non-empty, trimmed transcript.
     */
    _deliverTranscript(text) {
        // PTT mode: send immediately (user released button = done talking)
        if (this._micMuted) {
            this.accumulatedText = text;
            if (this.onResult) this.onResult(this.accumulatedText);
            this.accumulatedText = '';
            return;
        }

        // Listen mode: accumulate across chunks, send after the delay elapses
        this.accumulatedText = this.accumulatedText
            ? `${this.accumulatedText} ${text}`
            : text;

        this._clearAccumulationTimer();
        this._accumulationTimer = setTimeout(() => {
            this._accumulationTimer = null;
            const fullText = this.accumulatedText.trim();
            if (fullText && this.onResult) {
                console.log('Groq STT accumulated result:', fullText);
                this.onResult(fullText);
            }
            this.accumulatedText = '';
        }, this.accumulationDelayMs);
    }

    /**
     * Ensure the analyser graph exists for the current stream and the VAD
     * polling loop is running.
     */
    _startVAD() {
        // Only create AudioContext once per stream
        if (this._audioCtx && this._audioCtx.state !== 'closed') {
            // VAD already set up, just restart the animation frame loop
            if (!this._vadAnimFrame) this._runVADLoop();
            return;
        }

        this._audioCtx = new AudioContext();
        const source = this._audioCtx.createMediaStreamSource(this.stream);
        this._analyser = this._audioCtx.createAnalyser();
        this._analyser.fftSize = 512;
        source.connect(this._analyser);

        this._runVADLoop();
    }

    /**
     * Poll the analyser once per animation frame and drive the speech/silence
     * state machine that decides when to end a recording chunk. The loop ends
     * itself when isListening becomes false.
     */
    _runVADLoop() {
        const bufferLength = this._analyser.frequencyBinCount;
        const dataArray = new Uint8Array(bufferLength);

        const checkLevel = () => {
            if (!this.isListening) {
                this._vadAnimFrame = null;
                return;
            }

            this._analyser.getByteFrequencyData(dataArray);
            const average = dataArray.reduce((sum, level) => sum + level, 0) / bufferLength;
            const isSpeakingNow = average > this.vadThreshold;

            // Skip VAD processing while muted (TTS playing) — prevents speaker
            // audio from being detected as speech and queuing phantom transcripts.
            // Also skip while the PTT button is held: the button release, not a
            // pause in speech, ends a PTT recording. Otherwise a short pause would
            // stop the recorder mid-hold and the rest of the utterance would be lost.
            if (this._muteActive || this._pttHolding) {
                this._vadAnimFrame = requestAnimationFrame(checkLevel);
                return;
            }

            if (isSpeakingNow && !this.isSpeaking) {
                // Potential speech — check minimum duration before confirming
                const now = Date.now();
                if (!this._speechStartTime) {
                    this._speechStartTime = now;
                }
                if (now - this._speechStartTime < this.minSpeechMs) {
                    // Still below minimum — don't confirm yet, just keep checking
                    this._vadAnimFrame = requestAnimationFrame(checkLevel);
                    return;
                }

                // Speech confirmed (sustained above threshold for minSpeechMs)
                this.isSpeaking = true;
                this.hadSpeechInChunk = true;
                this._speechStartTime = 0;
                this._clearSilenceTimer();

                // Max recording safety timer
                if (!this.maxRecordingTimer && !this.isProcessing && !this.stoppingRecorder) {
                    this.maxRecordingTimer = setTimeout(() => {
                        this.maxRecordingTimer = null;
                        this.isSpeaking = false;
                        this.stoppingRecorder = true;
                        this._clearSilenceTimer();
                        this._stopRecorderIfRecording();
                    }, this.maxRecordingMs);
                }
            } else if (isSpeakingNow && this.isSpeaking) {
                // Continued speech — user still talking after a brief dip.
                // Only clear silence timer after sustained speech (minSpeechMs) to
                // prevent ambient noise blips from keeping the recording open forever.
                const now = Date.now();
                if (!this._resumedSpeechStart) {
                    this._resumedSpeechStart = now;
                }
                if (now - this._resumedSpeechStart >= this.minSpeechMs && this.silenceTimer) {
                    clearTimeout(this.silenceTimer);
                    this.silenceTimer = null;
                    this._resumedSpeechStart = 0;
                }
            } else if (!isSpeakingNow && !this.isSpeaking) {
                // Below threshold and not yet confirmed — reset speech start timer
                this._speechStartTime = 0;
                this._resumedSpeechStart = 0;
            } else if (!isSpeakingNow && this.isSpeaking && !this.isProcessing && !this.stoppingRecorder) {
                // Silence after confirmed speech — start silence timer
                this._resumedSpeechStart = 0;
                if (!this.silenceTimer) {
                    this.silenceTimer = setTimeout(() => {
                        this.isSpeaking = false;
                        this.stoppingRecorder = true;
                        this._stopRecorderIfRecording();
                    }, this.silenceDelayMs);
                }
            }

            this._vadAnimFrame = requestAnimationFrame(checkLevel);
        };

        this._vadAnimFrame = requestAnimationFrame(checkLevel);
    }

    /**
     * Stop listening entirely: clears all timers and mute flags, stops the
     * recorder, releases the mic stream and closes the audio context.
     */
    stop() {
        this.isListening = false;
        this.stoppingRecorder = false;
        this._micMuted = false;
        this._muteActive = false;

        this._clearSilenceTimer();
        this._clearMaxRecordingTimer();
        this._clearAccumulationTimer();
        this._stopVADLoop();

        if (this.mediaRecorder && this.mediaRecorder.state !== 'inactive') {
            this.mediaRecorder.stop();
        }

        this._releaseStream();
        this._closeAudioContext();

        console.log('Groq STT stopped');
    }

    /** Clear the busy flag and drop any accumulated transcript text. */
    resetProcessing() {
        this.isProcessing = false;
        this.accumulatedText = '';
    }

    /** Alias for mute() — VoiceConversation calls pause() during greeting. */
    pause() {
        this.mute();
    }

    /**
     * Mute STT — called when TTS starts speaking.
     * Stops recording and discards any pending audio to prevent echo.
     * Does NOT release the mic stream or change isListening state.
     */
    mute() {
        this._muteActive = true;
        this.isProcessing = true;
        this.hadSpeechInChunk = false;
        this.accumulatedText = '';
        this._clearSilenceTimer();
        this._clearMaxRecordingTimer();
        this._clearAccumulationTimer();
        // Stop recording but keep stream alive
        this._stopRecorderIfRecording();
    }

    /**
     * Resume STT after TTS finishes.
     * Restarts recording from clean state.
     */
    resume() {
        this._muteActive = false;
        this.isProcessing = false;
        this.stoppingRecorder = false;
        this.hadSpeechInChunk = false;
        this.isSpeaking = false;
        this.audioChunks = [];

        // Restart recording only if the session is active, not PTT-muted,
        // and the mic stream is still live.
        if (!this.isListening || this._micMuted) return;
        if (!this.stream || !this.stream.active) return;

        // MediaRecorder may need to be recreated if stream changed
        if (!this.mediaRecorder || this.mediaRecorder.stream !== this.stream) {
            this._setupRecorder();
        }
        if (this.mediaRecorder.state === 'inactive') {
            this.mediaRecorder.start();
        }
        // Restart VAD loop if it stopped
        if (!this._vadAnimFrame) {
            this._startVAD();
        }
    }

    // --- PTT helpers (called from PTT code in app.js) ---

    /**
     * PTT activate — start recording for push-to-talk.
     * Called when user presses the PTT button.
     */
    pttActivate() {
        this._pttHolding = true;
        this._micMuted = false;
        this._muteActive = false; // Clear stale TTS mute — PTT overrides
        this.isProcessing = false;
        this.accumulatedText = '';
        this.hadSpeechInChunk = false;
        this.audioChunks = [];
        this._clearSilenceTimer();
        this._clearMaxRecordingTimer();

        // Start recording
        if (this.mediaRecorder && this.mediaRecorder.state === 'inactive') {
            this.mediaRecorder.start();
        }
    }

    /**
     * PTT release — stop recording and force transcription.
     * Called when user releases the PTT button.
     * Unlike mute(), this DOES process the captured audio.
     */
    pttRelease() {
        this._pttHolding = false;
        this._micMuted = true;
        this.hadSpeechInChunk = true; // Force transcription regardless
        this.stoppingRecorder = true;

        this._clearSilenceTimer();
        this._clearMaxRecordingTimer();

        // onstop handler will send to Groq and call onResult
        this._stopRecorderIfRecording();
    }

    /**
     * PTT mute — stop recording and discard audio.
     * Called when PTT mode is toggled ON (mic off by default).
     */
    pttMute() {
        this._pttHolding = false;
        this._micMuted = true;
        this.hadSpeechInChunk = false;
        this._clearSilenceTimer();
        this._clearMaxRecordingTimer();
        this.isProcessing = true; // Prevents onstop from transcribing
        this._stopRecorderIfRecording();
    }

    /**
     * PTT unmute — resume continuous listening.
     * Called when PTT mode is toggled OFF.
     */
    pttUnmute() {
        this._micMuted = false;
        this._pttHolding = false;
        this.isProcessing = false;
        this.stoppingRecorder = false;
        this.hadSpeechInChunk = false;
        this.audioChunks = [];

        if (this.isListening && this.mediaRecorder && this.mediaRecorder.state === 'inactive') {
            this.mediaRecorder.start();
        }
    }

    // --- internal helpers ---

    /** Cancel the pending end-of-speech timer, if any. */
    _clearSilenceTimer() {
        if (this.silenceTimer) {
            clearTimeout(this.silenceTimer);
            this.silenceTimer = null;
        }
    }

    /** Cancel the pending max-recording safety timer, if any. */
    _clearMaxRecordingTimer() {
        if (this.maxRecordingTimer) {
            clearTimeout(this.maxRecordingTimer);
            this.maxRecordingTimer = null;
        }
    }

    /** Cancel the pending transcript accumulation timer, if any. */
    _clearAccumulationTimer() {
        if (this._accumulationTimer) {
            clearTimeout(this._accumulationTimer);
            this._accumulationTimer = null;
        }
    }

    /** Cancel the scheduled VAD animation frame, if any. */
    _stopVADLoop() {
        if (this._vadAnimFrame) {
            cancelAnimationFrame(this._vadAnimFrame);
            this._vadAnimFrame = null;
        }
    }

    /** Stop the recorder only when it is actively recording. */
    _stopRecorderIfRecording() {
        if (this.mediaRecorder && this.mediaRecorder.state === 'recording') {
            this.mediaRecorder.stop();
        }
    }

    /** Stop every track of the mic stream and forget the stream. */
    _releaseStream() {
        if (this.stream) {
            this.stream.getTracks().forEach((track) => track.stop());
            this.stream = null;
        }
    }

    /** Close the VAD audio context (ignoring close errors) and drop the analyser. */
    _closeAudioContext() {
        if (this._audioCtx) {
            this._audioCtx.close().catch(() => {});
            this._audioCtx = null;
            this._analyser = null;
        }
    }
}
|
|
488
|
+
|
|
489
|
+
|
|
490
|
+
// ===== GROQ WAKE WORD DETECTOR =====
|
|
491
|
+
// Listens for wake words using Groq Whisper API.
|
|
492
|
+
// Continuously records, transcribes, and checks for wake phrases.
|
|
493
|
+
class GroqWakeWordDetector {
    /**
     * Wake-word listener built on top of GroqSTT.
     *
     * Set `wakeWords` (array of phrases, matched case-insensitively as
     * substrings of each transcript) and `onWakeWordDetected` before start().
     */
    constructor() {
        this.isListening = false;
        this.onWakeWordDetected = null;
        this.wakeWords = ['wake up'];
        this._stt = null;
    }

    /** @returns {boolean} true when the browser exposes getUserMedia. */
    isSupported() {
        return !!(navigator.mediaDevices && navigator.mediaDevices.getUserMedia);
    }

    /**
     * Create a dedicated GroqSTT instance tuned for wake-word detection and
     * start it.
     *
     * @returns {Promise<boolean>} true when listening (or already listening);
     *   false when the underlying STT failed to start or stop() was called
     *   while startup was still pending.
     */
    async start() {
        if (this.isListening) return true;

        const stt = new GroqSTT();
        // Chunking settings for wake word detection
        stt.silenceDelayMs = 1500;   // 1.5s of silence ends a chunk
        stt.maxRecordingMs = 10000;  // 10s max chunks
        stt.vadThreshold = 40;       // Sensitive but not noise-triggering

        stt.onResult = (transcript) => {
            // The STT may still deliver its final chunk after stop(); a stopped
            // (or superseded) detector must not fire the wake callback.
            if (!this.isListening || this._stt !== stt) return;

            const lower = transcript.toLowerCase();
            console.log(`Wake word detector heard: "${transcript}"`);
            // Compare case-insensitively on both sides; skip empty phrases,
            // which would otherwise match every transcript.
            const matched = this.wakeWords.some((ww) => {
                const phrase = String(ww ?? '').toLowerCase();
                return phrase !== '' && lower.includes(phrase);
            });
            if (matched) {
                console.log('Wake word detected!');
                if (this.onWakeWordDetected) {
                    this.onWakeWordDetected();
                }
            }
        };

        stt.onError = (error) => {
            console.warn('Wake word detector error:', error);
        };

        this._stt = stt;
        this.isListening = true;
        const ok = await stt.start();

        if (this._stt !== stt) {
            // stop() ran while startup was pending: make sure the orphaned
            // STT instance does not keep the microphone open.
            stt.onResult = null;
            stt.onError = null;
            stt.stop();
            return false;
        }

        if (!ok) {
            this.isListening = false;
            this._stt = null;
            // Let the failed instance release anything it may still hold.
            stt.stop();
            return false;
        }

        console.log('Groq wake word detector started');
        return true;
    }

    /** Stop listening and dispose of the underlying GroqSTT instance. */
    stop() {
        this.isListening = false;
        if (this._stt) {
            const stt = this._stt;
            this._stt = null;
            // Detach callbacks first so a late transcript cannot reach us.
            stt.onResult = null;
            stt.onError = null;
            stt.stop();
        }
        console.log('Groq wake word detector stopped');
    }

    /**
     * Flip between listening and stopped.
     *
     * @returns {Promise<boolean>} the resulting listening state.
     */
    async toggle() {
        if (this.isListening) {
            this.stop();
            return false;
        }
        return await this.start();
    }
}
|
|
558
|
+
|
|
559
|
+
export { GroqSTT, GroqWakeWordDetector };
|