@unctad-ai/voice-agent-core 0.1.1
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/dist/config/defaults.d.ts +128 -0
- package/dist/config/defaults.d.ts.map +1 -0
- package/dist/config/defaults.js +169 -0
- package/dist/config/defaults.js.map +1 -0
- package/dist/contexts/SiteConfigContext.d.ts +7 -0
- package/dist/contexts/SiteConfigContext.d.ts.map +1 -0
- package/dist/contexts/SiteConfigContext.js +14 -0
- package/dist/contexts/SiteConfigContext.js.map +1 -0
- package/dist/hooks/useAudioPlayback.d.ts +18 -0
- package/dist/hooks/useAudioPlayback.d.ts.map +1 -0
- package/dist/hooks/useAudioPlayback.js +482 -0
- package/dist/hooks/useAudioPlayback.js.map +1 -0
- package/dist/hooks/useTenVAD.d.ts +42 -0
- package/dist/hooks/useTenVAD.d.ts.map +1 -0
- package/dist/hooks/useTenVAD.js +318 -0
- package/dist/hooks/useTenVAD.js.map +1 -0
- package/dist/hooks/useVoiceAgent.d.ts +50 -0
- package/dist/hooks/useVoiceAgent.d.ts.map +1 -0
- package/dist/hooks/useVoiceAgent.js +1005 -0
- package/dist/hooks/useVoiceAgent.js.map +1 -0
- package/dist/index.d.ts +14 -0
- package/dist/index.d.ts.map +1 -0
- package/dist/index.js +10 -0
- package/dist/index.js.map +1 -0
- package/dist/services/voiceApi.d.ts +22 -0
- package/dist/services/voiceApi.d.ts.map +1 -0
- package/dist/services/voiceApi.js +93 -0
- package/dist/services/voiceApi.js.map +1 -0
- package/dist/types/config.d.ts +53 -0
- package/dist/types/config.d.ts.map +1 -0
- package/dist/types/config.js +2 -0
- package/dist/types/config.js.map +1 -0
- package/dist/types/errors.d.ts +2 -0
- package/dist/types/errors.d.ts.map +1 -0
- package/dist/types/errors.js +2 -0
- package/dist/types/errors.js.map +1 -0
- package/dist/types/index.d.ts +5 -0
- package/dist/types/index.d.ts.map +1 -0
- package/dist/types/index.js +5 -0
- package/dist/types/index.js.map +1 -0
- package/dist/types/settings.d.ts +26 -0
- package/dist/types/settings.d.ts.map +1 -0
- package/dist/types/settings.js +2 -0
- package/dist/types/settings.js.map +1 -0
- package/dist/types/voice.d.ts +20 -0
- package/dist/types/voice.d.ts.map +1 -0
- package/dist/types/voice.js +11 -0
- package/dist/types/voice.js.map +1 -0
- package/dist/utils/audioUtils.d.ts +2 -0
- package/dist/utils/audioUtils.d.ts.map +1 -0
- package/dist/utils/audioUtils.js +30 -0
- package/dist/utils/audioUtils.js.map +1 -0
- package/dist/utils/wavParser.d.ts +27 -0
- package/dist/utils/wavParser.d.ts.map +1 -0
- package/dist/utils/wavParser.js +75 -0
- package/dist/utils/wavParser.js.map +1 -0
- package/package.json +47 -0
|
@@ -0,0 +1,1005 @@
|
|
|
1
|
+
import { useCallback, useEffect, useRef, useState } from 'react';
|
|
2
|
+
import { useTenVAD } from './useTenVAD';
|
|
3
|
+
import { useChat } from '@ai-sdk/react';
|
|
4
|
+
import { DefaultChatTransport, lastAssistantMessageIsCompleteWithToolCalls } from 'ai';
|
|
5
|
+
import { useNavigate, useLocation, useParams } from 'react-router';
|
|
6
|
+
import { useUIActionRegistry, useFormFieldRegistry, createClientToolHandler } from '@unctad-ai/voice-agent-registries';
|
|
7
|
+
import { float32ToWav } from '../utils/audioUtils';
|
|
8
|
+
import { transcribeAudio, synthesizeSpeech, streamSpeech, checkLLMHealth, } from '../services/voiceApi';
|
|
9
|
+
import { useAudioPlayback } from './useAudioPlayback';
|
|
10
|
+
import { useSiteConfig } from '../contexts/SiteConfigContext';
|
|
11
|
+
import { BARGE_IN, GUARD_DELAY_MS, MAX_STT_RETRIES, RETRY_BASE_DELAY_MS, MISFIRE_DISMISS_MS, LLM_ERROR_DISMISS_MS, MAX_NO_SPEECH_PROB, MIN_AVG_LOGPROB, MIC_TOGGLE_DEBOUNCE_MS, PIPELINE_TIMEOUT_MS, VAD, SILENT_MARKER, END_SESSION_MARKER, ACTION_BADGE_CONFIG, } from '../config/defaults';
|
|
12
|
+
/**
 * Remove reasoning-model chain-of-thought from raw LLM output.
 * Handles both explicit <think>...</think> tags and untagged
 * reasoning paragraphs that precede the final answer.
 */
function stripChainOfThought(raw) {
    // Drop explicitly tagged reasoning sections first.
    const withoutTags = raw.replace(/<think>[\s\S]*?<\/think>/gi, '');
    // Break the remainder into non-empty, trimmed paragraphs.
    const blocks = withoutTags
        .split(/\n\n+/)
        .map((block) => block.trim())
        .filter(Boolean);
    // With multiple paragraphs, heuristically detect model self-talk in
    // every paragraph except the last; if any is found, keep only the
    // final paragraph (assumed to be the actual answer).
    if (blocks.length > 1) {
        const selfTalk = /\b(we need to|we should|we must|according to rules|the user says|ensure no|two sentences|under \d+ words|no markdown|no contractions|let me think|so we|that'?s \d+ sentences)\b/i;
        const leading = blocks.slice(0, -1);
        if (leading.some((block) => selfTalk.test(block))) {
            return blocks[blocks.length - 1].trim();
        }
    }
    return withoutTags.trim();
}
|
|
34
|
+
/** Strip markdown formatting and LLM artifacts for the transcript panel. */
function sanitizeForTranscript(raw) {
    let out = stripChainOfThought(raw);
    // Markdown table rows and separator rows.
    out = out.replace(/\|[^\n]+\|/g, '');
    out = out.replace(/^\s*[-|: ]+$/gm, '');
    // Bullet and numbered list markers at line start.
    out = out.replace(/^\s*[-–•*]\s+/gm, '');
    out = out.replace(/^\s*\d+\.\s+/gm, '');
    // Bold/italic emphasis, inline code, then fenced code blocks.
    out = out.replace(/\*{1,2}([^*]+)\*{1,2}/g, '$1');
    out = out.replace(/`([^`]+)`/g, '$1');
    out = out.replace(/```[\s\S]*?```/g, '');
    // Strip bracketed stage directions ([Awaiting response], [thinking], etc.)
    out = out.replace(/\[[^\]]{2,}\]/g, '');
    // Flatten newlines and collapse repeated spaces/dots.
    out = out.replace(/\n{2,}/g, ' ');
    out = out.replace(/\n/g, ' ');
    out = out.replace(/ {2,}/g, ' ');
    out = out.replace(/\.{2,}/g, '.');
    out = out.replace(/\.\s*\./g, '.');
    return out.trim();
}
|
|
53
|
+
/**
 * Known Whisper hallucinations on near-silent audio.
 * Whisper frequently "transcribes" silence or background noise as one of
 * these phrases; transcripts matching an entry are discarded as noise
 * instead of being forwarded to the LLM.
 */
const WHISPER_HALLUCINATIONS = new Set([
    'thank you.',
    'thank you',
    'thank you for watching.',
    'thank you for watching',
    'thanks.',
    'thanks',
    'thanks for watching.',
    'thanks for watching',
    'bye.',
    'bye',
    'goodbye.',
    'goodbye',
    "you're welcome.",
    "you're welcome",
    'hmm.',
    'hmm',
    'huh.',
    'huh',
    'oh.',
    'oh',
    'ah.',
    'ah',
    'uh.',
    'uh',
    'so.',
    'so',
    'well.',
    'you',
    'the end.',
    'the end',
    'subtitle',
    'subtitles',
    'subscribe',
    'like and subscribe',
    'sort of',
    'sort of.',
    'five.',
    'five',
    'one.',
    'one',
    'two.',
    'two',
    'three.',
    'three',
    // Non-speech sounds Whisper transcribes literally
    'cough',
    'cough.',
    'coughing',
    'coughing.',
    'sigh',
    'sigh.',
    'clap',
    'clap.',
    'click',
    'click.',
    'knock',
    'knock.',
    // Common non-English hallucinations
    'продолжение следует',
    'продолжение следует...',
    'sous-titres',
    'sous-titrage',
    'merci.',
    'merci',
    'silencio',
    'ready for your approval.',
]);
|
|
122
|
+
/**
 * Compute the root-mean-square energy of a Float32 audio buffer.
 * Used as a cheap loudness gate to discard near-silent recordings.
 *
 * @param {Float32Array} audio - PCM samples, typically in [-1, 1].
 * @returns {number} RMS energy; 0 for an empty buffer.
 *
 * Fix: the original computed sqrt(0/0) = NaN on an empty buffer, and
 * `NaN < threshold` is false, so degenerate empty audio silently
 * bypassed the RMS silence gate. Returning 0 makes empty input be
 * treated as silence, which is the intended behavior.
 */
function computeRMS(audio) {
    if (audio.length === 0)
        return 0;
    let sum = 0;
    for (let i = 0; i < audio.length; i++) {
        sum += audio[i] * audio[i];
    }
    return Math.sqrt(sum / audio.length);
}
|
|
130
|
+
/**
 * Split text into sentences for pipelined TTS.
 * Fires one TTS request per sentence in parallel so the first sentence
 * plays within ~1s while subsequent sentences are still generating.
 * Merges short fragments (< 8 words) with the previous sentence to
 * avoid tiny TTS requests that produce choppy audio.
 */
function splitSentences(text) {
    const trimmed = text.trim();
    if (!trimmed)
        return [];
    const sentences = trimmed
        .split(/(?<=[.!?])\s+/)
        .map((part) => part.trim())
        .filter((part) => part.length > 0);
    if (sentences.length <= 1)
        return sentences.length > 0 ? sentences : [trimmed];
    // Fold short fragments into their predecessor to avoid choppy TTS.
    return sentences.reduce((acc, sentence) => {
        const prev = acc[acc.length - 1];
        if (prev !== undefined && prev.split(/\s+/).length < 8) {
            acc[acc.length - 1] = `${prev} ${sentence}`;
        }
        else {
            acc.push(sentence);
        }
        return acc;
    }, []);
}
|
|
156
|
+
/**
 * VAD tuning config — alias for the centralized `VAD` constant imported
 * from ../config/defaults, kept under a local name for readability.
 */
const VAD_CONFIG = VAD;
|
|
158
|
+
/**
 * Pretty-print a voice-pipeline timing breakdown to the devtools console
 * as a collapsible group containing a table. Rows are only emitted for
 * timing fields that are present on `t`; TOTAL is always shown.
 */
function logTimings(t) {
    const rows = {};
    // Helper: add a "<n> ms" row (with optional decimals/suffix) when the
    // value is present.
    const addMs = (label, value, digits = 0, suffix = '') => {
        if (value != null)
            rows[label] = `${value.toFixed(digits)} ms${suffix}`;
    };
    addMs('Speech duration', t.speechDurationMs);
    if (t.wavSizeBytes != null)
        rows['WAV size'] = `${(t.wavSizeBytes / 1024).toFixed(1)} KB`;
    addMs('WAV encode', t.wavEncodeMs, 1);
    addMs('STT', t.sttMs, 0, t.sttRetries ? ` (${t.sttRetries} retries)` : '');
    addMs('LLM send', t.llmSendMs);
    addMs('LLM wait', t.llmWaitMs);
    addMs('LLM total', t.llmTotalMs);
    addMs('TTS first chunk', t.ttsFirstChunkMs);
    addMs('TTS total', t.ttsTotalMs);
    addMs('TTS (buffered)', t.ttsMs);
    addMs('Playback', t.playbackMs);
    rows['TOTAL'] = `${t.totalMs.toFixed(0)} ms`;
    console.group(`%c⏱ Voice Pipeline [${t.pipeline}] — ${t.totalMs.toFixed(0)} ms`, 'color: #4fc3f7; font-weight: bold');
    console.table(rows);
    console.groupEnd();
}
|
|
187
|
+
/**
 * Map an arbitrary pipeline error to a coarse user-facing error code.
 * Mic-permission DOMExceptions are checked first; everything else is
 * classified by message substring, falling back to 'network_error'.
 */
function classifyError(err) {
    if (err instanceof DOMException) {
        if (err.name === 'NotAllowedError')
            return 'mic_denied';
        if (err.name === 'NotFoundError')
            return 'mic_unavailable';
    }
    const message = err instanceof Error ? err.message : String(err);
    // Ordered substring matchers — first match wins (same order as before).
    const matchers = [
        [/STT|Transcription/, 'stt_failed'],
        [/TTS|synthesis/, 'tts_failed'],
        [/fetch|network|Failed to fetch/, 'network_error'],
        [/chat|LLM/, 'llm_failed'],
    ];
    for (const [pattern, code] of matchers) {
        if (pattern.test(message))
            return code;
    }
    return 'network_error';
}
|
|
203
|
+
export function useVoiceAgent({ bargeInEnabled = true, settings, volumeRef, speedRef, }) {
|
|
204
|
+
const config = useSiteConfig();
|
|
205
|
+
const [state, setState] = useState('IDLE');
|
|
206
|
+
const [messages, setMessages] = useState([]);
|
|
207
|
+
const [currentTranscript, setCurrentTranscript] = useState('');
|
|
208
|
+
const [voiceError, setVoiceError] = useState(null);
|
|
209
|
+
const [lastTimings, setLastTimings] = useState(null);
|
|
210
|
+
// Transient LLM error — shows briefly then auto-clears so the user can retry.
|
|
211
|
+
// Unlike network_error (persistent offline), tool_use_failed is intermittent.
|
|
212
|
+
const setTransientLLMError = useCallback(() => {
|
|
213
|
+
setVoiceError('llm_failed');
|
|
214
|
+
setTimeout(() => {
|
|
215
|
+
setVoiceError((prev) => (prev === 'llm_failed' ? null : prev));
|
|
216
|
+
}, LLM_ERROR_DISMISS_MS);
|
|
217
|
+
}, []);
|
|
218
|
+
const stateRef = useRef(state);
|
|
219
|
+
useEffect(() => {
|
|
220
|
+
stateRef.current = state;
|
|
221
|
+
}, [state]);
|
|
222
|
+
// Ref for settings — avoids stale closures in useCallback bodies
|
|
223
|
+
const settingsRef = useRef(settings);
|
|
224
|
+
useEffect(() => {
|
|
225
|
+
settingsRef.current = settings;
|
|
226
|
+
}, [settings]);
|
|
227
|
+
const abortRef = useRef(null);
|
|
228
|
+
const bargeInFrames = useRef(0);
|
|
229
|
+
const sttRetryCount = useRef(0);
|
|
230
|
+
const lastToggleRef = useRef(0);
|
|
231
|
+
const processingRef = useRef(false);
|
|
232
|
+
/** Tracks whether the current pipeline was initiated via text input */
|
|
233
|
+
const textPipelineRef = useRef(false);
|
|
234
|
+
/** Two-phase barge-in: mute first, confirm speech before destroying TTS */
|
|
235
|
+
const bargeInPendingRef = useRef(false);
|
|
236
|
+
/** If TTS playback ends while barge-in is pending, we can't resume audio */
|
|
237
|
+
const playbackEndedDuringBargeInRef = useRef(false);
|
|
238
|
+
/** When true, close after current TTS finishes (LLM sent [END_SESSION]) */
|
|
239
|
+
const sessionEndingRef = useRef(false);
|
|
240
|
+
/** Signals the panel to close after farewell TTS played */
|
|
241
|
+
const [sessionEnded, setSessionEnded] = useState(false);
|
|
242
|
+
// --- Vercel AI SDK: useChat replaces CopilotKit ---
|
|
243
|
+
const navigate = useNavigate();
|
|
244
|
+
const location = useLocation();
|
|
245
|
+
const params = useParams();
|
|
246
|
+
const { actions: uiActions, execute: executeUIAction } = useUIActionRegistry();
|
|
247
|
+
const formRegistry = useFormFieldRegistry();
|
|
248
|
+
const handleClientTool = createClientToolHandler({
|
|
249
|
+
navigate,
|
|
250
|
+
executeUIAction: executeUIAction,
|
|
251
|
+
getFormFields: () => formRegistry.fields,
|
|
252
|
+
setFormValue: formRegistry.setValue,
|
|
253
|
+
config,
|
|
254
|
+
});
|
|
255
|
+
const roundTripCountRef = useRef(0);
|
|
256
|
+
const lastAutoSendMsgIdRef = useRef(null);
|
|
257
|
+
const MAX_CLIENT_ROUND_TRIPS = 3;
|
|
258
|
+
const NAVIGATION_TOOLS = ['navigateTo', 'viewService', 'startApplication'];
|
|
259
|
+
// Client tools have no server-side `execute` — the client must provide results.
|
|
260
|
+
// Server tools (searchServices, getServiceDetails, etc.) are already executed
|
|
261
|
+
// server-side; their results arrive in the stream and must NOT be overwritten.
|
|
262
|
+
const CLIENT_TOOLS = new Set([
|
|
263
|
+
'navigateTo',
|
|
264
|
+
'viewService',
|
|
265
|
+
'startApplication',
|
|
266
|
+
'performUIAction',
|
|
267
|
+
'getFormSchema',
|
|
268
|
+
'fillFormFields',
|
|
269
|
+
]);
|
|
270
|
+
const actionSeqRef = useRef(0);
|
|
271
|
+
const { messages: chatMessages, setMessages: setChatMessages, status: chatStatus, stop: chatStop, sendMessage: chatSendMessage, addToolOutput: chatAddToolOutput, } = useChat({
|
|
272
|
+
transport: new DefaultChatTransport({
|
|
273
|
+
api: '/api/chat',
|
|
274
|
+
headers: () => {
|
|
275
|
+
const apiKey = import.meta.env.VITE_API_KEY;
|
|
276
|
+
return apiKey ? { 'X-API-Key': apiKey } : {};
|
|
277
|
+
},
|
|
278
|
+
body: () => ({
|
|
279
|
+
clientState: {
|
|
280
|
+
route: location.pathname,
|
|
281
|
+
currentService: params.serviceId
|
|
282
|
+
? (() => {
|
|
283
|
+
const s = config.services.find(sv => sv.id === params.serviceId);
|
|
284
|
+
return s ? { id: s.id, title: s.title, category: s.category } : null;
|
|
285
|
+
})()
|
|
286
|
+
: null,
|
|
287
|
+
categories: config.categories.map((c) => ({
|
|
288
|
+
category: c.title,
|
|
289
|
+
count: c.services.length,
|
|
290
|
+
})),
|
|
291
|
+
uiActions: uiActions.length > 0
|
|
292
|
+
? uiActions.map((a) => ({
|
|
293
|
+
id: a.id,
|
|
294
|
+
description: a.description,
|
|
295
|
+
category: a.category,
|
|
296
|
+
params: a.params,
|
|
297
|
+
}))
|
|
298
|
+
: [],
|
|
299
|
+
formStatus: formRegistry.fields.length > 0
|
|
300
|
+
? {
|
|
301
|
+
fieldCount: formRegistry.fields.length,
|
|
302
|
+
groups: [
|
|
303
|
+
...new Set(formRegistry.fields.map((f) => f.group).filter(Boolean)),
|
|
304
|
+
],
|
|
305
|
+
}
|
|
306
|
+
: null,
|
|
307
|
+
},
|
|
308
|
+
}),
|
|
309
|
+
}),
|
|
310
|
+
// Auto-send a follow-up request after all client tool outputs are provided.
|
|
311
|
+
// This runs after addToolOutput updates a tool part — when every tool
|
|
312
|
+
// invocation in the last assistant message has resolved, the SDK sends the
|
|
313
|
+
// results back to the model for the next step.
|
|
314
|
+
sendAutomaticallyWhen({ messages: msgs }) {
|
|
315
|
+
if (roundTripCountRef.current > MAX_CLIENT_ROUND_TRIPS)
|
|
316
|
+
return false;
|
|
317
|
+
const last = msgs[msgs.length - 1];
|
|
318
|
+
if (!last || last.role !== 'assistant')
|
|
319
|
+
return false;
|
|
320
|
+
if (last.id === lastAutoSendMsgIdRef.current)
|
|
321
|
+
return false;
|
|
322
|
+
// Use the SDK's own check: filters providerExecuted (server) tools,
|
|
323
|
+
// respects step boundaries, and verifies all client tool parts are resolved.
|
|
324
|
+
const complete = lastAssistantMessageIsCompleteWithToolCalls({ messages: msgs });
|
|
325
|
+
if (complete) {
|
|
326
|
+
lastAutoSendMsgIdRef.current = last.id;
|
|
327
|
+
roundTripCountRef.current++;
|
|
328
|
+
console.debug('[sendAutomaticallyWhen] follow-up #' + roundTripCountRef.current);
|
|
329
|
+
return true;
|
|
330
|
+
}
|
|
331
|
+
return false;
|
|
332
|
+
},
|
|
333
|
+
onFinish({ message, isAbort }) {
|
|
334
|
+
if (isAbort)
|
|
335
|
+
return; // Don't trigger TTS on aborted requests
|
|
336
|
+
// onFinish fires per HTTP response, not per user turn.
|
|
337
|
+
// Guard: only trigger TTS when there is actual text content.
|
|
338
|
+
const textParts = (message.parts || []).filter((p) => p.type === 'text');
|
|
339
|
+
const text = textParts.map((p) => p.text || '').join('');
|
|
340
|
+
if (!text)
|
|
341
|
+
return; // Intermediate response with only tool calls
|
|
342
|
+
// NOTE: Do NOT reset roundTripCountRef here — onFinish fires per HTTP
|
|
343
|
+
// response (including auto-send follow-ups). Resetting here would defeat
|
|
344
|
+
// the round-trip guard and allow infinite loops. The counter resets when
|
|
345
|
+
// the USER sends a new message (in sendTextMessage / voice pipeline).
|
|
346
|
+
const cleaned = sanitizeForTranscript(text);
|
|
347
|
+
// Silent rejection
|
|
348
|
+
if (cleaned?.includes(SILENT_MARKER)) {
|
|
349
|
+
console.debug('[VoiceAgent] LLM returned SILENT marker, skipping TTS');
|
|
350
|
+
setMessages((prev) => (prev.length > 0 ? prev.slice(0, -1) : prev));
|
|
351
|
+
setVoiceError('not_addressed');
|
|
352
|
+
setTimeout(() => {
|
|
353
|
+
setVoiceError((prev) => (prev === 'not_addressed' ? null : prev));
|
|
354
|
+
}, MISFIRE_DISMISS_MS);
|
|
355
|
+
if (stateRef.current === 'PROCESSING') {
|
|
356
|
+
const nextState = textPipelineRef.current ? 'IDLE' : 'LISTENING';
|
|
357
|
+
textPipelineRef.current = false;
|
|
358
|
+
stateRef.current = nextState;
|
|
359
|
+
setState(nextState);
|
|
360
|
+
}
|
|
361
|
+
return;
|
|
362
|
+
}
|
|
363
|
+
// Session end
|
|
364
|
+
let ttsText = cleaned || '';
|
|
365
|
+
if (text.includes(END_SESSION_MARKER)) {
|
|
366
|
+
ttsText = config.farewellMessage;
|
|
367
|
+
sessionEndingRef.current = true;
|
|
368
|
+
}
|
|
369
|
+
// Update transcript
|
|
370
|
+
if (ttsText) {
|
|
371
|
+
setCurrentTranscript(ttsText);
|
|
372
|
+
setMessages((prev) => [
|
|
373
|
+
...prev,
|
|
374
|
+
{ role: 'assistant', text: ttsText, timestamp: Date.now() },
|
|
375
|
+
]);
|
|
376
|
+
}
|
|
377
|
+
// TTS
|
|
378
|
+
if (ttsText && ttsText !== SILENT_MARKER && stateRef.current === 'PROCESSING') {
|
|
379
|
+
stateRef.current = 'AI_SPEAKING';
|
|
380
|
+
setState('AI_SPEAKING');
|
|
381
|
+
const curSettings = settingsRef.current;
|
|
382
|
+
// TTS disabled — skip synthesis
|
|
383
|
+
if (!curSettings.ttsEnabled) {
|
|
384
|
+
const nextState = textPipelineRef.current ? 'IDLE' : 'LISTENING';
|
|
385
|
+
textPipelineRef.current = false;
|
|
386
|
+
stateRef.current = nextState;
|
|
387
|
+
setState(nextState);
|
|
388
|
+
return;
|
|
389
|
+
}
|
|
390
|
+
const doTTS = async () => {
|
|
391
|
+
const ttsParams = {
|
|
392
|
+
temperature: curSettings.expressiveness,
|
|
393
|
+
maxWords: curSettings.responseLength,
|
|
394
|
+
};
|
|
395
|
+
abortRef.current = new AbortController();
|
|
396
|
+
try {
|
|
397
|
+
const stream = streamSpeech(ttsText, abortRef.current.signal, ttsParams, curSettings.ttsTimeoutMs);
|
|
398
|
+
if (stateRef.current === 'AI_SPEAKING') {
|
|
399
|
+
await playStreamingAudio(stream, abortRef.current.signal);
|
|
400
|
+
}
|
|
401
|
+
}
|
|
402
|
+
catch (streamErr) {
|
|
403
|
+
if (streamErr.name !== 'AbortError') {
|
|
404
|
+
console.warn('Streaming TTS failed, falling back to buffered:', streamErr);
|
|
405
|
+
try {
|
|
406
|
+
if (!abortRef.current)
|
|
407
|
+
throw new DOMException('Aborted', 'AbortError');
|
|
408
|
+
const sentences = splitSentences(ttsText);
|
|
409
|
+
for (const sentence of sentences) {
|
|
410
|
+
if (abortRef.current.signal.aborted)
|
|
411
|
+
break;
|
|
412
|
+
if (stateRef.current !== 'AI_SPEAKING')
|
|
413
|
+
break;
|
|
414
|
+
const audio = await synthesizeSpeech(sentence, abortRef.current.signal, ttsParams, curSettings.ttsTimeoutMs);
|
|
415
|
+
if (stateRef.current !== 'AI_SPEAKING')
|
|
416
|
+
break;
|
|
417
|
+
await playAudio(audio);
|
|
418
|
+
}
|
|
419
|
+
}
|
|
420
|
+
catch (fbErr) {
|
|
421
|
+
if (fbErr.name !== 'AbortError') {
|
|
422
|
+
console.error('TTS failed:', fbErr);
|
|
423
|
+
setVoiceError('tts_failed');
|
|
424
|
+
setTimeout(() => {
|
|
425
|
+
if (stateRef.current === 'AI_SPEAKING') {
|
|
426
|
+
const ns = textPipelineRef.current ? 'IDLE' : 'LISTENING';
|
|
427
|
+
textPipelineRef.current = false;
|
|
428
|
+
setState(ns);
|
|
429
|
+
}
|
|
430
|
+
}, 2000);
|
|
431
|
+
}
|
|
432
|
+
}
|
|
433
|
+
}
|
|
434
|
+
}
|
|
435
|
+
};
|
|
436
|
+
doTTS();
|
|
437
|
+
}
|
|
438
|
+
else if (stateRef.current === 'PROCESSING') {
|
|
439
|
+
// No TTS needed (empty or silent) — go back to listening
|
|
440
|
+
const nextState = textPipelineRef.current ? 'IDLE' : 'LISTENING';
|
|
441
|
+
textPipelineRef.current = false;
|
|
442
|
+
stateRef.current = nextState;
|
|
443
|
+
setState(nextState);
|
|
444
|
+
}
|
|
445
|
+
},
|
|
446
|
+
async onToolCall({ toolCall }) {
|
|
447
|
+
console.debug('[onToolCall]', toolCall.toolName, 'client:', CLIENT_TOOLS.has(toolCall.toolName));
|
|
448
|
+
const isClientTool = CLIENT_TOOLS.has(toolCall.toolName);
|
|
449
|
+
// Emit action badge for all tools (server and client)
|
|
450
|
+
const badgeConfig = ACTION_BADGE_CONFIG[toolCall.toolName];
|
|
451
|
+
if (badgeConfig) {
|
|
452
|
+
// For server tools, just show the label — result is handled server-side.
|
|
453
|
+
// For client tools, we'll update the badge after execution.
|
|
454
|
+
setMessages((prev) => [
|
|
455
|
+
...prev,
|
|
456
|
+
{
|
|
457
|
+
role: 'action',
|
|
458
|
+
text: badgeConfig.label,
|
|
459
|
+
timestamp: Date.now() + ++actionSeqRef.current * 0.001,
|
|
460
|
+
action: { name: toolCall.toolName, category: badgeConfig.category },
|
|
461
|
+
},
|
|
462
|
+
]);
|
|
463
|
+
}
|
|
464
|
+
// Only handle client-side tools — server tools already have results
|
|
465
|
+
// from the stream and must not be overwritten via addToolOutput.
|
|
466
|
+
if (!isClientTool)
|
|
467
|
+
return;
|
|
468
|
+
roundTripCountRef.current++;
|
|
469
|
+
if (roundTripCountRef.current > MAX_CLIENT_ROUND_TRIPS) {
|
|
470
|
+
console.warn(`[VoiceAgent] Client round-trip limit reached`);
|
|
471
|
+
return;
|
|
472
|
+
}
|
|
473
|
+
// Wait for React to flush state for form schema reads
|
|
474
|
+
if (toolCall.toolName === 'getFormSchema') {
|
|
475
|
+
await new Promise((r) => requestAnimationFrame(r));
|
|
476
|
+
}
|
|
477
|
+
const result = await handleClientTool(toolCall.toolName, toolCall.input);
|
|
478
|
+
// Wait for navigation to settle
|
|
479
|
+
if (NAVIGATION_TOOLS.includes(toolCall.toolName)) {
|
|
480
|
+
await new Promise((r) => requestAnimationFrame(r));
|
|
481
|
+
}
|
|
482
|
+
// Update the action badge with the result snippet
|
|
483
|
+
if (badgeConfig && result) {
|
|
484
|
+
let resultSnippet = result;
|
|
485
|
+
if (resultSnippet.length > 40) {
|
|
486
|
+
const truncated = resultSnippet.slice(0, 40);
|
|
487
|
+
const lastSpace = truncated.lastIndexOf(' ');
|
|
488
|
+
resultSnippet =
|
|
489
|
+
(lastSpace > 15 ? truncated.slice(0, lastSpace) : truncated).trimEnd() + '\u2026';
|
|
490
|
+
}
|
|
491
|
+
const label = `${badgeConfig.label} \u00b7 ${resultSnippet}`;
|
|
492
|
+
setMessages((prev) => {
|
|
493
|
+
// Replace the last action badge for this tool with the updated one
|
|
494
|
+
let idx = -1;
|
|
495
|
+
for (let i = prev.length - 1; i >= 0; i--) {
|
|
496
|
+
if (prev[i].role === 'action' && prev[i].action?.name === toolCall.toolName) {
|
|
497
|
+
idx = i;
|
|
498
|
+
break;
|
|
499
|
+
}
|
|
500
|
+
}
|
|
501
|
+
if (idx === -1)
|
|
502
|
+
return prev;
|
|
503
|
+
const updated = [...prev];
|
|
504
|
+
updated[idx] = {
|
|
505
|
+
...updated[idx],
|
|
506
|
+
text: label,
|
|
507
|
+
action: { name: toolCall.toolName, category: badgeConfig.category, result: resultSnippet },
|
|
508
|
+
};
|
|
509
|
+
return updated;
|
|
510
|
+
});
|
|
511
|
+
}
|
|
512
|
+
// Provide the tool output to the SDK. Fire-and-forget (no await) because
|
|
513
|
+
// onToolCall runs inside the Chat jobExecutor — awaiting addToolOutput
|
|
514
|
+
// here would deadlock. The queued job runs after the current transform
|
|
515
|
+
// finishes, updates the tool part to "output-available", and triggers
|
|
516
|
+
// sendAutomaticallyWhen to send a follow-up request.
|
|
517
|
+
console.debug('[onToolCall] addToolOutput for', toolCall.toolName, 'callId:', toolCall.toolCallId);
|
|
518
|
+
chatAddToolOutput({
|
|
519
|
+
toolCallId: toolCall.toolCallId,
|
|
520
|
+
tool: toolCall.toolName,
|
|
521
|
+
output: result,
|
|
522
|
+
});
|
|
523
|
+
},
|
|
524
|
+
});
|
|
525
|
+
const { playAudio, playStreamingAudio, stopAudio, suspendPlayback, resumePlayback, getAmplitude, initContext, applyVolume, analyser, } = useAudioPlayback({
|
|
526
|
+
volumeRef,
|
|
527
|
+
speedRef,
|
|
528
|
+
onPlaybackEnd: () => {
|
|
529
|
+
// If barge-in is pending (audio muted while we verify noise vs speech),
|
|
530
|
+
// record that playback ended so resumeFromBargeIn knows not to unmute.
|
|
531
|
+
if (bargeInPendingRef.current) {
|
|
532
|
+
playbackEndedDuringBargeInRef.current = true;
|
|
533
|
+
return;
|
|
534
|
+
}
|
|
535
|
+
if (stateRef.current === 'AI_SPEAKING') {
|
|
536
|
+
// LLM included [END_SESSION] — farewell just played, close the session
|
|
537
|
+
if (sessionEndingRef.current) {
|
|
538
|
+
sessionEndingRef.current = false;
|
|
539
|
+
textPipelineRef.current = false;
|
|
540
|
+
stateRef.current = 'IDLE';
|
|
541
|
+
setState('IDLE');
|
|
542
|
+
setSessionEnded(true);
|
|
543
|
+
return;
|
|
544
|
+
}
|
|
545
|
+
const nextState = textPipelineRef.current ? 'IDLE' : 'LISTENING';
|
|
546
|
+
textPipelineRef.current = false;
|
|
547
|
+
setTimeout(() => {
|
|
548
|
+
if (stateRef.current === 'AI_SPEAKING') {
|
|
549
|
+
setState(nextState);
|
|
550
|
+
}
|
|
551
|
+
}, GUARD_DELAY_MS);
|
|
552
|
+
}
|
|
553
|
+
},
|
|
554
|
+
});
|
|
555
|
+
/**
|
|
556
|
+
* Resume TTS playback after a false barge-in (noise was not real speech).
|
|
557
|
+
* Unfreezes the AudioContext so all paused sources continue from where they
|
|
558
|
+
* stopped. If audio ended during the brief async suspension race window,
|
|
559
|
+
* transitions normally instead.
|
|
560
|
+
*/
|
|
561
|
+
const resumeFromBargeIn = useCallback(() => {
|
|
562
|
+
// Guard: if the agent was stopped (IDLE) or barge-in was already resolved,
|
|
563
|
+
// don't resurrect state — stop() clears bargeInPendingRef.
|
|
564
|
+
if (stateRef.current === 'IDLE' || !bargeInPendingRef.current)
|
|
565
|
+
return;
|
|
566
|
+
bargeInPendingRef.current = false;
|
|
567
|
+
if (playbackEndedDuringBargeInRef.current) {
|
|
568
|
+
// Audio ended during the async suspension window — transition normally
|
|
569
|
+
playbackEndedDuringBargeInRef.current = false;
|
|
570
|
+
resumePlayback();
|
|
571
|
+
const nextState = textPipelineRef.current ? 'IDLE' : 'LISTENING';
|
|
572
|
+
textPipelineRef.current = false;
|
|
573
|
+
stateRef.current = nextState;
|
|
574
|
+
setState(nextState);
|
|
575
|
+
}
|
|
576
|
+
else {
|
|
577
|
+
// Audio still frozen — unfreeze and continue playback
|
|
578
|
+
resumePlayback();
|
|
579
|
+
stateRef.current = 'AI_SPEAKING';
|
|
580
|
+
setState('AI_SPEAKING');
|
|
581
|
+
}
|
|
582
|
+
}, [resumePlayback]);
|
|
583
|
+
const handleSpeechEnd = useCallback(async (audio) => {
|
|
584
|
+
if (stateRef.current !== 'USER_SPEAKING' && stateRef.current !== 'LISTENING')
|
|
585
|
+
return;
|
|
586
|
+
// Reset session-ending flag so a barge-in during farewell TTS doesn't
|
|
587
|
+
// cause premature session close on the next turn.
|
|
588
|
+
sessionEndingRef.current = false;
|
|
589
|
+
const wasBargeIn = bargeInPendingRef.current;
|
|
590
|
+
// Block concurrent pipelines — but allow barge-in noise processing
|
|
591
|
+
// even when the first pipeline still holds processingRef.
|
|
592
|
+
if (processingRef.current && !wasBargeIn)
|
|
593
|
+
return;
|
|
594
|
+
// Energy gate BEFORE entering PROCESSING — quiet audio should not
|
|
595
|
+
// reset the idle timer or trigger any visible state change.
|
|
596
|
+
const rms = computeRMS(audio);
|
|
597
|
+
if (rms < settingsRef.current.minAudioRms) {
|
|
598
|
+
console.debug(`[VoiceAgent] Audio too quiet (RMS=${rms.toFixed(4)}), discarding`);
|
|
599
|
+
if (wasBargeIn) {
|
|
600
|
+
console.debug('[VoiceAgent] False barge-in (RMS gate), resuming TTS');
|
|
601
|
+
resumeFromBargeIn();
|
|
602
|
+
}
|
|
603
|
+
else {
|
|
604
|
+
setState('LISTENING');
|
|
605
|
+
}
|
|
606
|
+
return;
|
|
607
|
+
}
|
|
608
|
+
// Track whether we "own" processingRef — during barge-in the first
|
|
609
|
+
// pipeline already holds it and its finally block will clean up.
|
|
610
|
+
let ownProcessing = !processingRef.current;
|
|
611
|
+
processingRef.current = true;
|
|
612
|
+
// Only abort previous TTS if this is NOT a pending barge-in.
|
|
613
|
+
// For barge-in, we defer the abort until we confirm it's real speech.
|
|
614
|
+
if (!wasBargeIn) {
|
|
615
|
+
abortRef.current?.abort();
|
|
616
|
+
abortRef.current = null;
|
|
617
|
+
}
|
|
618
|
+
// Clear any lingering error from previous attempt (e.g. "Didn't catch that")
|
|
619
|
+
setVoiceError(null);
|
|
620
|
+
// During barge-in, defer PROCESSING state and transcript clear until we've
|
|
621
|
+
// confirmed real speech — avoids flicker if it turns out to be noise.
|
|
622
|
+
if (!wasBargeIn) {
|
|
623
|
+
setState('PROCESSING');
|
|
624
|
+
setCurrentTranscript('');
|
|
625
|
+
}
|
|
626
|
+
roundTripCountRef.current = 0; // Reset for new user turn
|
|
627
|
+
lastAutoSendMsgIdRef.current = null;
|
|
628
|
+
const t0 = performance.now();
|
|
629
|
+
const timings = {
|
|
630
|
+
pipeline: 'voice',
|
|
631
|
+
speechDurationMs: (audio.length / 16000) * 1000,
|
|
632
|
+
timestamp: Date.now(),
|
|
633
|
+
};
|
|
634
|
+
// End-to-end pipeline timeout — prevents hanging due to network or GPU issues
|
|
635
|
+
const pipelineAc = new AbortController();
|
|
636
|
+
const pipelineTimer = setTimeout(() => pipelineAc.abort(), PIPELINE_TIMEOUT_MS);
|
|
637
|
+
try {
|
|
638
|
+
// 1. Convert to WAV and transcribe (with retry)
|
|
639
|
+
const tWav0 = performance.now();
|
|
640
|
+
const wavBlob = float32ToWav(audio, 16000);
|
|
641
|
+
timings.wavEncodeMs = performance.now() - tWav0;
|
|
642
|
+
timings.wavSizeBytes = wavBlob.size;
|
|
643
|
+
let text;
|
|
644
|
+
let noSpeechProb = 0;
|
|
645
|
+
let avgLogprob = 0;
|
|
646
|
+
let sttRetries = 0;
|
|
647
|
+
const tStt0 = performance.now();
|
|
648
|
+
for (let attempt = 0; attempt <= MAX_STT_RETRIES; attempt++) {
|
|
649
|
+
try {
|
|
650
|
+
const result = await transcribeAudio(wavBlob, undefined, settingsRef.current.sttTimeoutMs);
|
|
651
|
+
text = result.text;
|
|
652
|
+
noSpeechProb = result.noSpeechProb ?? 0;
|
|
653
|
+
avgLogprob = result.avgLogprob ?? 0;
|
|
654
|
+
sttRetryCount.current = 0;
|
|
655
|
+
break;
|
|
656
|
+
}
|
|
657
|
+
catch {
|
|
658
|
+
sttRetries = attempt + 1;
|
|
659
|
+
if (attempt < MAX_STT_RETRIES) {
|
|
660
|
+
await new Promise((r) => setTimeout(r, RETRY_BASE_DELAY_MS * (attempt + 1)));
|
|
661
|
+
}
|
|
662
|
+
else {
|
|
663
|
+
timings.sttMs = performance.now() - tStt0;
|
|
664
|
+
timings.sttRetries = sttRetries;
|
|
665
|
+
timings.totalMs = performance.now() - t0;
|
|
666
|
+
logTimings(timings);
|
|
667
|
+
setLastTimings(timings);
|
|
668
|
+
if (wasBargeIn) {
|
|
669
|
+
console.debug('[VoiceAgent] False barge-in (STT failed), resuming TTS');
|
|
670
|
+
resumeFromBargeIn();
|
|
671
|
+
}
|
|
672
|
+
else {
|
|
673
|
+
setVoiceError('stt_failed');
|
|
674
|
+
setTimeout(() => {
|
|
675
|
+
setVoiceError((prev) => (prev === 'stt_failed' ? null : prev));
|
|
676
|
+
}, MISFIRE_DISMISS_MS);
|
|
677
|
+
setState('LISTENING');
|
|
678
|
+
}
|
|
679
|
+
return;
|
|
680
|
+
}
|
|
681
|
+
}
|
|
682
|
+
}
|
|
683
|
+
timings.sttMs = performance.now() - tStt0;
|
|
684
|
+
timings.sttRetries = sttRetries;
|
|
685
|
+
// Pipeline timeout check — bail after STT if we've exceeded the budget
|
|
686
|
+
if (pipelineAc.signal.aborted) {
|
|
687
|
+
console.warn('[VoiceAgent] Pipeline timeout after STT');
|
|
688
|
+
timings.totalMs = performance.now() - t0;
|
|
689
|
+
logTimings(timings);
|
|
690
|
+
setLastTimings(timings);
|
|
691
|
+
setState('LISTENING');
|
|
692
|
+
return;
|
|
693
|
+
}
|
|
694
|
+
// Filter out non-speech using Whisper's quality signals:
|
|
695
|
+
// - no_speech_prob: model's estimate that segment contains no speech
|
|
696
|
+
// - avg_logprob: mean token confidence (more negative = less sure)
|
|
697
|
+
// Coughs/noise produce no_speech_prob ≈ 0 but avg_logprob ≈ -0.9
|
|
698
|
+
if (noSpeechProb > MAX_NO_SPEECH_PROB || avgLogprob < MIN_AVG_LOGPROB) {
|
|
699
|
+
console.debug(`[VoiceAgent] Low-confidence STT (no_speech_prob=${noSpeechProb.toFixed(3)}, avg_logprob=${avgLogprob.toFixed(3)}), discarding`);
|
|
700
|
+
timings.totalMs = performance.now() - t0;
|
|
701
|
+
logTimings(timings);
|
|
702
|
+
setLastTimings(timings);
|
|
703
|
+
if (wasBargeIn) {
|
|
704
|
+
console.debug('[VoiceAgent] False barge-in (low STT confidence), resuming TTS');
|
|
705
|
+
resumeFromBargeIn();
|
|
706
|
+
}
|
|
707
|
+
else {
|
|
708
|
+
setState('LISTENING');
|
|
709
|
+
}
|
|
710
|
+
return;
|
|
711
|
+
}
|
|
712
|
+
// Filter out Whisper ghost transcriptions:
|
|
713
|
+
// 1. Punctuation/symbol-only output (e.g. ".", "...", "!")
|
|
714
|
+
// 2. Known hallucinated phrases on near-silent audio (e.g. "Thank you.")
|
|
715
|
+
const trimmed = (text ?? '').trim();
|
|
716
|
+
const cleaned = trimmed.replace(/[\s\p{P}\p{S}]+/gu, '');
|
|
717
|
+
const isGhost = cleaned.length === 0 || WHISPER_HALLUCINATIONS.has(trimmed.toLowerCase());
|
|
718
|
+
if (!text || isGhost) {
|
|
719
|
+
console.debug('[VoiceAgent] Discarded ghost transcription:', JSON.stringify(text));
|
|
720
|
+
timings.totalMs = performance.now() - t0;
|
|
721
|
+
logTimings(timings);
|
|
722
|
+
setLastTimings(timings);
|
|
723
|
+
if (wasBargeIn) {
|
|
724
|
+
console.debug('[VoiceAgent] False barge-in (ghost transcription), resuming TTS');
|
|
725
|
+
resumeFromBargeIn();
|
|
726
|
+
}
|
|
727
|
+
else {
|
|
728
|
+
setState('LISTENING');
|
|
729
|
+
}
|
|
730
|
+
return;
|
|
731
|
+
}
|
|
732
|
+
// Confirmed real speech — if this was a barge-in, now fully stop the old TTS
|
|
733
|
+
if (wasBargeIn) {
|
|
734
|
+
console.debug('[VoiceAgent] Barge-in confirmed (real speech), stopping old TTS');
|
|
735
|
+
bargeInPendingRef.current = false;
|
|
736
|
+
playbackEndedDuringBargeInRef.current = false;
|
|
737
|
+
stopAudio();
|
|
738
|
+
abortRef.current?.abort();
|
|
739
|
+
abortRef.current = null;
|
|
740
|
+
// Now safe to show PROCESSING state — speech is real, not noise
|
|
741
|
+
setState('PROCESSING');
|
|
742
|
+
setCurrentTranscript('');
|
|
743
|
+
// The old pipeline was aborted above — take ownership of processingRef
|
|
744
|
+
// so this pipeline processes the barge-in speech instead of dropping it.
|
|
745
|
+
// Previously this returned to LISTENING, forcing the user to repeat.
|
|
746
|
+
if (!ownProcessing) {
|
|
747
|
+
ownProcessing = true;
|
|
748
|
+
}
|
|
749
|
+
}
|
|
750
|
+
// Store user message
|
|
751
|
+
setCurrentTranscript(text);
|
|
752
|
+
setMessages((prev) => [
|
|
753
|
+
...prev,
|
|
754
|
+
{ role: 'user', text: text, timestamp: Date.now() },
|
|
755
|
+
]);
|
|
756
|
+
// 2. Send to LLM via useChat — response handled in onFinish callback
|
|
757
|
+
const tLlm0 = performance.now();
|
|
758
|
+
try {
|
|
759
|
+
await chatSendMessage({ text: text });
|
|
760
|
+
}
|
|
761
|
+
catch (llmErr) {
|
|
762
|
+
console.error('LLM error:', llmErr);
|
|
763
|
+
timings.llmSendMs = performance.now() - tLlm0;
|
|
764
|
+
timings.totalMs = performance.now() - t0;
|
|
765
|
+
logTimings(timings);
|
|
766
|
+
setLastTimings(timings);
|
|
767
|
+
setTransientLLMError();
|
|
768
|
+
setState('LISTENING');
|
|
769
|
+
return;
|
|
770
|
+
}
|
|
771
|
+
timings.llmSendMs = performance.now() - tLlm0;
|
|
772
|
+
// TTS is handled by onFinish callback — no need to await response here
|
|
773
|
+
timings.totalMs = performance.now() - t0;
|
|
774
|
+
logTimings(timings);
|
|
775
|
+
setLastTimings(timings);
|
|
776
|
+
}
|
|
777
|
+
catch (err) {
|
|
778
|
+
timings.totalMs = performance.now() - t0;
|
|
779
|
+
logTimings(timings);
|
|
780
|
+
setLastTimings(timings);
|
|
781
|
+
if (err.name !== 'AbortError') {
|
|
782
|
+
console.error('Voice agent error:', err);
|
|
783
|
+
setVoiceError(classifyError(err));
|
|
784
|
+
}
|
|
785
|
+
const s = stateRef.current;
|
|
786
|
+
if (s === 'PROCESSING' || s === 'AI_SPEAKING') {
|
|
787
|
+
setState('LISTENING');
|
|
788
|
+
}
|
|
789
|
+
}
|
|
790
|
+
finally {
|
|
791
|
+
clearTimeout(pipelineTimer);
|
|
792
|
+
if (ownProcessing)
|
|
793
|
+
processingRef.current = false;
|
|
794
|
+
}
|
|
795
|
+
}, [chatSendMessage, playAudio, playStreamingAudio, stopAudio, resumePlayback, resumeFromBargeIn]);
|
|
796
|
+
const handleBargeIn = useCallback(() => {
    // Phase 1 of barge-in: freeze (suspend) the AudioContext instead of
    // tearing playback down. Scheduled sources pause in place, and new TTS
    // chunks keep being scheduled on the frozen context, so nothing is lost
    // either way:
    //   - real speech -> handleSpeechEnd fully stops the old TTS later
    //   - just noise  -> resumeFromBargeIn() unfreezes with zero content loss
    //
    // Flags must be set BEFORE suspending: onPlaybackEnd may fire during the
    // async suspension window and relies on bargeInPendingRef being true.
    // A duplicate barge-in must not wipe a "playback ended" signal left by
    // the first one, so the ended flag is only reset on a fresh barge-in.
    const isFreshBargeIn = !bargeInPendingRef.current;
    if (isFreshBargeIn) {
        playbackEndedDuringBargeInRef.current = false;
    }
    bargeInPendingRef.current = true;
    suspendPlayback();
    // Update the ref synchronously so VAD callbacks observe the new state
    // before React re-renders.
    stateRef.current = 'USER_SPEAKING';
    setState('USER_SPEAKING');
    bargeInFrames.current = 0;
}, [suspendPlayback]);
|
|
815
|
+
// Stable callback refs for useTenVAD (avoids re-creating the hook when the
// callbacks' closures change between renders).
const handleSpeechEndRef = useRef(handleSpeechEnd);
const handleBargeInRef = useRef(handleBargeIn);
// Deliberately no dependency array: re-sync both refs after every render so
// the VAD hook always dispatches into the latest closures.
useEffect(() => {
    handleSpeechEndRef.current = handleSpeechEnd;
    handleBargeInRef.current = handleBargeIn;
});
|
|
824
|
+
const vad = useTenVAD({
    startOnLoad: false,
    ...VAD_CONFIG,
    // User settings take precedence over the shipped VAD_CONFIG defaults.
    positiveSpeechThreshold: settings.speechThreshold,
    negativeSpeechThreshold: Math.max(0.1, settings.speechThreshold - 0.25),
    redemptionMs: settings.pauseToleranceMs,
    onSpeechStart: () => {
        if (stateRef.current !== 'LISTENING')
            return;
        setState('USER_SPEAKING');
    },
    // Dispatch through a ref so the VAD hook keeps a stable callback.
    onSpeechEnd: (audio) => handleSpeechEndRef.current(audio),
    onVADMisfire: () => {
        // A misfire during a pending barge-in means playback was suspended
        // for nothing — unfreeze the TTS instead of surfacing an error.
        if (bargeInPendingRef.current) {
            console.debug('[VoiceAgent] False barge-in (VAD misfire), resuming TTS');
            resumeFromBargeIn();
            return;
        }
        if (stateRef.current !== 'USER_SPEAKING')
            return;
        setState('LISTENING');
        setVoiceError('speech_too_short');
        // Auto-dismiss the transient hint after a short delay.
        setTimeout(() => {
            setVoiceError((prev) => (prev === 'speech_too_short' ? null : prev));
        }, MISFIRE_DISMISS_MS);
    },
    onFrameProcessed: (probabilities) => {
        if (!bargeInEnabled || stateRef.current !== 'AI_SPEAKING')
            return;
        // Barge-in requires BOTH a confident speech probability AND enough
        // energy. Without the RMS gate, quiet sounds (speaker bleed, ambient
        // noise) can interrupt playback on VAD probability alone.
        const speechLike = probabilities.isSpeech > settings.bargeInThreshold;
        const loudEnough = probabilities.rms >= settings.minAudioRms;
        if (speechLike && loudEnough) {
            bargeInFrames.current += 1;
            if (bargeInFrames.current >= BARGE_IN.framesRequired) {
                handleBargeInRef.current();
            }
        }
        else {
            bargeInFrames.current = 0;
        }
    },
});
|
|
871
|
+
// Map VAD failures (mic permission denied, missing input device, model load
// failure) onto user-facing voice error codes.
useEffect(() => {
    if (!vad.errored)
        return;
    const errMsg = typeof vad.errored === 'object' && 'message' in vad.errored
        ? vad.errored.message
        : String(vad.errored);
    let code = 'vad_load_failed';
    if (errMsg.includes('Permission') || errMsg.includes('NotAllowed')) {
        code = 'mic_denied';
    }
    else if (errMsg.includes('NotFound') || errMsg.includes('no audio')) {
        code = 'mic_unavailable';
    }
    setVoiceError(code);
}, [vad.errored]);
|
|
888
|
+
// Clears the current user-facing voice error, if any.
const dismissError = useCallback(() => {
    setVoiceError(null);
}, []);
|
|
889
|
+
const start = useCallback(() => {
    // Debounce rapid mic toggles (double-click protection).
    const now = Date.now();
    if (now - lastToggleRef.current < MIC_TOGGLE_DEBOUNCE_MS)
        return;
    lastToggleRef.current = now;
    // Abort any running pipeline to prevent ghost state transitions
    abortRef.current?.abort();
    abortRef.current = null;
    processingRef.current = false;
    // Reset all session/conversation state for a fresh run.
    setVoiceError(null);
    setMessages([]);
    setCurrentTranscript('');
    sessionEndingRef.current = false;
    setSessionEnded(false);
    setChatMessages([]);
    roundTripCountRef.current = 0;
    actionSeqRef.current = 0;
    setState('LISTENING');
    vad.start();
    // Non-blocking LLM health check — warn user early if AI service is down.
    // Fix: the promise previously had no rejection handler, so a network
    // failure inside checkLLMHealth() surfaced as an unhandled rejection.
    // A rejected health check means the LLM is unreachable, so treat it
    // the same as an explicit "unavailable" response.
    checkLLMHealth()
        .then(({ available, message }) => {
        if (!available) {
            console.warn('[VoiceAgent] LLM unavailable:', message);
            setVoiceError('llm_failed');
        }
    })
        .catch((err) => {
        console.warn('[VoiceAgent] LLM health check failed:', err);
        setVoiceError('llm_failed');
    });
}, [vad, setChatMessages]);
|
|
916
|
+
const stop = useCallback((force) => {
    // Debounce unless forced (programmatic session end bypasses it).
    if (!force) {
        const now = Date.now();
        if (now - lastToggleRef.current < MIC_TOGGLE_DEBOUNCE_MS)
            return;
        lastToggleRef.current = now;
    }
    stopAudio(); // also clears suspension state
    abortRef.current?.abort();
    // Consistency fix: every other abort site in this hook nulls the ref
    // after aborting; leaving the spent controller here risks a later
    // redundant abort on an already-dead controller.
    abortRef.current = null;
    // Release the pipeline lock and clear all barge-in/session flags.
    processingRef.current = false;
    bargeInPendingRef.current = false;
    playbackEndedDuringBargeInRef.current = false;
    sessionEndingRef.current = false;
    vad.pause();
    setState('IDLE');
}, [vad, stopAudio]);
|
|
932
|
+
// Tear everything down on unmount: stop audio output, cancel any in-flight
// pipeline, and release the processing lock.
useEffect(() => () => {
    stopAudio();
    abortRef.current?.abort();
    processingRef.current = false;
}, [stopAudio]);
|
|
940
|
+
// Text input pipeline (same flow as voice, minus STT)
const sendTextMessage = useCallback(async (text) => {
    if (!text.trim())
        return;
    sessionEndingRef.current = false;
    // Reject overlapping sends: the voice pipeline and a previous text send
    // share the same processing lock.
    if (processingRef.current) {
        setVoiceError('processing');
        setTimeout(() => {
            setVoiceError((prev) => (prev === 'processing' ? null : prev));
        }, MISFIRE_DISMISS_MS);
        return;
    }
    processingRef.current = true;
    textPipelineRef.current = true;
    roundTripCountRef.current = 0; // Reset for new user turn
    lastAutoSendMsgIdRef.current = null;
    const t0 = performance.now();
    const timings = { pipeline: 'text', timestamp: Date.now() };
    // Fix: wrap the body in try/finally so processingRef can never leak
    // `true` if a timing/log/state call throws after the send — a stuck
    // lock would reject all further sends with a perpetual 'processing'
    // error. Behavior on the happy and LLM-error paths is unchanged.
    try {
        setMessages((prev) => [...prev, { role: 'user', text, timestamp: Date.now() }]);
        setCurrentTranscript(text);
        setState('PROCESSING');
        const tLlm0 = performance.now();
        try {
            await chatSendMessage({ text });
        }
        catch (llmErr) {
            console.error('LLM error:', llmErr);
            timings.llmSendMs = performance.now() - tLlm0;
            timings.totalMs = performance.now() - t0;
            logTimings(timings);
            setLastTimings(timings);
            setTransientLLMError();
            setState('IDLE');
            // The LLM turn failed — nothing for onFinish to pick up.
            textPipelineRef.current = false;
            return;
        }
        timings.llmSendMs = performance.now() - tLlm0;
        // TTS handled by onFinish callback — textPipelineRef deliberately
        // stays true on success so onFinish knows this turn came from the
        // text pipeline.
        timings.totalMs = performance.now() - t0;
        logTimings(timings);
        setLastTimings(timings);
    }
    finally {
        processingRef.current = false;
    }
}, [chatSendMessage]);
|
|
984
|
+
return {
    // State-machine value ('IDLE' | 'LISTENING' | 'USER_SPEAKING' |
    // 'PROCESSING' | 'AI_SPEAKING' — the values set via setState above).
    state,
    // Begin a session: resets conversation state and starts the VAD mic loop.
    start,
    // End a session: stops audio/pipeline, pauses VAD. Takes a `force` flag
    // that bypasses the mic-toggle debounce.
    stop,
    // True while the VAD (model/mic) is still loading.
    loading: vad.loading,
    // Raw VAD error object/value (mic denied, model load failure), if any.
    error: vad.errored,
    // Classified, user-facing error code (e.g. 'stt_failed', 'mic_denied').
    voiceError,
    // Clears voiceError.
    dismissError,
    // Conversation transcript entries: { role, text, timestamp }.
    messages,
    // Text of the user utterance currently being shown/processed.
    currentTranscript,
    // True while the chat request is in flight or streaming.
    isLLMLoading: chatStatus === 'streaming' || chatStatus === 'submitted',
    // Playback helpers — presumably provided by the audio-playback hook
    // (see useAudioPlayback); their exact contracts are defined there.
    getAmplitude,
    initContext,
    applyVolume,
    analyser,
    // Text-input pipeline (same flow as voice, minus STT).
    sendTextMessage,
    // Timing metrics recorded by the most recent pipeline run.
    lastTimings,
    // Flag raised when the session has been ended (reset to false in start()).
    sessionEnded,
    // Effective user-adjustable settings (thresholds, timeouts).
    settings,
};
|
|
1004
|
+
}
|
|
1005
|
+
//# sourceMappingURL=useVoiceAgent.js.map
|