@iaforged/context-code 1.1.4 → 1.1.7
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/README.md +32 -8
- package/dist/src/commands/init.js +91 -219
- package/dist/src/commands/voice/index.js +6 -7
- package/dist/src/commands/voice/voice.js +87 -43
- package/dist/src/commands.js +1 -3
- package/dist/src/components/LogoV2/VoiceModeNotice.js +1 -1
- package/dist/src/components/PromptInput/VoiceIndicator.js +4 -4
- package/dist/src/components/Spinner.js +18 -18
- package/dist/src/constants/spinnerVerbs.js +9 -9
- package/dist/src/hooks/usePasteHandler.js +8 -8
- package/dist/src/hooks/useVoice.js +93 -805
- package/dist/src/hooks/useVoiceEnabled.js +3 -15
- package/dist/src/hooks/useVoiceIntegration.js +6 -25
- package/dist/src/keybindings/defaultBindings.js +9 -6
- package/dist/src/screens/REPL.js +10 -22
- package/dist/src/services/localDictation.js +520 -0
- package/dist/src/services/voice.js +9 -7
- package/dist/src/state/AppState.js +1 -3
- package/dist/src/tools/ConfigTool/ConfigTool.js +12 -15
- package/dist/src/tools/ConfigTool/supportedSettings.js +2 -2
- package/dist/src/utils/imagePaste.js +11 -5
- package/dist/src/utils/model/model.js +2 -0
- package/dist/src/utils/settings/types.js +2 -2
- package/dist/src/voice/voiceModeEnabled.js +5 -25
- package/dist/vendor/audio-capture/arm64-darwin/audio-capture.node +0 -0
- package/dist/vendor/audio-capture/arm64-linux/audio-capture.node +0 -0
- package/dist/vendor/audio-capture/arm64-win32/audio-capture.node +0 -0
- package/dist/vendor/audio-capture/x64-darwin/audio-capture.node +0 -0
- package/dist/vendor/audio-capture/x64-linux/audio-capture.node +0 -0
- package/dist/vendor/audio-capture/x64-win32/audio-capture.node +0 -0
- package/dist/vendor/audio-capture-src/index.js +114 -0
- package/dist/vendor/audio-capture-src/index.ts +155 -0
- package/docs/comandos.md +132 -121
- package/package.json +1 -1
|
@@ -1,159 +1,67 @@
|
|
|
1
|
-
import { createRequire } from 'module';
|
|
2
|
-
const require = createRequire(import.meta.url);
|
|
3
|
-
// React hook for hold-to-talk voice input using Anthropic voice_stream STT.
|
|
4
|
-
//
|
|
5
|
-
// Hold the keybinding to record; release to stop and submit. Auto-repeat
|
|
6
|
-
// key events reset an internal timer — when no keypress arrives within
|
|
7
|
-
// RELEASE_TIMEOUT_MS the recording stops automatically. Uses the native
|
|
8
|
-
// audio module (macOS) or SoX for recording, and Anthropic's voice_stream
|
|
9
|
-
// endpoint (conversation_engine) for STT.
|
|
10
1
|
import { useCallback, useEffect, useRef, useState } from 'react';
|
|
11
2
|
import { useSetVoiceState } from '../context/voice.js';
|
|
12
|
-
import {
|
|
13
|
-
import { logEvent, } from '../services/analytics/index.js';
|
|
14
|
-
import { getVoiceKeyterms } from '../services/voiceKeyterms.js';
|
|
15
|
-
import { connectVoiceStream, isVoiceStreamAvailable, } from '../services/voiceStreamSTT.js';
|
|
3
|
+
import { checkLocalDictationConfiguration, transcribePcmBuffers, } from '../services/localDictation.js';
|
|
16
4
|
import { logForDebugging } from '../utils/debug.js';
|
|
17
|
-
import { toError } from '../utils/errors.js';
|
|
18
|
-
import { getSystemLocaleLanguage } from '../utils/intl.js';
|
|
19
5
|
import { logError } from '../utils/log.js';
|
|
20
6
|
import { getInitialSettings } from '../utils/settings/settings.js';
|
|
21
|
-
|
|
22
|
-
// ─── Language normalization ─────────────────────────────────────────────
|
|
23
|
-
const DEFAULT_STT_LANGUAGE = 'en';
|
|
24
|
-
// Maps language names (English and native) to BCP-47 codes supported by
|
|
25
|
-
// the voice_stream Deepgram backend. Keys must be lowercase.
|
|
26
|
-
//
|
|
27
|
-
// This list must be a SUBSET of the server-side supported_language_codes
|
|
28
|
-
// allowlist (GrowthBook: speech_to_text_voice_stream_config).
|
|
29
|
-
// If the CLI sends a code the server rejects, the WebSocket closes with
|
|
30
|
-
// 1008 "Unsupported language" and voice breaks. Unsupported languages
|
|
31
|
-
// fall back to DEFAULT_STT_LANGUAGE so recording still works.
|
|
7
|
+
const DEFAULT_STT_LANGUAGE = 'es';
|
|
32
8
|
const LANGUAGE_NAME_TO_CODE = {
|
|
33
9
|
english: 'en',
|
|
34
10
|
spanish: 'es',
|
|
35
|
-
español: 'es',
|
|
36
11
|
espanol: 'es',
|
|
37
12
|
french: 'fr',
|
|
38
|
-
français: 'fr',
|
|
39
13
|
francais: 'fr',
|
|
40
14
|
japanese: 'ja',
|
|
41
|
-
日本語: 'ja',
|
|
42
15
|
german: 'de',
|
|
43
16
|
deutsch: 'de',
|
|
44
17
|
portuguese: 'pt',
|
|
45
|
-
português: 'pt',
|
|
46
18
|
portugues: 'pt',
|
|
47
19
|
italian: 'it',
|
|
48
|
-
italiano: 'it',
|
|
49
20
|
korean: 'ko',
|
|
50
|
-
한국어: 'ko',
|
|
51
21
|
hindi: 'hi',
|
|
52
|
-
हिन्दी: 'hi',
|
|
53
|
-
हिंदी: 'hi',
|
|
54
22
|
indonesian: 'id',
|
|
55
|
-
'bahasa indonesia': 'id',
|
|
56
|
-
bahasa: 'id',
|
|
57
23
|
russian: 'ru',
|
|
58
|
-
русский: 'ru',
|
|
59
24
|
polish: 'pl',
|
|
60
|
-
polski: 'pl',
|
|
61
25
|
turkish: 'tr',
|
|
62
|
-
türkçe: 'tr',
|
|
63
|
-
turkce: 'tr',
|
|
64
26
|
dutch: 'nl',
|
|
65
|
-
nederlands: 'nl',
|
|
66
27
|
ukrainian: 'uk',
|
|
67
|
-
українська: 'uk',
|
|
68
28
|
greek: 'el',
|
|
69
|
-
ελληνικά: 'el',
|
|
70
29
|
czech: 'cs',
|
|
71
|
-
čeština: 'cs',
|
|
72
|
-
cestina: 'cs',
|
|
73
30
|
danish: 'da',
|
|
74
|
-
dansk: 'da',
|
|
75
31
|
swedish: 'sv',
|
|
76
|
-
svenska: 'sv',
|
|
77
32
|
norwegian: 'no',
|
|
78
|
-
norsk: 'no',
|
|
79
33
|
};
|
|
80
|
-
// Subset of the GrowthBook speech_to_text_voice_stream_config allowlist.
|
|
81
|
-
// Sending a code not in the server allowlist closes the connection.
|
|
82
|
-
const SUPPORTED_LANGUAGE_CODES = new Set([
|
|
83
|
-
'en',
|
|
84
|
-
'es',
|
|
85
|
-
'fr',
|
|
86
|
-
'ja',
|
|
87
|
-
'de',
|
|
88
|
-
'pt',
|
|
89
|
-
'it',
|
|
90
|
-
'ko',
|
|
91
|
-
'hi',
|
|
92
|
-
'id',
|
|
93
|
-
'ru',
|
|
94
|
-
'pl',
|
|
95
|
-
'tr',
|
|
96
|
-
'nl',
|
|
97
|
-
'uk',
|
|
98
|
-
'el',
|
|
99
|
-
'cs',
|
|
100
|
-
'da',
|
|
101
|
-
'sv',
|
|
102
|
-
'no',
|
|
103
|
-
]);
|
|
104
|
-
// Normalize a language preference string (from settings.language) to a
|
|
105
|
-
// BCP-47 code supported by the voice_stream endpoint. Returns the
|
|
106
|
-
// default language if the input cannot be resolved. When the input is
|
|
107
|
-
// non-empty but unsupported, fellBackFrom is set to the original input so
|
|
108
|
-
// callers can surface a warning.
|
|
109
34
|
export function normalizeLanguageForSTT(language) {
|
|
110
35
|
if (!language)
|
|
111
36
|
return { code: DEFAULT_STT_LANGUAGE };
|
|
112
|
-
|
|
37
|
+
// Normaliza eliminando tildes/diacriticos para que "Español" -> "espanol".
|
|
38
|
+
const lower = language
|
|
39
|
+
.toLowerCase()
|
|
40
|
+
.normalize('NFD')
|
|
41
|
+
.replace(/[\u0300-\u036f]/g, '')
|
|
42
|
+
.trim();
|
|
113
43
|
if (!lower)
|
|
114
44
|
return { code: DEFAULT_STT_LANGUAGE };
|
|
115
|
-
if (
|
|
45
|
+
if (lower === 'auto')
|
|
46
|
+
return { code: lower };
|
|
47
|
+
if (/^[a-z]{2}(-[a-z]{2})?$/i.test(lower))
|
|
116
48
|
return { code: lower };
|
|
117
49
|
const fromName = LANGUAGE_NAME_TO_CODE[lower];
|
|
118
50
|
if (fromName)
|
|
119
51
|
return { code: fromName };
|
|
120
|
-
const base = lower.split('-')[0];
|
|
121
|
-
if (base && SUPPORTED_LANGUAGE_CODES.has(base))
|
|
122
|
-
return { code: base };
|
|
123
52
|
return { code: DEFAULT_STT_LANGUAGE, fellBackFrom: language };
|
|
124
53
|
}
|
|
125
54
|
let voiceModule = null;
|
|
126
|
-
// Gap (ms) between auto-repeat key events that signals key release.
|
|
127
|
-
// Terminal auto-repeat typically fires every 30-80ms; 200ms comfortably
|
|
128
|
-
// covers jitter while still feeling responsive.
|
|
129
55
|
const RELEASE_TIMEOUT_MS = 200;
|
|
130
|
-
// Fallback (ms) to arm the release timer if no auto-repeat is seen.
|
|
131
|
-
// macOS default key repeat delay is ~500ms; 600ms gives headroom.
|
|
132
|
-
// If the user tapped and released before auto-repeat started, this
|
|
133
|
-
// ensures the release timer gets armed and recording stops.
|
|
134
|
-
//
|
|
135
|
-
// For modifier-combo first-press activation (handleKeyEvent called at
|
|
136
|
-
// t=0, before any auto-repeat), callers should pass FIRST_PRESS_FALLBACK_MS
|
|
137
|
-
// instead — the gap to the next keypress is the OS initial repeat *delay*
|
|
138
|
-
// (up to ~2s on macOS with slider at "Long"), not the repeat *rate*.
|
|
139
56
|
const REPEAT_FALLBACK_MS = 600;
|
|
140
57
|
export const FIRST_PRESS_FALLBACK_MS = 2000;
|
|
141
|
-
// How long (ms) to keep a focus-mode session alive without any speech
|
|
142
|
-
// before tearing it down to free the WebSocket connection. Re-arms on
|
|
143
|
-
// the next focus cycle (blur → refocus).
|
|
144
|
-
const FOCUS_SILENCE_TIMEOUT_MS = 5_000;
|
|
145
|
-
// Number of bars shown in the recording waveform visualizer.
|
|
146
58
|
const AUDIO_LEVEL_BARS = 16;
|
|
147
|
-
// Compute RMS amplitude from a 16-bit signed PCM buffer and return a
|
|
148
|
-
// normalized 0-1 value. A sqrt curve spreads quieter levels across more
|
|
149
|
-
// of the visual range so the waveform uses the full set of block heights.
|
|
150
59
|
export function computeLevel(chunk) {
|
|
151
|
-
const samples = chunk.length >> 1;
|
|
60
|
+
const samples = chunk.length >> 1;
|
|
152
61
|
if (samples === 0)
|
|
153
62
|
return 0;
|
|
154
63
|
let sumSq = 0;
|
|
155
64
|
for (let i = 0; i < chunk.length - 1; i += 2) {
|
|
156
|
-
// Read 16-bit signed little-endian
|
|
157
65
|
const sample = ((chunk[i] | (chunk[i + 1] << 8)) << 16) >> 16;
|
|
158
66
|
sumSq += sample * sample;
|
|
159
67
|
}
|
|
@@ -161,90 +69,25 @@ export function computeLevel(chunk) {
|
|
|
161
69
|
const normalized = Math.min(rms / 2000, 1);
|
|
162
70
|
return Math.sqrt(normalized);
|
|
163
71
|
}
|
|
164
|
-
export function useVoice({ onTranscript, onError, enabled, focusMode, }) {
|
|
72
|
+
export function useVoice({ onTranscript, onError, enabled, focusMode: _focusMode, }) {
|
|
165
73
|
const [state, setState] = useState('idle');
|
|
166
74
|
const stateRef = useRef('idle');
|
|
167
|
-
const connectionRef = useRef(null);
|
|
168
|
-
const accumulatedRef = useRef('');
|
|
169
75
|
const onTranscriptRef = useRef(onTranscript);
|
|
170
76
|
const onErrorRef = useRef(onError);
|
|
171
|
-
const cleanupTimerRef = useRef(null);
|
|
172
77
|
const releaseTimerRef = useRef(null);
|
|
173
|
-
// True once we've seen a second keypress (auto-repeat) while recording.
|
|
174
|
-
// The OS key repeat delay (~500ms on macOS) means the first keypress is
|
|
175
|
-
// solo — arming the release timer before auto-repeat starts would cause
|
|
176
|
-
// a false release.
|
|
177
|
-
const seenRepeatRef = useRef(false);
|
|
178
78
|
const repeatFallbackTimerRef = useRef(null);
|
|
179
|
-
|
|
180
|
-
|
|
181
|
-
const focusTriggeredRef = useRef(false);
|
|
182
|
-
// Timer that tears down the session after prolonged silence in focus mode.
|
|
183
|
-
const focusSilenceTimerRef = useRef(null);
|
|
184
|
-
// Set when a focus-mode session is torn down due to silence. Prevents
|
|
185
|
-
// the focus effect from immediately restarting. Cleared on blur so the
|
|
186
|
-
// next focus cycle re-arms recording.
|
|
187
|
-
const silenceTimedOutRef = useRef(false);
|
|
188
|
-
const recordingStartRef = useRef(0);
|
|
189
|
-
// Incremented on each startRecordingSession(). Callbacks capture their
|
|
190
|
-
// generation and bail if a newer session has started — prevents a zombie
|
|
191
|
-
// slow-connecting WS from an abandoned session from overwriting
|
|
192
|
-
// connectionRef mid-way through the next session.
|
|
193
|
-
const sessionGenRef = useRef(0);
|
|
194
|
-
// True if the early-error retry fired during this session.
|
|
195
|
-
// Tracked for the tengu_voice_recording_completed analytics event.
|
|
196
|
-
const retryUsedRef = useRef(false);
|
|
197
|
-
// Full audio captured this session, kept for silent-drop replay. ~1% of
|
|
198
|
-
// sessions get a sticky-broken CE pod that accepts audio but returns zero
|
|
199
|
-
// transcripts (anthropics/anthropic#287008 session-sticky variant); when
|
|
200
|
-
// finalize() resolves via no_data_timeout with hadAudioSignal=true, we
|
|
201
|
-
// replay the buffer on a fresh WS once. Bounded: 32KB/s × ~60s max ≈ 2MB.
|
|
202
|
-
const fullAudioRef = useRef([]);
|
|
203
|
-
const silentDropRetriedRef = useRef(false);
|
|
204
|
-
// Bumped when the early-error retry is scheduled. Captured per
|
|
205
|
-
// attemptConnect — onError swallows stale-gen events (conn 1's
|
|
206
|
-
// trailing close-error) but surfaces current-gen ones (conn 2's
|
|
207
|
-
// genuine failure). Same shape as sessionGenRef, one level down.
|
|
208
|
-
const attemptGenRef = useRef(0);
|
|
209
|
-
// Running total of chars flushed in focus mode (each final transcript is
|
|
210
|
-
// injected immediately and accumulatedRef reset). Added to transcriptChars
|
|
211
|
-
// in the completed event so focus-mode sessions don't false-positive as
|
|
212
|
-
// silent-drops (transcriptChars=0 despite successful transcription).
|
|
213
|
-
const focusFlushedCharsRef = useRef(0);
|
|
214
|
-
// True if at least one audio chunk with non-trivial signal was received.
|
|
215
|
-
// Used to distinguish "microphone is silent/inaccessible" from "speech not detected".
|
|
216
|
-
const hasAudioSignalRef = useRef(false);
|
|
217
|
-
// True once onReady fired for the current session. Unlike connectionRef
|
|
218
|
-
// (which cleanup() nulls), this survives effect-order races where Effect 3
|
|
219
|
-
// cleanup runs before Effect 2's finishRecording() — e.g. /voice toggled
|
|
220
|
-
// off mid-recording in focus mode. Used for the wsConnected analytics
|
|
221
|
-
// dimension and error-message branching. Reset in startRecordingSession.
|
|
222
|
-
const everConnectedRef = useRef(false);
|
|
79
|
+
const seenRepeatRef = useRef(false);
|
|
80
|
+
const recordingChunksRef = useRef([]);
|
|
223
81
|
const audioLevelsRef = useRef([]);
|
|
224
|
-
const isFocused = useTerminalFocus();
|
|
225
82
|
const setVoiceState = useSetVoiceState();
|
|
226
|
-
// Keep callback refs current without triggering re-renders
|
|
227
83
|
onTranscriptRef.current = onTranscript;
|
|
228
84
|
onErrorRef.current = onError;
|
|
229
85
|
function updateState(newState) {
|
|
230
86
|
stateRef.current = newState;
|
|
231
87
|
setState(newState);
|
|
232
|
-
setVoiceState(prev => {
|
|
233
|
-
if (prev.voiceState === newState)
|
|
234
|
-
return prev;
|
|
235
|
-
return { ...prev, voiceState: newState };
|
|
236
|
-
});
|
|
88
|
+
setVoiceState(prev => prev.voiceState === newState ? prev : { ...prev, voiceState: newState });
|
|
237
89
|
}
|
|
238
90
|
const cleanup = useCallback(() => {
|
|
239
|
-
// Stale any in-flight session (main connection isStale(), replay
|
|
240
|
-
// isStale(), finishRecording continuation). Without this, disabling
|
|
241
|
-
// voice during the replay window lets the stale replay open a WS,
|
|
242
|
-
// accumulate transcript, and inject it after voice was torn down.
|
|
243
|
-
sessionGenRef.current++;
|
|
244
|
-
if (cleanupTimerRef.current) {
|
|
245
|
-
clearTimeout(cleanupTimerRef.current);
|
|
246
|
-
cleanupTimerRef.current = null;
|
|
247
|
-
}
|
|
248
91
|
if (releaseTimerRef.current) {
|
|
249
92
|
clearTimeout(releaseTimerRef.current);
|
|
250
93
|
releaseTimerRef.current = null;
|
|
@@ -253,691 +96,136 @@ export function useVoice({ onTranscript, onError, enabled, focusMode, }) {
|
|
|
253
96
|
clearTimeout(repeatFallbackTimerRef.current);
|
|
254
97
|
repeatFallbackTimerRef.current = null;
|
|
255
98
|
}
|
|
256
|
-
|
|
257
|
-
|
|
258
|
-
focusSilenceTimerRef.current = null;
|
|
259
|
-
}
|
|
260
|
-
silenceTimedOutRef.current = false;
|
|
261
|
-
voiceModule?.stopRecording();
|
|
262
|
-
if (connectionRef.current) {
|
|
263
|
-
connectionRef.current.close();
|
|
264
|
-
connectionRef.current = null;
|
|
265
|
-
}
|
|
266
|
-
accumulatedRef.current = '';
|
|
99
|
+
seenRepeatRef.current = false;
|
|
100
|
+
recordingChunksRef.current = [];
|
|
267
101
|
audioLevelsRef.current = [];
|
|
268
|
-
|
|
269
|
-
setVoiceState(prev => {
|
|
270
|
-
|
|
271
|
-
|
|
272
|
-
|
|
273
|
-
|
|
102
|
+
voiceModule?.stopRecording();
|
|
103
|
+
setVoiceState(prev => ({
|
|
104
|
+
...prev,
|
|
105
|
+
voiceInterimTranscript: '',
|
|
106
|
+
voiceAudioLevels: [],
|
|
107
|
+
voiceWarmingUp: false,
|
|
108
|
+
}));
|
|
274
109
|
}, [setVoiceState]);
|
|
275
|
-
|
|
276
|
-
|
|
277
|
-
|
|
278
|
-
|
|
279
|
-
// top of the "check network" message below.
|
|
280
|
-
attemptGenRef.current++;
|
|
281
|
-
// Capture focusTriggered BEFORE clearing it — needed as an event dimension
|
|
282
|
-
// so BigQuery can filter out passive focus-mode auto-recordings (user focused
|
|
283
|
-
// terminal without speaking → ambient noise sets hadAudioSignal=true → false
|
|
284
|
-
// silent-drop signature). focusFlushedCharsRef fixes transcriptChars accuracy
|
|
285
|
-
// for sessions WITH speech; focusTriggered enables filtering sessions WITHOUT.
|
|
286
|
-
const focusTriggered = focusTriggeredRef.current;
|
|
287
|
-
focusTriggeredRef.current = false;
|
|
288
|
-
updateState('processing');
|
|
110
|
+
const finishRecording = useCallback(async () => {
|
|
111
|
+
if (stateRef.current !== 'recording') {
|
|
112
|
+
return;
|
|
113
|
+
}
|
|
289
114
|
voiceModule?.stopRecording();
|
|
290
|
-
|
|
291
|
-
|
|
292
|
-
|
|
293
|
-
|
|
294
|
-
|
|
295
|
-
// reproducing the silent-drop false-positive this ref exists to prevent.
|
|
296
|
-
const recordingDurationMs = Date.now() - recordingStartRef.current;
|
|
297
|
-
const hadAudioSignal = hasAudioSignalRef.current;
|
|
298
|
-
const retried = retryUsedRef.current;
|
|
299
|
-
const focusFlushedChars = focusFlushedCharsRef.current;
|
|
300
|
-
// wsConnected distinguishes "backend received audio but dropped it" (the
|
|
301
|
-
// bug backend PR #287008 fixes) from "WS handshake never completed" —
|
|
302
|
-
// in the latter case audio is still in audioBuffer, never reached the
|
|
303
|
-
// server, but hasAudioSignalRef is already true from ambient noise.
|
|
304
|
-
const wsConnected = everConnectedRef.current;
|
|
305
|
-
// Capture generation BEFORE the .then() — if a new session starts during
|
|
306
|
-
// the finalize wait, sessionGenRef has already advanced by the time the
|
|
307
|
-
// continuation runs, so capturing inside the .then() would yield the new
|
|
308
|
-
// session's gen and every staleness check would be a no-op.
|
|
309
|
-
const myGen = sessionGenRef.current;
|
|
310
|
-
const isStale = () => sessionGenRef.current !== myGen;
|
|
311
|
-
logForDebugging('[voice] Recording stopped');
|
|
312
|
-
// Send finalize and wait for the WebSocket to close before reading the
|
|
313
|
-
// accumulated transcript. The close handler promotes any unreported
|
|
314
|
-
// interim text to final, so we must wait for it to fire.
|
|
315
|
-
const finalizePromise = connectionRef.current
|
|
316
|
-
? connectionRef.current.finalize()
|
|
317
|
-
: Promise.resolve(undefined);
|
|
318
|
-
void finalizePromise
|
|
319
|
-
.then(async (finalizeSource) => {
|
|
320
|
-
if (isStale())
|
|
321
|
-
return;
|
|
322
|
-
// Silent-drop replay: when the server accepted audio (wsConnected),
|
|
323
|
-
// the mic captured real signal (hadAudioSignal), but finalize timed
|
|
324
|
-
// out with zero transcript — the ~1% session-sticky CE-pod bug.
|
|
325
|
-
// Replay the buffered audio on a fresh connection once. A 250ms
|
|
326
|
-
// backoff clears the same-pod rapid-reconnect race (same gap as the
|
|
327
|
-
// early-error retry path below).
|
|
328
|
-
if (finalizeSource === 'no_data_timeout' &&
|
|
329
|
-
hadAudioSignal &&
|
|
330
|
-
wsConnected &&
|
|
331
|
-
!focusTriggered &&
|
|
332
|
-
focusFlushedChars === 0 &&
|
|
333
|
-
accumulatedRef.current.trim() === '' &&
|
|
334
|
-
!silentDropRetriedRef.current &&
|
|
335
|
-
fullAudioRef.current.length > 0) {
|
|
336
|
-
silentDropRetriedRef.current = true;
|
|
337
|
-
logForDebugging(`[voice] Silent-drop detected (no_data_timeout, ${String(fullAudioRef.current.length)} chunks); replaying on fresh connection`);
|
|
338
|
-
logEvent('tengu_voice_silent_drop_replay', {
|
|
339
|
-
recordingDurationMs,
|
|
340
|
-
chunkCount: fullAudioRef.current.length,
|
|
341
|
-
});
|
|
342
|
-
if (connectionRef.current) {
|
|
343
|
-
connectionRef.current.close();
|
|
344
|
-
connectionRef.current = null;
|
|
345
|
-
}
|
|
346
|
-
const replayBuffer = fullAudioRef.current;
|
|
347
|
-
await sleep(250);
|
|
348
|
-
if (isStale())
|
|
349
|
-
return;
|
|
350
|
-
const stt = normalizeLanguageForSTT(getInitialSettings().language);
|
|
351
|
-
const keyterms = await getVoiceKeyterms();
|
|
352
|
-
if (isStale())
|
|
353
|
-
return;
|
|
354
|
-
await new Promise(resolve => {
|
|
355
|
-
void connectVoiceStream({
|
|
356
|
-
onTranscript: (t, isFinal) => {
|
|
357
|
-
if (isStale())
|
|
358
|
-
return;
|
|
359
|
-
if (isFinal && t.trim()) {
|
|
360
|
-
if (accumulatedRef.current)
|
|
361
|
-
accumulatedRef.current += ' ';
|
|
362
|
-
accumulatedRef.current += t.trim();
|
|
363
|
-
}
|
|
364
|
-
},
|
|
365
|
-
onError: () => resolve(),
|
|
366
|
-
onClose: () => { },
|
|
367
|
-
onReady: conn => {
|
|
368
|
-
if (isStale()) {
|
|
369
|
-
conn.close();
|
|
370
|
-
resolve();
|
|
371
|
-
return;
|
|
372
|
-
}
|
|
373
|
-
connectionRef.current = conn;
|
|
374
|
-
const SLICE = 32_000;
|
|
375
|
-
let slice = [];
|
|
376
|
-
let bytes = 0;
|
|
377
|
-
for (const c of replayBuffer) {
|
|
378
|
-
if (bytes > 0 && bytes + c.length > SLICE) {
|
|
379
|
-
conn.send(Buffer.concat(slice));
|
|
380
|
-
slice = [];
|
|
381
|
-
bytes = 0;
|
|
382
|
-
}
|
|
383
|
-
slice.push(c);
|
|
384
|
-
bytes += c.length;
|
|
385
|
-
}
|
|
386
|
-
if (slice.length)
|
|
387
|
-
conn.send(Buffer.concat(slice));
|
|
388
|
-
void conn.finalize().then(() => {
|
|
389
|
-
conn.close();
|
|
390
|
-
resolve();
|
|
391
|
-
});
|
|
392
|
-
},
|
|
393
|
-
}, { language: stt.code, keyterms }).then(c => {
|
|
394
|
-
if (!c)
|
|
395
|
-
resolve();
|
|
396
|
-
}, () => resolve());
|
|
397
|
-
});
|
|
398
|
-
if (isStale())
|
|
399
|
-
return;
|
|
400
|
-
}
|
|
401
|
-
fullAudioRef.current = [];
|
|
402
|
-
const text = accumulatedRef.current.trim();
|
|
403
|
-
logForDebugging(`[voice] Final transcript assembled (${String(text.length)} chars): "${text.slice(0, 200)}"`);
|
|
404
|
-
// Tracks silent-drop rate: transcriptChars=0 + hadAudioSignal=true
|
|
405
|
-
// + recordingDurationMs>2000 = the bug backend PR #287008 fixes.
|
|
406
|
-
// focusFlushedCharsRef makes transcriptChars accurate for focus mode
|
|
407
|
-
// (where each final is injected immediately and accumulatedRef reset).
|
|
408
|
-
//
|
|
409
|
-
// NOTE: this fires only on the finishRecording() path. The onError
|
|
410
|
-
// fallthrough and !conn (no-OAuth) paths bypass this → don't compute
|
|
411
|
-
// COUNT(completed)/COUNT(started) as a success rate; the silent-drop
|
|
412
|
-
// denominator (completed events only) is internally consistent.
|
|
413
|
-
logEvent('tengu_voice_recording_completed', {
|
|
414
|
-
transcriptChars: text.length + focusFlushedChars,
|
|
415
|
-
recordingDurationMs,
|
|
416
|
-
hadAudioSignal,
|
|
417
|
-
retried,
|
|
418
|
-
silentDropRetried: silentDropRetriedRef.current,
|
|
419
|
-
wsConnected,
|
|
420
|
-
focusTriggered,
|
|
421
|
-
});
|
|
422
|
-
if (connectionRef.current) {
|
|
423
|
-
connectionRef.current.close();
|
|
424
|
-
connectionRef.current = null;
|
|
425
|
-
}
|
|
426
|
-
if (text) {
|
|
427
|
-
logForDebugging(`[voice] Injecting transcript (${String(text.length)} chars)`);
|
|
428
|
-
onTranscriptRef.current(text);
|
|
429
|
-
}
|
|
430
|
-
else if (focusFlushedChars === 0 && recordingDurationMs > 2000) {
|
|
431
|
-
// Only warn about empty transcript if nothing was flushed in focus
|
|
432
|
-
// mode either, and recording was > 2s (short recordings = accidental
|
|
433
|
-
// taps → silently return to idle).
|
|
434
|
-
if (!wsConnected) {
|
|
435
|
-
// WS never connected → audio never reached backend. Not a silent
|
|
436
|
-
// drop; a connection failure (slow OAuth refresh, network, etc).
|
|
437
|
-
onErrorRef.current?.('Voice connection failed. Check your network and try again.');
|
|
438
|
-
}
|
|
439
|
-
else if (!hadAudioSignal) {
|
|
440
|
-
// Distinguish silent mic (capture issue) from speech not recognized.
|
|
441
|
-
onErrorRef.current?.('No audio detected from microphone. Check that the correct input device is selected and that Context Code has microphone access.');
|
|
442
|
-
}
|
|
443
|
-
else {
|
|
444
|
-
onErrorRef.current?.('No speech detected.');
|
|
445
|
-
}
|
|
446
|
-
}
|
|
447
|
-
accumulatedRef.current = '';
|
|
448
|
-
setVoiceState(prev => {
|
|
449
|
-
if (prev.voiceInterimTranscript === '')
|
|
450
|
-
return prev;
|
|
451
|
-
return { ...prev, voiceInterimTranscript: '' };
|
|
452
|
-
});
|
|
115
|
+
updateState('processing');
|
|
116
|
+
const chunks = recordingChunksRef.current;
|
|
117
|
+
recordingChunksRef.current = [];
|
|
118
|
+
if (!chunks.length) {
|
|
119
|
+
cleanup();
|
|
453
120
|
updateState('idle');
|
|
454
|
-
})
|
|
455
|
-
.catch(err => {
|
|
456
|
-
logError(toError(err));
|
|
457
|
-
if (!isStale())
|
|
458
|
-
updateState('idle');
|
|
459
|
-
});
|
|
460
|
-
}
|
|
461
|
-
// When voice is enabled, lazy-import voice.ts so checkRecordingAvailability
|
|
462
|
-
// et al. are ready when the user presses the voice key. Do NOT preload the
|
|
463
|
-
// native module — require('audio-capture.node') is a synchronous dlopen of
|
|
464
|
-
// CoreAudio/AudioUnit that blocks the event loop for ~1s (warm) to ~8s
|
|
465
|
-
// (cold coreaudiod). setImmediate doesn't help: it yields one tick, then the
|
|
466
|
-
// dlopen still blocks. The first voice keypress pays the dlopen cost instead.
|
|
467
|
-
useEffect(() => {
|
|
468
|
-
if (enabled && !voiceModule) {
|
|
469
|
-
void import('../services/voice.js').then(mod => {
|
|
470
|
-
voiceModule = mod;
|
|
471
|
-
});
|
|
472
|
-
}
|
|
473
|
-
}, [enabled]);
|
|
474
|
-
// ── Focus silence timer ────────────────────────────────────────────
|
|
475
|
-
// Arms (or resets) a timer that tears down the focus-mode session
|
|
476
|
-
// after FOCUS_SILENCE_TIMEOUT_MS of no speech. Called when a session
|
|
477
|
-
// starts and after each flushed transcript.
|
|
478
|
-
function armFocusSilenceTimer() {
|
|
479
|
-
if (focusSilenceTimerRef.current) {
|
|
480
|
-
clearTimeout(focusSilenceTimerRef.current);
|
|
481
|
-
}
|
|
482
|
-
focusSilenceTimerRef.current = setTimeout((focusSilenceTimerRef, stateRef, focusTriggeredRef, silenceTimedOutRef, finishRecording) => {
|
|
483
|
-
focusSilenceTimerRef.current = null;
|
|
484
|
-
if (stateRef.current === 'recording' && focusTriggeredRef.current) {
|
|
485
|
-
logForDebugging('[voice] Focus silence timeout — tearing down session');
|
|
486
|
-
silenceTimedOutRef.current = true;
|
|
487
|
-
finishRecording();
|
|
488
|
-
}
|
|
489
|
-
}, FOCUS_SILENCE_TIMEOUT_MS, focusSilenceTimerRef, stateRef, focusTriggeredRef, silenceTimedOutRef, finishRecording);
|
|
490
|
-
}
|
|
491
|
-
// ── Focus-driven recording ──────────────────────────────────────────
|
|
492
|
-
// In focus mode, start recording when the terminal gains focus and
|
|
493
|
-
// stop when it loses focus. This enables a "multi-clauding army"
|
|
494
|
-
// workflow where voice input follows window focus.
|
|
495
|
-
useEffect(() => {
|
|
496
|
-
if (!enabled || !focusMode) {
|
|
497
|
-
// Focus mode was disabled while a focus-driven recording was active —
|
|
498
|
-
// stop the recording so it doesn't linger until the silence timer fires.
|
|
499
|
-
if (focusTriggeredRef.current && stateRef.current === 'recording') {
|
|
500
|
-
logForDebugging('[voice] Focus mode disabled during recording, finishing');
|
|
501
|
-
finishRecording();
|
|
502
|
-
}
|
|
503
121
|
return;
|
|
504
122
|
}
|
|
505
|
-
|
|
506
|
-
|
|
507
|
-
|
|
508
|
-
|
|
509
|
-
|
|
510
|
-
|
|
511
|
-
|
|
512
|
-
if (cancelled ||
|
|
513
|
-
stateRef.current !== 'idle' ||
|
|
514
|
-
silenceTimedOutRef.current)
|
|
515
|
-
return;
|
|
516
|
-
logForDebugging('[voice] Focus gained, starting recording session');
|
|
517
|
-
focusTriggeredRef.current = true;
|
|
518
|
-
void startRecordingSession();
|
|
519
|
-
armFocusSilenceTimer();
|
|
520
|
-
};
|
|
521
|
-
if (voiceModule) {
|
|
522
|
-
beginFocusRecording();
|
|
123
|
+
try {
|
|
124
|
+
const language = normalizeLanguageForSTT(getInitialSettings().language);
|
|
125
|
+
const transcript = (await transcribePcmBuffers(chunks, {
|
|
126
|
+
language: language.code,
|
|
127
|
+
})).trim();
|
|
128
|
+
if (transcript) {
|
|
129
|
+
onTranscriptRef.current(transcript);
|
|
523
130
|
}
|
|
524
131
|
else {
|
|
525
|
-
|
|
526
|
-
// microtask). Wait for it before starting the recording session.
|
|
527
|
-
void import('../services/voice.js').then(mod => {
|
|
528
|
-
voiceModule = mod;
|
|
529
|
-
beginFocusRecording();
|
|
530
|
-
});
|
|
132
|
+
onErrorRef.current?.('No se detecto dictado. Revisa el microfono o habla un poco mas fuerte.');
|
|
531
133
|
}
|
|
532
134
|
}
|
|
533
|
-
|
|
534
|
-
|
|
535
|
-
|
|
536
|
-
|
|
537
|
-
|
|
538
|
-
|
|
539
|
-
|
|
540
|
-
|
|
135
|
+
catch (error) {
|
|
136
|
+
logError(error);
|
|
137
|
+
onErrorRef.current?.(error instanceof Error ? error.message : 'No se pudo transcribir el audio.');
|
|
138
|
+
}
|
|
139
|
+
finally {
|
|
140
|
+
cleanup();
|
|
141
|
+
updateState('idle');
|
|
142
|
+
}
|
|
143
|
+
}, [cleanup]);
|
|
144
|
+
const startRecordingSession = useCallback(async () => {
|
|
145
|
+
if (stateRef.current !== 'idle') {
|
|
146
|
+
return;
|
|
541
147
|
}
|
|
542
|
-
return () => {
|
|
543
|
-
cancelled = true;
|
|
544
|
-
};
|
|
545
|
-
}, [enabled, focusMode, isFocused]);
|
|
546
|
-
// ── Start a new recording session (voice_stream connect + audio) ──
|
|
547
|
-
async function startRecordingSession() {
|
|
548
148
|
if (!voiceModule) {
|
|
549
|
-
|
|
149
|
+
voiceModule = await import('../services/voice.js');
|
|
150
|
+
}
|
|
151
|
+
const dictation = await checkLocalDictationConfiguration();
|
|
152
|
+
if (!dictation.available) {
|
|
153
|
+
onErrorRef.current?.(dictation.error ?? 'El dictado local no esta configurado.');
|
|
550
154
|
return;
|
|
551
155
|
}
|
|
552
|
-
|
|
553
|
-
|
|
554
|
-
|
|
555
|
-
|
|
556
|
-
|
|
557
|
-
|
|
558
|
-
|
|
559
|
-
|
|
560
|
-
|
|
561
|
-
|
|
562
|
-
|
|
563
|
-
|
|
564
|
-
|
|
565
|
-
silentDropRetriedRef.current = false;
|
|
566
|
-
fullAudioRef.current = [];
|
|
567
|
-
focusFlushedCharsRef.current = 0;
|
|
568
|
-
everConnectedRef.current = false;
|
|
569
|
-
const myGen = ++sessionGenRef.current;
|
|
570
|
-
// ── Pre-check: can we actually record audio? ──────────────
|
|
571
|
-
const availability = await voiceModule.checkRecordingAvailability();
|
|
572
|
-
if (!availability.available) {
|
|
573
|
-
logForDebugging(`[voice] Recording not available: ${availability.reason ?? 'unknown'}`);
|
|
574
|
-
onErrorRef.current?.(availability.reason ?? 'Audio recording is not available.');
|
|
575
|
-
cleanup();
|
|
576
|
-
updateState('idle');
|
|
156
|
+
const recording = await voiceModule.checkRecordingAvailability();
|
|
157
|
+
if (!recording.available) {
|
|
158
|
+
onErrorRef.current?.(recording.reason ??
|
|
159
|
+
'La grabacion de audio no esta disponible en este entorno.');
|
|
160
|
+
return;
|
|
161
|
+
}
|
|
162
|
+
if (!(await voiceModule.requestMicrophonePermission())) {
|
|
163
|
+
const guidance = process.platform === 'win32'
|
|
164
|
+
? 'Configuracion > Privacidad > Microfono'
|
|
165
|
+
: process.platform === 'linux'
|
|
166
|
+
? 'la configuracion de audio del sistema'
|
|
167
|
+
: 'System Settings > Privacy & Security > Microphone';
|
|
168
|
+
onErrorRef.current?.(`El acceso al microfono esta denegado. Habilitalo en ${guidance}.`);
|
|
577
169
|
return;
|
|
578
170
|
}
|
|
579
|
-
|
|
580
|
-
// Clear any previous error
|
|
581
|
-
setVoiceState(prev => {
|
|
582
|
-
if (!prev.voiceError)
|
|
583
|
-
return prev;
|
|
584
|
-
return { ...prev, voiceError: null };
|
|
585
|
-
});
|
|
586
|
-
// Buffer audio chunks while the WebSocket connects. Once the connection
|
|
587
|
-
// is ready (onReady fires), buffered chunks are flushed and subsequent
|
|
588
|
-
// chunks are sent directly.
|
|
589
|
-
const audioBuffer = [];
|
|
590
|
-
// Start recording IMMEDIATELY — audio is buffered until the WebSocket
|
|
591
|
-
// opens, eliminating the 1-2s latency from waiting for OAuth + WS connect.
|
|
592
|
-
logForDebugging('[voice] startRecording: buffering audio while WebSocket connects');
|
|
171
|
+
recordingChunksRef.current = [];
|
|
593
172
|
audioLevelsRef.current = [];
|
|
173
|
+
updateState('recording');
|
|
174
|
+
logForDebugging('[dictation] Starting local recording session');
|
|
594
175
|
const started = await voiceModule.startRecording((chunk) => {
|
|
595
|
-
// Copy for fullAudioRef replay buffer. send() in voiceStreamSTT
|
|
596
|
-
// copies again defensively — acceptable overhead at audio rates.
|
|
597
|
-
// Skip buffering in focus mode — replay is gated on !focusTriggered
|
|
598
|
-
// so the buffer is dead weight (up to ~20MB for a 10min session).
|
|
599
176
|
const owned = Buffer.from(chunk);
|
|
600
|
-
|
|
601
|
-
|
|
602
|
-
|
|
603
|
-
|
|
604
|
-
|
|
605
|
-
}
|
|
606
|
-
else {
|
|
607
|
-
audioBuffer.push(owned);
|
|
608
|
-
}
|
|
609
|
-
// Update audio level histogram for the recording visualizer
|
|
610
|
-
const level = computeLevel(chunk);
|
|
611
|
-
if (!hasAudioSignalRef.current && level > 0.01) {
|
|
612
|
-
hasAudioSignalRef.current = true;
|
|
613
|
-
}
|
|
614
|
-
const levels = audioLevelsRef.current;
|
|
615
|
-
if (levels.length >= AUDIO_LEVEL_BARS) {
|
|
616
|
-
levels.shift();
|
|
617
|
-
}
|
|
618
|
-
levels.push(level);
|
|
619
|
-
// Copy the array so React sees a new reference
|
|
620
|
-
const snapshot = [...levels];
|
|
621
|
-
audioLevelsRef.current = snapshot;
|
|
622
|
-
setVoiceState(prev => ({ ...prev, voiceAudioLevels: snapshot }));
|
|
177
|
+
recordingChunksRef.current.push(owned);
|
|
178
|
+
const level = computeLevel(owned);
|
|
179
|
+
const next = [...audioLevelsRef.current, level].slice(-AUDIO_LEVEL_BARS);
|
|
180
|
+
audioLevelsRef.current = next;
|
|
181
|
+
setVoiceState(prev => ({ ...prev, voiceAudioLevels: next }));
|
|
623
182
|
}, () => {
|
|
624
|
-
|
|
625
|
-
if (stateRef.current === 'recording') {
|
|
626
|
-
finishRecording();
|
|
627
|
-
}
|
|
183
|
+
void finishRecording();
|
|
628
184
|
}, { silenceDetection: false });
|
|
629
185
|
if (!started) {
|
|
630
|
-
logError(new Error('[voice] Recording failed — no audio tool found'));
|
|
631
|
-
onErrorRef.current?.('Failed to start audio capture. Check that your microphone is accessible.');
|
|
632
186
|
cleanup();
|
|
633
187
|
updateState('idle');
|
|
634
|
-
|
|
635
|
-
...prev,
|
|
636
|
-
voiceError: 'Recording failed — no audio tool found',
|
|
637
|
-
}));
|
|
638
|
-
return;
|
|
188
|
+
onErrorRef.current?.('No se pudo iniciar la captura de audio. Revisa el microfono.');
|
|
639
189
|
}
|
|
640
|
-
|
|
641
|
-
const stt = normalizeLanguageForSTT(rawLanguage);
|
|
642
|
-
logEvent('tengu_voice_recording_started', {
|
|
643
|
-
focusTriggered: focusTriggeredRef.current,
|
|
644
|
-
sttLanguage: stt.code,
|
|
645
|
-
sttLanguageIsDefault: !rawLanguage?.trim(),
|
|
646
|
-
sttLanguageFellBack: stt.fellBackFrom !== undefined,
|
|
647
|
-
// ISO 639 subtag from Intl (bounded set, never user text). undefined if
|
|
648
|
-
// Intl failed — omitted from the payload, no retry cost (cached).
|
|
649
|
-
systemLocaleLanguage: getSystemLocaleLanguage(),
|
|
650
|
-
});
|
|
651
|
-
// Retry once if the connection errors before delivering any transcript.
|
|
652
|
-
// The conversation-engine proxy can reject rapid reconnects (~1/N_pods
|
|
653
|
-
// same-pod collision) or CE's Deepgram upstream can fail during its own
|
|
654
|
-
// teardown window (anthropics/anthropic#287008 surfaces this as
|
|
655
|
-
// TranscriptError instead of silent-drop). A 250ms backoff clears both.
|
|
656
|
-
// Audio captured during the retry window routes to audioBuffer (via the
|
|
657
|
-
// connectionRef.current null check in the recording callback above) and
|
|
658
|
-
// is flushed by the second onReady.
|
|
659
|
-
let sawTranscript = false;
|
|
660
|
-
// Connect WebSocket in parallel with audio recording.
|
|
661
|
-
// Gather keyterms first (async but fast — no model calls), then connect.
|
|
662
|
-
// Bail from callbacks if a newer session has started. Prevents a
|
|
663
|
-
// slow-connecting zombie WS (e.g. user released, pressed again, first
|
|
664
|
-
// WS still handshaking) from firing onReady/onError into the new
|
|
665
|
-
// session and corrupting its connectionRef / triggering a bogus retry.
|
|
666
|
-
const isStale = () => sessionGenRef.current !== myGen;
|
|
667
|
-
const attemptConnect = (keyterms) => {
|
|
668
|
-
const myAttemptGen = attemptGenRef.current;
|
|
669
|
-
void connectVoiceStream({
|
|
670
|
-
onTranscript: (text, isFinal) => {
|
|
671
|
-
if (isStale())
|
|
672
|
-
return;
|
|
673
|
-
sawTranscript = true;
|
|
674
|
-
logForDebugging(`[voice] onTranscript: isFinal=${String(isFinal)} text="${text}"`);
|
|
675
|
-
if (isFinal && text.trim()) {
|
|
676
|
-
if (focusTriggeredRef.current) {
|
|
677
|
-
// Focus mode: flush each final transcript immediately and
|
|
678
|
-
// keep recording. This gives continuous transcription while
|
|
679
|
-
// the terminal is focused.
|
|
680
|
-
logForDebugging(`[voice] Focus mode: flushing final transcript immediately: "${text.trim()}"`);
|
|
681
|
-
onTranscriptRef.current(text.trim());
|
|
682
|
-
focusFlushedCharsRef.current += text.trim().length;
|
|
683
|
-
setVoiceState(prev => {
|
|
684
|
-
if (prev.voiceInterimTranscript === '')
|
|
685
|
-
return prev;
|
|
686
|
-
return { ...prev, voiceInterimTranscript: '' };
|
|
687
|
-
});
|
|
688
|
-
accumulatedRef.current = '';
|
|
689
|
-
// User is actively speaking — reset the silence timer.
|
|
690
|
-
armFocusSilenceTimer();
|
|
691
|
-
}
|
|
692
|
-
else {
|
|
693
|
-
// Hold-to-talk: accumulate final transcripts separated by spaces
|
|
694
|
-
if (accumulatedRef.current) {
|
|
695
|
-
accumulatedRef.current += ' ';
|
|
696
|
-
}
|
|
697
|
-
accumulatedRef.current += text.trim();
|
|
698
|
-
logForDebugging(`[voice] Accumulated final transcript: "${accumulatedRef.current}"`);
|
|
699
|
-
// Clear interim since final supersedes it
|
|
700
|
-
setVoiceState(prev => {
|
|
701
|
-
const preview = accumulatedRef.current;
|
|
702
|
-
if (prev.voiceInterimTranscript === preview)
|
|
703
|
-
return prev;
|
|
704
|
-
return { ...prev, voiceInterimTranscript: preview };
|
|
705
|
-
});
|
|
706
|
-
}
|
|
707
|
-
}
|
|
708
|
-
else if (!isFinal) {
|
|
709
|
-
// Active interim speech resets the focus silence timer.
|
|
710
|
-
// Nova 3 disables auto-finalize so isFinal is never true
|
|
711
|
-
// mid-stream — without this, the 5s timer fires during
|
|
712
|
-
// active speech and tears down the session.
|
|
713
|
-
if (focusTriggeredRef.current) {
|
|
714
|
-
armFocusSilenceTimer();
|
|
715
|
-
}
|
|
716
|
-
// Show accumulated finals + current interim as live preview
|
|
717
|
-
const interim = text.trim();
|
|
718
|
-
const preview = accumulatedRef.current
|
|
719
|
-
? accumulatedRef.current + (interim ? ' ' + interim : '')
|
|
720
|
-
: interim;
|
|
721
|
-
setVoiceState(prev => {
|
|
722
|
-
if (prev.voiceInterimTranscript === preview)
|
|
723
|
-
return prev;
|
|
724
|
-
return { ...prev, voiceInterimTranscript: preview };
|
|
725
|
-
});
|
|
726
|
-
}
|
|
727
|
-
},
|
|
728
|
-
onError: (error, opts) => {
|
|
729
|
-
if (isStale()) {
|
|
730
|
-
logForDebugging(`[voice] ignoring onError from stale session: ${error}`);
|
|
731
|
-
return;
|
|
732
|
-
}
|
|
733
|
-
// Swallow errors from superseded attempts. Covers conn 1's
|
|
734
|
-
// trailing close after retry is scheduled, AND the current
|
|
735
|
-
// conn's ws close event after its ws error already surfaced
|
|
736
|
-
// below (gen bumped at surface).
|
|
737
|
-
if (attemptGenRef.current !== myAttemptGen) {
|
|
738
|
-
logForDebugging(`[voice] ignoring stale onError from superseded attempt: ${error}`);
|
|
739
|
-
return;
|
|
740
|
-
}
|
|
741
|
-
// Early-failure retry: server error before any transcript =
|
|
742
|
-
// likely a transient upstream race (CE rejection, Deepgram
|
|
743
|
-
// not ready). Clear connectionRef so audio re-buffers, back
|
|
744
|
-
// off, reconnect. Skip if the user has already released the
|
|
745
|
-
// key (state left 'recording') — no point retrying a session
|
|
746
|
-
// they've ended. Fatal errors (Cloudflare bot challenge, auth
|
|
747
|
-
// rejection) are the same failure on every retry attempt, so
|
|
748
|
-
// fall through to surface the message.
|
|
749
|
-
if (!opts?.fatal &&
|
|
750
|
-
!sawTranscript &&
|
|
751
|
-
stateRef.current === 'recording') {
|
|
752
|
-
if (!retryUsedRef.current) {
|
|
753
|
-
retryUsedRef.current = true;
|
|
754
|
-
logForDebugging(`[voice] early voice_stream error (pre-transcript), retrying once: ${error}`);
|
|
755
|
-
logEvent('tengu_voice_stream_early_retry', {});
|
|
756
|
-
connectionRef.current = null;
|
|
757
|
-
attemptGenRef.current++;
|
|
758
|
-
setTimeout((stateRef, attemptConnect, keyterms) => {
|
|
759
|
-
if (stateRef.current === 'recording') {
|
|
760
|
-
attemptConnect(keyterms);
|
|
761
|
-
}
|
|
762
|
-
}, 250, stateRef, attemptConnect, keyterms);
|
|
763
|
-
return;
|
|
764
|
-
}
|
|
765
|
-
}
|
|
766
|
-
// Surfacing — bump gen so this conn's trailing close-error
|
|
767
|
-
// (ws fires error then close 1006) is swallowed above.
|
|
768
|
-
attemptGenRef.current++;
|
|
769
|
-
logError(new Error(`[voice] voice_stream error: ${error}`));
|
|
770
|
-
onErrorRef.current?.(`Voice stream error: ${error}`);
|
|
771
|
-
// Clear the audio buffer on error to avoid memory leaks
|
|
772
|
-
audioBuffer.length = 0;
|
|
773
|
-
focusTriggeredRef.current = false;
|
|
774
|
-
cleanup();
|
|
775
|
-
updateState('idle');
|
|
776
|
-
},
|
|
777
|
-
onClose: () => {
|
|
778
|
-
// no-op; lifecycle handled by cleanup()
|
|
779
|
-
},
|
|
780
|
-
onReady: conn => {
|
|
781
|
-
// Only proceed if we're still in recording state AND this is
|
|
782
|
-
// still the current session. A zombie late-connecting WS from
|
|
783
|
-
// an abandoned session can pass the 'recording' check if the
|
|
784
|
-
// user has since started a new session.
|
|
785
|
-
if (isStale() || stateRef.current !== 'recording') {
|
|
786
|
-
conn.close();
|
|
787
|
-
return;
|
|
788
|
-
}
|
|
789
|
-
// The WebSocket is now truly open — assign connectionRef so
|
|
790
|
-
// subsequent audio callbacks send directly instead of buffering.
|
|
791
|
-
connectionRef.current = conn;
|
|
792
|
-
everConnectedRef.current = true;
|
|
793
|
-
// Flush all audio chunks that were buffered while the WebSocket
|
|
794
|
-
// was connecting. This is safe because onReady fires from the
|
|
795
|
-
// WebSocket 'open' event, guaranteeing send() will not be dropped.
|
|
796
|
-
//
|
|
797
|
-
// Coalesce into ~1s slices rather than one ws.send per chunk
|
|
798
|
-
// — fewer WS frames means less overhead on both ends.
|
|
799
|
-
const SLICE_TARGET_BYTES = 32_000; // ~1s at 16kHz/16-bit/mono
|
|
800
|
-
if (audioBuffer.length > 0) {
|
|
801
|
-
let totalBytes = 0;
|
|
802
|
-
for (const c of audioBuffer)
|
|
803
|
-
totalBytes += c.length;
|
|
804
|
-
const slices = [[]];
|
|
805
|
-
let sliceBytes = 0;
|
|
806
|
-
for (const chunk of audioBuffer) {
|
|
807
|
-
if (sliceBytes > 0 &&
|
|
808
|
-
sliceBytes + chunk.length > SLICE_TARGET_BYTES) {
|
|
809
|
-
slices.push([]);
|
|
810
|
-
sliceBytes = 0;
|
|
811
|
-
}
|
|
812
|
-
slices[slices.length - 1].push(chunk);
|
|
813
|
-
sliceBytes += chunk.length;
|
|
814
|
-
}
|
|
815
|
-
logForDebugging(`[voice] onReady: flushing ${String(audioBuffer.length)} buffered chunks (${String(totalBytes)} bytes) as ${String(slices.length)} coalesced frame(s)`);
|
|
816
|
-
for (const slice of slices) {
|
|
817
|
-
conn.send(Buffer.concat(slice));
|
|
818
|
-
}
|
|
819
|
-
}
|
|
820
|
-
audioBuffer.length = 0;
|
|
821
|
-
// Reset the release timer now that the WebSocket is ready.
|
|
822
|
-
// Only arm it if auto-repeat has been seen — otherwise the OS
|
|
823
|
-
// key repeat delay (~500ms) hasn't elapsed yet and the timer
|
|
824
|
-
// would fire prematurely.
|
|
825
|
-
if (releaseTimerRef.current) {
|
|
826
|
-
clearTimeout(releaseTimerRef.current);
|
|
827
|
-
}
|
|
828
|
-
if (seenRepeatRef.current) {
|
|
829
|
-
releaseTimerRef.current = setTimeout((releaseTimerRef, stateRef, finishRecording) => {
|
|
830
|
-
releaseTimerRef.current = null;
|
|
831
|
-
if (stateRef.current === 'recording') {
|
|
832
|
-
finishRecording();
|
|
833
|
-
}
|
|
834
|
-
}, RELEASE_TIMEOUT_MS, releaseTimerRef, stateRef, finishRecording);
|
|
835
|
-
}
|
|
836
|
-
},
|
|
837
|
-
}, {
|
|
838
|
-
language: stt.code,
|
|
839
|
-
keyterms,
|
|
840
|
-
}).then(conn => {
|
|
841
|
-
if (isStale()) {
|
|
842
|
-
conn?.close();
|
|
843
|
-
return;
|
|
844
|
-
}
|
|
845
|
-
if (!conn) {
|
|
846
|
-
logForDebugging('[voice] Failed to connect to voice_stream (no OAuth token?)');
|
|
847
|
-
onErrorRef.current?.('Voice mode requires a Claude.ai account. Please run /login to sign in.');
|
|
848
|
-
// Clear the audio buffer on failure
|
|
849
|
-
audioBuffer.length = 0;
|
|
850
|
-
cleanup();
|
|
851
|
-
updateState('idle');
|
|
852
|
-
return;
|
|
853
|
-
}
|
|
854
|
-
// Safety check: if the user released the key before connectVoiceStream
|
|
855
|
-
// resolved (but after onReady already ran), close the connection.
|
|
856
|
-
if (stateRef.current !== 'recording') {
|
|
857
|
-
audioBuffer.length = 0;
|
|
858
|
-
conn.close();
|
|
859
|
-
return;
|
|
860
|
-
}
|
|
861
|
-
});
|
|
862
|
-
};
|
|
863
|
-
void getVoiceKeyterms().then(attemptConnect);
|
|
864
|
-
}
|
|
865
|
-
// ── Hold-to-talk handler ────────────────────────────────────────────
|
|
866
|
-
// Called on every keypress (including terminal auto-repeats while
|
|
867
|
-
// the key is held). A gap longer than RELEASE_TIMEOUT_MS between
|
|
868
|
-
// events is interpreted as key release.
|
|
869
|
-
//
|
|
870
|
-
// Recording starts immediately on the first keypress to eliminate
|
|
871
|
-
// startup delay. The release timer is only armed after auto-repeat
|
|
872
|
-
// is detected (to avoid false releases during the OS key repeat
|
|
873
|
-
// delay of ~500ms on macOS).
|
|
190
|
+
}, [cleanup, finishRecording, setVoiceState]);
|
|
874
191
|
const handleKeyEvent = useCallback((fallbackMs = REPEAT_FALLBACK_MS) => {
|
|
875
|
-
if (!enabled
|
|
876
|
-
return;
|
|
877
|
-
}
|
|
878
|
-
// In focus mode, recording is driven by terminal focus, not keypresses.
|
|
879
|
-
if (focusTriggeredRef.current) {
|
|
880
|
-
// Active focus recording — ignore key events (session ends on blur).
|
|
881
|
-
return;
|
|
882
|
-
}
|
|
883
|
-
if (focusMode && silenceTimedOutRef.current) {
|
|
884
|
-
// Focus session timed out due to silence — keypress re-arms it.
|
|
885
|
-
logForDebugging('[voice] Re-arming focus recording after silence timeout');
|
|
886
|
-
silenceTimedOutRef.current = false;
|
|
887
|
-
focusTriggeredRef.current = true;
|
|
888
|
-
void startRecordingSession();
|
|
889
|
-
armFocusSilenceTimer();
|
|
192
|
+
if (!enabled) {
|
|
890
193
|
return;
|
|
891
194
|
}
|
|
892
195
|
const currentState = stateRef.current;
|
|
893
|
-
// Ignore keypresses while processing
|
|
894
196
|
if (currentState === 'processing') {
|
|
895
197
|
return;
|
|
896
198
|
}
|
|
897
199
|
if (currentState === 'idle') {
|
|
898
|
-
logForDebugging('[voice] handleKeyEvent: idle, starting recording session immediately');
|
|
899
200
|
void startRecordingSession();
|
|
900
|
-
|
|
901
|
-
// arm the release timer anyway (the user likely tapped and released).
|
|
902
|
-
repeatFallbackTimerRef.current = setTimeout((repeatFallbackTimerRef, stateRef, seenRepeatRef, releaseTimerRef, finishRecording) => {
|
|
201
|
+
repeatFallbackTimerRef.current = setTimeout(() => {
|
|
903
202
|
repeatFallbackTimerRef.current = null;
|
|
904
203
|
if (stateRef.current === 'recording' && !seenRepeatRef.current) {
|
|
905
|
-
logForDebugging('[voice] No auto-repeat seen, arming release timer via fallback');
|
|
906
204
|
seenRepeatRef.current = true;
|
|
907
|
-
releaseTimerRef.current = setTimeout((
|
|
205
|
+
releaseTimerRef.current = setTimeout(() => {
|
|
908
206
|
releaseTimerRef.current = null;
|
|
909
|
-
|
|
910
|
-
|
|
911
|
-
}
|
|
912
|
-
}, RELEASE_TIMEOUT_MS, releaseTimerRef, stateRef, finishRecording);
|
|
207
|
+
void finishRecording();
|
|
208
|
+
}, RELEASE_TIMEOUT_MS);
|
|
913
209
|
}
|
|
914
|
-
}, fallbackMs
|
|
210
|
+
}, fallbackMs);
|
|
915
211
|
}
|
|
916
212
|
else if (currentState === 'recording') {
|
|
917
|
-
// Second+ keypress while recording — auto-repeat has started.
|
|
918
213
|
seenRepeatRef.current = true;
|
|
919
214
|
if (repeatFallbackTimerRef.current) {
|
|
920
215
|
clearTimeout(repeatFallbackTimerRef.current);
|
|
921
216
|
repeatFallbackTimerRef.current = null;
|
|
922
217
|
}
|
|
923
218
|
}
|
|
924
|
-
// Reset the release timer on every keypress (including auto-repeats)
|
|
925
219
|
if (releaseTimerRef.current) {
|
|
926
220
|
clearTimeout(releaseTimerRef.current);
|
|
927
221
|
}
|
|
928
|
-
// Only arm the release timer once auto-repeat has been seen.
|
|
929
|
-
// The OS key repeat delay is ~500ms on macOS; without this gate
|
|
930
|
-
// the 200ms timer fires before repeat starts, causing a false release.
|
|
931
222
|
if (stateRef.current === 'recording' && seenRepeatRef.current) {
|
|
932
|
-
releaseTimerRef.current = setTimeout((
|
|
223
|
+
releaseTimerRef.current = setTimeout(() => {
|
|
933
224
|
releaseTimerRef.current = null;
|
|
934
|
-
|
|
935
|
-
|
|
936
|
-
}
|
|
937
|
-
}, RELEASE_TIMEOUT_MS, releaseTimerRef, stateRef, finishRecording);
|
|
225
|
+
void finishRecording();
|
|
226
|
+
}, RELEASE_TIMEOUT_MS);
|
|
938
227
|
}
|
|
939
|
-
}, [enabled,
|
|
940
|
-
// Cleanup only when disabled or unmounted - NOT on state changes
|
|
228
|
+
}, [enabled, finishRecording, startRecordingSession]);
|
|
941
229
|
useEffect(() => {
|
|
942
230
|
if (!enabled && stateRef.current !== 'idle') {
|
|
943
231
|
cleanup();
|