voicecc 1.0.7
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/.claude-plugin/plugin.json +6 -0
- package/README.md +48 -0
- package/bin/voicecc.js +39 -0
- package/dashboard/dist/assets/index-BXemFrMp.css +1 -0
- package/dashboard/dist/assets/index-dAYfRls7.js +11 -0
- package/dashboard/dist/audio-processor.js +126 -0
- package/dashboard/dist/index.html +13 -0
- package/dashboard/routes/auth.ts +119 -0
- package/dashboard/routes/browser-call.ts +87 -0
- package/dashboard/routes/claude-md.ts +50 -0
- package/dashboard/routes/conversations.ts +203 -0
- package/dashboard/routes/integrations.ts +154 -0
- package/dashboard/routes/mcp-servers.ts +198 -0
- package/dashboard/routes/settings.ts +64 -0
- package/dashboard/routes/tunnel.ts +66 -0
- package/dashboard/routes/twilio.ts +120 -0
- package/dashboard/routes/voice.ts +48 -0
- package/dashboard/routes/webrtc.ts +85 -0
- package/dashboard/server.ts +130 -0
- package/dashboard/tsconfig.json +13 -0
- package/init/CLAUDE.md +18 -0
- package/package.json +59 -0
- package/run.ts +68 -0
- package/scripts/postinstall.js +228 -0
- package/services/browser-call-manager.ts +106 -0
- package/services/device-pairing.ts +176 -0
- package/services/env.ts +88 -0
- package/services/tunnel.ts +204 -0
- package/services/twilio-manager.ts +126 -0
- package/sidecar/assets/startup.pcm +0 -0
- package/sidecar/audio-adapter.ts +60 -0
- package/sidecar/audio-capture.ts +220 -0
- package/sidecar/browser-audio-playback.test.ts +149 -0
- package/sidecar/browser-audio.ts +147 -0
- package/sidecar/browser-server.ts +331 -0
- package/sidecar/chime.test.ts +69 -0
- package/sidecar/chime.ts +54 -0
- package/sidecar/claude-session.ts +295 -0
- package/sidecar/endpointing.ts +163 -0
- package/sidecar/index.ts +83 -0
- package/sidecar/local-audio.ts +126 -0
- package/sidecar/mic-vpio +0 -0
- package/sidecar/mic-vpio.swift +484 -0
- package/sidecar/mock-tts-server-tagged.mjs +132 -0
- package/sidecar/narration.ts +204 -0
- package/sidecar/scripts/generate-startup-audio.py +79 -0
- package/sidecar/session-lock.ts +123 -0
- package/sidecar/sherpa-onnx-node.d.ts +4 -0
- package/sidecar/stt.ts +199 -0
- package/sidecar/tts-server.py +193 -0
- package/sidecar/tts.ts +481 -0
- package/sidecar/twilio-audio.ts +338 -0
- package/sidecar/twilio-server.ts +436 -0
- package/sidecar/types.ts +210 -0
- package/sidecar/vad.ts +101 -0
- package/sidecar/voice-loop-bugs.test.ts +522 -0
- package/sidecar/voice-session.ts +523 -0
- package/skills/voice/SKILL.md +26 -0
- package/tsconfig.json +22 -0
|
@@ -0,0 +1,523 @@
|
|
|
1
|
+
/**
|
|
2
|
+
* Shared voice loop logic extracted from index.ts into a reusable session.
|
|
3
|
+
*
|
|
4
|
+
* Creates a voice session that wires all pipeline modules (VAD, STT, endpointing,
|
|
5
|
+
* Claude session, narration, TTS) using an AudioAdapter for transport-agnostic I/O.
|
|
6
|
+
* All state is closure-scoped inside createVoiceSession, allowing multiple independent
|
|
7
|
+
* sessions across processes.
|
|
8
|
+
*
|
|
9
|
+
* Responsibilities:
|
|
10
|
+
* - Initialize all voice pipeline modules from config
|
|
11
|
+
* - Run the voice loop state machine (IDLE -> LISTENING -> PROCESSING -> SPEAKING)
|
|
12
|
+
* - Route audio through VAD for speech detection and STT accumulation
|
|
13
|
+
* - Handle turn completion via endpointing, then send transcript to Claude
|
|
14
|
+
* - Stream Claude responses through narration into TTS playback
|
|
15
|
+
* - Detect user interruption during SPEAKING/PROCESSING state
|
|
16
|
+
* - Acquire a session lock at start, release on stop
|
|
17
|
+
*/
|
|
18
|
+
|
|
19
|
+
import { readFileSync } from "fs";
|
|
20
|
+
import { Writable } from "stream";
|
|
21
|
+
|
|
22
|
+
import { createVad } from "./vad.js";
|
|
23
|
+
import { createStt } from "./stt.js";
|
|
24
|
+
import { createEndpointer } from "./endpointing.js";
|
|
25
|
+
import { createClaudeSession } from "./claude-session.js";
|
|
26
|
+
import { createNarrator } from "./narration.js";
|
|
27
|
+
import { createTts } from "./tts.js";
|
|
28
|
+
import { acquireSessionLock } from "./session-lock.js";
|
|
29
|
+
|
|
30
|
+
import { dirname, join } from "path";
|
|
31
|
+
import { fileURLToPath } from "url";
|
|
32
|
+
|
|
33
|
+
import type { AudioAdapter } from "./audio-adapter.js";
|
|
34
|
+
import type { SessionLock } from "./session-lock.js";
|
|
35
|
+
import type { VadProcessor } from "./vad.js";
|
|
36
|
+
import type { SttProcessor } from "./stt.js";
|
|
37
|
+
import type { Endpointer } from "./endpointing.js";
|
|
38
|
+
import type { ClaudeSession } from "./claude-session.js";
|
|
39
|
+
import type { Narrator } from "./narration.js";
|
|
40
|
+
import type { TtsPlayer } from "./tts.js";
|
|
41
|
+
import type { VadEvent, VoiceLoopState, VoiceLoopStatus, TextChunk, EndpointingConfig, NarrationConfig, ClaudeSessionConfig } from "./types.js";
|
|
42
|
+
|
|
43
|
+
// ============================================================================
|
|
44
|
+
// CONSTANTS
|
|
45
|
+
// ============================================================================
|
|
46
|
+
|
|
47
|
+
/** Default max concurrent sessions (overridden by .env) */
|
|
48
|
+
const DEFAULT_MAX_SESSIONS = 2;
|
|
49
|
+
|
|
50
|
+
/** Pre-recorded startup greeting (24kHz 16-bit mono PCM). Null if file is missing. */
|
|
51
|
+
const __dirname = dirname(fileURLToPath(import.meta.url));
|
|
52
|
+
const STARTUP_PCM: Buffer | null = (() => {
|
|
53
|
+
try {
|
|
54
|
+
return readFileSync(join(__dirname, "assets", "startup.pcm"));
|
|
55
|
+
} catch {
|
|
56
|
+
return null;
|
|
57
|
+
}
|
|
58
|
+
})();
|
|
59
|
+
|
|
60
|
+
// ============================================================================
|
|
61
|
+
// INTERFACES
|
|
62
|
+
// ============================================================================
|
|
63
|
+
|
|
64
|
+
/**
 * Configuration for a voice session.
 * Like VoiceLoopConfig but without sampleRate (adapter concern),
 * and with onSessionEnd and interruptionThresholdMs added.
 */
export interface VoiceSessionConfig {
  /** Path to the sherpa-onnx Whisper ONNX model directory */
  sttModelPath: string;
  /** mlx-audio model ID for TTS (e.g. "prince-canuma/Kokoro-82M") */
  ttsModel: string;
  /** TTS voice ID (e.g. "af_heart" for Kokoro) */
  ttsVoice: string;
  /** Directory for cached model files */
  modelCacheDir: string;
  /**
   * Phrase that stops the voice session when spoken.
   * Matched case-insensitively as a substring of the final transcript.
   */
  stopPhrase: string;
  /**
   * Minimum sustained speech duration (ms) before interrupting TTS playback.
   * Speech that ends before this threshold is discarded as a non-interruption.
   */
  interruptionThresholdMs: number;
  /** Endpointing configuration for turn detection */
  endpointing: EndpointingConfig;
  /** Narration configuration for Claude response processing */
  narration: NarrationConfig;
  /** Claude Agent SDK session configuration */
  claudeSession: ClaudeSessionConfig;
  /** Called when the stop phrase is detected. Local path: process.exit(). Twilio: ws.close(). */
  onSessionEnd: () => void;
}
|
|
91
|
+
|
|
92
|
+
/**
 * Handle to a running voice session. Call stop() to tear down.
 */
export interface VoiceSession {
  /**
   * Gracefully shut down the session and release the session lock.
   * Idempotent: repeated calls after the first are no-ops.
   */
  stop: () => Promise<void>;
}
|
|
99
|
+
|
|
100
|
+
// ============================================================================
|
|
101
|
+
// MAIN ENTRYPOINT
|
|
102
|
+
// ============================================================================
|
|
103
|
+
|
|
104
|
+
/**
|
|
105
|
+
* Create and start a voice session using the given audio adapter and config.
|
|
106
|
+
*
|
|
107
|
+
* Acquires a session lock (throws if limit reached), initializes all pipeline
|
|
108
|
+
* modules (VAD, STT, endpointer, Claude session, narrator, TTS), subscribes
|
|
109
|
+
* to adapter audio, and starts the state machine.
|
|
110
|
+
*
|
|
111
|
+
* @param adapter - Audio I/O adapter (local mic or Twilio)
|
|
112
|
+
* @param config - Voice session configuration
|
|
113
|
+
* @returns A VoiceSession handle with stop()
|
|
114
|
+
* @throws Error if session limit reached or initialization fails
|
|
115
|
+
*/
|
|
116
|
+
export async function createVoiceSession(
|
|
117
|
+
adapter: AudioAdapter,
|
|
118
|
+
config: VoiceSessionConfig,
|
|
119
|
+
): Promise<VoiceSession> {
|
|
120
|
+
// Acquire session lock (throws if limit reached)
|
|
121
|
+
const maxSessions = parseInt(process.env.MAX_CONCURRENT_SESSIONS ?? "", 10) || DEFAULT_MAX_SESSIONS;
|
|
122
|
+
const lock: SessionLock = acquireSessionLock(maxSessions);
|
|
123
|
+
|
|
124
|
+
// ---- Closure-scoped state ----
|
|
125
|
+
let state: VoiceLoopState = { status: "idle", sessionId: null };
|
|
126
|
+
let accumulating = false;
|
|
127
|
+
let interruptionTimer: ReturnType<typeof setTimeout> | null = null;
|
|
128
|
+
let interrupted = false;
|
|
129
|
+
let stopping = false;
|
|
130
|
+
|
|
131
|
+
// Module instances
|
|
132
|
+
let vadProcessor: VadProcessor | null = null;
|
|
133
|
+
let sttProcessor: SttProcessor | null = null;
|
|
134
|
+
let endpointer: Endpointer | null = null;
|
|
135
|
+
let claudeSession: ClaudeSession | null = null;
|
|
136
|
+
let narrator: Narrator | null = null;
|
|
137
|
+
let ttsPlayer: TtsPlayer | null = null;
|
|
138
|
+
|
|
139
|
+
// ---- Helper functions (closure-scoped) ----
|
|
140
|
+
|
|
141
|
+
/** Clear the interruption detection timer if active. */
|
|
142
|
+
function clearInterruptionTimer(): void {
|
|
143
|
+
if (interruptionTimer !== null) {
|
|
144
|
+
clearTimeout(interruptionTimer);
|
|
145
|
+
interruptionTimer = null;
|
|
146
|
+
}
|
|
147
|
+
}
|
|
148
|
+
|
|
149
|
+
/**
|
|
150
|
+
* Pure function that computes the next voice loop state from the current
|
|
151
|
+
* state and an event. Prints state changes to stdout.
|
|
152
|
+
*
|
|
153
|
+
* @param from - Current voice loop state
|
|
154
|
+
* @param event - Event name triggering the transition
|
|
155
|
+
* @returns The new voice loop state
|
|
156
|
+
*/
|
|
157
|
+
function handleStateTransition(from: VoiceLoopState, event: string): VoiceLoopState {
|
|
158
|
+
let nextStatus: VoiceLoopStatus;
|
|
159
|
+
|
|
160
|
+
switch (event) {
|
|
161
|
+
case "init_complete":
|
|
162
|
+
nextStatus = "listening";
|
|
163
|
+
break;
|
|
164
|
+
case "transcript_complete":
|
|
165
|
+
nextStatus = "processing";
|
|
166
|
+
break;
|
|
167
|
+
case "first_audio":
|
|
168
|
+
nextStatus = "speaking";
|
|
169
|
+
break;
|
|
170
|
+
case "response_complete":
|
|
171
|
+
nextStatus = "listening";
|
|
172
|
+
break;
|
|
173
|
+
case "error":
|
|
174
|
+
nextStatus = "listening";
|
|
175
|
+
break;
|
|
176
|
+
case "user_interrupt":
|
|
177
|
+
nextStatus = "listening";
|
|
178
|
+
break;
|
|
179
|
+
default:
|
|
180
|
+
nextStatus = from.status;
|
|
181
|
+
}
|
|
182
|
+
|
|
183
|
+
if (nextStatus !== from.status) {
|
|
184
|
+
const label = nextStatus.charAt(0).toUpperCase() + nextStatus.slice(1);
|
|
185
|
+
console.log(`${label}...`);
|
|
186
|
+
}
|
|
187
|
+
|
|
188
|
+
return { status: nextStatus, sessionId: from.sessionId };
|
|
189
|
+
}
|
|
190
|
+
|
|
191
|
+
// ---- Main logic functions (closure-scoped) ----
|
|
192
|
+
|
|
193
|
+
/**
|
|
194
|
+
* Handle a Float32Array audio chunk from the adapter.
|
|
195
|
+
* Feeds audio to VAD and accumulates for STT during speech.
|
|
196
|
+
*
|
|
197
|
+
* @param samples - Float32Array of normalized audio samples
|
|
198
|
+
*/
|
|
199
|
+
async function handleAudioChunk(samples: Float32Array): Promise<void> {
|
|
200
|
+
if (!vadProcessor) return;
|
|
201
|
+
|
|
202
|
+
// If we're in a speech segment, accumulate for STT
|
|
203
|
+
if (accumulating && sttProcessor) {
|
|
204
|
+
sttProcessor.accumulate(samples);
|
|
205
|
+
}
|
|
206
|
+
|
|
207
|
+
// Feed raw audio to VAD -- it handles framing internally (512 samples)
|
|
208
|
+
// and fires events via the handleVadEvent callback
|
|
209
|
+
await vadProcessor.processAudio(samples);
|
|
210
|
+
}
|
|
211
|
+
|
|
212
|
+
/**
|
|
213
|
+
* Handle VAD events dispatched by avr-vad callbacks.
|
|
214
|
+
* Routes to the appropriate handler based on current state.
|
|
215
|
+
*
|
|
216
|
+
* @param event - The VAD event (SPEECH_START, SPEECH_CONTINUE, SPEECH_END)
|
|
217
|
+
*/
|
|
218
|
+
function handleVadEvent(event: VadEvent): void {
|
|
219
|
+
if (state.status === "listening") {
|
|
220
|
+
handleListeningVadEvent(event);
|
|
221
|
+
} else if (state.status === "speaking" || state.status === "processing") {
|
|
222
|
+
handleInterruptionDetection(event);
|
|
223
|
+
}
|
|
224
|
+
}
|
|
225
|
+
|
|
226
|
+
/**
|
|
227
|
+
* Handle VAD events while in the LISTENING state.
|
|
228
|
+
*
|
|
229
|
+
* @param event - The VAD event
|
|
230
|
+
*/
|
|
231
|
+
function handleListeningVadEvent(event: VadEvent): void {
|
|
232
|
+
if (event.type === "SPEECH_START") {
|
|
233
|
+
console.log("Hearing speech...");
|
|
234
|
+
accumulating = true;
|
|
235
|
+
return;
|
|
236
|
+
}
|
|
237
|
+
|
|
238
|
+
if (event.type === "SPEECH_CONTINUE") {
|
|
239
|
+
// Already accumulating, nothing to do
|
|
240
|
+
return;
|
|
241
|
+
}
|
|
242
|
+
|
|
243
|
+
if (event.type === "SPEECH_END") {
|
|
244
|
+
accumulating = false;
|
|
245
|
+
handleSpeechEnd(event).catch((err) => {
|
|
246
|
+
console.error(`Error handling speech end: ${err}`);
|
|
247
|
+
});
|
|
248
|
+
}
|
|
249
|
+
}
|
|
250
|
+
|
|
251
|
+
/**
|
|
252
|
+
* Handle the end of a speech segment: transcribe and check endpointing.
|
|
253
|
+
*
|
|
254
|
+
* @param event - The SPEECH_END VAD event
|
|
255
|
+
*/
|
|
256
|
+
async function handleSpeechEnd(event: VadEvent): Promise<void> {
|
|
257
|
+
if (!sttProcessor || !endpointer) return;
|
|
258
|
+
|
|
259
|
+
console.log("Transcribing...");
|
|
260
|
+
const result = await sttProcessor.transcribe();
|
|
261
|
+
|
|
262
|
+
if (!result.text.trim()) {
|
|
263
|
+
console.log("(empty transcription, continuing)");
|
|
264
|
+
endpointer.reset();
|
|
265
|
+
return;
|
|
266
|
+
}
|
|
267
|
+
|
|
268
|
+
console.log(`Heard: "${result.text}"`);
|
|
269
|
+
|
|
270
|
+
// Check endpointing decision
|
|
271
|
+
const decision = await endpointer.onVadEvent(event, result.text);
|
|
272
|
+
|
|
273
|
+
if (decision.isComplete) {
|
|
274
|
+
endpointer.reset();
|
|
275
|
+
await handleCompleteTurn(result.text);
|
|
276
|
+
}
|
|
277
|
+
// If not complete, keep listening for more speech
|
|
278
|
+
}
|
|
279
|
+
|
|
280
|
+
/**
|
|
281
|
+
* Handle a completed user turn: check for stop phrase, then send to Claude.
|
|
282
|
+
*
|
|
283
|
+
* @param transcript - The finalized transcript text
|
|
284
|
+
*/
|
|
285
|
+
async function handleCompleteTurn(transcript: string): Promise<void> {
|
|
286
|
+
// Check for stop phrase
|
|
287
|
+
if (transcript.toLowerCase().includes(config.stopPhrase.toLowerCase())) {
|
|
288
|
+
config.onSessionEnd();
|
|
289
|
+
return;
|
|
290
|
+
}
|
|
291
|
+
|
|
292
|
+
console.log(`Transcript: "${transcript}"`);
|
|
293
|
+
|
|
294
|
+
// Transition to PROCESSING
|
|
295
|
+
state = handleStateTransition(state, "transcript_complete");
|
|
296
|
+
|
|
297
|
+
// Start processing Claude response (runs concurrently with audio events)
|
|
298
|
+
processClaudeResponse(transcript).catch((err) => {
|
|
299
|
+
console.error(`Error processing Claude response: ${err}`);
|
|
300
|
+
state = handleStateTransition(state, "error");
|
|
301
|
+
});
|
|
302
|
+
}
|
|
303
|
+
|
|
304
|
+
/**
|
|
305
|
+
* Send transcript to Claude, stream the response through narration and TTS.
|
|
306
|
+
*
|
|
307
|
+
* @param transcript - The user's transcribed speech
|
|
308
|
+
*/
|
|
309
|
+
async function processClaudeResponse(transcript: string): Promise<void> {
|
|
310
|
+
if (!claudeSession || !narrator || !ttsPlayer) {
|
|
311
|
+
throw new Error("Modules not initialized");
|
|
312
|
+
}
|
|
313
|
+
|
|
314
|
+
interrupted = false;
|
|
315
|
+
narrator.reset();
|
|
316
|
+
|
|
317
|
+
// Async generator that yields text chunks from Claude -> narrator
|
|
318
|
+
const session = claudeSession;
|
|
319
|
+
const narr = narrator;
|
|
320
|
+
const player = ttsPlayer;
|
|
321
|
+
async function* textChunks(): AsyncGenerator<TextChunk> {
|
|
322
|
+
const eventStream = session.sendMessage(transcript);
|
|
323
|
+
|
|
324
|
+
for await (const event of eventStream) {
|
|
325
|
+
if (interrupted) return;
|
|
326
|
+
|
|
327
|
+
// Tool narration is a complete sentence -- tag it for immediate TTS
|
|
328
|
+
const isToolEvent = event.type === "tool_start" || event.type === "tool_end";
|
|
329
|
+
const chunks = narr.processEvent(event);
|
|
330
|
+
for (const chunk of chunks) {
|
|
331
|
+
if (interrupted) return;
|
|
332
|
+
yield isToolEvent ? { text: chunk, flush: true } : chunk;
|
|
333
|
+
}
|
|
334
|
+
}
|
|
335
|
+
|
|
336
|
+
if (interrupted) return;
|
|
337
|
+
|
|
338
|
+
const remaining = narr.flush();
|
|
339
|
+
for (const chunk of remaining) {
|
|
340
|
+
if (interrupted) return;
|
|
341
|
+
yield chunk;
|
|
342
|
+
}
|
|
343
|
+
}
|
|
344
|
+
|
|
345
|
+
// Transition to SPEAKING before starting the stream
|
|
346
|
+
state = handleStateTransition(state, "first_audio");
|
|
347
|
+
|
|
348
|
+
await ttsPlayer.speakStream(textChunks());
|
|
349
|
+
|
|
350
|
+
if (interrupted) {
|
|
351
|
+
console.log("[debug] Response interrupted, bailing out");
|
|
352
|
+
return;
|
|
353
|
+
}
|
|
354
|
+
|
|
355
|
+
console.log("[debug] Response processing complete");
|
|
356
|
+
adapter.playChime();
|
|
357
|
+
|
|
358
|
+
// Transition back to LISTENING
|
|
359
|
+
state = handleStateTransition(state, "response_complete");
|
|
360
|
+
|
|
361
|
+
// Reset VAD and endpointer for the next turn
|
|
362
|
+
if (vadProcessor) vadProcessor.reset();
|
|
363
|
+
if (endpointer) endpointer.reset();
|
|
364
|
+
accumulating = false;
|
|
365
|
+
clearInterruptionTimer();
|
|
366
|
+
}
|
|
367
|
+
|
|
368
|
+
/**
|
|
369
|
+
* Detect sustained speech during SPEAKING/PROCESSING state for interruption.
|
|
370
|
+
*
|
|
371
|
+
* @param event - The VAD event to evaluate
|
|
372
|
+
*/
|
|
373
|
+
function handleInterruptionDetection(event: VadEvent): void {
|
|
374
|
+
if (event.type === "SPEECH_START") {
|
|
375
|
+
if (interruptionTimer === null) {
|
|
376
|
+
// Start capturing audio immediately so we have the full utterance if this
|
|
377
|
+
// turns out to be an interruption
|
|
378
|
+
if (sttProcessor) sttProcessor.clearBuffer();
|
|
379
|
+
accumulating = true;
|
|
380
|
+
|
|
381
|
+
interruptionTimer = setTimeout(() => {
|
|
382
|
+
interruptionTimer = null;
|
|
383
|
+
triggerInterruption();
|
|
384
|
+
}, config.interruptionThresholdMs);
|
|
385
|
+
}
|
|
386
|
+
return;
|
|
387
|
+
}
|
|
388
|
+
|
|
389
|
+
if (event.type === "SPEECH_END") {
|
|
390
|
+
// Speech ended before threshold -- not an interruption, discard audio
|
|
391
|
+
clearInterruptionTimer();
|
|
392
|
+
accumulating = false;
|
|
393
|
+
if (sttProcessor) sttProcessor.clearBuffer();
|
|
394
|
+
}
|
|
395
|
+
}
|
|
396
|
+
|
|
397
|
+
/**
|
|
398
|
+
* Interrupt TTS playback and Claude session, transition back to LISTENING.
|
|
399
|
+
*/
|
|
400
|
+
function triggerInterruption(): void {
|
|
401
|
+
console.log("User interruption detected");
|
|
402
|
+
|
|
403
|
+
interrupted = true;
|
|
404
|
+
if (ttsPlayer) ttsPlayer.interrupt();
|
|
405
|
+
if (claudeSession) claudeSession.interrupt();
|
|
406
|
+
|
|
407
|
+
clearInterruptionTimer();
|
|
408
|
+
// Keep accumulating -- user is still speaking. Buffer already has audio from
|
|
409
|
+
// SPEECH_START onwards, so the full utterance will be transcribed on SPEECH_END.
|
|
410
|
+
|
|
411
|
+
state = handleStateTransition(state, "user_interrupt");
|
|
412
|
+
}
|
|
413
|
+
|
|
414
|
+
/**
|
|
415
|
+
* Gracefully shut down the voice session and release all resources.
|
|
416
|
+
*/
|
|
417
|
+
async function stop(): Promise<void> {
|
|
418
|
+
if (stopping) return;
|
|
419
|
+
stopping = true;
|
|
420
|
+
|
|
421
|
+
adapter.destroy();
|
|
422
|
+
|
|
423
|
+
if (vadProcessor) {
|
|
424
|
+
vadProcessor.destroy();
|
|
425
|
+
vadProcessor = null;
|
|
426
|
+
}
|
|
427
|
+
|
|
428
|
+
if (sttProcessor) {
|
|
429
|
+
sttProcessor.destroy();
|
|
430
|
+
sttProcessor = null;
|
|
431
|
+
}
|
|
432
|
+
|
|
433
|
+
if (ttsPlayer) {
|
|
434
|
+
ttsPlayer.destroy();
|
|
435
|
+
ttsPlayer = null;
|
|
436
|
+
}
|
|
437
|
+
|
|
438
|
+
if (claudeSession) {
|
|
439
|
+
await claudeSession.close();
|
|
440
|
+
claudeSession = null;
|
|
441
|
+
}
|
|
442
|
+
|
|
443
|
+
endpointer = null;
|
|
444
|
+
narrator = null;
|
|
445
|
+
accumulating = false;
|
|
446
|
+
clearInterruptionTimer();
|
|
447
|
+
|
|
448
|
+
state = { status: "idle", sessionId: null };
|
|
449
|
+
|
|
450
|
+
lock.release();
|
|
451
|
+
|
|
452
|
+
console.log("Voice session stopped");
|
|
453
|
+
}
|
|
454
|
+
|
|
455
|
+
// ---- Initialization ----
|
|
456
|
+
|
|
457
|
+
// Fire-and-forget the startup greeting so it plays while modules initialize.
|
|
458
|
+
// Short delay lets the audio device settle before playback.
|
|
459
|
+
if (STARTUP_PCM) {
|
|
460
|
+
setTimeout(() => {
|
|
461
|
+
adapter.writeSpeaker(STARTUP_PCM).catch((err) => {
|
|
462
|
+
console.error(`Failed to play startup audio: ${err}`);
|
|
463
|
+
});
|
|
464
|
+
}, 1000);
|
|
465
|
+
}
|
|
466
|
+
|
|
467
|
+
// Wrap adapter.writeSpeaker in a Node.js Writable stream for TTS config
|
|
468
|
+
const speakerWritable = new Writable({
|
|
469
|
+
write(chunk: Buffer, _encoding: string, callback: (err?: Error | null) => void) {
|
|
470
|
+
adapter.writeSpeaker(chunk).then(() => callback(), callback);
|
|
471
|
+
},
|
|
472
|
+
});
|
|
473
|
+
|
|
474
|
+
// Claude session and TTS are the slowest to initialize (process spawns + model
|
|
475
|
+
// loading). Run them in parallel since they are independent.
|
|
476
|
+
console.log("Initializing Claude session + TTS in parallel...");
|
|
477
|
+
const [claudeResult, ttsResult] = await Promise.all([
|
|
478
|
+
createClaudeSession(config.claudeSession),
|
|
479
|
+
createTts({
|
|
480
|
+
model: config.ttsModel,
|
|
481
|
+
voice: config.ttsVoice,
|
|
482
|
+
speakerInput: speakerWritable,
|
|
483
|
+
interruptPlayback: () => adapter.interrupt(),
|
|
484
|
+
resumePlayback: () => adapter.resume(),
|
|
485
|
+
}),
|
|
486
|
+
]);
|
|
487
|
+
claudeSession = claudeResult;
|
|
488
|
+
ttsPlayer = ttsResult;
|
|
489
|
+
|
|
490
|
+
// VAD and STT both load ONNX runtimes -- keep them sequential to avoid
|
|
491
|
+
// native library conflicts within the same Node process.
|
|
492
|
+
console.log("Initializing VAD...");
|
|
493
|
+
vadProcessor = await createVad(handleVadEvent);
|
|
494
|
+
|
|
495
|
+
console.log("Initializing STT...");
|
|
496
|
+
sttProcessor = await createStt(config.sttModelPath);
|
|
497
|
+
|
|
498
|
+
console.log("Initializing endpointer...");
|
|
499
|
+
endpointer = createEndpointer(config.endpointing);
|
|
500
|
+
|
|
501
|
+
console.log("Initializing narrator...");
|
|
502
|
+
narrator = createNarrator(config.narration, async (summary: string) => {
|
|
503
|
+
if (interrupted || !ttsPlayer) return;
|
|
504
|
+
await ttsPlayer.speakStream((async function*() {
|
|
505
|
+
yield { text: summary, flush: true };
|
|
506
|
+
})());
|
|
507
|
+
});
|
|
508
|
+
|
|
509
|
+
console.log("Voice mode active");
|
|
510
|
+
adapter.playChime();
|
|
511
|
+
|
|
512
|
+
// Transition to LISTENING
|
|
513
|
+
state = handleStateTransition(state, "init_complete");
|
|
514
|
+
|
|
515
|
+
// Subscribe to audio from the adapter
|
|
516
|
+
adapter.onAudio((samples: Float32Array) => {
|
|
517
|
+
handleAudioChunk(samples).catch((err) => {
|
|
518
|
+
console.error(`Error processing audio chunk: ${err}`);
|
|
519
|
+
});
|
|
520
|
+
});
|
|
521
|
+
|
|
522
|
+
return { stop };
|
|
523
|
+
}
|
|
@@ -0,0 +1,26 @@
|
|
|
1
|
+
---
|
|
2
|
+
name: voice
|
|
3
|
+
description: Start voice mode for hands-free interaction via microphone
|
|
4
|
+
allowed-tools: ["Bash"]
|
|
5
|
+
---
|
|
6
|
+
|
|
7
|
+
# Voice Mode
|
|
8
|
+
|
|
9
|
+
Start the voice sidecar process. This launches a local voice loop that captures microphone input, transcribes speech locally, sends it to Claude Code, and speaks responses aloud using local TTS.
|
|
10
|
+
|
|
11
|
+
## Instructions
|
|
12
|
+
|
|
13
|
+
Run the following command via the Bash tool:
|
|
14
|
+
|
|
15
|
+
```bash
|
|
16
|
+
npx tsx "${PLUGIN_ROOT}/sidecar/index.ts"
|
|
17
|
+
```
|
|
18
|
+
|
|
19
|
+
Where `${PLUGIN_ROOT}` is the directory containing this plugin (the directory with `package.json`). The sidecar runs as a **foreground process** and blocks until the user says "stop listening" or presses Ctrl+C.
|
|
20
|
+
|
|
21
|
+
**Prerequisites:**
|
|
22
|
+
- `sox` must be installed on the system (e.g. `brew install sox` on macOS)
|
|
23
|
+
- A Whisper ONNX model must be downloaded to `~/.claude-voice-models/`
|
|
24
|
+
- Headphones are recommended (no echo cancellation in v1)
|
|
25
|
+
|
|
26
|
+
Do not run this in the background. The process must stay in the foreground so the user can interact with it.
|
package/tsconfig.json
ADDED
|
@@ -0,0 +1,22 @@
|
|
|
1
|
+
{
|
|
2
|
+
"compilerOptions": {
|
|
3
|
+
"target": "ES2022",
|
|
4
|
+
"module": "NodeNext",
|
|
5
|
+
"moduleResolution": "NodeNext",
|
|
6
|
+
"strict": true,
|
|
7
|
+
"esModuleInterop": true,
|
|
8
|
+
"skipLibCheck": true,
|
|
9
|
+
"forceConsistentCasingInFileNames": true,
|
|
10
|
+
"outDir": "dist",
|
|
11
|
+
"declaration": true,
|
|
12
|
+
"sourceMap": true
|
|
13
|
+
},
|
|
14
|
+
"include": [
|
|
15
|
+
"run.ts",
|
|
16
|
+
"sidecar/**/*.ts",
|
|
17
|
+
"dashboard/**/*.ts",
|
|
18
|
+
"services/**/*.ts",
|
|
19
|
+
"dashboard/routes/**/*.ts"
|
|
20
|
+
],
|
|
21
|
+
"exclude": ["node_modules", "dist", "dashboard/src"]
|
|
22
|
+
}
|