voicecc 1.0.7

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (59) hide show
  1. package/.claude-plugin/plugin.json +6 -0
  2. package/README.md +48 -0
  3. package/bin/voicecc.js +39 -0
  4. package/dashboard/dist/assets/index-BXemFrMp.css +1 -0
  5. package/dashboard/dist/assets/index-dAYfRls7.js +11 -0
  6. package/dashboard/dist/audio-processor.js +126 -0
  7. package/dashboard/dist/index.html +13 -0
  8. package/dashboard/routes/auth.ts +119 -0
  9. package/dashboard/routes/browser-call.ts +87 -0
  10. package/dashboard/routes/claude-md.ts +50 -0
  11. package/dashboard/routes/conversations.ts +203 -0
  12. package/dashboard/routes/integrations.ts +154 -0
  13. package/dashboard/routes/mcp-servers.ts +198 -0
  14. package/dashboard/routes/settings.ts +64 -0
  15. package/dashboard/routes/tunnel.ts +66 -0
  16. package/dashboard/routes/twilio.ts +120 -0
  17. package/dashboard/routes/voice.ts +48 -0
  18. package/dashboard/routes/webrtc.ts +85 -0
  19. package/dashboard/server.ts +130 -0
  20. package/dashboard/tsconfig.json +13 -0
  21. package/init/CLAUDE.md +18 -0
  22. package/package.json +59 -0
  23. package/run.ts +68 -0
  24. package/scripts/postinstall.js +228 -0
  25. package/services/browser-call-manager.ts +106 -0
  26. package/services/device-pairing.ts +176 -0
  27. package/services/env.ts +88 -0
  28. package/services/tunnel.ts +204 -0
  29. package/services/twilio-manager.ts +126 -0
  30. package/sidecar/assets/startup.pcm +0 -0
  31. package/sidecar/audio-adapter.ts +60 -0
  32. package/sidecar/audio-capture.ts +220 -0
  33. package/sidecar/browser-audio-playback.test.ts +149 -0
  34. package/sidecar/browser-audio.ts +147 -0
  35. package/sidecar/browser-server.ts +331 -0
  36. package/sidecar/chime.test.ts +69 -0
  37. package/sidecar/chime.ts +54 -0
  38. package/sidecar/claude-session.ts +295 -0
  39. package/sidecar/endpointing.ts +163 -0
  40. package/sidecar/index.ts +83 -0
  41. package/sidecar/local-audio.ts +126 -0
  42. package/sidecar/mic-vpio +0 -0
  43. package/sidecar/mic-vpio.swift +484 -0
  44. package/sidecar/mock-tts-server-tagged.mjs +132 -0
  45. package/sidecar/narration.ts +204 -0
  46. package/sidecar/scripts/generate-startup-audio.py +79 -0
  47. package/sidecar/session-lock.ts +123 -0
  48. package/sidecar/sherpa-onnx-node.d.ts +4 -0
  49. package/sidecar/stt.ts +199 -0
  50. package/sidecar/tts-server.py +193 -0
  51. package/sidecar/tts.ts +481 -0
  52. package/sidecar/twilio-audio.ts +338 -0
  53. package/sidecar/twilio-server.ts +436 -0
  54. package/sidecar/types.ts +210 -0
  55. package/sidecar/vad.ts +101 -0
  56. package/sidecar/voice-loop-bugs.test.ts +522 -0
  57. package/sidecar/voice-session.ts +523 -0
  58. package/skills/voice/SKILL.md +26 -0
  59. package/tsconfig.json +22 -0
@@ -0,0 +1,295 @@
1
+ /**
2
+ * Claude session via the @anthropic-ai/claude-code SDK.
3
+ *
4
+ * Keeps a single persistent Claude Code process alive across turns using
5
+ * streaming I/O (AsyncIterable<SDKUserMessage> input). This eliminates the
6
+ * ~2-3s process spawn overhead on each turn.
7
+ *
8
+ * Responsibilities:
9
+ * - Start a persistent query() on createClaudeSession (process spawns once)
10
+ * - Push user messages into the live session via an async queue
11
+ * - Extract streaming text deltas from SDKPartialAssistantMessage events
12
+ * - Map tool_use content blocks to tool_start / tool_end events
13
+ * - Support interruption via query.interrupt()
14
+ * - Provide clean session teardown
15
+ */
16
+
17
+ import { query as claudeQuery, type Query, type Options, type SDKMessage, type SDKUserMessage } from "@anthropic-ai/claude-code";
18
+ import type { ClaudeSessionConfig, ClaudeStreamEvent } from "./types.js";
19
+
20
+ /** Injectable query function signature for testing. Matches the SDK query() contract. */
21
+ export type QueryFn = (params: { prompt: AsyncIterable<SDKUserMessage>; options: Options }) => Query;
22
+
23
+ // ============================================================================
24
+ // ASYNC QUEUE
25
+ // ============================================================================
26
+
27
+ /** Simple async iterable backed by a push queue. */
28
+ class AsyncQueue<T> implements AsyncIterable<T> {
29
+ private buf: T[] = [];
30
+ private resolve: ((r: IteratorResult<T>) => void) | null = null;
31
+ private done = false;
32
+
33
+ push(item: T) {
34
+ if (this.resolve) {
35
+ const r = this.resolve;
36
+ this.resolve = null;
37
+ r({ value: item, done: false });
38
+ } else {
39
+ this.buf.push(item);
40
+ }
41
+ }
42
+
43
+ close() {
44
+ this.done = true;
45
+ if (this.resolve) {
46
+ const r = this.resolve;
47
+ this.resolve = null;
48
+ r({ value: undefined as any, done: true });
49
+ }
50
+ }
51
+
52
+ /** Discard all buffered items. Used to clear stale events after interruption. */
53
+ drain(): void {
54
+ this.buf.length = 0;
55
+ }
56
+
57
+ /** Read one item (used by sendMessage to drain the event channel). */
58
+ async next(): Promise<T | undefined> {
59
+ if (this.buf.length > 0) return this.buf.shift()!;
60
+ if (this.done) return undefined;
61
+ const result = await new Promise<IteratorResult<T>>((r) => { this.resolve = r; });
62
+ return result.done ? undefined : result.value;
63
+ }
64
+
65
+ [Symbol.asyncIterator](): AsyncIterator<T> {
66
+ return {
67
+ next: (): Promise<IteratorResult<T>> => {
68
+ if (this.buf.length > 0) {
69
+ return Promise.resolve({ value: this.buf.shift()!, done: false as const });
70
+ }
71
+ if (this.done) {
72
+ return Promise.resolve({ value: undefined as any, done: true as const });
73
+ }
74
+ return new Promise<IteratorResult<T>>((r) => { this.resolve = r; });
75
+ },
76
+ };
77
+ }
78
+ }
79
+
80
+ // ============================================================================
81
+ // INTERFACES
82
+ // ============================================================================
83
+
84
/** Session object returned by createClaudeSession. */
interface ClaudeSession {
  /**
   * Send one user turn into the persistent session.
   * Yields streaming events (text deltas, tool start/end, errors) and
   * finishes with a "result" event when the turn completes.
   */
  sendMessage(text: string): AsyncIterable<ClaudeStreamEvent>;
  /** Interrupt the in-flight turn (maps to the SDK's query.interrupt()). */
  interrupt(): void;
  /** Tear down the session; no further sendMessage calls are allowed after this. */
  close(): Promise<void>;
}
90
+
91
+ // ============================================================================
92
+ // CONSTANTS
93
+ // ============================================================================
94
+
95
+ const CLAUDE_BIN = "/Users/Focus/.local/bin/claude";
96
+
97
+ const DEFAULT_SYSTEM_PROMPT =
98
+ "Respond concisely. You are in voice mode -- your responses will be spoken aloud. Keep answers conversational and brief.";
99
+
100
+ // ============================================================================
101
+ // MAIN HANDLERS
102
+ // ============================================================================
103
+
104
+ async function createClaudeSession(
105
+ config: ClaudeSessionConfig,
106
+ queryOverride?: QueryFn,
107
+ ): Promise<ClaudeSession> {
108
+ const systemPrompt = config.systemPrompt || DEFAULT_SYSTEM_PROMPT;
109
+ let sessionId = "";
110
+ let closed = false;
111
+ let lastTurnCompletedCleanly = true;
112
+
113
+ // Persistent input stream — user messages are pushed here across turns
114
+ const userMessages = new AsyncQueue<SDKUserMessage>();
115
+
116
+ // Event channel — SDK events are routed here for sendMessage to consume
117
+ const sdkEvents = new AsyncQueue<SDKMessage>();
118
+
119
+ const options: Options = {
120
+ pathToClaudeCodeExecutable: CLAUDE_BIN,
121
+ includePartialMessages: true,
122
+ maxThinkingTokens: 10000,
123
+ appendSystemPrompt: systemPrompt,
124
+ permissionMode: config.permissionMode as Options["permissionMode"],
125
+ stderr: (data: string) => {
126
+ const msg = data.trim();
127
+ if (msg) console.error(`[claude-stderr] ${msg}`);
128
+ },
129
+ };
130
+
131
+ // Start persistent query — process spawns once and stays alive.
132
+ // NOTE: with AsyncIterable<SDKUserMessage> input, the SDK won't yield
133
+ // events until the first user message is consumed, so we don't block
134
+ // waiting for a system init event here. Session ID is captured when
135
+ // the system event arrives during the first turn.
136
+ const queryFn = queryOverride ?? claudeQuery;
137
+ const q = queryFn({ prompt: userMessages, options });
138
+
139
+ // Background: pump SDK events into our channel
140
+ (async () => {
141
+ try {
142
+ for await (const msg of q) {
143
+ if (msg.type === "system" && !sessionId) {
144
+ sessionId = msg.session_id;
145
+ console.log(`[claude] session ready (id=${sessionId})`);
146
+ }
147
+ sdkEvents.push(msg);
148
+ }
149
+ } catch (err) {
150
+ console.error("[claude] SDK pump error:", err);
151
+ } finally {
152
+ sdkEvents.close();
153
+ }
154
+ })();
155
+
156
+ console.log("[claude] persistent process started");
157
+
158
+ return {
159
+ async *sendMessage(text: string): AsyncIterable<ClaudeStreamEvent> {
160
+ if (closed) {
161
+ throw new Error("Session is closed.");
162
+ }
163
+
164
+ if (!text.trim()) {
165
+ throw new Error("Cannot send empty message.");
166
+ }
167
+
168
+ // If the previous turn was interrupted, consume remaining events until its result
169
+ if (!lastTurnCompletedCleanly) {
170
+ while (true) {
171
+ const msg = await sdkEvents.next();
172
+ if (!msg || msg.type === "result") break;
173
+ }
174
+ }
175
+ sdkEvents.drain();
176
+ lastTurnCompletedCleanly = false;
177
+
178
+ const t0 = Date.now();
179
+ let hasStreamedContent = false;
180
+ const toolUseBlocks = new Set<number>();
181
+ const thinkingBlocks = new Set<number>();
182
+
183
+ // Push user message into the live session
184
+ userMessages.push({
185
+ type: "user",
186
+ message: { content: text, role: "user" },
187
+ parent_tool_use_id: null,
188
+ session_id: sessionId,
189
+ });
190
+
191
+ // Read events for this turn until result
192
+ while (true) {
193
+ const msg = await sdkEvents.next();
194
+ if (!msg) break; // channel closed (process died)
195
+
196
+ // Streaming events (token-level deltas)
197
+ if (msg.type === "stream_event") {
198
+ const event = msg.event;
199
+
200
+ if (event.type === "content_block_start") {
201
+ if (event.content_block.type === "tool_use") {
202
+ hasStreamedContent = true;
203
+ toolUseBlocks.add(event.index);
204
+ yield { type: "tool_start", content: "", toolName: event.content_block.name };
205
+ }
206
+ if (event.content_block.type === "thinking") {
207
+ hasStreamedContent = true;
208
+ thinkingBlocks.add(event.index);
209
+ console.log(`[claude] thinking started at +${Date.now() - t0}ms`);
210
+ yield { type: "text_delta", content: "Thinking... " };
211
+ }
212
+ continue;
213
+ }
214
+
215
+ if (event.type === "content_block_delta") {
216
+ if (event.delta.type === "text_delta") {
217
+ if (!hasStreamedContent) {
218
+ console.log(`[claude] first delta at +${Date.now() - t0}ms`);
219
+ }
220
+ hasStreamedContent = true;
221
+ yield { type: "text_delta", content: event.delta.text };
222
+ }
223
+ continue;
224
+ }
225
+
226
+ if (event.type === "content_block_stop") {
227
+ if (thinkingBlocks.has(event.index)) {
228
+ thinkingBlocks.delete(event.index);
229
+ console.log(`[claude] thinking ended at +${Date.now() - t0}ms`);
230
+ }
231
+ if (toolUseBlocks.has(event.index)) {
232
+ toolUseBlocks.delete(event.index);
233
+ yield { type: "tool_end", content: "" };
234
+ }
235
+ continue;
236
+ }
237
+
238
+ continue;
239
+ }
240
+
241
+ // Full assistant message — fallback if streaming didn't produce deltas
242
+ if (msg.type === "assistant") {
243
+ if (hasStreamedContent) {
244
+ console.log(`[claude] full message at +${Date.now() - t0}ms (skipped, already streamed)`);
245
+ } else {
246
+ console.log(`[claude] full message at +${Date.now() - t0}ms (no streaming, using fallback)`);
247
+ const blocks = msg.message.content;
248
+ if (Array.isArray(blocks)) {
249
+ for (const block of blocks) {
250
+ if (block.type === "text") {
251
+ yield { type: "text_delta", content: block.text };
252
+ }
253
+ if (block.type === "tool_use") {
254
+ yield { type: "tool_start", content: "", toolName: block.name };
255
+ }
256
+ }
257
+ }
258
+ }
259
+ toolUseBlocks.clear();
260
+ continue;
261
+ }
262
+
263
+ // Skip system events and synthetic user messages (tool results)
264
+ if (msg.type === "system" || msg.type === "user") {
265
+ continue;
266
+ }
267
+
268
+ // Result — turn complete
269
+ if (msg.type === "result") {
270
+ lastTurnCompletedCleanly = true;
271
+ console.log(`[claude] result at +${Date.now() - t0}ms (streamed=${hasStreamedContent})`);
272
+ if (msg.is_error) {
273
+ yield { type: "error", content: msg.subtype === "success" ? String((msg as any).result) : msg.subtype };
274
+ }
275
+ break;
276
+ }
277
+ }
278
+
279
+ yield { type: "result", content: "" };
280
+ },
281
+
282
+ interrupt(): void {
283
+ q.interrupt();
284
+ },
285
+
286
+ async close(): Promise<void> {
287
+ closed = true;
288
+ userMessages.close();
289
+ await q.interrupt();
290
+ },
291
+ };
292
+ }
293
+
294
+ export { createClaudeSession };
295
+ export type { ClaudeSession };
@@ -0,0 +1,163 @@
1
+ /**
2
+ * Endpointing module -- determines when the user is done speaking.
3
+ *
4
+ * Uses a two-tier approach to decide turn completion:
5
+ * - Fast path: VAD silence duration + sufficient word count (0ms latency)
6
+ * - Slow path: Haiku semantic check for short/ambiguous utterances (~200ms)
7
+ * - Timeout path: Forces completion after extended silence regardless of content
8
+ *
9
+ * Responsibilities:
10
+ * - Track silence duration from VAD events
11
+ * - Apply fast-path completion for longer utterances
12
+ * - Call Haiku API for semantic turn-completion on short utterances
13
+ * - Force timeout after extended silence
14
+ * - Reset state between turns
15
+ */
16
+
17
+ import Anthropic from "@anthropic-ai/sdk";
18
+ import type { EndpointDecision, EndpointingConfig, VadEvent } from "./types.js";
19
+
20
+ // ============================================================================
21
+ // CONSTANTS
22
+ // ============================================================================
23
+
24
/** Haiku model used for the semantic turn-completion check. */
const HAIKU_MODEL = "claude-haiku-4-5-20251001";
/** The check answers only "yes"/"no", so a tiny completion budget suffices. */
const HAIKU_MAX_TOKENS = 10;
26
+
27
+ // ============================================================================
28
+ // INTERFACES
29
+ // ============================================================================
30
+
31
/**
 * Endpointer that processes VAD events and decides when the user is done speaking.
 */
export interface Endpointer {
  /**
   * Process a VAD event and determine if the user's turn is complete.
   * @param event - The VAD event from the voice activity detector
   * @param currentTranscript - The accumulated transcript so far
   * @returns Decision on whether the user has finished speaking
   */
  onVadEvent(event: VadEvent, currentTranscript: string): Promise<EndpointDecision>;

  /**
   * Reset internal state for a new turn.
   * (A no-op in the default implementation — see createEndpointer.)
   */
  reset(): void;
}
48
+
49
+ // ============================================================================
50
+ // MAIN ENTRYPOINT
51
+ // ============================================================================
52
+
53
/**
 * Create an endpointer instance with the given configuration.
 *
 * The Anthropic client is constructed eagerly (only when the Haiku fallback
 * is enabled) so per-event handling never pays construction cost.
 *
 * NOTE(review): config.silenceThresholdMs and config.maxSilenceBeforeTimeoutMs
 * are never read in this module, and reset() is a no-op, despite the module
 * header describing a "timeout path" — confirm whether the timeout is
 * enforced by the caller or was dropped.
 *
 * @param config - Endpointing thresholds and feature flags
 * @returns A configured Endpointer
 */
export function createEndpointer(config: EndpointingConfig): Endpointer {
  // Anthropic() reads its API key from the environment by default.
  const anthropicClient = config.enableHaikuFallback ? new Anthropic() : null;

  return {
    onVadEvent(event: VadEvent, currentTranscript: string): Promise<EndpointDecision> {
      return handleVadEvent(event, currentTranscript, config, anthropicClient);
    },

    reset(): void {
      // No internal state to reset -- completion is evaluated per SPEECH_END event.
    },
  };
}
71
+
72
+ // ============================================================================
73
+ // MAIN LOGIC
74
+ // ============================================================================
75
+
76
/**
 * Handle a single VAD event and produce an endpoint decision.
 *
 * Decision tree:
 * - SPEECH_START / SPEECH_CONTINUE: user still talking, not complete
 * - SPEECH_END with >= minWordCountForFastPath words: complete (fast path)
 * - SPEECH_END with fewer words: Haiku semantic check when enabled,
 *   otherwise treated as complete
 * - any other event type: not complete
 *
 * NOTE(review): config.silenceThresholdMs and config.maxSilenceBeforeTimeoutMs
 * are unused here — confirm the header's "timeout path" lives elsewhere.
 *
 * @param event - The VAD event to process
 * @param transcript - Current accumulated transcript
 * @param config - Endpointing configuration
 * @param client - Anthropic client for Haiku calls (null if disabled)
 * @returns The endpoint decision
 */
async function handleVadEvent(
  event: VadEvent,
  transcript: string,
  config: EndpointingConfig,
  client: Anthropic | null,
): Promise<EndpointDecision> {
  // Active speech -- not complete
  if (event.type === "SPEECH_START" || event.type === "SPEECH_CONTINUE") {
    return { isComplete: false, transcript, method: "vad_fast" };
  }

  // Speech ended -- evaluate completion immediately.
  // avr-vad's SPEECH_END fires after internal debouncing (redemptionFrames),
  // so silence has already been confirmed by the VAD. No need to wait for
  // separate SILENCE events (avr-vad doesn't emit them).
  if (event.type === "SPEECH_END") {
    const wordCount = countWords(transcript);

    // Fast path: sufficient words, complete immediately
    if (wordCount >= config.minWordCountForFastPath) {
      return { isComplete: true, transcript, method: "vad_fast" };
    }

    // Short utterance: ask Haiku for semantic turn-completion check
    // (adds network latency -- the module header estimates ~200ms)
    if (config.enableHaikuFallback && client !== null) {
      const isComplete = await checkTurnCompletionWithHaiku(client, transcript);
      return { isComplete, transcript, method: "haiku_semantic" };
    }

    // Haiku disabled, treat as complete
    return { isComplete: true, transcript, method: "vad_fast" };
  }

  // Unknown event type -- not complete
  return { isComplete: false, transcript, method: "vad_fast" };
}
120
+
121
+ // ============================================================================
122
+ // HELPER FUNCTIONS
123
+ // ============================================================================
124
+
125
+ /**
126
+ * Count the number of words in a transcript string.
127
+ * @param text - The transcript text
128
+ * @returns Number of whitespace-separated words
129
+ */
130
+ function countWords(text: string): number {
131
+ const trimmed = text.trim();
132
+ if (trimmed.length === 0) {
133
+ return 0;
134
+ }
135
+ return trimmed.split(/\s+/).length;
136
+ }
137
+
138
+ /**
139
+ * Call Haiku to determine if a short transcript represents a complete user turn.
140
+ * @param client - The Anthropic SDK client
141
+ * @param transcript - The short transcript to evaluate
142
+ * @returns True if Haiku considers the turn complete
143
+ */
144
+ async function checkTurnCompletionWithHaiku(client: Anthropic, transcript: string): Promise<boolean> {
145
+ const response = await client.messages.create({
146
+ model: HAIKU_MODEL,
147
+ max_tokens: HAIKU_MAX_TOKENS,
148
+ messages: [
149
+ {
150
+ role: "user",
151
+ content: `Is this a complete user turn? Answer only "yes" or "no".\n\nTranscript: "${transcript}"`,
152
+ },
153
+ ],
154
+ });
155
+
156
+ const firstBlock = response.content[0];
157
+ if (firstBlock.type !== "text") {
158
+ throw new Error(`Unexpected Haiku response block type: ${firstBlock.type}`);
159
+ }
160
+
161
+ const answer = firstBlock.text.trim().toLowerCase();
162
+ return answer.startsWith("yes");
163
+ }
@@ -0,0 +1,83 @@
1
+ /**
2
+ * Entry point for the Claude Code voice sidecar.
3
+ *
4
+ * Thin wrapper that creates a local audio adapter and voice session.
5
+ * All voice loop logic lives in voice-session.ts.
6
+ *
7
+ * Responsibilities:
8
+ * - Load .env configuration via dotenv
9
+ * - Create a local AudioAdapter (VPIO echo cancellation)
10
+ * - Create a voice session with default config
11
+ * - Handle SIGINT/SIGTERM for clean shutdown
12
+ */
13
+
14
+ import "dotenv/config";
15
+
16
+ import { homedir } from "os";
17
+ import { join } from "path";
18
+
19
+ import { createLocalAudioAdapter } from "./local-audio.js";
20
+ import { createVoiceSession } from "./voice-session.js";
21
+
22
+ // ============================================================================
23
+ // CONSTANTS
24
+ // ============================================================================
25
+
26
/** Mic capture sample rate in Hz (must match VAD/STT expectations) */
const MIC_SAMPLE_RATE = 16000;

/** TTS output sample rate in Hz -- must match tts-server.py output format */
const TTS_SAMPLE_RATE = 24000;

/**
 * Default configuration for the voice session.
 *
 * NOTE(review): claudeSession.permissionMode is "bypassPermissions", i.e.
 * tool use runs without permission prompts -- confirm this is the intended
 * default for local sessions.
 */
const DEFAULT_CONFIG = {
  // Spoken phrase that ends the session (see module header: "shutdown via stop phrase")
  stopPhrase: "stop listening",
  // Model assets cached under the user's home directory
  sttModelPath: join(homedir(), ".claude-voice-models", "whisper-small"),
  ttsModel: "prince-canuma/Kokoro-82M",
  ttsVoice: "af_heart",
  modelCacheDir: join(homedir(), ".claude-voice-models"),
  // presumably the minimum speech duration (ms) before barge-in interrupts
  // playback -- confirm against voice-session.ts
  interruptionThresholdMs: 1500,
  endpointing: {
    silenceThresholdMs: 700,
    maxSilenceBeforeTimeoutMs: 1200,
    minWordCountForFastPath: 2,
    // Haiku semantic endpointing off by default (extra latency + API cost)
    enableHaikuFallback: false,
  },
  narration: {
    summaryIntervalMs: 12000,
  },
  claudeSession: {
    allowedTools: [] as string[],
    permissionMode: "bypassPermissions",
    systemPrompt:
      "Respond concisely. You are in voice mode -- your responses will be spoken aloud. Keep answers conversational and brief.",
  },
};
56
+
57
+ // ============================================================================
58
+ // ENTRY POINT
59
+ // ============================================================================
60
+
61
+ /**
62
+ * Main entry point. Creates the local audio adapter and voice session,
63
+ * then waits for shutdown via stop phrase or signal.
64
+ */
65
+ async function main(): Promise<void> {
66
+ const adapter = await createLocalAudioAdapter(MIC_SAMPLE_RATE, TTS_SAMPLE_RATE);
67
+
68
+ const session = await createVoiceSession(adapter, {
69
+ ...DEFAULT_CONFIG,
70
+ onSessionEnd: () => process.exit(0),
71
+ });
72
+
73
+ const signalHandler = () => {
74
+ session.stop().then(() => process.exit(0));
75
+ };
76
+ process.on("SIGINT", signalHandler);
77
+ process.on("SIGTERM", signalHandler);
78
+ }
79
+
80
+ main().catch((err) => {
81
+ console.error(`Voice loop failed: ${err}`);
82
+ process.exit(1);
83
+ });
@@ -0,0 +1,126 @@
1
+ /**
2
+ * Local audio adapter wrapping the VPIO-based audio-capture module.
3
+ *
4
+ * Implements the AudioAdapter interface for the laptop mic path using macOS
5
+ * Voice Processing IO (VPIO) with echo cancellation. Delegates all low-level
6
+ * audio I/O to audio-capture.ts (singleton module).
7
+ *
8
+ * Responsibilities:
9
+ * - Start the VPIO binary via startCapture()
10
+ * - Wire VPIO stdout through bufferToFloat32 to the onAudio callback
11
+ * - Write PCM audio to VPIO stdin with backpressure handling
12
+ * - Send SIGUSR1/SIGUSR2 signals for interrupt/resume
13
+ * - Play the macOS system chime via afplay
14
+ */
15
+
16
+ import { spawn } from "child_process";
17
+
18
+ import { startCapture, stopCapture, interruptPlayback, resumePlayback, bufferToFloat32 } from "./audio-capture.js";
19
+
20
+ import type { Writable } from "stream";
21
+ import type { AudioAdapter } from "./audio-adapter.js";
22
+
23
+ // ============================================================================
24
+ // CONSTANTS
25
+ // ============================================================================
26
+
27
/** macOS system sound played when the agent finishes speaking and starts listening */
// Played via afplay (see playChime below); ships with stock macOS installs.
const READY_CHIME_PATH = "/System/Library/Sounds/Glass.aiff";
29
+
30
+ // ============================================================================
31
+ // MAIN ENTRYPOINT
32
+ // ============================================================================
33
+
34
+ /**
35
+ * Create a local AudioAdapter backed by the VPIO echo-cancelling audio process.
36
+ *
37
+ * Starts the mic-vpio binary, wires stdout through bufferToFloat32 to the
38
+ * onAudio callback, and returns an AudioAdapter.
39
+ *
40
+ * @param micRate - Mic output sample rate in Hz (e.g. 16000 for VAD/STT)
41
+ * @param speakerRate - Speaker input sample rate in Hz (e.g. 24000 for TTS)
42
+ * @returns An AudioAdapter for local mic I/O
43
+ * @throws Error if VPIO binary fails to start
44
+ */
45
+ export async function createLocalAudioAdapter(micRate: number, speakerRate: number): Promise<AudioAdapter> {
46
+ const audioIO = await startCapture(micRate, speakerRate);
47
+ const micStream = audioIO.micStream;
48
+ const speakerInput: Writable = audioIO.speakerInput;
49
+
50
+ let audioCallback: ((samples: Float32Array) => void) | null = null;
51
+
52
+ /**
53
+ * Subscribe to incoming audio chunks from the VPIO mic stream.
54
+ * Converts each Buffer chunk to Float32Array and invokes the callback.
55
+ *
56
+ * @param callback - Called with each audio chunk as Float32Array
57
+ */
58
+ function onAudio(callback: (samples: Float32Array) => void): void {
59
+ audioCallback = callback;
60
+
61
+ micStream.on("data", (chunk: Buffer) => {
62
+ const samples = bufferToFloat32(chunk);
63
+ audioCallback?.(samples);
64
+ });
65
+
66
+ micStream.on("error", (err: Error) => {
67
+ console.error(`Mic stream error: ${err.message}`);
68
+ });
69
+ }
70
+
71
+ /**
72
+ * Write PCM audio to the VPIO speaker stream with backpressure handling.
73
+ *
74
+ * @param pcm - Raw PCM buffer (16-bit signed, 24kHz mono)
75
+ * @returns Resolves when the write completes
76
+ */
77
+ function writeSpeaker(pcm: Buffer): Promise<void> {
78
+ return new Promise<void>((resolve, reject) => {
79
+ const ok = speakerInput.write(pcm, (err: Error | null | undefined) => {
80
+ if (err) reject(err);
81
+ });
82
+ if (ok) {
83
+ resolve();
84
+ } else {
85
+ speakerInput.once("drain", () => resolve());
86
+ }
87
+ });
88
+ }
89
+
90
+ /**
91
+ * Clear the VPIO playback buffer immediately (sends SIGUSR1).
92
+ */
93
+ function interrupt(): void {
94
+ interruptPlayback();
95
+ }
96
+
97
+ /**
98
+ * Resume VPIO stdin processing after an interrupt (sends SIGUSR2).
99
+ */
100
+ function resume(): void {
101
+ resumePlayback();
102
+ }
103
+
104
+ /**
105
+ * Play the macOS system ready chime. Fire-and-forget.
106
+ */
107
+ function playChime(): void {
108
+ spawn("afplay", ["--volume", "6", READY_CHIME_PATH]).on("error", () => {});
109
+ }
110
+
111
+ /**
112
+ * Stop the VPIO process and free all resources.
113
+ */
114
+ function destroy(): void {
115
+ stopCapture();
116
+ }
117
+
118
+ return {
119
+ onAudio,
120
+ writeSpeaker,
121
+ interrupt,
122
+ resume,
123
+ playChime,
124
+ destroy,
125
+ };
126
+ }
Binary file