voicecc 1.0.7

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (59)
  1. package/.claude-plugin/plugin.json +6 -0
  2. package/README.md +48 -0
  3. package/bin/voicecc.js +39 -0
  4. package/dashboard/dist/assets/index-BXemFrMp.css +1 -0
  5. package/dashboard/dist/assets/index-dAYfRls7.js +11 -0
  6. package/dashboard/dist/audio-processor.js +126 -0
  7. package/dashboard/dist/index.html +13 -0
  8. package/dashboard/routes/auth.ts +119 -0
  9. package/dashboard/routes/browser-call.ts +87 -0
  10. package/dashboard/routes/claude-md.ts +50 -0
  11. package/dashboard/routes/conversations.ts +203 -0
  12. package/dashboard/routes/integrations.ts +154 -0
  13. package/dashboard/routes/mcp-servers.ts +198 -0
  14. package/dashboard/routes/settings.ts +64 -0
  15. package/dashboard/routes/tunnel.ts +66 -0
  16. package/dashboard/routes/twilio.ts +120 -0
  17. package/dashboard/routes/voice.ts +48 -0
  18. package/dashboard/routes/webrtc.ts +85 -0
  19. package/dashboard/server.ts +130 -0
  20. package/dashboard/tsconfig.json +13 -0
  21. package/init/CLAUDE.md +18 -0
  22. package/package.json +59 -0
  23. package/run.ts +68 -0
  24. package/scripts/postinstall.js +228 -0
  25. package/services/browser-call-manager.ts +106 -0
  26. package/services/device-pairing.ts +176 -0
  27. package/services/env.ts +88 -0
  28. package/services/tunnel.ts +204 -0
  29. package/services/twilio-manager.ts +126 -0
  30. package/sidecar/assets/startup.pcm +0 -0
  31. package/sidecar/audio-adapter.ts +60 -0
  32. package/sidecar/audio-capture.ts +220 -0
  33. package/sidecar/browser-audio-playback.test.ts +149 -0
  34. package/sidecar/browser-audio.ts +147 -0
  35. package/sidecar/browser-server.ts +331 -0
  36. package/sidecar/chime.test.ts +69 -0
  37. package/sidecar/chime.ts +54 -0
  38. package/sidecar/claude-session.ts +295 -0
  39. package/sidecar/endpointing.ts +163 -0
  40. package/sidecar/index.ts +83 -0
  41. package/sidecar/local-audio.ts +126 -0
  42. package/sidecar/mic-vpio +0 -0
  43. package/sidecar/mic-vpio.swift +484 -0
  44. package/sidecar/mock-tts-server-tagged.mjs +132 -0
  45. package/sidecar/narration.ts +204 -0
  46. package/sidecar/scripts/generate-startup-audio.py +79 -0
  47. package/sidecar/session-lock.ts +123 -0
  48. package/sidecar/sherpa-onnx-node.d.ts +4 -0
  49. package/sidecar/stt.ts +199 -0
  50. package/sidecar/tts-server.py +193 -0
  51. package/sidecar/tts.ts +481 -0
  52. package/sidecar/twilio-audio.ts +338 -0
  53. package/sidecar/twilio-server.ts +436 -0
  54. package/sidecar/types.ts +210 -0
  55. package/sidecar/vad.ts +101 -0
  56. package/sidecar/voice-loop-bugs.test.ts +522 -0
  57. package/sidecar/voice-session.ts +523 -0
  58. package/skills/voice/SKILL.md +26 -0
  59. package/tsconfig.json +22 -0
@@ -0,0 +1,436 @@
1
+ /**
2
+ * HTTP + WebSocket server that accepts Twilio phone calls and creates a
3
+ * voice session per call.
4
+ *
5
+ * Standalone entry point for the Twilio voice call path. Runs as a separate
6
+ * process from the local mic path (index.ts).
7
+ *
8
+ * Responsibilities:
9
+ * - Start HTTP server on TWILIO_PORT for Twilio webhooks
10
+ * - Validate incoming call webhooks via Twilio signature verification
11
+ * - Generate per-call UUID tokens for secure WebSocket upgrade
12
+ * - Accept Twilio media stream WebSocket connections
13
+ * - Create a TwilioAudioAdapter + VoiceSession per call
14
+ * - Enforce global session limit via session locks
15
+ * - Tear down sessions on hangup, stop phrase, or error
16
+ */
17
+
18
+ import "dotenv/config";
19
+
20
+ import { randomUUID } from "crypto";
21
+ import { createServer, request as httpRequest } from "http";
22
+ import { homedir } from "os";
23
+ import { join } from "path";
24
+
25
+ import twilio from "twilio";
26
+ import { WebSocketServer } from "ws";
27
+
28
+ import { createTwilioAudioAdapter } from "./twilio-audio.js";
29
+ import { createVoiceSession } from "./voice-session.js";
30
+
31
+ import type { IncomingMessage, ServerResponse } from "http";
32
+ import type { Duplex } from "stream";
33
+ import type { WebSocket } from "ws";
34
+ import type { VoiceSession } from "./voice-session.js";
35
+
36
+ // ============================================================================
37
+ // CONSTANTS
38
+ // ============================================================================
39
+
40
+ /** Default port for the Twilio HTTP/WebSocket server */
41
+ const DEFAULT_PORT = 8080;
42
+
43
+ /** Interruption threshold for phone calls (higher than local mic due to no VPIO echo cancellation) */
44
+ const PHONE_INTERRUPTION_THRESHOLD_MS = 2000;
45
+
46
+ /** Default voice session config for phone calls. Spread into the options passed to createVoiceSession for every call, with interruptionThresholdMs set to PHONE_INTERRUPTION_THRESHOLD_MS. The original note says it mirrors index.ts DEFAULT_CONFIG apart from that threshold (index.ts is not visible here -- confirm). NOTE(review): permissionMode is "bypassPermissions" and allowedTools is empty; confirm this is intended for sessions reachable over a phone line. */
47
+ const DEFAULT_CONFIG = {
48
+ stopPhrase: "stop listening",
49
+ sttModelPath: join(homedir(), ".claude-voice-models", "whisper-small"),
50
+ ttsModel: "prince-canuma/Kokoro-82M",
51
+ ttsVoice: "af_heart",
52
+ modelCacheDir: join(homedir(), ".claude-voice-models"),
53
+ interruptionThresholdMs: PHONE_INTERRUPTION_THRESHOLD_MS,
54
+ endpointing: {
55
+ silenceThresholdMs: 700,
56
+ maxSilenceBeforeTimeoutMs: 1200,
57
+ minWordCountForFastPath: 2,
58
+ enableHaikuFallback: false,
59
+ },
60
+ narration: {
61
+ summaryIntervalMs: 12000,
62
+ },
63
+ claudeSession: {
64
+ allowedTools: [] as string[],
65
+ permissionMode: "bypassPermissions",
66
+ systemPrompt:
67
+ "Respond concisely. You are in voice mode -- your responses will be spoken aloud. Keep answers conversational and brief.",
68
+ },
69
+ };
70
+
71
+ // ============================================================================
72
+ // TYPES
73
+ // ============================================================================
74
+
75
+ /** Tracks one phone call in the activeCalls map. The entry is created when the incoming-call webhook is accepted and deleted by the call-session cleanup that runs when the media stream WebSocket closes. */
76
+ interface ActiveCall {
77
+ /** Twilio call SID; stored as an empty string by the webhook handler and filled in when the WebSocket start event arrives */
78
+ callSid: string;
79
+ /** Voice session handle; null until the WebSocket start event has successfully created a session */
80
+ session: VoiceSession | null;
81
+ }
82
+
83
+ // ============================================================================
84
+ // STATE
85
+ // ============================================================================
86
+
87
+ /** Active calls keyed by per-call UUID token */
88
+ const activeCalls = new Map<string, ActiveCall>();
89
+
90
+ // ============================================================================
91
+ // MAIN ENTRYPOINT
92
+ // ============================================================================
93
+
94
/**
 * Start the Twilio HTTP + WebSocket server.
 *
 * Reads TWILIO_AUTH_TOKEN, TWILIO_WEBHOOK_URL, TWILIO_PORT and DASHBOARD_PORT
 * from process.env. Creates an HTTP server that handles
 * POST /twilio/incoming-call itself, forwards every other request to the
 * dashboard when DASHBOARD_PORT is a usable number, and answers 404 otherwise.
 * WebSocket upgrades are routed to handleWebSocketUpgrade.
 *
 * @returns Resolves when the server is listening; rejects if the port cannot
 *   be bound (e.g. EADDRINUSE)
 * @throws Error if TWILIO_AUTH_TOKEN or TWILIO_WEBHOOK_URL are not set
 * @throws TypeError if TWILIO_WEBHOOK_URL is not a parseable absolute URL
 */
async function startTwilioServer(): Promise<void> {
  const authToken = process.env.TWILIO_AUTH_TOKEN;
  const webhookUrl = process.env.TWILIO_WEBHOOK_URL;
  // An unset, non-numeric or zero TWILIO_PORT falls back to the default.
  const port = parseInt(process.env.TWILIO_PORT ?? "", 10) || DEFAULT_PORT;

  if (!authToken) {
    throw new Error("TWILIO_AUTH_TOKEN is required in .env");
  }
  if (!webhookUrl) {
    throw new Error("TWILIO_WEBHOOK_URL is required in .env");
  }

  // Extract the host from the webhook URL for TwiML WebSocket URLs
  const webhookHost = new URL(webhookUrl).host;

  // NaN (unset or invalid) is falsy, which disables the dashboard proxy below.
  const dashboardPort = parseInt(process.env.DASHBOARD_PORT ?? "", 10);

  // Create HTTP server
  const server = createServer((req, res) => {
    if (req.method === "POST" && req.url === "/twilio/incoming-call") {
      handleIncomingCall(req, res, authToken, webhookUrl, webhookHost);
      return;
    }

    // Proxy all other requests to the dashboard server
    if (dashboardPort) {
      proxyToDashboard(req, res, dashboardPort);
      return;
    }

    res.writeHead(404, { "Content-Type": "text/plain" });
    res.end("Not Found");
  });

  // Create WebSocket server (no automatic HTTP handling -- upgrades only)
  const wss = new WebSocketServer({ noServer: true });

  // Handle WebSocket upgrade requests
  server.on("upgrade", (req: IncomingMessage, socket: Duplex, head: Buffer) => {
    handleWebSocketUpgrade(req, socket, head, wss);
  });

  // Start listening. A bind failure is delivered as an 'error' event, so it
  // must be wired to reject; otherwise the returned promise never settles and
  // the error surfaces as an unhandled 'error' event instead of reaching the
  // caller.
  return new Promise<void>((resolve, reject) => {
    server.once("error", reject);
    server.listen(port, () => {
      // Startup succeeded: the promise is settled, so drop the startup-only listener.
      server.removeListener("error", reject);
      console.log(`Twilio server listening on port ${port}`);
      console.log(`Webhook URL: ${webhookUrl}`);
      resolve();
    });
  });
}
156
+
157
+ // ============================================================================
158
+ // MAIN HANDLERS
159
+ // ============================================================================
160
+
161
/**
 * Handle an incoming call webhook from Twilio (POST /twilio/incoming-call).
 *
 * Buffers the request body (bounded), validates the Twilio request signature,
 * generates a per-call token, registers it in activeCalls, and responds with
 * TwiML that tells Twilio to connect a media stream WebSocket.
 *
 * Responses: 200 with TwiML on success, 400 if the body cannot be parsed,
 * 403 if the signature is missing or invalid, 413 if the body is too large.
 *
 * NOTE(review): in the code visible here a registered token is only removed
 * when its media stream WebSocket closes; tokens whose stream never connects
 * appear to stay in activeCalls -- confirm whether an expiry is needed.
 *
 * @param req - HTTP request from Twilio
 * @param res - HTTP response to send TwiML back
 * @param authToken - Twilio auth token for signature validation
 * @param webhookUrl - Public webhook URL for signature validation
 * @param webhookHost - Host portion of the webhook URL for WebSocket URLs
 */
function handleIncomingCall(
  req: IncomingMessage,
  res: ServerResponse,
  authToken: string,
  webhookUrl: string,
  webhookHost: string,
): void {
  // The endpoint is reachable before authentication, so cap what we buffer.
  const MAX_BODY_BYTES = 64 * 1024;

  const chunks: Buffer[] = [];
  let receivedBytes = 0;
  let rejected = false;

  /** Send a plain-text error response once and mark the request as finished. */
  const reject = (status: number, message: string): void => {
    rejected = true;
    res.writeHead(status, { "Content-Type": "text/plain" });
    res.end(message);
  };

  // Collect raw chunks; decoding each chunk separately would corrupt a
  // multi-byte UTF-8 character that straddles a chunk boundary.
  req.on("data", (chunk: Buffer) => {
    if (rejected) return; // keep draining, but stop buffering

    receivedBytes += chunk.length;
    if (receivedBytes > MAX_BODY_BYTES) {
      console.log("Rejected incoming call: request body too large");
      chunks.length = 0;
      reject(413, "Payload Too Large");
      return;
    }
    chunks.push(chunk);
  });

  req.on("error", (err) => {
    console.error(`Error reading incoming call request: ${err}`);
  });

  req.on("end", () => {
    if (rejected) return;

    // Parse URL-encoded POST body into key-value params. A parser failure
    // must not escape this event handler, where it would be uncaught.
    let params: Record<string, string>;
    try {
      params = parseUrlEncodedBody(Buffer.concat(chunks).toString("utf-8"));
    } catch (err) {
      console.log(`Rejected incoming call: malformed request body (${err})`);
      reject(400, "Bad Request");
      return;
    }

    // Validate Twilio signature (use full URL -- Twilio signs against the complete endpoint URL)
    const validationUrl = webhookUrl.replace(/\/$/, "") + req.url;
    const signature = req.headers["x-twilio-signature"];
    if (
      typeof signature !== "string" ||
      signature === "" ||
      !twilio.validateRequest(authToken, signature, validationUrl, params)
    ) {
      console.log("Rejected incoming call: invalid Twilio signature");
      reject(403, "Forbidden");
      return;
    }

    // Generate per-call token and register in active calls
    const token = randomUUID();
    activeCalls.set(token, { callSid: "", session: null });

    console.log(`Incoming call accepted, token: ${token}`);

    // Respond with TwiML to connect a media stream
    const twiml = [
      '<?xml version="1.0" encoding="UTF-8"?>',
      "<Response>",
      "  <Connect>",
      `    <Stream url="wss://${webhookHost}/media/${token}" />`,
      "  </Connect>",
      "</Response>",
    ].join("\n");

    res.writeHead(200, { "Content-Type": "text/xml" });
    res.end(twiml);
  });
}
220
+
221
/**
 * Decide whether an HTTP upgrade request may become a Twilio media stream.
 *
 * The request path must be exactly /media/<token>, where the token consists
 * of lowercase hex digits and dashes and is currently a key in activeCalls.
 * Anything else is logged and the socket is destroyed without a response.
 * Accepted requests are upgraded, announced on the WebSocketServer as a
 * "connection" event, and handed to handleCallSession.
 *
 * NOTE(review): a token stays in activeCalls while its call is live, so a
 * second upgrade with the same token would also be accepted -- confirm
 * whether that needs guarding.
 *
 * @param req - HTTP upgrade request
 * @param socket - Underlying TCP socket
 * @param head - First packet of the upgraded stream
 * @param wss - WebSocketServer instance to accept the upgrade
 */
function handleWebSocketUpgrade(
  req: IncomingMessage,
  socket: Duplex,
  head: Buffer,
  wss: WebSocketServer,
): void {
  /** Log why the upgrade was refused and drop the connection. */
  const refuse = (reason: string): void => {
    console.log(`Rejected WebSocket upgrade: ${reason}`);
    socket.destroy();
  };

  // Expected path shape: /media/:token
  const requestPath = req.url ?? "";
  const token = /^\/media\/([a-f0-9-]+)$/.exec(requestPath)?.[1];

  if (token === undefined) {
    refuse(`invalid path ${requestPath}`);
    return;
  }

  if (!activeCalls.has(token)) {
    refuse(`unknown token ${token}`);
    return;
  }

  // Token is known: complete the handshake and start the call session.
  wss.handleUpgrade(req, socket, head, (ws: WebSocket) => {
    wss.emit("connection", ws, req);
    handleCallSession(ws, token);
  });
}
262
+
263
/**
 * Check that a parsed message carries the fields the "start" handler reads:
 * a `start` object with string `streamSid` and `callSid`.
 */
function isStreamStartMessage(
  msg: object,
): msg is { start: { streamSid: string; callSid: string } } {
  const start = (msg as { start?: unknown }).start;
  if (typeof start !== "object" || start === null) return false;
  const { streamSid, callSid } = start as { streamSid?: unknown; callSid?: unknown };
  return typeof streamSid === "string" && typeof callSid === "string";
}

/**
 * Handle a connected Twilio media stream WebSocket session.
 *
 * Listens for Twilio WebSocket events and manages the voice session
 * lifecycle. A "start" event is delegated to handleStreamStart; a "stop"
 * event closes the WebSocket; every other event is ignored here. When the
 * WebSocket closes, the voice session (if any) is stopped and the call is
 * removed from activeCalls. Frames that are not valid JSON objects are
 * logged and ignored rather than allowed to throw out of the event handler.
 *
 * @param ws - Connected WebSocket for the Twilio media stream
 * @param token - Per-call UUID token identifying this call
 */
function handleCallSession(ws: WebSocket, token: string): void {
  let cleaned = false;

  /**
   * Clean up the call session. Stops the voice session and removes the call
   * from the activeCalls map. Uses the cleaned flag to prevent double-cleanup.
   */
  async function cleanup(): Promise<void> {
    if (cleaned) return;
    cleaned = true;

    const call = activeCalls.get(token);
    try {
      if (call?.session) {
        await call.session.stop();
      }
    } finally {
      // Remove the entry even when stop() rejects, so a failed teardown
      // cannot leave the token registered forever.
      activeCalls.delete(token);
    }
    console.log(`Call session cleaned up, token: ${token}`);
  }

  // WebSocket close handler -- always runs cleanup regardless of cause
  ws.on("close", () => {
    cleanup().catch((err) => {
      console.error(`Error during call cleanup: ${err}`);
    });
  });

  ws.on("error", (err) => {
    console.error(`WebSocket error for token ${token}: ${err}`);
  });

  // Listen for Twilio media stream events
  ws.on("message", (data: Buffer | string) => {
    let msg: unknown;
    try {
      msg = JSON.parse(typeof data === "string" ? data : data.toString("utf-8"));
    } catch (err) {
      console.error(`Ignoring malformed WebSocket message for token ${token}: ${err}`);
      return;
    }

    if (typeof msg !== "object" || msg === null) return;
    const event = (msg as { event?: unknown }).event;

    if (event === "start") {
      if (!isStreamStartMessage(msg)) {
        // Without streamSid/callSid no session can be created for this stream.
        console.error(`Invalid stream start message for token ${token}`);
        ws.close();
        return;
      }
      handleStreamStart(ws, token, msg).catch((err) => {
        console.error(`Error handling stream start: ${err}`);
      });
      return;
    }

    if (event === "stop") {
      console.log(`Twilio stream stopped for token: ${token}`);
      ws.close();
      return;
    }

    // Other events (e.g. "connected", "media") need no action in this
    // handler; the original note says "media" is consumed by
    // TwilioAudioAdapter's own listener.
  });
}
327
+
328
+ // ============================================================================
329
+ // HELPER FUNCTIONS
330
+ // ============================================================================
331
+
332
/**
 * Handle the Twilio "start" event on a media stream WebSocket.
 *
 * Records the callSid on the active call, then creates a TwilioAudioAdapter
 * and a VoiceSession configured from DEFAULT_CONFIG. If session creation
 * fails, the error is logged and the WebSocket is closed. If the call was
 * torn down while the session was being created, the new session is stopped
 * immediately instead of being attached to a dead call.
 *
 * @param ws - Connected WebSocket for the Twilio media stream
 * @param token - Per-call UUID token
 * @param msg - Parsed Twilio "start" event message
 */
async function handleStreamStart(
  ws: WebSocket,
  token: string,
  msg: { start: { streamSid: string; callSid: string } },
): Promise<void> {
  const { streamSid, callSid } = msg.start;
  console.log(`Stream started -- callSid: ${callSid}, streamSid: ${streamSid}`);

  // Update the active call entry with the callSid
  const call = activeCalls.get(token);
  if (!call) return;

  // A repeated "start" must not replace a live session: the old one would
  // never be stopped.
  if (call.session) {
    console.log(`Ignoring duplicate stream start for call ${callSid}`);
    return;
  }
  call.callSid = callSid;

  let session: VoiceSession;
  try {
    // Create the Twilio audio adapter
    const adapter = createTwilioAudioAdapter({ ws, streamSid });

    // Create the voice session (the original note says this acquires a
    // session lock and may throw if the limit is reached)
    session = await createVoiceSession(adapter, {
      ...DEFAULT_CONFIG,
      onSessionEnd: () => ws.close(),
    });
  } catch (err) {
    console.error(`Failed to create voice session for call ${callSid}: ${err}`);

    // No rejection message can be sent over the media stream WebSocket, so
    // just close it. Per the original note, the caller hears silence and
    // Twilio eventually disconnects.
    ws.close();
    return;
  }

  // The socket may have closed during the await above. The close handler
  // found no session to stop at that time, so stop this one here.
  if (activeCalls.get(token) !== call || ws.readyState !== ws.OPEN) {
    await session.stop();
    return;
  }

  call.session = session;
}
376
+
377
/**
 * Parse an application/x-www-form-urlencoded POST body into a key-value record.
 *
 * Decoding follows the form-urlencoded rules: "+" becomes a space,
 * percent-escapes are decoded, and malformed escapes are left as-is instead
 * of throwing. Pairs with an empty key are skipped. When a key repeats, the
 * last value wins.
 *
 * @param body - URL-encoded string (e.g. "key1=value1&key2=value2")
 * @returns Record of decoded key-value pairs (empty for an empty body)
 */
function parseUrlEncodedBody(body: string): Record<string, string> {
  const params: Record<string, string> = {};

  if (!body) return params;

  // URLSearchParams implements the form-urlencoded parser; a hand-rolled
  // decodeURIComponent loop leaves "+" undecoded (which changes the values
  // used for signature validation) and throws on input such as "a=%".
  new URLSearchParams(body).forEach((value, key) => {
    if (key) {
      params[key] = value;
    }
  });

  return params;
}
397
+
398
/**
 * Proxy an HTTP request to the dashboard server on localhost.
 * Forwards the request method, path, headers, and body, and streams the
 * dashboard's status, headers, and body back to the client.
 *
 * If the dashboard cannot be reached before any response has been started,
 * the client receives 502 "Dashboard unavailable". If the upstream fails
 * after the response has started, the client connection is dropped, because
 * a status line can no longer be sent.
 *
 * @param req - Original incoming request
 * @param res - Response to write the proxied result to
 * @param dashboardPort - Port the dashboard server is listening on
 */
function proxyToDashboard(req: IncomingMessage, res: ServerResponse, dashboardPort: number): void {
  const proxyReq = httpRequest(
    {
      hostname: "127.0.0.1",
      port: dashboardPort,
      path: req.url,
      method: req.method,
      headers: req.headers,
    },
    (proxyRes) => {
      // pipe() does not forward errors; without a listener a broken upstream
      // response would raise an unhandled 'error' event.
      proxyRes.on("error", () => {
        res.destroy();
      });

      res.writeHead(proxyRes.statusCode ?? 502, proxyRes.headers);
      proxyRes.pipe(res);
    },
  );

  proxyReq.on("error", () => {
    // Calling writeHead() after headers have been sent throws, so only
    // answer 502 when nothing has been written yet.
    if (res.headersSent) {
      res.destroy();
      return;
    }
    res.writeHead(502, { "Content-Type": "text/plain" });
    res.end("Dashboard unavailable");
  });

  req.pipe(proxyReq);
}
428
+
429
+ // ============================================================================
430
+ // ENTRY POINT
431
+ // ============================================================================
432
+
433
/** Log a startup failure and terminate the process with exit code 1. */
const exitOnStartupFailure = (reason: unknown): never => {
  console.error(`Twilio server failed: ${reason}`);
  process.exit(1);
};

// Launch the server as soon as this module is loaded.
startTwilioServer().catch(exitOnStartupFailure);
@@ -0,0 +1,210 @@
1
+ /**
2
+ * Shared types for the Claude Code voice sidecar.
3
+ *
4
+ * Defines all DTOs and interfaces used across the voice pipeline modules:
5
+ * - Voice loop configuration and state
6
+ * - Audio frame representation
7
+ * - VAD (voice activity detection) events
8
+ * - STT (speech-to-text) results
9
+ * - Endpointing decisions for turn detection
10
+ * - Claude session streaming events
11
+ * - TTS (text-to-speech) configuration
12
+ * - Narration configuration
13
+ */
14
+
15
+ // ============================================================================
16
+ // CONFIGURATION INTERFACES
17
+ // ============================================================================
18
+
19
+ /**
20
+ * Top-level configuration for the voice loop: model locations, TTS voice, per-module settings, and the stop phrase.
21
+ * Passed to `startVoiceLoop` to initialize all modules (per the original note; that function is not visible in this excerpt). NOTE(review): twilio-server.ts's DEFAULT_CONFIG also sets interruptionThresholdMs, which is not declared here -- confirm which config type createVoiceSession accepts.
22
+ */
23
+ export interface VoiceLoopConfig {
24
+ /** Path to the sherpa-onnx Whisper ONNX model directory */
25
+ sttModelPath: string;
26
+ /** mlx-audio model ID for TTS (e.g. "prince-canuma/Kokoro-82M") */
27
+ ttsModel: string;
28
+ /** TTS voice ID (e.g. "af_heart" for Kokoro) */
29
+ ttsVoice: string;
30
+ /** Directory for cached model files */
31
+ modelCacheDir: string;
32
+ /** Endpointing configuration for turn detection */
33
+ endpointing: EndpointingConfig;
34
+ /** Narration configuration for Claude response processing */
35
+ narration: NarrationConfig;
36
+ /** Claude Agent SDK session configuration */
37
+ claudeSession: ClaudeSessionConfig;
38
+ /** Phrase that stops the voice loop when spoken */
39
+ stopPhrase: string;
40
+ }
41
+
42
+ /**
43
+ * Configuration for the endpointing module.
44
+ * Controls how the system decides when the user is done speaking.
45
+ */
46
+ export interface EndpointingConfig {
47
+ /** Silence duration (ms) before considering speech complete */
48
+ silenceThresholdMs: number;
49
+ /** Maximum silence duration (ms) before forcing completion regardless */
50
+ maxSilenceBeforeTimeoutMs: number;
51
+ /** Minimum word count for the VAD fast path (skips Haiku check) */
52
+ minWordCountForFastPath: number;
53
+ /** Whether to use Haiku API for ambiguous short utterances */
54
+ enableHaikuFallback: boolean;
55
+ }
56
+
57
+ /**
58
+ * Configuration for the Claude Agent SDK session.
59
+ */
60
+ export interface ClaudeSessionConfig {
61
+ /** List of allowed tool names (empty array means all tools allowed) */
62
+ allowedTools: string[];
63
+ /** Permission mode -- must be "bypassPermissions" for voice loop */
64
+ permissionMode: string;
65
+ /** System prompt instructing Claude to respond concisely for voice output */
66
+ systemPrompt: string;
67
+ }
68
+
69
+ /**
70
+ * Configuration for the narration module.
71
+ * Controls how Claude's streaming output is processed into speakable text.
72
+ */
73
+ export interface NarrationConfig {
74
+ /** Interval (ms) between "still working..." summaries during long tool runs */
75
+ summaryIntervalMs: number;
76
+ }
77
+
78
+ /**
79
+ * Configuration for the TTS (text-to-speech) module.
80
+ */
81
+ export interface TtsConfig {
82
+ /** mlx-audio model ID (e.g. "prince-canuma/Kokoro-82M") */
83
+ model: string;
84
+ /** Voice ID (e.g. "af_heart" for Kokoro) */
85
+ voice: string;
86
+ /** Writable stream for PCM audio output (VPIO process stdin) */
87
+ speakerInput: import("stream").Writable;
88
+ /** Callback to clear the VPIO playback buffer on interruption */
89
+ interruptPlayback: () => void;
90
+ /** Callback to resume VPIO stdin processing after an interrupt */
91
+ resumePlayback: () => void;
92
+ /** Override server command for testing (default: Python TTS server) */
93
+ serverCommand?: string[];
94
+ }
95
+
96
+ // ============================================================================
97
+ // AUDIO TYPES
98
+ // ============================================================================
99
+
100
+ /**
101
+ * A single frame of audio data from the microphone.
102
+ */
103
+ export interface AudioFrame {
104
+ /** PCM audio samples normalized to -1.0 to 1.0 range */
105
+ pcm: Float32Array;
106
+ /** Sample rate in Hz */
107
+ sampleRate: number;
108
+ /** Timestamp in milliseconds when this frame was captured */
109
+ timestamp: number;
110
+ }
111
+
112
+ // ============================================================================
113
+ // VAD TYPES
114
+ // ============================================================================
115
+
116
+ /** Possible VAD event types indicating speech activity state */
117
+ export type VadEventType = "SPEECH_START" | "SPEECH_CONTINUE" | "SPEECH_END" | "SILENCE";
118
+
119
+ /**
120
+ * Event emitted by the VAD processor after analyzing an audio frame.
121
+ */
122
+ export interface VadEvent {
123
+ /** The detected speech activity state */
124
+ type: VadEventType;
125
+ /** Speech probability from the VAD model (0.0 to 1.0) */
126
+ probability: number;
127
+ /** Timestamp in milliseconds */
128
+ timestamp: number;
129
+ }
130
+
131
+ // ============================================================================
132
+ // STT TYPES
133
+ // ============================================================================
134
+
135
+ /**
136
+ * Result from the speech-to-text transcription.
137
+ */
138
+ export interface TranscriptionResult {
139
+ /** The transcribed text */
140
+ text: string;
141
+ /** Whether this is a final transcription (always true for batch/offline mode) */
142
+ isFinal: boolean;
143
+ /** Timestamp in milliseconds when transcription completed */
144
+ timestamp: number;
145
+ }
146
+
147
+ // ============================================================================
148
+ // ENDPOINTING TYPES
149
+ // ============================================================================
150
+
151
+ /** Method used to determine that the user finished speaking */
152
+ export type EndpointMethod = "vad_fast" | "haiku_semantic" | "timeout";
153
+
154
+ /**
155
+ * Decision from the endpointing module on whether the user has finished speaking.
156
+ */
157
+ export interface EndpointDecision {
158
+ /** Whether the user's turn is considered complete */
159
+ isComplete: boolean;
160
+ /** The current accumulated transcript */
161
+ transcript: string;
162
+ /** Which method was used to make the decision */
163
+ method: EndpointMethod;
164
+ }
165
+
166
+ // ============================================================================
167
+ // CLAUDE SESSION TYPES
168
+ // ============================================================================
169
+
170
+ /** Possible event types from the Claude streaming response */
171
+ export type ClaudeStreamEventType = "text_delta" | "tool_start" | "tool_end" | "result" | "error";
172
+
173
+ /**
174
+ * Simplified streaming event from the Claude Agent SDK session.
175
+ * Mapped from the raw SDKMessage types for downstream consumption.
176
+ */
177
+ export interface ClaudeStreamEvent {
178
+ /** The type of streaming event */
179
+ type: ClaudeStreamEventType;
180
+ /** Text content (for text_delta events) or error message (for error events) */
181
+ content: string;
182
+ /** Tool name (only present for tool_start events) */
183
+ toolName?: string;
184
+ }
185
+
186
+ // ============================================================================
187
+ // TTS TEXT CHUNK TYPES
188
+ // ============================================================================
189
+
190
+ /** A text chunk for TTS. Plain string = streaming fragment (buffer it).
191
+ * Object with flush = complete sentence (speak immediately). */
192
+ export type TextChunk = string | { text: string; flush: true };
193
+
194
+ // ============================================================================
195
+ // VOICE LOOP STATE
196
+ // ============================================================================
197
+
198
+ /** Possible states of the voice loop state machine */
199
+ export type VoiceLoopStatus = "idle" | "listening" | "processing" | "speaking";
200
+
201
+ /**
202
+ * Current state of the voice loop.
203
+ * Used by the state machine in index.ts.
204
+ */
205
+ export interface VoiceLoopState {
206
+ /** Current state of the voice loop */
207
+ status: VoiceLoopStatus;
208
+ /** Active Claude session ID, or null if no session is active */
209
+ sessionId: string | null;
210
+ }