voicecc 1.0.7

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (59)
  1. package/.claude-plugin/plugin.json +6 -0
  2. package/README.md +48 -0
  3. package/bin/voicecc.js +39 -0
  4. package/dashboard/dist/assets/index-BXemFrMp.css +1 -0
  5. package/dashboard/dist/assets/index-dAYfRls7.js +11 -0
  6. package/dashboard/dist/audio-processor.js +126 -0
  7. package/dashboard/dist/index.html +13 -0
  8. package/dashboard/routes/auth.ts +119 -0
  9. package/dashboard/routes/browser-call.ts +87 -0
  10. package/dashboard/routes/claude-md.ts +50 -0
  11. package/dashboard/routes/conversations.ts +203 -0
  12. package/dashboard/routes/integrations.ts +154 -0
  13. package/dashboard/routes/mcp-servers.ts +198 -0
  14. package/dashboard/routes/settings.ts +64 -0
  15. package/dashboard/routes/tunnel.ts +66 -0
  16. package/dashboard/routes/twilio.ts +120 -0
  17. package/dashboard/routes/voice.ts +48 -0
  18. package/dashboard/routes/webrtc.ts +85 -0
  19. package/dashboard/server.ts +130 -0
  20. package/dashboard/tsconfig.json +13 -0
  21. package/init/CLAUDE.md +18 -0
  22. package/package.json +59 -0
  23. package/run.ts +68 -0
  24. package/scripts/postinstall.js +228 -0
  25. package/services/browser-call-manager.ts +106 -0
  26. package/services/device-pairing.ts +176 -0
  27. package/services/env.ts +88 -0
  28. package/services/tunnel.ts +204 -0
  29. package/services/twilio-manager.ts +126 -0
  30. package/sidecar/assets/startup.pcm +0 -0
  31. package/sidecar/audio-adapter.ts +60 -0
  32. package/sidecar/audio-capture.ts +220 -0
  33. package/sidecar/browser-audio-playback.test.ts +149 -0
  34. package/sidecar/browser-audio.ts +147 -0
  35. package/sidecar/browser-server.ts +331 -0
  36. package/sidecar/chime.test.ts +69 -0
  37. package/sidecar/chime.ts +54 -0
  38. package/sidecar/claude-session.ts +295 -0
  39. package/sidecar/endpointing.ts +163 -0
  40. package/sidecar/index.ts +83 -0
  41. package/sidecar/local-audio.ts +126 -0
  42. package/sidecar/mic-vpio +0 -0
  43. package/sidecar/mic-vpio.swift +484 -0
  44. package/sidecar/mock-tts-server-tagged.mjs +132 -0
  45. package/sidecar/narration.ts +204 -0
  46. package/sidecar/scripts/generate-startup-audio.py +79 -0
  47. package/sidecar/session-lock.ts +123 -0
  48. package/sidecar/sherpa-onnx-node.d.ts +4 -0
  49. package/sidecar/stt.ts +199 -0
  50. package/sidecar/tts-server.py +193 -0
  51. package/sidecar/tts.ts +481 -0
  52. package/sidecar/twilio-audio.ts +338 -0
  53. package/sidecar/twilio-server.ts +436 -0
  54. package/sidecar/types.ts +210 -0
  55. package/sidecar/vad.ts +101 -0
  56. package/sidecar/voice-loop-bugs.test.ts +522 -0
  57. package/sidecar/voice-session.ts +523 -0
  58. package/skills/voice/SKILL.md +26 -0
  59. package/tsconfig.json +22 -0
@@ -0,0 +1,331 @@
1
+ /**
2
+ * Standalone HTTP + WebSocket server for browser audio sessions.
3
+ *
4
+ * Runs on TWILIO_PORT (default 8080) -- same port as twilio-server.ts.
5
+ * Only one of browser-server / twilio-server runs at a time.
6
+ * Entry point for the browser call sidecar process.
7
+ *
8
+ * Responsibilities:
9
+ * - Start HTTP server on TWILIO_PORT for browser audio connections
10
+ * - Accept WebSocket upgrades on /audio?token=<deviceToken>
11
+ * - Validate device tokens via isValidDeviceToken() (localhost bypasses validation)
12
+ * - Reject duplicate connections for the same device token
13
+ * - Create BrowserAudioAdapter + VoiceSession per connection
14
+ * - Proxy non-audio HTTP requests to the dashboard server
15
+ * - Send periodic ws.ping() to keep connections alive through tunnel
16
+ */
17
+
18
+ import "dotenv/config";
19
+
20
+ import { createServer, request as httpRequest } from "http";
21
+ import { homedir } from "os";
22
+ import { join } from "path";
23
+
24
+ import { WebSocketServer } from "ws";
25
+
26
+ import { createBrowserAudioAdapter } from "./browser-audio.js";
27
+ import { createVoiceSession } from "./voice-session.js";
28
+ import { isValidDeviceToken } from "../services/device-pairing.js";
29
+
30
+ import type { IncomingMessage, ServerResponse } from "http";
31
+ import type { Duplex } from "stream";
32
+ import type { WebSocket } from "ws";
33
+ import type { VoiceSession } from "./voice-session.js";
34
+
35
+ // ============================================================================
36
+ // CONSTANTS
37
+ // ============================================================================
38
+
39
/** Default port for the browser audio server (same as Twilio) */
const DEFAULT_PORT = 8080;

/**
 * Interruption threshold for browser calls, in milliseconds.
 * Lower than Twilio's 2000ms because browser getUserMedia includes AEC.
 */
const BROWSER_INTERRUPTION_THRESHOLD_MS = 1500;

/** Milliseconds between ws.ping() calls that keep connections alive through the tunnel */
const PING_INTERVAL_MS = 30_000;

/** Directory under the user's home where voice models are cached */
const MODEL_CACHE_DIR = join(homedir(), ".claude-voice-models");

/** Default voice session config for browser calls */
const DEFAULT_CONFIG = {
  stopPhrase: "stop listening",
  sttModelPath: join(MODEL_CACHE_DIR, "whisper-small"),
  ttsModel: "prince-canuma/Kokoro-82M",
  ttsVoice: "af_heart",
  modelCacheDir: MODEL_CACHE_DIR,
  interruptionThresholdMs: BROWSER_INTERRUPTION_THRESHOLD_MS,
  endpointing: {
    silenceThresholdMs: 700,
    maxSilenceBeforeTimeoutMs: 1200,
    minWordCountForFastPath: 2,
    enableHaikuFallback: false,
  },
  narration: {
    summaryIntervalMs: 12000,
  },
  claudeSession: {
    allowedTools: [] as string[],
    permissionMode: "bypassPermissions",
    systemPrompt:
      "Respond concisely. You are in voice mode -- your responses will be spoken aloud. Keep answers conversational and brief.",
  },
};
72
+
73
+ // ============================================================================
74
+ // TYPES
75
+ // ============================================================================
76
+
77
/** Bookkeeping record for one connected browser audio client. */
interface ActiveBrowserSession {
  /** Key the session is registered under in activeSessions (the device token passed to handleBrowserSession) */
  deviceToken: string;
  /** Voice session handle; stays null until createSession() has populated it */
  session: VoiceSession | null;
}
84
+
85
+ // ============================================================================
86
+ // STATE
87
+ // ============================================================================
88
+
89
/** Registry of live browser sessions, looked up by device token */
const activeSessions: Map<string, ActiveBrowserSession> = new Map();
91
+
92
+ // ============================================================================
93
+ // MAIN ENTRYPOINT
94
+ // ============================================================================
95
+
96
/**
 * Start the browser audio HTTP + WebSocket server.
 *
 * Reads TWILIO_PORT (default 8080) and DASHBOARD_PORT from environment.
 * Creates an HTTP server that proxies non-audio requests to the dashboard.
 * WebSocket upgrade on /audio?token=<token> with device token validation.
 * Sends periodic ws.ping() every 30s to keep connections alive through tunnel.
 *
 * @returns Resolves when the server is listening
 * @throws Error if DASHBOARD_PORT is not set; the returned promise also rejects
 *         if the server cannot start listening (for example the port is in use)
 */
async function startBrowserServer(): Promise<void> {
  // An unset or unparsable TWILIO_PORT yields NaN, which falls back to the default.
  const port = parseInt(process.env.TWILIO_PORT ?? "", 10) || DEFAULT_PORT;
  const dashboardPort = parseInt(process.env.DASHBOARD_PORT ?? "", 10);

  if (!dashboardPort) {
    throw new Error("DASHBOARD_PORT is required");
  }

  // Every plain HTTP request is forwarded to the dashboard server.
  const server = createServer((req, res) => {
    proxyToDashboard(req, res, dashboardPort);
  });

  // noServer: the WebSocket server only handles upgrades we hand to it explicitly.
  const wss = new WebSocketServer({ noServer: true });

  server.on("upgrade", (req: IncomingMessage, socket: Duplex, head: Buffer) => {
    handleWebSocketUpgrade(req, socket, head, wss);
  });

  // Periodic ping to keep connections alive through tunnel.
  const pingTimer = setInterval(() => {
    for (const client of wss.clients) {
      if (client.readyState === client.OPEN) {
        client.ping();
      }
    }
  }, PING_INTERVAL_MS);

  // Do not keep pinging (or keep the event loop alive) once the server is closed.
  server.on("close", () => clearInterval(pingTimer));

  return new Promise<void>((resolve, reject) => {
    // Without this listener a bind failure (e.g. EADDRINUSE) is emitted as an
    // unhandled 'error' event and never reaches the caller's .catch().
    const onStartupError = (err: Error): void => {
      clearInterval(pingTimer);
      reject(err);
    };
    server.once("error", onStartupError);

    server.listen(port, () => {
      server.off("error", onStartupError);
      console.log(`Browser audio server listening on port ${port}`);
      resolve();
    });
  });
}
146
+
147
+ // ============================================================================
148
+ // MAIN HANDLERS
149
+ // ============================================================================
150
+
151
/**
 * Handle a WebSocket upgrade request for browser audio.
 *
 * Validates that the path is /audio, extracts the device token from the query
 * string, checks authorization (localhost or valid device token), and rejects
 * duplicate connections for the same session key. Every rejection destroys the
 * socket without completing the handshake.
 *
 * @param req - HTTP upgrade request
 * @param socket - Underlying TCP socket
 * @param head - First packet of the upgraded stream
 * @param wss - WebSocketServer instance to accept the upgrade
 */
function handleWebSocketUpgrade(
  req: IncomingMessage,
  socket: Duplex,
  head: Buffer,
  wss: WebSocketServer,
): void {
  // Only the path and query are needed, so a fixed base is used instead of the
  // client-supplied Host header. The URL constructor still throws on a malformed
  // request target (e.g. "//"); an unauthenticated client must not be able to
  // crash the process with it.
  let url: URL;
  try {
    url = new URL(req.url ?? "", "http://localhost");
  } catch {
    console.log("Rejected WebSocket upgrade: malformed request URL");
    socket.destroy();
    return;
  }

  // Validate path
  if (url.pathname !== "/audio") {
    console.log(`Rejected WebSocket upgrade: invalid path ${url.pathname}`);
    socket.destroy();
    return;
  }

  // Extract device token from query string
  const token = url.searchParams.get("token") ?? "";

  // Check authorization: localhost bypasses token validation.
  // NOTE(review): when traffic arrives through a tunnel client running on this
  // machine, remoteAddress may itself be a loopback address -- confirm the
  // bypass cannot be reached by remote callers.
  const remoteAddr = req.socket.remoteAddress ?? "";
  const isLocalhost =
    remoteAddr === "127.0.0.1" ||
    remoteAddr === "::1" ||
    remoteAddr === "::ffff:127.0.0.1";

  if (!isLocalhost) {
    if (!token) {
      console.log("Rejected WebSocket upgrade: missing device token");
      socket.destroy();
      return;
    }

    if (!isValidDeviceToken(token)) {
      console.log("Rejected WebSocket upgrade: invalid device token");
      socket.destroy();
      return;
    }
  }

  // Token-less localhost connections are all registered under "localhost", so the
  // duplicate check must use the same key the session will be stored under;
  // otherwise a second connection overwrites the first one's registry entry.
  const sessionKey = token || "localhost";
  if (activeSessions.has(sessionKey)) {
    // The token is a credential, so it is deliberately left out of the log line.
    console.log(
      `Rejected WebSocket upgrade: duplicate ${token ? "device token" : "localhost session"}`,
    );
    socket.destroy();
    return;
  }

  // Accept the WebSocket connection
  wss.handleUpgrade(req, socket, head, (ws: WebSocket) => {
    wss.emit("connection", ws, req);
    handleBrowserSession(ws, sessionKey);
  });
}
213
+
214
/**
 * Handle a connected browser audio WebSocket session.
 *
 * Registers the connection in activeSessions, wires close/error handlers, and
 * starts creating the BrowserAudioAdapter + VoiceSession. The registry entry is
 * removed on disconnect, and a voice session that finishes initialising after
 * the socket has already closed is stopped instead of being leaked.
 *
 * @param ws - Connected WebSocket for browser audio
 * @param deviceToken - Device token identifying this connection
 */
function handleBrowserSession(ws: WebSocket, deviceToken: string): void {
  let cleaned = false;

  // The device token is a credential; only a short prefix is written to logs.
  const label = deviceToken === "localhost" ? deviceToken : `${deviceToken.slice(0, 6)}...`;

  // Register in active sessions
  const entry: ActiveBrowserSession = { deviceToken, session: null };
  activeSessions.set(deviceToken, entry);

  console.log(`Browser session connected, token: ${label}`);

  /**
   * Clean up the browser session. Stops the voice session and removes this
   * connection's entry from activeSessions. Uses the cleaned flag to prevent
   * double-cleanup.
   */
  async function cleanup(): Promise<void> {
    if (cleaned) return;
    cleaned = true;

    try {
      if (entry.session) {
        await entry.session.stop();
      }
    } finally {
      // Always release the key, even if stop() threw; otherwise every later
      // reconnect with this token is rejected as a duplicate. Only remove the
      // entry if it is still ours.
      if (activeSessions.get(deviceToken) === entry) {
        activeSessions.delete(deviceToken);
      }
      console.log(`Browser session cleaned up, token: ${label}`);
    }
  }

  // WebSocket close handler
  ws.on("close", () => {
    cleanup().catch((err) => {
      console.error(`Error during browser session cleanup: ${err}`);
    });
  });

  ws.on("error", (err) => {
    console.error(`WebSocket error for token ${label}: ${err}`);
  });

  // Create adapter and voice session in the background.
  void (async () => {
    try {
      await createSession(ws, entry);
    } catch (err) {
      console.error(`Failed to create voice session for token ${label}: ${err}`);
      ws.close();
      return;
    }

    // The socket may have closed while the session was still being created. In
    // that case cleanup() already ran with session === null, so the freshly
    // created session has to be stopped here.
    if (cleaned && entry.session) {
      try {
        await entry.session.stop();
      } catch (err) {
        console.error(`Error during browser session cleanup: ${err}`);
      }
    }
  })();
}
266
+
267
+ // ============================================================================
268
+ // HELPER FUNCTIONS
269
+ // ============================================================================
270
+
271
/**
 * Build the audio adapter and voice session for a connected WebSocket and
 * store the resulting session on the registry entry.
 *
 * The session is created from DEFAULT_CONFIG plus an onSessionEnd callback
 * that closes the WebSocket.
 *
 * @param ws - Connected WebSocket for browser audio
 * @param entry - Active session entry to populate with the voice session
 */
async function createSession(ws: WebSocket, entry: ActiveBrowserSession): Promise<void> {
  const browserAdapter = createBrowserAudioAdapter({ ws });

  const sessionOptions = {
    ...DEFAULT_CONFIG,
    onSessionEnd: () => ws.close(),
  };

  entry.session = await createVoiceSession(browserAdapter, sessionOptions);
}
287
+
288
/**
 * Proxy an HTTP request to the dashboard server on localhost.
 * Forwards the request method, path, headers, and body, and streams the
 * dashboard's status, headers, and body back to the caller.
 *
 * If the dashboard cannot be reached before a response has started, the caller
 * receives "502 Dashboard unavailable". If the upstream fails after the response
 * has started, the client connection is torn down instead, because a second
 * status line cannot be written.
 *
 * @param req - Original incoming request
 * @param res - Response to write the proxied result to
 * @param dashboardPort - Port the dashboard server is listening on
 */
function proxyToDashboard(req: IncomingMessage, res: ServerResponse, dashboardPort: number): void {
  const clientIp = req.headers["x-forwarded-for"] || req.socket.remoteAddress || "unknown";
  console.log(`[proxy] ${req.method} ${req.url} from ${clientIp}`);

  const proxyReq = httpRequest(
    {
      hostname: "127.0.0.1",
      port: dashboardPort,
      path: req.url,
      method: req.method,
      headers: req.headers,
    },
    (proxyRes) => {
      console.log(`[proxy] ${req.url} -> ${proxyRes.statusCode}`);
      res.writeHead(proxyRes.statusCode ?? 502, proxyRes.headers);

      // pipe() does not propagate source errors; without this a truncated
      // upstream response would leave the client response open indefinitely.
      proxyRes.on("error", (err) => {
        console.error(`[proxy] ${req.url} upstream response error:`, err.message);
        res.destroy();
      });

      proxyRes.pipe(res);
    },
  );

  proxyReq.on("error", (err) => {
    console.error(`[proxy] ${req.url} proxy error:`, err.message);

    // writeHead() throws once headers have been sent, so a late upstream
    // failure can only be signalled by dropping the connection.
    if (res.headersSent) {
      res.destroy();
      return;
    }

    res.writeHead(502, { "Content-Type": "text/plain" });
    res.end("Dashboard unavailable");
  });

  req.pipe(proxyReq);
}
323
+
324
+ // ============================================================================
325
+ // ENTRY POINT
326
+ // ============================================================================
327
+
328
/** Log a startup failure and terminate the sidecar process with exit code 1. */
function exitOnStartupFailure(reason: unknown): never {
  console.error(`Browser audio server failed: ${reason}`);
  process.exit(1);
}

// Launch the server as soon as this module is executed.
startBrowserServer().catch(exitOnStartupFailure);
@@ -0,0 +1,69 @@
1
+ /**
2
+ * Tests that decodeChimeToPcm produces clean audio without artifacts.
3
+ *
4
+ * The chime PCM is sent directly to the browser as raw int16 samples.
5
+ * If the buffer contains file-format headers, they get played as loud
6
+ * garbage ("bop") before the actual chime.
7
+ *
8
+ * Run: npx tsx --test sidecar/chime.test.ts
9
+ */
10
+
11
+ import { test } from "node:test";
12
+ import { strict as assert } from "node:assert";
13
+
14
+ import { decodeChimeToPcm } from "./chime.js";
15
+
16
+ // ============================================================================
17
+ // CONSTANTS
18
+ // ============================================================================
19
+
20
/** Sample rate (Hz) the tests assume for the decoded chime PCM */
const CHIME_RATE = 24_000;

/**
 * Upper bound on the absolute sample value allowed in the first 10ms.
 * Glass.aiff fades in from silence, so early samples should be near-zero;
 * anything louder means non-audio data (e.g. file headers) is present.
 */
const MAX_AMPLITUDE_FIRST_10MS = 500;
29
+
30
+ // ============================================================================
31
+ // TESTS
32
+ // ============================================================================
33
+
34
/**
 * The chime starts quietly -- the Glass.aiff sound fades in from silence.
 * If the first samples contain large values, the buffer has non-audio data
 * (file-format headers) that would be heard as a loud pop/bop.
 *
 * Samples are read with readInt16LE rather than through an Int16Array view:
 * the view throws when the buffer's byte offset or length is odd and uses the
 * host's byte order, whereas the PCM is explicitly little-endian.
 */
test("chime PCM starts with near-silent samples (no file header artifacts)", () => {
  const pcm = decodeChimeToPcm();

  const samplesIn10ms = Math.floor(CHIME_RATE * 0.01);
  const availableSamples = Math.floor(pcm.byteLength / 2);

  // Fail with a clear message instead of comparing against NaN when the
  // decoded buffer is shorter than the window being inspected.
  assert.ok(
    availableSamples >= samplesIn10ms,
    `Chime has only ${availableSamples} samples; need at least ${samplesIn10ms} to inspect the first 10ms.`
  );

  let maxAmplitude = 0;
  for (let i = 0; i < samplesIn10ms; i++) {
    maxAmplitude = Math.max(maxAmplitude, Math.abs(pcm.readInt16LE(i * 2)));
  }

  assert.ok(
    maxAmplitude < MAX_AMPLITUDE_FIRST_10MS,
    `First 10ms of chime has amplitude ${maxAmplitude} (limit: ${MAX_AMPLITUDE_FIRST_10MS}). ` +
      `This likely means file-format header bytes are being included as audio data.`
  );
});
56
+
57
/**
 * Sanity-check the length of the decoded chime: it should be roughly 1-2
 * seconds of audio. A much longer buffer suggests header data was included;
 * a much shorter one suggests the decode failed.
 */
test("chime PCM has a plausible duration for Glass.aiff", () => {
  const BYTES_PER_SAMPLE = 2; // int16
  const pcm = decodeChimeToPcm();
  const durationSec = pcm.byteLength / BYTES_PER_SAMPLE / CHIME_RATE;

  const longEnough = durationSec > 0.5;
  assert.ok(longEnough, `Chime too short: ${durationSec.toFixed(2)}s`);

  const shortEnough = durationSec < 3.0;
  assert.ok(shortEnough, `Chime too long: ${durationSec.toFixed(2)}s -- may contain header data`);
});
@@ -0,0 +1,54 @@
1
+ /**
2
+ * Shared utility for decoding the macOS ready chime to raw PCM.
3
+ *
4
+ * Extracted from twilio-audio.ts so the chime decoding logic is reused across
5
+ * audio adapters (Twilio, browser) without duplication.
6
+ *
7
+ * Responsibilities:
8
+ * - Decode macOS Glass.aiff to raw 24kHz int16 mono PCM via afconvert
9
+ * - Use a PID-scoped temp file to avoid race conditions across processes
10
+ */
11
+
12
+ import { execSync } from "child_process";
13
+ import { readFileSync, unlinkSync } from "fs";
14
+
15
+ // ============================================================================
16
+ // CONSTANTS
17
+ // ============================================================================
18
+
19
/** Absolute path of the macOS system sound played as the ready chime */
export const READY_CHIME_PATH = "/System/Library/Sounds/Glass.aiff";

/** Where afconvert writes its output; the PID suffix keeps concurrent processes apart */
export const CHIME_TEMP_PATH = "/tmp/chime-24k-" + String(process.pid) + ".raw";
24
+
25
+ // ============================================================================
26
+ // MAIN ENTRYPOINT
27
+ // ============================================================================
28
+
29
/** CAF file header: 'caff' (4B) + version (2B) + flags (2B) */
const CAF_FILE_HEADER_SIZE = 8;

/** Every CAF chunk header: type (4B) + size (8B, big-endian signed) */
const CAF_CHUNK_HEADER_SIZE = 12;

/** CAF data chunk: 'data' (4B) + size (8B) + editCount (4B) = 16 bytes before PCM */
const CAF_DATA_CHUNK_HEADER_SIZE = 16;

/**
 * Decode the macOS Glass.aiff system sound to raw 24kHz int16 PCM.
 * Uses afconvert (macOS built-in) to convert to CAF format, then strips
 * the CAF container to extract the raw PCM payload. The temp file is removed
 * whether or not decoding succeeds.
 *
 * @returns Buffer containing raw 24kHz int16 mono PCM
 * @throws Error if afconvert fails, temp file cannot be read, or CAF has no data chunk
 */
export function decodeChimeToPcm(): Buffer {
  try {
    execSync(`afconvert -f caff -d LEI16@24000 -c 1 ${READY_CHIME_PATH} ${CHIME_TEMP_PATH}`);
    return extractCafPcm(readFileSync(CHIME_TEMP_PATH));
  } finally {
    // Best-effort cleanup: the file may not exist if afconvert failed, and a
    // cleanup failure must not mask the real error or a successful decode.
    try {
      unlinkSync(CHIME_TEMP_PATH);
    } catch {
      // nothing to clean up
    }
  }
}

/**
 * Return the sample bytes of the 'data' chunk of a CAF file.
 *
 * Walks the chunk list instead of searching for the bytes "data", so a match
 * inside an earlier chunk cannot be mistaken for the audio, and honours the
 * chunk's declared size so anything following the audio is excluded.
 */
function extractCafPcm(caf: Buffer): Buffer {
  if (caf.length < CAF_FILE_HEADER_SIZE || caf.toString("latin1", 0, 4) !== "caff") {
    throw new Error("afconvert output is not a CAF file");
  }

  let chunkStart = CAF_FILE_HEADER_SIZE;
  while (chunkStart + CAF_CHUNK_HEADER_SIZE <= caf.length) {
    const chunkType = caf.toString("latin1", chunkStart, chunkStart + 4);
    // 64-bit signed big-endian size assembled from two 32-bit halves.
    const chunkSize =
      caf.readInt32BE(chunkStart + 4) * 0x100000000 + caf.readUInt32BE(chunkStart + 8);
    const bodyStart = chunkStart + CAF_CHUNK_HEADER_SIZE;

    if (chunkType === "data") {
      const pcmStart = chunkStart + CAF_DATA_CHUNK_HEADER_SIZE;
      // A size of -1 means the data chunk runs to the end of the file.
      const bodyEnd = chunkSize < 0 ? caf.length : Math.min(caf.length, bodyStart + chunkSize);
      return caf.subarray(pcmStart, Math.max(pcmStart, bodyEnd));
    }

    if (chunkSize < 0) break; // only the data chunk may have an unknown size
    chunkStart = bodyStart + chunkSize;
  }

  throw new Error("CAF file missing 'data' chunk");
}