npm - voicecc - Versions diffs - 1.0.7 - Mend

voicecc 1.0.7

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Files changed (59) hide show

package/.claude-plugin/plugin.json +6 -0
package/README.md +48 -0
package/bin/voicecc.js +39 -0
package/dashboard/dist/assets/index-BXemFrMp.css +1 -0
package/dashboard/dist/assets/index-dAYfRls7.js +11 -0
package/dashboard/dist/audio-processor.js +126 -0
package/dashboard/dist/index.html +13 -0
package/dashboard/routes/auth.ts +119 -0
package/dashboard/routes/browser-call.ts +87 -0
package/dashboard/routes/claude-md.ts +50 -0
package/dashboard/routes/conversations.ts +203 -0
package/dashboard/routes/integrations.ts +154 -0
package/dashboard/routes/mcp-servers.ts +198 -0
package/dashboard/routes/settings.ts +64 -0
package/dashboard/routes/tunnel.ts +66 -0
package/dashboard/routes/twilio.ts +120 -0
package/dashboard/routes/voice.ts +48 -0
package/dashboard/routes/webrtc.ts +85 -0
package/dashboard/server.ts +130 -0
package/dashboard/tsconfig.json +13 -0
package/init/CLAUDE.md +18 -0
package/package.json +59 -0
package/run.ts +68 -0
package/scripts/postinstall.js +228 -0
package/services/browser-call-manager.ts +106 -0
package/services/device-pairing.ts +176 -0
package/services/env.ts +88 -0
package/services/tunnel.ts +204 -0
package/services/twilio-manager.ts +126 -0
package/sidecar/assets/startup.pcm +0 -0
package/sidecar/audio-adapter.ts +60 -0
package/sidecar/audio-capture.ts +220 -0
package/sidecar/browser-audio-playback.test.ts +149 -0
package/sidecar/browser-audio.ts +147 -0
package/sidecar/browser-server.ts +331 -0
package/sidecar/chime.test.ts +69 -0
package/sidecar/chime.ts +54 -0
package/sidecar/claude-session.ts +295 -0
package/sidecar/endpointing.ts +163 -0
package/sidecar/index.ts +83 -0
package/sidecar/local-audio.ts +126 -0
package/sidecar/mic-vpio +0 -0
package/sidecar/mic-vpio.swift +484 -0
package/sidecar/mock-tts-server-tagged.mjs +132 -0
package/sidecar/narration.ts +204 -0
package/sidecar/scripts/generate-startup-audio.py +79 -0
package/sidecar/session-lock.ts +123 -0
package/sidecar/sherpa-onnx-node.d.ts +4 -0
package/sidecar/stt.ts +199 -0
package/sidecar/tts-server.py +193 -0
package/sidecar/tts.ts +481 -0
package/sidecar/twilio-audio.ts +338 -0
package/sidecar/twilio-server.ts +436 -0
package/sidecar/types.ts +210 -0
package/sidecar/vad.ts +101 -0
package/sidecar/voice-loop-bugs.test.ts +522 -0
package/sidecar/voice-session.ts +523 -0
package/skills/voice/SKILL.md +26 -0
package/tsconfig.json +22 -0

package/sidecar/browser-server.ts ADDED Viewed

@@ -0,0 +1,331 @@
+/**
+ * Standalone HTTP + WebSocket server for browser audio sessions.
+ *
+ * Runs on TWILIO_PORT (default 8080) -- same port as twilio-server.ts.
+ * Only one of browser-server / twilio-server runs at a time.
+ * Entry point for the browser call sidecar process.
+ *
+ * Responsibilities:
+ * - Start HTTP server on TWILIO_PORT for browser audio connections
+ * - Accept WebSocket upgrades on /audio?token=<deviceToken>
+ * - Validate device tokens via isValidDeviceToken() (localhost bypasses validation)
+ * - Reject duplicate connections for the same device token
+ * - Create BrowserAudioAdapter + VoiceSession per connection
+ * - Proxy non-audio HTTP requests to the dashboard server
+ * - Send periodic ws.ping() to keep connections alive through tunnel
+ */
+import "dotenv/config";
+import { createServer, request as httpRequest } from "http";
+import { homedir } from "os";
+import { join } from "path";
+import { WebSocketServer } from "ws";
+import { createBrowserAudioAdapter } from "./browser-audio.js";
+import { createVoiceSession } from "./voice-session.js";
+import { isValidDeviceToken } from "../services/device-pairing.js";
+import type { IncomingMessage, ServerResponse } from "http";
+import type { Duplex } from "stream";
+import type { WebSocket } from "ws";
+import type { VoiceSession } from "./voice-session.js";
+// ============================================================================
+// CONSTANTS
+// ============================================================================
+/** Port used when TWILIO_PORT is unset (the Twilio server uses the same one) */
+const DEFAULT_PORT = 8080;
+/** How long (ms) before a browser call counts as interrupted; Twilio uses 2000ms, browsers go lower because getUserMedia includes AEC */
+const BROWSER_INTERRUPTION_THRESHOLD_MS = 1_500;
+/** Milliseconds between WebSocket pings that keep tunnelled connections open */
+const PING_INTERVAL_MS = 30_000;
+/** Directory under the user's home that holds the voice models */
+const MODEL_CACHE_DIR = join(homedir(), ".claude-voice-models");
+/** Voice session settings applied to every browser call */
+const DEFAULT_CONFIG = {
+  stopPhrase: "stop listening",
+  sttModelPath: join(MODEL_CACHE_DIR, "whisper-small"),
+  ttsModel: "prince-canuma/Kokoro-82M",
+  ttsVoice: "af_heart",
+  modelCacheDir: MODEL_CACHE_DIR,
+  interruptionThresholdMs: BROWSER_INTERRUPTION_THRESHOLD_MS,
+  endpointing: {
+    silenceThresholdMs: 700,
+    maxSilenceBeforeTimeoutMs: 1_200,
+    minWordCountForFastPath: 2,
+    enableHaikuFallback: false,
+  },
+  narration: {
+    summaryIntervalMs: 12_000,
+  },
+  claudeSession: {
+    allowedTools: [] as string[],
+    permissionMode: "bypassPermissions",
+    systemPrompt:
+      "Respond concisely. You are in voice mode -- your responses will be spoken aloud. Keep answers conversational and brief.",
+  },
+};
+// ============================================================================
+// TYPES
+// ============================================================================
+/** Bookkeeping record for one connected browser audio client */
+interface ActiveBrowserSession {
+  /** Voice session handle; stays null until the session has been created */
+  session: VoiceSession | null;
+  /** Device token this connection is registered under */
+  deviceToken: string;
+}
+// ============================================================================
+// STATE
+// ============================================================================
+/** Registry of live browser sessions, looked up by device token */
+const activeSessions: Map<string, ActiveBrowserSession> = new Map();
+// ============================================================================
+// MAIN ENTRYPOINT
+// ============================================================================
+/**
+ * Start the browser audio HTTP + WebSocket server.
+ *
+ * Reads TWILIO_PORT (falling back to DEFAULT_PORT when unset or unparsable)
+ * and DASHBOARD_PORT from the environment. Plain HTTP requests are proxied to
+ * the dashboard; upgrade requests are handed to handleWebSocketUpgrade().
+ * Every PING_INTERVAL_MS each open WebSocket is pinged to keep it alive
+ * through the tunnel.
+ *
+ * @returns Resolves when the server is listening; rejects if the port cannot be bound
+ * @throws Error if DASHBOARD_PORT is not set
+ */
+async function startBrowserServer(): Promise<void> {
+  const port = parseInt(process.env.TWILIO_PORT ?? "", 10) || DEFAULT_PORT;
+  const dashboardPort = parseInt(process.env.DASHBOARD_PORT ?? "", 10);
+  // NaN (unset / non-numeric) and 0 are both rejected here
+  if (!dashboardPort) {
+    throw new Error("DASHBOARD_PORT is required");
+  }
+  // Create HTTP server: every plain HTTP request is proxied to the dashboard
+  const server = createServer((req, res) => {
+    proxyToDashboard(req, res, dashboardPort);
+  });
+  // Create WebSocket server (no automatic HTTP handling -- upgrades only)
+  const wss = new WebSocketServer({ noServer: true });
+  // Handle WebSocket upgrade requests
+  server.on("upgrade", (req: IncomingMessage, socket: Duplex, head: Buffer) => {
+    try {
+      handleWebSocketUpgrade(req, socket, head, wss);
+    } catch (err) {
+      // An exception escaping this listener would take the whole server down,
+      // so a bad upgrade request only costs its own socket.
+      console.error(`WebSocket upgrade failed: ${err}`);
+      socket.destroy();
+    }
+  });
+  // Periodic ping to keep connections alive through tunnel
+  const pingTimer = setInterval(() => {
+    wss.clients.forEach((ws) => {
+      if (ws.readyState === ws.OPEN) {
+        ws.ping();
+      }
+    });
+  }, PING_INTERVAL_MS);
+  // Stop pinging once the HTTP server is closed so the timer does not outlive it
+  server.on("close", () => clearInterval(pingTimer));
+  // Start listening
+  return new Promise<void>((resolve, reject) => {
+    // Bind failures (e.g. EADDRINUSE) arrive as an "error" event, not as a
+    // throw, so they must be routed to the promise explicitly.
+    const onStartupError = (err: Error): void => {
+      clearInterval(pingTimer);
+      reject(err);
+    };
+    server.once("error", onStartupError);
+    server.listen(port, () => {
+      // Startup succeeded; later server errors must not be swallowed by a settled promise
+      server.removeListener("error", onStartupError);
+      console.log(`Browser audio server listening on port ${port}`);
+      resolve();
+    });
+  });
+}
+// ============================================================================
+// MAIN HANDLERS
+// ============================================================================
+/**
+ * Handle a WebSocket upgrade request for browser audio.
+ *
+ * Validates that the path is /audio, extracts the device token from the query
+ * string, checks authorization (direct localhost connection or valid device
+ * token), and rejects a connection whose session key is already active.
+ * Rejected requests have their socket destroyed without an HTTP response.
+ *
+ * @param req - HTTP upgrade request
+ * @param socket - Underlying TCP socket
+ * @param head - First packet of the upgraded stream
+ * @param wss - WebSocketServer instance to accept the upgrade
+ */
+function handleWebSocketUpgrade(
+  req: IncomingMessage,
+  socket: Duplex,
+  head: Buffer,
+  wss: WebSocketServer,
+): void {
+  // Only the path and query are needed, so parse against a fixed base instead
+  // of the client-controlled Host header (a malformed Host makes URL throw).
+  let url: URL;
+  try {
+    url = new URL(req.url ?? "", "http://localhost");
+  } catch {
+    console.log("Rejected WebSocket upgrade: malformed request URL");
+    socket.destroy();
+    return;
+  }
+  // Validate path
+  if (url.pathname !== "/audio") {
+    console.log(`Rejected WebSocket upgrade: invalid path ${url.pathname}`);
+    socket.destroy();
+    return;
+  }
+  // Extract device token from query string
+  const token = url.searchParams.get("token") ?? "";
+  // Check authorization: a direct localhost connection bypasses token validation.
+  // Traffic relayed by a local tunnel/proxy also arrives from the loopback
+  // address, so a request carrying forwarding headers is treated as remote.
+  // NOTE(review): assumes the tunnel in use adds one of these headers -- confirm.
+  const remoteAddr = req.socket.remoteAddress ?? "";
+  const isForwarded =
+    req.headers["x-forwarded-for"] !== undefined ||
+    req.headers["forwarded"] !== undefined ||
+    req.headers["cf-connecting-ip"] !== undefined;
+  const isLoopback =
+    remoteAddr === "127.0.0.1" ||
+    remoteAddr === "::1" ||
+    remoteAddr === "::ffff:127.0.0.1";
+  const isLocalhost = isLoopback && !isForwarded;
+  if (!isLocalhost && !token) {
+    console.log("Rejected WebSocket upgrade: missing device token");
+    socket.destroy();
+    return;
+  }
+  if (!isLocalhost && !isValidDeviceToken(token)) {
+    console.log("Rejected WebSocket upgrade: invalid device token");
+    socket.destroy();
+    return;
+  }
+  // The session is registered under this key, so the duplicate check must use
+  // the same key; otherwise token-less localhost connections overwrite each other.
+  const sessionKey = token || "localhost";
+  if (activeSessions.has(sessionKey)) {
+    // The token is a credential, so it is deliberately kept out of the log
+    console.log("Rejected WebSocket upgrade: session already active for this device");
+    socket.destroy();
+    return;
+  }
+  // Accept the WebSocket connection
+  wss.handleUpgrade(req, socket, head, (ws: WebSocket) => {
+    wss.emit("connection", ws, req);
+    handleBrowserSession(ws, sessionKey);
+  });
+}
+/**
+ * Handle a connected browser audio WebSocket session.
+ *
+ * Registers the connection in activeSessions, wires close/error handlers, and
+ * starts creation of the adapter + voice session via createSession(). On
+ * disconnect the voice session (if any) is stopped and the registration is
+ * removed, even when stopping fails.
+ *
+ * @param ws - Connected WebSocket for browser audio
+ * @param deviceToken - Device token identifying this connection
+ */
+function handleBrowserSession(ws: WebSocket, deviceToken: string): void {
+  let cleaned = false;
+  // Device tokens are credentials: log at most a short prefix, never the full value
+  const tokenLabel =
+    deviceToken === "localhost"
+      ? deviceToken
+      : `${deviceToken.length > 8 ? deviceToken.slice(0, 4) : ""}***`;
+  // Register in active sessions
+  const entry: ActiveBrowserSession = { deviceToken, session: null };
+  activeSessions.set(deviceToken, entry);
+  console.log(`Browser session connected, token: ${tokenLabel}`);
+  /**
+   * Clean up the browser session. Stops the voice session and removes from
+   * the activeSessions map. Uses cleaned flag to prevent double-cleanup.
+   */
+  async function cleanup(): Promise<void> {
+    if (cleaned) return;
+    cleaned = true;
+    try {
+      if (entry.session) {
+        await entry.session.stop();
+      }
+    } finally {
+      // Always release the registration: a failing stop() must not lock this
+      // token out of reconnecting. Only remove the entry if it is still ours.
+      if (activeSessions.get(deviceToken) === entry) {
+        activeSessions.delete(deviceToken);
+      }
+    }
+    console.log(`Browser session cleaned up, token: ${tokenLabel}`);
+  }
+  // WebSocket close handler
+  ws.on("close", () => {
+    cleanup().catch((err) => {
+      console.error(`Error during browser session cleanup: ${err}`);
+    });
+  });
+  // Errors are only logged here; teardown happens in the close handler
+  ws.on("error", (err) => {
+    console.error(`WebSocket error for token ${tokenLabel}: ${err}`);
+  });
+  // Create adapter and voice session; closing the socket on failure triggers cleanup
+  createSession(ws, entry).catch((err) => {
+    console.error(`Failed to create voice session for token ${tokenLabel}: ${err}`);
+    ws.close();
+  });
+}
+// ============================================================================
+// HELPER FUNCTIONS
+// ============================================================================
+/**
+ * Create the BrowserAudioAdapter and VoiceSession for a connected WebSocket.
+ *
+ * The session is built from DEFAULT_CONFIG and closes the WebSocket when it
+ * ends. If the socket is already closing or closed by the time the session
+ * is ready, the session is stopped immediately instead of being stored.
+ *
+ * @param ws - Connected WebSocket for browser audio
+ * @param entry - Active session entry to populate with the voice session
+ */
+async function createSession(ws: WebSocket, entry: ActiveBrowserSession): Promise<void> {
+  const adapter = createBrowserAudioAdapter({ ws });
+  const session = await createVoiceSession(adapter, {
+    ...DEFAULT_CONFIG,
+    onSessionEnd: () => ws.close(),
+  });
+  // The client may have disconnected while the session was being created. The
+  // close handler may already have run with entry.session still null, so
+  // nobody else will ever stop this session.
+  if (ws.readyState === ws.CLOSING || ws.readyState === ws.CLOSED) {
+    await session.stop();
+    return;
+  }
+  entry.session = session;
+}
+/**
+ * Proxy an HTTP request to the dashboard server on localhost.
+ * Forwards the request method, path, headers, and body, then streams the
+ * dashboard's status, headers, and body back to the client. If the dashboard
+ * cannot be reached the client receives a 502.
+ *
+ * @param req - Original incoming request
+ * @param res - Response to write the proxied result to
+ * @param dashboardPort - Port the dashboard server is listening on
+ */
+function proxyToDashboard(req: IncomingMessage, res: ServerResponse, dashboardPort: number): void {
+  const clientIp = req.headers["x-forwarded-for"] || req.socket.remoteAddress || "unknown";
+  console.log(`[proxy] ${req.method} ${req.url} from ${clientIp}`);
+  const proxyReq = httpRequest(
+    {
+      hostname: "127.0.0.1",
+      port: dashboardPort,
+      path: req.url,
+      method: req.method,
+      headers: req.headers,
+    },
+    (proxyRes) => {
+      console.log(`[proxy] ${req.url} -> ${proxyRes.statusCode}`);
+      res.writeHead(proxyRes.statusCode ?? 502, proxyRes.headers);
+      proxyRes.pipe(res);
+    },
+  );
+  proxyReq.on("error", (err) => {
+    console.error(`[proxy] ${req.url} proxy error:`, err.message);
+    if (res.headersSent) {
+      // The upstream failed after the response had started. writeHead() would
+      // throw ERR_HTTP_HEADERS_SENT, so just abort the half-written response.
+      res.destroy();
+      return;
+    }
+    res.writeHead(502, { "Content-Type": "text/plain" });
+    res.end("Dashboard unavailable");
+  });
+  // Stream the request body to the dashboard; pipe() ends proxyReq when req ends
+  req.pipe(proxyReq);
+}
+// ============================================================================
+// ENTRY POINT
+// ============================================================================
+/** Log a startup failure and terminate the process with exit code 1. */
+function reportStartupFailure(reason: unknown): void {
+  console.error(`Browser audio server failed: ${reason}`);
+  process.exit(1);
+}
+startBrowserServer().catch(reportStartupFailure);

package/sidecar/chime.test.ts ADDED Viewed

@@ -0,0 +1,69 @@
+/**
+ * Tests that decodeChimeToPcm produces clean audio without artifacts.
+ *
+ * The chime PCM is sent directly to the browser as raw int16 samples.
+ * If the buffer contains file-format headers, they get played as loud
+ * garbage ("bop") before the actual chime.
+ *
+ * Run: npx tsx --test sidecar/chime.test.ts
+ */
+import { test } from "node:test";
+import { strict as assert } from "node:assert";
+import { decodeChimeToPcm } from "./chime.js";
+// ============================================================================
+// CONSTANTS
+// ============================================================================
+/** Sample rate (Hz) the chime is decoded to */
+const CHIME_RATE = 24_000;
+/**
+ * Upper bound on sample magnitude within the chime's first 10ms.
+ * Glass.aiff fades in from silence, so those samples should sit close to zero;
+ * anything louder points at non-audio bytes (e.g. file headers) in the buffer.
+ */
+const MAX_AMPLITUDE_FIRST_10MS = 500;
+// ============================================================================
+// TESTS
+// ============================================================================
+/**
+ * The chime starts quietly -- the Glass.aiff sound fades in from silence.
+ * If the first samples contain large values, the buffer has non-audio data
+ * (file-format headers) that would be heard as a loud pop/bop.
+ *
+ * Samples are read with readInt16LE rather than through an Int16Array view:
+ * the view throws when the buffer's byteOffset or length is odd and decodes
+ * with host byte order, while the chime is explicitly little-endian.
+ */
+test("chime PCM starts with near-silent samples (no file header artifacts)", () => {
+  const buf = decodeChimeToPcm();
+  const bytesPerSample = 2; // int16
+  const samplesIn10ms = Math.floor(CHIME_RATE * 0.01);
+  // Fail with a clear message instead of reading past the end of a short buffer
+  assert.ok(
+    buf.byteLength >= samplesIn10ms * bytesPerSample,
+    `Chime has only ${buf.byteLength} bytes; expected at least 10ms of audio.`
+  );
+  let maxAmplitude = 0;
+  for (let i = 0; i < samplesIn10ms; i++) {
+    maxAmplitude = Math.max(maxAmplitude, Math.abs(buf.readInt16LE(i * bytesPerSample)));
+  }
+  assert.ok(
+    maxAmplitude < MAX_AMPLITUDE_FIRST_10MS,
+    `First 10ms of chime has amplitude ${maxAmplitude} (limit: ${MAX_AMPLITUDE_FIRST_10MS}). ` +
+    `This likely means file-format header bytes are being included as audio data.`
+  );
+});
+/**
+ * Sanity-check the decoded length: Glass.aiff should come out at roughly
+ * 1-2 seconds. A much longer buffer suggests a large header was kept as
+ * audio; a much shorter one suggests decoding failed.
+ */
+test("chime PCM has a plausible duration for Glass.aiff", () => {
+  const pcm = decodeChimeToPcm();
+  // Two bytes per int16 sample, CHIME_RATE samples per second
+  const durationSec = pcm.byteLength / 2 / CHIME_RATE;
+  const shortMessage = `Chime too short: ${durationSec.toFixed(2)}s`;
+  const longMessage = `Chime too long: ${durationSec.toFixed(2)}s -- may contain header data`;
+  assert.ok(durationSec > 0.5, shortMessage);
+  assert.ok(durationSec < 3.0, longMessage);
+});

package/sidecar/chime.ts ADDED Viewed

@@ -0,0 +1,54 @@
+/**
+ * Shared utility for decoding the macOS ready chime to raw PCM.
+ *
+ * Extracted from twilio-audio.ts so the chime decoding logic is reused across
+ * audio adapters (Twilio, browser) without duplication.
+ *
+ * Responsibilities:
+ * - Decode macOS Glass.aiff to raw 24kHz int16 mono PCM via afconvert
+ * - Use a PID-scoped temp file to avoid race conditions across processes
+ */
+import { execSync } from "child_process";
+import { readFileSync, unlinkSync } from "fs";
+// ============================================================================
+// CONSTANTS
+// ============================================================================
+/** Path of the macOS system sound played as the ready chime */
+export const READY_CHIME_PATH = "/System/Library/Sounds/Glass.aiff";
+/** Where afconvert writes its output; the PID in the name keeps concurrent processes from colliding */
+export const CHIME_TEMP_PATH = "/tmp/chime-24k-" + process.pid + ".raw";
+// ============================================================================
+// MAIN ENTRYPOINT
+// ============================================================================
+/** CAF file header: 'caff' (4B) + version (2B) + flags (2B) */
+const CAF_FILE_HEADER_SIZE = 8;
+/** CAF chunk header: type (4B) + big-endian signed 64-bit size (8B) */
+const CAF_CHUNK_HEADER_SIZE = 12;
+/** A 'data' chunk body begins with a 4-byte editCount before the PCM samples */
+const CAF_DATA_EDIT_COUNT_SIZE = 4;
+/**
+ * Extract the PCM payload of the 'data' chunk from a CAF file image.
+ *
+ * Walks the chunk list instead of searching for the bytes "data", so a match
+ * inside another chunk's contents cannot be mistaken for the chunk header,
+ * and honours the chunk size so trailing chunks are not returned as audio.
+ *
+ * @param caf - Complete contents of a CAF file
+ * @returns View over the PCM bytes of the 'data' chunk
+ * @throws Error if the buffer is not a CAF file or has no usable data chunk
+ */
+function extractCafPcm(caf: Buffer): Buffer {
+  if (caf.length < CAF_FILE_HEADER_SIZE || caf.toString("latin1", 0, 4) !== "caff") {
+    throw new Error("Not a CAF file");
+  }
+  let offset = CAF_FILE_HEADER_SIZE;
+  while (offset + CAF_CHUNK_HEADER_SIZE <= caf.length) {
+    const chunkType = caf.toString("latin1", offset, offset + 4);
+    // 64-bit size read as two 32-bit halves; a negative value means "unknown,
+    // runs to end of file", which the format allows only for the last chunk
+    const sizeHigh = caf.readInt32BE(offset + 4);
+    const sizeLow = caf.readUInt32BE(offset + 8);
+    const sizeUnknown = sizeHigh < 0;
+    const bodyStart = offset + CAF_CHUNK_HEADER_SIZE;
+    const bodyEnd = sizeUnknown
+      ? caf.length
+      : Math.min(caf.length, bodyStart + sizeHigh * 0x100000000 + sizeLow);
+    if (chunkType === "data") {
+      const pcmStart = bodyStart + CAF_DATA_EDIT_COUNT_SIZE;
+      if (pcmStart > bodyEnd) {
+        throw new Error("CAF 'data' chunk is truncated");
+      }
+      return caf.subarray(pcmStart, bodyEnd);
+    }
+    if (sizeUnknown) break;
+    offset = bodyEnd;
+  }
+  throw new Error("CAF file missing 'data' chunk");
+}
+/**
+ * Decode the macOS Glass.aiff system sound to raw 24kHz int16 PCM.
+ * Uses afconvert (macOS built-in) to convert to CAF format, then strips
+ * the CAF container to extract the raw PCM payload. The temp file is
+ * removed whether or not conversion and reading succeed.
+ *
+ * @returns Buffer containing raw 24kHz int16 mono PCM
+ * @throws Error if afconvert fails, temp file cannot be read, or CAF has no data chunk
+ */
+export function decodeChimeToPcm(): Buffer {
+  try {
+    execSync(`afconvert -f caff -d LEI16@24000 -c 1 ${READY_CHIME_PATH} ${CHIME_TEMP_PATH}`);
+    // readFileSync copies the file into memory, so deleting it afterwards is safe
+    return extractCafPcm(readFileSync(CHIME_TEMP_PATH));
+  } finally {
+    try {
+      unlinkSync(CHIME_TEMP_PATH);
+    } catch {
+      // Nothing to remove (afconvert never created the file); keep the original error
+    }
+  }
+}