npm - voicecc - Versions diffs - 1.0.7 - Mend

voicecc 1.0.7

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Files changed (59) hide show

package/.claude-plugin/plugin.json +6 -0
package/README.md +48 -0
package/bin/voicecc.js +39 -0
package/dashboard/dist/assets/index-BXemFrMp.css +1 -0
package/dashboard/dist/assets/index-dAYfRls7.js +11 -0
package/dashboard/dist/audio-processor.js +126 -0
package/dashboard/dist/index.html +13 -0
package/dashboard/routes/auth.ts +119 -0
package/dashboard/routes/browser-call.ts +87 -0
package/dashboard/routes/claude-md.ts +50 -0
package/dashboard/routes/conversations.ts +203 -0
package/dashboard/routes/integrations.ts +154 -0
package/dashboard/routes/mcp-servers.ts +198 -0
package/dashboard/routes/settings.ts +64 -0
package/dashboard/routes/tunnel.ts +66 -0
package/dashboard/routes/twilio.ts +120 -0
package/dashboard/routes/voice.ts +48 -0
package/dashboard/routes/webrtc.ts +85 -0
package/dashboard/server.ts +130 -0
package/dashboard/tsconfig.json +13 -0
package/init/CLAUDE.md +18 -0
package/package.json +59 -0
package/run.ts +68 -0
package/scripts/postinstall.js +228 -0
package/services/browser-call-manager.ts +106 -0
package/services/device-pairing.ts +176 -0
package/services/env.ts +88 -0
package/services/tunnel.ts +204 -0
package/services/twilio-manager.ts +126 -0
package/sidecar/assets/startup.pcm +0 -0
package/sidecar/audio-adapter.ts +60 -0
package/sidecar/audio-capture.ts +220 -0
package/sidecar/browser-audio-playback.test.ts +149 -0
package/sidecar/browser-audio.ts +147 -0
package/sidecar/browser-server.ts +331 -0
package/sidecar/chime.test.ts +69 -0
package/sidecar/chime.ts +54 -0
package/sidecar/claude-session.ts +295 -0
package/sidecar/endpointing.ts +163 -0
package/sidecar/index.ts +83 -0
package/sidecar/local-audio.ts +126 -0
package/sidecar/mic-vpio +0 -0
package/sidecar/mic-vpio.swift +484 -0
package/sidecar/mock-tts-server-tagged.mjs +132 -0
package/sidecar/narration.ts +204 -0
package/sidecar/scripts/generate-startup-audio.py +79 -0
package/sidecar/session-lock.ts +123 -0
package/sidecar/sherpa-onnx-node.d.ts +4 -0
package/sidecar/stt.ts +199 -0
package/sidecar/tts-server.py +193 -0
package/sidecar/tts.ts +481 -0
package/sidecar/twilio-audio.ts +338 -0
package/sidecar/twilio-server.ts +436 -0
package/sidecar/types.ts +210 -0
package/sidecar/vad.ts +101 -0
package/sidecar/voice-loop-bugs.test.ts +522 -0
package/sidecar/voice-session.ts +523 -0
package/skills/voice/SKILL.md +26 -0
package/tsconfig.json +22 -0

package/sidecar/audio-adapter.ts ADDED Viewed

@@ -0,0 +1,60 @@
+/**
+ * AudioAdapter interface for abstracting audio I/O in voice sessions.
+ *
+ * Any audio transport (local mic, Twilio, WhatsApp) implements this interface
+ * so the voice session logic remains transport-agnostic.
+ *
+ * Responsibilities:
+ * - Define a common contract for audio input (microphone) and output (speaker)
+ * - Support playback interruption and resumption
+ * - Provide a ready chime signal
+ * - Clean up resources on destroy
+ */
+// ============================================================================
+// INTERFACES
+// ============================================================================
+/**
+ * Abstraction over audio I/O for the voice session.
+ * Implemented by local-audio.ts (VPIO), twilio-audio.ts (WebSocket) and
+ * browser-audio.ts (browser WebSocket).
+ */
+export interface AudioAdapter {
+  /**
+   * Subscribe to incoming audio chunks from the microphone.
+   * The callback receives Float32Array samples (16kHz, normalized -1.0 to 1.0).
+   * The callback is synchronous -- the consumer wraps async work internally.
+   *
+   * @param callback - Called with each audio chunk as Float32Array
+   */
+  onAudio: (callback: (samples: Float32Array) => void) => void;
+  /**
+   * Write PCM audio to the speaker output.
+   * Audio format: 16-bit signed, 24kHz mono.
+   *
+   * @param pcm - Raw PCM buffer to play
+   * @returns Resolves when the write completes (backpressure). An implementation
+   *          may also resolve without writing once its transport has closed
+   *          (browser-audio.ts does this).
+   */
+  writeSpeaker: (pcm: Buffer) => Promise<void>;
+  /**
+   * Clear the output audio buffer immediately (user interruption).
+   */
+  interrupt: () => void;
+  /**
+   * Resume output after an interrupt. Must be called before writing new audio.
+   * May be a no-op for transports that resume on their own (browser-audio.ts).
+   */
+  resume: () => void;
+  /**
+   * Play the ready chime through the output.
+   */
+  playChime: () => void;
+  /**
+   * Clean up all resources (kill processes, close connections).
+   * May be a no-op when the transport's lifecycle is owned elsewhere (browser-audio.ts).
+   */
+  destroy: () => void;
+}

package/sidecar/audio-capture.ts ADDED Viewed

@@ -0,0 +1,220 @@
+/**
+ * Audio I/O via macOS Voice Processing IO (VPIO) with echo cancellation.
+ *
+ * Spawns a native mic-vpio binary that uses macOS's built-in acoustic echo
+ * cancellation. The binary handles both mic capture and speaker playback
+ * through a single VPIO AudioUnit, so the AEC has a reference signal of
+ * what's being played to subtract from the mic input.
+ *
+ * Responsibilities:
+ * - Start/stop the mic-vpio binary for echo-cancelled audio I/O
+ * - Provide a readable stream of echo-cancelled 16-bit signed PCM mic data
+ * - Provide a writable stream for TTS audio playback
+ * - Support playback interruption (clears audio buffer via SIGUSR1)
+ * - Convert raw PCM buffers to Float32Array for downstream VAD/STT consumption
+ */
+import { spawn, type ChildProcess } from "child_process";
+import { join, dirname } from "path";
+import { fileURLToPath } from "url";
+import type { Readable, Writable } from "stream";
+// ============================================================================
+// CONSTANTS
+// ============================================================================
+/** Divisor for normalizing 16-bit signed PCM to -1.0..1.0 range (2^15, the magnitude of the most negative int16) */
+const PCM_16BIT_MAX = 32768.0;
+/** Number of bytes per 16-bit sample */
+const BYTES_PER_SAMPLE = 2;
+/** Directory of this module, derived from import.meta.url; the compiled mic-vpio binary is resolved relative to it */
+const __dirname = dirname(fileURLToPath(import.meta.url));
+const MIC_VPIO_BIN = join(__dirname, "mic-vpio");
+/** Maximum time to wait for the VPIO binary to print READY on stderr before startup is treated as failed (ms) */
+const READY_TIMEOUT_MS = 10_000;
+// ============================================================================
+// INTERFACES
+// ============================================================================
+/** Streams returned by startCapture for both mic input and speaker output (the mic-vpio child's stdout and stdin) */
+interface AudioIO {
+  /** Readable stream of echo-cancelled mic PCM (16-bit signed, mono); this is the child process's stdout */
+  micStream: Readable;
+  /** Writable stream for TTS PCM playback (16-bit signed, mono); this is the child process's stdin */
+  speakerInput: Writable;
+}
+// ============================================================================
+// STATE
+// ============================================================================
+/** The active mic-vpio child process, or null when no capture is running (set by startCapture, cleared by stopCapture) */
+let vpioProcess: ChildProcess | null = null;
+// ============================================================================
+// MAIN HANDLERS
+// ============================================================================
+/**
+ * Start the VPIO audio I/O process with echo cancellation.
+ *
+ * Spawns the mic-vpio binary with the mic and speaker sample rates as its two
+ * arguments and waits for it to report READY on stderr before returning.
+ *
+ * If startup fails for any reason (missing stdio, spawn error, early exit or
+ * READY timeout) the child is killed and the module state is reset, so a later
+ * startCapture() call can retry instead of failing with "already in progress".
+ * The state is also reset when the process exits on its own after startup.
+ *
+ * @param micRate - Mic output sample rate in Hz (e.g. 16000 for VAD/STT)
+ * @param speakerRate - Speaker input sample rate in Hz (e.g. 24000 for TTS)
+ * @returns AudioIO with mic and speaker streams
+ * @throws Error if already capturing, binary not found, or initialization fails
+ */
+async function startCapture(micRate: number, speakerRate: number): Promise<AudioIO> {
+  if (vpioProcess) {
+    throw new Error("Capture already in progress. Call stopCapture() first.");
+  }
+  const proc = spawn(MIC_VPIO_BIN, [String(micRate), String(speakerRate)]);
+  vpioProcess = proc;
+  // Forget the process once it is gone so isCapturing() reflects reality.
+  // The identity check avoids clobbering a newer process started after a stop.
+  proc.once("exit", () => {
+    if (vpioProcess === proc) vpioProcess = null;
+  });
+  try {
+    const { stdout, stdin } = proc;
+    if (!stdout || !stdin) {
+      throw new Error("Failed to get mic-vpio stdio streams");
+    }
+    // Wait for the binary to report READY on stderr
+    await waitForReady(proc);
+    return { micStream: stdout, speakerInput: stdin };
+  } catch (err) {
+    // Startup failed: release the slot and make sure no half-started child lingers.
+    // kill() is a harmless no-op when the process never spawned or already exited.
+    if (vpioProcess === proc) vpioProcess = null;
+    proc.kill();
+    throw err;
+  }
+}
+/**
+ * Cut off whatever is currently playing through the speaker.
+ * Signals the mic-vpio process with SIGUSR1, which clears its pending audio
+ * and makes it discard stale PCM still sitting in the OS pipe buffer.
+ * Does nothing when no VPIO process is active.
+ */
+function interruptPlayback(): void {
+  vpioProcess?.kill("SIGUSR1");
+}
+/**
+ * Re-enable speaker playback following interruptPlayback().
+ * Signals the mic-vpio process with SIGUSR2 so it stops discarding stdin data
+ * and lets new PCM reach the ring buffer and speakers again.
+ * Callers must invoke this before writing new audio after an interrupt.
+ * Does nothing when no VPIO process is active.
+ */
+function resumePlayback(): void {
+  const active = vpioProcess;
+  if (active === null) return;
+  active.kill("SIGUSR2");
+}
+/**
+ * Terminate the VPIO audio I/O process, if one is running, and release the
+ * module-level handle so a new capture can be started.
+ */
+function stopCapture(): void {
+  const running = vpioProcess;
+  if (running !== null) {
+    running.kill();
+    vpioProcess = null;
+  }
+}
+/**
+ * Report whether a VPIO process handle is currently held by this module.
+ *
+ * @returns true while a mic-vpio process is registered as active, false otherwise
+ */
+function isCapturing(): boolean {
+  return Boolean(vpioProcess);
+}
+// ============================================================================
+// HELPER FUNCTIONS
+// ============================================================================
+/**
+ * Wait for the mic-vpio binary to print READY on stderr.
+ *
+ * The returned promise settles exactly once: it resolves when the accumulated
+ * stderr output contains "READY", and rejects on a spawn error, an early exit,
+ * or when READY_TIMEOUT_MS elapses first. Once settled, the startup-only
+ * listeners are removed; stderr keeps being forwarded to the console and the
+ * error listener stays attached so a late process error is logged rather than
+ * raised as an unhandled 'error' event.
+ *
+ * @param proc - The mic-vpio child process
+ * @throws Error if the process exits or times out before READY
+ */
+function waitForReady(proc: ChildProcess): Promise<void> {
+  return new Promise<void>((resolve, reject) => {
+    const stderr = proc.stderr;
+    if (!stderr) {
+      reject(new Error("mic-vpio stderr stream is not available"));
+      return;
+    }
+    let stderrBuffer = "";
+    let settled = false;
+    // Print each non-empty stderr line; the READY marker itself is hidden while waiting.
+    const logLines = (text: string, hideReady: boolean): void => {
+      for (const line of text.split("\n")) {
+        const trimmed = line.trim();
+        if (trimmed && !(hideReady && trimmed === "READY")) {
+          console.log(`[mic-vpio] ${trimmed}`);
+        }
+      }
+    };
+    // Settle the promise once and swap the startup listeners for a plain stderr logger.
+    const settle = (failure?: Error): void => {
+      if (settled) return;
+      settled = true;
+      clearTimeout(timeout);
+      stderr.off("data", onData);
+      proc.off("exit", onExit);
+      // Continue logging stderr after READY (or after a failed startup)
+      stderr.on("data", (d: Buffer) => logLines(d.toString(), false));
+      if (failure) reject(failure);
+      else resolve();
+    };
+    const onData = (data: Buffer): void => {
+      const text = data.toString();
+      // Accumulate so a READY marker split across two chunks is still detected
+      stderrBuffer += text;
+      logLines(text, true);
+      if (stderrBuffer.includes("READY")) settle();
+    };
+    const onExit = (code: number | null): void => {
+      settle(new Error(`mic-vpio exited with code ${code} before READY`));
+    };
+    const timeout = setTimeout(() => {
+      settle(new Error(`mic-vpio did not become ready within ${READY_TIMEOUT_MS}ms`));
+    }, READY_TIMEOUT_MS);
+    stderr.on("data", onData);
+    proc.on("error", (err) => {
+      if (settled) {
+        console.log(`[mic-vpio] process error: ${err.message}`);
+        return;
+      }
+      settle(new Error(
+        `mic-vpio failed to start: ${err.message}. ` +
+        `Compile with: swiftc -O -o sidecar/mic-vpio sidecar/mic-vpio.swift -framework AudioToolbox -framework CoreAudio`
+      ));
+    });
+    proc.on("exit", onExit);
+  });
+}
+/**
+ * Converts a raw 16-bit signed PCM buffer to a Float32Array normalized to -1.0..1.0.
+ *
+ * Each pair of bytes in the buffer represents one 16-bit signed little-endian sample.
+ * The normalized value is computed as: sample / 32768.0
+ *
+ * Only whole samples are converted. Stream chunks are not guaranteed to end on
+ * a sample boundary, so a trailing odd byte is ignored instead of causing an
+ * out-of-range read; an empty buffer yields an empty array.
+ *
+ * @param buffer - Raw 16-bit signed PCM buffer from the mic stream
+ * @returns Float32Array with values in the range -1.0 to 1.0
+ */
+function bufferToFloat32(buffer: Buffer): Float32Array {
+  // Floor so an odd byte count never produces a fractional sample count
+  const sampleCount = Math.floor(buffer.length / BYTES_PER_SAMPLE);
+  const float32 = new Float32Array(sampleCount);
+  for (let i = 0; i < sampleCount; i++) {
+    float32[i] = buffer.readInt16LE(i * BYTES_PER_SAMPLE) / PCM_16BIT_MAX;
+  }
+  return float32;
+}
+export { startCapture, stopCapture, interruptPlayback, resumePlayback, isCapturing, bufferToFloat32 };
+export type { AudioIO };

package/sidecar/browser-audio-playback.test.ts ADDED Viewed

@@ -0,0 +1,149 @@
+/**
+ * Tests that the AudioWorklet processor plays back all TTS audio without
+ * dropping samples, regardless of chunk size or arrival timing.
+ *
+ * Loads the actual audio-processor.js and exercises it through the same
+ * postMessage/process interface the browser uses. Tests outcomes only --
+ * no assumptions about internal buffering strategy.
+ *
+ * Run: npx tsx --test sidecar/browser-audio-playback.test.ts
+ */
+import { test } from "node:test";
+import { strict as assert } from "node:assert";
+import { readFileSync } from "fs";
+import { join, dirname } from "path";
+import { fileURLToPath } from "url";
+const __dirname = dirname(fileURLToPath(import.meta.url));
+// ============================================================================
+// HARNESS -- stub browser AudioWorklet APIs so we can load audio-processor.js
+// ============================================================================
+/**
+ * Load audio-processor.js outside the browser and return a thin handle to it.
+ *
+ * The script source is evaluated with stubbed AudioWorkletProcessor and
+ * registerProcessor globals; the class passed to registerProcessor is then
+ * instantiated. Fails with a descriptive error if the script never registers
+ * a processor class.
+ *
+ * @returns postMessage (delivers a message to the processor's port handler)
+ *          and process (runs one render quantum)
+ * @throws Error if the script does not call registerProcessor with a class
+ */
+function loadProcessor(): {
+  postMessage: (data: Record<string, unknown>) => void;
+  process: (inputs: Float32Array[][], outputs: Float32Array[][]) => boolean;
+} {
+  const source = readFileSync(join(__dirname, "../dashboard/public/audio-processor.js"), "utf-8");
+  let ProcessorClass: any;
+  // Stub globals that audio-processor.js expects
+  const globals = {
+    AudioWorkletProcessor: class {
+      port = {
+        onmessage: null as ((event: { data: Record<string, unknown> }) => void) | null,
+        postMessage(_data: unknown) {},
+      };
+    },
+    registerProcessor(_name: string, cls: any) {
+      ProcessorClass = cls;
+    },
+  };
+  // Evaluate the script with the stubs bound as its only "globals"
+  const fn = new Function(...Object.keys(globals), source);
+  fn(...Object.values(globals));
+  if (typeof ProcessorClass !== "function") {
+    throw new Error("audio-processor.js did not call registerProcessor() with a processor class");
+  }
+  const instance = new ProcessorClass();
+  return {
+    postMessage(data: Record<string, unknown>) {
+      instance.port.onmessage?.({ data });
+    },
+    process(inputs: Float32Array[][], outputs: Float32Array[][]) {
+      return instance.process(inputs, outputs, {});
+    },
+  };
+}
+// ============================================================================
+// TESTS
+// ============================================================================
+/**
+ * Simulates the exact scenario from the logs:
+ *   chunk 0: 2.0s audio at 24kHz -> 96,000 samples at 48kHz
+ *   chunk 1: 3.0s audio at 24kHz -> 144,000 samples at 48kHz
+ *
+ * Both chunks arrive within ~500ms. The process() callback drains 128
+ * samples per frame. Between the two chunk arrivals, only ~24,000 samples
+ * drain -- far less than the total audio.
+ *
+ * All 240,000 samples should be played back with no drops. The final drain is
+ * capped so a processor that never falls silent fails the test instead of
+ * hanging the run.
+ */
+test("all TTS audio plays back without drops across multi-second chunks", () => {
+  const proc = loadProcessor();
+  const BROWSER_RATE = 48_000;
+  const FRAME_SIZE = 128;
+  // Chunk 0: 2s at 48kHz, filled with 0.5
+  const chunk0 = new Float32Array(2.0 * BROWSER_RATE);
+  chunk0.fill(0.5);
+  // Chunk 1: 3s at 48kHz, filled with 0.3
+  const chunk1 = new Float32Array(3.0 * BROWSER_RATE);
+  chunk1.fill(0.3);
+  const totalSamples = chunk0.length + chunk1.length; // 240,000
+  // Run one process() quantum and report how many output samples were non-zero
+  const renderFrame = (): number => {
+    const output = new Float32Array(FRAME_SIZE);
+    proc.process([[new Float32Array(FRAME_SIZE)]], [[output]]);
+    let nonSilent = 0;
+    for (let j = 0; j < output.length; j++) {
+      if (output[j] !== 0) nonSilent++;
+    }
+    return nonSilent;
+  };
+  // Post chunk 0
+  proc.postMessage({ type: "playback", samples: chunk0 });
+  // Simulate ~500ms of process() draining between chunk arrivals
+  const framesBetweenChunks = Math.floor((0.5 * BROWSER_RATE) / FRAME_SIZE);
+  let totalNonSilent = 0;
+  for (let i = 0; i < framesBetweenChunks; i++) {
+    totalNonSilent += renderFrame();
+  }
+  // Post chunk 1
+  proc.postMessage({ type: "playback", samples: chunk1 });
+  // Drain until three consecutive frames are fully silent (queue exhausted),
+  // giving up after twice the frames the whole payload could possibly need
+  const maxDrainFrames = 2 * Math.ceil(totalSamples / FRAME_SIZE);
+  let drainedFrames = 0;
+  let silentFrames = 0;
+  while (silentFrames < 3) {
+    assert.ok(
+      drainedFrames < maxDrainFrames,
+      `Processor was still producing audio after ${maxDrainFrames} drain frames; expected silence once the queue is empty`
+    );
+    drainedFrames++;
+    const nonSilent = renderFrame();
+    totalNonSilent += nonSilent;
+    silentFrames = nonSilent === 0 ? silentFrames + 1 : 0;
+  }
+  assert.equal(
+    totalNonSilent, totalSamples,
+    `Expected all ${totalSamples} samples (${(totalSamples / BROWSER_RATE).toFixed(1)}s) to play back, ` +
+    `but only ${totalNonSilent} (${(totalNonSilent / BROWSER_RATE).toFixed(1)}s) were non-silent. ` +
+    `${totalSamples - totalNonSilent} samples were dropped.`
+  );
+});
+/**
+ * A "clear" message must drop everything queued so far: the very next
+ * process() call is expected to produce an all-zero frame.
+ */
+test("clear discards all pending audio", () => {
+  const FRAME_SIZE = 128;
+  const processor = loadProcessor();
+  const queued = new Float32Array(100_000).fill(0.5);
+  processor.postMessage({ type: "playback", samples: queued });
+  processor.postMessage({ type: "clear" });
+  const frame = new Float32Array(FRAME_SIZE);
+  processor.process([[new Float32Array(FRAME_SIZE)]], [[frame]]);
+  frame.forEach((sample, index) => {
+    assert.equal(sample, 0, `Expected silence at index ${index} after clear`);
+  });
+});

package/sidecar/browser-audio.ts ADDED Viewed

@@ -0,0 +1,147 @@
+/**
+ * Browser audio adapter for direct WebSocket connections.
+ *
+ * Implements the AudioAdapter interface for browser-based voice calls by
+ * exchanging raw PCM audio over a WebSocket. Simpler than TwilioAudioAdapter --
+ * no mulaw codec, no Twilio-specific protocol framing.
+ *
+ * Responsibilities:
+ * - Receive Float32Array PCM at 16kHz from the browser via binary WebSocket messages
+ * - Send int16 24kHz PCM as binary WebSocket messages to the browser
+ * - Handle backpressure on writeSpeaker via ws.send callback
+ * - Send JSON control messages (e.g. "clear" for interruption)
+ * - Cache the ready chime as 24kHz PCM for playback
+ */
+import type { WebSocket } from "ws";
+import type { AudioAdapter } from "./audio-adapter.js";
+import { decodeChimeToPcm } from "./chime.js";
+// ============================================================================
+// TYPES
+// ============================================================================
+/** Configuration for creating a browser audio adapter */
+export interface BrowserAudioAdapterConfig {
+  /** Active WebSocket connection to the browser; the adapter attaches listeners to it but does not close it */
+  ws: WebSocket;
+}
+// ============================================================================
+// MAIN ENTRYPOINT
+// ============================================================================
+/**
+ * Create an AudioAdapter that reads/writes audio over a browser WebSocket connection.
+ *
+ * Decodes the macOS Glass.aiff chime to raw 24kHz PCM during initialization
+ * and caches the buffer for playChime(). The browser sends Float32Array PCM at
+ * 16kHz as binary messages, and receives int16 24kHz PCM as binary messages.
+ *
+ * Incoming frames are treated as untrusted: a binary payload whose size is not
+ * a whole number of 32-bit floats is truncated to the last complete sample
+ * instead of throwing inside the WebSocket message handler.
+ *
+ * @param config - Browser WebSocket connection
+ * @returns An AudioAdapter for browser audio I/O
+ */
+export function createBrowserAudioAdapter(config: BrowserAudioAdapterConfig): AudioAdapter {
+  const { ws } = config;
+  const BYTES_PER_FLOAT32 = Float32Array.BYTES_PER_ELEMENT;
+  let wsClosed = false;
+  // Track WebSocket close state
+  ws.on("close", () => {
+    wsClosed = true;
+  });
+  // Decode chime to raw 24kHz PCM and cache it
+  const chimePcm = decodeChimeToPcm();
+  // --------------------------------------------------------------------------
+  // AudioAdapter methods
+  // --------------------------------------------------------------------------
+  /**
+   * Subscribe to incoming audio chunks from the browser.
+   * Registers a WebSocket binary message handler that converts the incoming
+   * payload to Float32Array and invokes the callback. Ignores text (JSON) messages.
+   * Trailing bytes that do not form a complete float are dropped.
+   *
+   * @param callback - Called with each audio chunk as Float32Array (16kHz)
+   */
+  function onAudio(callback: (samples: Float32Array) => void): void {
+    ws.on("message", (data: Buffer | ArrayBuffer | Buffer[] | string, isBinary: boolean) => {
+      if (wsClosed) return;
+      // Only process binary messages (audio data)
+      if (!isBinary || typeof data === "string") return;
+      // Normalize every binary payload shape the ws library can deliver into one Buffer
+      const bytes = Buffer.isBuffer(data)
+        ? data
+        : Array.isArray(data)
+          ? Buffer.concat(data)
+          : Buffer.from(data);
+      // Float32Array requires a byte length that is a multiple of 4
+      const usableBytes = bytes.byteLength - (bytes.byteLength % BYTES_PER_FLOAT32);
+      // Copy into a fresh ArrayBuffer to ensure 4-byte alignment
+      const aligned = new ArrayBuffer(usableBytes);
+      new Uint8Array(aligned).set(bytes.subarray(0, usableBytes));
+      callback(new Float32Array(aligned));
+    });
+  }
+  /**
+   * Write PCM audio to the browser via WebSocket.
+   * Sends 24kHz int16 PCM buffer as a binary WebSocket message.
+   * Uses ws.send callback for backpressure -- resolves when the data is flushed.
+   * Silently returns if the WebSocket has closed.
+   *
+   * @param pcm - Raw PCM buffer (16-bit signed, 24kHz mono)
+   * @returns Resolves when the write completes
+   */
+  function writeSpeaker(pcm: Buffer): Promise<void> {
+    if (wsClosed) return Promise.resolve();
+    return new Promise<void>((resolve) => {
+      ws.send(pcm, { binary: true }, () => {
+        // Resolve on both success and error -- write errors mean the
+        // connection is closing, and callers should not need to handle that
+        resolve();
+      });
+    });
+  }
+  /**
+   * Clear the browser's playback buffer immediately (user interruption).
+   * Sends a JSON "clear" message over the WebSocket. Best effort: a send
+   * failure on a closing socket is ignored, matching writeSpeaker.
+   */
+  function interrupt(): void {
+    if (wsClosed) return;
+    ws.send(JSON.stringify({ type: "clear" }), () => {
+      // Errors are reported to this callback rather than thrown; nothing to do
+    });
+  }
+  /**
+   * Resume output after an interrupt. No-op for browser --
+   * AudioWorklet resumes consuming from ring buffer automatically after clear.
+   */
+  function resume(): void {
+    // No-op: browser AudioWorklet resumes automatically
+  }
+  /**
+   * Play the ready chime by sending the cached 24kHz PCM through writeSpeaker.
+   * Fire-and-forget: writeSpeaker never rejects.
+   */
+  function playChime(): void {
+    void writeSpeaker(chimePcm);
+  }
+  /**
+   * Clean up resources. No-op for browser -- WebSocket lifecycle is
+   * managed by browser-server.ts.
+   */
+  function destroy(): void {
+    // No-op: WebSocket lifecycle managed by browser-server.ts
+  }
+  return {
+    onAudio,
+    writeSpeaker,
+    interrupt,
+    resume,
+    playChime,
+    destroy,
+  };
+}