npm - agent-voice - Versions diffs - 0.1.2 → 0.2.0 - Mend

agent-voice 0.1.2 → 0.2.0

This diff shows the changes between two publicly available versions of a package that have been released to one of the supported registries. It is provided for informational purposes only and reflects the contents of each package version as it appears in its respective public registry.

Files changed (12) hide show

package/dist/ask-GUSXGYSY.js +208 -0
package/dist/auth-KET5DNSE.js +63 -0
package/dist/{chunk-D3AGL5JD.js → chunk-AHLLYIEW.js} +2 -0
package/dist/{chunk-7ERYR6ZY.js → chunk-RGYWLATZ.js} +1 -1
package/dist/{chunk-EBYXFYS5.js → chunk-VV2VNOC4.js} +1 -70
package/dist/cli.js +103 -21
package/dist/index.d.ts +6 -16
package/dist/index.js +183 -129
package/dist/{say-HPM3WIE2.js → say-W56HCNK4.js} +21 -19
package/package.json +5 -11
package/dist/ask-NW4PBKFP.js +0 -93
package/dist/auth-42XIU3B7.js +0 -37

package/dist/ask-GUSXGYSY.js ADDED Viewed

@@ -0,0 +1,208 @@
+#!/usr/bin/env node
+import {
+  createRealtimeSession
+} from "./chunk-VV2VNOC4.js";
+import {
+  DEFAULT_VOICE,
+  SAMPLE_RATE
+} from "./chunk-AHLLYIEW.js";
+// src/ask.ts
+import { createRequire } from "module";
+var require2 = createRequire(import.meta.url);
+async function ask(message, options = {}) {
+  const {
+    voice = DEFAULT_VOICE,
+    timeout = 30,
+    ack = false,
+    auth,
+    onAudioFrameSent,
+    onAssistantAudio,
+    onMicAudio
+  } = options;
+  const { AudioEngine } = require2("agent-voice-audio");
+  const streamDelayMs = Number.parseInt(
+    process.env.AGENT_VOICE_AEC_STREAM_DELAY_MS ?? "30",
+    10
+  );
+  const engine = new AudioEngine({
+    sampleRate: SAMPLE_RATE,
+    channels: 1,
+    enableAec: true,
+    streamDelayMs
+  });
+  engine.start();
+  const debug = process.env.AGENT_VOICE_DEBUG_ASK_EVENTS === "1";
+  const startMs = Date.now();
+  function logEvent(event, detail) {
+    if (!debug) return;
+    const elapsed = Date.now() - startMs;
+    const suffix = detail ? ` ${detail}` : "";
+    process.stderr.write(`[ask ${elapsed}ms] ${event}${suffix}
+`);
+  }
+  logEvent("start");
+  return new Promise((resolve, reject) => {
+    let transcript = "";
+    let timeoutTimer = null;
+    let responseStartTimer = null;
+    let transcriptTimer = null;
+    let capturePollTimer = null;
+    let speechDetected = false;
+    let initialResponseDone = false;
+    let heardAssistantAudio = false;
+    let lastAssistantAudioAt = 0;
+    let cleaned = false;
+    let settled = false;
+    async function cleanup() {
+      if (cleaned) return;
+      cleaned = true;
+      logEvent("cleanup:start");
+      if (timeoutTimer) clearTimeout(timeoutTimer);
+      if (responseStartTimer) clearTimeout(responseStartTimer);
+      if (transcriptTimer) clearTimeout(transcriptTimer);
+      if (capturePollTimer) clearInterval(capturePollTimer);
+      try {
+        engine.stop();
+        engine.close();
+      } catch {
+      }
+      session.close();
+      logEvent("cleanup:done");
+    }
+    function resolveOnce(value) {
+      if (settled) return;
+      settled = true;
+      cleanup().then(() => resolve(value));
+    }
+    function rejectOnce(error) {
+      if (settled) return;
+      settled = true;
+      cleanup().then(() => reject(error));
+    }
+    capturePollTimer = setInterval(() => {
+      if (settled) return;
+      let rawFrames = [];
+      let processedFrames = [];
+      try {
+        rawFrames = engine.readRawCapture(64);
+        processedFrames = engine.readProcessedCapture(64);
+      } catch (err) {
+        rejectOnce(
+          new Error(
+            `audio engine capture read failed: ${err instanceof Error ? err.message : String(err)}`
+          )
+        );
+        return;
+      }
+      for (const frame of rawFrames) onMicAudio?.(frame);
+      if (!heardAssistantAudio) return;
+      for (const frame of processedFrames) {
+        onAudioFrameSent?.(frame);
+        session.sendAudio(frame);
+      }
+    }, 10);
+    const session = createRealtimeSession({
+      voice,
+      mode: "default",
+      ack,
+      auth,
+      onAudioDelta(pcm16) {
+        logEvent("realtime:audio_delta", `bytes=${pcm16.length}`);
+        heardAssistantAudio = true;
+        lastAssistantAudioAt = Date.now();
+        onAssistantAudio?.(pcm16);
+        engine.play(pcm16);
+      },
+      onTranscript(text) {
+        const echoGuardMs = Number.parseInt(
+          process.env.AGENT_VOICE_ECHO_GUARD_MS ?? "1500",
+          10
+        );
+        const sinceAssistantMs = Date.now() - lastAssistantAudioAt;
+        if (heardAssistantAudio && sinceAssistantMs < echoGuardMs) {
+          logEvent(
+            "realtime:transcript_ignored_echo_guard",
+            `since_assistant_ms=${sinceAssistantMs} text="${text}"`
+          );
+          return;
+        }
+        logEvent("realtime:transcript", `text="${text}"`);
+        if (transcriptTimer) {
+          clearTimeout(transcriptTimer);
+          transcriptTimer = null;
+        }
+        transcript = text;
+        if (!ack) resolveOnce(transcript);
+      },
+      onSpeechStarted() {
+        logEvent("realtime:speech_started");
+        speechDetected = true;
+        if (timeoutTimer) {
+          clearTimeout(timeoutTimer);
+          timeoutTimer = null;
+        }
+        if (transcriptTimer) clearTimeout(transcriptTimer);
+        transcriptTimer = setTimeout(() => {
+          logEvent("timeout:no_transcript_after_speech");
+          rejectOnce(
+            new Error(
+              `No transcript received within ${timeout}s after speech started`
+            )
+          );
+        }, timeout * 1e3);
+        if (!initialResponseDone && heardAssistantAudio) {
+          try {
+            engine.play(Buffer.alloc(0));
+          } catch {
+          }
+        }
+      },
+      onInitialResponseDone() {
+        logEvent("realtime:initial_response_done");
+        initialResponseDone = true;
+        timeoutTimer = setTimeout(() => {
+          if (!speechDetected) {
+            logEvent("timeout:no_speech");
+            rejectOnce(
+              new Error(`No speech detected within ${timeout}s timeout`)
+            );
+          }
+        }, timeout * 1e3);
+      },
+      onDone() {
+        logEvent("realtime:done");
+        if (ack) resolveOnce(transcript);
+      },
+      onError(error) {
+        logEvent("realtime:error", error);
+        rejectOnce(new Error(error));
+      }
+    });
+    session.connect().then(
+      () => {
+        logEvent("realtime:connected");
+        logEvent("realtime:send_message");
+        session.sendMessage(message);
+        responseStartTimer = setTimeout(() => {
+          if (!heardAssistantAudio) {
+            logEvent("timeout:no_assistant_audio");
+            rejectOnce(
+              new Error("No assistant audio received after sending message")
+            );
+          }
+        }, 1e4);
+      },
+      (err) => {
+        logEvent(
+          "realtime:connect_error",
+          err instanceof Error ? err.message : String(err)
+        );
+        rejectOnce(err instanceof Error ? err : new Error(String(err)));
+      }
+    );
+  });
+}
+export {
+  ask
+};

package/dist/auth-KET5DNSE.js ADDED Viewed

@@ -0,0 +1,63 @@
+#!/usr/bin/env node
+import {
+  writeAuthConfig
+} from "./chunk-RGYWLATZ.js";
+import "./chunk-AHLLYIEW.js";
+// src/auth.ts
+import { input, password } from "@inquirer/prompts";
+import OpenAI from "openai";
+var DEFAULT_BASE_URL = "https://api.openai.com/v1";
+async function verifyAuth(apiKey, baseURL) {
+  const client = new OpenAI({ apiKey, baseURL });
+  await client.models.list();
+}
+async function readKeyFromStdin() {
+  const chunks = [];
+  for await (const chunk of process.stdin) chunks.push(chunk);
+  return Buffer.concat(chunks).toString("utf-8").trim();
+}
+async function auth(flags = {}) {
+  const nonInteractive = flags.apiUrl != null || flags.apiKey != null || flags.noVerify === true;
+  let baseUrl;
+  let apiKey;
+  if (nonInteractive) {
+    baseUrl = flags.apiUrl ?? DEFAULT_BASE_URL;
+    if (flags.apiKey) {
+      apiKey = flags.apiKey;
+    } else {
+      apiKey = await readKeyFromStdin();
+      if (!apiKey) {
+        throw new Error(
+          "No API key provided. Pass --api-key or pipe via stdin."
+        );
+      }
+    }
+    if (!flags.noVerify) {
+      process.stderr.write("Verifying...\n");
+      await verifyAuth(apiKey, baseUrl);
+    }
+  } else {
+    baseUrl = await input({
+      message: "Base URL",
+      default: DEFAULT_BASE_URL
+    });
+    apiKey = await password({
+      message: "API key"
+    });
+    if (!apiKey) {
+      throw new Error("API key is required.");
+    }
+    process.stderr.write("Verifying...\n");
+    await verifyAuth(apiKey, baseUrl);
+  }
+  const config = { apiKey };
+  if (baseUrl !== DEFAULT_BASE_URL) {
+    config.baseUrl = baseUrl;
+  }
+  writeAuthConfig(config);
+  process.stderr.write("Auth config saved to ~/.agent-voice/config.json\n");
+}
+export {
+  auth
+};

package/dist/{chunk-D3AGL5JD.js → chunk-AHLLYIEW.js} RENAMED Viewed

@@ -3,6 +3,7 @@
 // src/types.ts
 var SAMPLE_RATE = 24e3;
 var CHANNELS = 1;
+var BIT_DEPTH = 16;
 var VOICES = [
   "alloy",
   "ash",
@@ -21,6 +22,7 @@ var DEFAULT_VOICE = "ash";
 export {
   SAMPLE_RATE,
   CHANNELS,
+  BIT_DEPTH,
   VOICES,
   DEFAULT_VOICE
 };

package/dist/{chunk-7ERYR6ZY.js → chunk-RGYWLATZ.js} RENAMED Viewed

@@ -1,7 +1,7 @@
 #!/usr/bin/env node
 import {
   DEFAULT_VOICE
-} from "./chunk-D3AGL5JD.js";
+} from "./chunk-AHLLYIEW.js";
 // src/config.ts
 import { chmodSync, mkdirSync, readFileSync, writeFileSync } from "fs";

package/dist/{chunk-EBYXFYS5.js → chunk-VV2VNOC4.js} RENAMED Viewed

@@ -1,71 +1,4 @@
 #!/usr/bin/env node
-import {
-  CHANNELS,
-  SAMPLE_RATE
-} from "./chunk-D3AGL5JD.js";
-// src/audio.ts
-import { AudioIO, SampleFormat16Bit } from "naudiodon2";
-function createAudioPlayer() {
-  const stream = AudioIO({
-    outOptions: {
-      channelCount: CHANNELS,
-      sampleFormat: SampleFormat16Bit,
-      sampleRate: SAMPLE_RATE,
-      closeOnError: true
-    }
-  });
-  let closed = false;
-  return {
-    write(pcm16) {
-      return stream.write(pcm16);
-    },
-    start() {
-      stream.start();
-    },
-    drain() {
-      if (closed) return Promise.resolve();
-      closed = true;
-      return new Promise((resolve) => {
-        stream.quit(() => resolve());
-      });
-    },
-    close() {
-      if (closed) return;
-      closed = true;
-      stream.quit();
-    }
-  };
-}
-function createAudioRecorder() {
-  const stream = AudioIO({
-    inOptions: {
-      channelCount: CHANNELS,
-      sampleFormat: SampleFormat16Bit,
-      sampleRate: SAMPLE_RATE,
-      closeOnError: true
-    }
-  });
-  let stopped = false;
-  return {
-    onData(cb) {
-      stream.on("data", cb);
-    },
-    start() {
-      stream.start();
-    },
-    stop() {
-      if (stopped) return;
-      stopped = true;
-      stream.quit();
-    },
-    close() {
-      if (stopped) return;
-      stopped = true;
-      stream.quit();
-    }
-  };
-}
 // src/realtime.ts
 import { OpenAIRealtimeWS } from "openai/beta/realtime/ws";
@@ -87,7 +20,7 @@ function createRealtimeSession(options) {
   let rt;
   let responseCount = 0;
   function configureSession() {
-    const turnDetection = options.mode === "say" ? null : {
+    const turnDetection = options.mode === "say" ? void 0 : {
       type: "semantic_vad",
       eagerness: "medium",
       create_response: options.ack,
@@ -177,7 +110,5 @@ ${text}`
 }
 export {
-  createAudioPlayer,
-  createAudioRecorder,
   createRealtimeSession
 };

package/dist/cli.js CHANGED Viewed

@@ -3,26 +3,37 @@ import {
   resolveAuth,
   resolveVoice,
   writeVoiceConfig
-} from "./chunk-7ERYR6ZY.js";
+} from "./chunk-RGYWLATZ.js";
 import {
+  BIT_DEPTH,
+  CHANNELS,
+  SAMPLE_RATE,
   VOICES
-} from "./chunk-D3AGL5JD.js";
+} from "./chunk-AHLLYIEW.js";
 // src/cli.ts
-import { closeSync, openSync, writeSync } from "fs";
+import { closeSync, mkdirSync, openSync, writeFileSync, writeSync } from "fs";
+import { join } from "path";
 import { Command } from "commander";
-async function withSuppressedStdout() {
-  const savedFd = openSync("/dev/fd/1", "w");
+async function withSuppressedNativeOutput() {
+  const savedStdout = openSync("/dev/fd/1", "w");
+  const savedStderr = openSync("/dev/fd/2", "w");
   closeSync(1);
   openSync("/dev/null", "w");
-  const { ask } = await import("./ask-NW4PBKFP.js");
-  const { say } = await import("./say-HPM3WIE2.js");
+  closeSync(2);
+  openSync("/dev/null", "w");
+  const { ask } = await import("./ask-GUSXGYSY.js");
+  const { say } = await import("./say-W56HCNK4.js");
   function writeResult(text) {
-    writeSync(savedFd, `${text}
+    writeSync(savedStdout, `${text}
+`);
+    closeSync(savedStdout);
+  }
+  function writeError(text) {
+    writeSync(savedStderr, `${text}
 `);
-    closeSync(savedFd);
   }
-  return { ask, say, writeResult };
+  return { ask, say, writeResult, writeError };
 }
 async function readStdin() {
   if (process.stdin.isTTY) return "";
@@ -38,11 +49,47 @@ async function getMessage(flag) {
   if (stdin) return stdin;
   throw new Error("No message provided. Use -m or pipe via stdin.");
 }
+function createWavBuffer(pcm16) {
+  const header = Buffer.alloc(44);
+  const dataSize = pcm16.length;
+  const fileSize = 36 + dataSize;
+  const byteRate = SAMPLE_RATE * CHANNELS * (BIT_DEPTH / 8);
+  const blockAlign = CHANNELS * (BIT_DEPTH / 8);
+  header.write("RIFF", 0);
+  header.writeUInt32LE(fileSize, 4);
+  header.write("WAVE", 8);
+  header.write("fmt ", 12);
+  header.writeUInt32LE(16, 16);
+  header.writeUInt16LE(1, 20);
+  header.writeUInt16LE(CHANNELS, 22);
+  header.writeUInt32LE(SAMPLE_RATE, 24);
+  header.writeUInt32LE(byteRate, 28);
+  header.writeUInt16LE(blockAlign, 32);
+  header.writeUInt16LE(BIT_DEPTH, 34);
+  header.write("data", 36);
+  header.writeUInt32LE(dataSize, 40);
+  return Buffer.concat([header, pcm16]);
+}
+function writeDebugAudio(dir, assistantChunks, micChunks, modelInputChunks) {
+  mkdirSync(dir, { recursive: true });
+  const stamp = (/* @__PURE__ */ new Date()).toISOString().replace(/[:.]/g, "-");
+  const assistantFile = join(dir, `ask-${stamp}-assistant-output.wav`);
+  const micFile = join(dir, `ask-${stamp}-mic-input.wav`);
+  const modelInputFile = join(dir, `ask-${stamp}-model-input.wav`);
+  writeFileSync(assistantFile, createWavBuffer(Buffer.concat(assistantChunks)));
+  writeFileSync(micFile, createWavBuffer(Buffer.concat(micChunks)));
+  writeFileSync(modelInputFile, createWavBuffer(Buffer.concat(modelInputChunks)));
+  return { assistantFile, micFile, modelInputFile };
+}
 var program = new Command().name("agent-voice").description("AI agent voice interaction CLI");
-program.command("auth").description("Configure API key and base URL").action(async () => {
+program.command("auth").description("Configure API key and base URL").option("--api-url <url>", "Base URL for the API").option("--api-key <key>", "API key").option("--no-verify", "Skip API key verification").action(async (opts) => {
   try {
-    const { auth } = await import("./auth-42XIU3B7.js");
-    await auth();
+    const { auth } = await import("./auth-KET5DNSE.js");
+    await auth({
+      apiUrl: opts.apiUrl,
+      apiKey: opts.apiKey,
+      noVerify: !opts.verify
+    });
     process.exit(0);
   } catch (err) {
     process.stderr.write(`${err instanceof Error ? err.message : err}
@@ -73,35 +120,70 @@ voicesCmd.command("set <voice>").description("Set the default voice").action((vo
 `);
   process.exit(0);
 });
-program.command("ask").description("Speak a message and listen for a response").option("-m, --message <text>", "Text message to speak").option("--voice <name>", "OpenAI voice", defaultVoice).option("--timeout <seconds>", "Seconds to wait for user speech", "30").option("--ack", "Speak an acknowledgment after the user responds").action(async (opts) => {
+program.command("ask").description("Speak a message and listen for a response").option("-m, --message <text>", "Text message to speak").option("--voice <name>", "OpenAI voice", defaultVoice).option("--timeout <seconds>", "Seconds to wait for user speech", "120").option("--ack", "Speak an acknowledgment after the user responds").option("--debug-audio-dir <dir>", "Write ask audio debug WAVs to this directory").action(async (opts) => {
+  const { ask, writeResult, writeError } = await withSuppressedNativeOutput();
+  const assistantChunks = [];
+  const micChunks = [];
+  const modelInputChunks = [];
   try {
-    const { ask, writeResult } = await withSuppressedStdout();
     const auth = resolveAuth();
     const message = await getMessage(opts.message);
     const transcript = await ask(message, {
       voice: opts.voice,
       timeout: Number.parseInt(opts.timeout, 10),
       ack: opts.ack ?? false,
-      auth
+      auth,
+      onAssistantAudio: opts.debugAudioDir ? (pcm16) => assistantChunks.push(Buffer.from(pcm16)) : void 0,
+      onMicAudio: opts.debugAudioDir ? (pcm16) => micChunks.push(Buffer.from(pcm16)) : void 0,
+      onAudioFrameSent: opts.debugAudioDir ? (pcm16) => modelInputChunks.push(Buffer.from(pcm16)) : void 0
     });
+    if (opts.debugAudioDir) {
+      const files = writeDebugAudio(
+        opts.debugAudioDir,
+        assistantChunks,
+        micChunks,
+        modelInputChunks
+      );
+      writeError(
+        `debug audio written:
+${files.assistantFile}
+${files.micFile}
+${files.modelInputFile}`
+      );
+    }
     writeResult(transcript);
     process.exit(0);
   } catch (err) {
-    process.stderr.write(`${err instanceof Error ? err.message : err}
-`);
+    if (opts.debugAudioDir) {
+      try {
+        const files = writeDebugAudio(
+          opts.debugAudioDir,
+          assistantChunks,
+          micChunks,
+          modelInputChunks
+        );
+        writeError(
+          `debug audio written:
+${files.assistantFile}
+${files.micFile}
+${files.modelInputFile}`
+        );
+      } catch {
+      }
+    }
+    writeError(`${err instanceof Error ? err.message : err}`);
     process.exit(1);
   }
 });
 program.command("say").description("Speak a message without listening for a response").option("-m, --message <text>", "Text message to speak").option("--voice <name>", "OpenAI voice", defaultVoice).action(async (opts) => {
+  const { say, writeError } = await withSuppressedNativeOutput();
   try {
-    const { say } = await withSuppressedStdout();
     const auth = resolveAuth();
     const message = await getMessage(opts.message);
     await say(message, { voice: opts.voice, auth });
     process.exit(0);
   } catch (err) {
-    process.stderr.write(`${err instanceof Error ? err.message : err}
-`);
+    writeError(`${err instanceof Error ? err.message : err}`);
     process.exit(1);
   }
 });

package/dist/index.d.ts CHANGED Viewed

@@ -1,16 +1,3 @@
-type AudioPlayer = {
-    write(pcm16: Buffer): boolean;
-    start(): void;
-    drain(): Promise<void>;
-    close(): void;
-};
-type AudioRecorder = {
-    onData(cb: (pcm16: Buffer) => void): void;
-    start(): void;
-    stop(): void;
-    close(): void;
-};
 type AuthConfig = {
     apiKey: string;
     baseUrl?: string;
@@ -23,15 +10,18 @@ type AskOptions = {
     timeout?: number;
     ack?: boolean;
     auth?: AuthConfig;
-    createPlayer?: () => AudioPlayer;
-    createRecorder?: () => AudioRecorder;
+    createPlayer?: unknown;
+    createRecorder?: unknown;
+    onAudioFrameSent?: (pcm16: Buffer) => void;
+    onAssistantAudio?: (pcm16: Buffer) => void;
+    onMicAudio?: (pcm16: Buffer) => void;
 };
 declare function ask(message: string, options?: AskOptions): Promise<string>;
 type SayOptions = {
     voice?: string;
     auth?: AuthConfig;
-    createPlayer?: () => AudioPlayer;
+    createPlayer?: unknown;
 };
 declare function say(message: string, options?: SayOptions): Promise<void>;

package/dist/index.js CHANGED Viewed

@@ -1,85 +1,5 @@
-// src/audio.ts
-import { AudioIO, SampleFormat16Bit } from "naudiodon2";
-// src/types.ts
-var SAMPLE_RATE = 24e3;
-var CHANNELS = 1;
-var VOICES = [
-  "alloy",
-  "ash",
-  "ballad",
-  "coral",
-  "echo",
-  "fable",
-  "nova",
-  "onyx",
-  "sage",
-  "shimmer",
-  "verse"
-];
-var DEFAULT_VOICE = "ash";
-// src/audio.ts
-function createAudioPlayer() {
-  const stream = AudioIO({
-    outOptions: {
-      channelCount: CHANNELS,
-      sampleFormat: SampleFormat16Bit,
-      sampleRate: SAMPLE_RATE,
-      closeOnError: true
-    }
-  });
-  let closed = false;
-  return {
-    write(pcm16) {
-      return stream.write(pcm16);
-    },
-    start() {
-      stream.start();
-    },
-    drain() {
-      if (closed) return Promise.resolve();
-      closed = true;
-      return new Promise((resolve) => {
-        stream.quit(() => resolve());
-      });
-    },
-    close() {
-      if (closed) return;
-      closed = true;
-      stream.quit();
-    }
-  };
-}
-function createAudioRecorder() {
-  const stream = AudioIO({
-    inOptions: {
-      channelCount: CHANNELS,
-      sampleFormat: SampleFormat16Bit,
-      sampleRate: SAMPLE_RATE,
-      closeOnError: true
-    }
-  });
-  let stopped = false;
-  return {
-    onData(cb) {
-      stream.on("data", cb);
-    },
-    start() {
-      stream.start();
-    },
-    stop() {
-      if (stopped) return;
-      stopped = true;
-      stream.quit();
-    },
-    close() {
-      if (stopped) return;
-      stopped = true;
-      stream.quit();
-    }
-  };
-}
+// src/ask.ts
+import { createRequire } from "module";
 // src/realtime.ts
 import { OpenAIRealtimeWS } from "openai/beta/realtime/ws";
@@ -101,7 +21,7 @@ function createRealtimeSession(options) {
   let rt;
   let responseCount = 0;
   function configureSession() {
-    const turnDetection = options.mode === "say" ? null : {
+    const turnDetection = options.mode === "say" ? void 0 : {
       type: "semantic_vad",
       eagerness: "medium",
       create_response: options.ack,
@@ -190,84 +110,216 @@ ${text}`
   };
 }
+// src/types.ts
+var SAMPLE_RATE = 24e3;
+var VOICES = [
+  "alloy",
+  "ash",
+  "ballad",
+  "coral",
+  "echo",
+  "fable",
+  "nova",
+  "onyx",
+  "sage",
+  "shimmer",
+  "verse"
+];
+var DEFAULT_VOICE = "ash";
 // src/ask.ts
+var require2 = createRequire(import.meta.url);
 async function ask(message, options = {}) {
   const {
     voice = DEFAULT_VOICE,
     timeout = 30,
     ack = false,
     auth,
-    createPlayer = createAudioPlayer,
-    createRecorder = createAudioRecorder
+    onAudioFrameSent,
+    onAssistantAudio,
+    onMicAudio
   } = options;
-  const player = createPlayer();
-  player.start();
+  const { AudioEngine } = require2("agent-voice-audio");
+  const streamDelayMs = Number.parseInt(
+    process.env.AGENT_VOICE_AEC_STREAM_DELAY_MS ?? "30",
+    10
+  );
+  const engine = new AudioEngine({
+    sampleRate: SAMPLE_RATE,
+    channels: 1,
+    enableAec: true,
+    streamDelayMs
+  });
+  engine.start();
+  const debug = process.env.AGENT_VOICE_DEBUG_ASK_EVENTS === "1";
+  const startMs = Date.now();
+  function logEvent(event, detail) {
+    if (!debug) return;
+    const elapsed = Date.now() - startMs;
+    const suffix = detail ? ` ${detail}` : "";
+    process.stderr.write(`[ask ${elapsed}ms] ${event}${suffix}
+`);
+  }
+  logEvent("start");
   return new Promise((resolve, reject) => {
-    let recorder = null;
     let transcript = "";
     let timeoutTimer = null;
+    let responseStartTimer = null;
+    let transcriptTimer = null;
+    let capturePollTimer = null;
     let speechDetected = false;
+    let initialResponseDone = false;
+    let heardAssistantAudio = false;
+    let lastAssistantAudioAt = 0;
     let cleaned = false;
-    let resolved = false;
+    let settled = false;
     async function cleanup() {
       if (cleaned) return;
       cleaned = true;
+      logEvent("cleanup:start");
       if (timeoutTimer) clearTimeout(timeoutTimer);
-      recorder?.stop();
-      recorder?.close();
-      await player.drain();
+      if (responseStartTimer) clearTimeout(responseStartTimer);
+      if (transcriptTimer) clearTimeout(transcriptTimer);
+      if (capturePollTimer) clearInterval(capturePollTimer);
+      try {
+        engine.stop();
+        engine.close();
+      } catch {
+      }
       session.close();
+      logEvent("cleanup:done");
+    }
+    function resolveOnce(value) {
+      if (settled) return;
+      settled = true;
+      cleanup().then(() => resolve(value));
     }
-    function finish() {
-      if (resolved) return;
-      resolved = true;
-      cleanup().then(() => resolve(transcript));
+    function rejectOnce(error) {
+      if (settled) return;
+      settled = true;
+      cleanup().then(() => reject(error));
     }
+    capturePollTimer = setInterval(() => {
+      if (settled) return;
+      let rawFrames = [];
+      let processedFrames = [];
+      try {
+        rawFrames = engine.readRawCapture(64);
+        processedFrames = engine.readProcessedCapture(64);
+      } catch (err) {
+        rejectOnce(
+          new Error(
+            `audio engine capture read failed: ${err instanceof Error ? err.message : String(err)}`
+          )
+        );
+        return;
+      }
+      for (const frame of rawFrames) onMicAudio?.(frame);
+      if (!heardAssistantAudio) return;
+      for (const frame of processedFrames) {
+        onAudioFrameSent?.(frame);
+        session.sendAudio(frame);
+      }
+    }, 10);
     const session = createRealtimeSession({
       voice,
       mode: "default",
       ack,
       auth,
       onAudioDelta(pcm16) {
-        player.write(pcm16);
+        logEvent("realtime:audio_delta", `bytes=${pcm16.length}`);
+        heardAssistantAudio = true;
+        lastAssistantAudioAt = Date.now();
+        onAssistantAudio?.(pcm16);
+        engine.play(pcm16);
       },
       onTranscript(text) {
+        const echoGuardMs = Number.parseInt(
+          process.env.AGENT_VOICE_ECHO_GUARD_MS ?? "1500",
+          10
+        );
+        const sinceAssistantMs = Date.now() - lastAssistantAudioAt;
+        if (heardAssistantAudio && sinceAssistantMs < echoGuardMs) {
+          logEvent(
+            "realtime:transcript_ignored_echo_guard",
+            `since_assistant_ms=${sinceAssistantMs} text="${text}"`
+          );
+          return;
+        }
+        logEvent("realtime:transcript", `text="${text}"`);
+        if (transcriptTimer) {
+          clearTimeout(transcriptTimer);
+          transcriptTimer = null;
+        }
         transcript = text;
-        if (!ack) finish();
+        if (!ack) resolveOnce(transcript);
       },
       onSpeechStarted() {
+        logEvent("realtime:speech_started");
         speechDetected = true;
         if (timeoutTimer) {
           clearTimeout(timeoutTimer);
           timeoutTimer = null;
         }
+        if (transcriptTimer) clearTimeout(transcriptTimer);
+        transcriptTimer = setTimeout(() => {
+          logEvent("timeout:no_transcript_after_speech");
+          rejectOnce(
+            new Error(
+              `No transcript received within ${timeout}s after speech started`
+            )
+          );
+        }, timeout * 1e3);
+        if (!initialResponseDone && heardAssistantAudio) {
+          try {
+            engine.play(Buffer.alloc(0));
+          } catch {
+          }
+        }
       },
       onInitialResponseDone() {
-        setTimeout(() => {
-          recorder = createRecorder();
-          recorder.onData((pcm16) => {
-            session.sendAudio(pcm16);
-          });
-          recorder.start();
-        }, 500);
+        logEvent("realtime:initial_response_done");
+        initialResponseDone = true;
         timeoutTimer = setTimeout(() => {
           if (!speechDetected) {
-            cleanup();
-            reject(new Error(`No speech detected within ${timeout}s timeout`));
+            logEvent("timeout:no_speech");
+            rejectOnce(
+              new Error(`No speech detected within ${timeout}s timeout`)
+            );
           }
         }, timeout * 1e3);
       },
       onDone() {
-        if (ack) finish();
+        logEvent("realtime:done");
+        if (ack) resolveOnce(transcript);
       },
-      async onError(error) {
-        await cleanup();
-        reject(new Error(error));
+      onError(error) {
+        logEvent("realtime:error", error);
+        rejectOnce(new Error(error));
       }
     });
-    session.connect().then(() => {
-      session.sendMessage(message);
-    }, reject);
+    session.connect().then(
+      () => {
+        logEvent("realtime:connected");
+        logEvent("realtime:send_message");
+        session.sendMessage(message);
+        responseStartTimer = setTimeout(() => {
+          if (!heardAssistantAudio) {
+            logEvent("timeout:no_assistant_audio");
+            rejectOnce(
+              new Error("No assistant audio received after sending message")
+            );
+          }
+        }, 1e4);
+      },
+      (err) => {
+        logEvent(
+          "realtime:connect_error",
+          err instanceof Error ? err.message : String(err)
+        );
+        rejectOnce(err instanceof Error ? err : new Error(String(err)));
+      }
+    );
   });
 }
@@ -302,19 +354,27 @@ function resolveVoice() {
 }
 // src/say.ts
+import { createRequire as createRequire2 } from "module";
+var require3 = createRequire2(import.meta.url);
 async function say(message, options = {}) {
-  const {
-    voice = DEFAULT_VOICE,
-    auth,
-    createPlayer = createAudioPlayer
-  } = options;
-  const player = createPlayer();
-  player.start();
+  const { voice = DEFAULT_VOICE, auth } = options;
+  const { AudioEngine } = require3("agent-voice-audio");
+  const engine = new AudioEngine({
+    sampleRate: SAMPLE_RATE,
+    channels: 1,
+    enableAec: false
+  });
+  engine.start();
   return new Promise((resolve, reject) => {
     let cleaned = false;
     function cleanup() {
       if (cleaned) return;
       cleaned = true;
+      try {
+        engine.stop();
+        engine.close();
+      } catch {
+      }
       session.close();
     }
     const session = createRealtimeSession({
@@ -323,25 +383,19 @@ async function say(message, options = {}) {
       ack: false,
       auth,
       onAudioDelta(pcm16) {
-        player.write(pcm16);
+        engine.play(pcm16);
       },
       onTranscript() {
       },
       onSpeechStarted() {
       },
-      async onInitialResponseDone() {
-        try {
-          await player.drain();
-        } catch {
-          player.close();
-        }
+      onInitialResponseDone() {
         cleanup();
         resolve();
       },
       onDone() {
       },
       onError(error) {
-        player.close();
         cleanup();
         reject(new Error(error));
       }

package/dist/{say-HPM3WIE2.js → say-W56HCNK4.js} RENAMED Viewed

@@ -1,26 +1,34 @@
 #!/usr/bin/env node
 import {
-  createAudioPlayer,
   createRealtimeSession
-} from "./chunk-EBYXFYS5.js";
+} from "./chunk-VV2VNOC4.js";
 import {
-  DEFAULT_VOICE
-} from "./chunk-D3AGL5JD.js";
+  DEFAULT_VOICE,
+  SAMPLE_RATE
+} from "./chunk-AHLLYIEW.js";
 // src/say.ts
+import { createRequire } from "module";
+var require2 = createRequire(import.meta.url);
 async function say(message, options = {}) {
-  const {
-    voice = DEFAULT_VOICE,
-    auth,
-    createPlayer = createAudioPlayer
-  } = options;
-  const player = createPlayer();
-  player.start();
+  const { voice = DEFAULT_VOICE, auth } = options;
+  const { AudioEngine } = require2("agent-voice-audio");
+  const engine = new AudioEngine({
+    sampleRate: SAMPLE_RATE,
+    channels: 1,
+    enableAec: false
+  });
+  engine.start();
   return new Promise((resolve, reject) => {
     let cleaned = false;
     function cleanup() {
       if (cleaned) return;
       cleaned = true;
+      try {
+        engine.stop();
+        engine.close();
+      } catch {
+      }
       session.close();
     }
     const session = createRealtimeSession({
@@ -29,25 +37,19 @@ async function say(message, options = {}) {
       ack: false,
       auth,
       onAudioDelta(pcm16) {
-        player.write(pcm16);
+        engine.play(pcm16);
       },
       onTranscript() {
       },
       onSpeechStarted() {
       },
-      async onInitialResponseDone() {
-        try {
-          await player.drain();
-        } catch {
-          player.close();
-        }
+      onInitialResponseDone() {
         cleanup();
         resolve();
       },
       onDone() {
       },
       onError(error) {
-        player.close();
         cleanup();
         reject(new Error(error));
       }

package/package.json CHANGED Viewed

@@ -1,6 +1,6 @@
 {
   "name": "agent-voice",
-  "version": "0.1.2",
+  "version": "0.2.0",
   "description": "CLI for AI agents to interact with humans via voice",
   "type": "module",
   "main": "./dist/index.js",
@@ -18,32 +18,26 @@
     "dist"
   ],
   "dependencies": {
+    "agent-voice-audio": "^0.2.0",
     "@inquirer/prompts": "^8.2.0",
     "commander": "^13.1.0",
-    "naudiodon2": "^2.1.0",
     "openai": "^4.96.0",
     "ws": "^8.18.0"
   },
   "devDependencies": {
-    "@biomejs/biome": "^1.9.4",
-    "@changesets/cli": "^2.29.8",
     "@types/node": "^22.12.0",
     "@types/ws": "^8.5.14",
-    "dotenv-cli": "^11.0.0",
-    "lefthook": "^2.1.0",
     "tsup": "^8.3.6",
     "tsx": "^4.19.2",
     "typescript": "^5.7.3",
     "vitest": "^4.0.18"
   },
   "scripts": {
-    "dev": "dotenv -e .env.local -- tsx src/cli.ts",
-    "agent-voice": "dotenv -e .env.local -- tsx src/cli.ts",
+    "dev": "tsx src/cli.ts",
     "build": "tsup",
     "check": "biome check --write .",
     "typecheck": "tsc --noEmit",
-    "test": "dotenv -e .env.local -- vitest run",
-    "test:watch": "dotenv -e .env.local -- vitest",
-    "release": "pnpm build && changeset publish"
+    "test": "vitest run",
+    "test:watch": "vitest"
   }
 }

package/dist/ask-NW4PBKFP.js DELETED Viewed

@@ -1,93 +0,0 @@
-#!/usr/bin/env node
-import {
-  createAudioPlayer,
-  createAudioRecorder,
-  createRealtimeSession
-} from "./chunk-EBYXFYS5.js";
-import {
-  DEFAULT_VOICE
-} from "./chunk-D3AGL5JD.js";
-// src/ask.ts
-async function ask(message, options = {}) {
-  const {
-    voice = DEFAULT_VOICE,
-    timeout = 30,
-    ack = false,
-    auth,
-    createPlayer = createAudioPlayer,
-    createRecorder = createAudioRecorder
-  } = options;
-  const player = createPlayer();
-  player.start();
-  return new Promise((resolve, reject) => {
-    let recorder = null;
-    let transcript = "";
-    let timeoutTimer = null;
-    let speechDetected = false;
-    let cleaned = false;
-    let resolved = false;
-    async function cleanup() {
-      if (cleaned) return;
-      cleaned = true;
-      if (timeoutTimer) clearTimeout(timeoutTimer);
-      recorder?.stop();
-      recorder?.close();
-      await player.drain();
-      session.close();
-    }
-    function finish() {
-      if (resolved) return;
-      resolved = true;
-      cleanup().then(() => resolve(transcript));
-    }
-    const session = createRealtimeSession({
-      voice,
-      mode: "default",
-      ack,
-      auth,
-      onAudioDelta(pcm16) {
-        player.write(pcm16);
-      },
-      onTranscript(text) {
-        transcript = text;
-        if (!ack) finish();
-      },
-      onSpeechStarted() {
-        speechDetected = true;
-        if (timeoutTimer) {
-          clearTimeout(timeoutTimer);
-          timeoutTimer = null;
-        }
-      },
-      onInitialResponseDone() {
-        setTimeout(() => {
-          recorder = createRecorder();
-          recorder.onData((pcm16) => {
-            session.sendAudio(pcm16);
-          });
-          recorder.start();
-        }, 500);
-        timeoutTimer = setTimeout(() => {
-          if (!speechDetected) {
-            cleanup();
-            reject(new Error(`No speech detected within ${timeout}s timeout`));
-          }
-        }, timeout * 1e3);
-      },
-      onDone() {
-        if (ack) finish();
-      },
-      async onError(error) {
-        await cleanup();
-        reject(new Error(error));
-      }
-    });
-    session.connect().then(() => {
-      session.sendMessage(message);
-    }, reject);
-  });
-}
-export {
-  ask
-};

package/dist/auth-42XIU3B7.js DELETED Viewed

@@ -1,37 +0,0 @@
-#!/usr/bin/env node
-import {
-  writeAuthConfig
-} from "./chunk-7ERYR6ZY.js";
-import "./chunk-D3AGL5JD.js";
-// src/auth.ts
-import { input, password } from "@inquirer/prompts";
-import OpenAI from "openai";
-var DEFAULT_BASE_URL = "https://api.openai.com/v1";
-async function verifyAuth(apiKey, baseURL) {
-  const client = new OpenAI({ apiKey, baseURL });
-  await client.models.list();
-}
-async function auth() {
-  const baseUrl = await input({
-    message: "Base URL",
-    default: DEFAULT_BASE_URL
-  });
-  const apiKey = await password({
-    message: "API key"
-  });
-  if (!apiKey) {
-    throw new Error("API key is required.");
-  }
-  process.stderr.write("Verifying...\n");
-  await verifyAuth(apiKey, baseUrl);
-  const config = { apiKey };
-  if (baseUrl !== DEFAULT_BASE_URL) {
-    config.baseUrl = baseUrl;
-  }
-  writeAuthConfig(config);
-  process.stderr.write("Auth config saved to ~/.agent-voice/config.json\n");
-}
-export {
-  auth
-};