npm - agent-voice - Versions diffs - 0.2.2 → 0.2.4 - Mend

agent-voice 0.2.2 → 0.2.4

This diff shows the changes between publicly available versions of a package that have been released to one of the supported registries. It is provided for informational purposes only and reflects the package versions as they appear in their respective public registries.

Files changed (6) hide show

package/dist/{ask-OIE6HL2H.js → ask-KM3JPI36.js} +26 -8
package/dist/cli.js +40 -4
package/dist/index.d.ts +1 -0
package/dist/index.js +61 -11
package/dist/{say-ZVF6EX52.js → say-OEQQFOCC.js} +35 -3
package/package.json +1 -1

package/dist/{ask-OIE6HL2H.js → ask-KM3JPI36.js} RENAMED Viewed

@@ -127,7 +127,19 @@ async function ask(message, options = {}) {
       if (!heardAssistantAudio) return;
       for (const frame of processedFrames) {
         const rms = pcm16Rms(frame);
-        const minSpeechRms = readEnvInt("AGENT_VOICE_MIN_SPEECH_RMS", 550);
+        const configuredMinSpeechRms = readEnvInt(
+          "AGENT_VOICE_MIN_SPEECH_RMS",
+          220
+        );
+        const relaxAfterMs = readEnvInt(
+          "AGENT_VOICE_MIN_SPEECH_RMS_RELAX_AFTER_MS",
+          500
+        );
+        const relaxedMinSpeechRms = readEnvInt(
+          "AGENT_VOICE_MIN_SPEECH_RMS_RELAXED",
+          120
+        );
+        const minSpeechRms = speechDetected && speechStartedAtMs > 0 && Date.now() - speechStartedAtMs >= relaxAfterMs ? relaxedMinSpeechRms : configuredMinSpeechRms;
         if (rms >= minSpeechRms) {
           nearEndEvidenceSeen = true;
           nearEndEvidenceAtMs = Date.now();
@@ -170,19 +182,25 @@ async function ask(message, options = {}) {
         logEvent("realtime:transcript", `text="${text}"`);
         trace("realtime:transcript", { text });
         if (speechDetected) {
-          const evidenceWindowMs = readEnvInt(
-            "AGENT_VOICE_SPEECH_EVIDENCE_WINDOW_MS",
-            1200
+          const evidencePreRollMs = readEnvInt(
+            "AGENT_VOICE_SPEECH_EVIDENCE_PREROLL_MS",
+            200
+          );
+          const evidencePostRollMs = readEnvInt(
+            "AGENT_VOICE_SPEECH_EVIDENCE_POSTROLL_MS",
+            1500
           );
-          const evidenceAgeMs = nearEndEvidenceSeen ? Math.abs(nearEndEvidenceAtMs - speechStartedAtMs) : Number.POSITIVE_INFINITY;
-          if (!nearEndEvidenceSeen || evidenceAgeMs > evidenceWindowMs) {
+          const evidenceEarliestMs = speechStartedAtMs - evidencePreRollMs;
+          const evidenceLatestMs = speechStartedAtMs + evidencePostRollMs;
+          const hasTimelyNearEndEvidence = nearEndEvidenceSeen && nearEndEvidenceAtMs >= evidenceEarliestMs && nearEndEvidenceAtMs <= evidenceLatestMs;
+          if (!hasTimelyNearEndEvidence) {
             trace("realtime:transcript_ignored_no_near_end_evidence", {
               text,
               speechStartedAtMs,
               nearEndEvidenceSeen,
               nearEndEvidenceAtMs,
-              evidenceAgeMs,
-              evidenceWindowMs
+              evidenceEarliestMs,
+              evidenceLatestMs
             });
             return;
           }

package/dist/cli.js CHANGED Viewed

@@ -28,8 +28,8 @@ async function withSuppressedNativeOutput() {
   openSync("/dev/null", "w");
   closeSync(2);
   openSync("/dev/null", "w");
-  const { ask } = await import("./ask-OIE6HL2H.js");
-  const { say } = await import("./say-ZVF6EX52.js");
+  const { ask } = await import("./ask-KM3JPI36.js");
+  const { say } = await import("./say-OEQQFOCC.js");
   function writeResult(text) {
     writeSync(savedStdout, `${text}
 `);
@@ -187,14 +187,50 @@ ${files.modelInputFile}`
     process.exit(1);
   }
 });
-program.command("say").description("Speak a message without listening for a response").option("-m, --message <text>", "Text message to speak").option("--voice <name>", "OpenAI voice", defaultVoice).action(async (opts) => {
+program.command("say").description("Speak a message without listening for a response").option("-m, --message <text>", "Text message to speak").option("--voice <name>", "OpenAI voice", defaultVoice).option(
+  "--debug-audio-dir <dir>",
+  "Write say audio debug WAV to this directory"
+).action(async (opts) => {
   const { say, writeError } = await withSuppressedNativeOutput();
+  const assistantChunks = [];
   try {
     const auth = resolveAuth();
     const message = await getMessage(opts.message);
-    await say(message, { voice: opts.voice, auth });
+    await say(message, {
+      voice: opts.voice,
+      auth,
+      onAssistantAudio: opts.debugAudioDir ? (pcm16) => assistantChunks.push(Buffer.from(pcm16)) : void 0
+    });
+    if (opts.debugAudioDir) {
+      mkdirSync(opts.debugAudioDir, { recursive: true });
+      const stamp = (/* @__PURE__ */ new Date()).toISOString().replace(/[:.]/g, "-");
+      const file = join(
+        opts.debugAudioDir,
+        `say-${stamp}-assistant-output.wav`
+      );
+      writeFileSync(file, createWavBuffer(Buffer.concat(assistantChunks)));
+      writeError(`debug audio written:
+${file}`);
+    }
     process.exit(0);
   } catch (err) {
+    if (opts.debugAudioDir && assistantChunks.length > 0) {
+      try {
+        mkdirSync(opts.debugAudioDir, { recursive: true });
+        const stamp = (/* @__PURE__ */ new Date()).toISOString().replace(/[:.]/g, "-");
+        const file = join(
+          opts.debugAudioDir,
+          `say-${stamp}-assistant-output.wav`
+        );
+        writeFileSync(
+          file,
+          createWavBuffer(Buffer.concat(assistantChunks))
+        );
+        writeError(`debug audio written:
+${file}`);
+      } catch {
+      }
+    }
     writeError(`${err instanceof Error ? err.message : err}`);
     process.exit(1);
   }

package/dist/index.d.ts CHANGED Viewed

@@ -91,6 +91,7 @@ type SayOptions = {
         enableAec?: boolean;
         streamDelayMs?: number;
     }) => RustAudioEngine;
+    onAssistantAudio?: (pcm16: Buffer) => void;
     onTrace?: (event: {
         atMs: number;
         event: string;

package/dist/index.js CHANGED Viewed

@@ -249,7 +249,19 @@ async function ask(message, options = {}) {
       if (!heardAssistantAudio) return;
       for (const frame of processedFrames) {
         const rms = pcm16Rms(frame);
-        const minSpeechRms = readEnvInt("AGENT_VOICE_MIN_SPEECH_RMS", 550);
+        const configuredMinSpeechRms = readEnvInt(
+          "AGENT_VOICE_MIN_SPEECH_RMS",
+          220
+        );
+        const relaxAfterMs = readEnvInt(
+          "AGENT_VOICE_MIN_SPEECH_RMS_RELAX_AFTER_MS",
+          500
+        );
+        const relaxedMinSpeechRms = readEnvInt(
+          "AGENT_VOICE_MIN_SPEECH_RMS_RELAXED",
+          120
+        );
+        const minSpeechRms = speechDetected && speechStartedAtMs > 0 && Date.now() - speechStartedAtMs >= relaxAfterMs ? relaxedMinSpeechRms : configuredMinSpeechRms;
         if (rms >= minSpeechRms) {
           nearEndEvidenceSeen = true;
           nearEndEvidenceAtMs = Date.now();
@@ -292,19 +304,25 @@ async function ask(message, options = {}) {
         logEvent("realtime:transcript", `text="${text}"`);
         trace("realtime:transcript", { text });
         if (speechDetected) {
-          const evidenceWindowMs = readEnvInt(
-            "AGENT_VOICE_SPEECH_EVIDENCE_WINDOW_MS",
-            1200
+          const evidencePreRollMs = readEnvInt(
+            "AGENT_VOICE_SPEECH_EVIDENCE_PREROLL_MS",
+            200
+          );
+          const evidencePostRollMs = readEnvInt(
+            "AGENT_VOICE_SPEECH_EVIDENCE_POSTROLL_MS",
+            1500
           );
-          const evidenceAgeMs = nearEndEvidenceSeen ? Math.abs(nearEndEvidenceAtMs - speechStartedAtMs) : Number.POSITIVE_INFINITY;
-          if (!nearEndEvidenceSeen || evidenceAgeMs > evidenceWindowMs) {
+          const evidenceEarliestMs = speechStartedAtMs - evidencePreRollMs;
+          const evidenceLatestMs = speechStartedAtMs + evidencePostRollMs;
+          const hasTimelyNearEndEvidence = nearEndEvidenceSeen && nearEndEvidenceAtMs >= evidenceEarliestMs && nearEndEvidenceAtMs <= evidenceLatestMs;
+          if (!hasTimelyNearEndEvidence) {
             trace("realtime:transcript_ignored_no_near_end_evidence", {
               text,
               speechStartedAtMs,
               nearEndEvidenceSeen,
               nearEndEvidenceAtMs,
-              evidenceAgeMs,
-              evidenceWindowMs
+              evidenceEarliestMs,
+              evidenceLatestMs
             });
             return;
           }
@@ -439,6 +457,7 @@ async function say(message, options = {}) {
     auth,
     createSession,
     createAudioEngine,
+    onAssistantAudio,
     onTrace
   } = options;
   const { AudioEngine } = require3("agent-voice-audio");
@@ -460,6 +479,9 @@ async function say(message, options = {}) {
     let completionTailTimer = null;
     let drainPollTimer = null;
     let drainDeadlineTimer = null;
+    let playoutDeadlineTimer = null;
+    let firstAudioAtMs = 0;
+    let totalReceivedSamples = 0;
     function cleanup() {
       if (cleaned) return;
       cleaned = true;
@@ -467,6 +489,7 @@ async function say(message, options = {}) {
       if (completionTailTimer) clearTimeout(completionTailTimer);
       if (drainPollTimer) clearInterval(drainPollTimer);
       if (drainDeadlineTimer) clearTimeout(drainDeadlineTimer);
+      if (playoutDeadlineTimer) clearTimeout(playoutDeadlineTimer);
       try {
         engine.stop();
         engine.close();
@@ -487,6 +510,28 @@ async function say(message, options = {}) {
       cleanup();
       reject(error);
     }
+    function waitForWallClockPlayout() {
+      if (settled) return;
+      if (firstAudioAtMs <= 0 || totalReceivedSamples <= 0) {
+        resolveOnce();
+        return;
+      }
+      const expectedPlayoutMs = Math.ceil(
+        totalReceivedSamples / SAMPLE_RATE * 1e3
+      );
+      const playoutTailMs = 140;
+      const dueAtMs = firstAudioAtMs + expectedPlayoutMs + playoutTailMs;
+      const waitMs = Math.max(0, dueAtMs - Date.now());
+      trace("playout:wall_clock_wait", {
+        totalReceivedSamples,
+        expectedPlayoutMs,
+        playoutTailMs,
+        waitMs
+      });
+      playoutDeadlineTimer = setTimeout(() => {
+        resolveOnce();
+      }, waitMs);
+    }
     function waitForPlaybackDrain() {
       if (settled) return;
       if (!engine.getStats) {
@@ -520,7 +565,7 @@ async function say(message, options = {}) {
         if (pending <= 0) {
           zeroStreak += 1;
           if (zeroStreak >= 3) {
-            resolveOnce();
+            waitForWallClockPlayout();
           }
           return;
         }
@@ -529,12 +574,12 @@ async function say(message, options = {}) {
           trace("drain:no_progress_timeout", {
             pendingPlaybackSamples: pending
           });
-          resolveOnce();
+          waitForWallClockPlayout();
         }
       }, 20);
       drainDeadlineTimer = setTimeout(() => {
         trace("drain:deadline");
-        resolveOnce();
+        waitForWallClockPlayout();
       }, absoluteDeadlineMs);
     }
     function scheduleTailResolve(delayMs) {
@@ -551,7 +596,12 @@ async function say(message, options = {}) {
       ack: false,
       auth,
       onAudioDelta(pcm16) {
+        if (firstAudioAtMs <= 0) {
+          firstAudioAtMs = Date.now();
+        }
+        totalReceivedSamples += Math.floor(pcm16.length / 2);
         engine.play(pcm16);
+        onAssistantAudio?.(pcm16);
         trace("realtime:audio_delta", { bytes: pcm16.length });
       },
       onAudioDone() {

package/dist/{say-ZVF6EX52.js → say-OEQQFOCC.js} RENAMED Viewed

@@ -16,6 +16,7 @@ async function say(message, options = {}) {
     auth,
     createSession,
     createAudioEngine,
+    onAssistantAudio,
     onTrace
   } = options;
   const { AudioEngine } = require2("agent-voice-audio");
@@ -37,6 +38,9 @@ async function say(message, options = {}) {
     let completionTailTimer = null;
     let drainPollTimer = null;
     let drainDeadlineTimer = null;
+    let playoutDeadlineTimer = null;
+    let firstAudioAtMs = 0;
+    let totalReceivedSamples = 0;
     function cleanup() {
       if (cleaned) return;
       cleaned = true;
@@ -44,6 +48,7 @@ async function say(message, options = {}) {
       if (completionTailTimer) clearTimeout(completionTailTimer);
       if (drainPollTimer) clearInterval(drainPollTimer);
       if (drainDeadlineTimer) clearTimeout(drainDeadlineTimer);
+      if (playoutDeadlineTimer) clearTimeout(playoutDeadlineTimer);
       try {
         engine.stop();
         engine.close();
@@ -64,6 +69,28 @@ async function say(message, options = {}) {
       cleanup();
       reject(error);
     }
+    function waitForWallClockPlayout() {
+      if (settled) return;
+      if (firstAudioAtMs <= 0 || totalReceivedSamples <= 0) {
+        resolveOnce();
+        return;
+      }
+      const expectedPlayoutMs = Math.ceil(
+        totalReceivedSamples / SAMPLE_RATE * 1e3
+      );
+      const playoutTailMs = 140;
+      const dueAtMs = firstAudioAtMs + expectedPlayoutMs + playoutTailMs;
+      const waitMs = Math.max(0, dueAtMs - Date.now());
+      trace("playout:wall_clock_wait", {
+        totalReceivedSamples,
+        expectedPlayoutMs,
+        playoutTailMs,
+        waitMs
+      });
+      playoutDeadlineTimer = setTimeout(() => {
+        resolveOnce();
+      }, waitMs);
+    }
     function waitForPlaybackDrain() {
       if (settled) return;
       if (!engine.getStats) {
@@ -97,7 +124,7 @@ async function say(message, options = {}) {
         if (pending <= 0) {
           zeroStreak += 1;
           if (zeroStreak >= 3) {
-            resolveOnce();
+            waitForWallClockPlayout();
           }
           return;
         }
@@ -106,12 +133,12 @@ async function say(message, options = {}) {
           trace("drain:no_progress_timeout", {
             pendingPlaybackSamples: pending
           });
-          resolveOnce();
+          waitForWallClockPlayout();
         }
       }, 20);
       drainDeadlineTimer = setTimeout(() => {
         trace("drain:deadline");
-        resolveOnce();
+        waitForWallClockPlayout();
       }, absoluteDeadlineMs);
     }
     function scheduleTailResolve(delayMs) {
@@ -128,7 +155,12 @@ async function say(message, options = {}) {
       ack: false,
       auth,
       onAudioDelta(pcm16) {
+        if (firstAudioAtMs <= 0) {
+          firstAudioAtMs = Date.now();
+        }
+        totalReceivedSamples += Math.floor(pcm16.length / 2);
         engine.play(pcm16);
+        onAssistantAudio?.(pcm16);
         trace("realtime:audio_delta", { bytes: pcm16.length });
       },
       onAudioDone() {

package/package.json CHANGED Viewed

@@ -1,6 +1,6 @@
 {
   "name": "agent-voice",
-  "version": "0.2.2",
+  "version": "0.2.4",
   "description": "CLI for AI agents to interact with humans via voice",
   "type": "module",
   "main": "./dist/index.js",