npm - @absolutejs/voice - Versions diffs - 0.0.22-beta.580 → 0.0.22-beta.582 - Mend

@absolutejs/voice 0.0.22-beta.580 → 0.0.22-beta.582

This diff shows the changes between two publicly available versions of this package, as released to one of the supported registries. It is provided for informational purposes only and reflects the package contents exactly as they appear in their respective public registries.

Files changed (7)

package/dist/client/audioPlayer.d.ts +7 -0
package/dist/client/htmxBootstrap.js +25 -0
package/dist/client/index.js +25 -0
package/dist/core/types.d.ts +3 -0
package/dist/index.js +41 -11
package/dist/testing/index.js +66 -11
package/package.json +1 -1

package/dist/client/audioPlayer.d.ts CHANGED Viewed

@@ -22,9 +22,16 @@ type MinimalGainNode = {
         value: number;
     };
 };
+type MinimalAnalyserNode = {
+    connect?: (destination: unknown) => void;
+    disconnect?: () => void;
+    fftSize: number;
+    getByteTimeDomainData: (array: Uint8Array) => void;
+};
 type MinimalAudioContext = {
     baseLatency?: number;
     close: () => Promise<void>;
+    createAnalyser?: () => MinimalAnalyserNode;
     createBuffer: (numberOfChannels: number, length: number, sampleRate: number) => MinimalAudioBuffer;
     createBufferSource: () => MinimalAudioBufferSourceNode;
     createGain?: () => MinimalGainNode;

package/dist/client/htmxBootstrap.js CHANGED Viewed

@@ -1693,6 +1693,8 @@ var DEFAULT_PLAYBACK_RATE = 1;
 var MIN_PLAYBACK_RATE = 0.5;
 var MAX_PLAYBACK_RATE = 2;
 var STRETCH_BYPASS_EPSILON = 0.01;
+var ANALYSER_FFT_SIZE = 256;
+var PCM_BYTE_MIDPOINT = 128;
 var createInitialState3 = () => ({
   activeSourceCount: 0,
   error: null,
@@ -1753,6 +1755,8 @@ var createVoiceAudioPlayer = (source, options = {}) => {
   let state = createInitialState3();
   let audioContext = null;
   let outputNode = null;
+  let analyserNode = null;
+  let analyserBuffer = null;
   let volume = clampVolume(options.volume);
   let playbackRate = clampPlaybackRate(options.playbackRate);
   let stretcher = null;
@@ -1849,6 +1853,12 @@ var createVoiceAudioPlayer = (source, options = {}) => {
     if (audioContext.createGain) {
       outputNode = audioContext.createGain();
       outputNode.connect?.(audioContext.destination);
+      if (audioContext.createAnalyser) {
+        analyserNode = audioContext.createAnalyser();
+        analyserNode.fftSize = ANALYSER_FFT_SIZE;
+        analyserBuffer = new Uint8Array(analyserNode.fftSize);
+        outputNode.connect?.(analyserNode);
+      }
     }
     queueEndTime = audioContext.currentTime;
     return audioContext;
@@ -1973,6 +1983,9 @@ var createVoiceAudioPlayer = (source, options = {}) => {
       audioContext = null;
       outputNode?.disconnect?.();
       outputNode = null;
+      analyserNode?.disconnect?.();
+      analyserNode = null;
+      analyserBuffer = null;
       queueEndTime = 0;
       setState({
         activeSourceCount: 0,
@@ -1983,6 +1996,18 @@ var createVoiceAudioPlayer = (source, options = {}) => {
     get error() {
       return state.error;
     },
+    getOutputLevel: () => {
+      if (!analyserNode || !analyserBuffer) {
+        return 0;
+      }
+      analyserNode.getByteTimeDomainData(analyserBuffer);
+      let sumSquares = 0;
+      for (const sample of analyserBuffer) {
+        const centered = (sample - PCM_BYTE_MIDPOINT) / PCM_BYTE_MIDPOINT;
+        sumSquares += centered * centered;
+      }
+      return Math.sqrt(sumSquares / analyserBuffer.length);
+    },
     getSnapshot: () => state,
     interrupt: async () => {
       const startedAt = Date.now();

package/dist/client/index.js CHANGED Viewed

@@ -529,6 +529,8 @@ var DEFAULT_PLAYBACK_RATE = 1;
 var MIN_PLAYBACK_RATE = 0.5;
 var MAX_PLAYBACK_RATE = 2;
 var STRETCH_BYPASS_EPSILON = 0.01;
+var ANALYSER_FFT_SIZE = 256;
+var PCM_BYTE_MIDPOINT = 128;
 var createInitialState = () => ({
   activeSourceCount: 0,
   error: null,
@@ -589,6 +591,8 @@ var createVoiceAudioPlayer = (source, options = {}) => {
   let state = createInitialState();
   let audioContext = null;
   let outputNode = null;
+  let analyserNode = null;
+  let analyserBuffer = null;
   let volume = clampVolume(options.volume);
   let playbackRate = clampPlaybackRate(options.playbackRate);
   let stretcher = null;
@@ -685,6 +689,12 @@ var createVoiceAudioPlayer = (source, options = {}) => {
     if (audioContext.createGain) {
       outputNode = audioContext.createGain();
       outputNode.connect?.(audioContext.destination);
+      if (audioContext.createAnalyser) {
+        analyserNode = audioContext.createAnalyser();
+        analyserNode.fftSize = ANALYSER_FFT_SIZE;
+        analyserBuffer = new Uint8Array(analyserNode.fftSize);
+        outputNode.connect?.(analyserNode);
+      }
     }
     queueEndTime = audioContext.currentTime;
     return audioContext;
@@ -809,6 +819,9 @@ var createVoiceAudioPlayer = (source, options = {}) => {
       audioContext = null;
       outputNode?.disconnect?.();
       outputNode = null;
+      analyserNode?.disconnect?.();
+      analyserNode = null;
+      analyserBuffer = null;
       queueEndTime = 0;
       setState({
         activeSourceCount: 0,
@@ -819,6 +832,18 @@ var createVoiceAudioPlayer = (source, options = {}) => {
     get error() {
       return state.error;
     },
+    getOutputLevel: () => {
+      if (!analyserNode || !analyserBuffer) {
+        return 0;
+      }
+      analyserNode.getByteTimeDomainData(analyserBuffer);
+      let sumSquares = 0;
+      for (const sample of analyserBuffer) {
+        const centered = (sample - PCM_BYTE_MIDPOINT) / PCM_BYTE_MIDPOINT;
+        sumSquares += centered * centered;
+      }
+      return Math.sqrt(sumSquares / analyserBuffer.length);
+    },
     getSnapshot: () => state,
     interrupt: async () => {
       const startedAt = Date.now();

package/dist/core/types.d.ts CHANGED Viewed

@@ -1331,6 +1331,9 @@ export type VoiceAudioPlayerSource = {
 export type VoiceAudioPlayer = {
     close: () => Promise<void>;
     error: string | null;
+    /** Instantaneous RMS amplitude (0..1) of the assistant's audio output — for
+     *  driving a visualizer from the actual voice. 0 when idle / no analyser. */
+    getOutputLevel: () => number;
     getSnapshot: () => VoiceAudioPlayerState;
     activeSourceCount: number;
     isActive: boolean;

package/dist/index.js CHANGED Viewed

@@ -3936,6 +3936,8 @@ var createVoiceSession = (options) => {
   let activeAdapterGeneration = 0;
   let activeTTSTurnId;
   let assistantSpeechEndsAt = 0;
+  let lastAssistantAudioAt = 0;
+  let lastTtsSendAt = 0;
   let fillerTimer = null;
   let fillerActive = false;
   let fillerToken = 0;
@@ -4209,6 +4211,15 @@ var createVoiceSession = (options) => {
     });
     return result;
   };
+  let assistantAudioQueue = Promise.resolve();
+  const runAudioSerial = (operation) => {
+    const next = assistantAudioQueue.then(operation);
+    assistantAudioQueue = next.then(() => {
+      return;
+    }, () => {
+      return;
+    });
+  };
   const closeAdapter = async (reason) => {
     if (!sttSession) {
       return;
@@ -4377,6 +4388,7 @@ var createVoiceSession = (options) => {
       const chunkMs = normalizedChunk.byteLength / bytesPerSecond * 1000;
       assistantSpeechEndsAt = Math.max(assistantSpeechEndsAt, Date.now()) + chunkMs;
     }
+    lastAssistantAudioAt = Date.now();
     if (activeTTSTurnId) {
       await appendTurnLatencyStage({
         at: input.receivedAt,
@@ -4486,18 +4498,28 @@ var createVoiceSession = (options) => {
       session
     });
   };
-  const DRAIN_POLL_MS = 200;
+  const DRAIN_POLL_MS = 100;
   const DRAIN_TAIL_BUFFER_MS = 300;
-  const DRAIN_MAX_MS = 12000;
-  const drainAssistantSpeech = async () => {
+  const DRAIN_QUIET_MS = 600;
+  const DRAIN_RENDER_START_MS = 4000;
+  const DRAIN_MAX_MS = 20000;
+  const drainAssistantSpeech = async (renderPendingSince) => {
     const startedAt = Date.now();
+    const sleep3 = (delayMs) => new Promise((resolve) => {
+      setTimeout(resolve, delayMs);
+    });
     while (Date.now() - startedAt < DRAIN_MAX_MS) {
-      const remaining = assistantSpeechEndsAt + DRAIN_TAIL_BUFFER_MS - Date.now();
-      if (remaining <= 0)
+      const now = Date.now();
+      const renderStarted = lastAssistantAudioAt >= renderPendingSince || now - renderPendingSince >= DRAIN_RENDER_START_MS;
+      if (!renderStarted) {
+        await sleep3(DRAIN_POLL_MS);
+        continue;
+      }
+      const streamQuiet = now - lastAssistantAudioAt >= DRAIN_QUIET_MS;
+      const playbackDrained = assistantSpeechEndsAt + DRAIN_TAIL_BUFFER_MS <= now;
+      if (streamQuiet && playbackDrained)
         return;
-      await new Promise((resolve) => {
-        setTimeout(resolve, Math.min(remaining, DRAIN_POLL_MS));
-      });
+      await sleep3(DRAIN_POLL_MS);
     }
   };
   const completeInternal = async (result, input = {}) => {
@@ -4534,7 +4556,8 @@ var createVoiceSession = (options) => {
       return;
     }
     if (disposition === "completed") {
-      await drainAssistantSpeech();
+      await drainAssistantSpeech(lastTtsSendAt);
+      await assistantAudioQueue;
     }
     await appendTrace({
       payload: {
@@ -5204,7 +5227,10 @@ var createVoiceSession = (options) => {
     });
     if (options.realtime) {
       openedSession.on("audio", ({ chunk, format, receivedAt }) => {
-        runAdapterEvent("adapter.audio", async () => {
+        runAudioSerial(async () => {
+          if (activeAdapterGeneration !== generation) {
+            return;
+          }
           await sendAssistantAudio(chunk, {
             format,
             receivedAt
@@ -5233,7 +5259,7 @@ var createVoiceSession = (options) => {
       });
       ttsSession = openedSession;
       openedSession.on("audio", ({ chunk, format, receivedAt }) => {
-        runSerial("tts.audio", async () => {
+        runAudioSerial(async () => {
           if (ttsSession !== openedSession) {
             return;
           }
@@ -5361,6 +5387,7 @@ var createVoiceSession = (options) => {
         try {
           await ttsSession2.send(text);
           charsSent += text.length;
+          lastTtsSendAt = Date.now();
         } catch (error) {
           logger.warn("voice assistant audio send failed", {
             error: toError(error).message,
@@ -5643,6 +5670,7 @@ var createVoiceSession = (options) => {
             turnId: turn.id
           });
           await activeTTSSession.send(output.assistantText);
+          lastTtsSendAt = Date.now();
           if (options.costAccountant) {
             options.costAccountant.recordTTS({
               characters: output.assistantText.length
@@ -6067,10 +6095,12 @@ var createVoiceSession = (options) => {
         if (greetingTTSSession) {
           activeTTSTurnId = greetingTurnId;
           await greetingTTSSession.send(greetingText);
+          lastTtsSendAt = Date.now();
         } else if (options.realtime) {
           const greetingRealtimeSession = await ensureAdapter();
           activeTTSTurnId = greetingTurnId;
           await greetingRealtimeSession.send(greetingText);
+          lastTtsSendAt = Date.now();
         }
       } catch {}
     }

package/dist/testing/index.js CHANGED Viewed

@@ -1736,6 +1736,8 @@ var DEFAULT_PLAYBACK_RATE = 1;
 var MIN_PLAYBACK_RATE = 0.5;
 var MAX_PLAYBACK_RATE = 2;
 var STRETCH_BYPASS_EPSILON = 0.01;
+var ANALYSER_FFT_SIZE = 256;
+var PCM_BYTE_MIDPOINT = 128;
 var createInitialState = () => ({
   activeSourceCount: 0,
   error: null,
@@ -1796,6 +1798,8 @@ var createVoiceAudioPlayer = (source, options = {}) => {
   let state = createInitialState();
   let audioContext = null;
   let outputNode = null;
+  let analyserNode = null;
+  let analyserBuffer = null;
   let volume = clampVolume(options.volume);
   let playbackRate = clampPlaybackRate(options.playbackRate);
   let stretcher = null;
@@ -1892,6 +1896,12 @@ var createVoiceAudioPlayer = (source, options = {}) => {
     if (audioContext.createGain) {
       outputNode = audioContext.createGain();
       outputNode.connect?.(audioContext.destination);
+      if (audioContext.createAnalyser) {
+        analyserNode = audioContext.createAnalyser();
+        analyserNode.fftSize = ANALYSER_FFT_SIZE;
+        analyserBuffer = new Uint8Array(analyserNode.fftSize);
+        outputNode.connect?.(analyserNode);
+      }
     }
     queueEndTime = audioContext.currentTime;
     return audioContext;
@@ -2016,6 +2026,9 @@ var createVoiceAudioPlayer = (source, options = {}) => {
       audioContext = null;
       outputNode?.disconnect?.();
       outputNode = null;
+      analyserNode?.disconnect?.();
+      analyserNode = null;
+      analyserBuffer = null;
       queueEndTime = 0;
       setState({
         activeSourceCount: 0,
@@ -2026,6 +2039,18 @@ var createVoiceAudioPlayer = (source, options = {}) => {
     get error() {
       return state.error;
     },
+    getOutputLevel: () => {
+      if (!analyserNode || !analyserBuffer) {
+        return 0;
+      }
+      analyserNode.getByteTimeDomainData(analyserBuffer);
+      let sumSquares = 0;
+      for (const sample of analyserBuffer) {
+        const centered = (sample - PCM_BYTE_MIDPOINT) / PCM_BYTE_MIDPOINT;
+        sumSquares += centered * centered;
+      }
+      return Math.sqrt(sumSquares / analyserBuffer.length);
+    },
     getSnapshot: () => state,
     interrupt: async () => {
       const startedAt = Date.now();
@@ -6053,6 +6078,8 @@ var createVoiceSession = (options) => {
   let activeAdapterGeneration = 0;
   let activeTTSTurnId;
   let assistantSpeechEndsAt = 0;
+  let lastAssistantAudioAt = 0;
+  let lastTtsSendAt = 0;
   let fillerTimer = null;
   let fillerActive = false;
   let fillerToken = 0;
@@ -6326,6 +6353,15 @@ var createVoiceSession = (options) => {
     });
     return result;
   };
+  let assistantAudioQueue = Promise.resolve();
+  const runAudioSerial = (operation) => {
+    const next = assistantAudioQueue.then(operation);
+    assistantAudioQueue = next.then(() => {
+      return;
+    }, () => {
+      return;
+    });
+  };
   const closeAdapter = async (reason) => {
     if (!sttSession) {
       return;
@@ -6494,6 +6530,7 @@ var createVoiceSession = (options) => {
       const chunkMs = normalizedChunk.byteLength / bytesPerSecond * 1000;
       assistantSpeechEndsAt = Math.max(assistantSpeechEndsAt, Date.now()) + chunkMs;
     }
+    lastAssistantAudioAt = Date.now();
     if (activeTTSTurnId) {
       await appendTurnLatencyStage({
         at: input.receivedAt,
@@ -6603,18 +6640,28 @@ var createVoiceSession = (options) => {
       session
     });
   };
-  const DRAIN_POLL_MS = 200;
+  const DRAIN_POLL_MS = 100;
   const DRAIN_TAIL_BUFFER_MS = 300;
-  const DRAIN_MAX_MS = 12000;
-  const drainAssistantSpeech = async () => {
+  const DRAIN_QUIET_MS = 600;
+  const DRAIN_RENDER_START_MS = 4000;
+  const DRAIN_MAX_MS = 20000;
+  const drainAssistantSpeech = async (renderPendingSince) => {
     const startedAt = Date.now();
+    const sleep2 = (delayMs) => new Promise((resolve2) => {
+      setTimeout(resolve2, delayMs);
+    });
     while (Date.now() - startedAt < DRAIN_MAX_MS) {
-      const remaining = assistantSpeechEndsAt + DRAIN_TAIL_BUFFER_MS - Date.now();
-      if (remaining <= 0)
+      const now = Date.now();
+      const renderStarted = lastAssistantAudioAt >= renderPendingSince || now - renderPendingSince >= DRAIN_RENDER_START_MS;
+      if (!renderStarted) {
+        await sleep2(DRAIN_POLL_MS);
+        continue;
+      }
+      const streamQuiet = now - lastAssistantAudioAt >= DRAIN_QUIET_MS;
+      const playbackDrained = assistantSpeechEndsAt + DRAIN_TAIL_BUFFER_MS <= now;
+      if (streamQuiet && playbackDrained)
         return;
-      await new Promise((resolve2) => {
-        setTimeout(resolve2, Math.min(remaining, DRAIN_POLL_MS));
-      });
+      await sleep2(DRAIN_POLL_MS);
     }
   };
   const completeInternal = async (result, input = {}) => {
@@ -6651,7 +6698,8 @@ var createVoiceSession = (options) => {
       return;
     }
     if (disposition === "completed") {
-      await drainAssistantSpeech();
+      await drainAssistantSpeech(lastTtsSendAt);
+      await assistantAudioQueue;
     }
     await appendTrace({
       payload: {
@@ -7321,7 +7369,10 @@ var createVoiceSession = (options) => {
     });
     if (options.realtime) {
       openedSession.on("audio", ({ chunk, format, receivedAt }) => {
-        runAdapterEvent("adapter.audio", async () => {
+        runAudioSerial(async () => {
+          if (activeAdapterGeneration !== generation) {
+            return;
+          }
           await sendAssistantAudio(chunk, {
             format,
             receivedAt
@@ -7350,7 +7401,7 @@ var createVoiceSession = (options) => {
       });
       ttsSession = openedSession;
       openedSession.on("audio", ({ chunk, format, receivedAt }) => {
-        runSerial("tts.audio", async () => {
+        runAudioSerial(async () => {
           if (ttsSession !== openedSession) {
             return;
           }
@@ -7478,6 +7529,7 @@ var createVoiceSession = (options) => {
         try {
           await ttsSession2.send(text);
           charsSent += text.length;
+          lastTtsSendAt = Date.now();
         } catch (error) {
           logger.warn("voice assistant audio send failed", {
             error: toError(error).message,
@@ -7760,6 +7812,7 @@ var createVoiceSession = (options) => {
             turnId: turn.id
           });
           await activeTTSSession.send(output.assistantText);
+          lastTtsSendAt = Date.now();
           if (options.costAccountant) {
             options.costAccountant.recordTTS({
               characters: output.assistantText.length
@@ -8184,10 +8237,12 @@ var createVoiceSession = (options) => {
         if (greetingTTSSession) {
           activeTTSTurnId = greetingTurnId;
           await greetingTTSSession.send(greetingText);
+          lastTtsSendAt = Date.now();
         } else if (options.realtime) {
           const greetingRealtimeSession = await ensureAdapter();
           activeTTSTurnId = greetingTurnId;
           await greetingRealtimeSession.send(greetingText);
+          lastTtsSendAt = Date.now();
         }
       } catch {}
     }

package/package.json CHANGED Viewed

@@ -1,6 +1,6 @@
 {
   "name": "@absolutejs/voice",
-  "version": "0.0.22-beta.580",
+  "version": "0.0.22-beta.582",
   "description": "Voice primitives and Elysia plugin for AbsoluteJS",
   "repository": {
     "type": "git",