npm - @absolutejs/voice - Versions diffs - 0.0.22-beta.610 → 0.0.22-beta.611 - Mend

@absolutejs/voice 0.0.22-beta.610 → 0.0.22-beta.611

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Files changed (6)

package/dist/core/bargeInDetector.d.ts +41 -0
package/dist/core/types.d.ts +2 -0
package/dist/index.d.ts +2 -0
package/dist/index.js +87 -25
package/dist/testing/index.js +35 -25
package/package.json +1 -1

package/dist/core/bargeInDetector.d.ts ADDED

@@ -0,0 +1,41 @@
+import type { AudioFormat } from "./types";
+export type VoiceBargeInInput = {
+    /** The partial transcript that arrived while the assistant was speaking. */
+    partialText: string;
+    /** Word count of `partialText`. */
+    wordCount: number;
+    /** Whether the text matches a known backchannel cue (isBackchannelUtterance). */
+    isBackchannelByText: boolean;
+    /** The user's buffered PCM for this window (oldest→newest), if any. */
+    turnAudio?: ReadonlyArray<Uint8Array>;
+    turnAudioFormat?: AudioFormat;
+};
+export type VoiceBargeInVerdict = {
+    /** true = real interruption → cancel the assistant's TTS. false = keep talking. */
+    shouldCancel: boolean;
+    /** Diagnostic label, surfaced on the barge_in / barge_in_suppressed trace. */
+    reason?: string;
+};
+export type VoiceBargeInDetector = {
+    evaluate: (input: VoiceBargeInInput) => Promise<VoiceBargeInVerdict> | VoiceBargeInVerdict;
+};
+export type CreateAcousticBargeInDetectorOptions = {
+    /** Speech sustained this long (ms) is a real interruption regardless of text/energy. */
+    sustainedMs?: number;
+    /** RMS (0-1) at/above this is an emphatic onset ("Wait!") — cancel even if short. */
+    emphaticRms?: number;
+    /** Below this RMS (0-1) a short burst is incidental noise — keep talking. */
+    noiseFloorRms?: number;
+};
+/**
+ * A model-free acoustic backchannel-vs-barge-in classifier. Combines the user's
+ * speech duration + onset energy with the text backchannel signal:
+ *   - sustained speech            → real interruption (cancel)
+ *   - known cue word, stayed short → backchannel (keep talking)
+ *   - short but loud/sharp onset   → emphatic interruption like "Wait!" (cancel)
+ *   - short + quiet                → incidental noise (keep talking)
+ *   - short + moderate, real words → ambiguous, default to cancel (don't strand
+ *                                    a genuine short interruption)
+ * Runs in-process on raw arithmetic — no model, no sidecar.
+ */
+export declare const createAcousticBargeInDetector: (options?: CreateAcousticBargeInDetectorOptions) => VoiceBargeInDetector;

package/dist/core/types.d.ts CHANGED

@@ -802,6 +802,7 @@ export type VoicePluginConfig<TContext = unknown, TSession extends VoiceSessionR
     reconnect?: VoiceReconnectConfig;
     turnDetection?: VoiceTurnDetectionConfig;
     semanticTurnDetector?: import("./semanticTurn").VoiceSemanticTurnDetector;
+    bargeInDetector?: import("./bargeInDetector").VoiceBargeInDetector;
     bargeInMinPartialWords?: number;
     /**
      * When true, a pure listening cue ("mm-hm", "yeah", "right", "got it") spoken
@@ -951,6 +952,7 @@ export type CreateVoiceSessionOptions<TContext = unknown, TSession extends Voice
     };
     redact?: import("./redaction").VoiceTranscriptRedactor;
     semanticTurnDetector?: import("./semanticTurn").VoiceSemanticTurnDetector;
+    bargeInDetector?: import("./bargeInDetector").VoiceBargeInDetector;
     /**
      * Pre-rendered filler phrases the runtime plays in the gap between
      * user-turn-commit and real assistant audio (typically 800-1500ms). The

package/dist/index.d.ts CHANGED

@@ -92,6 +92,8 @@ export type { CreateVoiceCostAccountantOptions, VoiceCostAccountant, VoiceCostBr
 export { describeVoiceAssistantMode, resolveVoiceAssistantMode, } from "./core/assistantMode";
 export type { VoiceAssistantMode, VoiceAssistantModality, VoiceAssistantModeDescriptor, VoiceSemanticVADConfig, } from "./core/assistantMode";
 export { createPunctuationSemanticTurnDetector, createRegexSemanticTurnDetector, } from "./core/semanticTurn";
+export { createAcousticBargeInDetector } from "./core/bargeInDetector";
+export type { CreateAcousticBargeInDetectorOptions, VoiceBargeInDetector, VoiceBargeInInput, VoiceBargeInVerdict, } from "./core/bargeInDetector";
 export { VOICE_WEBHOOK_SIGNATURE_HEADER, VOICE_WEBHOOK_TIMESTAMP_HEADER, extractVoiceWebhookSignatureFromHeaders, signVoiceWebhookBody, verifyVoiceWebhookSignature, } from "./core/webhookVerification";
 export { describeVoiceAgentUIState, deriveVoiceAgentUIState, voiceAgentUIStateOrder, } from "./core/agentState";
 export type { VoiceAgentUIInput, VoiceAgentUIState } from "./core/agentState";

package/dist/index.js CHANGED

@@ -4245,7 +4245,7 @@ var createVoiceSession = (options) => {
   };
   const turnAudioInputFormat = recordingConfig?.userInputFormat ?? options.realtimeInputFormat ?? DEFAULT_REALTIME_FORMAT;
   const getTurnAudioForDetector = () => {
-    if (!options.semanticTurnDetector || currentTurnAudio.length === 0) {
+    if (!options.semanticTurnDetector && !options.bargeInDetector || currentTurnAudio.length === 0) {
       return { turnAudio: undefined, turnAudioFormat: undefined };
     }
     const turnAudio = currentTurnAudio.map((audio) => {
@@ -5270,30 +5270,7 @@ var createVoiceSession = (options) => {
       const triggeringText = transcript.text.trim();
       if (triggeringText) {
         const wordCount = triggeringText.split(/\s+/).length;
-        if (wordCount >= bargeInMinPartialWords && backchannelBargeInGuard && isBackchannelUtterance(triggeringText)) {
-          backchannelSuppressedAt = Date.now();
-          appendTurnLatencyStage({
-            metadata: {
-              partial: triggeringText.slice(0, 200),
-              reason: "backchannel",
-              wordCount
-            },
-            stage: "barge_in_suppressed",
-            turnId: activeTTSTurnId
-          }).catch(() => {});
-        } else if (wordCount >= bargeInMinPartialWords) {
-          backchannelSuppressedAt = null;
-          appendTurnLatencyStage({
-            metadata: {
-              partial: triggeringText.slice(0, 200),
-              source: "stt_partial",
-              wordCount
-            },
-            stage: "barge_in",
-            turnId: activeTTSTurnId
-          }).catch(() => {});
-          cancelActiveTTS("barge-in");
-        } else {
+        if (wordCount < bargeInMinPartialWords) {
           appendTurnLatencyStage({
             metadata: {
               partial: triggeringText.slice(0, 200),
@@ -5303,6 +5280,39 @@ var createVoiceSession = (options) => {
             stage: "barge_in_suppressed",
             turnId: activeTTSTurnId
           }).catch(() => {});
+        } else {
+          const isBackchannelByText = backchannelBargeInGuard && isBackchannelUtterance(triggeringText);
+          const verdict = options.bargeInDetector ? await Promise.resolve(options.bargeInDetector.evaluate({
+            isBackchannelByText,
+            partialText: triggeringText,
+            wordCount,
+            ...getTurnAudioForDetector()
+          })) : { reason: undefined, shouldCancel: !isBackchannelByText };
+          const reason = verdict.reason ?? (verdict.shouldCancel ? "stt_partial" : "backchannel");
+          if (verdict.shouldCancel) {
+            backchannelSuppressedAt = null;
+            appendTurnLatencyStage({
+              metadata: {
+                partial: triggeringText.slice(0, 200),
+                source: reason,
+                wordCount
+              },
+              stage: "barge_in",
+              turnId: activeTTSTurnId
+            }).catch(() => {});
+            cancelActiveTTS("barge-in");
+          } else {
+            backchannelSuppressedAt = Date.now();
+            appendTurnLatencyStage({
+              metadata: {
+                partial: triggeringText.slice(0, 200),
+                reason,
+                wordCount
+              },
+              stage: "barge_in_suppressed",
+              turnId: activeTTSTurnId
+            }).catch(() => {});
+          }
         }
       }
     }
@@ -39707,6 +39717,7 @@ var voice = (config) => {
       sttFallback: sessionOptions.sttFallback,
       sttLifecycle: sessionOptions.sttLifecycle,
       ...config.semanticTurnDetector ? { semanticTurnDetector: config.semanticTurnDetector } : {},
+      ...config.bargeInDetector ? { bargeInDetector: config.bargeInDetector } : {},
       ...config.bargeInMinPartialWords !== undefined ? { bargeInMinPartialWords: config.bargeInMinPartialWords } : {},
       ...config.backchannelBargeInGuard !== undefined ? { backchannelBargeInGuard: config.backchannelBargeInGuard } : {},
       ...config.fillerPhrases ? { fillerPhrases: config.fillerPhrases } : {},
@@ -41989,6 +42000,56 @@ var createRegexSemanticTurnDetector = (options) => {
     }
   };
 };
+// src/core/bargeInDetector.ts
+var measureTurnAudio = (chunks, format) => {
+  const channels = format.channels ?? 1;
+  const sampleRate = format.sampleRateHz ?? 16000;
+  let sumSquares = 0;
+  let sampleCount = 0;
+  for (const chunk of chunks) {
+    const usableBytes = chunk.byteLength - chunk.byteLength % 2;
+    const view = new DataView(chunk.buffer, chunk.byteOffset, usableBytes);
+    for (let offset = 0;offset < usableBytes; offset += 2) {
+      const sample = view.getInt16(offset, true) / 32768;
+      sumSquares += sample * sample;
+      sampleCount += 1;
+    }
+  }
+  if (sampleCount === 0) {
+    return { durationMs: 0, rms: 0 };
+  }
+  return {
+    durationMs: sampleCount / channels / sampleRate * 1000,
+    rms: Math.sqrt(sumSquares / sampleCount)
+  };
+};
+var createAcousticBargeInDetector = (options = {}) => {
+  const sustainedMs = options.sustainedMs ?? 700;
+  const emphaticRms = options.emphaticRms ?? 0.16;
+  const noiseFloorRms = options.noiseFloorRms ?? 0.035;
+  return {
+    evaluate: (input) => {
+      const { turnAudio, turnAudioFormat } = input;
+      if (!turnAudio || turnAudio.length === 0 || !turnAudioFormat) {
+        return input.isBackchannelByText ? { reason: "text_backchannel", shouldCancel: false } : { reason: "text_only", shouldCancel: true };
+      }
+      const { durationMs, rms } = measureTurnAudio(turnAudio, turnAudioFormat);
+      if (durationMs >= sustainedMs) {
+        return { reason: "acoustic_sustained", shouldCancel: true };
+      }
+      if (input.isBackchannelByText) {
+        return { reason: "acoustic_backchannel", shouldCancel: false };
+      }
+      if (rms >= emphaticRms) {
+        return { reason: "acoustic_emphatic", shouldCancel: true };
+      }
+      if (rms <= noiseFloorRms) {
+        return { reason: "acoustic_noise_floor", shouldCancel: false };
+      }
+      return { reason: "acoustic_ambiguous", shouldCancel: true };
+    }
+  };
+};
 // src/core/webhookVerification.ts
 var VOICE_WEBHOOK_SIGNATURE_HEADER = "x-absolutejs-signature";
 var VOICE_WEBHOOK_TIMESTAMP_HEADER = "x-absolutejs-timestamp";
@@ -53726,6 +53787,7 @@ export {
   createCoturnIceServers,
   createCachedTTS,
   createAnthropicVoiceAssistantModel,
+  createAcousticBargeInDetector,
   createAIVoiceModel,
   conditionAudioChunk,
   computeVoiceScorecardCalibration,

package/dist/testing/index.js CHANGED

@@ -6472,7 +6472,7 @@ var createVoiceSession = (options) => {
   };
   const turnAudioInputFormat = recordingConfig?.userInputFormat ?? options.realtimeInputFormat ?? DEFAULT_REALTIME_FORMAT;
   const getTurnAudioForDetector = () => {
-    if (!options.semanticTurnDetector || currentTurnAudio.length === 0) {
+    if (!options.semanticTurnDetector && !options.bargeInDetector || currentTurnAudio.length === 0) {
       return { turnAudio: undefined, turnAudioFormat: undefined };
     }
     const turnAudio = currentTurnAudio.map((audio) => {
@@ -7497,30 +7497,7 @@ var createVoiceSession = (options) => {
       const triggeringText = transcript.text.trim();
       if (triggeringText) {
         const wordCount = triggeringText.split(/\s+/).length;
-        if (wordCount >= bargeInMinPartialWords && backchannelBargeInGuard && isBackchannelUtterance(triggeringText)) {
-          backchannelSuppressedAt = Date.now();
-          appendTurnLatencyStage({
-            metadata: {
-              partial: triggeringText.slice(0, 200),
-              reason: "backchannel",
-              wordCount
-            },
-            stage: "barge_in_suppressed",
-            turnId: activeTTSTurnId
-          }).catch(() => {});
-        } else if (wordCount >= bargeInMinPartialWords) {
-          backchannelSuppressedAt = null;
-          appendTurnLatencyStage({
-            metadata: {
-              partial: triggeringText.slice(0, 200),
-              source: "stt_partial",
-              wordCount
-            },
-            stage: "barge_in",
-            turnId: activeTTSTurnId
-          }).catch(() => {});
-          cancelActiveTTS("barge-in");
-        } else {
+        if (wordCount < bargeInMinPartialWords) {
           appendTurnLatencyStage({
             metadata: {
               partial: triggeringText.slice(0, 200),
@@ -7530,6 +7507,39 @@ var createVoiceSession = (options) => {
             stage: "barge_in_suppressed",
             turnId: activeTTSTurnId
           }).catch(() => {});
+        } else {
+          const isBackchannelByText = backchannelBargeInGuard && isBackchannelUtterance(triggeringText);
+          const verdict = options.bargeInDetector ? await Promise.resolve(options.bargeInDetector.evaluate({
+            isBackchannelByText,
+            partialText: triggeringText,
+            wordCount,
+            ...getTurnAudioForDetector()
+          })) : { reason: undefined, shouldCancel: !isBackchannelByText };
+          const reason = verdict.reason ?? (verdict.shouldCancel ? "stt_partial" : "backchannel");
+          if (verdict.shouldCancel) {
+            backchannelSuppressedAt = null;
+            appendTurnLatencyStage({
+              metadata: {
+                partial: triggeringText.slice(0, 200),
+                source: reason,
+                wordCount
+              },
+              stage: "barge_in",
+              turnId: activeTTSTurnId
+            }).catch(() => {});
+            cancelActiveTTS("barge-in");
+          } else {
+            backchannelSuppressedAt = Date.now();
+            appendTurnLatencyStage({
+              metadata: {
+                partial: triggeringText.slice(0, 200),
+                reason,
+                wordCount
+              },
+              stage: "barge_in_suppressed",
+              turnId: activeTTSTurnId
+            }).catch(() => {});
+          }
         }
       }
     }

package/package.json CHANGED

@@ -1,6 +1,6 @@
 {
   "name": "@absolutejs/voice",
-  "version": "0.0.22-beta.610",
+  "version": "0.0.22-beta.611",
   "description": "Voice primitives and Elysia plugin for AbsoluteJS",
   "repository": {
     "type": "git",