npm - @absolutejs/voice - Versions diffs - 0.0.22-beta.607 → 0.0.22-beta.608 - Mend

@absolutejs/voice 0.0.22-beta.607 → 0.0.22-beta.608

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Files changed (5)

package/dist/core/backchannel.d.ts +1 -0
package/dist/core/types.d.ts +16 -0
package/dist/index.js +116 -1
package/dist/testing/index.js +115 -1
package/package.json +1 -1

package/dist/core/backchannel.d.ts CHANGED Viewed

@@ -21,4 +21,5 @@ export type VoiceBackchannelDriver = {
     noteSilence: (timestampMs?: number) => void;
     reset: () => void;
 };
+export declare const isBackchannelUtterance: (text: string, maxWords?: number) => boolean;
 export declare const createVoiceBackchannelDriver: (options: VoiceBackchannelDriverOptions) => VoiceBackchannelDriver;

package/dist/core/types.d.ts CHANGED Viewed

@@ -802,6 +802,14 @@ export type VoicePluginConfig<TContext = unknown, TSession extends VoiceSessionR
     turnDetection?: VoiceTurnDetectionConfig;
     semanticTurnDetector?: import("./semanticTurn").VoiceSemanticTurnDetector;
     bargeInMinPartialWords?: number;
+    /**
+     * When true, a pure listening cue ("mm-hm", "yeah", "right", "got it") spoken
+     * WHILE the assistant is talking does NOT barge in — the assistant keeps going
+     * and the cue is dropped so it never becomes the caller's next turn. A bare
+     * "yeah" said AFTER the assistant finishes is a normal answer, unaffected.
+     * Default false (any in-speech words interrupt, the prior behavior).
+     */
+    backchannelBargeInGuard?: boolean;
     fillerPhrases?: ReadonlyArray<string>;
     fillerDelayMs?: number;
     fillerFor?: (input: {
@@ -975,6 +983,14 @@ export type CreateVoiceSessionOptions<TContext = unknown, TSession extends Voice
      * Word splitting is whitespace-based. Punctuation is left attached.
      */
     bargeInMinPartialWords?: number;
+    /**
+     * When true, a pure listening cue ("mm-hm", "yeah", "right", "got it") spoken
+     * WHILE the assistant is talking does NOT barge in — the assistant keeps going
+     * and the cue is dropped so it never becomes the caller's next turn. A bare
+     * "yeah" said AFTER the assistant finishes is a normal answer, unaffected.
+     * Default false (any in-speech words interrupt, the prior behavior).
+     */
+    backchannelBargeInGuard?: boolean;
     fillerPhrases?: ReadonlyArray<string>;
     /** Milliseconds after turn-commit before the filler fires. Default 250ms — short enough to feel instant, long enough to skip if the LLM is very fast. */
     fillerDelayMs?: number;

package/dist/index.js CHANGED Viewed

@@ -3118,6 +3118,100 @@ var DEFAULT_CUES = [
   { text: "right" },
   { text: "go on" }
 ];
+var BACKCHANNEL_TOKENS = new Set([
+  "mm",
+  "mmm",
+  "mhm",
+  "mmhm",
+  "mmhmm",
+  "hm",
+  "hmm",
+  "uh-huh",
+  "uhhuh",
+  "uh",
+  "huh",
+  "ah",
+  "oh",
+  "yeah",
+  "yep",
+  "yup",
+  "yes",
+  "ya",
+  "yah",
+  "ok",
+  "okay",
+  "k",
+  "kay",
+  "right",
+  "sure",
+  "totally",
+  "exactly",
+  "absolutely",
+  "definitely",
+  "gotcha",
+  "cool",
+  "nice",
+  "wow",
+  "true",
+  "fair",
+  "aha",
+  "perfect",
+  "awesome",
+  "great",
+  "good",
+  "wonderful",
+  "amazing",
+  "interesting",
+  "understood",
+  "agreed"
+]);
+var BACKCHANNEL_PHRASES = new Set([
+  "i see",
+  "got it",
+  "makes sense",
+  "of course",
+  "for sure",
+  "fair enough",
+  "sounds good",
+  "i know",
+  "oh ok",
+  "oh okay",
+  "that's right",
+  "thats right",
+  "oh wow",
+  "oh nice",
+  "oh cool",
+  "uh huh",
+  "mm hm",
+  "mm hmm",
+  "i hear you",
+  "for real",
+  "no way",
+  "makes total sense",
+  "got you",
+  "i get it",
+  "right right",
+  "yeah yeah",
+  "ok ok",
+  "oh i see",
+  "oh got it",
+  "yeah totally",
+  "yeah exactly"
+]);
+var isBackchannelUtterance = (text, maxWords = 3) => {
+  const normalized = text.toLowerCase().replace(/[^a-z']/g, " ").replace(/\s+/g, " ").trim();
+  if (!normalized) {
+    return false;
+  }
+  if (BACKCHANNEL_PHRASES.has(normalized)) {
+    return true;
+  }
+  const words = normalized.split(" ");
+  if (words.length > maxWords) {
+    return false;
+  }
+  return words.every((word) => BACKCHANNEL_TOKENS.has(word));
+};
 var createVoiceBackchannelDriver = (options) => {
   const cues = options.cues ?? DEFAULT_CUES;
   const minSpeechMs = options.minSpeechMs ?? 2500;
@@ -3757,6 +3851,7 @@ var MAX_TTS_CHUNK_CHARS = 320;
 var STREAM_SENTENCE_END = /[.!?\u2026]['")\]]*$/;
 var STREAM_IDLE_FLUSH_MS = 350;
 var SPECULATIVE_DELAY_MS = 500;
+var BACKCHANNEL_DROP_WINDOW_MS = 2000;
 var nextSpeakableBoundary = (buffer) => {
   const match = STREAM_SENTENCE_BOUNDARY.exec(buffer);
   return match ? match.index + match[0].length : -1;
@@ -3996,6 +4091,8 @@ var createVoiceSession = (options) => {
   const fillerPhrases = (options.fillerPhrases ?? []).filter((p) => typeof p === "string" && p.trim().length > 0);
   const fillerDelayMs = options.fillerDelayMs ?? 250;
   const bargeInMinPartialWords = Math.max(1, options.bargeInMinPartialWords ?? 1);
+  const backchannelBargeInGuard = options.backchannelBargeInGuard ?? false;
+  let backchannelSuppressedAt = null;
   const fillerFor = options.fillerFor;
   const fillerForTimeoutMs = options.fillerForTimeoutMs ?? 600;
   const currentTurnAudio = [];
@@ -5169,7 +5266,19 @@ var createVoiceSession = (options) => {
       const triggeringText = transcript.text.trim();
       if (triggeringText) {
         const wordCount = triggeringText.split(/\s+/).length;
-        if (wordCount >= bargeInMinPartialWords) {
+        if (wordCount >= bargeInMinPartialWords && backchannelBargeInGuard && isBackchannelUtterance(triggeringText)) {
+          backchannelSuppressedAt = Date.now();
+          appendTurnLatencyStage({
+            metadata: {
+              partial: triggeringText.slice(0, 200),
+              reason: "backchannel",
+              wordCount
+            },
+            stage: "barge_in_suppressed",
+            turnId: activeTTSTurnId
+          }).catch(() => {});
+        } else if (wordCount >= bargeInMinPartialWords) {
+          backchannelSuppressedAt = null;
           appendTurnLatencyStage({
             metadata: {
               partial: triggeringText.slice(0, 200),
@@ -5231,6 +5340,11 @@ var createVoiceSession = (options) => {
   };
   const handleFinal = async (transcript) => {
     sttReconnectCount = 0;
+    if (backchannelBargeInGuard && backchannelSuppressedAt !== null && Date.now() - backchannelSuppressedAt < BACKCHANNEL_DROP_WINDOW_MS && isBackchannelUtterance(transcript.text.trim())) {
+      backchannelSuppressedAt = null;
+      return;
+    }
+    backchannelSuppressedAt = null;
     const session = await writeSession((session2) => {
       const alreadyPresent = session2.currentTurn.transcripts.some((existing) => existing.id === transcript.id);
       if (!alreadyPresent) {
@@ -39581,6 +39695,7 @@ var voice = (config) => {
       sttLifecycle: sessionOptions.sttLifecycle,
       ...config.semanticTurnDetector ? { semanticTurnDetector: config.semanticTurnDetector } : {},
       ...config.bargeInMinPartialWords !== undefined ? { bargeInMinPartialWords: config.bargeInMinPartialWords } : {},
+      ...config.backchannelBargeInGuard !== undefined ? { backchannelBargeInGuard: config.backchannelBargeInGuard } : {},
       ...config.fillerPhrases ? { fillerPhrases: config.fillerPhrases } : {},
       ...config.fillerDelayMs !== undefined ? { fillerDelayMs: config.fillerDelayMs } : {},
       ...config.fillerFor ? { fillerFor: config.fillerFor } : {},

package/dist/testing/index.js CHANGED Viewed

@@ -5450,6 +5450,100 @@ var DEFAULT_CUES = [
   { text: "right" },
   { text: "go on" }
 ];
+var BACKCHANNEL_TOKENS = new Set([
+  "mm",
+  "mmm",
+  "mhm",
+  "mmhm",
+  "mmhmm",
+  "hm",
+  "hmm",
+  "uh-huh",
+  "uhhuh",
+  "uh",
+  "huh",
+  "ah",
+  "oh",
+  "yeah",
+  "yep",
+  "yup",
+  "yes",
+  "ya",
+  "yah",
+  "ok",
+  "okay",
+  "k",
+  "kay",
+  "right",
+  "sure",
+  "totally",
+  "exactly",
+  "absolutely",
+  "definitely",
+  "gotcha",
+  "cool",
+  "nice",
+  "wow",
+  "true",
+  "fair",
+  "aha",
+  "perfect",
+  "awesome",
+  "great",
+  "good",
+  "wonderful",
+  "amazing",
+  "interesting",
+  "understood",
+  "agreed"
+]);
+var BACKCHANNEL_PHRASES = new Set([
+  "i see",
+  "got it",
+  "makes sense",
+  "of course",
+  "for sure",
+  "fair enough",
+  "sounds good",
+  "i know",
+  "oh ok",
+  "oh okay",
+  "that's right",
+  "thats right",
+  "oh wow",
+  "oh nice",
+  "oh cool",
+  "uh huh",
+  "mm hm",
+  "mm hmm",
+  "i hear you",
+  "for real",
+  "no way",
+  "makes total sense",
+  "got you",
+  "i get it",
+  "right right",
+  "yeah yeah",
+  "ok ok",
+  "oh i see",
+  "oh got it",
+  "yeah totally",
+  "yeah exactly"
+]);
+var isBackchannelUtterance = (text, maxWords = 3) => {
+  const normalized = text.toLowerCase().replace(/[^a-z']/g, " ").replace(/\s+/g, " ").trim();
+  if (!normalized) {
+    return false;
+  }
+  if (BACKCHANNEL_PHRASES.has(normalized)) {
+    return true;
+  }
+  const words = normalized.split(" ");
+  if (words.length > maxWords) {
+    return false;
+  }
+  return words.every((word) => BACKCHANNEL_TOKENS.has(word));
+};
 var createVoiceBackchannelDriver = (options) => {
   const cues = options.cues ?? DEFAULT_CUES;
   const minSpeechMs = options.minSpeechMs ?? 2500;
@@ -5984,6 +6078,7 @@ var MAX_TTS_CHUNK_CHARS = 320;
 var STREAM_SENTENCE_END = /[.!?\u2026]['")\]]*$/;
 var STREAM_IDLE_FLUSH_MS = 350;
 var SPECULATIVE_DELAY_MS = 500;
+var BACKCHANNEL_DROP_WINDOW_MS = 2000;
 var nextSpeakableBoundary = (buffer) => {
   const match = STREAM_SENTENCE_BOUNDARY.exec(buffer);
   return match ? match.index + match[0].length : -1;
@@ -6223,6 +6318,8 @@ var createVoiceSession = (options) => {
   const fillerPhrases = (options.fillerPhrases ?? []).filter((p) => typeof p === "string" && p.trim().length > 0);
   const fillerDelayMs = options.fillerDelayMs ?? 250;
   const bargeInMinPartialWords = Math.max(1, options.bargeInMinPartialWords ?? 1);
+  const backchannelBargeInGuard = options.backchannelBargeInGuard ?? false;
+  let backchannelSuppressedAt = null;
   const fillerFor = options.fillerFor;
   const fillerForTimeoutMs = options.fillerForTimeoutMs ?? 600;
   const currentTurnAudio = [];
@@ -7396,7 +7493,19 @@ var createVoiceSession = (options) => {
       const triggeringText = transcript.text.trim();
       if (triggeringText) {
         const wordCount = triggeringText.split(/\s+/).length;
-        if (wordCount >= bargeInMinPartialWords) {
+        if (wordCount >= bargeInMinPartialWords && backchannelBargeInGuard && isBackchannelUtterance(triggeringText)) {
+          backchannelSuppressedAt = Date.now();
+          appendTurnLatencyStage({
+            metadata: {
+              partial: triggeringText.slice(0, 200),
+              reason: "backchannel",
+              wordCount
+            },
+            stage: "barge_in_suppressed",
+            turnId: activeTTSTurnId
+          }).catch(() => {});
+        } else if (wordCount >= bargeInMinPartialWords) {
+          backchannelSuppressedAt = null;
           appendTurnLatencyStage({
             metadata: {
               partial: triggeringText.slice(0, 200),
@@ -7458,6 +7567,11 @@ var createVoiceSession = (options) => {
   };
   const handleFinal = async (transcript) => {
     sttReconnectCount = 0;
+    if (backchannelBargeInGuard && backchannelSuppressedAt !== null && Date.now() - backchannelSuppressedAt < BACKCHANNEL_DROP_WINDOW_MS && isBackchannelUtterance(transcript.text.trim())) {
+      backchannelSuppressedAt = null;
+      return;
+    }
+    backchannelSuppressedAt = null;
     const session = await writeSession((session2) => {
       const alreadyPresent = session2.currentTurn.transcripts.some((existing) => existing.id === transcript.id);
       if (!alreadyPresent) {

package/package.json CHANGED Viewed

@@ -1,6 +1,6 @@
 {
   "name": "@absolutejs/voice",
-  "version": "0.0.22-beta.607",
+  "version": "0.0.22-beta.608",
   "description": "Voice primitives and Elysia plugin for AbsoluteJS",
   "repository": {
     "type": "git",