npm - @absolutejs/voice - Versions diffs - 0.0.22-beta.613 → 0.0.22-beta.615 - Mend

@absolutejs/voice 0.0.22-beta.613 → 0.0.22-beta.615

This diff compares the contents of two publicly available versions of a package, as released to one of the supported registries. It is provided for informational purposes only and reflects the changes between those package versions as they appear in their respective public registries.

Files changed (6) hide show

package/dist/core/bargeInDetector.d.ts +16 -14
package/dist/core/costAccounting.d.ts +4 -0
package/dist/index.js +98 -24
package/dist/telephony/twilio.d.ts +1 -0
package/dist/testing/index.js +1 -0
package/package.json +1 -1

package/dist/core/bargeInDetector.d.ts CHANGED Viewed

@@ -17,7 +17,7 @@ export type VoiceBargeInVerdict = {
     reason?: string;
     /**
      * The acoustic measurements the decision used, surfaced on the trace for
-     * tuning the thresholds against real audio. Omitted when no audio was judged.
+     * tuning against real audio. Omitted when no audio was judged.
      */
     metrics?: {
         voicedMs: number;
@@ -28,22 +28,24 @@ export type VoiceBargeInDetector = {
     evaluate: (input: VoiceBargeInInput) => Promise<VoiceBargeInVerdict> | VoiceBargeInVerdict;
 };
 export type CreateAcousticBargeInDetectorOptions = {
-    /** Speech sustained this long (ms) is a real interruption regardless of text/energy. */
+    /** Voiced speech sustained this long (ms) is a real interruption — cancel. */
     sustainedMs?: number;
-    /** RMS (0-1) at/above this is an emphatic onset ("Wait!") — cancel even if short. */
-    emphaticRms?: number;
-    /** Below this RMS (0-1) a short burst is incidental noise — keep talking. */
-    noiseFloorRms?: number;
+    /**
+     * Leading words that mark an interruption ("wait", "hold on", "sorry"). A
+     * short utterance starting with one cancels immediately instead of holding.
+     * Extends (does not replace) the defaults.
+     */
+    interruptionCues?: string[];
 };
 /**
- * A model-free acoustic backchannel-vs-barge-in classifier. Combines the user's
- * speech duration + onset energy with the text backchannel signal:
- *   - sustained speech            → real interruption (cancel)
- *   - known cue word, stayed short → backchannel (keep talking)
- *   - short but loud/sharp onset   → emphatic interruption like "Wait!" (cancel)
- *   - short + quiet                → incidental noise (keep talking)
- *   - short + moderate, real words → ambiguous, default to cancel (don't strand
- *                                    a genuine short interruption)
+ * A model-free backchannel-vs-barge-in classifier driven by TEXT + PERSISTENCE
+ * (energy was measured to not discriminate on normalized speech):
+ *   - voiced speech past `sustainedMs`  → real interruption (cancel)
+ *   - known backchannel cue, still short → backchannel (keep talking)
+ *   - starts with an interruption cue     → caller took the floor (cancel)
+ *   - otherwise short + ambiguous         → HOLD: keep talking; a continuing
+ *                                           utterance cancels itself once its
+ *                                           voiced duration crosses sustainedMs
  * Runs in-process on raw arithmetic — no model, no sidecar.
  */
 export declare const createAcousticBargeInDetector: (options?: CreateAcousticBargeInDetectorOptions) => VoiceBargeInDetector;

package/dist/core/costAccounting.d.ts CHANGED Viewed

@@ -44,21 +44,25 @@ export type VoiceCostBreakdown = {
         cachedInputTokens: number;
         inputTokens: number;
         outputTokens: number;
+        provider?: string;
         usd: number;
     };
     sessionId?: string;
     stt: {
         audioMs: number;
+        provider?: string;
         usd: number;
     };
     telephony: {
         minutes: number;
+        provider?: string;
         usd: number;
     };
     totalUsd: number;
     tts: {
         audioMs: number;
         characters: number;
+        provider?: string;
         usd: number;
     };
 };

package/dist/index.js CHANGED Viewed

@@ -25354,6 +25354,7 @@ var createTwilioMediaStreamBridge = (socket, options) => {
       sttFallback: resolveSTTFallbackConfig(options.sttFallback),
       sttLifecycle: options.sttLifecycle ?? runtimePreset.sttLifecycle,
       ...options.semanticTurnDetector ? { semanticTurnDetector: options.semanticTurnDetector } : {},
+      ...options.bargeInDetector ? { bargeInDetector: options.bargeInDetector } : {},
       ...options.fillerPhrases ? { fillerPhrases: options.fillerPhrases } : {},
       ...options.fillerDelayMs !== undefined ? { fillerDelayMs: options.fillerDelayMs } : {},
       ...options.bargeInMinPartialWords !== undefined ? { bargeInMinPartialWords: options.bargeInMinPartialWords } : {},
@@ -41865,6 +41866,30 @@ var createVoiceCostAccountant = (options = {}) => {
   let sttUsd = 0;
   let telephonyMinutes = 0;
   let telephonyUsd = 0;
+  const llmByProvider = new Map;
+  const ttsByProvider = new Map;
+  const sttByProvider = new Map;
+  const telephonyByProvider = new Map;
+  let lastLlmProvider;
+  let lastTtsProvider;
+  let lastSttProvider;
+  let lastTelephonyProvider;
+  const addProvider = (byProvider, provider, usd) => {
+    if (!provider)
+      return;
+    byProvider.set(provider, (byProvider.get(provider) ?? 0) + usd);
+  };
+  const dominant = (byProvider, fallback) => {
+    let best;
+    let bestUsd = -1;
+    for (const [provider, usd] of byProvider) {
+      if (usd > bestUsd) {
+        bestUsd = usd;
+        best = provider;
+      }
+    }
+    return best ?? fallback;
+  };
   return {
     recordLLM: (usage) => {
       const input = usage.inputTokens ?? 0;
@@ -41873,64 +41898,85 @@ var createVoiceCostAccountant = (options = {}) => {
       llmInput += input;
       llmCachedInput += cached;
       llmOutput += output;
+      if (usage.provider)
+        lastLlmProvider = usage.provider;
       const rates = lookupRates(priceBook, usage.provider, usage.model)?.llm;
       if (!rates) {
         return;
       }
       const cachedRate = rates.cachedInputPerMillionTokensUsd ?? rates.inputPerMillionTokensUsd;
-      llmUsd += Math.max(0, input - cached) * rates.inputPerMillionTokensUsd / 1e6 + cached * cachedRate / 1e6 + output * rates.outputPerMillionTokensUsd / 1e6;
+      const delta = Math.max(0, input - cached) * rates.inputPerMillionTokensUsd / 1e6 + cached * cachedRate / 1e6 + output * rates.outputPerMillionTokensUsd / 1e6;
+      llmUsd += delta;
+      addProvider(llmByProvider, usage.provider, delta);
     },
     recordSTT: (input) => {
       sttAudioMs += Math.max(0, input.audioMs);
+      if (input.provider)
+        lastSttProvider = input.provider;
       const rates = lookupRates(priceBook, input.provider, input.model)?.stt;
       if (!rates) {
         return;
       }
-      sttUsd += Math.max(0, input.audioMs) / 1000 * rates.perSecondUsd;
+      const delta = Math.max(0, input.audioMs) / 1000 * rates.perSecondUsd;
+      sttUsd += delta;
+      addProvider(sttByProvider, input.provider, delta);
     },
     recordTelephony: (input) => {
       telephonyMinutes += Math.max(0, input.minutes);
+      if (input.provider)
+        lastTelephonyProvider = input.provider;
       const rates = lookupRates(priceBook, input.provider)?.telephony;
       if (!rates) {
         return;
       }
-      telephonyUsd += Math.max(0, input.minutes) * rates.perMinuteUsd;
+      const delta = Math.max(0, input.minutes) * rates.perMinuteUsd;
+      telephonyUsd += delta;
+      addProvider(telephonyByProvider, input.provider, delta);
     },
     recordTTS: (input) => {
       const chars = input.characters ?? 0;
       const audioMs = input.audioMs ?? 0;
       ttsCharacters += chars;
       ttsAudioMs += audioMs;
+      if (input.provider)
+        lastTtsProvider = input.provider;
       const rates = lookupRates(priceBook, input.provider, input.voice)?.tts;
       if (!rates) {
         return;
       }
+      let delta = 0;
       if (rates.perMillionCharactersUsd !== undefined && chars > 0) {
-        ttsUsd += chars * rates.perMillionCharactersUsd / 1e6;
+        delta = chars * rates.perMillionCharactersUsd / 1e6;
       } else if (rates.perSecondUsd !== undefined && audioMs > 0) {
-        ttsUsd += audioMs / 1000 * rates.perSecondUsd;
+        delta = audioMs / 1000 * rates.perSecondUsd;
       }
+      ttsUsd += delta;
+      addProvider(ttsByProvider, input.provider, delta);
     },
     snapshot: () => ({
       llm: {
         cachedInputTokens: llmCachedInput,
         inputTokens: llmInput,
         outputTokens: llmOutput,
+        provider: dominant(llmByProvider, lastLlmProvider),
         usd: Math.round(llmUsd * 1e6) / 1e6
       },
       sessionId: options.sessionId,
       stt: {
         audioMs: sttAudioMs,
+        provider: dominant(sttByProvider, lastSttProvider),
         usd: Math.round(sttUsd * 1e6) / 1e6
       },
       telephony: {
         minutes: telephonyMinutes,
+        provider: dominant(telephonyByProvider, lastTelephonyProvider),
         usd: Math.round(telephonyUsd * 1e6) / 1e6
       },
       totalUsd: Math.round((llmUsd + ttsUsd + sttUsd + telephonyUsd) * 1e6) / 1e6,
       tts: {
         audioMs: ttsAudioMs,
         characters: ttsCharacters,
+        provider: dominant(ttsByProvider, lastTtsProvider),
         usd: Math.round(ttsUsd * 1e6) / 1e6
       }
     })
@@ -42007,6 +42053,28 @@ var createRegexSemanticTurnDetector = (options) => {
   };
 };
 // src/core/bargeInDetector.ts
+var DEFAULT_INTERRUPTION_CUES = [
+  "wait",
+  "hold on",
+  "hold up",
+  "hang on",
+  "stop",
+  "sorry",
+  "excuse me",
+  "actually",
+  "one sec",
+  "one second",
+  "quick question",
+  "question",
+  "can i",
+  "let me",
+  "no no"
+];
+var normalize = (text) => text.toLowerCase().replace(/[^a-z\s]/g, " ").replace(/\s+/g, " ").trim();
+var startsWithCue = (text, cues) => {
+  const norm = normalize(text);
+  return cues.some((cue) => norm === cue || norm.startsWith(`${cue} `));
+};
 var VOICED_FLOOR = 0.02;
 var measureTurnAudio = (chunks, format) => {
   const channels = format.channels ?? 1;
@@ -42025,38 +42093,44 @@ var measureTurnAudio = (chunks, format) => {
     }
   }
   if (voicedSamples === 0) {
-    return { durationMs: 0, rms: 0 };
+    return { rms: 0, voicedMs: 0 };
   }
   return {
-    durationMs: voicedSamples / channels / sampleRate * 1000,
-    rms: Math.sqrt(sumSquares / voicedSamples)
+    rms: Math.sqrt(sumSquares / voicedSamples),
+    voicedMs: voicedSamples / channels / sampleRate * 1000
   };
 };
 var createAcousticBargeInDetector = (options = {}) => {
-  const sustainedMs = options.sustainedMs ?? 700;
-  const emphaticRms = options.emphaticRms ?? 0.16;
-  const noiseFloorRms = options.noiseFloorRms ?? 0.035;
+  const sustainedMs = options.sustainedMs ?? 600;
+  const interruptionCues = [
+    ...DEFAULT_INTERRUPTION_CUES,
+    ...options.interruptionCues ?? []
+  ];
   return {
     evaluate: (input) => {
+      const isInterruptionCue = startsWithCue(input.partialText, interruptionCues);
       const { turnAudio, turnAudioFormat } = input;
       if (!turnAudio || turnAudio.length === 0 || !turnAudioFormat) {
-        return input.isBackchannelByText ? { reason: "text_backchannel", shouldCancel: false } : { reason: "text_only", shouldCancel: true };
+        if (input.isBackchannelByText) {
+          return { reason: "text_backchannel", shouldCancel: false };
+        }
+        return {
+          reason: isInterruptionCue ? "text_interruption" : "text_only",
+          shouldCancel: true
+        };
       }
-      const { durationMs, rms } = measureTurnAudio(turnAudio, turnAudioFormat);
-      const metrics = { rms, voicedMs: Math.round(durationMs) };
-      if (durationMs >= sustainedMs) {
+      const metrics = measureTurnAudio(turnAudio, turnAudioFormat);
+      metrics.voicedMs = Math.round(metrics.voicedMs);
+      if (metrics.voicedMs >= sustainedMs) {
         return { metrics, reason: "acoustic_sustained", shouldCancel: true };
       }
       if (input.isBackchannelByText) {
         return { metrics, reason: "acoustic_backchannel", shouldCancel: false };
       }
-      if (rms >= emphaticRms) {
-        return { metrics, reason: "acoustic_emphatic", shouldCancel: true };
-      }
-      if (rms <= noiseFloorRms) {
-        return { metrics, reason: "acoustic_noise_floor", shouldCancel: false };
+      if (isInterruptionCue) {
+        return { metrics, reason: "acoustic_interruption", shouldCancel: true };
       }
-      return { metrics, reason: "acoustic_ambiguous", shouldCancel: true };
+      return { metrics, reason: "acoustic_hold", shouldCancel: false };
     }
   };
 };
@@ -51785,7 +51859,7 @@ var buildVoiceAgentPerformanceReport = (input) => {
   };
 };
 // src/core/scorecardCalibration.ts
-var normalize = (raw, scaleMax) => scaleMax === 0 ? 0 : raw / scaleMax;
+var normalize2 = (raw, scaleMax) => scaleMax === 0 ? 0 : raw / scaleMax;
 var correlation = (xs, ys) => {
   if (xs.length === 0 || xs.length !== ys.length)
     return 0;
@@ -51844,8 +51918,8 @@ var computeVoiceScorecardCalibration = (pairs, options = {}) => {
       const l = llmByCriterion.get(criterionId);
       if (!h || !l)
         continue;
-      const hn = normalize(h.score, pair.human.scaleMax);
-      const ln = normalize(l.score, pair.llm.scaleMax);
+      const hn = normalize2(h.score, pair.human.scaleMax);
+      const ln = normalize2(l.score, pair.llm.scaleMax);
       const gap = Math.abs(hn - ln);
       allGaps.push(gap);
       divergences.push({

package/dist/telephony/twilio.d.ts CHANGED Viewed

@@ -133,6 +133,7 @@ export type TwilioMediaStreamBridgeOptions<TContext = unknown, TSession extends
      * snappy responses on clear-cut answers. See VoiceSemanticTurnDetector.
      */
     semanticTurnDetector?: import("../core/semanticTurn").VoiceSemanticTurnDetector;
+    bargeInDetector?: import("../core/bargeInDetector").VoiceBargeInDetector;
     /**
      * Pre-rendered filler phrases ("Hmm.", "Got it.", "Let me think.") played
      * in the gap between user-turn-commit and real assistant audio. Boardy's

package/dist/testing/index.js CHANGED Viewed

@@ -14244,6 +14244,7 @@ var createTwilioMediaStreamBridge = (socket, options) => {
       sttFallback: resolveSTTFallbackConfig(options.sttFallback),
       sttLifecycle: options.sttLifecycle ?? runtimePreset.sttLifecycle,
       ...options.semanticTurnDetector ? { semanticTurnDetector: options.semanticTurnDetector } : {},
+      ...options.bargeInDetector ? { bargeInDetector: options.bargeInDetector } : {},
       ...options.fillerPhrases ? { fillerPhrases: options.fillerPhrases } : {},
       ...options.fillerDelayMs !== undefined ? { fillerDelayMs: options.fillerDelayMs } : {},
       ...options.bargeInMinPartialWords !== undefined ? { bargeInMinPartialWords: options.bargeInMinPartialWords } : {},

package/package.json CHANGED Viewed

@@ -1,6 +1,6 @@
 {
   "name": "@absolutejs/voice",
-  "version": "0.0.22-beta.613",
+  "version": "0.0.22-beta.615",
   "description": "Voice primitives and Elysia plugin for AbsoluteJS",
   "repository": {
     "type": "git",