@absolutejs/voice 0.0.22-beta.613 → 0.0.22-beta.614

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -17,7 +17,7 @@ export type VoiceBargeInVerdict = {
17
17
  reason?: string;
18
18
  /**
19
19
  * The acoustic measurements the decision used, surfaced on the trace for
20
- * tuning the thresholds against real audio. Omitted when no audio was judged.
20
+ * tuning against real audio. Omitted when no audio was judged.
21
21
  */
22
22
  metrics?: {
23
23
  voicedMs: number;
@@ -28,22 +28,24 @@ export type VoiceBargeInDetector = {
28
28
  evaluate: (input: VoiceBargeInInput) => Promise<VoiceBargeInVerdict> | VoiceBargeInVerdict;
29
29
  };
30
30
  export type CreateAcousticBargeInDetectorOptions = {
31
- /** Speech sustained this long (ms) is a real interruption regardless of text/energy. */
31
+ /** Voiced speech sustained this long (ms) is a real interruption → cancel. */
32
32
  sustainedMs?: number;
33
- /** RMS (0-1) at/above this is an emphatic onset ("Wait!") — cancel even if short. */
34
- emphaticRms?: number;
35
- /** Below this RMS (0-1) a short burst is incidental noise — keep talking. */
36
- noiseFloorRms?: number;
33
+ /**
34
+ * Leading words that mark an interruption ("wait", "hold on", "sorry"). A
35
+ * short utterance starting with one cancels immediately instead of holding.
36
+ * Extends (does not replace) the defaults.
37
+ */
38
+ interruptionCues?: string[];
37
39
  };
38
40
  /**
39
- * A model-free acoustic backchannel-vs-barge-in classifier. Combines the user's
40
- * speech duration + onset energy with the text backchannel signal:
41
- * - sustained speech → real interruption (cancel)
42
- * - known cue word, stayed short → backchannel (keep talking)
43
- * - short but loud/sharp onset → emphatic interruption like "Wait!" (cancel)
44
- * - short + quiet → incidental noise (keep talking)
45
- * - short + moderate, real words → ambiguous, default to cancel (don't strand
46
- * a genuine short interruption)
41
+ * A model-free backchannel-vs-barge-in classifier driven by TEXT + PERSISTENCE
42
+ * (energy was measured to not discriminate on normalized speech):
43
+ * - voiced speech past `sustainedMs` → real interruption (cancel)
44
+ * - known backchannel cue, still short → backchannel (keep talking)
45
+ * - starts with an interruption cue → caller took the floor (cancel)
46
+ * - otherwise short + ambiguous → HOLD: keep talking; a continuing
47
+ * utterance cancels itself once its
48
+ * voiced duration crosses sustainedMs
47
49
  * Runs in-process on raw arithmetic — no model, no sidecar.
48
50
  */
49
51
  export declare const createAcousticBargeInDetector: (options?: CreateAcousticBargeInDetectorOptions) => VoiceBargeInDetector;
package/dist/index.js CHANGED
@@ -25354,6 +25354,7 @@ var createTwilioMediaStreamBridge = (socket, options) => {
25354
25354
  sttFallback: resolveSTTFallbackConfig(options.sttFallback),
25355
25355
  sttLifecycle: options.sttLifecycle ?? runtimePreset.sttLifecycle,
25356
25356
  ...options.semanticTurnDetector ? { semanticTurnDetector: options.semanticTurnDetector } : {},
25357
+ ...options.bargeInDetector ? { bargeInDetector: options.bargeInDetector } : {},
25357
25358
  ...options.fillerPhrases ? { fillerPhrases: options.fillerPhrases } : {},
25358
25359
  ...options.fillerDelayMs !== undefined ? { fillerDelayMs: options.fillerDelayMs } : {},
25359
25360
  ...options.bargeInMinPartialWords !== undefined ? { bargeInMinPartialWords: options.bargeInMinPartialWords } : {},
@@ -42007,6 +42008,28 @@ var createRegexSemanticTurnDetector = (options) => {
42007
42008
  };
42008
42009
  };
42009
42010
  // src/core/bargeInDetector.ts
42011
+ var DEFAULT_INTERRUPTION_CUES = [
42012
+ "wait",
42013
+ "hold on",
42014
+ "hold up",
42015
+ "hang on",
42016
+ "stop",
42017
+ "sorry",
42018
+ "excuse me",
42019
+ "actually",
42020
+ "one sec",
42021
+ "one second",
42022
+ "quick question",
42023
+ "question",
42024
+ "can i",
42025
+ "let me",
42026
+ "no no"
42027
+ ];
42028
+ var normalize = (text) => text.toLowerCase().replace(/[^a-z\s]/g, " ").replace(/\s+/g, " ").trim();
42029
+ var startsWithCue = (text, cues) => {
42030
+ const norm = normalize(text);
42031
+ return cues.some((cue) => norm === cue || norm.startsWith(`${cue} `));
42032
+ };
42010
42033
  var VOICED_FLOOR = 0.02;
42011
42034
  var measureTurnAudio = (chunks, format) => {
42012
42035
  const channels = format.channels ?? 1;
@@ -42025,38 +42048,44 @@ var measureTurnAudio = (chunks, format) => {
42025
42048
  }
42026
42049
  }
42027
42050
  if (voicedSamples === 0) {
42028
- return { durationMs: 0, rms: 0 };
42051
+ return { rms: 0, voicedMs: 0 };
42029
42052
  }
42030
42053
  return {
42031
- durationMs: voicedSamples / channels / sampleRate * 1000,
42032
- rms: Math.sqrt(sumSquares / voicedSamples)
42054
+ rms: Math.sqrt(sumSquares / voicedSamples),
42055
+ voicedMs: voicedSamples / channels / sampleRate * 1000
42033
42056
  };
42034
42057
  };
42035
42058
  var createAcousticBargeInDetector = (options = {}) => {
42036
- const sustainedMs = options.sustainedMs ?? 700;
42037
- const emphaticRms = options.emphaticRms ?? 0.16;
42038
- const noiseFloorRms = options.noiseFloorRms ?? 0.035;
42059
+ const sustainedMs = options.sustainedMs ?? 600;
42060
+ const interruptionCues = [
42061
+ ...DEFAULT_INTERRUPTION_CUES,
42062
+ ...options.interruptionCues ?? []
42063
+ ];
42039
42064
  return {
42040
42065
  evaluate: (input) => {
42066
+ const isInterruptionCue = startsWithCue(input.partialText, interruptionCues);
42041
42067
  const { turnAudio, turnAudioFormat } = input;
42042
42068
  if (!turnAudio || turnAudio.length === 0 || !turnAudioFormat) {
42043
- return input.isBackchannelByText ? { reason: "text_backchannel", shouldCancel: false } : { reason: "text_only", shouldCancel: true };
42069
+ if (input.isBackchannelByText) {
42070
+ return { reason: "text_backchannel", shouldCancel: false };
42071
+ }
42072
+ return {
42073
+ reason: isInterruptionCue ? "text_interruption" : "text_only",
42074
+ shouldCancel: true
42075
+ };
42044
42076
  }
42045
- const { durationMs, rms } = measureTurnAudio(turnAudio, turnAudioFormat);
42046
- const metrics = { rms, voicedMs: Math.round(durationMs) };
42047
- if (durationMs >= sustainedMs) {
42077
+ const metrics = measureTurnAudio(turnAudio, turnAudioFormat);
42078
+ metrics.voicedMs = Math.round(metrics.voicedMs);
42079
+ if (metrics.voicedMs >= sustainedMs) {
42048
42080
  return { metrics, reason: "acoustic_sustained", shouldCancel: true };
42049
42081
  }
42050
42082
  if (input.isBackchannelByText) {
42051
42083
  return { metrics, reason: "acoustic_backchannel", shouldCancel: false };
42052
42084
  }
42053
- if (rms >= emphaticRms) {
42054
- return { metrics, reason: "acoustic_emphatic", shouldCancel: true };
42055
- }
42056
- if (rms <= noiseFloorRms) {
42057
- return { metrics, reason: "acoustic_noise_floor", shouldCancel: false };
42085
+ if (isInterruptionCue) {
42086
+ return { metrics, reason: "acoustic_interruption", shouldCancel: true };
42058
42087
  }
42059
- return { metrics, reason: "acoustic_ambiguous", shouldCancel: true };
42088
+ return { metrics, reason: "acoustic_hold", shouldCancel: false };
42060
42089
  }
42061
42090
  };
42062
42091
  };
@@ -51785,7 +51814,7 @@ var buildVoiceAgentPerformanceReport = (input) => {
51785
51814
  };
51786
51815
  };
51787
51816
  // src/core/scorecardCalibration.ts
51788
- var normalize = (raw, scaleMax) => scaleMax === 0 ? 0 : raw / scaleMax;
51817
+ var normalize2 = (raw, scaleMax) => scaleMax === 0 ? 0 : raw / scaleMax;
51789
51818
  var correlation = (xs, ys) => {
51790
51819
  if (xs.length === 0 || xs.length !== ys.length)
51791
51820
  return 0;
@@ -51844,8 +51873,8 @@ var computeVoiceScorecardCalibration = (pairs, options = {}) => {
51844
51873
  const l = llmByCriterion.get(criterionId);
51845
51874
  if (!h || !l)
51846
51875
  continue;
51847
- const hn = normalize(h.score, pair.human.scaleMax);
51848
- const ln = normalize(l.score, pair.llm.scaleMax);
51876
+ const hn = normalize2(h.score, pair.human.scaleMax);
51877
+ const ln = normalize2(l.score, pair.llm.scaleMax);
51849
51878
  const gap = Math.abs(hn - ln);
51850
51879
  allGaps.push(gap);
51851
51880
  divergences.push({
@@ -133,6 +133,7 @@ export type TwilioMediaStreamBridgeOptions<TContext = unknown, TSession extends
133
133
  * snappy responses on clear-cut answers. See VoiceSemanticTurnDetector.
134
134
  */
135
135
  semanticTurnDetector?: import("../core/semanticTurn").VoiceSemanticTurnDetector;
136
+ bargeInDetector?: import("../core/bargeInDetector").VoiceBargeInDetector;
136
137
  /**
137
138
  * Pre-rendered filler phrases ("Hmm.", "Got it.", "Let me think.") played
138
139
  * in the gap between user-turn-commit and real assistant audio. Boardy's
@@ -14244,6 +14244,7 @@ var createTwilioMediaStreamBridge = (socket, options) => {
14244
14244
  sttFallback: resolveSTTFallbackConfig(options.sttFallback),
14245
14245
  sttLifecycle: options.sttLifecycle ?? runtimePreset.sttLifecycle,
14246
14246
  ...options.semanticTurnDetector ? { semanticTurnDetector: options.semanticTurnDetector } : {},
14247
+ ...options.bargeInDetector ? { bargeInDetector: options.bargeInDetector } : {},
14247
14248
  ...options.fillerPhrases ? { fillerPhrases: options.fillerPhrases } : {},
14248
14249
  ...options.fillerDelayMs !== undefined ? { fillerDelayMs: options.fillerDelayMs } : {},
14249
14250
  ...options.bargeInMinPartialWords !== undefined ? { bargeInMinPartialWords: options.bargeInMinPartialWords } : {},
package/package.json CHANGED
@@ -1,6 +1,6 @@
1
1
  {
2
2
  "name": "@absolutejs/voice",
3
- "version": "0.0.22-beta.613",
3
+ "version": "0.0.22-beta.614",
4
4
  "description": "Voice primitives and Elysia plugin for AbsoluteJS",
5
5
  "repository": {
6
6
  "type": "git",