@absolutejs/voice 0.0.22-beta.612 → 0.0.22-beta.614

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -15,27 +15,37 @@ export type VoiceBargeInVerdict = {
15
15
  shouldCancel: boolean;
16
16
  /** Diagnostic label, surfaced on the barge_in / barge_in_suppressed trace. */
17
17
  reason?: string;
18
+ /**
19
+ * The acoustic measurements the decision used, surfaced on the trace for
20
+ * tuning against real audio. Omitted when no audio was judged.
21
+ */
22
+ metrics?: {
23
+ voicedMs: number;
24
+ rms: number;
25
+ };
18
26
  };
19
27
  export type VoiceBargeInDetector = {
20
28
  evaluate: (input: VoiceBargeInInput) => Promise<VoiceBargeInVerdict> | VoiceBargeInVerdict;
21
29
  };
22
30
  export type CreateAcousticBargeInDetectorOptions = {
23
- /** Speech sustained this long (ms) is a real interruption regardless of text/energy. */
31
+ /** Voiced speech sustained this long (ms) is a real interruption → cancel. */
24
32
  sustainedMs?: number;
25
- /** RMS (0-1) at/above this is an emphatic onset ("Wait!") — cancel even if short. */
26
- emphaticRms?: number;
27
- /** Below this RMS (0-1) a short burst is incidental noise — keep talking. */
28
- noiseFloorRms?: number;
33
+ /**
34
+ * Leading words that mark an interruption ("wait", "hold on", "sorry"). A
35
+ * short utterance starting with one cancels immediately instead of holding.
36
+ * Extends (does not replace) the defaults.
37
+ */
38
+ interruptionCues?: string[];
29
39
  };
30
40
  /**
31
- * A model-free acoustic backchannel-vs-barge-in classifier. Combines the user's
32
- * speech duration + onset energy with the text backchannel signal:
33
- * - sustained speech → real interruption (cancel)
34
- * - known cue word, stayed short → backchannel (keep talking)
35
- * - short but loud/sharp onset → emphatic interruption like "Wait!" (cancel)
36
- * - short + quiet → incidental noise (keep talking)
37
- * - short + moderate, real words → ambiguous, default to cancel (don't strand
38
- * a genuine short interruption)
41
+ * A model-free backchannel-vs-barge-in classifier driven by TEXT + PERSISTENCE
42
+ * (energy was measured to not discriminate on normalized speech):
43
+ * - voiced speech past `sustainedMs` → real interruption (cancel)
44
+ * - known backchannel cue, still short → backchannel (keep talking)
45
+ * - starts with an interruption cue → caller took the floor (cancel)
46
+ * - otherwise short + ambiguous → HOLD: keep talking; a continuing
47
+ * utterance cancels itself once its
48
+ * voiced duration crosses sustainedMs
39
49
  * Runs in-process on raw arithmetic — no model, no sidecar.
40
50
  */
41
51
  export declare const createAcousticBargeInDetector: (options?: CreateAcousticBargeInDetectorOptions) => VoiceBargeInDetector;
package/dist/index.js CHANGED
@@ -5287,12 +5287,17 @@ var createVoiceSession = (options) => {
5287
5287
  partialText: triggeringText,
5288
5288
  wordCount,
5289
5289
  ...getTurnAudioForDetector()
5290
- })) : { reason: undefined, shouldCancel: !isBackchannelByText };
5290
+ })) : {
5291
+ metrics: undefined,
5292
+ reason: undefined,
5293
+ shouldCancel: !isBackchannelByText
5294
+ };
5291
5295
  const reason = verdict.reason ?? (verdict.shouldCancel ? "stt_partial" : "backchannel");
5292
5296
  if (verdict.shouldCancel) {
5293
5297
  backchannelSuppressedAt = null;
5294
5298
  appendTurnLatencyStage({
5295
5299
  metadata: {
5300
+ ...verdict.metrics,
5296
5301
  partial: triggeringText.slice(0, 200),
5297
5302
  source: reason,
5298
5303
  wordCount
@@ -5305,6 +5310,7 @@ var createVoiceSession = (options) => {
5305
5310
  backchannelSuppressedAt = Date.now();
5306
5311
  appendTurnLatencyStage({
5307
5312
  metadata: {
5313
+ ...verdict.metrics,
5308
5314
  partial: triggeringText.slice(0, 200),
5309
5315
  reason,
5310
5316
  wordCount
@@ -25348,6 +25354,7 @@ var createTwilioMediaStreamBridge = (socket, options) => {
25348
25354
  sttFallback: resolveSTTFallbackConfig(options.sttFallback),
25349
25355
  sttLifecycle: options.sttLifecycle ?? runtimePreset.sttLifecycle,
25350
25356
  ...options.semanticTurnDetector ? { semanticTurnDetector: options.semanticTurnDetector } : {},
25357
+ ...options.bargeInDetector ? { bargeInDetector: options.bargeInDetector } : {},
25351
25358
  ...options.fillerPhrases ? { fillerPhrases: options.fillerPhrases } : {},
25352
25359
  ...options.fillerDelayMs !== undefined ? { fillerDelayMs: options.fillerDelayMs } : {},
25353
25360
  ...options.bargeInMinPartialWords !== undefined ? { bargeInMinPartialWords: options.bargeInMinPartialWords } : {},
@@ -42001,6 +42008,28 @@ var createRegexSemanticTurnDetector = (options) => {
42001
42008
  };
42002
42009
  };
42003
42010
  // src/core/bargeInDetector.ts
42011
+ var DEFAULT_INTERRUPTION_CUES = [
42012
+ "wait",
42013
+ "hold on",
42014
+ "hold up",
42015
+ "hang on",
42016
+ "stop",
42017
+ "sorry",
42018
+ "excuse me",
42019
+ "actually",
42020
+ "one sec",
42021
+ "one second",
42022
+ "quick question",
42023
+ "question",
42024
+ "can i",
42025
+ "let me",
42026
+ "no no"
42027
+ ];
42028
+ var normalize = (text) => text.toLowerCase().replace(/[^a-z\s]/g, " ").replace(/\s+/g, " ").trim();
42029
+ var startsWithCue = (text, cues) => {
42030
+ const norm = normalize(text);
42031
+ return cues.some((cue) => norm === cue || norm.startsWith(`${cue} `));
42032
+ };
42004
42033
  var VOICED_FLOOR = 0.02;
42005
42034
  var measureTurnAudio = (chunks, format) => {
42006
42035
  const channels = format.channels ?? 1;
@@ -42019,37 +42048,44 @@ var measureTurnAudio = (chunks, format) => {
42019
42048
  }
42020
42049
  }
42021
42050
  if (voicedSamples === 0) {
42022
- return { durationMs: 0, rms: 0 };
42051
+ return { rms: 0, voicedMs: 0 };
42023
42052
  }
42024
42053
  return {
42025
- durationMs: voicedSamples / channels / sampleRate * 1000,
42026
- rms: Math.sqrt(sumSquares / voicedSamples)
42054
+ rms: Math.sqrt(sumSquares / voicedSamples),
42055
+ voicedMs: voicedSamples / channels / sampleRate * 1000
42027
42056
  };
42028
42057
  };
42029
42058
  var createAcousticBargeInDetector = (options = {}) => {
42030
- const sustainedMs = options.sustainedMs ?? 700;
42031
- const emphaticRms = options.emphaticRms ?? 0.16;
42032
- const noiseFloorRms = options.noiseFloorRms ?? 0.035;
42059
+ const sustainedMs = options.sustainedMs ?? 600;
42060
+ const interruptionCues = [
42061
+ ...DEFAULT_INTERRUPTION_CUES,
42062
+ ...options.interruptionCues ?? []
42063
+ ];
42033
42064
  return {
42034
42065
  evaluate: (input) => {
42066
+ const isInterruptionCue = startsWithCue(input.partialText, interruptionCues);
42035
42067
  const { turnAudio, turnAudioFormat } = input;
42036
42068
  if (!turnAudio || turnAudio.length === 0 || !turnAudioFormat) {
42037
- return input.isBackchannelByText ? { reason: "text_backchannel", shouldCancel: false } : { reason: "text_only", shouldCancel: true };
42069
+ if (input.isBackchannelByText) {
42070
+ return { reason: "text_backchannel", shouldCancel: false };
42071
+ }
42072
+ return {
42073
+ reason: isInterruptionCue ? "text_interruption" : "text_only",
42074
+ shouldCancel: true
42075
+ };
42038
42076
  }
42039
- const { durationMs, rms } = measureTurnAudio(turnAudio, turnAudioFormat);
42040
- if (durationMs >= sustainedMs) {
42041
- return { reason: "acoustic_sustained", shouldCancel: true };
42077
+ const metrics = measureTurnAudio(turnAudio, turnAudioFormat);
42078
+ metrics.voicedMs = Math.round(metrics.voicedMs);
42079
+ if (metrics.voicedMs >= sustainedMs) {
42080
+ return { metrics, reason: "acoustic_sustained", shouldCancel: true };
42042
42081
  }
42043
42082
  if (input.isBackchannelByText) {
42044
- return { reason: "acoustic_backchannel", shouldCancel: false };
42045
- }
42046
- if (rms >= emphaticRms) {
42047
- return { reason: "acoustic_emphatic", shouldCancel: true };
42083
+ return { metrics, reason: "acoustic_backchannel", shouldCancel: false };
42048
42084
  }
42049
- if (rms <= noiseFloorRms) {
42050
- return { reason: "acoustic_noise_floor", shouldCancel: false };
42085
+ if (isInterruptionCue) {
42086
+ return { metrics, reason: "acoustic_interruption", shouldCancel: true };
42051
42087
  }
42052
- return { reason: "acoustic_ambiguous", shouldCancel: true };
42088
+ return { metrics, reason: "acoustic_hold", shouldCancel: false };
42053
42089
  }
42054
42090
  };
42055
42091
  };
@@ -51778,7 +51814,7 @@ var buildVoiceAgentPerformanceReport = (input) => {
51778
51814
  };
51779
51815
  };
51780
51816
  // src/core/scorecardCalibration.ts
51781
- var normalize = (raw, scaleMax) => scaleMax === 0 ? 0 : raw / scaleMax;
51817
+ var normalize2 = (raw, scaleMax) => scaleMax === 0 ? 0 : raw / scaleMax;
51782
51818
  var correlation = (xs, ys) => {
51783
51819
  if (xs.length === 0 || xs.length !== ys.length)
51784
51820
  return 0;
@@ -51837,8 +51873,8 @@ var computeVoiceScorecardCalibration = (pairs, options = {}) => {
51837
51873
  const l = llmByCriterion.get(criterionId);
51838
51874
  if (!h || !l)
51839
51875
  continue;
51840
- const hn = normalize(h.score, pair.human.scaleMax);
51841
- const ln = normalize(l.score, pair.llm.scaleMax);
51876
+ const hn = normalize2(h.score, pair.human.scaleMax);
51877
+ const ln = normalize2(l.score, pair.llm.scaleMax);
51842
51878
  const gap = Math.abs(hn - ln);
51843
51879
  allGaps.push(gap);
51844
51880
  divergences.push({
@@ -133,6 +133,7 @@ export type TwilioMediaStreamBridgeOptions<TContext = unknown, TSession extends
133
133
  * snappy responses on clear-cut answers. See VoiceSemanticTurnDetector.
134
134
  */
135
135
  semanticTurnDetector?: import("../core/semanticTurn").VoiceSemanticTurnDetector;
136
+ bargeInDetector?: import("../core/bargeInDetector").VoiceBargeInDetector;
136
137
  /**
137
138
  * Pre-rendered filler phrases ("Hmm.", "Got it.", "Let me think.") played
138
139
  * in the gap between user-turn-commit and real assistant audio. Boardy's
@@ -7514,12 +7514,17 @@ var createVoiceSession = (options) => {
7514
7514
  partialText: triggeringText,
7515
7515
  wordCount,
7516
7516
  ...getTurnAudioForDetector()
7517
- })) : { reason: undefined, shouldCancel: !isBackchannelByText };
7517
+ })) : {
7518
+ metrics: undefined,
7519
+ reason: undefined,
7520
+ shouldCancel: !isBackchannelByText
7521
+ };
7518
7522
  const reason = verdict.reason ?? (verdict.shouldCancel ? "stt_partial" : "backchannel");
7519
7523
  if (verdict.shouldCancel) {
7520
7524
  backchannelSuppressedAt = null;
7521
7525
  appendTurnLatencyStage({
7522
7526
  metadata: {
7527
+ ...verdict.metrics,
7523
7528
  partial: triggeringText.slice(0, 200),
7524
7529
  source: reason,
7525
7530
  wordCount
@@ -7532,6 +7537,7 @@ var createVoiceSession = (options) => {
7532
7537
  backchannelSuppressedAt = Date.now();
7533
7538
  appendTurnLatencyStage({
7534
7539
  metadata: {
7540
+ ...verdict.metrics,
7535
7541
  partial: triggeringText.slice(0, 200),
7536
7542
  reason,
7537
7543
  wordCount
@@ -14238,6 +14244,7 @@ var createTwilioMediaStreamBridge = (socket, options) => {
14238
14244
  sttFallback: resolveSTTFallbackConfig(options.sttFallback),
14239
14245
  sttLifecycle: options.sttLifecycle ?? runtimePreset.sttLifecycle,
14240
14246
  ...options.semanticTurnDetector ? { semanticTurnDetector: options.semanticTurnDetector } : {},
14247
+ ...options.bargeInDetector ? { bargeInDetector: options.bargeInDetector } : {},
14241
14248
  ...options.fillerPhrases ? { fillerPhrases: options.fillerPhrases } : {},
14242
14249
  ...options.fillerDelayMs !== undefined ? { fillerDelayMs: options.fillerDelayMs } : {},
14243
14250
  ...options.bargeInMinPartialWords !== undefined ? { bargeInMinPartialWords: options.bargeInMinPartialWords } : {},
package/package.json CHANGED
@@ -1,6 +1,6 @@
1
1
  {
2
2
  "name": "@absolutejs/voice",
3
- "version": "0.0.22-beta.612",
3
+ "version": "0.0.22-beta.614",
4
4
  "description": "Voice primitives and Elysia plugin for AbsoluteJS",
5
5
  "repository": {
6
6
  "type": "git",