@absolutejs/voice 0.0.22-beta.613 → 0.0.22-beta.615

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -17,7 +17,7 @@ export type VoiceBargeInVerdict = {
17
17
  reason?: string;
18
18
  /**
19
19
  * The acoustic measurements the decision used, surfaced on the trace for
20
- * tuning the thresholds against real audio. Omitted when no audio was judged.
20
+ * tuning against real audio. Omitted when no audio was judged.
21
21
  */
22
22
  metrics?: {
23
23
  voicedMs: number;
@@ -28,22 +28,24 @@ export type VoiceBargeInDetector = {
28
28
  evaluate: (input: VoiceBargeInInput) => Promise<VoiceBargeInVerdict> | VoiceBargeInVerdict;
29
29
  };
30
30
  export type CreateAcousticBargeInDetectorOptions = {
31
- /** Speech sustained this long (ms) is a real interruption regardless of text/energy. */
31
+ /** Voiced speech sustained this long (ms) is a real interruption → cancel. */
32
32
  sustainedMs?: number;
33
- /** RMS (0-1) at/above this is an emphatic onset ("Wait!") — cancel even if short. */
34
- emphaticRms?: number;
35
- /** Below this RMS (0-1) a short burst is incidental noise — keep talking. */
36
- noiseFloorRms?: number;
33
+ /**
34
+ * Leading words that mark an interruption ("wait", "hold on", "sorry"). A
35
+ * short utterance starting with one cancels immediately instead of holding.
36
+ * Extends (does not replace) the defaults.
37
+ */
38
+ interruptionCues?: string[];
37
39
  };
38
40
  /**
39
- * A model-free acoustic backchannel-vs-barge-in classifier. Combines the user's
40
- * speech duration + onset energy with the text backchannel signal:
41
- * - sustained speech → real interruption (cancel)
42
- * - known cue word, stayed short → backchannel (keep talking)
43
- * - short but loud/sharp onset → emphatic interruption like "Wait!" (cancel)
44
- * - short + quiet → incidental noise (keep talking)
45
- * - short + moderate, real words → ambiguous, default to cancel (don't strand
46
- * a genuine short interruption)
41
+ * A model-free backchannel-vs-barge-in classifier driven by TEXT + PERSISTENCE
42
+ * (energy was measured to not discriminate on normalized speech):
43
+ * - voiced speech past `sustainedMs` → real interruption (cancel)
44
+ * - known backchannel cue, still short → backchannel (keep talking)
45
+ * - starts with an interruption cue → caller took the floor (cancel)
46
+ * - otherwise short + ambiguous → HOLD: keep talking; a continuing
47
+ * utterance cancels itself once its
48
+ * voiced duration crosses sustainedMs
47
49
  * Runs in-process on raw arithmetic — no model, no sidecar.
48
50
  */
49
51
  export declare const createAcousticBargeInDetector: (options?: CreateAcousticBargeInDetectorOptions) => VoiceBargeInDetector;
@@ -44,21 +44,25 @@ export type VoiceCostBreakdown = {
44
44
  cachedInputTokens: number;
45
45
  inputTokens: number;
46
46
  outputTokens: number;
47
+ provider?: string;
47
48
  usd: number;
48
49
  };
49
50
  sessionId?: string;
50
51
  stt: {
51
52
  audioMs: number;
53
+ provider?: string;
52
54
  usd: number;
53
55
  };
54
56
  telephony: {
55
57
  minutes: number;
58
+ provider?: string;
56
59
  usd: number;
57
60
  };
58
61
  totalUsd: number;
59
62
  tts: {
60
63
  audioMs: number;
61
64
  characters: number;
65
+ provider?: string;
62
66
  usd: number;
63
67
  };
64
68
  };
package/dist/index.js CHANGED
@@ -25354,6 +25354,7 @@ var createTwilioMediaStreamBridge = (socket, options) => {
25354
25354
  sttFallback: resolveSTTFallbackConfig(options.sttFallback),
25355
25355
  sttLifecycle: options.sttLifecycle ?? runtimePreset.sttLifecycle,
25356
25356
  ...options.semanticTurnDetector ? { semanticTurnDetector: options.semanticTurnDetector } : {},
25357
+ ...options.bargeInDetector ? { bargeInDetector: options.bargeInDetector } : {},
25357
25358
  ...options.fillerPhrases ? { fillerPhrases: options.fillerPhrases } : {},
25358
25359
  ...options.fillerDelayMs !== undefined ? { fillerDelayMs: options.fillerDelayMs } : {},
25359
25360
  ...options.bargeInMinPartialWords !== undefined ? { bargeInMinPartialWords: options.bargeInMinPartialWords } : {},
@@ -41865,6 +41866,30 @@ var createVoiceCostAccountant = (options = {}) => {
41865
41866
  let sttUsd = 0;
41866
41867
  let telephonyMinutes = 0;
41867
41868
  let telephonyUsd = 0;
41869
+ const llmByProvider = new Map;
41870
+ const ttsByProvider = new Map;
41871
+ const sttByProvider = new Map;
41872
+ const telephonyByProvider = new Map;
41873
+ let lastLlmProvider;
41874
+ let lastTtsProvider;
41875
+ let lastSttProvider;
41876
+ let lastTelephonyProvider;
41877
+ const addProvider = (byProvider, provider, usd) => {
41878
+ if (!provider)
41879
+ return;
41880
+ byProvider.set(provider, (byProvider.get(provider) ?? 0) + usd);
41881
+ };
41882
+ const dominant = (byProvider, fallback) => {
41883
+ let best;
41884
+ let bestUsd = -1;
41885
+ for (const [provider, usd] of byProvider) {
41886
+ if (usd > bestUsd) {
41887
+ bestUsd = usd;
41888
+ best = provider;
41889
+ }
41890
+ }
41891
+ return best ?? fallback;
41892
+ };
41868
41893
  return {
41869
41894
  recordLLM: (usage) => {
41870
41895
  const input = usage.inputTokens ?? 0;
@@ -41873,64 +41898,85 @@ var createVoiceCostAccountant = (options = {}) => {
41873
41898
  llmInput += input;
41874
41899
  llmCachedInput += cached;
41875
41900
  llmOutput += output;
41901
+ if (usage.provider)
41902
+ lastLlmProvider = usage.provider;
41876
41903
  const rates = lookupRates(priceBook, usage.provider, usage.model)?.llm;
41877
41904
  if (!rates) {
41878
41905
  return;
41879
41906
  }
41880
41907
  const cachedRate = rates.cachedInputPerMillionTokensUsd ?? rates.inputPerMillionTokensUsd;
41881
- llmUsd += Math.max(0, input - cached) * rates.inputPerMillionTokensUsd / 1e6 + cached * cachedRate / 1e6 + output * rates.outputPerMillionTokensUsd / 1e6;
41908
+ const delta = Math.max(0, input - cached) * rates.inputPerMillionTokensUsd / 1e6 + cached * cachedRate / 1e6 + output * rates.outputPerMillionTokensUsd / 1e6;
41909
+ llmUsd += delta;
41910
+ addProvider(llmByProvider, usage.provider, delta);
41882
41911
  },
41883
41912
  recordSTT: (input) => {
41884
41913
  sttAudioMs += Math.max(0, input.audioMs);
41914
+ if (input.provider)
41915
+ lastSttProvider = input.provider;
41885
41916
  const rates = lookupRates(priceBook, input.provider, input.model)?.stt;
41886
41917
  if (!rates) {
41887
41918
  return;
41888
41919
  }
41889
- sttUsd += Math.max(0, input.audioMs) / 1000 * rates.perSecondUsd;
41920
+ const delta = Math.max(0, input.audioMs) / 1000 * rates.perSecondUsd;
41921
+ sttUsd += delta;
41922
+ addProvider(sttByProvider, input.provider, delta);
41890
41923
  },
41891
41924
  recordTelephony: (input) => {
41892
41925
  telephonyMinutes += Math.max(0, input.minutes);
41926
+ if (input.provider)
41927
+ lastTelephonyProvider = input.provider;
41893
41928
  const rates = lookupRates(priceBook, input.provider)?.telephony;
41894
41929
  if (!rates) {
41895
41930
  return;
41896
41931
  }
41897
- telephonyUsd += Math.max(0, input.minutes) * rates.perMinuteUsd;
41932
+ const delta = Math.max(0, input.minutes) * rates.perMinuteUsd;
41933
+ telephonyUsd += delta;
41934
+ addProvider(telephonyByProvider, input.provider, delta);
41898
41935
  },
41899
41936
  recordTTS: (input) => {
41900
41937
  const chars = input.characters ?? 0;
41901
41938
  const audioMs = input.audioMs ?? 0;
41902
41939
  ttsCharacters += chars;
41903
41940
  ttsAudioMs += audioMs;
41941
+ if (input.provider)
41942
+ lastTtsProvider = input.provider;
41904
41943
  const rates = lookupRates(priceBook, input.provider, input.voice)?.tts;
41905
41944
  if (!rates) {
41906
41945
  return;
41907
41946
  }
41947
+ let delta = 0;
41908
41948
  if (rates.perMillionCharactersUsd !== undefined && chars > 0) {
41909
- ttsUsd += chars * rates.perMillionCharactersUsd / 1e6;
41949
+ delta = chars * rates.perMillionCharactersUsd / 1e6;
41910
41950
  } else if (rates.perSecondUsd !== undefined && audioMs > 0) {
41911
- ttsUsd += audioMs / 1000 * rates.perSecondUsd;
41951
+ delta = audioMs / 1000 * rates.perSecondUsd;
41912
41952
  }
41953
+ ttsUsd += delta;
41954
+ addProvider(ttsByProvider, input.provider, delta);
41913
41955
  },
41914
41956
  snapshot: () => ({
41915
41957
  llm: {
41916
41958
  cachedInputTokens: llmCachedInput,
41917
41959
  inputTokens: llmInput,
41918
41960
  outputTokens: llmOutput,
41961
+ provider: dominant(llmByProvider, lastLlmProvider),
41919
41962
  usd: Math.round(llmUsd * 1e6) / 1e6
41920
41963
  },
41921
41964
  sessionId: options.sessionId,
41922
41965
  stt: {
41923
41966
  audioMs: sttAudioMs,
41967
+ provider: dominant(sttByProvider, lastSttProvider),
41924
41968
  usd: Math.round(sttUsd * 1e6) / 1e6
41925
41969
  },
41926
41970
  telephony: {
41927
41971
  minutes: telephonyMinutes,
41972
+ provider: dominant(telephonyByProvider, lastTelephonyProvider),
41928
41973
  usd: Math.round(telephonyUsd * 1e6) / 1e6
41929
41974
  },
41930
41975
  totalUsd: Math.round((llmUsd + ttsUsd + sttUsd + telephonyUsd) * 1e6) / 1e6,
41931
41976
  tts: {
41932
41977
  audioMs: ttsAudioMs,
41933
41978
  characters: ttsCharacters,
41979
+ provider: dominant(ttsByProvider, lastTtsProvider),
41934
41980
  usd: Math.round(ttsUsd * 1e6) / 1e6
41935
41981
  }
41936
41982
  })
@@ -42007,6 +42053,28 @@ var createRegexSemanticTurnDetector = (options) => {
42007
42053
  };
42008
42054
  };
42009
42055
  // src/core/bargeInDetector.ts
42056
+ var DEFAULT_INTERRUPTION_CUES = [
42057
+ "wait",
42058
+ "hold on",
42059
+ "hold up",
42060
+ "hang on",
42061
+ "stop",
42062
+ "sorry",
42063
+ "excuse me",
42064
+ "actually",
42065
+ "one sec",
42066
+ "one second",
42067
+ "quick question",
42068
+ "question",
42069
+ "can i",
42070
+ "let me",
42071
+ "no no"
42072
+ ];
42073
+ var normalize = (text) => text.toLowerCase().replace(/[^a-z\s]/g, " ").replace(/\s+/g, " ").trim();
42074
+ var startsWithCue = (text, cues) => {
42075
+ const norm = normalize(text);
42076
+ return cues.some((cue) => norm === cue || norm.startsWith(`${cue} `));
42077
+ };
42010
42078
  var VOICED_FLOOR = 0.02;
42011
42079
  var measureTurnAudio = (chunks, format) => {
42012
42080
  const channels = format.channels ?? 1;
@@ -42025,38 +42093,44 @@ var measureTurnAudio = (chunks, format) => {
42025
42093
  }
42026
42094
  }
42027
42095
  if (voicedSamples === 0) {
42028
- return { durationMs: 0, rms: 0 };
42096
+ return { rms: 0, voicedMs: 0 };
42029
42097
  }
42030
42098
  return {
42031
- durationMs: voicedSamples / channels / sampleRate * 1000,
42032
- rms: Math.sqrt(sumSquares / voicedSamples)
42099
+ rms: Math.sqrt(sumSquares / voicedSamples),
42100
+ voicedMs: voicedSamples / channels / sampleRate * 1000
42033
42101
  };
42034
42102
  };
42035
42103
  var createAcousticBargeInDetector = (options = {}) => {
42036
- const sustainedMs = options.sustainedMs ?? 700;
42037
- const emphaticRms = options.emphaticRms ?? 0.16;
42038
- const noiseFloorRms = options.noiseFloorRms ?? 0.035;
42104
+ const sustainedMs = options.sustainedMs ?? 600;
42105
+ const interruptionCues = [
42106
+ ...DEFAULT_INTERRUPTION_CUES,
42107
+ ...options.interruptionCues ?? []
42108
+ ];
42039
42109
  return {
42040
42110
  evaluate: (input) => {
42111
+ const isInterruptionCue = startsWithCue(input.partialText, interruptionCues);
42041
42112
  const { turnAudio, turnAudioFormat } = input;
42042
42113
  if (!turnAudio || turnAudio.length === 0 || !turnAudioFormat) {
42043
- return input.isBackchannelByText ? { reason: "text_backchannel", shouldCancel: false } : { reason: "text_only", shouldCancel: true };
42114
+ if (input.isBackchannelByText) {
42115
+ return { reason: "text_backchannel", shouldCancel: false };
42116
+ }
42117
+ return {
42118
+ reason: isInterruptionCue ? "text_interruption" : "text_only",
42119
+ shouldCancel: true
42120
+ };
42044
42121
  }
42045
- const { durationMs, rms } = measureTurnAudio(turnAudio, turnAudioFormat);
42046
- const metrics = { rms, voicedMs: Math.round(durationMs) };
42047
- if (durationMs >= sustainedMs) {
42122
+ const metrics = measureTurnAudio(turnAudio, turnAudioFormat);
42123
+ metrics.voicedMs = Math.round(metrics.voicedMs);
42124
+ if (metrics.voicedMs >= sustainedMs) {
42048
42125
  return { metrics, reason: "acoustic_sustained", shouldCancel: true };
42049
42126
  }
42050
42127
  if (input.isBackchannelByText) {
42051
42128
  return { metrics, reason: "acoustic_backchannel", shouldCancel: false };
42052
42129
  }
42053
- if (rms >= emphaticRms) {
42054
- return { metrics, reason: "acoustic_emphatic", shouldCancel: true };
42055
- }
42056
- if (rms <= noiseFloorRms) {
42057
- return { metrics, reason: "acoustic_noise_floor", shouldCancel: false };
42130
+ if (isInterruptionCue) {
42131
+ return { metrics, reason: "acoustic_interruption", shouldCancel: true };
42058
42132
  }
42059
- return { metrics, reason: "acoustic_ambiguous", shouldCancel: true };
42133
+ return { metrics, reason: "acoustic_hold", shouldCancel: false };
42060
42134
  }
42061
42135
  };
42062
42136
  };
@@ -51785,7 +51859,7 @@ var buildVoiceAgentPerformanceReport = (input) => {
51785
51859
  };
51786
51860
  };
51787
51861
  // src/core/scorecardCalibration.ts
51788
- var normalize = (raw, scaleMax) => scaleMax === 0 ? 0 : raw / scaleMax;
51862
+ var normalize2 = (raw, scaleMax) => scaleMax === 0 ? 0 : raw / scaleMax;
51789
51863
  var correlation = (xs, ys) => {
51790
51864
  if (xs.length === 0 || xs.length !== ys.length)
51791
51865
  return 0;
@@ -51844,8 +51918,8 @@ var computeVoiceScorecardCalibration = (pairs, options = {}) => {
51844
51918
  const l = llmByCriterion.get(criterionId);
51845
51919
  if (!h || !l)
51846
51920
  continue;
51847
- const hn = normalize(h.score, pair.human.scaleMax);
51848
- const ln = normalize(l.score, pair.llm.scaleMax);
51921
+ const hn = normalize2(h.score, pair.human.scaleMax);
51922
+ const ln = normalize2(l.score, pair.llm.scaleMax);
51849
51923
  const gap = Math.abs(hn - ln);
51850
51924
  allGaps.push(gap);
51851
51925
  divergences.push({
@@ -133,6 +133,7 @@ export type TwilioMediaStreamBridgeOptions<TContext = unknown, TSession extends
133
133
  * snappy responses on clear-cut answers. See VoiceSemanticTurnDetector.
134
134
  */
135
135
  semanticTurnDetector?: import("../core/semanticTurn").VoiceSemanticTurnDetector;
136
+ bargeInDetector?: import("../core/bargeInDetector").VoiceBargeInDetector;
136
137
  /**
137
138
  * Pre-rendered filler phrases ("Hmm.", "Got it.", "Let me think.") played
138
139
  * in the gap between user-turn-commit and real assistant audio. Boardy's
@@ -14244,6 +14244,7 @@ var createTwilioMediaStreamBridge = (socket, options) => {
14244
14244
  sttFallback: resolveSTTFallbackConfig(options.sttFallback),
14245
14245
  sttLifecycle: options.sttLifecycle ?? runtimePreset.sttLifecycle,
14246
14246
  ...options.semanticTurnDetector ? { semanticTurnDetector: options.semanticTurnDetector } : {},
14247
+ ...options.bargeInDetector ? { bargeInDetector: options.bargeInDetector } : {},
14247
14248
  ...options.fillerPhrases ? { fillerPhrases: options.fillerPhrases } : {},
14248
14249
  ...options.fillerDelayMs !== undefined ? { fillerDelayMs: options.fillerDelayMs } : {},
14249
14250
  ...options.bargeInMinPartialWords !== undefined ? { bargeInMinPartialWords: options.bargeInMinPartialWords } : {},
package/package.json CHANGED
@@ -1,6 +1,6 @@
1
1
  {
2
2
  "name": "@absolutejs/voice",
3
- "version": "0.0.22-beta.613",
3
+ "version": "0.0.22-beta.615",
4
4
  "description": "Voice primitives and Elysia plugin for AbsoluteJS",
5
5
  "repository": {
6
6
  "type": "git",