@absolutejs/voice 0.0.22-beta.613 → 0.0.22-beta.615
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/dist/core/bargeInDetector.d.ts +16 -14
- package/dist/core/costAccounting.d.ts +4 -0
- package/dist/index.js +98 -24
- package/dist/telephony/twilio.d.ts +1 -0
- package/dist/testing/index.js +1 -0
- package/package.json +1 -1
|
@@ -17,7 +17,7 @@ export type VoiceBargeInVerdict = {
|
|
|
17
17
|
reason?: string;
|
|
18
18
|
/**
|
|
19
19
|
* The acoustic measurements the decision used, surfaced on the trace for
|
|
20
|
-
* tuning
|
|
20
|
+
* tuning against real audio. Omitted when no audio was judged.
|
|
21
21
|
*/
|
|
22
22
|
metrics?: {
|
|
23
23
|
voicedMs: number;
|
|
@@ -28,22 +28,24 @@ export type VoiceBargeInDetector = {
|
|
|
28
28
|
evaluate: (input: VoiceBargeInInput) => Promise<VoiceBargeInVerdict> | VoiceBargeInVerdict;
|
|
29
29
|
};
|
|
30
30
|
export type CreateAcousticBargeInDetectorOptions = {
|
|
31
|
-
/**
|
|
31
|
+
/** Voiced speech sustained this long (ms) is a real interruption — cancel. */
|
|
32
32
|
sustainedMs?: number;
|
|
33
|
-
/**
|
|
34
|
-
|
|
35
|
-
|
|
36
|
-
|
|
33
|
+
/**
|
|
34
|
+
* Leading words that mark an interruption ("wait", "hold on", "sorry"). A
|
|
35
|
+
* short utterance starting with one cancels immediately instead of holding.
|
|
36
|
+
* Extends (does not replace) the defaults.
|
|
37
|
+
*/
|
|
38
|
+
interruptionCues?: string[];
|
|
37
39
|
};
|
|
38
40
|
/**
|
|
39
|
-
* A model-free
|
|
40
|
-
*
|
|
41
|
-
* -
|
|
42
|
-
* - known cue
|
|
43
|
-
* -
|
|
44
|
-
* - short +
|
|
45
|
-
*
|
|
46
|
-
*
|
|
41
|
+
* A model-free backchannel-vs-barge-in classifier driven by TEXT + PERSISTENCE
|
|
42
|
+
* (energy was measured to not discriminate on normalized speech):
|
|
43
|
+
* - voiced speech past `sustainedMs` → real interruption (cancel)
|
|
44
|
+
* - known backchannel cue, still short → backchannel (keep talking)
|
|
45
|
+
* - starts with an interruption cue → caller took the floor (cancel)
|
|
46
|
+
* - otherwise short + ambiguous → HOLD: keep talking; a continuing
|
|
47
|
+
* utterance cancels itself once its
|
|
48
|
+
* voiced duration crosses sustainedMs
|
|
47
49
|
* Runs in-process on raw arithmetic — no model, no sidecar.
|
|
48
50
|
*/
|
|
49
51
|
export declare const createAcousticBargeInDetector: (options?: CreateAcousticBargeInDetectorOptions) => VoiceBargeInDetector;
|
|
@@ -44,21 +44,25 @@ export type VoiceCostBreakdown = {
|
|
|
44
44
|
cachedInputTokens: number;
|
|
45
45
|
inputTokens: number;
|
|
46
46
|
outputTokens: number;
|
|
47
|
+
provider?: string;
|
|
47
48
|
usd: number;
|
|
48
49
|
};
|
|
49
50
|
sessionId?: string;
|
|
50
51
|
stt: {
|
|
51
52
|
audioMs: number;
|
|
53
|
+
provider?: string;
|
|
52
54
|
usd: number;
|
|
53
55
|
};
|
|
54
56
|
telephony: {
|
|
55
57
|
minutes: number;
|
|
58
|
+
provider?: string;
|
|
56
59
|
usd: number;
|
|
57
60
|
};
|
|
58
61
|
totalUsd: number;
|
|
59
62
|
tts: {
|
|
60
63
|
audioMs: number;
|
|
61
64
|
characters: number;
|
|
65
|
+
provider?: string;
|
|
62
66
|
usd: number;
|
|
63
67
|
};
|
|
64
68
|
};
|
package/dist/index.js
CHANGED
|
@@ -25354,6 +25354,7 @@ var createTwilioMediaStreamBridge = (socket, options) => {
|
|
|
25354
25354
|
sttFallback: resolveSTTFallbackConfig(options.sttFallback),
|
|
25355
25355
|
sttLifecycle: options.sttLifecycle ?? runtimePreset.sttLifecycle,
|
|
25356
25356
|
...options.semanticTurnDetector ? { semanticTurnDetector: options.semanticTurnDetector } : {},
|
|
25357
|
+
...options.bargeInDetector ? { bargeInDetector: options.bargeInDetector } : {},
|
|
25357
25358
|
...options.fillerPhrases ? { fillerPhrases: options.fillerPhrases } : {},
|
|
25358
25359
|
...options.fillerDelayMs !== undefined ? { fillerDelayMs: options.fillerDelayMs } : {},
|
|
25359
25360
|
...options.bargeInMinPartialWords !== undefined ? { bargeInMinPartialWords: options.bargeInMinPartialWords } : {},
|
|
@@ -41865,6 +41866,30 @@ var createVoiceCostAccountant = (options = {}) => {
|
|
|
41865
41866
|
let sttUsd = 0;
|
|
41866
41867
|
let telephonyMinutes = 0;
|
|
41867
41868
|
let telephonyUsd = 0;
|
|
41869
|
+
const llmByProvider = new Map;
|
|
41870
|
+
const ttsByProvider = new Map;
|
|
41871
|
+
const sttByProvider = new Map;
|
|
41872
|
+
const telephonyByProvider = new Map;
|
|
41873
|
+
let lastLlmProvider;
|
|
41874
|
+
let lastTtsProvider;
|
|
41875
|
+
let lastSttProvider;
|
|
41876
|
+
let lastTelephonyProvider;
|
|
41877
|
+
const addProvider = (byProvider, provider, usd) => {
|
|
41878
|
+
if (!provider)
|
|
41879
|
+
return;
|
|
41880
|
+
byProvider.set(provider, (byProvider.get(provider) ?? 0) + usd);
|
|
41881
|
+
};
|
|
41882
|
+
const dominant = (byProvider, fallback) => {
|
|
41883
|
+
let best;
|
|
41884
|
+
let bestUsd = -1;
|
|
41885
|
+
for (const [provider, usd] of byProvider) {
|
|
41886
|
+
if (usd > bestUsd) {
|
|
41887
|
+
bestUsd = usd;
|
|
41888
|
+
best = provider;
|
|
41889
|
+
}
|
|
41890
|
+
}
|
|
41891
|
+
return best ?? fallback;
|
|
41892
|
+
};
|
|
41868
41893
|
return {
|
|
41869
41894
|
recordLLM: (usage) => {
|
|
41870
41895
|
const input = usage.inputTokens ?? 0;
|
|
@@ -41873,64 +41898,85 @@ var createVoiceCostAccountant = (options = {}) => {
|
|
|
41873
41898
|
llmInput += input;
|
|
41874
41899
|
llmCachedInput += cached;
|
|
41875
41900
|
llmOutput += output;
|
|
41901
|
+
if (usage.provider)
|
|
41902
|
+
lastLlmProvider = usage.provider;
|
|
41876
41903
|
const rates = lookupRates(priceBook, usage.provider, usage.model)?.llm;
|
|
41877
41904
|
if (!rates) {
|
|
41878
41905
|
return;
|
|
41879
41906
|
}
|
|
41880
41907
|
const cachedRate = rates.cachedInputPerMillionTokensUsd ?? rates.inputPerMillionTokensUsd;
|
|
41881
|
-
|
|
41908
|
+
const delta = Math.max(0, input - cached) * rates.inputPerMillionTokensUsd / 1e6 + cached * cachedRate / 1e6 + output * rates.outputPerMillionTokensUsd / 1e6;
|
|
41909
|
+
llmUsd += delta;
|
|
41910
|
+
addProvider(llmByProvider, usage.provider, delta);
|
|
41882
41911
|
},
|
|
41883
41912
|
recordSTT: (input) => {
|
|
41884
41913
|
sttAudioMs += Math.max(0, input.audioMs);
|
|
41914
|
+
if (input.provider)
|
|
41915
|
+
lastSttProvider = input.provider;
|
|
41885
41916
|
const rates = lookupRates(priceBook, input.provider, input.model)?.stt;
|
|
41886
41917
|
if (!rates) {
|
|
41887
41918
|
return;
|
|
41888
41919
|
}
|
|
41889
|
-
|
|
41920
|
+
const delta = Math.max(0, input.audioMs) / 1000 * rates.perSecondUsd;
|
|
41921
|
+
sttUsd += delta;
|
|
41922
|
+
addProvider(sttByProvider, input.provider, delta);
|
|
41890
41923
|
},
|
|
41891
41924
|
recordTelephony: (input) => {
|
|
41892
41925
|
telephonyMinutes += Math.max(0, input.minutes);
|
|
41926
|
+
if (input.provider)
|
|
41927
|
+
lastTelephonyProvider = input.provider;
|
|
41893
41928
|
const rates = lookupRates(priceBook, input.provider)?.telephony;
|
|
41894
41929
|
if (!rates) {
|
|
41895
41930
|
return;
|
|
41896
41931
|
}
|
|
41897
|
-
|
|
41932
|
+
const delta = Math.max(0, input.minutes) * rates.perMinuteUsd;
|
|
41933
|
+
telephonyUsd += delta;
|
|
41934
|
+
addProvider(telephonyByProvider, input.provider, delta);
|
|
41898
41935
|
},
|
|
41899
41936
|
recordTTS: (input) => {
|
|
41900
41937
|
const chars = input.characters ?? 0;
|
|
41901
41938
|
const audioMs = input.audioMs ?? 0;
|
|
41902
41939
|
ttsCharacters += chars;
|
|
41903
41940
|
ttsAudioMs += audioMs;
|
|
41941
|
+
if (input.provider)
|
|
41942
|
+
lastTtsProvider = input.provider;
|
|
41904
41943
|
const rates = lookupRates(priceBook, input.provider, input.voice)?.tts;
|
|
41905
41944
|
if (!rates) {
|
|
41906
41945
|
return;
|
|
41907
41946
|
}
|
|
41947
|
+
let delta = 0;
|
|
41908
41948
|
if (rates.perMillionCharactersUsd !== undefined && chars > 0) {
|
|
41909
|
-
|
|
41949
|
+
delta = chars * rates.perMillionCharactersUsd / 1e6;
|
|
41910
41950
|
} else if (rates.perSecondUsd !== undefined && audioMs > 0) {
|
|
41911
|
-
|
|
41951
|
+
delta = audioMs / 1000 * rates.perSecondUsd;
|
|
41912
41952
|
}
|
|
41953
|
+
ttsUsd += delta;
|
|
41954
|
+
addProvider(ttsByProvider, input.provider, delta);
|
|
41913
41955
|
},
|
|
41914
41956
|
snapshot: () => ({
|
|
41915
41957
|
llm: {
|
|
41916
41958
|
cachedInputTokens: llmCachedInput,
|
|
41917
41959
|
inputTokens: llmInput,
|
|
41918
41960
|
outputTokens: llmOutput,
|
|
41961
|
+
provider: dominant(llmByProvider, lastLlmProvider),
|
|
41919
41962
|
usd: Math.round(llmUsd * 1e6) / 1e6
|
|
41920
41963
|
},
|
|
41921
41964
|
sessionId: options.sessionId,
|
|
41922
41965
|
stt: {
|
|
41923
41966
|
audioMs: sttAudioMs,
|
|
41967
|
+
provider: dominant(sttByProvider, lastSttProvider),
|
|
41924
41968
|
usd: Math.round(sttUsd * 1e6) / 1e6
|
|
41925
41969
|
},
|
|
41926
41970
|
telephony: {
|
|
41927
41971
|
minutes: telephonyMinutes,
|
|
41972
|
+
provider: dominant(telephonyByProvider, lastTelephonyProvider),
|
|
41928
41973
|
usd: Math.round(telephonyUsd * 1e6) / 1e6
|
|
41929
41974
|
},
|
|
41930
41975
|
totalUsd: Math.round((llmUsd + ttsUsd + sttUsd + telephonyUsd) * 1e6) / 1e6,
|
|
41931
41976
|
tts: {
|
|
41932
41977
|
audioMs: ttsAudioMs,
|
|
41933
41978
|
characters: ttsCharacters,
|
|
41979
|
+
provider: dominant(ttsByProvider, lastTtsProvider),
|
|
41934
41980
|
usd: Math.round(ttsUsd * 1e6) / 1e6
|
|
41935
41981
|
}
|
|
41936
41982
|
})
|
|
@@ -42007,6 +42053,28 @@ var createRegexSemanticTurnDetector = (options) => {
|
|
|
42007
42053
|
};
|
|
42008
42054
|
};
|
|
42009
42055
|
// src/core/bargeInDetector.ts
|
|
42056
|
+
var DEFAULT_INTERRUPTION_CUES = [
|
|
42057
|
+
"wait",
|
|
42058
|
+
"hold on",
|
|
42059
|
+
"hold up",
|
|
42060
|
+
"hang on",
|
|
42061
|
+
"stop",
|
|
42062
|
+
"sorry",
|
|
42063
|
+
"excuse me",
|
|
42064
|
+
"actually",
|
|
42065
|
+
"one sec",
|
|
42066
|
+
"one second",
|
|
42067
|
+
"quick question",
|
|
42068
|
+
"question",
|
|
42069
|
+
"can i",
|
|
42070
|
+
"let me",
|
|
42071
|
+
"no no"
|
|
42072
|
+
];
|
|
42073
|
+
/**
 * Canonical form used for cue matching: NFC-composed, lowercased, every
 * character that is not a letter (or combining mark) or whitespace replaced by
 * a space, whitespace runs collapsed, ends trimmed.
 *
 * Letters are matched Unicode-aware so non-English cues ("espérate") survive;
 * for ASCII-only input the result is the same as the previous `[^a-z\s]` rule.
 * Digits and punctuation are dropped.
 */
var normalize = (text) => text.normalize("NFC").toLowerCase().replace(/[^\p{L}\p{M}\s]/gu, " ").replace(/\s+/g, " ").trim();
/**
 * Reports whether `text` is exactly one of `cues`, or begins with one followed
 * by a word boundary ("wait" matches "Wait, what?" but not "waiting").
 *
 * Both sides are normalized before comparison, so caller-supplied cues such as
 * "Hold on!" match the same way the lowercase built-in cues do. Cues that
 * normalize to nothing are skipped (otherwise they would match empty text),
 * and a missing or non-string `text` never matches instead of throwing.
 *
 * @param text - partial transcript to inspect
 * @param cues - interruption cue phrases
 * @returns true when the normalized text starts with a normalized cue
 */
var startsWithCue = (text, cues) => {
  if (typeof text !== "string")
    return false;
  const norm = normalize(text);
  if (norm === "")
    return false;
  return cues.some((cue) => {
    // Normalize the cue as well: the text has already lost case and punctuation.
    const normCue = typeof cue === "string" ? normalize(cue) : "";
    return normCue !== "" && (norm === normCue || norm.startsWith(`${normCue} `));
  });
};
|
|
42010
42078
|
var VOICED_FLOOR = 0.02;
|
|
42011
42079
|
var measureTurnAudio = (chunks, format) => {
|
|
42012
42080
|
const channels = format.channels ?? 1;
|
|
@@ -42025,38 +42093,44 @@ var measureTurnAudio = (chunks, format) => {
|
|
|
42025
42093
|
}
|
|
42026
42094
|
}
|
|
42027
42095
|
if (voicedSamples === 0) {
|
|
42028
|
-
return {
|
|
42096
|
+
return { rms: 0, voicedMs: 0 };
|
|
42029
42097
|
}
|
|
42030
42098
|
return {
|
|
42031
|
-
|
|
42032
|
-
|
|
42099
|
+
rms: Math.sqrt(sumSquares / voicedSamples),
|
|
42100
|
+
voicedMs: voicedSamples / channels / sampleRate * 1000
|
|
42033
42101
|
};
|
|
42034
42102
|
};
|
|
42035
42103
|
var createAcousticBargeInDetector = (options = {}) => {
|
|
42036
|
-
const sustainedMs = options.sustainedMs ??
|
|
42037
|
-
const
|
|
42038
|
-
|
|
42104
|
+
const sustainedMs = options.sustainedMs ?? 600;
|
|
42105
|
+
const interruptionCues = [
|
|
42106
|
+
...DEFAULT_INTERRUPTION_CUES,
|
|
42107
|
+
...options.interruptionCues ?? []
|
|
42108
|
+
];
|
|
42039
42109
|
return {
|
|
42040
42110
|
evaluate: (input) => {
|
|
42111
|
+
const isInterruptionCue = startsWithCue(input.partialText, interruptionCues);
|
|
42041
42112
|
const { turnAudio, turnAudioFormat } = input;
|
|
42042
42113
|
if (!turnAudio || turnAudio.length === 0 || !turnAudioFormat) {
|
|
42043
|
-
|
|
42114
|
+
if (input.isBackchannelByText) {
|
|
42115
|
+
return { reason: "text_backchannel", shouldCancel: false };
|
|
42116
|
+
}
|
|
42117
|
+
return {
|
|
42118
|
+
reason: isInterruptionCue ? "text_interruption" : "text_only",
|
|
42119
|
+
shouldCancel: true
|
|
42120
|
+
};
|
|
42044
42121
|
}
|
|
42045
|
-
const
|
|
42046
|
-
|
|
42047
|
-
if (
|
|
42122
|
+
const metrics = measureTurnAudio(turnAudio, turnAudioFormat);
|
|
42123
|
+
metrics.voicedMs = Math.round(metrics.voicedMs);
|
|
42124
|
+
if (metrics.voicedMs >= sustainedMs) {
|
|
42048
42125
|
return { metrics, reason: "acoustic_sustained", shouldCancel: true };
|
|
42049
42126
|
}
|
|
42050
42127
|
if (input.isBackchannelByText) {
|
|
42051
42128
|
return { metrics, reason: "acoustic_backchannel", shouldCancel: false };
|
|
42052
42129
|
}
|
|
42053
|
-
if (
|
|
42054
|
-
return { metrics, reason: "
|
|
42055
|
-
}
|
|
42056
|
-
if (rms <= noiseFloorRms) {
|
|
42057
|
-
return { metrics, reason: "acoustic_noise_floor", shouldCancel: false };
|
|
42130
|
+
if (isInterruptionCue) {
|
|
42131
|
+
return { metrics, reason: "acoustic_interruption", shouldCancel: true };
|
|
42058
42132
|
}
|
|
42059
|
-
return { metrics, reason: "
|
|
42133
|
+
return { metrics, reason: "acoustic_hold", shouldCancel: false };
|
|
42060
42134
|
}
|
|
42061
42135
|
};
|
|
42062
42136
|
};
|
|
@@ -51785,7 +51859,7 @@ var buildVoiceAgentPerformanceReport = (input) => {
|
|
|
51785
51859
|
};
|
|
51786
51860
|
};
|
|
51787
51861
|
// src/core/scorecardCalibration.ts
|
|
51788
|
-
var
|
|
51862
|
+
/**
 * Expresses a raw score as a fraction of its scale's maximum so scores taken
 * on different scales can be compared. A scale maximum of exactly 0 yields 0
 * rather than dividing by zero.
 */
var normalize2 = (raw, scaleMax) => {
  if (scaleMax === 0) {
    return 0;
  }
  return raw / scaleMax;
};
|
|
51789
51863
|
var correlation = (xs, ys) => {
|
|
51790
51864
|
if (xs.length === 0 || xs.length !== ys.length)
|
|
51791
51865
|
return 0;
|
|
@@ -51844,8 +51918,8 @@ var computeVoiceScorecardCalibration = (pairs, options = {}) => {
|
|
|
51844
51918
|
const l = llmByCriterion.get(criterionId);
|
|
51845
51919
|
if (!h || !l)
|
|
51846
51920
|
continue;
|
|
51847
|
-
const hn =
|
|
51848
|
-
const ln =
|
|
51921
|
+
const hn = normalize2(h.score, pair.human.scaleMax);
|
|
51922
|
+
const ln = normalize2(l.score, pair.llm.scaleMax);
|
|
51849
51923
|
const gap = Math.abs(hn - ln);
|
|
51850
51924
|
allGaps.push(gap);
|
|
51851
51925
|
divergences.push({
|
|
@@ -133,6 +133,7 @@ export type TwilioMediaStreamBridgeOptions<TContext = unknown, TSession extends
|
|
|
133
133
|
* snappy responses on clear-cut answers. See VoiceSemanticTurnDetector.
|
|
134
134
|
*/
|
|
135
135
|
semanticTurnDetector?: import("../core/semanticTurn").VoiceSemanticTurnDetector;
|
|
136
|
+
bargeInDetector?: import("../core/bargeInDetector").VoiceBargeInDetector;
|
|
136
137
|
/**
|
|
137
138
|
* Pre-rendered filler phrases ("Hmm.", "Got it.", "Let me think.") played
|
|
138
139
|
* in the gap between user-turn-commit and real assistant audio. Boardy's
|
package/dist/testing/index.js
CHANGED
|
@@ -14244,6 +14244,7 @@ var createTwilioMediaStreamBridge = (socket, options) => {
|
|
|
14244
14244
|
sttFallback: resolveSTTFallbackConfig(options.sttFallback),
|
|
14245
14245
|
sttLifecycle: options.sttLifecycle ?? runtimePreset.sttLifecycle,
|
|
14246
14246
|
...options.semanticTurnDetector ? { semanticTurnDetector: options.semanticTurnDetector } : {},
|
|
14247
|
+
...options.bargeInDetector ? { bargeInDetector: options.bargeInDetector } : {},
|
|
14247
14248
|
...options.fillerPhrases ? { fillerPhrases: options.fillerPhrases } : {},
|
|
14248
14249
|
...options.fillerDelayMs !== undefined ? { fillerDelayMs: options.fillerDelayMs } : {},
|
|
14249
14250
|
...options.bargeInMinPartialWords !== undefined ? { bargeInMinPartialWords: options.bargeInMinPartialWords } : {},
|