@absolutejs/voice 0.0.22-beta.613 → 0.0.22-beta.614
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/dist/core/bargeInDetector.d.ts +16 -14
- package/dist/index.js +48 -19
- package/dist/telephony/twilio.d.ts +1 -0
- package/dist/testing/index.js +1 -0
- package/package.json +1 -1
|
@@ -17,7 +17,7 @@ export type VoiceBargeInVerdict = {
|
|
|
17
17
|
reason?: string;
|
|
18
18
|
/**
|
|
19
19
|
* The acoustic measurements the decision used, surfaced on the trace for
|
|
20
|
-
* tuning
|
|
20
|
+
* tuning against real audio. Omitted when no audio was judged.
|
|
21
21
|
*/
|
|
22
22
|
metrics?: {
|
|
23
23
|
voicedMs: number;
|
|
@@ -28,22 +28,24 @@ export type VoiceBargeInDetector = {
|
|
|
28
28
|
evaluate: (input: VoiceBargeInInput) => Promise<VoiceBargeInVerdict> | VoiceBargeInVerdict;
|
|
29
29
|
};
|
|
30
30
|
export type CreateAcousticBargeInDetectorOptions = {
|
|
31
|
-
/**
|
|
31
|
+
/** Voiced speech sustained this long (ms) is a real interruption — cancel. */
|
|
32
32
|
sustainedMs?: number;
|
|
33
|
-
/**
|
|
34
|
-
|
|
35
|
-
|
|
36
|
-
|
|
33
|
+
/**
|
|
34
|
+
* Leading words that mark an interruption ("wait", "hold on", "sorry"). A
|
|
35
|
+
* short utterance starting with one cancels immediately instead of holding.
|
|
36
|
+
* Extends (does not replace) the defaults.
|
|
37
|
+
*/
|
|
38
|
+
interruptionCues?: string[];
|
|
37
39
|
};
|
|
38
40
|
/**
|
|
39
|
-
* A model-free
|
|
40
|
-
*
|
|
41
|
-
* -
|
|
42
|
-
* - known cue
|
|
43
|
-
* -
|
|
44
|
-
* - short +
|
|
45
|
-
*
|
|
46
|
-
*
|
|
41
|
+
* A model-free backchannel-vs-barge-in classifier driven by TEXT + PERSISTENCE
|
|
42
|
+
* (energy was measured to not discriminate on normalized speech):
|
|
43
|
+
* - voiced speech past `sustainedMs` → real interruption (cancel)
|
|
44
|
+
* - known backchannel cue, still short → backchannel (keep talking)
|
|
45
|
+
* - starts with an interruption cue → caller took the floor (cancel)
|
|
46
|
+
* - otherwise short + ambiguous → HOLD: keep talking; a continuing
|
|
47
|
+
* utterance cancels itself once its
|
|
48
|
+
* voiced duration crosses sustainedMs
|
|
47
49
|
* Runs in-process on raw arithmetic — no model, no sidecar.
|
|
48
50
|
*/
|
|
49
51
|
export declare const createAcousticBargeInDetector: (options?: CreateAcousticBargeInDetectorOptions) => VoiceBargeInDetector;
|
package/dist/index.js
CHANGED
|
@@ -25354,6 +25354,7 @@ var createTwilioMediaStreamBridge = (socket, options) => {
|
|
|
25354
25354
|
sttFallback: resolveSTTFallbackConfig(options.sttFallback),
|
|
25355
25355
|
sttLifecycle: options.sttLifecycle ?? runtimePreset.sttLifecycle,
|
|
25356
25356
|
...options.semanticTurnDetector ? { semanticTurnDetector: options.semanticTurnDetector } : {},
|
|
25357
|
+
...options.bargeInDetector ? { bargeInDetector: options.bargeInDetector } : {},
|
|
25357
25358
|
...options.fillerPhrases ? { fillerPhrases: options.fillerPhrases } : {},
|
|
25358
25359
|
...options.fillerDelayMs !== undefined ? { fillerDelayMs: options.fillerDelayMs } : {},
|
|
25359
25360
|
...options.bargeInMinPartialWords !== undefined ? { bargeInMinPartialWords: options.bargeInMinPartialWords } : {},
|
|
@@ -42007,6 +42008,28 @@ var createRegexSemanticTurnDetector = (options) => {
|
|
|
42007
42008
|
};
|
|
42008
42009
|
};
|
|
42009
42010
|
// src/core/bargeInDetector.ts
|
|
42011
|
+
var DEFAULT_INTERRUPTION_CUES = [
|
|
42012
|
+
"wait",
|
|
42013
|
+
"hold on",
|
|
42014
|
+
"hold up",
|
|
42015
|
+
"hang on",
|
|
42016
|
+
"stop",
|
|
42017
|
+
"sorry",
|
|
42018
|
+
"excuse me",
|
|
42019
|
+
"actually",
|
|
42020
|
+
"one sec",
|
|
42021
|
+
"one second",
|
|
42022
|
+
"quick question",
|
|
42023
|
+
"question",
|
|
42024
|
+
"can i",
|
|
42025
|
+
"let me",
|
|
42026
|
+
"no no"
|
|
42027
|
+
];
|
|
42028
|
+
/**
 * Canonicalizes transcript text for cue matching.
 *
 * Lowercases the input, turns every character that is not `a`-`z` or
 * whitespace into a space, collapses each whitespace run to a single space,
 * and strips leading/trailing whitespace. Digits, punctuation and non-ASCII
 * letters therefore act as word separators (e.g. "Can't" becomes "can t").
 *
 * @param text Raw text to canonicalize.
 * @returns The lowercase, letters-and-single-spaces form of `text`.
 */
var normalize = (text) => {
  const lowered = text.toLowerCase();
  const lettersOnly = lowered.replace(/[^a-z\s]/g, " ");
  const collapsed = lettersOnly.replace(/\s+/g, " ");
  return collapsed.trim();
};
|
|
42029
|
+
/**
 * Reports whether an utterance opens with one of the given cue phrases.
 *
 * The utterance and every cue are both passed through `normalize`, so a cue
 * supplied with capitals, punctuation or extra spacing ("Hold on!") is
 * compared in the same form as the text it is tested against. A cue matches
 * when the normalized text equals it, or begins with it followed by a space,
 * i.e. on a word boundary ("wait" matches "wait a sec" but not "waiter").
 *
 * @param text Raw utterance text to inspect.
 * @param cues Cue phrases to look for; entries that normalize to an empty
 *   string are skipped.
 * @returns `true` when the normalized text equals or starts with a cue.
 */
var startsWithCue = (text, cues) => {
  const norm = normalize(text);
  return cues.some((cue) => {
    const normCue = normalize(cue);
    // A cue with no letters normalizes to "" and would otherwise match every
    // empty utterance, reporting an interruption where nothing was said.
    if (normCue === "") {
      return false;
    }
    return norm === normCue || norm.startsWith(`${normCue} `);
  });
};
|
|
42010
42033
|
var VOICED_FLOOR = 0.02;
|
|
42011
42034
|
var measureTurnAudio = (chunks, format) => {
|
|
42012
42035
|
const channels = format.channels ?? 1;
|
|
@@ -42025,38 +42048,44 @@ var measureTurnAudio = (chunks, format) => {
|
|
|
42025
42048
|
}
|
|
42026
42049
|
}
|
|
42027
42050
|
if (voicedSamples === 0) {
|
|
42028
|
-
return {
|
|
42051
|
+
return { rms: 0, voicedMs: 0 };
|
|
42029
42052
|
}
|
|
42030
42053
|
return {
|
|
42031
|
-
|
|
42032
|
-
|
|
42054
|
+
rms: Math.sqrt(sumSquares / voicedSamples),
|
|
42055
|
+
voicedMs: voicedSamples / channels / sampleRate * 1000
|
|
42033
42056
|
};
|
|
42034
42057
|
};
|
|
42035
42058
|
var createAcousticBargeInDetector = (options = {}) => {
|
|
42036
|
-
const sustainedMs = options.sustainedMs ??
|
|
42037
|
-
const
|
|
42038
|
-
|
|
42059
|
+
const sustainedMs = options.sustainedMs ?? 600;
|
|
42060
|
+
const interruptionCues = [
|
|
42061
|
+
...DEFAULT_INTERRUPTION_CUES,
|
|
42062
|
+
...options.interruptionCues ?? []
|
|
42063
|
+
];
|
|
42039
42064
|
return {
|
|
42040
42065
|
evaluate: (input) => {
|
|
42066
|
+
const isInterruptionCue = startsWithCue(input.partialText, interruptionCues);
|
|
42041
42067
|
const { turnAudio, turnAudioFormat } = input;
|
|
42042
42068
|
if (!turnAudio || turnAudio.length === 0 || !turnAudioFormat) {
|
|
42043
|
-
|
|
42069
|
+
if (input.isBackchannelByText) {
|
|
42070
|
+
return { reason: "text_backchannel", shouldCancel: false };
|
|
42071
|
+
}
|
|
42072
|
+
return {
|
|
42073
|
+
reason: isInterruptionCue ? "text_interruption" : "text_only",
|
|
42074
|
+
shouldCancel: true
|
|
42075
|
+
};
|
|
42044
42076
|
}
|
|
42045
|
-
const
|
|
42046
|
-
|
|
42047
|
-
if (
|
|
42077
|
+
const metrics = measureTurnAudio(turnAudio, turnAudioFormat);
|
|
42078
|
+
metrics.voicedMs = Math.round(metrics.voicedMs);
|
|
42079
|
+
if (metrics.voicedMs >= sustainedMs) {
|
|
42048
42080
|
return { metrics, reason: "acoustic_sustained", shouldCancel: true };
|
|
42049
42081
|
}
|
|
42050
42082
|
if (input.isBackchannelByText) {
|
|
42051
42083
|
return { metrics, reason: "acoustic_backchannel", shouldCancel: false };
|
|
42052
42084
|
}
|
|
42053
|
-
if (
|
|
42054
|
-
return { metrics, reason: "
|
|
42055
|
-
}
|
|
42056
|
-
if (rms <= noiseFloorRms) {
|
|
42057
|
-
return { metrics, reason: "acoustic_noise_floor", shouldCancel: false };
|
|
42085
|
+
if (isInterruptionCue) {
|
|
42086
|
+
return { metrics, reason: "acoustic_interruption", shouldCancel: true };
|
|
42058
42087
|
}
|
|
42059
|
-
return { metrics, reason: "
|
|
42088
|
+
return { metrics, reason: "acoustic_hold", shouldCancel: false };
|
|
42060
42089
|
}
|
|
42061
42090
|
};
|
|
42062
42091
|
};
|
|
@@ -51785,7 +51814,7 @@ var buildVoiceAgentPerformanceReport = (input) => {
|
|
|
51785
51814
|
};
|
|
51786
51815
|
};
|
|
51787
51816
|
// src/core/scorecardCalibration.ts
|
|
51788
|
-
var
|
|
51817
|
+
/**
 * Rescales a raw score to a fraction of its scale maximum.
 *
 * @param raw Score as given on its original scale.
 * @param scaleMax Maximum of that scale.
 * @returns `raw / scaleMax`, or `0` when `scaleMax` is zero so the division
 *   is never performed against a zero denominator.
 */
var normalize2 = (raw, scaleMax) => {
  if (scaleMax === 0) {
    return 0;
  }
  return raw / scaleMax;
};
|
|
51789
51818
|
var correlation = (xs, ys) => {
|
|
51790
51819
|
if (xs.length === 0 || xs.length !== ys.length)
|
|
51791
51820
|
return 0;
|
|
@@ -51844,8 +51873,8 @@ var computeVoiceScorecardCalibration = (pairs, options = {}) => {
|
|
|
51844
51873
|
const l = llmByCriterion.get(criterionId);
|
|
51845
51874
|
if (!h || !l)
|
|
51846
51875
|
continue;
|
|
51847
|
-
const hn =
|
|
51848
|
-
const ln =
|
|
51876
|
+
const hn = normalize2(h.score, pair.human.scaleMax);
|
|
51877
|
+
const ln = normalize2(l.score, pair.llm.scaleMax);
|
|
51849
51878
|
const gap = Math.abs(hn - ln);
|
|
51850
51879
|
allGaps.push(gap);
|
|
51851
51880
|
divergences.push({
|
|
@@ -133,6 +133,7 @@ export type TwilioMediaStreamBridgeOptions<TContext = unknown, TSession extends
|
|
|
133
133
|
* snappy responses on clear-cut answers. See VoiceSemanticTurnDetector.
|
|
134
134
|
*/
|
|
135
135
|
semanticTurnDetector?: import("../core/semanticTurn").VoiceSemanticTurnDetector;
|
|
136
|
+
/**
 * Optional barge-in detector. When provided, the bridge forwards it under
 * the same `bargeInDetector` key to the options object it assembles; when
 * omitted, the key is left out entirely. See VoiceBargeInDetector.
 * NOTE(review): the consumer of the assembled options is not visible here —
 * confirm what applies when no detector is supplied.
 */
bargeInDetector?: import("../core/bargeInDetector").VoiceBargeInDetector;
|
|
136
137
|
/**
|
|
137
138
|
* Pre-rendered filler phrases ("Hmm.", "Got it.", "Let me think.") played
|
|
138
139
|
* in the gap between user-turn-commit and real assistant audio. Boardy's
|
package/dist/testing/index.js
CHANGED
|
@@ -14244,6 +14244,7 @@ var createTwilioMediaStreamBridge = (socket, options) => {
|
|
|
14244
14244
|
sttFallback: resolveSTTFallbackConfig(options.sttFallback),
|
|
14245
14245
|
sttLifecycle: options.sttLifecycle ?? runtimePreset.sttLifecycle,
|
|
14246
14246
|
...options.semanticTurnDetector ? { semanticTurnDetector: options.semanticTurnDetector } : {},
|
|
14247
|
+
...options.bargeInDetector ? { bargeInDetector: options.bargeInDetector } : {},
|
|
14247
14248
|
...options.fillerPhrases ? { fillerPhrases: options.fillerPhrases } : {},
|
|
14248
14249
|
...options.fillerDelayMs !== undefined ? { fillerDelayMs: options.fillerDelayMs } : {},
|
|
14249
14250
|
...options.bargeInMinPartialWords !== undefined ? { bargeInMinPartialWords: options.bargeInMinPartialWords } : {},
|