@absolutejs/voice 0.0.22-beta.612 → 0.0.22-beta.614
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/dist/core/bargeInDetector.d.ts +23 -13
- package/dist/index.js +57 -21
- package/dist/telephony/twilio.d.ts +1 -0
- package/dist/testing/index.js +8 -1
- package/package.json +1 -1
|
@@ -15,27 +15,37 @@ export type VoiceBargeInVerdict = {
|
|
|
15
15
|
shouldCancel: boolean;
|
|
16
16
|
/** Diagnostic label, surfaced on the barge_in / barge_in_suppressed trace. */
|
|
17
17
|
reason?: string;
|
|
18
|
+
/**
|
|
19
|
+
* The acoustic measurements the decision used, surfaced on the trace for
|
|
20
|
+
* tuning against real audio. Omitted when no audio was judged.
|
|
21
|
+
*/
|
|
22
|
+
metrics?: {
|
|
23
|
+
voicedMs: number;
|
|
24
|
+
rms: number;
|
|
25
|
+
};
|
|
18
26
|
};
|
|
19
27
|
/**
 * A pluggable barge-in judge.
 *
 * `evaluate` receives a `VoiceBargeInInput` and produces a
 * `VoiceBargeInVerdict`, either synchronously or through a promise.
 */
export type VoiceBargeInDetector = {
    evaluate: (
        input: VoiceBargeInInput
    ) => VoiceBargeInVerdict | Promise<VoiceBargeInVerdict>;
};
|
|
22
30
|
/** Tuning options for `createAcousticBargeInDetector`. Every field is optional. */
export type CreateAcousticBargeInDetectorOptions = {
    /**
     * Voiced speech sustained this long (ms) is a real interruption — cancel.
     * Defaults to 600 when omitted.
     */
    sustainedMs?: number;
    /**
     * Leading words that mark an interruption ("wait", "hold on", "sorry"). A
     * short utterance starting with one cancels immediately instead of holding.
     * Extends (does not replace) the defaults.
     *
     * Typed as a readonly array because the detector only copies the list into
     * its own cue set; this also lets callers pass `as const` literals.
     */
    interruptionCues?: readonly string[];
};
|
|
30
40
|
/**
|
|
31
|
-
* A model-free
|
|
32
|
-
*
|
|
33
|
-
* -
|
|
34
|
-
* - known cue
|
|
35
|
-
* -
|
|
36
|
-
* - short +
|
|
37
|
-
*
|
|
38
|
-
*
|
|
41
|
+
* A model-free backchannel-vs-barge-in classifier driven by TEXT + PERSISTENCE
|
|
42
|
+
* (energy was measured to not discriminate on normalized speech):
|
|
43
|
+
* - voiced speech past `sustainedMs` → real interruption (cancel)
|
|
44
|
+
* - known backchannel cue, still short → backchannel (keep talking)
|
|
45
|
+
* - starts with an interruption cue → caller took the floor (cancel)
|
|
46
|
+
* - otherwise short + ambiguous → HOLD: keep talking; a continuing
|
|
47
|
+
* utterance cancels itself once its
|
|
48
|
+
* voiced duration crosses sustainedMs
|
|
39
49
|
* Runs in-process on raw arithmetic — no model, no sidecar.
|
|
40
50
|
*/
|
|
41
51
|
// Note: when no turn audio (or no audio format) is supplied, the verdict is
// decided from text alone — a text backchannel keeps talking, and any other
// partial cancels. No `metrics` are attached in that case.
export declare const createAcousticBargeInDetector: (options?: CreateAcousticBargeInDetectorOptions) => VoiceBargeInDetector;
|
package/dist/index.js
CHANGED
|
@@ -5287,12 +5287,17 @@ var createVoiceSession = (options) => {
|
|
|
5287
5287
|
partialText: triggeringText,
|
|
5288
5288
|
wordCount,
|
|
5289
5289
|
...getTurnAudioForDetector()
|
|
5290
|
-
})) : {
|
|
5290
|
+
})) : {
|
|
5291
|
+
metrics: undefined,
|
|
5292
|
+
reason: undefined,
|
|
5293
|
+
shouldCancel: !isBackchannelByText
|
|
5294
|
+
};
|
|
5291
5295
|
const reason = verdict.reason ?? (verdict.shouldCancel ? "stt_partial" : "backchannel");
|
|
5292
5296
|
if (verdict.shouldCancel) {
|
|
5293
5297
|
backchannelSuppressedAt = null;
|
|
5294
5298
|
appendTurnLatencyStage({
|
|
5295
5299
|
metadata: {
|
|
5300
|
+
...verdict.metrics,
|
|
5296
5301
|
partial: triggeringText.slice(0, 200),
|
|
5297
5302
|
source: reason,
|
|
5298
5303
|
wordCount
|
|
@@ -5305,6 +5310,7 @@ var createVoiceSession = (options) => {
|
|
|
5305
5310
|
backchannelSuppressedAt = Date.now();
|
|
5306
5311
|
appendTurnLatencyStage({
|
|
5307
5312
|
metadata: {
|
|
5313
|
+
...verdict.metrics,
|
|
5308
5314
|
partial: triggeringText.slice(0, 200),
|
|
5309
5315
|
reason,
|
|
5310
5316
|
wordCount
|
|
@@ -25348,6 +25354,7 @@ var createTwilioMediaStreamBridge = (socket, options) => {
|
|
|
25348
25354
|
sttFallback: resolveSTTFallbackConfig(options.sttFallback),
|
|
25349
25355
|
sttLifecycle: options.sttLifecycle ?? runtimePreset.sttLifecycle,
|
|
25350
25356
|
...options.semanticTurnDetector ? { semanticTurnDetector: options.semanticTurnDetector } : {},
|
|
25357
|
+
...options.bargeInDetector ? { bargeInDetector: options.bargeInDetector } : {},
|
|
25351
25358
|
...options.fillerPhrases ? { fillerPhrases: options.fillerPhrases } : {},
|
|
25352
25359
|
...options.fillerDelayMs !== undefined ? { fillerDelayMs: options.fillerDelayMs } : {},
|
|
25353
25360
|
...options.bargeInMinPartialWords !== undefined ? { bargeInMinPartialWords: options.bargeInMinPartialWords } : {},
|
|
@@ -42001,6 +42008,28 @@ var createRegexSemanticTurnDetector = (options) => {
|
|
|
42001
42008
|
};
|
|
42002
42009
|
};
|
|
42003
42010
|
// src/core/bargeInDetector.ts
|
|
42011
|
+
// Built-in leading phrases that mark the caller taking the floor. Entries are
// written in normalized form: lowercase, letters separated by single spaces.
var DEFAULT_INTERRUPTION_CUES = [
  "wait",
  "hold on",
  "hold up",
  "hang on",
  "stop",
  "sorry",
  "excuse me",
  "actually",
  "one sec",
  "one second",
  "quick question",
  "question",
  "can i",
  "let me",
  "no no"
];
/**
 * Canonicalizes text for cue matching: Unicode-composes (NFC), lowercases,
 * turns every character that is not a letter, combining mark or whitespace
 * into a space, collapses whitespace runs, and trims.
 *
 * Letters are matched with Unicode property escapes rather than `a-z`, so
 * accented and non-Latin words are kept intact ("Perdón!" -> "perdón").
 * ASCII-only input normalizes exactly as it would under an `a-z` filter.
 *
 * @param {string} text Raw transcript or cue text.
 * @returns {string} The normalized form; "" when nothing but noise remains.
 */
var normalize = (text) => text.normalize("NFC").toLowerCase().replace(/[^\p{L}\p{M}\s]/gu, " ").replace(/\s+/g, " ").trim();
/**
 * Reports whether `text` is, or begins with, one of `cues` on a word boundary.
 *
 * Both sides are normalized before comparison, so caller-supplied cues such as
 * "Hold on!" match regardless of case or punctuation. Cues that normalize to
 * the empty string are ignored; otherwise an empty cue would match empty text.
 *
 * @param {string} text Partial transcript to test.
 * @param {readonly string[]} cues Candidate leading phrases.
 * @returns {boolean} True when a cue equals the text or is followed by a space.
 */
var startsWithCue = (text, cues) => {
  const norm = normalize(text);
  if (norm === "") {
    return false;
  }
  return cues.some((cue) => {
    const normCue = normalize(cue);
    if (normCue === "") {
      return false;
    }
    // The trailing space enforces a word boundary: "wait" must not match "waiting".
    return norm === normCue || norm.startsWith(`${normCue} `);
  });
};
|
|
42004
42033
|
var VOICED_FLOOR = 0.02;
|
|
42005
42034
|
var measureTurnAudio = (chunks, format) => {
|
|
42006
42035
|
const channels = format.channels ?? 1;
|
|
@@ -42019,37 +42048,44 @@ var measureTurnAudio = (chunks, format) => {
|
|
|
42019
42048
|
}
|
|
42020
42049
|
}
|
|
42021
42050
|
if (voicedSamples === 0) {
|
|
42022
|
-
return {
|
|
42051
|
+
return { rms: 0, voicedMs: 0 };
|
|
42023
42052
|
}
|
|
42024
42053
|
return {
|
|
42025
|
-
|
|
42026
|
-
|
|
42054
|
+
rms: Math.sqrt(sumSquares / voicedSamples),
|
|
42055
|
+
voicedMs: voicedSamples / channels / sampleRate * 1000
|
|
42027
42056
|
};
|
|
42028
42057
|
};
|
|
42029
42058
|
/**
 * Builds the model-free barge-in detector.
 *
 * Decision ladder used by `evaluate`, first match wins:
 *   - no usable turn audio: text only — a text backchannel keeps talking,
 *     anything else cancels;
 *   - voiced duration (rounded to whole ms) >= `sustainedMs`: cancel;
 *   - text backchannel: keep talking;
 *   - partial starts with an interruption cue: cancel;
 *   - otherwise: hold (keep talking).
 *
 * Verdicts reached from audio carry the measured `metrics`.
 *
 * @param options Optional `sustainedMs` (default 600) and extra
 *   `interruptionCues`, which are appended to the built-in list.
 * @returns An object exposing `evaluate(input)`.
 */
var createAcousticBargeInDetector = (options = {}) => {
  const sustainedMs = options.sustainedMs ?? 600;
  const extraCues = options.interruptionCues ?? [];
  const interruptionCues = [...DEFAULT_INTERRUPTION_CUES, ...extraCues];
  const evaluate = (input) => {
    // Computed up front because both the text-only and acoustic paths use it.
    const cueHit = startsWithCue(input.partialText, interruptionCues);
    const audio = input.turnAudio;
    const format = input.turnAudioFormat;
    const hasAudio = Boolean(audio) && audio.length !== 0 && Boolean(format);
    if (!hasAudio) {
      if (input.isBackchannelByText) {
        return { reason: "text_backchannel", shouldCancel: false };
      }
      const reason = cueHit ? "text_interruption" : "text_only";
      return { reason, shouldCancel: true };
    }
    const metrics = measureTurnAudio(audio, format);
    // Whole milliseconds keep the trace readable; the threshold uses the rounded value.
    metrics.voicedMs = Math.round(metrics.voicedMs);
    const verdict = (reason, shouldCancel) => ({ metrics, reason, shouldCancel });
    if (metrics.voicedMs >= sustainedMs) {
      return verdict("acoustic_sustained", true);
    }
    if (input.isBackchannelByText) {
      return verdict("acoustic_backchannel", false);
    }
    return cueHit ? verdict("acoustic_interruption", true) : verdict("acoustic_hold", false);
  };
  return { evaluate };
};
|
|
@@ -51778,7 +51814,7 @@ var buildVoiceAgentPerformanceReport = (input) => {
|
|
|
51778
51814
|
};
|
|
51779
51815
|
};
|
|
51780
51816
|
// src/core/scorecardCalibration.ts
|
|
51781
|
-
var
|
|
51817
|
+
/**
 * Expresses a raw score as a fraction of its scale maximum.
 * A scale maximum of exactly 0 yields 0 rather than dividing by zero.
 */
var normalize2 = (raw, scaleMax) => {
  if (scaleMax === 0) {
    return 0;
  }
  return raw / scaleMax;
};
|
|
51782
51818
|
var correlation = (xs, ys) => {
|
|
51783
51819
|
if (xs.length === 0 || xs.length !== ys.length)
|
|
51784
51820
|
return 0;
|
|
@@ -51837,8 +51873,8 @@ var computeVoiceScorecardCalibration = (pairs, options = {}) => {
|
|
|
51837
51873
|
const l = llmByCriterion.get(criterionId);
|
|
51838
51874
|
if (!h || !l)
|
|
51839
51875
|
continue;
|
|
51840
|
-
const hn =
|
|
51841
|
-
const ln =
|
|
51876
|
+
const hn = normalize2(h.score, pair.human.scaleMax);
|
|
51877
|
+
const ln = normalize2(l.score, pair.llm.scaleMax);
|
|
51842
51878
|
const gap = Math.abs(hn - ln);
|
|
51843
51879
|
allGaps.push(gap);
|
|
51844
51880
|
divergences.push({
|
|
@@ -133,6 +133,7 @@ export type TwilioMediaStreamBridgeOptions<TContext = unknown, TSession extends
|
|
|
133
133
|
* snappy responses on clear-cut answers. See VoiceSemanticTurnDetector.
|
|
134
134
|
*/
|
|
135
135
|
semanticTurnDetector?: import("../core/semanticTurn").VoiceSemanticTurnDetector;
|
|
136
|
+
bargeInDetector?: import("../core/bargeInDetector").VoiceBargeInDetector;
|
|
136
137
|
/**
|
|
137
138
|
* Pre-rendered filler phrases ("Hmm.", "Got it.", "Let me think.") played
|
|
138
139
|
* in the gap between user-turn-commit and real assistant audio. Boardy's
|
package/dist/testing/index.js
CHANGED
|
@@ -7514,12 +7514,17 @@ var createVoiceSession = (options) => {
|
|
|
7514
7514
|
partialText: triggeringText,
|
|
7515
7515
|
wordCount,
|
|
7516
7516
|
...getTurnAudioForDetector()
|
|
7517
|
-
})) : {
|
|
7517
|
+
})) : {
|
|
7518
|
+
metrics: undefined,
|
|
7519
|
+
reason: undefined,
|
|
7520
|
+
shouldCancel: !isBackchannelByText
|
|
7521
|
+
};
|
|
7518
7522
|
const reason = verdict.reason ?? (verdict.shouldCancel ? "stt_partial" : "backchannel");
|
|
7519
7523
|
if (verdict.shouldCancel) {
|
|
7520
7524
|
backchannelSuppressedAt = null;
|
|
7521
7525
|
appendTurnLatencyStage({
|
|
7522
7526
|
metadata: {
|
|
7527
|
+
...verdict.metrics,
|
|
7523
7528
|
partial: triggeringText.slice(0, 200),
|
|
7524
7529
|
source: reason,
|
|
7525
7530
|
wordCount
|
|
@@ -7532,6 +7537,7 @@ var createVoiceSession = (options) => {
|
|
|
7532
7537
|
backchannelSuppressedAt = Date.now();
|
|
7533
7538
|
appendTurnLatencyStage({
|
|
7534
7539
|
metadata: {
|
|
7540
|
+
...verdict.metrics,
|
|
7535
7541
|
partial: triggeringText.slice(0, 200),
|
|
7536
7542
|
reason,
|
|
7537
7543
|
wordCount
|
|
@@ -14238,6 +14244,7 @@ var createTwilioMediaStreamBridge = (socket, options) => {
|
|
|
14238
14244
|
sttFallback: resolveSTTFallbackConfig(options.sttFallback),
|
|
14239
14245
|
sttLifecycle: options.sttLifecycle ?? runtimePreset.sttLifecycle,
|
|
14240
14246
|
...options.semanticTurnDetector ? { semanticTurnDetector: options.semanticTurnDetector } : {},
|
|
14247
|
+
...options.bargeInDetector ? { bargeInDetector: options.bargeInDetector } : {},
|
|
14241
14248
|
...options.fillerPhrases ? { fillerPhrases: options.fillerPhrases } : {},
|
|
14242
14249
|
...options.fillerDelayMs !== undefined ? { fillerDelayMs: options.fillerDelayMs } : {},
|
|
14243
14250
|
...options.bargeInMinPartialWords !== undefined ? { bargeInMinPartialWords: options.bargeInMinPartialWords } : {},
|