@absolutejs/voice 0.0.22-beta.607 → 0.0.22-beta.608
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/dist/core/backchannel.d.ts +1 -0
- package/dist/core/types.d.ts +16 -0
- package/dist/index.js +116 -1
- package/dist/testing/index.js +115 -1
- package/package.json +1 -1
|
@@ -21,4 +21,5 @@ export type VoiceBackchannelDriver = {
|
|
|
21
21
|
noteSilence: (timestampMs?: number) => void;
|
|
22
22
|
reset: () => void;
|
|
23
23
|
};
|
|
24
|
+
/**
 * Reports whether `text` is a pure listening cue ("mm-hm", "yeah", "got it")
 * rather than a content-bearing utterance.
 *
 * The text is lowercased and punctuation is stripped before matching. The
 * result is `true` when the normalized text is a known backchannel phrase, or
 * when it has at most `maxWords` words and every word is a known backchannel
 * token. Empty or punctuation-only text yields `false`.
 *
 * @param text - Transcript text to classify.
 * @param maxWords - Upper bound on the word count for the token-by-token
 *   match. Defaults to 3 in the implementation.
 * @returns `true` when the text is only a backchannel cue.
 */
export declare const isBackchannelUtterance: (text: string, maxWords?: number) => boolean;
|
|
24
25
|
export declare const createVoiceBackchannelDriver: (options: VoiceBackchannelDriverOptions) => VoiceBackchannelDriver;
|
package/dist/core/types.d.ts
CHANGED
|
@@ -802,6 +802,14 @@ export type VoicePluginConfig<TContext = unknown, TSession extends VoiceSessionR
|
|
|
802
802
|
turnDetection?: VoiceTurnDetectionConfig;
|
|
803
803
|
semanticTurnDetector?: import("./semanticTurn").VoiceSemanticTurnDetector;
|
|
804
804
|
bargeInMinPartialWords?: number;
|
|
805
|
+
/**
|
|
806
|
+
* When true, a pure listening cue ("mm-hm", "yeah", "right", "got it") spoken
|
|
807
|
+
* WHILE the assistant is talking does NOT barge-in — the assistant keeps going
|
|
808
|
+
* and the cue is dropped so it never becomes the caller's next turn. A bare
|
|
809
|
+
* "yeah" said AFTER the assistant finishes is a normal answer, unaffected.
|
|
810
|
+
* Default false (any in-speech words interrupt, the prior behavior).
|
|
811
|
+
*/
|
|
812
|
+
backchannelBargeInGuard?: boolean;
|
|
805
813
|
fillerPhrases?: ReadonlyArray<string>;
|
|
806
814
|
fillerDelayMs?: number;
|
|
807
815
|
fillerFor?: (input: {
|
|
@@ -975,6 +983,14 @@ export type CreateVoiceSessionOptions<TContext = unknown, TSession extends Voice
|
|
|
975
983
|
* Word splitting is whitespace-based. Punctuation is left attached.
|
|
976
984
|
*/
|
|
977
985
|
bargeInMinPartialWords?: number;
|
|
986
|
+
/**
|
|
987
|
+
* When true, a pure listening cue ("mm-hm", "yeah", "right", "got it") spoken
|
|
988
|
+
* WHILE the assistant is talking does NOT barge-in — the assistant keeps going
|
|
989
|
+
* and the cue is dropped so it never becomes the caller's next turn. A bare
|
|
990
|
+
* "yeah" said AFTER the assistant finishes is a normal answer, unaffected.
|
|
991
|
+
* Default false (any in-speech words interrupt, the prior behavior).
|
|
992
|
+
*/
|
|
993
|
+
backchannelBargeInGuard?: boolean;
|
|
978
994
|
fillerPhrases?: ReadonlyArray<string>;
|
|
979
995
|
/** Milliseconds after turn-commit before the filler fires. Default 250ms — short enough to feel instant, long enough to skip if the LLM is very fast. */
|
|
980
996
|
fillerDelayMs?: number;
|
package/dist/index.js
CHANGED
|
@@ -3118,6 +3118,100 @@ var DEFAULT_CUES = [
|
|
|
3118
3118
|
{ text: "right" },
|
|
3119
3119
|
{ text: "go on" }
|
|
3120
3120
|
];
|
|
3121
|
+
/**
 * Single words that count as a listening cue on their own. Every word of a
 * short utterance must be in this set for the utterance to be classified as a
 * backchannel by the token-by-token match in `isBackchannelUtterance`.
 *
 * Note: the normalizer turns hyphens into spaces, so hyphenated input such as
 * "uh-huh" is matched through its space-separated form ("uh huh"), not through
 * the hyphenated entry kept here.
 */
var BACKCHANNEL_TOKENS = new Set([
  "mm",
  "mmm",
  "mhm",
  "mmhm",
  "mmhmm",
  "hm",
  "hmm",
  "uh-huh",
  "uhhuh",
  "uh",
  "huh",
  "ah",
  "oh",
  "yeah",
  "yep",
  "yup",
  "yes",
  "ya",
  "yah",
  "ok",
  "okay",
  "k",
  "kay",
  "right",
  "sure",
  "totally",
  "exactly",
  "absolutely",
  "definitely",
  "gotcha",
  "cool",
  "nice",
  "wow",
  "true",
  "fair",
  "aha",
  "perfect",
  "awesome",
  "great",
  "good",
  "wonderful",
  "amazing",
  "interesting",
  "understood",
  "agreed"
]);
/**
 * Multi-word listening cues, stored in normalized form (lowercase, single
 * spaces, ASCII apostrophes). A normalized utterance that equals one of these
 * entries is a backchannel regardless of `maxWords`.
 */
var BACKCHANNEL_PHRASES = new Set([
  "i see",
  "got it",
  "makes sense",
  "of course",
  "for sure",
  "fair enough",
  "sounds good",
  "i know",
  "oh ok",
  "oh okay",
  "that's right",
  "thats right",
  "oh wow",
  "oh nice",
  "oh cool",
  "uh huh",
  "mm hm",
  "mm hmm",
  "i hear you",
  "for real",
  "no way",
  "makes total sense",
  "got you",
  "i get it",
  "right right",
  "yeah yeah",
  "ok ok",
  "oh i see",
  "oh got it",
  "yeah totally",
  "yeah exactly"
]);
/**
 * Reports whether `text` is a pure listening cue ("mm-hm", "yeah", "got it")
 * rather than a content-bearing utterance.
 *
 * @param {string} text Transcript text to classify.
 * @param {number} [maxWords=3] Upper bound on the word count for the
 *   token-by-token match. Exact phrase matches are not subject to this limit.
 * @returns {boolean} `true` when the normalized text is a known backchannel
 *   phrase, or has at most `maxWords` words that are all known backchannel
 *   tokens. Empty, non-string, or punctuation-only input yields `false`.
 */
var isBackchannelUtterance = (text, maxWords = 3) => {
  if (typeof text !== "string") {
    return false;
  }
  // Fold typographic apostrophes to ASCII so "that’s right" matches the
  // "that's right" phrase instead of being split into "that s right".
  const lowered = text.toLowerCase().replace(/[\u2018\u2019\u02bc]/g, "'");
  // Digits and letters outside a-z carry content ("yeah 42", "ok 你好").
  // Stripping them like punctuation would leave only the cue word behind and
  // misclassify the whole utterance, so reject such input up front.
  if (/[\p{L}\p{N}]/u.test(lowered.replace(/[a-z]/g, ""))) {
    return false;
  }
  const normalized = lowered.replace(/[^a-z']/g, " ").replace(/\s+/g, " ").trim();
  if (!normalized) {
    return false;
  }
  if (BACKCHANNEL_PHRASES.has(normalized)) {
    return true;
  }
  const words = normalized.split(" ");
  if (words.length > maxWords) {
    return false;
  }
  return words.every((word) => BACKCHANNEL_TOKENS.has(word));
};
|
|
3121
3215
|
var createVoiceBackchannelDriver = (options) => {
|
|
3122
3216
|
const cues = options.cues ?? DEFAULT_CUES;
|
|
3123
3217
|
const minSpeechMs = options.minSpeechMs ?? 2500;
|
|
@@ -3757,6 +3851,7 @@ var MAX_TTS_CHUNK_CHARS = 320;
|
|
|
3757
3851
|
var STREAM_SENTENCE_END = /[.!?\u2026]['")\]]*$/;
|
|
3758
3852
|
var STREAM_IDLE_FLUSH_MS = 350;
|
|
3759
3853
|
var SPECULATIVE_DELAY_MS = 500;
|
|
3854
|
+
// Maximum age, in milliseconds, of a suppressed backchannel barge-in for which
// a following backchannel-only final transcript is still dropped by handleFinal.
var BACKCHANNEL_DROP_WINDOW_MS = 2000;
|
|
3760
3855
|
var nextSpeakableBoundary = (buffer) => {
|
|
3761
3856
|
const match = STREAM_SENTENCE_BOUNDARY.exec(buffer);
|
|
3762
3857
|
return match ? match.index + match[0].length : -1;
|
|
@@ -3996,6 +4091,8 @@ var createVoiceSession = (options) => {
|
|
|
3996
4091
|
const fillerPhrases = (options.fillerPhrases ?? []).filter((p) => typeof p === "string" && p.trim().length > 0);
|
|
3997
4092
|
const fillerDelayMs = options.fillerDelayMs ?? 250;
|
|
3998
4093
|
const bargeInMinPartialWords = Math.max(1, options.bargeInMinPartialWords ?? 1);
|
|
4094
|
+
const backchannelBargeInGuard = options.backchannelBargeInGuard ?? false;
|
|
4095
|
+
let backchannelSuppressedAt = null;
|
|
3999
4096
|
const fillerFor = options.fillerFor;
|
|
4000
4097
|
const fillerForTimeoutMs = options.fillerForTimeoutMs ?? 600;
|
|
4001
4098
|
const currentTurnAudio = [];
|
|
@@ -5169,7 +5266,19 @@ var createVoiceSession = (options) => {
|
|
|
5169
5266
|
const triggeringText = transcript.text.trim();
|
|
5170
5267
|
if (triggeringText) {
|
|
5171
5268
|
const wordCount = triggeringText.split(/\s+/).length;
|
|
5172
|
-
if (wordCount >= bargeInMinPartialWords) {
|
|
5269
|
+
if (wordCount >= bargeInMinPartialWords && backchannelBargeInGuard && isBackchannelUtterance(triggeringText)) {
|
|
5270
|
+
backchannelSuppressedAt = Date.now();
|
|
5271
|
+
appendTurnLatencyStage({
|
|
5272
|
+
metadata: {
|
|
5273
|
+
partial: triggeringText.slice(0, 200),
|
|
5274
|
+
reason: "backchannel",
|
|
5275
|
+
wordCount
|
|
5276
|
+
},
|
|
5277
|
+
stage: "barge_in_suppressed",
|
|
5278
|
+
turnId: activeTTSTurnId
|
|
5279
|
+
}).catch(() => {});
|
|
5280
|
+
} else if (wordCount >= bargeInMinPartialWords) {
|
|
5281
|
+
backchannelSuppressedAt = null;
|
|
5173
5282
|
appendTurnLatencyStage({
|
|
5174
5283
|
metadata: {
|
|
5175
5284
|
partial: triggeringText.slice(0, 200),
|
|
@@ -5231,6 +5340,11 @@ var createVoiceSession = (options) => {
|
|
|
5231
5340
|
};
|
|
5232
5341
|
const handleFinal = async (transcript) => {
|
|
5233
5342
|
sttReconnectCount = 0;
|
|
5343
|
+
if (backchannelBargeInGuard && backchannelSuppressedAt !== null && Date.now() - backchannelSuppressedAt < BACKCHANNEL_DROP_WINDOW_MS && isBackchannelUtterance(transcript.text.trim())) {
|
|
5344
|
+
backchannelSuppressedAt = null;
|
|
5345
|
+
return;
|
|
5346
|
+
}
|
|
5347
|
+
backchannelSuppressedAt = null;
|
|
5234
5348
|
const session = await writeSession((session2) => {
|
|
5235
5349
|
const alreadyPresent = session2.currentTurn.transcripts.some((existing) => existing.id === transcript.id);
|
|
5236
5350
|
if (!alreadyPresent) {
|
|
@@ -39581,6 +39695,7 @@ var voice = (config) => {
|
|
|
39581
39695
|
sttLifecycle: sessionOptions.sttLifecycle,
|
|
39582
39696
|
...config.semanticTurnDetector ? { semanticTurnDetector: config.semanticTurnDetector } : {},
|
|
39583
39697
|
...config.bargeInMinPartialWords !== undefined ? { bargeInMinPartialWords: config.bargeInMinPartialWords } : {},
|
|
39698
|
+
...config.backchannelBargeInGuard !== undefined ? { backchannelBargeInGuard: config.backchannelBargeInGuard } : {},
|
|
39584
39699
|
...config.fillerPhrases ? { fillerPhrases: config.fillerPhrases } : {},
|
|
39585
39700
|
...config.fillerDelayMs !== undefined ? { fillerDelayMs: config.fillerDelayMs } : {},
|
|
39586
39701
|
...config.fillerFor ? { fillerFor: config.fillerFor } : {},
|
package/dist/testing/index.js
CHANGED
|
@@ -5450,6 +5450,100 @@ var DEFAULT_CUES = [
|
|
|
5450
5450
|
{ text: "right" },
|
|
5451
5451
|
{ text: "go on" }
|
|
5452
5452
|
];
|
|
5453
|
+
/**
 * Single words that count as a listening cue on their own. Every word of a
 * short utterance must be in this set for the utterance to be classified as a
 * backchannel by the token-by-token match in `isBackchannelUtterance`.
 *
 * Note: the normalizer turns hyphens into spaces, so hyphenated input such as
 * "uh-huh" is matched through its space-separated form ("uh huh"), not through
 * the hyphenated entry kept here.
 */
var BACKCHANNEL_TOKENS = new Set([
  "mm",
  "mmm",
  "mhm",
  "mmhm",
  "mmhmm",
  "hm",
  "hmm",
  "uh-huh",
  "uhhuh",
  "uh",
  "huh",
  "ah",
  "oh",
  "yeah",
  "yep",
  "yup",
  "yes",
  "ya",
  "yah",
  "ok",
  "okay",
  "k",
  "kay",
  "right",
  "sure",
  "totally",
  "exactly",
  "absolutely",
  "definitely",
  "gotcha",
  "cool",
  "nice",
  "wow",
  "true",
  "fair",
  "aha",
  "perfect",
  "awesome",
  "great",
  "good",
  "wonderful",
  "amazing",
  "interesting",
  "understood",
  "agreed"
]);
/**
 * Multi-word listening cues, stored in normalized form (lowercase, single
 * spaces, ASCII apostrophes). A normalized utterance that equals one of these
 * entries is a backchannel regardless of `maxWords`.
 */
var BACKCHANNEL_PHRASES = new Set([
  "i see",
  "got it",
  "makes sense",
  "of course",
  "for sure",
  "fair enough",
  "sounds good",
  "i know",
  "oh ok",
  "oh okay",
  "that's right",
  "thats right",
  "oh wow",
  "oh nice",
  "oh cool",
  "uh huh",
  "mm hm",
  "mm hmm",
  "i hear you",
  "for real",
  "no way",
  "makes total sense",
  "got you",
  "i get it",
  "right right",
  "yeah yeah",
  "ok ok",
  "oh i see",
  "oh got it",
  "yeah totally",
  "yeah exactly"
]);
/**
 * Reports whether `text` is a pure listening cue ("mm-hm", "yeah", "got it")
 * rather than a content-bearing utterance.
 *
 * @param {string} text Transcript text to classify.
 * @param {number} [maxWords=3] Upper bound on the word count for the
 *   token-by-token match. Exact phrase matches are not subject to this limit.
 * @returns {boolean} `true` when the normalized text is a known backchannel
 *   phrase, or has at most `maxWords` words that are all known backchannel
 *   tokens. Empty, non-string, or punctuation-only input yields `false`.
 */
var isBackchannelUtterance = (text, maxWords = 3) => {
  if (typeof text !== "string") {
    return false;
  }
  // Fold typographic apostrophes to ASCII so "that’s right" matches the
  // "that's right" phrase instead of being split into "that s right".
  const lowered = text.toLowerCase().replace(/[\u2018\u2019\u02bc]/g, "'");
  // Digits and letters outside a-z carry content ("yeah 42", "ok 你好").
  // Stripping them like punctuation would leave only the cue word behind and
  // misclassify the whole utterance, so reject such input up front.
  if (/[\p{L}\p{N}]/u.test(lowered.replace(/[a-z]/g, ""))) {
    return false;
  }
  const normalized = lowered.replace(/[^a-z']/g, " ").replace(/\s+/g, " ").trim();
  if (!normalized) {
    return false;
  }
  if (BACKCHANNEL_PHRASES.has(normalized)) {
    return true;
  }
  const words = normalized.split(" ");
  if (words.length > maxWords) {
    return false;
  }
  return words.every((word) => BACKCHANNEL_TOKENS.has(word));
};
|
|
5453
5547
|
var createVoiceBackchannelDriver = (options) => {
|
|
5454
5548
|
const cues = options.cues ?? DEFAULT_CUES;
|
|
5455
5549
|
const minSpeechMs = options.minSpeechMs ?? 2500;
|
|
@@ -5984,6 +6078,7 @@ var MAX_TTS_CHUNK_CHARS = 320;
|
|
|
5984
6078
|
var STREAM_SENTENCE_END = /[.!?\u2026]['")\]]*$/;
|
|
5985
6079
|
var STREAM_IDLE_FLUSH_MS = 350;
|
|
5986
6080
|
var SPECULATIVE_DELAY_MS = 500;
|
|
6081
|
+
// Maximum age, in milliseconds, of a suppressed backchannel barge-in for which
// a following backchannel-only final transcript is still dropped by handleFinal.
var BACKCHANNEL_DROP_WINDOW_MS = 2000;
|
|
5987
6082
|
var nextSpeakableBoundary = (buffer) => {
|
|
5988
6083
|
const match = STREAM_SENTENCE_BOUNDARY.exec(buffer);
|
|
5989
6084
|
return match ? match.index + match[0].length : -1;
|
|
@@ -6223,6 +6318,8 @@ var createVoiceSession = (options) => {
|
|
|
6223
6318
|
const fillerPhrases = (options.fillerPhrases ?? []).filter((p) => typeof p === "string" && p.trim().length > 0);
|
|
6224
6319
|
const fillerDelayMs = options.fillerDelayMs ?? 250;
|
|
6225
6320
|
const bargeInMinPartialWords = Math.max(1, options.bargeInMinPartialWords ?? 1);
|
|
6321
|
+
const backchannelBargeInGuard = options.backchannelBargeInGuard ?? false;
|
|
6322
|
+
let backchannelSuppressedAt = null;
|
|
6226
6323
|
const fillerFor = options.fillerFor;
|
|
6227
6324
|
const fillerForTimeoutMs = options.fillerForTimeoutMs ?? 600;
|
|
6228
6325
|
const currentTurnAudio = [];
|
|
@@ -7396,7 +7493,19 @@ var createVoiceSession = (options) => {
|
|
|
7396
7493
|
const triggeringText = transcript.text.trim();
|
|
7397
7494
|
if (triggeringText) {
|
|
7398
7495
|
const wordCount = triggeringText.split(/\s+/).length;
|
|
7399
|
-
if (wordCount >= bargeInMinPartialWords) {
|
|
7496
|
+
if (wordCount >= bargeInMinPartialWords && backchannelBargeInGuard && isBackchannelUtterance(triggeringText)) {
|
|
7497
|
+
backchannelSuppressedAt = Date.now();
|
|
7498
|
+
appendTurnLatencyStage({
|
|
7499
|
+
metadata: {
|
|
7500
|
+
partial: triggeringText.slice(0, 200),
|
|
7501
|
+
reason: "backchannel",
|
|
7502
|
+
wordCount
|
|
7503
|
+
},
|
|
7504
|
+
stage: "barge_in_suppressed",
|
|
7505
|
+
turnId: activeTTSTurnId
|
|
7506
|
+
}).catch(() => {});
|
|
7507
|
+
} else if (wordCount >= bargeInMinPartialWords) {
|
|
7508
|
+
backchannelSuppressedAt = null;
|
|
7400
7509
|
appendTurnLatencyStage({
|
|
7401
7510
|
metadata: {
|
|
7402
7511
|
partial: triggeringText.slice(0, 200),
|
|
@@ -7458,6 +7567,11 @@ var createVoiceSession = (options) => {
|
|
|
7458
7567
|
};
|
|
7459
7568
|
const handleFinal = async (transcript) => {
|
|
7460
7569
|
sttReconnectCount = 0;
|
|
7570
|
+
if (backchannelBargeInGuard && backchannelSuppressedAt !== null && Date.now() - backchannelSuppressedAt < BACKCHANNEL_DROP_WINDOW_MS && isBackchannelUtterance(transcript.text.trim())) {
|
|
7571
|
+
backchannelSuppressedAt = null;
|
|
7572
|
+
return;
|
|
7573
|
+
}
|
|
7574
|
+
backchannelSuppressedAt = null;
|
|
7461
7575
|
const session = await writeSession((session2) => {
|
|
7462
7576
|
const alreadyPresent = session2.currentTurn.transcripts.some((existing) => existing.id === transcript.id);
|
|
7463
7577
|
if (!alreadyPresent) {
|