@absolutejs/voice 0.0.22-beta.606 → 0.0.22-beta.608
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/dist/core/backchannel.d.ts +1 -0
- package/dist/core/types.d.ts +16 -0
- package/dist/index.js +118 -7
- package/dist/testing/index.js +117 -7
- package/package.json +1 -1
|
@@ -21,4 +21,5 @@ export type VoiceBackchannelDriver = {
|
|
|
21
21
|
noteSilence: (timestampMs?: number) => void;
|
|
22
22
|
reset: () => void;
|
|
23
23
|
};
|
|
24
|
+
/**
 * Returns true when `text` consists solely of a short listening cue
 * (for example "mm-hm", "yeah", "got it") rather than substantive speech.
 * `maxWords` caps the number of words accepted by the word-by-word check.
 */
export declare const isBackchannelUtterance: (text: string, maxWords?: number) => boolean;
|
|
24
25
|
export declare const createVoiceBackchannelDriver: (options: VoiceBackchannelDriverOptions) => VoiceBackchannelDriver;
|
package/dist/core/types.d.ts
CHANGED
|
@@ -802,6 +802,14 @@ export type VoicePluginConfig<TContext = unknown, TSession extends VoiceSessionR
|
|
|
802
802
|
turnDetection?: VoiceTurnDetectionConfig;
|
|
803
803
|
semanticTurnDetector?: import("./semanticTurn").VoiceSemanticTurnDetector;
|
|
804
804
|
bargeInMinPartialWords?: number;
|
|
805
|
+
/**
|
|
806
|
+
* When true, a pure listening cue ("mm-hm", "yeah", "right", "got it") spoken
|
|
807
|
+
* WHILE the assistant is talking does NOT barge-in — the assistant keeps going
|
|
808
|
+
* and the cue is dropped so it never becomes the caller's next turn. A bare
|
|
809
|
+
* "yeah" said AFTER the assistant finishes is a normal answer, unaffected.
|
|
810
|
+
* Default false (any in-speech words interrupt, the prior behavior).
|
|
811
|
+
*/
|
|
812
|
+
backchannelBargeInGuard?: boolean;
|
|
805
813
|
fillerPhrases?: ReadonlyArray<string>;
|
|
806
814
|
fillerDelayMs?: number;
|
|
807
815
|
fillerFor?: (input: {
|
|
@@ -975,6 +983,14 @@ export type CreateVoiceSessionOptions<TContext = unknown, TSession extends Voice
|
|
|
975
983
|
* Word splitting is whitespace-based. Punctuation is left attached.
|
|
976
984
|
*/
|
|
977
985
|
bargeInMinPartialWords?: number;
|
|
986
|
+
/**
|
|
987
|
+
* When true, a pure listening cue ("mm-hm", "yeah", "right", "got it") spoken
|
|
988
|
+
* WHILE the assistant is talking does NOT barge-in — the assistant keeps going
|
|
989
|
+
* and the cue is dropped so it never becomes the caller's next turn. A bare
|
|
990
|
+
* "yeah" said AFTER the assistant finishes is a normal answer, unaffected.
|
|
991
|
+
* Default false (any in-speech words interrupt, the prior behavior).
|
|
992
|
+
*/
|
|
993
|
+
backchannelBargeInGuard?: boolean;
|
|
978
994
|
fillerPhrases?: ReadonlyArray<string>;
|
|
979
995
|
/** Milliseconds after turn-commit before the filler fires. Default 250ms — short enough to feel instant, long enough to skip if the LLM is very fast. */
|
|
980
996
|
fillerDelayMs?: number;
|
package/dist/index.js
CHANGED
|
@@ -3118,6 +3118,100 @@ var DEFAULT_CUES = [
|
|
|
3118
3118
|
{ text: "right" },
|
|
3119
3119
|
{ text: "go on" }
|
|
3120
3120
|
];
|
|
3121
|
+
/**
 * Single words treated as pure listening cues. An utterance made up only of
 * these words (up to the caller's word cap) is classified as a backchannel.
 *
 * Note: "uh-huh" can never match as a single token because normalization turns
 * hyphens into spaces; the hyphenated form is recognised through the
 * "uh huh" phrase and the separate "uh" / "huh" tokens instead.
 */
var BACKCHANNEL_TOKENS = new Set([
  "mm", "mmm", "mhm", "mmhm", "mmhmm", "hm", "hmm",
  "uh-huh", "uhhuh", "uh", "huh", "ah", "oh",
  "yeah", "yep", "yup", "yes", "ya", "yah",
  "ok", "okay", "k", "kay",
  "right", "sure", "totally", "exactly", "absolutely", "definitely",
  "gotcha", "cool", "nice", "wow", "true", "fair", "aha",
  "perfect", "awesome", "great", "good", "wonderful", "amazing",
  "interesting", "understood", "agreed"
]);

/**
 * Multi-word listening cues, stored in normalized form (lowercase, single
 * spaces, no punctuation other than apostrophes). Matched against the whole
 * normalized utterance.
 */
var BACKCHANNEL_PHRASES = new Set([
  "i see", "got it", "makes sense", "of course", "for sure", "fair enough",
  "sounds good", "i know", "oh ok", "oh okay", "that's right", "thats right",
  "oh wow", "oh nice", "oh cool", "uh huh", "mm hm", "mm hmm",
  "i hear you", "for real", "no way", "makes total sense", "got you",
  "i get it", "right right", "yeah yeah", "ok ok", "oh i see", "oh got it",
  "yeah totally", "yeah exactly"
]);

/**
 * Decide whether a transcript is nothing more than a listening cue.
 *
 * The text is lowercased, typographic apostrophes are folded to "'", every
 * character outside a-z and "'" becomes a space, and whitespace is collapsed.
 * The result is a backchannel when it is a known phrase, or when it has at
 * most `maxWords` words and every word is a known cue token.
 *
 * @param text Raw transcript text.
 * @param maxWords Upper bound on words for the word-by-word check (default 3).
 *   The exact-phrase lookup runs first and is not limited by this value.
 * @returns true for a pure listening cue, false otherwise (including empty input).
 */
var isBackchannelUtterance = (text, maxWords = 3) => {
  const normalized = text
    .toLowerCase()
    // Fold typographic apostrophes to ASCII first; otherwise "that’s right"
    // would be split into "that s right" and never match the phrase list.
    .replace(/[\u2018\u2019\u02bc]/g, "'")
    .replace(/[^a-z']/g, " ")
    .replace(/\s+/g, " ")
    .trim();
  if (!normalized) {
    return false;
  }
  if (BACKCHANNEL_PHRASES.has(normalized)) {
    return true;
  }
  const words = normalized.split(" ");
  if (words.length > maxWords) {
    return false;
  }
  return words.every((word) => BACKCHANNEL_TOKENS.has(word));
};
|
|
3121
3215
|
var createVoiceBackchannelDriver = (options) => {
|
|
3122
3216
|
const cues = options.cues ?? DEFAULT_CUES;
|
|
3123
3217
|
const minSpeechMs = options.minSpeechMs ?? 2500;
|
|
@@ -3757,6 +3851,7 @@ var MAX_TTS_CHUNK_CHARS = 320;
|
|
|
3757
3851
|
var STREAM_SENTENCE_END = /[.!?\u2026]['")\]]*$/;
|
|
3758
3852
|
var STREAM_IDLE_FLUSH_MS = 350;
|
|
3759
3853
|
var SPECULATIVE_DELAY_MS = 500;
|
|
3854
|
+
// Window in milliseconds after a suppressed backchannel partial during which a
// final transcript that is itself a backchannel is dropped by handleFinal.
var BACKCHANNEL_DROP_WINDOW_MS = 2000;
|
|
3760
3855
|
var nextSpeakableBoundary = (buffer) => {
|
|
3761
3856
|
const match = STREAM_SENTENCE_BOUNDARY.exec(buffer);
|
|
3762
3857
|
return match ? match.index + match[0].length : -1;
|
|
@@ -3996,6 +4091,8 @@ var createVoiceSession = (options) => {
|
|
|
3996
4091
|
const fillerPhrases = (options.fillerPhrases ?? []).filter((p) => typeof p === "string" && p.trim().length > 0);
|
|
3997
4092
|
const fillerDelayMs = options.fillerDelayMs ?? 250;
|
|
3998
4093
|
const bargeInMinPartialWords = Math.max(1, options.bargeInMinPartialWords ?? 1);
|
|
4094
|
+
const backchannelBargeInGuard = options.backchannelBargeInGuard ?? false;
|
|
4095
|
+
let backchannelSuppressedAt = null;
|
|
3999
4096
|
const fillerFor = options.fillerFor;
|
|
4000
4097
|
const fillerForTimeoutMs = options.fillerForTimeoutMs ?? 600;
|
|
4001
4098
|
const currentTurnAudio = [];
|
|
@@ -4496,18 +4593,14 @@ var createVoiceSession = (options) => {
|
|
|
4496
4593
|
text: pendingText,
|
|
4497
4594
|
transcripts: session.currentTurn.transcripts
|
|
4498
4595
|
};
|
|
4499
|
-
const startedAt = Date.now();
|
|
4500
4596
|
const speculate = options.route.speculate;
|
|
4501
4597
|
const promise = Promise.resolve(speculate({
|
|
4502
4598
|
api,
|
|
4503
4599
|
context: options.context,
|
|
4504
4600
|
session,
|
|
4505
4601
|
turn: provisionalTurn
|
|
4506
|
-
})).then((result) => {
|
|
4507
|
-
console.info(`[voice][p3] speculate
|
|
4508
|
-
return result && result.text.trim() ? { text: result.text } : null;
|
|
4509
|
-
}).catch((error) => {
|
|
4510
|
-
console.info(`[voice][p3] speculate error: ${error instanceof Error ? error.message : String(error)}`);
|
|
4602
|
+
})).then((result) => result && result.text.trim() ? { text: result.text } : null).catch((error) => {
|
|
4603
|
+
console.info(`[voice][p3] speculate error session=${session.id}: ${error instanceof Error ? error.message : String(error)}`);
|
|
4511
4604
|
return null;
|
|
4512
4605
|
});
|
|
4513
4606
|
speculation = { pendingText, promise };
|
|
@@ -5173,7 +5266,19 @@ var createVoiceSession = (options) => {
|
|
|
5173
5266
|
const triggeringText = transcript.text.trim();
|
|
5174
5267
|
if (triggeringText) {
|
|
5175
5268
|
const wordCount = triggeringText.split(/\s+/).length;
|
|
5176
|
-
if (wordCount >= bargeInMinPartialWords) {
|
|
5269
|
+
if (wordCount >= bargeInMinPartialWords && backchannelBargeInGuard && isBackchannelUtterance(triggeringText)) {
|
|
5270
|
+
backchannelSuppressedAt = Date.now();
|
|
5271
|
+
appendTurnLatencyStage({
|
|
5272
|
+
metadata: {
|
|
5273
|
+
partial: triggeringText.slice(0, 200),
|
|
5274
|
+
reason: "backchannel",
|
|
5275
|
+
wordCount
|
|
5276
|
+
},
|
|
5277
|
+
stage: "barge_in_suppressed",
|
|
5278
|
+
turnId: activeTTSTurnId
|
|
5279
|
+
}).catch(() => {});
|
|
5280
|
+
} else if (wordCount >= bargeInMinPartialWords) {
|
|
5281
|
+
backchannelSuppressedAt = null;
|
|
5177
5282
|
appendTurnLatencyStage({
|
|
5178
5283
|
metadata: {
|
|
5179
5284
|
partial: triggeringText.slice(0, 200),
|
|
@@ -5235,6 +5340,11 @@ var createVoiceSession = (options) => {
|
|
|
5235
5340
|
};
|
|
5236
5341
|
const handleFinal = async (transcript) => {
|
|
5237
5342
|
sttReconnectCount = 0;
|
|
5343
|
+
if (backchannelBargeInGuard && backchannelSuppressedAt !== null && Date.now() - backchannelSuppressedAt < BACKCHANNEL_DROP_WINDOW_MS && isBackchannelUtterance(transcript.text.trim())) {
|
|
5344
|
+
backchannelSuppressedAt = null;
|
|
5345
|
+
return;
|
|
5346
|
+
}
|
|
5347
|
+
backchannelSuppressedAt = null;
|
|
5238
5348
|
const session = await writeSession((session2) => {
|
|
5239
5349
|
const alreadyPresent = session2.currentTurn.transcripts.some((existing) => existing.id === transcript.id);
|
|
5240
5350
|
if (!alreadyPresent) {
|
|
@@ -39585,6 +39695,7 @@ var voice = (config) => {
|
|
|
39585
39695
|
sttLifecycle: sessionOptions.sttLifecycle,
|
|
39586
39696
|
...config.semanticTurnDetector ? { semanticTurnDetector: config.semanticTurnDetector } : {},
|
|
39587
39697
|
...config.bargeInMinPartialWords !== undefined ? { bargeInMinPartialWords: config.bargeInMinPartialWords } : {},
|
|
39698
|
+
...config.backchannelBargeInGuard !== undefined ? { backchannelBargeInGuard: config.backchannelBargeInGuard } : {},
|
|
39588
39699
|
...config.fillerPhrases ? { fillerPhrases: config.fillerPhrases } : {},
|
|
39589
39700
|
...config.fillerDelayMs !== undefined ? { fillerDelayMs: config.fillerDelayMs } : {},
|
|
39590
39701
|
...config.fillerFor ? { fillerFor: config.fillerFor } : {},
|
package/dist/testing/index.js
CHANGED
|
@@ -5450,6 +5450,100 @@ var DEFAULT_CUES = [
|
|
|
5450
5450
|
{ text: "right" },
|
|
5451
5451
|
{ text: "go on" }
|
|
5452
5452
|
];
|
|
5453
|
+
/**
 * Single words treated as pure listening cues. An utterance made up only of
 * these words (up to the caller's word cap) is classified as a backchannel.
 *
 * Note: "uh-huh" can never match as a single token because normalization turns
 * hyphens into spaces; the hyphenated form is recognised through the
 * "uh huh" phrase and the separate "uh" / "huh" tokens instead.
 */
var BACKCHANNEL_TOKENS = new Set([
  "mm", "mmm", "mhm", "mmhm", "mmhmm", "hm", "hmm",
  "uh-huh", "uhhuh", "uh", "huh", "ah", "oh",
  "yeah", "yep", "yup", "yes", "ya", "yah",
  "ok", "okay", "k", "kay",
  "right", "sure", "totally", "exactly", "absolutely", "definitely",
  "gotcha", "cool", "nice", "wow", "true", "fair", "aha",
  "perfect", "awesome", "great", "good", "wonderful", "amazing",
  "interesting", "understood", "agreed"
]);

/**
 * Multi-word listening cues, stored in normalized form (lowercase, single
 * spaces, no punctuation other than apostrophes). Matched against the whole
 * normalized utterance.
 */
var BACKCHANNEL_PHRASES = new Set([
  "i see", "got it", "makes sense", "of course", "for sure", "fair enough",
  "sounds good", "i know", "oh ok", "oh okay", "that's right", "thats right",
  "oh wow", "oh nice", "oh cool", "uh huh", "mm hm", "mm hmm",
  "i hear you", "for real", "no way", "makes total sense", "got you",
  "i get it", "right right", "yeah yeah", "ok ok", "oh i see", "oh got it",
  "yeah totally", "yeah exactly"
]);

/**
 * Decide whether a transcript is nothing more than a listening cue.
 *
 * The text is lowercased, typographic apostrophes are folded to "'", every
 * character outside a-z and "'" becomes a space, and whitespace is collapsed.
 * The result is a backchannel when it is a known phrase, or when it has at
 * most `maxWords` words and every word is a known cue token.
 *
 * @param text Raw transcript text.
 * @param maxWords Upper bound on words for the word-by-word check (default 3).
 *   The exact-phrase lookup runs first and is not limited by this value.
 * @returns true for a pure listening cue, false otherwise (including empty input).
 */
var isBackchannelUtterance = (text, maxWords = 3) => {
  const normalized = text
    .toLowerCase()
    // Fold typographic apostrophes to ASCII first; otherwise "that’s right"
    // would be split into "that s right" and never match the phrase list.
    .replace(/[\u2018\u2019\u02bc]/g, "'")
    .replace(/[^a-z']/g, " ")
    .replace(/\s+/g, " ")
    .trim();
  if (!normalized) {
    return false;
  }
  if (BACKCHANNEL_PHRASES.has(normalized)) {
    return true;
  }
  const words = normalized.split(" ");
  if (words.length > maxWords) {
    return false;
  }
  return words.every((word) => BACKCHANNEL_TOKENS.has(word));
};
|
|
5453
5547
|
var createVoiceBackchannelDriver = (options) => {
|
|
5454
5548
|
const cues = options.cues ?? DEFAULT_CUES;
|
|
5455
5549
|
const minSpeechMs = options.minSpeechMs ?? 2500;
|
|
@@ -5984,6 +6078,7 @@ var MAX_TTS_CHUNK_CHARS = 320;
|
|
|
5984
6078
|
var STREAM_SENTENCE_END = /[.!?\u2026]['")\]]*$/;
|
|
5985
6079
|
var STREAM_IDLE_FLUSH_MS = 350;
|
|
5986
6080
|
var SPECULATIVE_DELAY_MS = 500;
|
|
6081
|
+
// Window in milliseconds after a suppressed backchannel partial during which a
// final transcript that is itself a backchannel is dropped by handleFinal.
var BACKCHANNEL_DROP_WINDOW_MS = 2000;
|
|
5987
6082
|
var nextSpeakableBoundary = (buffer) => {
|
|
5988
6083
|
const match = STREAM_SENTENCE_BOUNDARY.exec(buffer);
|
|
5989
6084
|
return match ? match.index + match[0].length : -1;
|
|
@@ -6223,6 +6318,8 @@ var createVoiceSession = (options) => {
|
|
|
6223
6318
|
const fillerPhrases = (options.fillerPhrases ?? []).filter((p) => typeof p === "string" && p.trim().length > 0);
|
|
6224
6319
|
const fillerDelayMs = options.fillerDelayMs ?? 250;
|
|
6225
6320
|
const bargeInMinPartialWords = Math.max(1, options.bargeInMinPartialWords ?? 1);
|
|
6321
|
+
const backchannelBargeInGuard = options.backchannelBargeInGuard ?? false;
|
|
6322
|
+
let backchannelSuppressedAt = null;
|
|
6226
6323
|
const fillerFor = options.fillerFor;
|
|
6227
6324
|
const fillerForTimeoutMs = options.fillerForTimeoutMs ?? 600;
|
|
6228
6325
|
const currentTurnAudio = [];
|
|
@@ -6723,18 +6820,14 @@ var createVoiceSession = (options) => {
|
|
|
6723
6820
|
text: pendingText,
|
|
6724
6821
|
transcripts: session.currentTurn.transcripts
|
|
6725
6822
|
};
|
|
6726
|
-
const startedAt = Date.now();
|
|
6727
6823
|
const speculate = options.route.speculate;
|
|
6728
6824
|
const promise = Promise.resolve(speculate({
|
|
6729
6825
|
api,
|
|
6730
6826
|
context: options.context,
|
|
6731
6827
|
session,
|
|
6732
6828
|
turn: provisionalTurn
|
|
6733
|
-
})).then((result) => {
|
|
6734
|
-
console.info(`[voice][p3] speculate
|
|
6735
|
-
return result && result.text.trim() ? { text: result.text } : null;
|
|
6736
|
-
}).catch((error) => {
|
|
6737
|
-
console.info(`[voice][p3] speculate error: ${error instanceof Error ? error.message : String(error)}`);
|
|
6829
|
+
})).then((result) => result && result.text.trim() ? { text: result.text } : null).catch((error) => {
|
|
6830
|
+
console.info(`[voice][p3] speculate error session=${session.id}: ${error instanceof Error ? error.message : String(error)}`);
|
|
6738
6831
|
return null;
|
|
6739
6832
|
});
|
|
6740
6833
|
speculation = { pendingText, promise };
|
|
@@ -7400,7 +7493,19 @@ var createVoiceSession = (options) => {
|
|
|
7400
7493
|
const triggeringText = transcript.text.trim();
|
|
7401
7494
|
if (triggeringText) {
|
|
7402
7495
|
const wordCount = triggeringText.split(/\s+/).length;
|
|
7403
|
-
if (wordCount >= bargeInMinPartialWords) {
|
|
7496
|
+
if (wordCount >= bargeInMinPartialWords && backchannelBargeInGuard && isBackchannelUtterance(triggeringText)) {
|
|
7497
|
+
backchannelSuppressedAt = Date.now();
|
|
7498
|
+
appendTurnLatencyStage({
|
|
7499
|
+
metadata: {
|
|
7500
|
+
partial: triggeringText.slice(0, 200),
|
|
7501
|
+
reason: "backchannel",
|
|
7502
|
+
wordCount
|
|
7503
|
+
},
|
|
7504
|
+
stage: "barge_in_suppressed",
|
|
7505
|
+
turnId: activeTTSTurnId
|
|
7506
|
+
}).catch(() => {});
|
|
7507
|
+
} else if (wordCount >= bargeInMinPartialWords) {
|
|
7508
|
+
backchannelSuppressedAt = null;
|
|
7404
7509
|
appendTurnLatencyStage({
|
|
7405
7510
|
metadata: {
|
|
7406
7511
|
partial: triggeringText.slice(0, 200),
|
|
@@ -7462,6 +7567,11 @@ var createVoiceSession = (options) => {
|
|
|
7462
7567
|
};
|
|
7463
7568
|
const handleFinal = async (transcript) => {
|
|
7464
7569
|
sttReconnectCount = 0;
|
|
7570
|
+
if (backchannelBargeInGuard && backchannelSuppressedAt !== null && Date.now() - backchannelSuppressedAt < BACKCHANNEL_DROP_WINDOW_MS && isBackchannelUtterance(transcript.text.trim())) {
|
|
7571
|
+
backchannelSuppressedAt = null;
|
|
7572
|
+
return;
|
|
7573
|
+
}
|
|
7574
|
+
backchannelSuppressedAt = null;
|
|
7465
7575
|
const session = await writeSession((session2) => {
|
|
7466
7576
|
const alreadyPresent = session2.currentTurn.transcripts.some((existing) => existing.id === transcript.id);
|
|
7467
7577
|
if (!alreadyPresent) {
|