@absolutejs/voice 0.0.22-beta.607 → 0.0.22-beta.609
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/dist/core/agent.d.ts +2 -0
- package/dist/core/backchannel.d.ts +1 -0
- package/dist/core/types.d.ts +17 -0
- package/dist/index.js +136 -8
- package/dist/testing/index.js +133 -7
- package/package.json +1 -1
package/dist/core/agent.d.ts
CHANGED
|
@@ -63,6 +63,7 @@ export type VoiceAgentModelInput<TContext = unknown, TSession extends VoiceSessi
|
|
|
63
63
|
onTextDelta?: (delta: string) => void;
|
|
64
64
|
session: TSession;
|
|
65
65
|
system?: string;
|
|
66
|
+
signal?: AbortSignal;
|
|
66
67
|
tools: Array<{
|
|
67
68
|
description?: string;
|
|
68
69
|
name: string;
|
|
@@ -148,6 +149,7 @@ export type VoiceAgent<TContext = unknown, TSession extends VoiceSessionRecord =
|
|
|
148
149
|
session: TSession;
|
|
149
150
|
system?: string;
|
|
150
151
|
turn: VoiceTurnRecord;
|
|
152
|
+
signal?: AbortSignal;
|
|
151
153
|
}) => Promise<{
|
|
152
154
|
text: string;
|
|
153
155
|
} | null>;
|
|
@@ -21,4 +21,5 @@ export type VoiceBackchannelDriver = {
|
|
|
21
21
|
noteSilence: (timestampMs?: number) => void;
|
|
22
22
|
reset: () => void;
|
|
23
23
|
};
|
|
24
|
+
export declare const isBackchannelUtterance: (text: string, maxWords?: number) => boolean;
|
|
24
25
|
export declare const createVoiceBackchannelDriver: (options: VoiceBackchannelDriverOptions) => VoiceBackchannelDriver;
|
package/dist/core/types.d.ts
CHANGED
|
@@ -631,6 +631,7 @@ export type VoiceRouteConfig<TContext = unknown, TSession extends VoiceSessionRe
|
|
|
631
631
|
context: TContext;
|
|
632
632
|
session: TSession;
|
|
633
633
|
turn: VoiceTurnRecord;
|
|
634
|
+
signal?: AbortSignal;
|
|
634
635
|
}) => Promise<{
|
|
635
636
|
text: string;
|
|
636
637
|
} | null | void>;
|
|
@@ -802,6 +803,14 @@ export type VoicePluginConfig<TContext = unknown, TSession extends VoiceSessionR
|
|
|
802
803
|
turnDetection?: VoiceTurnDetectionConfig;
|
|
803
804
|
semanticTurnDetector?: import("./semanticTurn").VoiceSemanticTurnDetector;
|
|
804
805
|
bargeInMinPartialWords?: number;
|
|
806
|
+
/**
|
|
807
|
+
* When true, a pure listening cue ("mm-hm", "yeah", "right", "got it") spoken
|
|
808
|
+
* WHILE the assistant is talking does NOT barge-in — the assistant keeps going
|
|
809
|
+
* and the cue is dropped so it never becomes the caller's next turn. A bare
|
|
810
|
+
* "yeah" said AFTER the assistant finishes is a normal answer, unaffected.
|
|
811
|
+
* Default false (any in-speech words interrupt, the prior behavior).
|
|
812
|
+
*/
|
|
813
|
+
backchannelBargeInGuard?: boolean;
|
|
805
814
|
fillerPhrases?: ReadonlyArray<string>;
|
|
806
815
|
fillerDelayMs?: number;
|
|
807
816
|
fillerFor?: (input: {
|
|
@@ -975,6 +984,14 @@ export type CreateVoiceSessionOptions<TContext = unknown, TSession extends Voice
|
|
|
975
984
|
* Word splitting is whitespace-based. Punctuation is left attached.
|
|
976
985
|
*/
|
|
977
986
|
bargeInMinPartialWords?: number;
|
|
987
|
+
/**
|
|
988
|
+
* When true, a pure listening cue ("mm-hm", "yeah", "right", "got it") spoken
|
|
989
|
+
* WHILE the assistant is talking does NOT barge-in — the assistant keeps going
|
|
990
|
+
* and the cue is dropped so it never becomes the caller's next turn. A bare
|
|
991
|
+
* "yeah" said AFTER the assistant finishes is a normal answer, unaffected.
|
|
992
|
+
* Default false (any in-speech words interrupt, the prior behavior).
|
|
993
|
+
*/
|
|
994
|
+
backchannelBargeInGuard?: boolean;
|
|
978
995
|
fillerPhrases?: ReadonlyArray<string>;
|
|
979
996
|
/** Milliseconds after turn-commit before the filler fires. Default 250ms — short enough to feel instant, long enough to skip if the LLM is very fast. */
|
|
980
997
|
fillerDelayMs?: number;
|
package/dist/index.js
CHANGED
|
@@ -3118,6 +3118,100 @@ var DEFAULT_CUES = [
|
|
|
3118
3118
|
{ text: "right" },
|
|
3119
3119
|
{ text: "go on" }
|
|
3120
3120
|
];
|
|
3121
|
+
var BACKCHANNEL_TOKENS = new Set([
|
|
3122
|
+
"mm",
|
|
3123
|
+
"mmm",
|
|
3124
|
+
"mhm",
|
|
3125
|
+
"mmhm",
|
|
3126
|
+
"mmhmm",
|
|
3127
|
+
"hm",
|
|
3128
|
+
"hmm",
|
|
3129
|
+
"uh-huh",
|
|
3130
|
+
"uhhuh",
|
|
3131
|
+
"uh",
|
|
3132
|
+
"huh",
|
|
3133
|
+
"ah",
|
|
3134
|
+
"oh",
|
|
3135
|
+
"yeah",
|
|
3136
|
+
"yep",
|
|
3137
|
+
"yup",
|
|
3138
|
+
"yes",
|
|
3139
|
+
"ya",
|
|
3140
|
+
"yah",
|
|
3141
|
+
"ok",
|
|
3142
|
+
"okay",
|
|
3143
|
+
"k",
|
|
3144
|
+
"kay",
|
|
3145
|
+
"right",
|
|
3146
|
+
"sure",
|
|
3147
|
+
"totally",
|
|
3148
|
+
"exactly",
|
|
3149
|
+
"absolutely",
|
|
3150
|
+
"definitely",
|
|
3151
|
+
"gotcha",
|
|
3152
|
+
"cool",
|
|
3153
|
+
"nice",
|
|
3154
|
+
"wow",
|
|
3155
|
+
"true",
|
|
3156
|
+
"fair",
|
|
3157
|
+
"aha",
|
|
3158
|
+
"perfect",
|
|
3159
|
+
"awesome",
|
|
3160
|
+
"great",
|
|
3161
|
+
"good",
|
|
3162
|
+
"wonderful",
|
|
3163
|
+
"amazing",
|
|
3164
|
+
"interesting",
|
|
3165
|
+
"understood",
|
|
3166
|
+
"agreed"
|
|
3167
|
+
]);
|
|
3168
|
+
var BACKCHANNEL_PHRASES = new Set([
|
|
3169
|
+
"i see",
|
|
3170
|
+
"got it",
|
|
3171
|
+
"makes sense",
|
|
3172
|
+
"of course",
|
|
3173
|
+
"for sure",
|
|
3174
|
+
"fair enough",
|
|
3175
|
+
"sounds good",
|
|
3176
|
+
"i know",
|
|
3177
|
+
"oh ok",
|
|
3178
|
+
"oh okay",
|
|
3179
|
+
"that's right",
|
|
3180
|
+
"thats right",
|
|
3181
|
+
"oh wow",
|
|
3182
|
+
"oh nice",
|
|
3183
|
+
"oh cool",
|
|
3184
|
+
"uh huh",
|
|
3185
|
+
"mm hm",
|
|
3186
|
+
"mm hmm",
|
|
3187
|
+
"i hear you",
|
|
3188
|
+
"for real",
|
|
3189
|
+
"no way",
|
|
3190
|
+
"makes total sense",
|
|
3191
|
+
"got you",
|
|
3192
|
+
"i get it",
|
|
3193
|
+
"right right",
|
|
3194
|
+
"yeah yeah",
|
|
3195
|
+
"ok ok",
|
|
3196
|
+
"oh i see",
|
|
3197
|
+
"oh got it",
|
|
3198
|
+
"yeah totally",
|
|
3199
|
+
"yeah exactly"
|
|
3200
|
+
]);
|
|
3201
|
+
var isBackchannelUtterance = (text, maxWords = 3) => {
|
|
3202
|
+
const normalized = text.toLowerCase().replace(/[^a-z']/g, " ").replace(/\s+/g, " ").trim();
|
|
3203
|
+
if (!normalized) {
|
|
3204
|
+
return false;
|
|
3205
|
+
}
|
|
3206
|
+
if (BACKCHANNEL_PHRASES.has(normalized)) {
|
|
3207
|
+
return true;
|
|
3208
|
+
}
|
|
3209
|
+
const words = normalized.split(" ");
|
|
3210
|
+
if (words.length > maxWords) {
|
|
3211
|
+
return false;
|
|
3212
|
+
}
|
|
3213
|
+
return words.every((word) => BACKCHANNEL_TOKENS.has(word));
|
|
3214
|
+
};
|
|
3121
3215
|
var createVoiceBackchannelDriver = (options) => {
|
|
3122
3216
|
const cues = options.cues ?? DEFAULT_CUES;
|
|
3123
3217
|
const minSpeechMs = options.minSpeechMs ?? 2500;
|
|
@@ -3757,6 +3851,8 @@ var MAX_TTS_CHUNK_CHARS = 320;
|
|
|
3757
3851
|
var STREAM_SENTENCE_END = /[.!?\u2026]['")\]]*$/;
|
|
3758
3852
|
var STREAM_IDLE_FLUSH_MS = 350;
|
|
3759
3853
|
var SPECULATIVE_DELAY_MS = 500;
|
|
3854
|
+
var SPECULATION_ADOPT_TIMEOUT_MS = 6000;
|
|
3855
|
+
var BACKCHANNEL_DROP_WINDOW_MS = 2000;
|
|
3760
3856
|
var nextSpeakableBoundary = (buffer) => {
|
|
3761
3857
|
const match = STREAM_SENTENCE_BOUNDARY.exec(buffer);
|
|
3762
3858
|
return match ? match.index + match[0].length : -1;
|
|
@@ -3996,6 +4092,8 @@ var createVoiceSession = (options) => {
|
|
|
3996
4092
|
const fillerPhrases = (options.fillerPhrases ?? []).filter((p) => typeof p === "string" && p.trim().length > 0);
|
|
3997
4093
|
const fillerDelayMs = options.fillerDelayMs ?? 250;
|
|
3998
4094
|
const bargeInMinPartialWords = Math.max(1, options.bargeInMinPartialWords ?? 1);
|
|
4095
|
+
const backchannelBargeInGuard = options.backchannelBargeInGuard ?? false;
|
|
4096
|
+
let backchannelSuppressedAt = null;
|
|
3999
4097
|
const fillerFor = options.fillerFor;
|
|
4000
4098
|
const fillerForTimeoutMs = options.fillerForTimeoutMs ?? 600;
|
|
4001
4099
|
const currentTurnAudio = [];
|
|
@@ -4475,6 +4573,7 @@ var createVoiceSession = (options) => {
|
|
|
4475
4573
|
}, delayMs);
|
|
4476
4574
|
};
|
|
4477
4575
|
const clearSpeculation = () => {
|
|
4576
|
+
speculation?.controller.abort();
|
|
4478
4577
|
speculation = null;
|
|
4479
4578
|
speculationAttempted = false;
|
|
4480
4579
|
};
|
|
@@ -4497,16 +4596,18 @@ var createVoiceSession = (options) => {
|
|
|
4497
4596
|
transcripts: session.currentTurn.transcripts
|
|
4498
4597
|
};
|
|
4499
4598
|
const speculate = options.route.speculate;
|
|
4599
|
+
const controller = new AbortController;
|
|
4500
4600
|
const promise = Promise.resolve(speculate({
|
|
4501
4601
|
api,
|
|
4502
4602
|
context: options.context,
|
|
4503
4603
|
session,
|
|
4504
|
-
turn: provisionalTurn
|
|
4604
|
+
turn: provisionalTurn,
|
|
4605
|
+
signal: controller.signal
|
|
4505
4606
|
})).then((result) => result && result.text.trim() ? { text: result.text } : null).catch((error) => {
|
|
4506
4607
|
console.info(`[voice][p3] speculate error session=${session.id}: ${error instanceof Error ? error.message : String(error)}`);
|
|
4507
4608
|
return null;
|
|
4508
4609
|
});
|
|
4509
|
-
speculation = { pendingText, promise };
|
|
4610
|
+
speculation = { controller, pendingText, promise };
|
|
4510
4611
|
};
|
|
4511
4612
|
const scheduleSilenceCommit = (delayMs = adaptiveSilenceMs(), reset = true) => {
|
|
4512
4613
|
scheduleTurnCommit(delayMs, "silence", reset);
|
|
@@ -5169,7 +5270,19 @@ var createVoiceSession = (options) => {
|
|
|
5169
5270
|
const triggeringText = transcript.text.trim();
|
|
5170
5271
|
if (triggeringText) {
|
|
5171
5272
|
const wordCount = triggeringText.split(/\s+/).length;
|
|
5172
|
-
if (wordCount >= bargeInMinPartialWords) {
|
|
5273
|
+
if (wordCount >= bargeInMinPartialWords && backchannelBargeInGuard && isBackchannelUtterance(triggeringText)) {
|
|
5274
|
+
backchannelSuppressedAt = Date.now();
|
|
5275
|
+
appendTurnLatencyStage({
|
|
5276
|
+
metadata: {
|
|
5277
|
+
partial: triggeringText.slice(0, 200),
|
|
5278
|
+
reason: "backchannel",
|
|
5279
|
+
wordCount
|
|
5280
|
+
},
|
|
5281
|
+
stage: "barge_in_suppressed",
|
|
5282
|
+
turnId: activeTTSTurnId
|
|
5283
|
+
}).catch(() => {});
|
|
5284
|
+
} else if (wordCount >= bargeInMinPartialWords) {
|
|
5285
|
+
backchannelSuppressedAt = null;
|
|
5173
5286
|
appendTurnLatencyStage({
|
|
5174
5287
|
metadata: {
|
|
5175
5288
|
partial: triggeringText.slice(0, 200),
|
|
@@ -5231,6 +5344,11 @@ var createVoiceSession = (options) => {
|
|
|
5231
5344
|
};
|
|
5232
5345
|
const handleFinal = async (transcript) => {
|
|
5233
5346
|
sttReconnectCount = 0;
|
|
5347
|
+
if (backchannelBargeInGuard && backchannelSuppressedAt !== null && Date.now() - backchannelSuppressedAt < BACKCHANNEL_DROP_WINDOW_MS && isBackchannelUtterance(transcript.text.trim())) {
|
|
5348
|
+
backchannelSuppressedAt = null;
|
|
5349
|
+
return;
|
|
5350
|
+
}
|
|
5351
|
+
backchannelSuppressedAt = null;
|
|
5234
5352
|
const session = await writeSession((session2) => {
|
|
5235
5353
|
const alreadyPresent = session2.currentTurn.transcripts.some((existing) => existing.id === transcript.id);
|
|
5236
5354
|
if (!alreadyPresent) {
|
|
@@ -5693,18 +5811,26 @@ var createVoiceSession = (options) => {
|
|
|
5693
5811
|
const onTurnStartedAt = Date.now();
|
|
5694
5812
|
logVoiceTiming(session.id, "session.commit-to-onturn", onTurnStartedAt - (turn.committedAt || onTurnStartedAt), { fillerScheduled: fillerTimer !== null });
|
|
5695
5813
|
const pendingSpeculation = speculation;
|
|
5696
|
-
|
|
5814
|
+
speculation = null;
|
|
5815
|
+
speculationAttempted = false;
|
|
5697
5816
|
let reusableSpeculation;
|
|
5698
5817
|
if (pendingSpeculation && pendingSpeculation.pendingText === turn.text) {
|
|
5699
|
-
const speculated = await
|
|
5818
|
+
const speculated = await Promise.race([
|
|
5819
|
+
pendingSpeculation.promise,
|
|
5820
|
+
new Promise((resolve) => {
|
|
5821
|
+
setTimeout(() => resolve(null), SPECULATION_ADOPT_TIMEOUT_MS);
|
|
5822
|
+
})
|
|
5823
|
+
]);
|
|
5700
5824
|
if (speculated?.text) {
|
|
5701
5825
|
reusableSpeculation = { text: speculated.text };
|
|
5702
5826
|
logVoiceTiming(session.id, "p3.adopted-speculation", 0, {
|
|
5703
5827
|
chars: speculated.text.length
|
|
5704
5828
|
});
|
|
5829
|
+
} else {
|
|
5830
|
+
pendingSpeculation.controller.abort();
|
|
5705
5831
|
}
|
|
5706
|
-
} else {
|
|
5707
|
-
pendingSpeculation
|
|
5832
|
+
} else if (pendingSpeculation) {
|
|
5833
|
+
pendingSpeculation.controller.abort();
|
|
5708
5834
|
}
|
|
5709
5835
|
try {
|
|
5710
5836
|
const onTurnPromise = options.route.onTurn({
|
|
@@ -7989,6 +8115,7 @@ var createVoiceAgent = (options) => {
|
|
|
7989
8115
|
context: input.context,
|
|
7990
8116
|
messages,
|
|
7991
8117
|
session: input.session,
|
|
8118
|
+
signal: input.signal,
|
|
7992
8119
|
system,
|
|
7993
8120
|
tools: [...LIFECYCLE_TOOLS, ...toolMap.values()].map((tool) => ({
|
|
7994
8121
|
description: tool.description,
|
|
@@ -39581,6 +39708,7 @@ var voice = (config) => {
|
|
|
39581
39708
|
sttLifecycle: sessionOptions.sttLifecycle,
|
|
39582
39709
|
...config.semanticTurnDetector ? { semanticTurnDetector: config.semanticTurnDetector } : {},
|
|
39583
39710
|
...config.bargeInMinPartialWords !== undefined ? { bargeInMinPartialWords: config.bargeInMinPartialWords } : {},
|
|
39711
|
+
...config.backchannelBargeInGuard !== undefined ? { backchannelBargeInGuard: config.backchannelBargeInGuard } : {},
|
|
39584
39712
|
...config.fillerPhrases ? { fillerPhrases: config.fillerPhrases } : {},
|
|
39585
39713
|
...config.fillerDelayMs !== undefined ? { fillerDelayMs: config.fillerDelayMs } : {},
|
|
39586
39714
|
...config.fillerFor ? { fillerFor: config.fillerFor } : {},
|
|
@@ -41257,7 +41385,7 @@ var createAIVoiceModel = (options) => ({
|
|
|
41257
41385
|
const stream = options.provider.stream({
|
|
41258
41386
|
messages: toProviderMessages(input.messages),
|
|
41259
41387
|
model: options.model,
|
|
41260
|
-
signal: options.signal,
|
|
41388
|
+
signal: input.signal ?? options.signal,
|
|
41261
41389
|
systemPrompt,
|
|
41262
41390
|
tools: toProviderTools(input.tools)
|
|
41263
41391
|
});
|
package/dist/testing/index.js
CHANGED
|
@@ -5450,6 +5450,100 @@ var DEFAULT_CUES = [
|
|
|
5450
5450
|
{ text: "right" },
|
|
5451
5451
|
{ text: "go on" }
|
|
5452
5452
|
];
|
|
5453
|
+
var BACKCHANNEL_TOKENS = new Set([
|
|
5454
|
+
"mm",
|
|
5455
|
+
"mmm",
|
|
5456
|
+
"mhm",
|
|
5457
|
+
"mmhm",
|
|
5458
|
+
"mmhmm",
|
|
5459
|
+
"hm",
|
|
5460
|
+
"hmm",
|
|
5461
|
+
"uh-huh",
|
|
5462
|
+
"uhhuh",
|
|
5463
|
+
"uh",
|
|
5464
|
+
"huh",
|
|
5465
|
+
"ah",
|
|
5466
|
+
"oh",
|
|
5467
|
+
"yeah",
|
|
5468
|
+
"yep",
|
|
5469
|
+
"yup",
|
|
5470
|
+
"yes",
|
|
5471
|
+
"ya",
|
|
5472
|
+
"yah",
|
|
5473
|
+
"ok",
|
|
5474
|
+
"okay",
|
|
5475
|
+
"k",
|
|
5476
|
+
"kay",
|
|
5477
|
+
"right",
|
|
5478
|
+
"sure",
|
|
5479
|
+
"totally",
|
|
5480
|
+
"exactly",
|
|
5481
|
+
"absolutely",
|
|
5482
|
+
"definitely",
|
|
5483
|
+
"gotcha",
|
|
5484
|
+
"cool",
|
|
5485
|
+
"nice",
|
|
5486
|
+
"wow",
|
|
5487
|
+
"true",
|
|
5488
|
+
"fair",
|
|
5489
|
+
"aha",
|
|
5490
|
+
"perfect",
|
|
5491
|
+
"awesome",
|
|
5492
|
+
"great",
|
|
5493
|
+
"good",
|
|
5494
|
+
"wonderful",
|
|
5495
|
+
"amazing",
|
|
5496
|
+
"interesting",
|
|
5497
|
+
"understood",
|
|
5498
|
+
"agreed"
|
|
5499
|
+
]);
|
|
5500
|
+
var BACKCHANNEL_PHRASES = new Set([
|
|
5501
|
+
"i see",
|
|
5502
|
+
"got it",
|
|
5503
|
+
"makes sense",
|
|
5504
|
+
"of course",
|
|
5505
|
+
"for sure",
|
|
5506
|
+
"fair enough",
|
|
5507
|
+
"sounds good",
|
|
5508
|
+
"i know",
|
|
5509
|
+
"oh ok",
|
|
5510
|
+
"oh okay",
|
|
5511
|
+
"that's right",
|
|
5512
|
+
"thats right",
|
|
5513
|
+
"oh wow",
|
|
5514
|
+
"oh nice",
|
|
5515
|
+
"oh cool",
|
|
5516
|
+
"uh huh",
|
|
5517
|
+
"mm hm",
|
|
5518
|
+
"mm hmm",
|
|
5519
|
+
"i hear you",
|
|
5520
|
+
"for real",
|
|
5521
|
+
"no way",
|
|
5522
|
+
"makes total sense",
|
|
5523
|
+
"got you",
|
|
5524
|
+
"i get it",
|
|
5525
|
+
"right right",
|
|
5526
|
+
"yeah yeah",
|
|
5527
|
+
"ok ok",
|
|
5528
|
+
"oh i see",
|
|
5529
|
+
"oh got it",
|
|
5530
|
+
"yeah totally",
|
|
5531
|
+
"yeah exactly"
|
|
5532
|
+
]);
|
|
5533
|
+
var isBackchannelUtterance = (text, maxWords = 3) => {
|
|
5534
|
+
const normalized = text.toLowerCase().replace(/[^a-z']/g, " ").replace(/\s+/g, " ").trim();
|
|
5535
|
+
if (!normalized) {
|
|
5536
|
+
return false;
|
|
5537
|
+
}
|
|
5538
|
+
if (BACKCHANNEL_PHRASES.has(normalized)) {
|
|
5539
|
+
return true;
|
|
5540
|
+
}
|
|
5541
|
+
const words = normalized.split(" ");
|
|
5542
|
+
if (words.length > maxWords) {
|
|
5543
|
+
return false;
|
|
5544
|
+
}
|
|
5545
|
+
return words.every((word) => BACKCHANNEL_TOKENS.has(word));
|
|
5546
|
+
};
|
|
5453
5547
|
var createVoiceBackchannelDriver = (options) => {
|
|
5454
5548
|
const cues = options.cues ?? DEFAULT_CUES;
|
|
5455
5549
|
const minSpeechMs = options.minSpeechMs ?? 2500;
|
|
@@ -5984,6 +6078,8 @@ var MAX_TTS_CHUNK_CHARS = 320;
|
|
|
5984
6078
|
var STREAM_SENTENCE_END = /[.!?\u2026]['")\]]*$/;
|
|
5985
6079
|
var STREAM_IDLE_FLUSH_MS = 350;
|
|
5986
6080
|
var SPECULATIVE_DELAY_MS = 500;
|
|
6081
|
+
var SPECULATION_ADOPT_TIMEOUT_MS = 6000;
|
|
6082
|
+
var BACKCHANNEL_DROP_WINDOW_MS = 2000;
|
|
5987
6083
|
var nextSpeakableBoundary = (buffer) => {
|
|
5988
6084
|
const match = STREAM_SENTENCE_BOUNDARY.exec(buffer);
|
|
5989
6085
|
return match ? match.index + match[0].length : -1;
|
|
@@ -6223,6 +6319,8 @@ var createVoiceSession = (options) => {
|
|
|
6223
6319
|
const fillerPhrases = (options.fillerPhrases ?? []).filter((p) => typeof p === "string" && p.trim().length > 0);
|
|
6224
6320
|
const fillerDelayMs = options.fillerDelayMs ?? 250;
|
|
6225
6321
|
const bargeInMinPartialWords = Math.max(1, options.bargeInMinPartialWords ?? 1);
|
|
6322
|
+
const backchannelBargeInGuard = options.backchannelBargeInGuard ?? false;
|
|
6323
|
+
let backchannelSuppressedAt = null;
|
|
6226
6324
|
const fillerFor = options.fillerFor;
|
|
6227
6325
|
const fillerForTimeoutMs = options.fillerForTimeoutMs ?? 600;
|
|
6228
6326
|
const currentTurnAudio = [];
|
|
@@ -6702,6 +6800,7 @@ var createVoiceSession = (options) => {
|
|
|
6702
6800
|
}, delayMs);
|
|
6703
6801
|
};
|
|
6704
6802
|
const clearSpeculation = () => {
|
|
6803
|
+
speculation?.controller.abort();
|
|
6705
6804
|
speculation = null;
|
|
6706
6805
|
speculationAttempted = false;
|
|
6707
6806
|
};
|
|
@@ -6724,16 +6823,18 @@ var createVoiceSession = (options) => {
|
|
|
6724
6823
|
transcripts: session.currentTurn.transcripts
|
|
6725
6824
|
};
|
|
6726
6825
|
const speculate = options.route.speculate;
|
|
6826
|
+
const controller = new AbortController;
|
|
6727
6827
|
const promise = Promise.resolve(speculate({
|
|
6728
6828
|
api,
|
|
6729
6829
|
context: options.context,
|
|
6730
6830
|
session,
|
|
6731
|
-
turn: provisionalTurn
|
|
6831
|
+
turn: provisionalTurn,
|
|
6832
|
+
signal: controller.signal
|
|
6732
6833
|
})).then((result) => result && result.text.trim() ? { text: result.text } : null).catch((error) => {
|
|
6733
6834
|
console.info(`[voice][p3] speculate error session=${session.id}: ${error instanceof Error ? error.message : String(error)}`);
|
|
6734
6835
|
return null;
|
|
6735
6836
|
});
|
|
6736
|
-
speculation = { pendingText, promise };
|
|
6837
|
+
speculation = { controller, pendingText, promise };
|
|
6737
6838
|
};
|
|
6738
6839
|
const scheduleSilenceCommit = (delayMs = adaptiveSilenceMs(), reset = true) => {
|
|
6739
6840
|
scheduleTurnCommit(delayMs, "silence", reset);
|
|
@@ -7396,7 +7497,19 @@ var createVoiceSession = (options) => {
|
|
|
7396
7497
|
const triggeringText = transcript.text.trim();
|
|
7397
7498
|
if (triggeringText) {
|
|
7398
7499
|
const wordCount = triggeringText.split(/\s+/).length;
|
|
7399
|
-
if (wordCount >= bargeInMinPartialWords) {
|
|
7500
|
+
if (wordCount >= bargeInMinPartialWords && backchannelBargeInGuard && isBackchannelUtterance(triggeringText)) {
|
|
7501
|
+
backchannelSuppressedAt = Date.now();
|
|
7502
|
+
appendTurnLatencyStage({
|
|
7503
|
+
metadata: {
|
|
7504
|
+
partial: triggeringText.slice(0, 200),
|
|
7505
|
+
reason: "backchannel",
|
|
7506
|
+
wordCount
|
|
7507
|
+
},
|
|
7508
|
+
stage: "barge_in_suppressed",
|
|
7509
|
+
turnId: activeTTSTurnId
|
|
7510
|
+
}).catch(() => {});
|
|
7511
|
+
} else if (wordCount >= bargeInMinPartialWords) {
|
|
7512
|
+
backchannelSuppressedAt = null;
|
|
7400
7513
|
appendTurnLatencyStage({
|
|
7401
7514
|
metadata: {
|
|
7402
7515
|
partial: triggeringText.slice(0, 200),
|
|
@@ -7458,6 +7571,11 @@ var createVoiceSession = (options) => {
|
|
|
7458
7571
|
};
|
|
7459
7572
|
const handleFinal = async (transcript) => {
|
|
7460
7573
|
sttReconnectCount = 0;
|
|
7574
|
+
if (backchannelBargeInGuard && backchannelSuppressedAt !== null && Date.now() - backchannelSuppressedAt < BACKCHANNEL_DROP_WINDOW_MS && isBackchannelUtterance(transcript.text.trim())) {
|
|
7575
|
+
backchannelSuppressedAt = null;
|
|
7576
|
+
return;
|
|
7577
|
+
}
|
|
7578
|
+
backchannelSuppressedAt = null;
|
|
7461
7579
|
const session = await writeSession((session2) => {
|
|
7462
7580
|
const alreadyPresent = session2.currentTurn.transcripts.some((existing) => existing.id === transcript.id);
|
|
7463
7581
|
if (!alreadyPresent) {
|
|
@@ -7920,18 +8038,26 @@ var createVoiceSession = (options) => {
|
|
|
7920
8038
|
const onTurnStartedAt = Date.now();
|
|
7921
8039
|
logVoiceTiming(session.id, "session.commit-to-onturn", onTurnStartedAt - (turn.committedAt || onTurnStartedAt), { fillerScheduled: fillerTimer !== null });
|
|
7922
8040
|
const pendingSpeculation = speculation;
|
|
7923
|
-
|
|
8041
|
+
speculation = null;
|
|
8042
|
+
speculationAttempted = false;
|
|
7924
8043
|
let reusableSpeculation;
|
|
7925
8044
|
if (pendingSpeculation && pendingSpeculation.pendingText === turn.text) {
|
|
7926
|
-
const speculated = await
|
|
8045
|
+
const speculated = await Promise.race([
|
|
8046
|
+
pendingSpeculation.promise,
|
|
8047
|
+
new Promise((resolve2) => {
|
|
8048
|
+
setTimeout(() => resolve2(null), SPECULATION_ADOPT_TIMEOUT_MS);
|
|
8049
|
+
})
|
|
8050
|
+
]);
|
|
7927
8051
|
if (speculated?.text) {
|
|
7928
8052
|
reusableSpeculation = { text: speculated.text };
|
|
7929
8053
|
logVoiceTiming(session.id, "p3.adopted-speculation", 0, {
|
|
7930
8054
|
chars: speculated.text.length
|
|
7931
8055
|
});
|
|
8056
|
+
} else {
|
|
8057
|
+
pendingSpeculation.controller.abort();
|
|
7932
8058
|
}
|
|
7933
|
-
} else {
|
|
7934
|
-
pendingSpeculation
|
|
8059
|
+
} else if (pendingSpeculation) {
|
|
8060
|
+
pendingSpeculation.controller.abort();
|
|
7935
8061
|
}
|
|
7936
8062
|
try {
|
|
7937
8063
|
const onTurnPromise = options.route.onTurn({
|