@absolutejs/voice 0.0.22-beta.610 → 0.0.22-beta.611
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/dist/core/bargeInDetector.d.ts +41 -0
- package/dist/core/types.d.ts +2 -0
- package/dist/index.d.ts +2 -0
- package/dist/index.js +87 -25
- package/dist/testing/index.js +35 -25
- package/package.json +1 -1
|
@@ -0,0 +1,41 @@
|
|
|
1
|
+
import type { AudioFormat } from "./types";
|
|
2
|
+
/** Signals handed to a barge-in detector for one partial transcript heard during assistant speech. */
export type VoiceBargeInInput = {
|
|
3
|
+
/** The partial transcript that arrived while the assistant was speaking (the session passes it whitespace-trimmed). */
|
|
4
|
+
partialText: string;
|
|
5
|
+
/** Word count of `partialText` (the session counts whitespace-separated tokens). */
|
|
6
|
+
wordCount: number;
|
|
7
|
+
/** Whether the text matches a known backchannel cue (isBackchannelUtterance). The session only sets this when its backchannelBargeInGuard is enabled. */
|
|
8
|
+
isBackchannelByText: boolean;
|
|
9
|
+
/** The user's buffered PCM for this window (oldest→newest), if any. The acoustic detector decodes it as little-endian signed 16-bit samples. */
|
|
10
|
+
turnAudio?: ReadonlyArray<Uint8Array>;
|
|
11
|
+
/** Format of `turnAudio`; the acoustic detector reads `channels` and `sampleRateHz` from it. */
turnAudioFormat?: AudioFormat;
|
|
12
|
+
};
|
|
13
|
+
/** Decision returned by a barge-in detector for one partial transcript. */
export type VoiceBargeInVerdict = {
|
|
14
|
+
/** true = real interruption → cancel the assistant's TTS. false = keep talking. */
|
|
15
|
+
shouldCancel: boolean;
|
|
16
|
+
/** Diagnostic label, surfaced on the barge_in / barge_in_suppressed trace. When omitted, the session records "stt_partial" for a cancel and "backchannel" for a suppression. */
|
|
17
|
+
reason?: string;
|
|
18
|
+
};
|
|
19
|
+
/** Pluggable classifier that decides whether a partial transcript heard during assistant speech should cancel TTS. */
export type VoiceBargeInDetector = {
|
|
20
|
+
/** Returns the verdict either synchronously or as a promise; the session awaits the result before acting on it. */
evaluate: (input: VoiceBargeInInput) => Promise<VoiceBargeInVerdict> | VoiceBargeInVerdict;
|
|
21
|
+
};
|
|
22
|
+
/** Tuning thresholds for createAcousticBargeInDetector; every field is optional. */
export type CreateAcousticBargeInDetectorOptions = {
|
|
23
|
+
/** Buffered speech at least this long (ms) is a real interruption regardless of text/energy. Defaults to 700. */
|
|
24
|
+
sustainedMs?: number;
|
|
25
|
+
/** RMS (0-1) at/above this is an emphatic onset ("Wait!") — cancel even if short. Defaults to 0.16. */
|
|
26
|
+
emphaticRms?: number;
|
|
27
|
+
/** At or below this RMS (0-1) a short burst is incidental noise — keep talking. Defaults to 0.035. */
|
|
28
|
+
noiseFloorRms?: number;
|
|
29
|
+
};
|
|
30
|
+
/**
|
|
31
|
+
* A model-free acoustic backchannel-vs-barge-in classifier. Combines the user's
|
|
32
|
+
* speech duration + onset energy with the text backchannel signal. Rules are
* checked in this order and the first match wins:
|
|
33
|
+
* - no buffered audio or no audio format → decide from the text signal alone
* - sustained speech → real interruption (cancel)
|
|
34
|
+
* - known cue word, stayed short → backchannel (keep talking)
|
|
35
|
+
* - short but loud/sharp onset → emphatic interruption like "Wait!" (cancel)
|
|
36
|
+
* - short + quiet → incidental noise (keep talking)
|
|
37
|
+
* - short + moderate, real words → ambiguous, default to cancel (don't strand
|
|
38
|
+
* a genuine short interruption)
|
|
39
|
+
* Runs in-process on raw arithmetic — no model, no sidecar. The returned
* detector answers synchronously.
|
|
40
|
+
*/
|
|
41
|
+
export declare const createAcousticBargeInDetector: (options?: CreateAcousticBargeInDetectorOptions) => VoiceBargeInDetector;
|
package/dist/core/types.d.ts
CHANGED
|
@@ -802,6 +802,7 @@ export type VoicePluginConfig<TContext = unknown, TSession extends VoiceSessionR
|
|
|
802
802
|
reconnect?: VoiceReconnectConfig;
|
|
803
803
|
turnDetection?: VoiceTurnDetectionConfig;
|
|
804
804
|
semanticTurnDetector?: import("./semanticTurn").VoiceSemanticTurnDetector;
|
|
805
|
+
bargeInDetector?: import("./bargeInDetector").VoiceBargeInDetector;
|
|
805
806
|
bargeInMinPartialWords?: number;
|
|
806
807
|
/**
|
|
807
808
|
* When true, a pure listening cue ("mm-hm", "yeah", "right", "got it") spoken
|
|
@@ -951,6 +952,7 @@ export type CreateVoiceSessionOptions<TContext = unknown, TSession extends Voice
|
|
|
951
952
|
};
|
|
952
953
|
redact?: import("./redaction").VoiceTranscriptRedactor;
|
|
953
954
|
semanticTurnDetector?: import("./semanticTurn").VoiceSemanticTurnDetector;
|
|
955
|
+
bargeInDetector?: import("./bargeInDetector").VoiceBargeInDetector;
|
|
954
956
|
/**
|
|
955
957
|
* Pre-rendered filler phrases the runtime plays in the gap between
|
|
956
958
|
* user-turn-commit and real assistant audio (typically 800-1500ms). The
|
package/dist/index.d.ts
CHANGED
|
@@ -92,6 +92,8 @@ export type { CreateVoiceCostAccountantOptions, VoiceCostAccountant, VoiceCostBr
|
|
|
92
92
|
export { describeVoiceAssistantMode, resolveVoiceAssistantMode, } from "./core/assistantMode";
|
|
93
93
|
export type { VoiceAssistantMode, VoiceAssistantModality, VoiceAssistantModeDescriptor, VoiceSemanticVADConfig, } from "./core/assistantMode";
|
|
94
94
|
export { createPunctuationSemanticTurnDetector, createRegexSemanticTurnDetector, } from "./core/semanticTurn";
|
|
95
|
+
export { createAcousticBargeInDetector } from "./core/bargeInDetector";
|
|
96
|
+
export type { CreateAcousticBargeInDetectorOptions, VoiceBargeInDetector, VoiceBargeInInput, VoiceBargeInVerdict, } from "./core/bargeInDetector";
|
|
95
97
|
export { VOICE_WEBHOOK_SIGNATURE_HEADER, VOICE_WEBHOOK_TIMESTAMP_HEADER, extractVoiceWebhookSignatureFromHeaders, signVoiceWebhookBody, verifyVoiceWebhookSignature, } from "./core/webhookVerification";
|
|
96
98
|
export { describeVoiceAgentUIState, deriveVoiceAgentUIState, voiceAgentUIStateOrder, } from "./core/agentState";
|
|
97
99
|
export type { VoiceAgentUIInput, VoiceAgentUIState } from "./core/agentState";
|
package/dist/index.js
CHANGED
|
@@ -4245,7 +4245,7 @@ var createVoiceSession = (options) => {
|
|
|
4245
4245
|
};
|
|
4246
4246
|
const turnAudioInputFormat = recordingConfig?.userInputFormat ?? options.realtimeInputFormat ?? DEFAULT_REALTIME_FORMAT;
|
|
4247
4247
|
const getTurnAudioForDetector = () => {
|
|
4248
|
-
if (!options.semanticTurnDetector || currentTurnAudio.length === 0) {
|
|
4248
|
+
if (!options.semanticTurnDetector && !options.bargeInDetector || currentTurnAudio.length === 0) {
|
|
4249
4249
|
return { turnAudio: undefined, turnAudioFormat: undefined };
|
|
4250
4250
|
}
|
|
4251
4251
|
const turnAudio = currentTurnAudio.map((audio) => {
|
|
@@ -5270,30 +5270,7 @@ var createVoiceSession = (options) => {
|
|
|
5270
5270
|
const triggeringText = transcript.text.trim();
|
|
5271
5271
|
if (triggeringText) {
|
|
5272
5272
|
const wordCount = triggeringText.split(/\s+/).length;
|
|
5273
|
-
if (wordCount
|
|
5274
|
-
backchannelSuppressedAt = Date.now();
|
|
5275
|
-
appendTurnLatencyStage({
|
|
5276
|
-
metadata: {
|
|
5277
|
-
partial: triggeringText.slice(0, 200),
|
|
5278
|
-
reason: "backchannel",
|
|
5279
|
-
wordCount
|
|
5280
|
-
},
|
|
5281
|
-
stage: "barge_in_suppressed",
|
|
5282
|
-
turnId: activeTTSTurnId
|
|
5283
|
-
}).catch(() => {});
|
|
5284
|
-
} else if (wordCount >= bargeInMinPartialWords) {
|
|
5285
|
-
backchannelSuppressedAt = null;
|
|
5286
|
-
appendTurnLatencyStage({
|
|
5287
|
-
metadata: {
|
|
5288
|
-
partial: triggeringText.slice(0, 200),
|
|
5289
|
-
source: "stt_partial",
|
|
5290
|
-
wordCount
|
|
5291
|
-
},
|
|
5292
|
-
stage: "barge_in",
|
|
5293
|
-
turnId: activeTTSTurnId
|
|
5294
|
-
}).catch(() => {});
|
|
5295
|
-
cancelActiveTTS("barge-in");
|
|
5296
|
-
} else {
|
|
5273
|
+
if (wordCount < bargeInMinPartialWords) {
|
|
5297
5274
|
appendTurnLatencyStage({
|
|
5298
5275
|
metadata: {
|
|
5299
5276
|
partial: triggeringText.slice(0, 200),
|
|
@@ -5303,6 +5280,39 @@ var createVoiceSession = (options) => {
|
|
|
5303
5280
|
stage: "barge_in_suppressed",
|
|
5304
5281
|
turnId: activeTTSTurnId
|
|
5305
5282
|
}).catch(() => {});
|
|
5283
|
+
} else {
|
|
5284
|
+
const isBackchannelByText = backchannelBargeInGuard && isBackchannelUtterance(triggeringText);
|
|
5285
|
+
const verdict = options.bargeInDetector ? await Promise.resolve(options.bargeInDetector.evaluate({
|
|
5286
|
+
isBackchannelByText,
|
|
5287
|
+
partialText: triggeringText,
|
|
5288
|
+
wordCount,
|
|
5289
|
+
...getTurnAudioForDetector()
|
|
5290
|
+
})) : { reason: undefined, shouldCancel: !isBackchannelByText };
|
|
5291
|
+
const reason = verdict.reason ?? (verdict.shouldCancel ? "stt_partial" : "backchannel");
|
|
5292
|
+
if (verdict.shouldCancel) {
|
|
5293
|
+
backchannelSuppressedAt = null;
|
|
5294
|
+
appendTurnLatencyStage({
|
|
5295
|
+
metadata: {
|
|
5296
|
+
partial: triggeringText.slice(0, 200),
|
|
5297
|
+
source: reason,
|
|
5298
|
+
wordCount
|
|
5299
|
+
},
|
|
5300
|
+
stage: "barge_in",
|
|
5301
|
+
turnId: activeTTSTurnId
|
|
5302
|
+
}).catch(() => {});
|
|
5303
|
+
cancelActiveTTS("barge-in");
|
|
5304
|
+
} else {
|
|
5305
|
+
backchannelSuppressedAt = Date.now();
|
|
5306
|
+
appendTurnLatencyStage({
|
|
5307
|
+
metadata: {
|
|
5308
|
+
partial: triggeringText.slice(0, 200),
|
|
5309
|
+
reason,
|
|
5310
|
+
wordCount
|
|
5311
|
+
},
|
|
5312
|
+
stage: "barge_in_suppressed",
|
|
5313
|
+
turnId: activeTTSTurnId
|
|
5314
|
+
}).catch(() => {});
|
|
5315
|
+
}
|
|
5306
5316
|
}
|
|
5307
5317
|
}
|
|
5308
5318
|
}
|
|
@@ -39707,6 +39717,7 @@ var voice = (config) => {
|
|
|
39707
39717
|
sttFallback: sessionOptions.sttFallback,
|
|
39708
39718
|
sttLifecycle: sessionOptions.sttLifecycle,
|
|
39709
39719
|
...config.semanticTurnDetector ? { semanticTurnDetector: config.semanticTurnDetector } : {},
|
|
39720
|
+
...config.bargeInDetector ? { bargeInDetector: config.bargeInDetector } : {},
|
|
39710
39721
|
...config.bargeInMinPartialWords !== undefined ? { bargeInMinPartialWords: config.bargeInMinPartialWords } : {},
|
|
39711
39722
|
...config.backchannelBargeInGuard !== undefined ? { backchannelBargeInGuard: config.backchannelBargeInGuard } : {},
|
|
39712
39723
|
...config.fillerPhrases ? { fillerPhrases: config.fillerPhrases } : {},
|
|
@@ -41989,6 +42000,56 @@ var createRegexSemanticTurnDetector = (options) => {
|
|
|
41989
42000
|
}
|
|
41990
42001
|
};
|
|
41991
42002
|
};
|
|
42003
|
+
// src/core/bargeInDetector.ts
|
|
42004
|
+
var measureTurnAudio = (chunks, format) => {
|
|
42005
|
+
const channels = format.channels ?? 1;
|
|
42006
|
+
const sampleRate = format.sampleRateHz ?? 16000;
|
|
42007
|
+
let sumSquares = 0;
|
|
42008
|
+
let sampleCount = 0;
|
|
42009
|
+
for (const chunk of chunks) {
|
|
42010
|
+
const usableBytes = chunk.byteLength - chunk.byteLength % 2;
|
|
42011
|
+
const view = new DataView(chunk.buffer, chunk.byteOffset, usableBytes);
|
|
42012
|
+
for (let offset = 0;offset < usableBytes; offset += 2) {
|
|
42013
|
+
const sample = view.getInt16(offset, true) / 32768;
|
|
42014
|
+
sumSquares += sample * sample;
|
|
42015
|
+
sampleCount += 1;
|
|
42016
|
+
}
|
|
42017
|
+
}
|
|
42018
|
+
if (sampleCount === 0) {
|
|
42019
|
+
return { durationMs: 0, rms: 0 };
|
|
42020
|
+
}
|
|
42021
|
+
return {
|
|
42022
|
+
durationMs: sampleCount / channels / sampleRate * 1000,
|
|
42023
|
+
rms: Math.sqrt(sumSquares / sampleCount)
|
|
42024
|
+
};
|
|
42025
|
+
};
|
|
42026
|
+
var createAcousticBargeInDetector = (options = {}) => {
|
|
42027
|
+
const sustainedMs = options.sustainedMs ?? 700;
|
|
42028
|
+
const emphaticRms = options.emphaticRms ?? 0.16;
|
|
42029
|
+
const noiseFloorRms = options.noiseFloorRms ?? 0.035;
|
|
42030
|
+
return {
|
|
42031
|
+
evaluate: (input) => {
|
|
42032
|
+
const { turnAudio, turnAudioFormat } = input;
|
|
42033
|
+
if (!turnAudio || turnAudio.length === 0 || !turnAudioFormat) {
|
|
42034
|
+
return input.isBackchannelByText ? { reason: "text_backchannel", shouldCancel: false } : { reason: "text_only", shouldCancel: true };
|
|
42035
|
+
}
|
|
42036
|
+
const { durationMs, rms } = measureTurnAudio(turnAudio, turnAudioFormat);
|
|
42037
|
+
if (durationMs >= sustainedMs) {
|
|
42038
|
+
return { reason: "acoustic_sustained", shouldCancel: true };
|
|
42039
|
+
}
|
|
42040
|
+
if (input.isBackchannelByText) {
|
|
42041
|
+
return { reason: "acoustic_backchannel", shouldCancel: false };
|
|
42042
|
+
}
|
|
42043
|
+
if (rms >= emphaticRms) {
|
|
42044
|
+
return { reason: "acoustic_emphatic", shouldCancel: true };
|
|
42045
|
+
}
|
|
42046
|
+
if (rms <= noiseFloorRms) {
|
|
42047
|
+
return { reason: "acoustic_noise_floor", shouldCancel: false };
|
|
42048
|
+
}
|
|
42049
|
+
return { reason: "acoustic_ambiguous", shouldCancel: true };
|
|
42050
|
+
}
|
|
42051
|
+
};
|
|
42052
|
+
};
|
|
41992
42053
|
// src/core/webhookVerification.ts
|
|
41993
42054
|
var VOICE_WEBHOOK_SIGNATURE_HEADER = "x-absolutejs-signature";
|
|
41994
42055
|
var VOICE_WEBHOOK_TIMESTAMP_HEADER = "x-absolutejs-timestamp";
|
|
@@ -53726,6 +53787,7 @@ export {
|
|
|
53726
53787
|
createCoturnIceServers,
|
|
53727
53788
|
createCachedTTS,
|
|
53728
53789
|
createAnthropicVoiceAssistantModel,
|
|
53790
|
+
createAcousticBargeInDetector,
|
|
53729
53791
|
createAIVoiceModel,
|
|
53730
53792
|
conditionAudioChunk,
|
|
53731
53793
|
computeVoiceScorecardCalibration,
|
package/dist/testing/index.js
CHANGED
|
@@ -6472,7 +6472,7 @@ var createVoiceSession = (options) => {
|
|
|
6472
6472
|
};
|
|
6473
6473
|
const turnAudioInputFormat = recordingConfig?.userInputFormat ?? options.realtimeInputFormat ?? DEFAULT_REALTIME_FORMAT;
|
|
6474
6474
|
const getTurnAudioForDetector = () => {
|
|
6475
|
-
if (!options.semanticTurnDetector || currentTurnAudio.length === 0) {
|
|
6475
|
+
if (!options.semanticTurnDetector && !options.bargeInDetector || currentTurnAudio.length === 0) {
|
|
6476
6476
|
return { turnAudio: undefined, turnAudioFormat: undefined };
|
|
6477
6477
|
}
|
|
6478
6478
|
const turnAudio = currentTurnAudio.map((audio) => {
|
|
@@ -7497,30 +7497,7 @@ var createVoiceSession = (options) => {
|
|
|
7497
7497
|
const triggeringText = transcript.text.trim();
|
|
7498
7498
|
if (triggeringText) {
|
|
7499
7499
|
const wordCount = triggeringText.split(/\s+/).length;
|
|
7500
|
-
if (wordCount
|
|
7501
|
-
backchannelSuppressedAt = Date.now();
|
|
7502
|
-
appendTurnLatencyStage({
|
|
7503
|
-
metadata: {
|
|
7504
|
-
partial: triggeringText.slice(0, 200),
|
|
7505
|
-
reason: "backchannel",
|
|
7506
|
-
wordCount
|
|
7507
|
-
},
|
|
7508
|
-
stage: "barge_in_suppressed",
|
|
7509
|
-
turnId: activeTTSTurnId
|
|
7510
|
-
}).catch(() => {});
|
|
7511
|
-
} else if (wordCount >= bargeInMinPartialWords) {
|
|
7512
|
-
backchannelSuppressedAt = null;
|
|
7513
|
-
appendTurnLatencyStage({
|
|
7514
|
-
metadata: {
|
|
7515
|
-
partial: triggeringText.slice(0, 200),
|
|
7516
|
-
source: "stt_partial",
|
|
7517
|
-
wordCount
|
|
7518
|
-
},
|
|
7519
|
-
stage: "barge_in",
|
|
7520
|
-
turnId: activeTTSTurnId
|
|
7521
|
-
}).catch(() => {});
|
|
7522
|
-
cancelActiveTTS("barge-in");
|
|
7523
|
-
} else {
|
|
7500
|
+
if (wordCount < bargeInMinPartialWords) {
|
|
7524
7501
|
appendTurnLatencyStage({
|
|
7525
7502
|
metadata: {
|
|
7526
7503
|
partial: triggeringText.slice(0, 200),
|
|
@@ -7530,6 +7507,39 @@ var createVoiceSession = (options) => {
|
|
|
7530
7507
|
stage: "barge_in_suppressed",
|
|
7531
7508
|
turnId: activeTTSTurnId
|
|
7532
7509
|
}).catch(() => {});
|
|
7510
|
+
} else {
|
|
7511
|
+
const isBackchannelByText = backchannelBargeInGuard && isBackchannelUtterance(triggeringText);
|
|
7512
|
+
const verdict = options.bargeInDetector ? await Promise.resolve(options.bargeInDetector.evaluate({
|
|
7513
|
+
isBackchannelByText,
|
|
7514
|
+
partialText: triggeringText,
|
|
7515
|
+
wordCount,
|
|
7516
|
+
...getTurnAudioForDetector()
|
|
7517
|
+
})) : { reason: undefined, shouldCancel: !isBackchannelByText };
|
|
7518
|
+
const reason = verdict.reason ?? (verdict.shouldCancel ? "stt_partial" : "backchannel");
|
|
7519
|
+
if (verdict.shouldCancel) {
|
|
7520
|
+
backchannelSuppressedAt = null;
|
|
7521
|
+
appendTurnLatencyStage({
|
|
7522
|
+
metadata: {
|
|
7523
|
+
partial: triggeringText.slice(0, 200),
|
|
7524
|
+
source: reason,
|
|
7525
|
+
wordCount
|
|
7526
|
+
},
|
|
7527
|
+
stage: "barge_in",
|
|
7528
|
+
turnId: activeTTSTurnId
|
|
7529
|
+
}).catch(() => {});
|
|
7530
|
+
cancelActiveTTS("barge-in");
|
|
7531
|
+
} else {
|
|
7532
|
+
backchannelSuppressedAt = Date.now();
|
|
7533
|
+
appendTurnLatencyStage({
|
|
7534
|
+
metadata: {
|
|
7535
|
+
partial: triggeringText.slice(0, 200),
|
|
7536
|
+
reason,
|
|
7537
|
+
wordCount
|
|
7538
|
+
},
|
|
7539
|
+
stage: "barge_in_suppressed",
|
|
7540
|
+
turnId: activeTTSTurnId
|
|
7541
|
+
}).catch(() => {});
|
|
7542
|
+
}
|
|
7533
7543
|
}
|
|
7534
7544
|
}
|
|
7535
7545
|
}
|