@absolutejs/voice 0.0.22-beta.610 → 0.0.22-beta.611

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -0,0 +1,41 @@
1
+ import type { AudioFormat } from "./types";
2
/**
 * Input handed to `VoiceBargeInDetector.evaluate` for one partial transcript.
 * The three text fields are always present; both audio fields are optional.
 */
+ export type VoiceBargeInInput = {
3
+ /** The partial transcript that arrived while the assistant was speaking. The session passes it already trimmed. */
4
+ partialText: string;
5
+ /** Word count of `partialText`; the session counts whitespace-separated tokens. */
6
+ wordCount: number;
7
+ /** Whether the text matches a known backchannel cue (isBackchannelUtterance). As computed by the session, this can only be true when `backchannelBargeInGuard` is enabled. */
8
+ isBackchannelByText: boolean;
9
+ /** The user's buffered PCM for this window (oldest→newest), if any. The field after this one, `turnAudioFormat`, describes that audio; the built-in acoustic detector reads `channels` and `sampleRateHz` from it and decides on text alone when either audio field is missing. */
10
+ turnAudio?: ReadonlyArray<Uint8Array>;
11
+ turnAudioFormat?: AudioFormat;
12
+ };
13
/** Decision returned by a `VoiceBargeInDetector` for one partial transcript. */
+ export type VoiceBargeInVerdict = {
14
+ /** true = real interruption → cancel the assistant's TTS. false = keep talking. */
15
+ shouldCancel: boolean;
16
+ /** Diagnostic label, surfaced on the barge_in / barge_in_suppressed trace. When omitted, the session records "stt_partial" for a cancel and "backchannel" for a suppression. */
17
+ reason?: string;
18
+ };
19
/**
 * Pluggable classifier that decides whether a partial transcript is a real
 * interruption (cancel the assistant's TTS) or something to talk through.
 * The session consults it only for partials that reach `bargeInMinPartialWords`;
 * shorter partials are suppressed without calling the detector.
 */
+ export type VoiceBargeInDetector = {
20
/** Classifies one partial transcript. May answer synchronously or with a promise; the session awaits the result either way. */
+ evaluate: (input: VoiceBargeInInput) => Promise<VoiceBargeInVerdict> | VoiceBargeInVerdict;
21
+ };
22
/** Threshold overrides for `createAcousticBargeInDetector`; every field is optional. */
+ export type CreateAcousticBargeInDetectorOptions = {
23
+ /** Speech sustained this long (ms) or longer is a real interruption regardless of text/energy. Defaults to 700. */
24
+ sustainedMs?: number;
25
+ /** RMS (0-1) at/above this is an emphatic onset ("Wait!") — cancel even if short. Defaults to 0.16. */
26
+ emphaticRms?: number;
27
+ /** At or below this RMS (0-1) a short burst is incidental noise — keep talking. Defaults to 0.035. */
28
+ noiseFloorRms?: number;
29
+ };
30
+ /**
31
+ * A model-free acoustic backchannel-vs-barge-in classifier. Combines the
32
+ * duration and RMS energy of the user's buffered audio with the text
+ * backchannel signal. Rules are checked top to bottom; the first match wins:
33
+ * - sustained speech → real interruption (cancel)
34
+ * - known cue word, stayed short → backchannel (keep talking)
35
+ * - short but loud/sharp onset → emphatic interruption like "Wait!" (cancel)
36
+ * - short + quiet → incidental noise (keep talking)
37
+ * - short + moderate, real words → ambiguous, default to cancel (don't strand
38
+ * a genuine short interruption)
+ * When `turnAudio` or `turnAudioFormat` is missing it decides on text alone:
+ * a backchannel cue keeps talking, anything else cancels.
39
+ * Runs in-process on raw arithmetic — no model, no sidecar.
+ *
+ * @param options Optional threshold overrides.
+ * @returns A detector whose `evaluate` answers synchronously.
40
+ */
41
+ export declare const createAcousticBargeInDetector: (options?: CreateAcousticBargeInDetectorOptions) => VoiceBargeInDetector;
@@ -802,6 +802,7 @@ export type VoicePluginConfig<TContext = unknown, TSession extends VoiceSessionR
802
802
  reconnect?: VoiceReconnectConfig;
803
803
  turnDetection?: VoiceTurnDetectionConfig;
804
804
  semanticTurnDetector?: import("./semanticTurn").VoiceSemanticTurnDetector;
805
+ bargeInDetector?: import("./bargeInDetector").VoiceBargeInDetector;
805
806
  bargeInMinPartialWords?: number;
806
807
  /**
807
808
  * When true, a pure listening cue ("mm-hm", "yeah", "right", "got it") spoken
@@ -951,6 +952,7 @@ export type CreateVoiceSessionOptions<TContext = unknown, TSession extends Voice
951
952
  };
952
953
  redact?: import("./redaction").VoiceTranscriptRedactor;
953
954
  semanticTurnDetector?: import("./semanticTurn").VoiceSemanticTurnDetector;
955
+ bargeInDetector?: import("./bargeInDetector").VoiceBargeInDetector;
954
956
  /**
955
957
  * Pre-rendered filler phrases the runtime plays in the gap between
956
958
  * user-turn-commit and real assistant audio (typically 800-1500ms). The
package/dist/index.d.ts CHANGED
@@ -92,6 +92,8 @@ export type { CreateVoiceCostAccountantOptions, VoiceCostAccountant, VoiceCostBr
92
92
  export { describeVoiceAssistantMode, resolveVoiceAssistantMode, } from "./core/assistantMode";
93
93
  export type { VoiceAssistantMode, VoiceAssistantModality, VoiceAssistantModeDescriptor, VoiceSemanticVADConfig, } from "./core/assistantMode";
94
94
  export { createPunctuationSemanticTurnDetector, createRegexSemanticTurnDetector, } from "./core/semanticTurn";
95
+ export { createAcousticBargeInDetector } from "./core/bargeInDetector";
96
+ export type { CreateAcousticBargeInDetectorOptions, VoiceBargeInDetector, VoiceBargeInInput, VoiceBargeInVerdict, } from "./core/bargeInDetector";
95
97
  export { VOICE_WEBHOOK_SIGNATURE_HEADER, VOICE_WEBHOOK_TIMESTAMP_HEADER, extractVoiceWebhookSignatureFromHeaders, signVoiceWebhookBody, verifyVoiceWebhookSignature, } from "./core/webhookVerification";
96
98
  export { describeVoiceAgentUIState, deriveVoiceAgentUIState, voiceAgentUIStateOrder, } from "./core/agentState";
97
99
  export type { VoiceAgentUIInput, VoiceAgentUIState } from "./core/agentState";
package/dist/index.js CHANGED
@@ -4245,7 +4245,7 @@ var createVoiceSession = (options) => {
4245
4245
  };
4246
4246
  const turnAudioInputFormat = recordingConfig?.userInputFormat ?? options.realtimeInputFormat ?? DEFAULT_REALTIME_FORMAT;
4247
4247
  const getTurnAudioForDetector = () => {
4248
- if (!options.semanticTurnDetector || currentTurnAudio.length === 0) {
4248
+ if (!options.semanticTurnDetector && !options.bargeInDetector || currentTurnAudio.length === 0) {
4249
4249
  return { turnAudio: undefined, turnAudioFormat: undefined };
4250
4250
  }
4251
4251
  const turnAudio = currentTurnAudio.map((audio) => {
@@ -5270,30 +5270,7 @@ var createVoiceSession = (options) => {
5270
5270
  const triggeringText = transcript.text.trim();
5271
5271
  if (triggeringText) {
5272
5272
  const wordCount = triggeringText.split(/\s+/).length;
5273
- if (wordCount >= bargeInMinPartialWords && backchannelBargeInGuard && isBackchannelUtterance(triggeringText)) {
5274
- backchannelSuppressedAt = Date.now();
5275
- appendTurnLatencyStage({
5276
- metadata: {
5277
- partial: triggeringText.slice(0, 200),
5278
- reason: "backchannel",
5279
- wordCount
5280
- },
5281
- stage: "barge_in_suppressed",
5282
- turnId: activeTTSTurnId
5283
- }).catch(() => {});
5284
- } else if (wordCount >= bargeInMinPartialWords) {
5285
- backchannelSuppressedAt = null;
5286
- appendTurnLatencyStage({
5287
- metadata: {
5288
- partial: triggeringText.slice(0, 200),
5289
- source: "stt_partial",
5290
- wordCount
5291
- },
5292
- stage: "barge_in",
5293
- turnId: activeTTSTurnId
5294
- }).catch(() => {});
5295
- cancelActiveTTS("barge-in");
5296
- } else {
5273
+ if (wordCount < bargeInMinPartialWords) {
5297
5274
  appendTurnLatencyStage({
5298
5275
  metadata: {
5299
5276
  partial: triggeringText.slice(0, 200),
@@ -5303,6 +5280,39 @@ var createVoiceSession = (options) => {
5303
5280
  stage: "barge_in_suppressed",
5304
5281
  turnId: activeTTSTurnId
5305
5282
  }).catch(() => {});
5283
+ } else {
5284
+ const isBackchannelByText = backchannelBargeInGuard && isBackchannelUtterance(triggeringText);
5285
+ const verdict = options.bargeInDetector ? await Promise.resolve(options.bargeInDetector.evaluate({
5286
+ isBackchannelByText,
5287
+ partialText: triggeringText,
5288
+ wordCount,
5289
+ ...getTurnAudioForDetector()
5290
+ })) : { reason: undefined, shouldCancel: !isBackchannelByText };
5291
+ const reason = verdict.reason ?? (verdict.shouldCancel ? "stt_partial" : "backchannel");
5292
+ if (verdict.shouldCancel) {
5293
+ backchannelSuppressedAt = null;
5294
+ appendTurnLatencyStage({
5295
+ metadata: {
5296
+ partial: triggeringText.slice(0, 200),
5297
+ source: reason,
5298
+ wordCount
5299
+ },
5300
+ stage: "barge_in",
5301
+ turnId: activeTTSTurnId
5302
+ }).catch(() => {});
5303
+ cancelActiveTTS("barge-in");
5304
+ } else {
5305
+ backchannelSuppressedAt = Date.now();
5306
+ appendTurnLatencyStage({
5307
+ metadata: {
5308
+ partial: triggeringText.slice(0, 200),
5309
+ reason,
5310
+ wordCount
5311
+ },
5312
+ stage: "barge_in_suppressed",
5313
+ turnId: activeTTSTurnId
5314
+ }).catch(() => {});
5315
+ }
5306
5316
  }
5307
5317
  }
5308
5318
  }
@@ -39707,6 +39717,7 @@ var voice = (config) => {
39707
39717
  sttFallback: sessionOptions.sttFallback,
39708
39718
  sttLifecycle: sessionOptions.sttLifecycle,
39709
39719
  ...config.semanticTurnDetector ? { semanticTurnDetector: config.semanticTurnDetector } : {},
39720
+ ...config.bargeInDetector ? { bargeInDetector: config.bargeInDetector } : {},
39710
39721
  ...config.bargeInMinPartialWords !== undefined ? { bargeInMinPartialWords: config.bargeInMinPartialWords } : {},
39711
39722
  ...config.backchannelBargeInGuard !== undefined ? { backchannelBargeInGuard: config.backchannelBargeInGuard } : {},
39712
39723
  ...config.fillerPhrases ? { fillerPhrases: config.fillerPhrases } : {},
@@ -41989,6 +42000,56 @@ var createRegexSemanticTurnDetector = (options) => {
41989
42000
  }
41990
42001
  };
41991
42002
  };
42003
+ // src/core/bargeInDetector.ts
42004
+ var measureTurnAudio = (chunks, format) => {
42005
+ const channels = format.channels ?? 1;
42006
+ const sampleRate = format.sampleRateHz ?? 16000;
42007
+ let sumSquares = 0;
42008
+ let sampleCount = 0;
42009
+ for (const chunk of chunks) {
42010
+ const usableBytes = chunk.byteLength - chunk.byteLength % 2;
42011
+ const view = new DataView(chunk.buffer, chunk.byteOffset, usableBytes);
42012
+ for (let offset = 0;offset < usableBytes; offset += 2) {
42013
+ const sample = view.getInt16(offset, true) / 32768;
42014
+ sumSquares += sample * sample;
42015
+ sampleCount += 1;
42016
+ }
42017
+ }
42018
+ if (sampleCount === 0) {
42019
+ return { durationMs: 0, rms: 0 };
42020
+ }
42021
+ return {
42022
+ durationMs: sampleCount / channels / sampleRate * 1000,
42023
+ rms: Math.sqrt(sumSquares / sampleCount)
42024
+ };
42025
+ };
42026
+ var createAcousticBargeInDetector = (options = {}) => {
42027
+ const sustainedMs = options.sustainedMs ?? 700;
42028
+ const emphaticRms = options.emphaticRms ?? 0.16;
42029
+ const noiseFloorRms = options.noiseFloorRms ?? 0.035;
42030
+ return {
42031
+ evaluate: (input) => {
42032
+ const { turnAudio, turnAudioFormat } = input;
42033
+ if (!turnAudio || turnAudio.length === 0 || !turnAudioFormat) {
42034
+ return input.isBackchannelByText ? { reason: "text_backchannel", shouldCancel: false } : { reason: "text_only", shouldCancel: true };
42035
+ }
42036
+ const { durationMs, rms } = measureTurnAudio(turnAudio, turnAudioFormat);
42037
+ if (durationMs >= sustainedMs) {
42038
+ return { reason: "acoustic_sustained", shouldCancel: true };
42039
+ }
42040
+ if (input.isBackchannelByText) {
42041
+ return { reason: "acoustic_backchannel", shouldCancel: false };
42042
+ }
42043
+ if (rms >= emphaticRms) {
42044
+ return { reason: "acoustic_emphatic", shouldCancel: true };
42045
+ }
42046
+ if (rms <= noiseFloorRms) {
42047
+ return { reason: "acoustic_noise_floor", shouldCancel: false };
42048
+ }
42049
+ return { reason: "acoustic_ambiguous", shouldCancel: true };
42050
+ }
42051
+ };
42052
+ };
41992
42053
  // src/core/webhookVerification.ts
41993
42054
  var VOICE_WEBHOOK_SIGNATURE_HEADER = "x-absolutejs-signature";
41994
42055
  var VOICE_WEBHOOK_TIMESTAMP_HEADER = "x-absolutejs-timestamp";
@@ -53726,6 +53787,7 @@ export {
53726
53787
  createCoturnIceServers,
53727
53788
  createCachedTTS,
53728
53789
  createAnthropicVoiceAssistantModel,
53790
+ createAcousticBargeInDetector,
53729
53791
  createAIVoiceModel,
53730
53792
  conditionAudioChunk,
53731
53793
  computeVoiceScorecardCalibration,
@@ -6472,7 +6472,7 @@ var createVoiceSession = (options) => {
6472
6472
  };
6473
6473
  const turnAudioInputFormat = recordingConfig?.userInputFormat ?? options.realtimeInputFormat ?? DEFAULT_REALTIME_FORMAT;
6474
6474
  const getTurnAudioForDetector = () => {
6475
- if (!options.semanticTurnDetector || currentTurnAudio.length === 0) {
6475
+ if (!options.semanticTurnDetector && !options.bargeInDetector || currentTurnAudio.length === 0) {
6476
6476
  return { turnAudio: undefined, turnAudioFormat: undefined };
6477
6477
  }
6478
6478
  const turnAudio = currentTurnAudio.map((audio) => {
@@ -7497,30 +7497,7 @@ var createVoiceSession = (options) => {
7497
7497
  const triggeringText = transcript.text.trim();
7498
7498
  if (triggeringText) {
7499
7499
  const wordCount = triggeringText.split(/\s+/).length;
7500
- if (wordCount >= bargeInMinPartialWords && backchannelBargeInGuard && isBackchannelUtterance(triggeringText)) {
7501
- backchannelSuppressedAt = Date.now();
7502
- appendTurnLatencyStage({
7503
- metadata: {
7504
- partial: triggeringText.slice(0, 200),
7505
- reason: "backchannel",
7506
- wordCount
7507
- },
7508
- stage: "barge_in_suppressed",
7509
- turnId: activeTTSTurnId
7510
- }).catch(() => {});
7511
- } else if (wordCount >= bargeInMinPartialWords) {
7512
- backchannelSuppressedAt = null;
7513
- appendTurnLatencyStage({
7514
- metadata: {
7515
- partial: triggeringText.slice(0, 200),
7516
- source: "stt_partial",
7517
- wordCount
7518
- },
7519
- stage: "barge_in",
7520
- turnId: activeTTSTurnId
7521
- }).catch(() => {});
7522
- cancelActiveTTS("barge-in");
7523
- } else {
7500
+ if (wordCount < bargeInMinPartialWords) {
7524
7501
  appendTurnLatencyStage({
7525
7502
  metadata: {
7526
7503
  partial: triggeringText.slice(0, 200),
@@ -7530,6 +7507,39 @@ var createVoiceSession = (options) => {
7530
7507
  stage: "barge_in_suppressed",
7531
7508
  turnId: activeTTSTurnId
7532
7509
  }).catch(() => {});
7510
+ } else {
7511
+ const isBackchannelByText = backchannelBargeInGuard && isBackchannelUtterance(triggeringText);
7512
+ const verdict = options.bargeInDetector ? await Promise.resolve(options.bargeInDetector.evaluate({
7513
+ isBackchannelByText,
7514
+ partialText: triggeringText,
7515
+ wordCount,
7516
+ ...getTurnAudioForDetector()
7517
+ })) : { reason: undefined, shouldCancel: !isBackchannelByText };
7518
+ const reason = verdict.reason ?? (verdict.shouldCancel ? "stt_partial" : "backchannel");
7519
+ if (verdict.shouldCancel) {
7520
+ backchannelSuppressedAt = null;
7521
+ appendTurnLatencyStage({
7522
+ metadata: {
7523
+ partial: triggeringText.slice(0, 200),
7524
+ source: reason,
7525
+ wordCount
7526
+ },
7527
+ stage: "barge_in",
7528
+ turnId: activeTTSTurnId
7529
+ }).catch(() => {});
7530
+ cancelActiveTTS("barge-in");
7531
+ } else {
7532
+ backchannelSuppressedAt = Date.now();
7533
+ appendTurnLatencyStage({
7534
+ metadata: {
7535
+ partial: triggeringText.slice(0, 200),
7536
+ reason,
7537
+ wordCount
7538
+ },
7539
+ stage: "barge_in_suppressed",
7540
+ turnId: activeTTSTurnId
7541
+ }).catch(() => {});
7542
+ }
7533
7543
  }
7534
7544
  }
7535
7545
  }
package/package.json CHANGED
@@ -1,6 +1,6 @@
1
1
  {
2
2
  "name": "@absolutejs/voice",
3
- "version": "0.0.22-beta.610",
3
+ "version": "0.0.22-beta.611",
4
4
  "description": "Voice primitives and Elysia plugin for AbsoluteJS",
5
5
  "repository": {
6
6
  "type": "git",