@absolutejs/voice 0.0.22-beta.578 → 0.0.22-beta.579

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -10,6 +10,12 @@ export type VoiceBackchannelDriverOptions = {
10
10
  minSpeechMs?: number;
11
11
  onCue: (cue: VoiceBackchannelCue) => Promise<void> | void;
12
12
  };
13
+ export type VoiceBackchannelConfig = {
14
+ enabled?: boolean;
15
+ cues?: ReadonlyArray<string>;
16
+ minSpeechMs?: number;
17
+ cueIntervalMs?: number;
18
+ };
13
19
  export type VoiceBackchannelDriver = {
14
20
  noteSpeech: (timestampMs?: number) => void;
15
21
  noteSilence: (timestampMs?: number) => void;
@@ -783,6 +783,7 @@ export type VoicePluginConfig<TContext = unknown, TSession extends VoiceSessionR
783
783
  userText: string;
784
784
  }) => Promise<string | null>;
785
785
  fillerForTimeoutMs?: number;
786
+ backchannel?: import("./backchannel").VoiceBackchannelConfig;
786
787
  defaultSilentTurnAck?: string;
787
788
  routeOnTurnTimeoutMs?: number;
788
789
  audioConditioning?: VoiceAudioConditioningConfig;
@@ -968,6 +969,14 @@ export type CreateVoiceSessionOptions<TContext = unknown, TSession extends Voice
968
969
  }) => Promise<string | null>;
969
970
  /** Ceiling for the `fillerFor` call before we fall back to a static phrase. Default 600ms. */
970
971
  fillerForTimeoutMs?: number;
972
+ /**
973
+ * Backchannel cues — short "mm-hm"/"right" acknowledgements played while the
974
+ * CALLER is mid-turn (a long answer) so they feel heard, the way a human
975
+ * listener interjects. Plays on the same non-turn TTS path as fillers, so it
976
+ * never registers as the assistant's turn or trips barge-in. Off unless
977
+ * `enabled` is set. Fires only while the assistant is silent.
978
+ */
979
+ backchannel?: import("./backchannel").VoiceBackchannelConfig;
971
980
  /**
972
981
  * Default spoken ack if the model returns ONLY tool calls (no text) and the
973
982
  * turn isn't ending. Without this, the caller hears total silence after
package/dist/index.d.ts CHANGED
@@ -98,7 +98,7 @@ export type { VoiceCampaignDisposition, VoiceCampaignDispositionRetryPolicy, Voi
98
98
  export { createVoiceBackchannelDriver } from "./core/backchannel";
99
99
  export { createVoiceOAuth2TokenSource } from "./core/oauth2TokenSource";
100
100
  export type { CreateVoiceOAuth2TokenSourceOptions, VoiceOAuth2TokenResponse, VoiceOAuth2TokenSource, } from "./core/oauth2TokenSource";
101
- export type { VoiceBackchannelCue, VoiceBackchannelDriver, VoiceBackchannelDriverOptions, } from "./core/backchannel";
101
+ export type { VoiceBackchannelConfig, VoiceBackchannelCue, VoiceBackchannelDriver, VoiceBackchannelDriverOptions, } from "./core/backchannel";
102
102
  export { createVoiceIVRSession, describeVoiceIVRPlan, evaluateVoiceIVRPlan, } from "./core/ivrPlan";
103
103
  export type { VoiceIVRBranch, VoiceIVRDecision, VoiceIVRInput, VoiceIVRMatch, VoiceIVRPlan, VoiceIVRSession, } from "./core/ivrPlan";
104
104
  export { VOICE_CALLER_MEMORY_KEY, buildVoiceCallerMemoryNamespace, createVoiceCallerMemoryNamespace, summarizeVoiceCallerTranscript, } from "./core/callerMemory";
package/dist/index.js CHANGED
@@ -3091,6 +3091,71 @@ var toVoiceSessionSummary = (session) => ({
3091
3091
  // src/core/session.ts
3092
3092
  import { Buffer as Buffer2 } from "buffer";
3093
3093
 
3094
+ // src/core/backchannel.ts
3095
+ var DEFAULT_CUES = [
3096
+ { text: "mm-hmm" },
3097
+ { text: "I see" },
3098
+ { text: "right" },
3099
+ { text: "go on" }
3100
+ ];
3101
+ var createVoiceBackchannelDriver = (options) => {
3102
+ const cues = options.cues ?? DEFAULT_CUES;
3103
+ const minSpeechMs = options.minSpeechMs ?? 2500;
3104
+ const cueIntervalMs = options.cueIntervalMs ?? 2500;
3105
+ const cueIndexFn = options.cueIndex ?? ((index) => index % Math.max(cues.length, 1));
3106
+ let speechStartedAt;
3107
+ let lastCueAt;
3108
+ let cueCount = 0;
3109
+ let firing = false;
3110
+ const tryFire = async (now) => {
3111
+ if (firing || cues.length === 0) {
3112
+ return;
3113
+ }
3114
+ if (speechStartedAt === undefined) {
3115
+ return;
3116
+ }
3117
+ const elapsed = now - speechStartedAt;
3118
+ if (elapsed < minSpeechMs) {
3119
+ return;
3120
+ }
3121
+ if (lastCueAt !== undefined && now - lastCueAt < cueIntervalMs) {
3122
+ return;
3123
+ }
3124
+ const cue = cues[cueIndexFn(cueCount)];
3125
+ if (!cue) {
3126
+ return;
3127
+ }
3128
+ firing = true;
3129
+ try {
3130
+ await options.onCue(cue);
3131
+ } finally {
3132
+ firing = false;
3133
+ lastCueAt = now;
3134
+ cueCount += 1;
3135
+ }
3136
+ };
3137
+ return {
3138
+ noteSilence: (timestampMs) => {
3139
+ const now = timestampMs ?? Date.now();
3140
+ if (lastCueAt !== undefined && now - lastCueAt > cueIntervalMs * 2) {
3141
+ speechStartedAt = undefined;
3142
+ }
3143
+ },
3144
+ noteSpeech: (timestampMs) => {
3145
+ const now = timestampMs ?? Date.now();
3146
+ if (speechStartedAt === undefined) {
3147
+ speechStartedAt = now;
3148
+ }
3149
+ tryFire(now);
3150
+ },
3151
+ reset: () => {
3152
+ speechStartedAt = undefined;
3153
+ lastCueAt = undefined;
3154
+ cueCount = 0;
3155
+ }
3156
+ };
3157
+ };
3158
+
3094
3159
  // src/core/handoff.ts
3095
3160
  var toHex3 = (bytes) => Array.from(bytes, (byte) => byte.toString(16).padStart(2, "0")).join("");
3096
3161
  var signHandoffBody = async (input) => {
@@ -5217,6 +5282,30 @@ var createVoiceSession = (options) => {
5217
5282
  });
5218
5283
  });
5219
5284
  };
5285
+ const emitBackchannelCue = (text) => {
5286
+ if (!text || !options.tts)
5287
+ return;
5288
+ if (activeTTSTurnId !== undefined || fillerActive)
5289
+ return;
5290
+ runSerial("backchannel.send", async () => {
5291
+ if (activeTTSTurnId !== undefined || fillerActive)
5292
+ return;
5293
+ const adapterSession = await ensureTTSSession();
5294
+ if (!adapterSession)
5295
+ return;
5296
+ try {
5297
+ await adapterSession.send(text);
5298
+ } catch {}
5299
+ });
5300
+ };
5301
+ const backchannelDriver = options.backchannel?.enabled && options.tts ? createVoiceBackchannelDriver({
5302
+ ...options.backchannel.cueIntervalMs !== undefined ? { cueIntervalMs: options.backchannel.cueIntervalMs } : {},
5303
+ ...options.backchannel.cues ? {
5304
+ cues: options.backchannel.cues.filter((cue) => typeof cue === "string" && cue.trim().length > 0).map((cue) => ({ text: cue }))
5305
+ } : {},
5306
+ ...options.backchannel.minSpeechMs !== undefined ? { minSpeechMs: options.backchannel.minSpeechMs } : {},
5307
+ onCue: (cue) => emitBackchannelCue(cue.text)
5308
+ }) : null;
5220
5309
  const createTurnTTSStreamer = (turn, session) => {
5221
5310
  let buffer = "";
5222
5311
  let full = "";
@@ -5708,6 +5797,7 @@ var createVoiceSession = (options) => {
5708
5797
  };
5709
5798
  const commitTurnInternal = async (reason = "manual") => {
5710
5799
  clearSilenceTimer();
5800
+ backchannelDriver?.reset();
5711
5801
  amdLastTurnCommitAt = Date.now();
5712
5802
  const session = await readSession();
5713
5803
  if (session.status === "completed" || session.status === "failed") {
@@ -6051,7 +6141,9 @@ var createVoiceSession = (options) => {
6051
6141
  speechDetected = true;
6052
6142
  clearSilenceTimer();
6053
6143
  kickCallSilenceWatchdog();
6144
+ backchannelDriver?.noteSpeech();
6054
6145
  } else if (speechDetected) {
6146
+ backchannelDriver?.noteSilence();
6055
6147
  const currentSession = await readSession();
6056
6148
  const hasTurnText = Boolean(buildTurnText(currentSession.currentTurn.transcripts, currentSession.currentTurn.partialText, {
6057
6149
  partialEndedAtMs: currentSession.currentTurn.partialEndedAt,
@@ -24811,6 +24903,7 @@ var createTwilioMediaStreamBridge = (socket, options) => {
24811
24903
  ...options.bargeInMinPartialWords !== undefined ? { bargeInMinPartialWords: options.bargeInMinPartialWords } : {},
24812
24904
  ...options.fillerFor ? { fillerFor: options.fillerFor } : {},
24813
24905
  ...options.fillerForTimeoutMs !== undefined ? { fillerForTimeoutMs: options.fillerForTimeoutMs } : {},
24906
+ ...options.backchannel ? { backchannel: options.backchannel } : {},
24814
24907
  ...options.defaultSilentTurnAck !== undefined ? { defaultSilentTurnAck: options.defaultSilentTurnAck } : {},
24815
24908
  ...options.routeOnTurnTimeoutMs !== undefined ? { routeOnTurnTimeoutMs: options.routeOnTurnTimeoutMs } : {},
24816
24909
  trace: options.trace,
@@ -39177,6 +39270,7 @@ var voice = (config) => {
39177
39270
  ...config.fillerDelayMs !== undefined ? { fillerDelayMs: config.fillerDelayMs } : {},
39178
39271
  ...config.fillerFor ? { fillerFor: config.fillerFor } : {},
39179
39272
  ...config.fillerForTimeoutMs !== undefined ? { fillerForTimeoutMs: config.fillerForTimeoutMs } : {},
39273
+ ...config.backchannel ? { backchannel: config.backchannel } : {},
39180
39274
  ...config.defaultSilentTurnAck !== undefined ? { defaultSilentTurnAck: config.defaultSilentTurnAck } : {},
39181
39275
  ...config.routeOnTurnTimeoutMs !== undefined ? { routeOnTurnTimeoutMs: config.routeOnTurnTimeoutMs } : {},
39182
39276
  tts: config.tts,
@@ -41569,70 +41663,6 @@ var summarizeVoiceCampaignDispositions = (record) => {
41569
41663
  totalRecipients: record.recipients.length
41570
41664
  };
41571
41665
  };
41572
- // src/core/backchannel.ts
41573
- var DEFAULT_CUES = [
41574
- { text: "mm-hmm" },
41575
- { text: "I see" },
41576
- { text: "right" },
41577
- { text: "go on" }
41578
- ];
41579
- var createVoiceBackchannelDriver = (options) => {
41580
- const cues = options.cues ?? DEFAULT_CUES;
41581
- const minSpeechMs = options.minSpeechMs ?? 2500;
41582
- const cueIntervalMs = options.cueIntervalMs ?? 2500;
41583
- const cueIndexFn = options.cueIndex ?? ((index) => index % Math.max(cues.length, 1));
41584
- let speechStartedAt;
41585
- let lastCueAt;
41586
- let cueCount = 0;
41587
- let firing = false;
41588
- const tryFire = async (now) => {
41589
- if (firing || cues.length === 0) {
41590
- return;
41591
- }
41592
- if (speechStartedAt === undefined) {
41593
- return;
41594
- }
41595
- const elapsed = now - speechStartedAt;
41596
- if (elapsed < minSpeechMs) {
41597
- return;
41598
- }
41599
- if (lastCueAt !== undefined && now - lastCueAt < cueIntervalMs) {
41600
- return;
41601
- }
41602
- const cue = cues[cueIndexFn(cueCount)];
41603
- if (!cue) {
41604
- return;
41605
- }
41606
- firing = true;
41607
- try {
41608
- await options.onCue(cue);
41609
- } finally {
41610
- firing = false;
41611
- lastCueAt = now;
41612
- cueCount += 1;
41613
- }
41614
- };
41615
- return {
41616
- noteSilence: (timestampMs) => {
41617
- const now = timestampMs ?? Date.now();
41618
- if (lastCueAt !== undefined && now - lastCueAt > cueIntervalMs * 2) {
41619
- speechStartedAt = undefined;
41620
- }
41621
- },
41622
- noteSpeech: (timestampMs) => {
41623
- const now = timestampMs ?? Date.now();
41624
- if (speechStartedAt === undefined) {
41625
- speechStartedAt = now;
41626
- }
41627
- tryFire(now);
41628
- },
41629
- reset: () => {
41630
- speechStartedAt = undefined;
41631
- lastCueAt = undefined;
41632
- cueCount = 0;
41633
- }
41634
- };
41635
- };
41636
41666
  // src/core/oauth2TokenSource.ts
41637
41667
  var createVoiceOAuth2TokenSource = (options) => {
41638
41668
  const fetchImpl = options.fetch ?? globalThis.fetch.bind(globalThis);
@@ -164,6 +164,9 @@ export type TwilioMediaStreamBridgeOptions<TContext = unknown, TSession extends
164
164
  }) => Promise<string | null>;
165
165
  /** Cap on the `fillerFor` race before falling back to a static phrase. Default 600ms. */
166
166
  fillerForTimeoutMs?: number;
167
+ /** Backchannel cues played while the caller is mid-turn so they feel heard.
168
+ * Non-turn TTS path (no barge-in interaction). Off unless `enabled`. */
169
+ backchannel?: import("../core/backchannel").VoiceBackchannelConfig;
167
170
  /**
168
171
  * Default spoken ack if the model returns ONLY tool calls (no text) and
169
172
  * the turn isn't ending. Without this, the caller hears silence and
@@ -5311,6 +5311,71 @@ var createVoiceMemoryStore = () => {
5311
5311
  // src/core/session.ts
5312
5312
  import { Buffer as Buffer2 } from "buffer";
5313
5313
 
5314
+ // src/core/backchannel.ts
5315
+ var DEFAULT_CUES = [
5316
+ { text: "mm-hmm" },
5317
+ { text: "I see" },
5318
+ { text: "right" },
5319
+ { text: "go on" }
5320
+ ];
5321
+ var createVoiceBackchannelDriver = (options) => {
5322
+ const cues = options.cues ?? DEFAULT_CUES;
5323
+ const minSpeechMs = options.minSpeechMs ?? 2500;
5324
+ const cueIntervalMs = options.cueIntervalMs ?? 2500;
5325
+ const cueIndexFn = options.cueIndex ?? ((index) => index % Math.max(cues.length, 1));
5326
+ let speechStartedAt;
5327
+ let lastCueAt;
5328
+ let cueCount = 0;
5329
+ let firing = false;
5330
+ const tryFire = async (now) => {
5331
+ if (firing || cues.length === 0) {
5332
+ return;
5333
+ }
5334
+ if (speechStartedAt === undefined) {
5335
+ return;
5336
+ }
5337
+ const elapsed = now - speechStartedAt;
5338
+ if (elapsed < minSpeechMs) {
5339
+ return;
5340
+ }
5341
+ if (lastCueAt !== undefined && now - lastCueAt < cueIntervalMs) {
5342
+ return;
5343
+ }
5344
+ const cue = cues[cueIndexFn(cueCount)];
5345
+ if (!cue) {
5346
+ return;
5347
+ }
5348
+ firing = true;
5349
+ try {
5350
+ await options.onCue(cue);
5351
+ } finally {
5352
+ firing = false;
5353
+ lastCueAt = now;
5354
+ cueCount += 1;
5355
+ }
5356
+ };
5357
+ return {
5358
+ noteSilence: (timestampMs) => {
5359
+ const now = timestampMs ?? Date.now();
5360
+ if (lastCueAt !== undefined && now - lastCueAt > cueIntervalMs * 2) {
5361
+ speechStartedAt = undefined;
5362
+ }
5363
+ },
5364
+ noteSpeech: (timestampMs) => {
5365
+ const now = timestampMs ?? Date.now();
5366
+ if (speechStartedAt === undefined) {
5367
+ speechStartedAt = now;
5368
+ }
5369
+ tryFire(now);
5370
+ },
5371
+ reset: () => {
5372
+ speechStartedAt = undefined;
5373
+ lastCueAt = undefined;
5374
+ cueCount = 0;
5375
+ }
5376
+ };
5377
+ };
5378
+
5314
5379
  // src/core/handoff.ts
5315
5380
  var toHex = (bytes) => Array.from(bytes, (byte) => byte.toString(16).padStart(2, "0")).join("");
5316
5381
  var signHandoffBody = async (input) => {
@@ -7333,6 +7398,30 @@ var createVoiceSession = (options) => {
7333
7398
  });
7334
7399
  });
7335
7400
  };
7401
+ const emitBackchannelCue = (text) => {
7402
+ if (!text || !options.tts)
7403
+ return;
7404
+ if (activeTTSTurnId !== undefined || fillerActive)
7405
+ return;
7406
+ runSerial("backchannel.send", async () => {
7407
+ if (activeTTSTurnId !== undefined || fillerActive)
7408
+ return;
7409
+ const adapterSession = await ensureTTSSession();
7410
+ if (!adapterSession)
7411
+ return;
7412
+ try {
7413
+ await adapterSession.send(text);
7414
+ } catch {}
7415
+ });
7416
+ };
7417
+ const backchannelDriver = options.backchannel?.enabled && options.tts ? createVoiceBackchannelDriver({
7418
+ ...options.backchannel.cueIntervalMs !== undefined ? { cueIntervalMs: options.backchannel.cueIntervalMs } : {},
7419
+ ...options.backchannel.cues ? {
7420
+ cues: options.backchannel.cues.filter((cue) => typeof cue === "string" && cue.trim().length > 0).map((cue) => ({ text: cue }))
7421
+ } : {},
7422
+ ...options.backchannel.minSpeechMs !== undefined ? { minSpeechMs: options.backchannel.minSpeechMs } : {},
7423
+ onCue: (cue) => emitBackchannelCue(cue.text)
7424
+ }) : null;
7336
7425
  const createTurnTTSStreamer = (turn, session) => {
7337
7426
  let buffer = "";
7338
7427
  let full = "";
@@ -7824,6 +7913,7 @@ var createVoiceSession = (options) => {
7824
7913
  };
7825
7914
  const commitTurnInternal = async (reason = "manual") => {
7826
7915
  clearSilenceTimer();
7916
+ backchannelDriver?.reset();
7827
7917
  amdLastTurnCommitAt = Date.now();
7828
7918
  const session = await readSession();
7829
7919
  if (session.status === "completed" || session.status === "failed") {
@@ -8167,7 +8257,9 @@ var createVoiceSession = (options) => {
8167
8257
  speechDetected = true;
8168
8258
  clearSilenceTimer();
8169
8259
  kickCallSilenceWatchdog();
8260
+ backchannelDriver?.noteSpeech();
8170
8261
  } else if (speechDetected) {
8262
+ backchannelDriver?.noteSilence();
8171
8263
  const currentSession = await readSession();
8172
8264
  const hasTurnText = Boolean(buildTurnText(currentSession.currentTurn.transcripts, currentSession.currentTurn.partialText, {
8173
8265
  partialEndedAtMs: currentSession.currentTurn.partialEndedAt,
@@ -13646,6 +13738,7 @@ var createTwilioMediaStreamBridge = (socket, options) => {
13646
13738
  ...options.bargeInMinPartialWords !== undefined ? { bargeInMinPartialWords: options.bargeInMinPartialWords } : {},
13647
13739
  ...options.fillerFor ? { fillerFor: options.fillerFor } : {},
13648
13740
  ...options.fillerForTimeoutMs !== undefined ? { fillerForTimeoutMs: options.fillerForTimeoutMs } : {},
13741
+ ...options.backchannel ? { backchannel: options.backchannel } : {},
13649
13742
  ...options.defaultSilentTurnAck !== undefined ? { defaultSilentTurnAck: options.defaultSilentTurnAck } : {},
13650
13743
  ...options.routeOnTurnTimeoutMs !== undefined ? { routeOnTurnTimeoutMs: options.routeOnTurnTimeoutMs } : {},
13651
13744
  trace: options.trace,
package/package.json CHANGED
@@ -1,6 +1,6 @@
1
1
  {
2
2
  "name": "@absolutejs/voice",
3
- "version": "0.0.22-beta.578",
3
+ "version": "0.0.22-beta.579",
4
4
  "description": "Voice primitives and Elysia plugin for AbsoluteJS",
5
5
  "repository": {
6
6
  "type": "git",