@absolutejs/voice 0.0.22-beta.606 → 0.0.22-beta.608

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -21,4 +21,5 @@ export type VoiceBackchannelDriver = {
21
21
  noteSilence: (timestampMs?: number) => void;
22
22
  reset: () => void;
23
23
  };
24
+ export declare const isBackchannelUtterance: (text: string, maxWords?: number) => boolean;
24
25
  export declare const createVoiceBackchannelDriver: (options: VoiceBackchannelDriverOptions) => VoiceBackchannelDriver;
@@ -802,6 +802,14 @@ export type VoicePluginConfig<TContext = unknown, TSession extends VoiceSessionR
802
802
  turnDetection?: VoiceTurnDetectionConfig;
803
803
  semanticTurnDetector?: import("./semanticTurn").VoiceSemanticTurnDetector;
804
804
  bargeInMinPartialWords?: number;
805
+ /**
806
+ * When true, a pure listening cue ("mm-hm", "yeah", "right", "got it") spoken
807
+ * WHILE the assistant is talking does NOT barge-in — the assistant keeps going
808
+ * and the cue is dropped so it never becomes the caller's next turn. A bare
809
+ * "yeah" said AFTER the assistant finishes is a normal answer, unaffected.
810
+ * Default false (any in-speech words interrupt, the prior behavior).
811
+ */
812
+ backchannelBargeInGuard?: boolean;
805
813
  fillerPhrases?: ReadonlyArray<string>;
806
814
  fillerDelayMs?: number;
807
815
  fillerFor?: (input: {
@@ -975,6 +983,14 @@ export type CreateVoiceSessionOptions<TContext = unknown, TSession extends Voice
975
983
  * Word splitting is whitespace-based. Punctuation is left attached.
976
984
  */
977
985
  bargeInMinPartialWords?: number;
986
+ /**
987
+ * When true, a pure listening cue ("mm-hm", "yeah", "right", "got it") spoken
988
+ * WHILE the assistant is talking does NOT barge-in — the assistant keeps going
989
+ * and the cue is dropped so it never becomes the caller's next turn. A bare
990
+ * "yeah" said AFTER the assistant finishes is a normal answer, unaffected.
991
+ * Default false (any in-speech words interrupt, the prior behavior).
992
+ */
993
+ backchannelBargeInGuard?: boolean;
978
994
  fillerPhrases?: ReadonlyArray<string>;
979
995
  /** Milliseconds after turn-commit before the filler fires. Default 250ms — short enough to feel instant, long enough to skip if the LLM is very fast. */
980
996
  fillerDelayMs?: number;
package/dist/index.js CHANGED
@@ -3118,6 +3118,100 @@ var DEFAULT_CUES = [
3118
3118
  { text: "right" },
3119
3119
  { text: "go on" }
3120
3120
  ];
3121
+ var BACKCHANNEL_TOKENS = new Set([
3122
+ "mm",
3123
+ "mmm",
3124
+ "mhm",
3125
+ "mmhm",
3126
+ "mmhmm",
3127
+ "hm",
3128
+ "hmm",
3129
+ "uh-huh",
3130
+ "uhhuh",
3131
+ "uh",
3132
+ "huh",
3133
+ "ah",
3134
+ "oh",
3135
+ "yeah",
3136
+ "yep",
3137
+ "yup",
3138
+ "yes",
3139
+ "ya",
3140
+ "yah",
3141
+ "ok",
3142
+ "okay",
3143
+ "k",
3144
+ "kay",
3145
+ "right",
3146
+ "sure",
3147
+ "totally",
3148
+ "exactly",
3149
+ "absolutely",
3150
+ "definitely",
3151
+ "gotcha",
3152
+ "cool",
3153
+ "nice",
3154
+ "wow",
3155
+ "true",
3156
+ "fair",
3157
+ "aha",
3158
+ "perfect",
3159
+ "awesome",
3160
+ "great",
3161
+ "good",
3162
+ "wonderful",
3163
+ "amazing",
3164
+ "interesting",
3165
+ "understood",
3166
+ "agreed"
3167
+ ]);
3168
+ var BACKCHANNEL_PHRASES = new Set([
3169
+ "i see",
3170
+ "got it",
3171
+ "makes sense",
3172
+ "of course",
3173
+ "for sure",
3174
+ "fair enough",
3175
+ "sounds good",
3176
+ "i know",
3177
+ "oh ok",
3178
+ "oh okay",
3179
+ "that's right",
3180
+ "thats right",
3181
+ "oh wow",
3182
+ "oh nice",
3183
+ "oh cool",
3184
+ "uh huh",
3185
+ "mm hm",
3186
+ "mm hmm",
3187
+ "i hear you",
3188
+ "for real",
3189
+ "no way",
3190
+ "makes total sense",
3191
+ "got you",
3192
+ "i get it",
3193
+ "right right",
3194
+ "yeah yeah",
3195
+ "ok ok",
3196
+ "oh i see",
3197
+ "oh got it",
3198
+ "yeah totally",
3199
+ "yeah exactly"
3200
+ ]);
3201
+ var isBackchannelUtterance = (text, maxWords = 3) => {
3202
+ const normalized = text.toLowerCase().replace(/[^a-z']/g, " ").replace(/\s+/g, " ").trim();
3203
+ if (!normalized) {
3204
+ return false;
3205
+ }
3206
+ if (BACKCHANNEL_PHRASES.has(normalized)) {
3207
+ return true;
3208
+ }
3209
+ const words = normalized.split(" ");
3210
+ if (words.length > maxWords) {
3211
+ return false;
3212
+ }
3213
+ return words.every((word) => BACKCHANNEL_TOKENS.has(word));
3214
+ };
3121
3215
  var createVoiceBackchannelDriver = (options) => {
3122
3216
  const cues = options.cues ?? DEFAULT_CUES;
3123
3217
  const minSpeechMs = options.minSpeechMs ?? 2500;
@@ -3757,6 +3851,7 @@ var MAX_TTS_CHUNK_CHARS = 320;
3757
3851
  var STREAM_SENTENCE_END = /[.!?\u2026]['")\]]*$/;
3758
3852
  var STREAM_IDLE_FLUSH_MS = 350;
3759
3853
  var SPECULATIVE_DELAY_MS = 500;
3854
+ var BACKCHANNEL_DROP_WINDOW_MS = 2000;
3760
3855
  var nextSpeakableBoundary = (buffer) => {
3761
3856
  const match = STREAM_SENTENCE_BOUNDARY.exec(buffer);
3762
3857
  return match ? match.index + match[0].length : -1;
@@ -3996,6 +4091,8 @@ var createVoiceSession = (options) => {
3996
4091
  const fillerPhrases = (options.fillerPhrases ?? []).filter((p) => typeof p === "string" && p.trim().length > 0);
3997
4092
  const fillerDelayMs = options.fillerDelayMs ?? 250;
3998
4093
  const bargeInMinPartialWords = Math.max(1, options.bargeInMinPartialWords ?? 1);
4094
+ const backchannelBargeInGuard = options.backchannelBargeInGuard ?? false;
4095
+ let backchannelSuppressedAt = null;
3999
4096
  const fillerFor = options.fillerFor;
4000
4097
  const fillerForTimeoutMs = options.fillerForTimeoutMs ?? 600;
4001
4098
  const currentTurnAudio = [];
@@ -4496,18 +4593,14 @@ var createVoiceSession = (options) => {
4496
4593
  text: pendingText,
4497
4594
  transcripts: session.currentTurn.transcripts
4498
4595
  };
4499
- const startedAt = Date.now();
4500
4596
  const speculate = options.route.speculate;
4501
4597
  const promise = Promise.resolve(speculate({
4502
4598
  api,
4503
4599
  context: options.context,
4504
4600
  session,
4505
4601
  turn: provisionalTurn
4506
- })).then((result) => {
4507
- console.info(`[voice][p3] speculate done session=${session.id} -> ${result?.text ? `${result.text.length} chars` : "null"} in ${Date.now() - startedAt}ms for "${pendingText.slice(0, 30)}"`);
4508
- return result && result.text.trim() ? { text: result.text } : null;
4509
- }).catch((error) => {
4510
- console.info(`[voice][p3] speculate error: ${error instanceof Error ? error.message : String(error)}`);
4602
+ })).then((result) => result && result.text.trim() ? { text: result.text } : null).catch((error) => {
4603
+ console.info(`[voice][p3] speculate error session=${session.id}: ${error instanceof Error ? error.message : String(error)}`);
4511
4604
  return null;
4512
4605
  });
4513
4606
  speculation = { pendingText, promise };
@@ -5173,7 +5266,19 @@ var createVoiceSession = (options) => {
5173
5266
  const triggeringText = transcript.text.trim();
5174
5267
  if (triggeringText) {
5175
5268
  const wordCount = triggeringText.split(/\s+/).length;
5176
- if (wordCount >= bargeInMinPartialWords) {
5269
+ if (wordCount >= bargeInMinPartialWords && backchannelBargeInGuard && isBackchannelUtterance(triggeringText)) {
5270
+ backchannelSuppressedAt = Date.now();
5271
+ appendTurnLatencyStage({
5272
+ metadata: {
5273
+ partial: triggeringText.slice(0, 200),
5274
+ reason: "backchannel",
5275
+ wordCount
5276
+ },
5277
+ stage: "barge_in_suppressed",
5278
+ turnId: activeTTSTurnId
5279
+ }).catch(() => {});
5280
+ } else if (wordCount >= bargeInMinPartialWords) {
5281
+ backchannelSuppressedAt = null;
5177
5282
  appendTurnLatencyStage({
5178
5283
  metadata: {
5179
5284
  partial: triggeringText.slice(0, 200),
@@ -5235,6 +5340,11 @@ var createVoiceSession = (options) => {
5235
5340
  };
5236
5341
  const handleFinal = async (transcript) => {
5237
5342
  sttReconnectCount = 0;
5343
+ if (backchannelBargeInGuard && backchannelSuppressedAt !== null && Date.now() - backchannelSuppressedAt < BACKCHANNEL_DROP_WINDOW_MS && isBackchannelUtterance(transcript.text.trim())) {
5344
+ backchannelSuppressedAt = null;
5345
+ return;
5346
+ }
5347
+ backchannelSuppressedAt = null;
5238
5348
  const session = await writeSession((session2) => {
5239
5349
  const alreadyPresent = session2.currentTurn.transcripts.some((existing) => existing.id === transcript.id);
5240
5350
  if (!alreadyPresent) {
@@ -39585,6 +39695,7 @@ var voice = (config) => {
39585
39695
  sttLifecycle: sessionOptions.sttLifecycle,
39586
39696
  ...config.semanticTurnDetector ? { semanticTurnDetector: config.semanticTurnDetector } : {},
39587
39697
  ...config.bargeInMinPartialWords !== undefined ? { bargeInMinPartialWords: config.bargeInMinPartialWords } : {},
39698
+ ...config.backchannelBargeInGuard !== undefined ? { backchannelBargeInGuard: config.backchannelBargeInGuard } : {},
39588
39699
  ...config.fillerPhrases ? { fillerPhrases: config.fillerPhrases } : {},
39589
39700
  ...config.fillerDelayMs !== undefined ? { fillerDelayMs: config.fillerDelayMs } : {},
39590
39701
  ...config.fillerFor ? { fillerFor: config.fillerFor } : {},
@@ -5450,6 +5450,100 @@ var DEFAULT_CUES = [
5450
5450
  { text: "right" },
5451
5451
  { text: "go on" }
5452
5452
  ];
5453
+ var BACKCHANNEL_TOKENS = new Set([
5454
+ "mm",
5455
+ "mmm",
5456
+ "mhm",
5457
+ "mmhm",
5458
+ "mmhmm",
5459
+ "hm",
5460
+ "hmm",
5461
+ "uh-huh",
5462
+ "uhhuh",
5463
+ "uh",
5464
+ "huh",
5465
+ "ah",
5466
+ "oh",
5467
+ "yeah",
5468
+ "yep",
5469
+ "yup",
5470
+ "yes",
5471
+ "ya",
5472
+ "yah",
5473
+ "ok",
5474
+ "okay",
5475
+ "k",
5476
+ "kay",
5477
+ "right",
5478
+ "sure",
5479
+ "totally",
5480
+ "exactly",
5481
+ "absolutely",
5482
+ "definitely",
5483
+ "gotcha",
5484
+ "cool",
5485
+ "nice",
5486
+ "wow",
5487
+ "true",
5488
+ "fair",
5489
+ "aha",
5490
+ "perfect",
5491
+ "awesome",
5492
+ "great",
5493
+ "good",
5494
+ "wonderful",
5495
+ "amazing",
5496
+ "interesting",
5497
+ "understood",
5498
+ "agreed"
5499
+ ]);
5500
+ var BACKCHANNEL_PHRASES = new Set([
5501
+ "i see",
5502
+ "got it",
5503
+ "makes sense",
5504
+ "of course",
5505
+ "for sure",
5506
+ "fair enough",
5507
+ "sounds good",
5508
+ "i know",
5509
+ "oh ok",
5510
+ "oh okay",
5511
+ "that's right",
5512
+ "thats right",
5513
+ "oh wow",
5514
+ "oh nice",
5515
+ "oh cool",
5516
+ "uh huh",
5517
+ "mm hm",
5518
+ "mm hmm",
5519
+ "i hear you",
5520
+ "for real",
5521
+ "no way",
5522
+ "makes total sense",
5523
+ "got you",
5524
+ "i get it",
5525
+ "right right",
5526
+ "yeah yeah",
5527
+ "ok ok",
5528
+ "oh i see",
5529
+ "oh got it",
5530
+ "yeah totally",
5531
+ "yeah exactly"
5532
+ ]);
5533
+ var isBackchannelUtterance = (text, maxWords = 3) => {
5534
+ const normalized = text.toLowerCase().replace(/[^a-z']/g, " ").replace(/\s+/g, " ").trim();
5535
+ if (!normalized) {
5536
+ return false;
5537
+ }
5538
+ if (BACKCHANNEL_PHRASES.has(normalized)) {
5539
+ return true;
5540
+ }
5541
+ const words = normalized.split(" ");
5542
+ if (words.length > maxWords) {
5543
+ return false;
5544
+ }
5545
+ return words.every((word) => BACKCHANNEL_TOKENS.has(word));
5546
+ };
5453
5547
  var createVoiceBackchannelDriver = (options) => {
5454
5548
  const cues = options.cues ?? DEFAULT_CUES;
5455
5549
  const minSpeechMs = options.minSpeechMs ?? 2500;
@@ -5984,6 +6078,7 @@ var MAX_TTS_CHUNK_CHARS = 320;
5984
6078
  var STREAM_SENTENCE_END = /[.!?\u2026]['")\]]*$/;
5985
6079
  var STREAM_IDLE_FLUSH_MS = 350;
5986
6080
  var SPECULATIVE_DELAY_MS = 500;
6081
+ var BACKCHANNEL_DROP_WINDOW_MS = 2000;
5987
6082
  var nextSpeakableBoundary = (buffer) => {
5988
6083
  const match = STREAM_SENTENCE_BOUNDARY.exec(buffer);
5989
6084
  return match ? match.index + match[0].length : -1;
@@ -6223,6 +6318,8 @@ var createVoiceSession = (options) => {
6223
6318
  const fillerPhrases = (options.fillerPhrases ?? []).filter((p) => typeof p === "string" && p.trim().length > 0);
6224
6319
  const fillerDelayMs = options.fillerDelayMs ?? 250;
6225
6320
  const bargeInMinPartialWords = Math.max(1, options.bargeInMinPartialWords ?? 1);
6321
+ const backchannelBargeInGuard = options.backchannelBargeInGuard ?? false;
6322
+ let backchannelSuppressedAt = null;
6226
6323
  const fillerFor = options.fillerFor;
6227
6324
  const fillerForTimeoutMs = options.fillerForTimeoutMs ?? 600;
6228
6325
  const currentTurnAudio = [];
@@ -6723,18 +6820,14 @@ var createVoiceSession = (options) => {
6723
6820
  text: pendingText,
6724
6821
  transcripts: session.currentTurn.transcripts
6725
6822
  };
6726
- const startedAt = Date.now();
6727
6823
  const speculate = options.route.speculate;
6728
6824
  const promise = Promise.resolve(speculate({
6729
6825
  api,
6730
6826
  context: options.context,
6731
6827
  session,
6732
6828
  turn: provisionalTurn
6733
- })).then((result) => {
6734
- console.info(`[voice][p3] speculate done session=${session.id} -> ${result?.text ? `${result.text.length} chars` : "null"} in ${Date.now() - startedAt}ms for "${pendingText.slice(0, 30)}"`);
6735
- return result && result.text.trim() ? { text: result.text } : null;
6736
- }).catch((error) => {
6737
- console.info(`[voice][p3] speculate error: ${error instanceof Error ? error.message : String(error)}`);
6829
+ })).then((result) => result && result.text.trim() ? { text: result.text } : null).catch((error) => {
6830
+ console.info(`[voice][p3] speculate error session=${session.id}: ${error instanceof Error ? error.message : String(error)}`);
6738
6831
  return null;
6739
6832
  });
6740
6833
  speculation = { pendingText, promise };
@@ -7400,7 +7493,19 @@ var createVoiceSession = (options) => {
7400
7493
  const triggeringText = transcript.text.trim();
7401
7494
  if (triggeringText) {
7402
7495
  const wordCount = triggeringText.split(/\s+/).length;
7403
- if (wordCount >= bargeInMinPartialWords) {
7496
+ if (wordCount >= bargeInMinPartialWords && backchannelBargeInGuard && isBackchannelUtterance(triggeringText)) {
7497
+ backchannelSuppressedAt = Date.now();
7498
+ appendTurnLatencyStage({
7499
+ metadata: {
7500
+ partial: triggeringText.slice(0, 200),
7501
+ reason: "backchannel",
7502
+ wordCount
7503
+ },
7504
+ stage: "barge_in_suppressed",
7505
+ turnId: activeTTSTurnId
7506
+ }).catch(() => {});
7507
+ } else if (wordCount >= bargeInMinPartialWords) {
7508
+ backchannelSuppressedAt = null;
7404
7509
  appendTurnLatencyStage({
7405
7510
  metadata: {
7406
7511
  partial: triggeringText.slice(0, 200),
@@ -7462,6 +7567,11 @@ var createVoiceSession = (options) => {
7462
7567
  };
7463
7568
  const handleFinal = async (transcript) => {
7464
7569
  sttReconnectCount = 0;
7570
+ if (backchannelBargeInGuard && backchannelSuppressedAt !== null && Date.now() - backchannelSuppressedAt < BACKCHANNEL_DROP_WINDOW_MS && isBackchannelUtterance(transcript.text.trim())) {
7571
+ backchannelSuppressedAt = null;
7572
+ return;
7573
+ }
7574
+ backchannelSuppressedAt = null;
7465
7575
  const session = await writeSession((session2) => {
7466
7576
  const alreadyPresent = session2.currentTurn.transcripts.some((existing) => existing.id === transcript.id);
7467
7577
  if (!alreadyPresent) {
package/package.json CHANGED
@@ -1,6 +1,6 @@
1
1
  {
2
2
  "name": "@absolutejs/voice",
3
- "version": "0.0.22-beta.606",
3
+ "version": "0.0.22-beta.608",
4
4
  "description": "Voice primitives and Elysia plugin for AbsoluteJS",
5
5
  "repository": {
6
6
  "type": "git",