@absolutejs/voice 0.0.22-beta.607 → 0.0.22-beta.608

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -21,4 +21,5 @@ export type VoiceBackchannelDriver = {
21
21
  noteSilence: (timestampMs?: number) => void;
22
22
  reset: () => void;
23
23
  };
24
+ export declare const isBackchannelUtterance: (text: string, maxWords?: number) => boolean;
24
25
  export declare const createVoiceBackchannelDriver: (options: VoiceBackchannelDriverOptions) => VoiceBackchannelDriver;
@@ -802,6 +802,14 @@ export type VoicePluginConfig<TContext = unknown, TSession extends VoiceSessionR
802
802
  turnDetection?: VoiceTurnDetectionConfig;
803
803
  semanticTurnDetector?: import("./semanticTurn").VoiceSemanticTurnDetector;
804
804
  bargeInMinPartialWords?: number;
805
+ /**
806
+ * When true, a pure listening cue ("mm-hm", "yeah", "right", "got it") spoken
807
+ * WHILE the assistant is talking does NOT barge in — the assistant keeps going
808
+ * and the cue is dropped so it never becomes the caller's next turn. A bare
809
+ * "yeah" said AFTER the assistant finishes is a normal answer, unaffected.
810
+ * Default false (any in-speech words interrupt, the prior behavior).
811
+ */
812
+ backchannelBargeInGuard?: boolean;
805
813
  fillerPhrases?: ReadonlyArray<string>;
806
814
  fillerDelayMs?: number;
807
815
  fillerFor?: (input: {
@@ -975,6 +983,14 @@ export type CreateVoiceSessionOptions<TContext = unknown, TSession extends Voice
975
983
  * Word splitting is whitespace-based. Punctuation is left attached.
976
984
  */
977
985
  bargeInMinPartialWords?: number;
986
+ /**
987
+ * When true, a pure listening cue ("mm-hm", "yeah", "right", "got it") spoken
988
+ * WHILE the assistant is talking does NOT barge in — the assistant keeps going
989
+ * and the cue is dropped so it never becomes the caller's next turn. A bare
990
+ * "yeah" said AFTER the assistant finishes is a normal answer, unaffected.
991
+ * Default false (any in-speech words interrupt, the prior behavior).
992
+ */
993
+ backchannelBargeInGuard?: boolean;
978
994
  fillerPhrases?: ReadonlyArray<string>;
979
995
  /** Milliseconds after turn-commit before the filler fires. Default 250ms — short enough to feel instant, long enough to skip if the LLM is very fast. */
980
996
  fillerDelayMs?: number;
package/dist/index.js CHANGED
@@ -3118,6 +3118,100 @@ var DEFAULT_CUES = [
3118
3118
  { text: "right" },
3119
3119
  { text: "go on" }
3120
3120
  ];
3121
/**
 * Single-word listening cues. An utterance made up only of these words (and
 * no longer than `maxWords`) is treated as a backchannel.
 *
 * Entries are compared against normalized words: lower-cased, with
 * punctuation (including hyphens) turned into word separators.
 */
var BACKCHANNEL_TOKENS = new Set([
  "mm",
  "mmm",
  "mhm",
  "mmhm",
  "mmhmm",
  "hm",
  "hmm",
  // Hyphens are word separators after normalization, so spoken "uh-huh" is
  // actually matched through "uh" + "huh" / the "uh huh" phrase. The entry is
  // kept for readers scanning the list.
  "uh-huh",
  "uhhuh",
  "uh",
  "huh",
  "ah",
  "oh",
  "yeah",
  "yep",
  "yup",
  "yes",
  "ya",
  "yah",
  "ok",
  "okay",
  "k",
  "kay",
  "right",
  "sure",
  "totally",
  "exactly",
  "absolutely",
  "definitely",
  "gotcha",
  "cool",
  "nice",
  "wow",
  "true",
  "fair",
  "aha",
  "perfect",
  "awesome",
  "great",
  "good",
  "wonderful",
  "amazing",
  "interesting",
  "understood",
  "agreed"
]);
/**
 * Multi-word listening cues. These are matched against the WHOLE normalized
 * utterance, so they only apply when nothing else was said.
 */
var BACKCHANNEL_PHRASES = new Set([
  "i see",
  "got it",
  "makes sense",
  "of course",
  "for sure",
  "fair enough",
  "sounds good",
  "i know",
  "oh ok",
  "oh okay",
  "that's right",
  "thats right",
  "oh wow",
  "oh nice",
  "oh cool",
  "uh huh",
  "mm hm",
  "mm hmm",
  "i hear you",
  "for real",
  "no way",
  "makes total sense",
  "got you",
  "i get it",
  "right right",
  "yeah yeah",
  "ok ok",
  "oh i see",
  "oh got it",
  "yeah totally",
  "yeah exactly"
]);
/**
 * Decide whether `text` is a pure listening cue ("mm-hm", "yeah", "got it")
 * rather than content-bearing speech.
 *
 * Normalization lower-cases the text, folds typographic apostrophes to `'`,
 * and treats every run of characters that is not a letter, a digit or an
 * apostrophe as a word separator. Digits and non-ASCII letters are kept as
 * word content on purpose: erasing them would make an utterance such as
 * "yes 5" look like a bare "yes".
 *
 * @param {string} text Transcript text to classify.
 * @param {number} [maxWords=3] Longest token-only utterance accepted. Exact
 *   matches in `BACKCHANNEL_PHRASES` are accepted regardless of this limit.
 * @returns {boolean} `true` when the whole utterance is a listening cue;
 *   `false` for empty or punctuation-only text and for anything containing a
 *   word outside the cue lists.
 */
var isBackchannelUtterance = (text, maxWords = 3) => {
  const words = text
    .toLowerCase()
    // STT engines frequently emit curly apostrophes ("that’s right").
    .replace(/[\u2018\u2019]/g, "'")
    .replace(/[^\p{L}\p{N}']+/gu, " ")
    .split(" ")
    // Apostrophes at a word edge are quote marks, not part of the word.
    .map((word) => word.replace(/^'+|'+$/g, ""))
    .filter((word) => word.length > 0);
  if (words.length === 0) {
    return false;
  }
  if (BACKCHANNEL_PHRASES.has(words.join(" "))) {
    return true;
  }
  if (words.length > maxWords) {
    return false;
  }
  return words.every((word) => BACKCHANNEL_TOKENS.has(word));
};
3121
3215
  var createVoiceBackchannelDriver = (options) => {
3122
3216
  const cues = options.cues ?? DEFAULT_CUES;
3123
3217
  const minSpeechMs = options.minSpeechMs ?? 2500;
@@ -3757,6 +3851,7 @@ var MAX_TTS_CHUNK_CHARS = 320;
3757
3851
  var STREAM_SENTENCE_END = /[.!?\u2026]['")\]]*$/;
3758
3852
  var STREAM_IDLE_FLUSH_MS = 350;
3759
3853
  var SPECULATIVE_DELAY_MS = 500;
3854
+ var BACKCHANNEL_DROP_WINDOW_MS = 2000;
3760
3855
  var nextSpeakableBoundary = (buffer) => {
3761
3856
  const match = STREAM_SENTENCE_BOUNDARY.exec(buffer);
3762
3857
  return match ? match.index + match[0].length : -1;
@@ -3996,6 +4091,8 @@ var createVoiceSession = (options) => {
3996
4091
  const fillerPhrases = (options.fillerPhrases ?? []).filter((p) => typeof p === "string" && p.trim().length > 0);
3997
4092
  const fillerDelayMs = options.fillerDelayMs ?? 250;
3998
4093
  const bargeInMinPartialWords = Math.max(1, options.bargeInMinPartialWords ?? 1);
4094
+ const backchannelBargeInGuard = options.backchannelBargeInGuard ?? false;
4095
+ let backchannelSuppressedAt = null;
3999
4096
  const fillerFor = options.fillerFor;
4000
4097
  const fillerForTimeoutMs = options.fillerForTimeoutMs ?? 600;
4001
4098
  const currentTurnAudio = [];
@@ -5169,7 +5266,19 @@ var createVoiceSession = (options) => {
5169
5266
  const triggeringText = transcript.text.trim();
5170
5267
  if (triggeringText) {
5171
5268
  const wordCount = triggeringText.split(/\s+/).length;
5172
- if (wordCount >= bargeInMinPartialWords) {
5269
+ if (wordCount >= bargeInMinPartialWords && backchannelBargeInGuard && isBackchannelUtterance(triggeringText)) {
5270
+ backchannelSuppressedAt = Date.now();
5271
+ appendTurnLatencyStage({
5272
+ metadata: {
5273
+ partial: triggeringText.slice(0, 200),
5274
+ reason: "backchannel",
5275
+ wordCount
5276
+ },
5277
+ stage: "barge_in_suppressed",
5278
+ turnId: activeTTSTurnId
5279
+ }).catch(() => {});
5280
+ } else if (wordCount >= bargeInMinPartialWords) {
5281
+ backchannelSuppressedAt = null;
5173
5282
  appendTurnLatencyStage({
5174
5283
  metadata: {
5175
5284
  partial: triggeringText.slice(0, 200),
@@ -5231,6 +5340,11 @@ var createVoiceSession = (options) => {
5231
5340
  };
5232
5341
  const handleFinal = async (transcript) => {
5233
5342
  sttReconnectCount = 0;
5343
+ if (backchannelBargeInGuard && backchannelSuppressedAt !== null && Date.now() - backchannelSuppressedAt < BACKCHANNEL_DROP_WINDOW_MS && isBackchannelUtterance(transcript.text.trim())) {
5344
+ backchannelSuppressedAt = null;
5345
+ return;
5346
+ }
5347
+ backchannelSuppressedAt = null;
5234
5348
  const session = await writeSession((session2) => {
5235
5349
  const alreadyPresent = session2.currentTurn.transcripts.some((existing) => existing.id === transcript.id);
5236
5350
  if (!alreadyPresent) {
@@ -39581,6 +39695,7 @@ var voice = (config) => {
39581
39695
  sttLifecycle: sessionOptions.sttLifecycle,
39582
39696
  ...config.semanticTurnDetector ? { semanticTurnDetector: config.semanticTurnDetector } : {},
39583
39697
  ...config.bargeInMinPartialWords !== undefined ? { bargeInMinPartialWords: config.bargeInMinPartialWords } : {},
39698
+ ...config.backchannelBargeInGuard !== undefined ? { backchannelBargeInGuard: config.backchannelBargeInGuard } : {},
39584
39699
  ...config.fillerPhrases ? { fillerPhrases: config.fillerPhrases } : {},
39585
39700
  ...config.fillerDelayMs !== undefined ? { fillerDelayMs: config.fillerDelayMs } : {},
39586
39701
  ...config.fillerFor ? { fillerFor: config.fillerFor } : {},
@@ -5450,6 +5450,100 @@ var DEFAULT_CUES = [
5450
5450
  { text: "right" },
5451
5451
  { text: "go on" }
5452
5452
  ];
5453
/**
 * Single-word listening cues. An utterance made up only of these words (and
 * no longer than `maxWords`) is treated as a backchannel.
 *
 * Entries are compared against normalized words: lower-cased, with
 * punctuation (including hyphens) turned into word separators.
 */
var BACKCHANNEL_TOKENS = new Set([
  "mm",
  "mmm",
  "mhm",
  "mmhm",
  "mmhmm",
  "hm",
  "hmm",
  // Hyphens are word separators after normalization, so spoken "uh-huh" is
  // actually matched through "uh" + "huh" / the "uh huh" phrase. The entry is
  // kept for readers scanning the list.
  "uh-huh",
  "uhhuh",
  "uh",
  "huh",
  "ah",
  "oh",
  "yeah",
  "yep",
  "yup",
  "yes",
  "ya",
  "yah",
  "ok",
  "okay",
  "k",
  "kay",
  "right",
  "sure",
  "totally",
  "exactly",
  "absolutely",
  "definitely",
  "gotcha",
  "cool",
  "nice",
  "wow",
  "true",
  "fair",
  "aha",
  "perfect",
  "awesome",
  "great",
  "good",
  "wonderful",
  "amazing",
  "interesting",
  "understood",
  "agreed"
]);
/**
 * Multi-word listening cues. These are matched against the WHOLE normalized
 * utterance, so they only apply when nothing else was said.
 */
var BACKCHANNEL_PHRASES = new Set([
  "i see",
  "got it",
  "makes sense",
  "of course",
  "for sure",
  "fair enough",
  "sounds good",
  "i know",
  "oh ok",
  "oh okay",
  "that's right",
  "thats right",
  "oh wow",
  "oh nice",
  "oh cool",
  "uh huh",
  "mm hm",
  "mm hmm",
  "i hear you",
  "for real",
  "no way",
  "makes total sense",
  "got you",
  "i get it",
  "right right",
  "yeah yeah",
  "ok ok",
  "oh i see",
  "oh got it",
  "yeah totally",
  "yeah exactly"
]);
/**
 * Decide whether `text` is a pure listening cue ("mm-hm", "yeah", "got it")
 * rather than content-bearing speech.
 *
 * Normalization lower-cases the text, folds typographic apostrophes to `'`,
 * and treats every run of characters that is not a letter, a digit or an
 * apostrophe as a word separator. Digits and non-ASCII letters are kept as
 * word content on purpose: erasing them would make an utterance such as
 * "yes 5" look like a bare "yes".
 *
 * @param {string} text Transcript text to classify.
 * @param {number} [maxWords=3] Longest token-only utterance accepted. Exact
 *   matches in `BACKCHANNEL_PHRASES` are accepted regardless of this limit.
 * @returns {boolean} `true` when the whole utterance is a listening cue;
 *   `false` for empty or punctuation-only text and for anything containing a
 *   word outside the cue lists.
 */
var isBackchannelUtterance = (text, maxWords = 3) => {
  const words = text
    .toLowerCase()
    // STT engines frequently emit curly apostrophes ("that’s right").
    .replace(/[\u2018\u2019]/g, "'")
    .replace(/[^\p{L}\p{N}']+/gu, " ")
    .split(" ")
    // Apostrophes at a word edge are quote marks, not part of the word.
    .map((word) => word.replace(/^'+|'+$/g, ""))
    .filter((word) => word.length > 0);
  if (words.length === 0) {
    return false;
  }
  if (BACKCHANNEL_PHRASES.has(words.join(" "))) {
    return true;
  }
  if (words.length > maxWords) {
    return false;
  }
  return words.every((word) => BACKCHANNEL_TOKENS.has(word));
};
5453
5547
  var createVoiceBackchannelDriver = (options) => {
5454
5548
  const cues = options.cues ?? DEFAULT_CUES;
5455
5549
  const minSpeechMs = options.minSpeechMs ?? 2500;
@@ -5984,6 +6078,7 @@ var MAX_TTS_CHUNK_CHARS = 320;
5984
6078
  var STREAM_SENTENCE_END = /[.!?\u2026]['")\]]*$/;
5985
6079
  var STREAM_IDLE_FLUSH_MS = 350;
5986
6080
  var SPECULATIVE_DELAY_MS = 500;
6081
+ var BACKCHANNEL_DROP_WINDOW_MS = 2000;
5987
6082
  var nextSpeakableBoundary = (buffer) => {
5988
6083
  const match = STREAM_SENTENCE_BOUNDARY.exec(buffer);
5989
6084
  return match ? match.index + match[0].length : -1;
@@ -6223,6 +6318,8 @@ var createVoiceSession = (options) => {
6223
6318
  const fillerPhrases = (options.fillerPhrases ?? []).filter((p) => typeof p === "string" && p.trim().length > 0);
6224
6319
  const fillerDelayMs = options.fillerDelayMs ?? 250;
6225
6320
  const bargeInMinPartialWords = Math.max(1, options.bargeInMinPartialWords ?? 1);
6321
+ const backchannelBargeInGuard = options.backchannelBargeInGuard ?? false;
6322
+ let backchannelSuppressedAt = null;
6226
6323
  const fillerFor = options.fillerFor;
6227
6324
  const fillerForTimeoutMs = options.fillerForTimeoutMs ?? 600;
6228
6325
  const currentTurnAudio = [];
@@ -7396,7 +7493,19 @@ var createVoiceSession = (options) => {
7396
7493
  const triggeringText = transcript.text.trim();
7397
7494
  if (triggeringText) {
7398
7495
  const wordCount = triggeringText.split(/\s+/).length;
7399
- if (wordCount >= bargeInMinPartialWords) {
7496
+ if (wordCount >= bargeInMinPartialWords && backchannelBargeInGuard && isBackchannelUtterance(triggeringText)) {
7497
+ backchannelSuppressedAt = Date.now();
7498
+ appendTurnLatencyStage({
7499
+ metadata: {
7500
+ partial: triggeringText.slice(0, 200),
7501
+ reason: "backchannel",
7502
+ wordCount
7503
+ },
7504
+ stage: "barge_in_suppressed",
7505
+ turnId: activeTTSTurnId
7506
+ }).catch(() => {});
7507
+ } else if (wordCount >= bargeInMinPartialWords) {
7508
+ backchannelSuppressedAt = null;
7400
7509
  appendTurnLatencyStage({
7401
7510
  metadata: {
7402
7511
  partial: triggeringText.slice(0, 200),
@@ -7458,6 +7567,11 @@ var createVoiceSession = (options) => {
7458
7567
  };
7459
7568
  const handleFinal = async (transcript) => {
7460
7569
  sttReconnectCount = 0;
7570
+ if (backchannelBargeInGuard && backchannelSuppressedAt !== null && Date.now() - backchannelSuppressedAt < BACKCHANNEL_DROP_WINDOW_MS && isBackchannelUtterance(transcript.text.trim())) {
7571
+ backchannelSuppressedAt = null;
7572
+ return;
7573
+ }
7574
+ backchannelSuppressedAt = null;
7461
7575
  const session = await writeSession((session2) => {
7462
7576
  const alreadyPresent = session2.currentTurn.transcripts.some((existing) => existing.id === transcript.id);
7463
7577
  if (!alreadyPresent) {
package/package.json CHANGED
@@ -1,6 +1,6 @@
1
1
  {
2
2
  "name": "@absolutejs/voice",
3
- "version": "0.0.22-beta.607",
3
+ "version": "0.0.22-beta.608",
4
4
  "description": "Voice primitives and Elysia plugin for AbsoluteJS",
5
5
  "repository": {
6
6
  "type": "git",