@absolutejs/voice 0.0.22-beta.562 → 0.0.22-beta.564

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -913,6 +913,20 @@ export type CreateVoiceSessionOptions<TContext = unknown, TSession extends Voice
913
913
  * Set `fillerPhrases: []` (or omit) to disable. Reasonable defaults if
914
914
  * you enable: `["Hmm.", "Got it.", "Right.", "Mm-hm.", "Let me think.", "Okay."]`.
915
915
  */
916
+ /**
917
+ * Minimum word count in an STT partial transcript before speech-gated
918
+ * barge-in cancels the in-flight assistant TTS. Default 1 (any non-empty
919
+ * partial triggers barge-in — backwards-compatible).
920
+ *
921
+ * Set to 2 (or higher) on phone routes where the caller's brief
922
+ * acknowledgements ("yeah", "uh-huh", "you"; two-word "am i" needs 3) would otherwise
923
+ * cut the bot off mid-question. Each extra word added typically delays
924
+ * barge-in by ~100-200ms (one extra STT partial cycle) — cheap compared
925
+ * to losing the bot's response.
926
+ *
927
+ * Word splitting is whitespace-based; punctuation stays attached to its word. Values below 1 are treated as 1.
928
+ */
929
+ bargeInMinPartialWords?: number;
916
930
  fillerPhrases?: ReadonlyArray<string>;
917
931
  /** Milliseconds after turn-commit before the filler fires. Default 250ms — short enough to feel instant, long enough to skip if the LLM is very fast. */
918
932
  fillerDelayMs?: number;
package/dist/index.js CHANGED
@@ -3852,7 +3852,7 @@ var createVoiceSession = (options) => {
3852
3852
  };
3853
3853
  const appendTurnLatencyStage = async (input) => appendTrace({
3854
3854
  at: input.at,
3855
- payload: { stage: input.stage },
3855
+ payload: { stage: input.stage, ...input.metadata ?? {} },
3856
3856
  session: input.session,
3857
3857
  turnId: input.turnId,
3858
3858
  type: "turn_latency.stage"
@@ -3875,6 +3875,7 @@ var createVoiceSession = (options) => {
3875
3875
  let fillerToken = 0;
3876
3876
  const fillerPhrases = (options.fillerPhrases ?? []).filter((p) => typeof p === "string" && p.trim().length > 0);
3877
3877
  const fillerDelayMs = options.fillerDelayMs ?? 250;
3878
+ const bargeInMinPartialWords = Math.max(1, options.bargeInMinPartialWords ?? 1);
3878
3879
  const fillerFor = options.fillerFor;
3879
3880
  const fillerForTimeoutMs = options.fillerForTimeoutMs ?? 600;
3880
3881
  const currentTurnAudio = [];
@@ -4261,6 +4262,11 @@ var createVoiceSession = (options) => {
4261
4262
  return;
4262
4263
  }
4263
4264
  activeTTSTurnId = undefined;
4265
+ appendTurnLatencyStage({
4266
+ metadata: { reason },
4267
+ stage: "tts_canceled",
4268
+ turnId: cancelledTurnId
4269
+ }).catch(() => {});
4264
4270
  Promise.resolve(socket.clear?.()).catch(() => {});
4265
4271
  if (!ttsAdapterSessionCanCancel(activeSession)) {
4266
4272
  return;
@@ -4919,8 +4925,33 @@ var createVoiceSession = (options) => {
4919
4925
  };
4920
4926
  };
4921
4927
  const handlePartial = async (transcript) => {
4922
- if (activeTTSTurnId !== undefined && transcript.text.trim()) {
4923
- cancelActiveTTS("barge-in");
4928
+ if (activeTTSTurnId !== undefined) {
4929
+ const triggeringText = transcript.text.trim();
4930
+ if (triggeringText) {
4931
+ const wordCount = triggeringText.split(/\s+/).length;
4932
+ if (wordCount >= bargeInMinPartialWords) {
4933
+ appendTurnLatencyStage({
4934
+ metadata: {
4935
+ partial: triggeringText.slice(0, 200),
4936
+ source: "stt_partial",
4937
+ wordCount
4938
+ },
4939
+ stage: "barge_in",
4940
+ turnId: activeTTSTurnId
4941
+ }).catch(() => {});
4942
+ cancelActiveTTS("barge-in");
4943
+ } else {
4944
+ appendTurnLatencyStage({
4945
+ metadata: {
4946
+ partial: triggeringText.slice(0, 200),
4947
+ reason: "below_min_words",
4948
+ wordCount
4949
+ },
4950
+ stage: "barge_in_suppressed",
4951
+ turnId: activeTTSTurnId
4952
+ }).catch(() => {});
4953
+ }
4954
+ }
4924
4955
  }
4925
4956
  const session = await writeSession((session2) => {
4926
4957
  const nextPartialStartedAt = transcript.startedAtMs ?? session2.currentTurn.partialStartedAt;
@@ -5327,13 +5358,17 @@ var createVoiceSession = (options) => {
5327
5358
  if (myToken !== fillerToken || activeTTSTurnId === turn.id)
5328
5359
  return;
5329
5360
  let phrase = null;
5361
+ let source = "static";
5330
5362
  if (fillerForPromise) {
5331
5363
  phrase = await fillerForPromise;
5364
+ if (phrase)
5365
+ source = "fillerFor";
5332
5366
  if (myToken !== fillerToken || activeTTSTurnId === turn.id)
5333
5367
  return;
5334
5368
  }
5335
5369
  if (!phrase && fillerPhrases.length > 0) {
5336
5370
  phrase = fillerPhrases[Math.floor(Math.random() * fillerPhrases.length)] ?? null;
5371
+ source = "static";
5337
5372
  }
5338
5373
  if (!phrase)
5339
5374
  return;
@@ -5341,6 +5376,11 @@ var createVoiceSession = (options) => {
5341
5376
  if (!adapterSession)
5342
5377
  return;
5343
5378
  fillerActive = true;
5379
+ appendTurnLatencyStage({
5380
+ metadata: { phrase, source },
5381
+ stage: "filler_sent",
5382
+ turnId: turn.id
5383
+ }).catch(() => {});
5344
5384
  try {
5345
5385
  await adapterSession.send(phrase);
5346
5386
  } catch {
@@ -24742,6 +24782,7 @@ var createTwilioMediaStreamBridge = (socket, options) => {
24742
24782
  ...options.semanticTurnDetector ? { semanticTurnDetector: options.semanticTurnDetector } : {},
24743
24783
  ...options.fillerPhrases ? { fillerPhrases: options.fillerPhrases } : {},
24744
24784
  ...options.fillerDelayMs !== undefined ? { fillerDelayMs: options.fillerDelayMs } : {},
24785
+ ...options.bargeInMinPartialWords !== undefined ? { bargeInMinPartialWords: options.bargeInMinPartialWords } : {},
24745
24786
  ...options.fillerFor ? { fillerFor: options.fillerFor } : {},
24746
24787
  ...options.fillerForTimeoutMs !== undefined ? { fillerForTimeoutMs: options.fillerForTimeoutMs } : {},
24747
24788
  ...options.defaultSilentTurnAck !== undefined ? { defaultSilentTurnAck: options.defaultSilentTurnAck } : {},
@@ -132,6 +132,14 @@ export type TwilioMediaStreamBridgeOptions<TContext = unknown, TSession extends
132
132
  fillerPhrases?: ReadonlyArray<string>;
133
133
  /** Milliseconds after turn-commit before the filler fires. Default 250ms. */
134
134
  fillerDelayMs?: number;
135
+ /**
136
+ * Minimum word count in an STT partial transcript before barge-in
137
+ * cancels in-flight assistant TTS. Default 1 (any partial).
138
+ * Recommended 2 on phone routes — very short partials ("you"; two-word "am i" needs 3)
139
+ * cut the bot off mid-question per live-test 2026-05-27. See
140
+ * CreateVoiceSessionOptions for full semantics.
141
+ */
142
+ bargeInMinPartialWords?: number;
135
143
  /**
136
144
  * Content-aware filler (Latency Theater). Called in parallel with the
137
145
  * main LLM turn; if it resolves within `fillerForTimeoutMs` the runtime
@@ -5723,7 +5723,7 @@ var createVoiceSession = (options) => {
5723
5723
  };
5724
5724
  const appendTurnLatencyStage = async (input) => appendTrace({
5725
5725
  at: input.at,
5726
- payload: { stage: input.stage },
5726
+ payload: { stage: input.stage, ...input.metadata ?? {} },
5727
5727
  session: input.session,
5728
5728
  turnId: input.turnId,
5729
5729
  type: "turn_latency.stage"
@@ -5746,6 +5746,7 @@ var createVoiceSession = (options) => {
5746
5746
  let fillerToken = 0;
5747
5747
  const fillerPhrases = (options.fillerPhrases ?? []).filter((p) => typeof p === "string" && p.trim().length > 0);
5748
5748
  const fillerDelayMs = options.fillerDelayMs ?? 250;
5749
+ const bargeInMinPartialWords = Math.max(1, options.bargeInMinPartialWords ?? 1);
5749
5750
  const fillerFor = options.fillerFor;
5750
5751
  const fillerForTimeoutMs = options.fillerForTimeoutMs ?? 600;
5751
5752
  const currentTurnAudio = [];
@@ -6132,6 +6133,11 @@ var createVoiceSession = (options) => {
6132
6133
  return;
6133
6134
  }
6134
6135
  activeTTSTurnId = undefined;
6136
+ appendTurnLatencyStage({
6137
+ metadata: { reason },
6138
+ stage: "tts_canceled",
6139
+ turnId: cancelledTurnId
6140
+ }).catch(() => {});
6135
6141
  Promise.resolve(socket.clear?.()).catch(() => {});
6136
6142
  if (!ttsAdapterSessionCanCancel(activeSession)) {
6137
6143
  return;
@@ -6790,8 +6796,33 @@ var createVoiceSession = (options) => {
6790
6796
  };
6791
6797
  };
6792
6798
  const handlePartial = async (transcript) => {
6793
- if (activeTTSTurnId !== undefined && transcript.text.trim()) {
6794
- cancelActiveTTS("barge-in");
6799
+ if (activeTTSTurnId !== undefined) {
6800
+ const triggeringText = transcript.text.trim();
6801
+ if (triggeringText) {
6802
+ const wordCount = triggeringText.split(/\s+/).length;
6803
+ if (wordCount >= bargeInMinPartialWords) {
6804
+ appendTurnLatencyStage({
6805
+ metadata: {
6806
+ partial: triggeringText.slice(0, 200),
6807
+ source: "stt_partial",
6808
+ wordCount
6809
+ },
6810
+ stage: "barge_in",
6811
+ turnId: activeTTSTurnId
6812
+ }).catch(() => {});
6813
+ cancelActiveTTS("barge-in");
6814
+ } else {
6815
+ appendTurnLatencyStage({
6816
+ metadata: {
6817
+ partial: triggeringText.slice(0, 200),
6818
+ reason: "below_min_words",
6819
+ wordCount
6820
+ },
6821
+ stage: "barge_in_suppressed",
6822
+ turnId: activeTTSTurnId
6823
+ }).catch(() => {});
6824
+ }
6825
+ }
6795
6826
  }
6796
6827
  const session = await writeSession((session2) => {
6797
6828
  const nextPartialStartedAt = transcript.startedAtMs ?? session2.currentTurn.partialStartedAt;
@@ -7198,13 +7229,17 @@ var createVoiceSession = (options) => {
7198
7229
  if (myToken !== fillerToken || activeTTSTurnId === turn.id)
7199
7230
  return;
7200
7231
  let phrase = null;
7232
+ let source = "static";
7201
7233
  if (fillerForPromise) {
7202
7234
  phrase = await fillerForPromise;
7235
+ if (phrase)
7236
+ source = "fillerFor";
7203
7237
  if (myToken !== fillerToken || activeTTSTurnId === turn.id)
7204
7238
  return;
7205
7239
  }
7206
7240
  if (!phrase && fillerPhrases.length > 0) {
7207
7241
  phrase = fillerPhrases[Math.floor(Math.random() * fillerPhrases.length)] ?? null;
7242
+ source = "static";
7208
7243
  }
7209
7244
  if (!phrase)
7210
7245
  return;
@@ -7212,6 +7247,11 @@ var createVoiceSession = (options) => {
7212
7247
  if (!adapterSession)
7213
7248
  return;
7214
7249
  fillerActive = true;
7250
+ appendTurnLatencyStage({
7251
+ metadata: { phrase, source },
7252
+ stage: "filler_sent",
7253
+ turnId: turn.id
7254
+ }).catch(() => {});
7215
7255
  try {
7216
7256
  await adapterSession.send(phrase);
7217
7257
  } catch {
@@ -13332,6 +13372,7 @@ var createTwilioMediaStreamBridge = (socket, options) => {
13332
13372
  ...options.semanticTurnDetector ? { semanticTurnDetector: options.semanticTurnDetector } : {},
13333
13373
  ...options.fillerPhrases ? { fillerPhrases: options.fillerPhrases } : {},
13334
13374
  ...options.fillerDelayMs !== undefined ? { fillerDelayMs: options.fillerDelayMs } : {},
13375
+ ...options.bargeInMinPartialWords !== undefined ? { bargeInMinPartialWords: options.bargeInMinPartialWords } : {},
13335
13376
  ...options.fillerFor ? { fillerFor: options.fillerFor } : {},
13336
13377
  ...options.fillerForTimeoutMs !== undefined ? { fillerForTimeoutMs: options.fillerForTimeoutMs } : {},
13337
13378
  ...options.defaultSilentTurnAck !== undefined ? { defaultSilentTurnAck: options.defaultSilentTurnAck } : {},
package/package.json CHANGED
@@ -1,6 +1,6 @@
1
1
  {
2
2
  "name": "@absolutejs/voice",
3
- "version": "0.0.22-beta.562",
3
+ "version": "0.0.22-beta.564",
4
4
  "description": "Voice primitives and Elysia plugin for AbsoluteJS",
5
5
  "repository": {
6
6
  "type": "git",