@absolutejs/voice 0.0.22-beta.552 → 0.0.22-beta.554

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -894,6 +894,35 @@ export type CreateVoiceSessionOptions<TContext = unknown, TSession extends Voice
894
894
  };
895
895
  redact?: import("./redaction").VoiceTranscriptRedactor;
896
896
  semanticTurnDetector?: import("./semanticTurn").VoiceSemanticTurnDetector;
897
+ /**
898
+ * Short filler phrases (synthesized on demand) the runtime plays in the gap between
899
+ * user-turn-commit and real assistant audio (typically 800-1500ms). The
900
+ * caller hears something within ~150-300ms of stopping speaking, so the
901
+ * LLM/TTS latency feels like the bot thinking instead of dead air. Boardy's
902
+ * killer UX feature.
903
+ *
904
+ * Behavior:
905
+ * - After a turn commits, a timer fires at `fillerDelayMs` (default
906
+ * 250ms). At that point, if the real assistant audio for this turn
907
+ * hasn't started flowing yet, a random phrase is rendered via the
908
+ * configured `tts` adapter and pushed to the socket.
909
+ * - When the real assistant audio's first chunk arrives, any in-flight
910
+ * filler is cancelled (`cancelActiveTTS` clears the carrier buffer).
911
+ * - A per-turn token invalidates stale timers, protecting against double-fillers per turn.
912
+ *
913
+ * Set `fillerPhrases: []` (or omit) to disable. Reasonable defaults if
914
+ * you enable: `["Hmm.", "Got it.", "Right.", "Mm-hm.", "Let me think.", "Okay."]`.
915
+ */
916
+ fillerPhrases?: ReadonlyArray<string>;
917
+ /** Milliseconds after turn-commit before the filler fires. Default 250ms — short enough to feel instant, long enough to skip if the LLM is very fast. */
918
+ fillerDelayMs?: number;
919
+ /**
920
+ * Default spoken ack if the model returns ONLY tool calls (no text) and the
921
+ * turn isn't ending. Without this, the caller hears total silence after
922
+ * their turn and assumes the line dropped. Default is "Sorry, one moment."
923
+ * Set to "" to opt out entirely.
924
+ */
925
+ defaultSilentTurnAck?: string;
897
926
  assistantMode?: import("./assistantMode").VoiceAssistantMode;
898
927
  modalities?: ReadonlyArray<"audio" | "text">;
899
928
  prosody?: VoiceTTSProsody;
package/dist/index.js CHANGED
@@ -3870,6 +3870,11 @@ var createVoiceSession = (options) => {
3870
3870
  let adapterGenerationCounter = 0;
3871
3871
  let activeAdapterGeneration = 0;
3872
3872
  let activeTTSTurnId;
3873
+ let fillerTimer = null;
3874
+ let fillerActive = false;
3875
+ let fillerToken = 0;
3876
+ const fillerPhrases = (options.fillerPhrases ?? []).filter((p) => typeof p === "string" && p.trim().length > 0);
3877
+ const fillerDelayMs = options.fillerDelayMs ?? 250;
3873
3878
  const currentTurnAudio = [];
3874
3879
  const pendingUserAttachments = [];
3875
3880
  let fallbackAttemptsForCurrentTurn = 0;
@@ -5268,6 +5273,36 @@ var createVoiceSession = (options) => {
5268
5273
  }
5269
5274
  const injectedInstruction = liveOpsControl?.injectedInstruction?.trim();
5270
5275
  const ttsStreamer = options.tts ? createTurnTTSStreamer(turn, session) : undefined;
5276
+ if (fillerPhrases.length > 0 && options.tts && !ttsStreamer) {}
5277
+ if (fillerPhrases.length > 0 && options.tts) {
5278
+ fillerToken += 1;
5279
+ const myToken = fillerToken;
5280
+ if (fillerTimer)
5281
+ clearTimeout(fillerTimer);
5282
+ fillerTimer = setTimeout(() => {
5283
+ fillerTimer = null;
5284
+ if (myToken !== fillerToken)
5285
+ return;
5286
+ if (activeTTSTurnId === turn.id)
5287
+ return;
5288
+ const phrase = fillerPhrases[Math.floor(Math.random() * fillerPhrases.length)] ?? "";
5289
+ if (!phrase)
5290
+ return;
5291
+ runSerial("filler.send", async () => {
5292
+ if (myToken !== fillerToken || activeTTSTurnId === turn.id)
5293
+ return;
5294
+ const adapterSession = await ensureTTSSession();
5295
+ if (!adapterSession)
5296
+ return;
5297
+ fillerActive = true;
5298
+ try {
5299
+ await adapterSession.send(phrase);
5300
+ } catch {
5301
+ fillerActive = false;
5302
+ }
5303
+ });
5304
+ }, fillerDelayMs);
5305
+ }
5271
5306
  const committedOutput = await options.route.onTurn({
5272
5307
  api,
5273
5308
  context: options.context,
@@ -5348,6 +5383,15 @@ var createVoiceSession = (options) => {
5348
5383
  try {
5349
5384
  const activeTTSSession = await ensureTTSSession();
5350
5385
  if (activeTTSSession) {
5386
+ fillerToken += 1;
5387
+ if (fillerTimer) {
5388
+ clearTimeout(fillerTimer);
5389
+ fillerTimer = null;
5390
+ }
5391
+ if (fillerActive) {
5392
+ await cancelActiveTTS("filler-superseded").catch(() => {});
5393
+ fillerActive = false;
5394
+ }
5351
5395
  const ttsStartedAt = Date.now();
5352
5396
  activeTTSTurnId = turn.id;
5353
5397
  await appendTurnLatencyStage({
@@ -5420,6 +5464,53 @@ var createVoiceSession = (options) => {
5420
5464
  });
5421
5465
  }
5422
5466
  }
5467
+ const audioWasSent = Boolean(streamResult?.streamed) || Boolean(output?.assistantText?.trim());
5468
+ const turnIsEnding = Boolean(output?.complete) || Boolean(output?.transfer) || Boolean(output?.escalate) || Boolean(output?.voicemail) || Boolean(output?.noAnswer);
5469
+ if (!audioWasSent && !turnIsEnding) {
5470
+ const fallback = typeof options.defaultSilentTurnAck === "string" ? options.defaultSilentTurnAck : "Sorry, one moment.";
5471
+ if (fallback.trim() && options.tts) {
5472
+ try {
5473
+ const activeTTSSession = await ensureTTSSession();
5474
+ if (activeTTSSession) {
5475
+ fillerToken += 1;
5476
+ if (fillerTimer) {
5477
+ clearTimeout(fillerTimer);
5478
+ fillerTimer = null;
5479
+ }
5480
+ if (fillerActive) {
5481
+ await cancelActiveTTS("filler-superseded").catch(() => {});
5482
+ fillerActive = false;
5483
+ }
5484
+ activeTTSTurnId = turn.id;
5485
+ await activeTTSSession.send(fallback);
5486
+ await appendTrace({
5487
+ payload: {
5488
+ assistantMode: resolveVoiceAssistantMode(options),
5489
+ fallback: true,
5490
+ realtimeConfigured: Boolean(options.realtime),
5491
+ reason: "model-returned-no-text",
5492
+ text: fallback,
5493
+ ttsConfigured: Boolean(options.tts)
5494
+ },
5495
+ session,
5496
+ turnId: turn.id,
5497
+ type: "turn.assistant"
5498
+ });
5499
+ if (options.costAccountant) {
5500
+ options.costAccountant.recordTTS({
5501
+ characters: fallback.length
5502
+ });
5503
+ }
5504
+ }
5505
+ } catch (error) {
5506
+ logger.warn("voice default-silent-turn-ack fallback send failed", {
5507
+ error: toError(error).message,
5508
+ sessionId: options.id,
5509
+ turnId: turn.id
5510
+ });
5511
+ }
5512
+ }
5513
+ }
5423
5514
  if (output?.result !== undefined) {
5424
5515
  await writeSession((currentSession) => {
5425
5516
  setTurnResult(currentSession, turn.id, {
@@ -24562,6 +24653,9 @@ var createTwilioMediaStreamBridge = (socket, options) => {
24562
24653
  sttFallback: resolveSTTFallbackConfig(options.sttFallback),
24563
24654
  sttLifecycle: options.sttLifecycle ?? runtimePreset.sttLifecycle,
24564
24655
  ...options.semanticTurnDetector ? { semanticTurnDetector: options.semanticTurnDetector } : {},
24656
+ ...options.fillerPhrases ? { fillerPhrases: options.fillerPhrases } : {},
24657
+ ...options.fillerDelayMs !== undefined ? { fillerDelayMs: options.fillerDelayMs } : {},
24658
+ ...options.defaultSilentTurnAck !== undefined ? { defaultSilentTurnAck: options.defaultSilentTurnAck } : {},
24565
24659
  trace: options.trace,
24566
24660
  tts: options.tts,
24567
24661
  turnDetection
@@ -123,6 +123,22 @@ export type TwilioMediaStreamBridgeOptions<TContext = unknown, TSession extends
123
123
  * snappy responses on clear-cut answers. See VoiceSemanticTurnDetector.
124
124
  */
125
125
  semanticTurnDetector?: import("../core/semanticTurn").VoiceSemanticTurnDetector;
126
+ /**
127
+ * Short filler phrases ("Hmm.", "Got it.", "Let me think.") played
128
+ * in the gap between user-turn-commit and real assistant audio. Boardy's
129
+ * "the pause is character, not lag" pattern. See CreateVoiceSessionOptions
130
+ * for full semantics.
131
+ */
132
+ fillerPhrases?: ReadonlyArray<string>;
133
+ /** Milliseconds after turn-commit before the filler fires. Default 250ms. */
134
+ fillerDelayMs?: number;
135
+ /**
136
+ * Default spoken ack if the model returns ONLY tool calls (no text) and
137
+ * the turn isn't ending. Without this, the caller hears silence and
138
+ * assumes the line dropped. Default "Sorry, one moment." — set to ""
139
+ * to opt out. See CreateVoiceSessionOptions for full semantics.
140
+ */
141
+ defaultSilentTurnAck?: string;
126
142
  };
127
143
  export type TwilioMediaStreamBridge = {
128
144
  close: (reason?: string) => Promise<void>;
@@ -5687,6 +5687,11 @@ var createVoiceSession = (options) => {
5687
5687
  let adapterGenerationCounter = 0;
5688
5688
  let activeAdapterGeneration = 0;
5689
5689
  let activeTTSTurnId;
5690
+ let fillerTimer = null;
5691
+ let fillerActive = false;
5692
+ let fillerToken = 0;
5693
+ const fillerPhrases = (options.fillerPhrases ?? []).filter((p) => typeof p === "string" && p.trim().length > 0);
5694
+ const fillerDelayMs = options.fillerDelayMs ?? 250;
5690
5695
  const currentTurnAudio = [];
5691
5696
  const pendingUserAttachments = [];
5692
5697
  let fallbackAttemptsForCurrentTurn = 0;
@@ -7085,6 +7090,36 @@ var createVoiceSession = (options) => {
7085
7090
  }
7086
7091
  const injectedInstruction = liveOpsControl?.injectedInstruction?.trim();
7087
7092
  const ttsStreamer = options.tts ? createTurnTTSStreamer(turn, session) : undefined;
7093
+ if (fillerPhrases.length > 0 && options.tts && !ttsStreamer) {}
7094
+ if (fillerPhrases.length > 0 && options.tts) {
7095
+ fillerToken += 1;
7096
+ const myToken = fillerToken;
7097
+ if (fillerTimer)
7098
+ clearTimeout(fillerTimer);
7099
+ fillerTimer = setTimeout(() => {
7100
+ fillerTimer = null;
7101
+ if (myToken !== fillerToken)
7102
+ return;
7103
+ if (activeTTSTurnId === turn.id)
7104
+ return;
7105
+ const phrase = fillerPhrases[Math.floor(Math.random() * fillerPhrases.length)] ?? "";
7106
+ if (!phrase)
7107
+ return;
7108
+ runSerial("filler.send", async () => {
7109
+ if (myToken !== fillerToken || activeTTSTurnId === turn.id)
7110
+ return;
7111
+ const adapterSession = await ensureTTSSession();
7112
+ if (!adapterSession)
7113
+ return;
7114
+ fillerActive = true;
7115
+ try {
7116
+ await adapterSession.send(phrase);
7117
+ } catch {
7118
+ fillerActive = false;
7119
+ }
7120
+ });
7121
+ }, fillerDelayMs);
7122
+ }
7088
7123
  const committedOutput = await options.route.onTurn({
7089
7124
  api,
7090
7125
  context: options.context,
@@ -7165,6 +7200,15 @@ var createVoiceSession = (options) => {
7165
7200
  try {
7166
7201
  const activeTTSSession = await ensureTTSSession();
7167
7202
  if (activeTTSSession) {
7203
+ fillerToken += 1;
7204
+ if (fillerTimer) {
7205
+ clearTimeout(fillerTimer);
7206
+ fillerTimer = null;
7207
+ }
7208
+ if (fillerActive) {
7209
+ await cancelActiveTTS("filler-superseded").catch(() => {});
7210
+ fillerActive = false;
7211
+ }
7168
7212
  const ttsStartedAt = Date.now();
7169
7213
  activeTTSTurnId = turn.id;
7170
7214
  await appendTurnLatencyStage({
@@ -7237,6 +7281,53 @@ var createVoiceSession = (options) => {
7237
7281
  });
7238
7282
  }
7239
7283
  }
7284
+ const audioWasSent = Boolean(streamResult?.streamed) || Boolean(output?.assistantText?.trim());
7285
+ const turnIsEnding = Boolean(output?.complete) || Boolean(output?.transfer) || Boolean(output?.escalate) || Boolean(output?.voicemail) || Boolean(output?.noAnswer);
7286
+ if (!audioWasSent && !turnIsEnding) {
7287
+ const fallback = typeof options.defaultSilentTurnAck === "string" ? options.defaultSilentTurnAck : "Sorry, one moment.";
7288
+ if (fallback.trim() && options.tts) {
7289
+ try {
7290
+ const activeTTSSession = await ensureTTSSession();
7291
+ if (activeTTSSession) {
7292
+ fillerToken += 1;
7293
+ if (fillerTimer) {
7294
+ clearTimeout(fillerTimer);
7295
+ fillerTimer = null;
7296
+ }
7297
+ if (fillerActive) {
7298
+ await cancelActiveTTS("filler-superseded").catch(() => {});
7299
+ fillerActive = false;
7300
+ }
7301
+ activeTTSTurnId = turn.id;
7302
+ await activeTTSSession.send(fallback);
7303
+ await appendTrace({
7304
+ payload: {
7305
+ assistantMode: resolveVoiceAssistantMode(options),
7306
+ fallback: true,
7307
+ realtimeConfigured: Boolean(options.realtime),
7308
+ reason: "model-returned-no-text",
7309
+ text: fallback,
7310
+ ttsConfigured: Boolean(options.tts)
7311
+ },
7312
+ session,
7313
+ turnId: turn.id,
7314
+ type: "turn.assistant"
7315
+ });
7316
+ if (options.costAccountant) {
7317
+ options.costAccountant.recordTTS({
7318
+ characters: fallback.length
7319
+ });
7320
+ }
7321
+ }
7322
+ } catch (error) {
7323
+ logger.warn("voice default-silent-turn-ack fallback send failed", {
7324
+ error: toError(error).message,
7325
+ sessionId: options.id,
7326
+ turnId: turn.id
7327
+ });
7328
+ }
7329
+ }
7330
+ }
7240
7331
  if (output?.result !== undefined) {
7241
7332
  await writeSession((currentSession) => {
7242
7333
  setTurnResult(currentSession, turn.id, {
@@ -13098,6 +13189,9 @@ var createTwilioMediaStreamBridge = (socket, options) => {
13098
13189
  sttFallback: resolveSTTFallbackConfig(options.sttFallback),
13099
13190
  sttLifecycle: options.sttLifecycle ?? runtimePreset.sttLifecycle,
13100
13191
  ...options.semanticTurnDetector ? { semanticTurnDetector: options.semanticTurnDetector } : {},
13192
+ ...options.fillerPhrases ? { fillerPhrases: options.fillerPhrases } : {},
13193
+ ...options.fillerDelayMs !== undefined ? { fillerDelayMs: options.fillerDelayMs } : {},
13194
+ ...options.defaultSilentTurnAck !== undefined ? { defaultSilentTurnAck: options.defaultSilentTurnAck } : {},
13101
13195
  trace: options.trace,
13102
13196
  tts: options.tts,
13103
13197
  turnDetection
package/package.json CHANGED
@@ -1,6 +1,6 @@
1
1
  {
2
2
  "name": "@absolutejs/voice",
3
- "version": "0.0.22-beta.552",
3
+ "version": "0.0.22-beta.554",
4
4
  "description": "Voice primitives and Elysia plugin for AbsoluteJS",
5
5
  "repository": {
6
6
  "type": "git",