@absolutejs/voice 0.0.22-beta.552 → 0.0.22-beta.554
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/dist/core/types.d.ts +29 -0
- package/dist/index.js +94 -0
- package/dist/telephony/twilio.d.ts +16 -0
- package/dist/testing/index.js +94 -0
- package/package.json +1 -1
package/dist/core/types.d.ts
CHANGED
|
@@ -894,6 +894,35 @@ export type CreateVoiceSessionOptions<TContext = unknown, TSession extends Voice
|
|
|
894
894
|
};
|
|
895
895
|
redact?: import("./redaction").VoiceTranscriptRedactor;
|
|
896
896
|
semanticTurnDetector?: import("./semanticTurn").VoiceSemanticTurnDetector;
|
|
897
|
+
/**
|
|
898
|
+
* Filler phrases, synthesized on demand, that the runtime plays in the gap between
|
|
899
|
+
* user-turn-commit and real assistant audio (typically 800-1500ms). The
|
|
900
|
+
* caller hears something within ~150-300ms of stopping speaking, so the
|
|
901
|
+
* LLM/TTS latency feels like the bot thinking instead of dead air. Boardy's
|
|
902
|
+
* killer UX feature.
|
|
903
|
+
*
|
|
904
|
+
* Behavior:
|
|
905
|
+
* - After a turn commits, a timer fires at `fillerDelayMs` (default
|
|
906
|
+
* 250ms). At that point, if the real assistant audio for this turn
|
|
907
|
+
* hasn't started flowing yet, a random phrase is rendered via the
|
|
908
|
+
* configured `tts` adapter and pushed to the socket.
|
|
909
|
+
* - When the real assistant audio's first chunk arrives, any in-flight
|
|
910
|
+
* filler is cancelled (`cancelActiveTTS` clears the carrier buffer).
|
|
911
|
+
* - A per-turn token (not a time-based cooldown) protects against double-fillers: starting real audio or a newer turn invalidates any pending filler.
|
|
912
|
+
*
|
|
913
|
+
* Set `fillerPhrases: []` (or omit) to disable. There are no built-in phrases; a reasonable set if
|
|
914
|
+
* you enable: `["Hmm.", "Got it.", "Right.", "Mm-hm.", "Let me think.", "Okay."]`.
|
|
915
|
+
*/
|
|
916
|
+
fillerPhrases?: ReadonlyArray<string>;
|
|
917
|
+
/** Milliseconds after turn-commit before the filler fires. Default 250ms — short enough to feel instant, long enough to skip if the LLM is very fast. */
|
|
918
|
+
fillerDelayMs?: number;
|
|
919
|
+
/**
|
|
920
|
+
* Default spoken ack if the model returns ONLY tool calls (no text) and the
|
|
921
|
+
* turn isn't ending. Without this, the caller hears total silence after
|
|
922
|
+
* their turn and assumes the line dropped. Default is "Sorry, one moment."
|
|
923
|
+
* Set to "" to opt out entirely.
|
|
924
|
+
*/
|
|
925
|
+
defaultSilentTurnAck?: string;
|
|
897
926
|
assistantMode?: import("./assistantMode").VoiceAssistantMode;
|
|
898
927
|
modalities?: ReadonlyArray<"audio" | "text">;
|
|
899
928
|
prosody?: VoiceTTSProsody;
|
package/dist/index.js
CHANGED
|
@@ -3870,6 +3870,11 @@ var createVoiceSession = (options) => {
|
|
|
3870
3870
|
let adapterGenerationCounter = 0;
|
|
3871
3871
|
let activeAdapterGeneration = 0;
|
|
3872
3872
|
let activeTTSTurnId;
|
|
3873
|
+
let fillerTimer = null;
|
|
3874
|
+
let fillerActive = false;
|
|
3875
|
+
let fillerToken = 0;
|
|
3876
|
+
const fillerPhrases = (options.fillerPhrases ?? []).filter((p) => typeof p === "string" && p.trim().length > 0);
|
|
3877
|
+
const fillerDelayMs = options.fillerDelayMs ?? 250;
|
|
3873
3878
|
const currentTurnAudio = [];
|
|
3874
3879
|
const pendingUserAttachments = [];
|
|
3875
3880
|
let fallbackAttemptsForCurrentTurn = 0;
|
|
@@ -5268,6 +5273,36 @@ var createVoiceSession = (options) => {
|
|
|
5268
5273
|
}
|
|
5269
5274
|
const injectedInstruction = liveOpsControl?.injectedInstruction?.trim();
|
|
5270
5275
|
const ttsStreamer = options.tts ? createTurnTTSStreamer(turn, session) : undefined;
|
|
5276
|
+
if (fillerPhrases.length > 0 && options.tts && !ttsStreamer) {}
|
|
5277
|
+
if (fillerPhrases.length > 0 && options.tts) {
|
|
5278
|
+
fillerToken += 1;
|
|
5279
|
+
const myToken = fillerToken;
|
|
5280
|
+
if (fillerTimer)
|
|
5281
|
+
clearTimeout(fillerTimer);
|
|
5282
|
+
fillerTimer = setTimeout(() => {
|
|
5283
|
+
fillerTimer = null;
|
|
5284
|
+
if (myToken !== fillerToken)
|
|
5285
|
+
return;
|
|
5286
|
+
if (activeTTSTurnId === turn.id)
|
|
5287
|
+
return;
|
|
5288
|
+
const phrase = fillerPhrases[Math.floor(Math.random() * fillerPhrases.length)] ?? "";
|
|
5289
|
+
if (!phrase)
|
|
5290
|
+
return;
|
|
5291
|
+
runSerial("filler.send", async () => {
|
|
5292
|
+
if (myToken !== fillerToken || activeTTSTurnId === turn.id)
|
|
5293
|
+
return;
|
|
5294
|
+
const adapterSession = await ensureTTSSession();
|
|
5295
|
+
if (!adapterSession)
|
|
5296
|
+
return;
|
|
5297
|
+
fillerActive = true;
|
|
5298
|
+
try {
|
|
5299
|
+
await adapterSession.send(phrase);
|
|
5300
|
+
} catch {
|
|
5301
|
+
fillerActive = false;
|
|
5302
|
+
}
|
|
5303
|
+
});
|
|
5304
|
+
}, fillerDelayMs);
|
|
5305
|
+
}
|
|
5271
5306
|
const committedOutput = await options.route.onTurn({
|
|
5272
5307
|
api,
|
|
5273
5308
|
context: options.context,
|
|
@@ -5348,6 +5383,15 @@ var createVoiceSession = (options) => {
|
|
|
5348
5383
|
try {
|
|
5349
5384
|
const activeTTSSession = await ensureTTSSession();
|
|
5350
5385
|
if (activeTTSSession) {
|
|
5386
|
+
fillerToken += 1;
|
|
5387
|
+
if (fillerTimer) {
|
|
5388
|
+
clearTimeout(fillerTimer);
|
|
5389
|
+
fillerTimer = null;
|
|
5390
|
+
}
|
|
5391
|
+
if (fillerActive) {
|
|
5392
|
+
await cancelActiveTTS("filler-superseded").catch(() => {});
|
|
5393
|
+
fillerActive = false;
|
|
5394
|
+
}
|
|
5351
5395
|
const ttsStartedAt = Date.now();
|
|
5352
5396
|
activeTTSTurnId = turn.id;
|
|
5353
5397
|
await appendTurnLatencyStage({
|
|
@@ -5420,6 +5464,53 @@ var createVoiceSession = (options) => {
|
|
|
5420
5464
|
});
|
|
5421
5465
|
}
|
|
5422
5466
|
}
|
|
5467
|
+
const audioWasSent = Boolean(streamResult?.streamed) || Boolean(output?.assistantText?.trim());
|
|
5468
|
+
const turnIsEnding = Boolean(output?.complete) || Boolean(output?.transfer) || Boolean(output?.escalate) || Boolean(output?.voicemail) || Boolean(output?.noAnswer);
|
|
5469
|
+
if (!audioWasSent && !turnIsEnding) {
|
|
5470
|
+
const fallback = typeof options.defaultSilentTurnAck === "string" ? options.defaultSilentTurnAck : "Sorry, one moment.";
|
|
5471
|
+
if (fallback.trim() && options.tts) {
|
|
5472
|
+
try {
|
|
5473
|
+
const activeTTSSession = await ensureTTSSession();
|
|
5474
|
+
if (activeTTSSession) {
|
|
5475
|
+
fillerToken += 1;
|
|
5476
|
+
if (fillerTimer) {
|
|
5477
|
+
clearTimeout(fillerTimer);
|
|
5478
|
+
fillerTimer = null;
|
|
5479
|
+
}
|
|
5480
|
+
if (fillerActive) {
|
|
5481
|
+
await cancelActiveTTS("filler-superseded").catch(() => {});
|
|
5482
|
+
fillerActive = false;
|
|
5483
|
+
}
|
|
5484
|
+
activeTTSTurnId = turn.id;
|
|
5485
|
+
await activeTTSSession.send(fallback);
|
|
5486
|
+
await appendTrace({
|
|
5487
|
+
payload: {
|
|
5488
|
+
assistantMode: resolveVoiceAssistantMode(options),
|
|
5489
|
+
fallback: true,
|
|
5490
|
+
realtimeConfigured: Boolean(options.realtime),
|
|
5491
|
+
reason: "model-returned-no-text",
|
|
5492
|
+
text: fallback,
|
|
5493
|
+
ttsConfigured: Boolean(options.tts)
|
|
5494
|
+
},
|
|
5495
|
+
session,
|
|
5496
|
+
turnId: turn.id,
|
|
5497
|
+
type: "turn.assistant"
|
|
5498
|
+
});
|
|
5499
|
+
if (options.costAccountant) {
|
|
5500
|
+
options.costAccountant.recordTTS({
|
|
5501
|
+
characters: fallback.length
|
|
5502
|
+
});
|
|
5503
|
+
}
|
|
5504
|
+
}
|
|
5505
|
+
} catch (error) {
|
|
5506
|
+
logger.warn("voice default-silent-turn-ack fallback send failed", {
|
|
5507
|
+
error: toError(error).message,
|
|
5508
|
+
sessionId: options.id,
|
|
5509
|
+
turnId: turn.id
|
|
5510
|
+
});
|
|
5511
|
+
}
|
|
5512
|
+
}
|
|
5513
|
+
}
|
|
5423
5514
|
if (output?.result !== undefined) {
|
|
5424
5515
|
await writeSession((currentSession) => {
|
|
5425
5516
|
setTurnResult(currentSession, turn.id, {
|
|
@@ -24562,6 +24653,9 @@ var createTwilioMediaStreamBridge = (socket, options) => {
|
|
|
24562
24653
|
sttFallback: resolveSTTFallbackConfig(options.sttFallback),
|
|
24563
24654
|
sttLifecycle: options.sttLifecycle ?? runtimePreset.sttLifecycle,
|
|
24564
24655
|
...options.semanticTurnDetector ? { semanticTurnDetector: options.semanticTurnDetector } : {},
|
|
24656
|
+
...options.fillerPhrases ? { fillerPhrases: options.fillerPhrases } : {},
|
|
24657
|
+
...options.fillerDelayMs !== undefined ? { fillerDelayMs: options.fillerDelayMs } : {},
|
|
24658
|
+
...options.defaultSilentTurnAck !== undefined ? { defaultSilentTurnAck: options.defaultSilentTurnAck } : {},
|
|
24565
24659
|
trace: options.trace,
|
|
24566
24660
|
tts: options.tts,
|
|
24567
24661
|
turnDetection
|
|
@@ -123,6 +123,22 @@ export type TwilioMediaStreamBridgeOptions<TContext = unknown, TSession extends
|
|
|
123
123
|
* snappy responses on clear-cut answers. See VoiceSemanticTurnDetector.
|
|
124
124
|
*/
|
|
125
125
|
semanticTurnDetector?: import("../core/semanticTurn").VoiceSemanticTurnDetector;
|
|
126
|
+
/**
|
|
127
|
+
* Filler phrases ("Hmm.", "Got it.", "Let me think.") synthesized on demand and played
|
|
128
|
+
* in the gap between user-turn-commit and real assistant audio. Boardy's
|
|
129
|
+
* "the pause is character, not lag" pattern. See CreateVoiceSessionOptions
|
|
130
|
+
* for full semantics.
|
|
131
|
+
*/
|
|
132
|
+
fillerPhrases?: ReadonlyArray<string>;
|
|
133
|
+
/** Milliseconds after turn-commit before the filler fires. Default 250ms. */
|
|
134
|
+
fillerDelayMs?: number;
|
|
135
|
+
/**
|
|
136
|
+
* Default spoken ack if the model returns ONLY tool calls (no text) and
|
|
137
|
+
* the turn isn't ending. Without this, the caller hears silence and
|
|
138
|
+
* assumes the line dropped. Default "Sorry, one moment." — set to ""
|
|
139
|
+
* to opt out. See CreateVoiceSessionOptions for full semantics.
|
|
140
|
+
*/
|
|
141
|
+
defaultSilentTurnAck?: string;
|
|
126
142
|
};
|
|
127
143
|
export type TwilioMediaStreamBridge = {
|
|
128
144
|
close: (reason?: string) => Promise<void>;
|
package/dist/testing/index.js
CHANGED
|
@@ -5687,6 +5687,11 @@ var createVoiceSession = (options) => {
|
|
|
5687
5687
|
let adapterGenerationCounter = 0;
|
|
5688
5688
|
let activeAdapterGeneration = 0;
|
|
5689
5689
|
let activeTTSTurnId;
|
|
5690
|
+
let fillerTimer = null;
|
|
5691
|
+
let fillerActive = false;
|
|
5692
|
+
let fillerToken = 0;
|
|
5693
|
+
const fillerPhrases = (options.fillerPhrases ?? []).filter((p) => typeof p === "string" && p.trim().length > 0);
|
|
5694
|
+
const fillerDelayMs = options.fillerDelayMs ?? 250;
|
|
5690
5695
|
const currentTurnAudio = [];
|
|
5691
5696
|
const pendingUserAttachments = [];
|
|
5692
5697
|
let fallbackAttemptsForCurrentTurn = 0;
|
|
@@ -7085,6 +7090,36 @@ var createVoiceSession = (options) => {
|
|
|
7085
7090
|
}
|
|
7086
7091
|
const injectedInstruction = liveOpsControl?.injectedInstruction?.trim();
|
|
7087
7092
|
const ttsStreamer = options.tts ? createTurnTTSStreamer(turn, session) : undefined;
|
|
7093
|
+
if (fillerPhrases.length > 0 && options.tts && !ttsStreamer) {}
|
|
7094
|
+
if (fillerPhrases.length > 0 && options.tts) {
|
|
7095
|
+
fillerToken += 1;
|
|
7096
|
+
const myToken = fillerToken;
|
|
7097
|
+
if (fillerTimer)
|
|
7098
|
+
clearTimeout(fillerTimer);
|
|
7099
|
+
fillerTimer = setTimeout(() => {
|
|
7100
|
+
fillerTimer = null;
|
|
7101
|
+
if (myToken !== fillerToken)
|
|
7102
|
+
return;
|
|
7103
|
+
if (activeTTSTurnId === turn.id)
|
|
7104
|
+
return;
|
|
7105
|
+
const phrase = fillerPhrases[Math.floor(Math.random() * fillerPhrases.length)] ?? "";
|
|
7106
|
+
if (!phrase)
|
|
7107
|
+
return;
|
|
7108
|
+
runSerial("filler.send", async () => {
|
|
7109
|
+
if (myToken !== fillerToken || activeTTSTurnId === turn.id)
|
|
7110
|
+
return;
|
|
7111
|
+
const adapterSession = await ensureTTSSession();
|
|
7112
|
+
if (!adapterSession)
|
|
7113
|
+
return;
|
|
7114
|
+
fillerActive = true;
|
|
7115
|
+
try {
|
|
7116
|
+
await adapterSession.send(phrase);
|
|
7117
|
+
} catch {
|
|
7118
|
+
fillerActive = false;
|
|
7119
|
+
}
|
|
7120
|
+
});
|
|
7121
|
+
}, fillerDelayMs);
|
|
7122
|
+
}
|
|
7088
7123
|
const committedOutput = await options.route.onTurn({
|
|
7089
7124
|
api,
|
|
7090
7125
|
context: options.context,
|
|
@@ -7165,6 +7200,15 @@ var createVoiceSession = (options) => {
|
|
|
7165
7200
|
try {
|
|
7166
7201
|
const activeTTSSession = await ensureTTSSession();
|
|
7167
7202
|
if (activeTTSSession) {
|
|
7203
|
+
fillerToken += 1;
|
|
7204
|
+
if (fillerTimer) {
|
|
7205
|
+
clearTimeout(fillerTimer);
|
|
7206
|
+
fillerTimer = null;
|
|
7207
|
+
}
|
|
7208
|
+
if (fillerActive) {
|
|
7209
|
+
await cancelActiveTTS("filler-superseded").catch(() => {});
|
|
7210
|
+
fillerActive = false;
|
|
7211
|
+
}
|
|
7168
7212
|
const ttsStartedAt = Date.now();
|
|
7169
7213
|
activeTTSTurnId = turn.id;
|
|
7170
7214
|
await appendTurnLatencyStage({
|
|
@@ -7237,6 +7281,53 @@ var createVoiceSession = (options) => {
|
|
|
7237
7281
|
});
|
|
7238
7282
|
}
|
|
7239
7283
|
}
|
|
7284
|
+
const audioWasSent = Boolean(streamResult?.streamed) || Boolean(output?.assistantText?.trim());
|
|
7285
|
+
const turnIsEnding = Boolean(output?.complete) || Boolean(output?.transfer) || Boolean(output?.escalate) || Boolean(output?.voicemail) || Boolean(output?.noAnswer);
|
|
7286
|
+
if (!audioWasSent && !turnIsEnding) {
|
|
7287
|
+
const fallback = typeof options.defaultSilentTurnAck === "string" ? options.defaultSilentTurnAck : "Sorry, one moment.";
|
|
7288
|
+
if (fallback.trim() && options.tts) {
|
|
7289
|
+
try {
|
|
7290
|
+
const activeTTSSession = await ensureTTSSession();
|
|
7291
|
+
if (activeTTSSession) {
|
|
7292
|
+
fillerToken += 1;
|
|
7293
|
+
if (fillerTimer) {
|
|
7294
|
+
clearTimeout(fillerTimer);
|
|
7295
|
+
fillerTimer = null;
|
|
7296
|
+
}
|
|
7297
|
+
if (fillerActive) {
|
|
7298
|
+
await cancelActiveTTS("filler-superseded").catch(() => {});
|
|
7299
|
+
fillerActive = false;
|
|
7300
|
+
}
|
|
7301
|
+
activeTTSTurnId = turn.id;
|
|
7302
|
+
await activeTTSSession.send(fallback);
|
|
7303
|
+
await appendTrace({
|
|
7304
|
+
payload: {
|
|
7305
|
+
assistantMode: resolveVoiceAssistantMode(options),
|
|
7306
|
+
fallback: true,
|
|
7307
|
+
realtimeConfigured: Boolean(options.realtime),
|
|
7308
|
+
reason: "model-returned-no-text",
|
|
7309
|
+
text: fallback,
|
|
7310
|
+
ttsConfigured: Boolean(options.tts)
|
|
7311
|
+
},
|
|
7312
|
+
session,
|
|
7313
|
+
turnId: turn.id,
|
|
7314
|
+
type: "turn.assistant"
|
|
7315
|
+
});
|
|
7316
|
+
if (options.costAccountant) {
|
|
7317
|
+
options.costAccountant.recordTTS({
|
|
7318
|
+
characters: fallback.length
|
|
7319
|
+
});
|
|
7320
|
+
}
|
|
7321
|
+
}
|
|
7322
|
+
} catch (error) {
|
|
7323
|
+
logger.warn("voice default-silent-turn-ack fallback send failed", {
|
|
7324
|
+
error: toError(error).message,
|
|
7325
|
+
sessionId: options.id,
|
|
7326
|
+
turnId: turn.id
|
|
7327
|
+
});
|
|
7328
|
+
}
|
|
7329
|
+
}
|
|
7330
|
+
}
|
|
7240
7331
|
if (output?.result !== undefined) {
|
|
7241
7332
|
await writeSession((currentSession) => {
|
|
7242
7333
|
setTurnResult(currentSession, turn.id, {
|
|
@@ -13098,6 +13189,9 @@ var createTwilioMediaStreamBridge = (socket, options) => {
|
|
|
13098
13189
|
sttFallback: resolveSTTFallbackConfig(options.sttFallback),
|
|
13099
13190
|
sttLifecycle: options.sttLifecycle ?? runtimePreset.sttLifecycle,
|
|
13100
13191
|
...options.semanticTurnDetector ? { semanticTurnDetector: options.semanticTurnDetector } : {},
|
|
13192
|
+
...options.fillerPhrases ? { fillerPhrases: options.fillerPhrases } : {},
|
|
13193
|
+
...options.fillerDelayMs !== undefined ? { fillerDelayMs: options.fillerDelayMs } : {},
|
|
13194
|
+
...options.defaultSilentTurnAck !== undefined ? { defaultSilentTurnAck: options.defaultSilentTurnAck } : {},
|
|
13101
13195
|
trace: options.trace,
|
|
13102
13196
|
tts: options.tts,
|
|
13103
13197
|
turnDetection
|