@absolutejs/voice 0.0.22-beta.580 → 0.0.22-beta.582
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/dist/client/audioPlayer.d.ts +7 -0
- package/dist/client/htmxBootstrap.js +25 -0
- package/dist/client/index.js +25 -0
- package/dist/core/types.d.ts +3 -0
- package/dist/index.js +41 -11
- package/dist/testing/index.js +66 -11
- package/package.json +1 -1
package/dist/client/audioPlayer.d.ts
CHANGED
|
@@ -22,9 +22,16 @@ type MinimalGainNode = {
|
|
|
22
22
|
value: number;
|
|
23
23
|
};
|
|
24
24
|
};
|
|
25
|
+
type MinimalAnalyserNode = {
|
|
26
|
+
connect?: (destination: unknown) => void;
|
|
27
|
+
disconnect?: () => void;
|
|
28
|
+
fftSize: number;
|
|
29
|
+
getByteTimeDomainData: (array: Uint8Array) => void;
|
|
30
|
+
};
|
|
25
31
|
type MinimalAudioContext = {
|
|
26
32
|
baseLatency?: number;
|
|
27
33
|
close: () => Promise<void>;
|
|
34
|
+
createAnalyser?: () => MinimalAnalyserNode;
|
|
28
35
|
createBuffer: (numberOfChannels: number, length: number, sampleRate: number) => MinimalAudioBuffer;
|
|
29
36
|
createBufferSource: () => MinimalAudioBufferSourceNode;
|
|
30
37
|
createGain?: () => MinimalGainNode;
|
package/dist/client/htmxBootstrap.js
CHANGED
|
@@ -1693,6 +1693,8 @@ var DEFAULT_PLAYBACK_RATE = 1;
|
|
|
1693
1693
|
var MIN_PLAYBACK_RATE = 0.5;
|
|
1694
1694
|
var MAX_PLAYBACK_RATE = 2;
|
|
1695
1695
|
var STRETCH_BYPASS_EPSILON = 0.01;
|
|
1696
|
+
var ANALYSER_FFT_SIZE = 256;
|
|
1697
|
+
var PCM_BYTE_MIDPOINT = 128;
|
|
1696
1698
|
var createInitialState3 = () => ({
|
|
1697
1699
|
activeSourceCount: 0,
|
|
1698
1700
|
error: null,
|
|
@@ -1753,6 +1755,8 @@ var createVoiceAudioPlayer = (source, options = {}) => {
|
|
|
1753
1755
|
let state = createInitialState3();
|
|
1754
1756
|
let audioContext = null;
|
|
1755
1757
|
let outputNode = null;
|
|
1758
|
+
let analyserNode = null;
|
|
1759
|
+
let analyserBuffer = null;
|
|
1756
1760
|
let volume = clampVolume(options.volume);
|
|
1757
1761
|
let playbackRate = clampPlaybackRate(options.playbackRate);
|
|
1758
1762
|
let stretcher = null;
|
|
@@ -1849,6 +1853,12 @@ var createVoiceAudioPlayer = (source, options = {}) => {
|
|
|
1849
1853
|
if (audioContext.createGain) {
|
|
1850
1854
|
outputNode = audioContext.createGain();
|
|
1851
1855
|
outputNode.connect?.(audioContext.destination);
|
|
1856
|
+
if (audioContext.createAnalyser) {
|
|
1857
|
+
analyserNode = audioContext.createAnalyser();
|
|
1858
|
+
analyserNode.fftSize = ANALYSER_FFT_SIZE;
|
|
1859
|
+
analyserBuffer = new Uint8Array(analyserNode.fftSize);
|
|
1860
|
+
outputNode.connect?.(analyserNode);
|
|
1861
|
+
}
|
|
1852
1862
|
}
|
|
1853
1863
|
queueEndTime = audioContext.currentTime;
|
|
1854
1864
|
return audioContext;
|
|
@@ -1973,6 +1983,9 @@ var createVoiceAudioPlayer = (source, options = {}) => {
|
|
|
1973
1983
|
audioContext = null;
|
|
1974
1984
|
outputNode?.disconnect?.();
|
|
1975
1985
|
outputNode = null;
|
|
1986
|
+
analyserNode?.disconnect?.();
|
|
1987
|
+
analyserNode = null;
|
|
1988
|
+
analyserBuffer = null;
|
|
1976
1989
|
queueEndTime = 0;
|
|
1977
1990
|
setState({
|
|
1978
1991
|
activeSourceCount: 0,
|
|
@@ -1983,6 +1996,18 @@ var createVoiceAudioPlayer = (source, options = {}) => {
|
|
|
1983
1996
|
get error() {
|
|
1984
1997
|
return state.error;
|
|
1985
1998
|
},
|
|
1999
|
+
getOutputLevel: () => {
|
|
2000
|
+
if (!analyserNode || !analyserBuffer) {
|
|
2001
|
+
return 0;
|
|
2002
|
+
}
|
|
2003
|
+
analyserNode.getByteTimeDomainData(analyserBuffer);
|
|
2004
|
+
let sumSquares = 0;
|
|
2005
|
+
for (const sample of analyserBuffer) {
|
|
2006
|
+
const centered = (sample - PCM_BYTE_MIDPOINT) / PCM_BYTE_MIDPOINT;
|
|
2007
|
+
sumSquares += centered * centered;
|
|
2008
|
+
}
|
|
2009
|
+
return Math.sqrt(sumSquares / analyserBuffer.length);
|
|
2010
|
+
},
|
|
1986
2011
|
getSnapshot: () => state,
|
|
1987
2012
|
interrupt: async () => {
|
|
1988
2013
|
const startedAt = Date.now();
|
package/dist/client/index.js
CHANGED
|
@@ -529,6 +529,8 @@ var DEFAULT_PLAYBACK_RATE = 1;
|
|
|
529
529
|
var MIN_PLAYBACK_RATE = 0.5;
|
|
530
530
|
var MAX_PLAYBACK_RATE = 2;
|
|
531
531
|
var STRETCH_BYPASS_EPSILON = 0.01;
|
|
532
|
+
var ANALYSER_FFT_SIZE = 256;
|
|
533
|
+
var PCM_BYTE_MIDPOINT = 128;
|
|
532
534
|
var createInitialState = () => ({
|
|
533
535
|
activeSourceCount: 0,
|
|
534
536
|
error: null,
|
|
@@ -589,6 +591,8 @@ var createVoiceAudioPlayer = (source, options = {}) => {
|
|
|
589
591
|
let state = createInitialState();
|
|
590
592
|
let audioContext = null;
|
|
591
593
|
let outputNode = null;
|
|
594
|
+
let analyserNode = null;
|
|
595
|
+
let analyserBuffer = null;
|
|
592
596
|
let volume = clampVolume(options.volume);
|
|
593
597
|
let playbackRate = clampPlaybackRate(options.playbackRate);
|
|
594
598
|
let stretcher = null;
|
|
@@ -685,6 +689,12 @@ var createVoiceAudioPlayer = (source, options = {}) => {
|
|
|
685
689
|
if (audioContext.createGain) {
|
|
686
690
|
outputNode = audioContext.createGain();
|
|
687
691
|
outputNode.connect?.(audioContext.destination);
|
|
692
|
+
if (audioContext.createAnalyser) {
|
|
693
|
+
analyserNode = audioContext.createAnalyser();
|
|
694
|
+
analyserNode.fftSize = ANALYSER_FFT_SIZE;
|
|
695
|
+
analyserBuffer = new Uint8Array(analyserNode.fftSize);
|
|
696
|
+
outputNode.connect?.(analyserNode);
|
|
697
|
+
}
|
|
688
698
|
}
|
|
689
699
|
queueEndTime = audioContext.currentTime;
|
|
690
700
|
return audioContext;
|
|
@@ -809,6 +819,9 @@ var createVoiceAudioPlayer = (source, options = {}) => {
|
|
|
809
819
|
audioContext = null;
|
|
810
820
|
outputNode?.disconnect?.();
|
|
811
821
|
outputNode = null;
|
|
822
|
+
analyserNode?.disconnect?.();
|
|
823
|
+
analyserNode = null;
|
|
824
|
+
analyserBuffer = null;
|
|
812
825
|
queueEndTime = 0;
|
|
813
826
|
setState({
|
|
814
827
|
activeSourceCount: 0,
|
|
@@ -819,6 +832,18 @@ var createVoiceAudioPlayer = (source, options = {}) => {
|
|
|
819
832
|
get error() {
|
|
820
833
|
return state.error;
|
|
821
834
|
},
|
|
835
|
+
getOutputLevel: () => {
|
|
836
|
+
if (!analyserNode || !analyserBuffer) {
|
|
837
|
+
return 0;
|
|
838
|
+
}
|
|
839
|
+
analyserNode.getByteTimeDomainData(analyserBuffer);
|
|
840
|
+
let sumSquares = 0;
|
|
841
|
+
for (const sample of analyserBuffer) {
|
|
842
|
+
const centered = (sample - PCM_BYTE_MIDPOINT) / PCM_BYTE_MIDPOINT;
|
|
843
|
+
sumSquares += centered * centered;
|
|
844
|
+
}
|
|
845
|
+
return Math.sqrt(sumSquares / analyserBuffer.length);
|
|
846
|
+
},
|
|
822
847
|
getSnapshot: () => state,
|
|
823
848
|
interrupt: async () => {
|
|
824
849
|
const startedAt = Date.now();
|
package/dist/core/types.d.ts
CHANGED
|
@@ -1331,6 +1331,9 @@ export type VoiceAudioPlayerSource = {
|
|
|
1331
1331
|
export type VoiceAudioPlayer = {
|
|
1332
1332
|
close: () => Promise<void>;
|
|
1333
1333
|
error: string | null;
|
|
1334
|
+
/** Instantaneous RMS amplitude (0..1) of the assistant's audio output — for
|
|
1335
|
+
* driving a visualizer from the actual voice. 0 when idle / no analyser. */
|
|
1336
|
+
getOutputLevel: () => number;
|
|
1334
1337
|
getSnapshot: () => VoiceAudioPlayerState;
|
|
1335
1338
|
activeSourceCount: number;
|
|
1336
1339
|
isActive: boolean;
|
package/dist/index.js
CHANGED
|
@@ -3936,6 +3936,8 @@ var createVoiceSession = (options) => {
|
|
|
3936
3936
|
let activeAdapterGeneration = 0;
|
|
3937
3937
|
let activeTTSTurnId;
|
|
3938
3938
|
let assistantSpeechEndsAt = 0;
|
|
3939
|
+
let lastAssistantAudioAt = 0;
|
|
3940
|
+
let lastTtsSendAt = 0;
|
|
3939
3941
|
let fillerTimer = null;
|
|
3940
3942
|
let fillerActive = false;
|
|
3941
3943
|
let fillerToken = 0;
|
|
@@ -4209,6 +4211,15 @@ var createVoiceSession = (options) => {
|
|
|
4209
4211
|
});
|
|
4210
4212
|
return result;
|
|
4211
4213
|
};
|
|
4214
|
+
let assistantAudioQueue = Promise.resolve();
|
|
4215
|
+
const runAudioSerial = (operation) => {
|
|
4216
|
+
const next = assistantAudioQueue.then(operation);
|
|
4217
|
+
assistantAudioQueue = next.then(() => {
|
|
4218
|
+
return;
|
|
4219
|
+
}, () => {
|
|
4220
|
+
return;
|
|
4221
|
+
});
|
|
4222
|
+
};
|
|
4212
4223
|
const closeAdapter = async (reason) => {
|
|
4213
4224
|
if (!sttSession) {
|
|
4214
4225
|
return;
|
|
@@ -4377,6 +4388,7 @@ var createVoiceSession = (options) => {
|
|
|
4377
4388
|
const chunkMs = normalizedChunk.byteLength / bytesPerSecond * 1000;
|
|
4378
4389
|
assistantSpeechEndsAt = Math.max(assistantSpeechEndsAt, Date.now()) + chunkMs;
|
|
4379
4390
|
}
|
|
4391
|
+
lastAssistantAudioAt = Date.now();
|
|
4380
4392
|
if (activeTTSTurnId) {
|
|
4381
4393
|
await appendTurnLatencyStage({
|
|
4382
4394
|
at: input.receivedAt,
|
|
@@ -4486,18 +4498,28 @@ var createVoiceSession = (options) => {
|
|
|
4486
4498
|
session
|
|
4487
4499
|
});
|
|
4488
4500
|
};
|
|
4489
|
-
const DRAIN_POLL_MS =
|
|
4501
|
+
const DRAIN_POLL_MS = 100;
|
|
4490
4502
|
const DRAIN_TAIL_BUFFER_MS = 300;
|
|
4491
|
-
const
|
|
4492
|
-
const
|
|
4503
|
+
const DRAIN_QUIET_MS = 600;
|
|
4504
|
+
const DRAIN_RENDER_START_MS = 4000;
|
|
4505
|
+
const DRAIN_MAX_MS = 20000;
|
|
4506
|
+
const drainAssistantSpeech = async (renderPendingSince) => {
|
|
4493
4507
|
const startedAt = Date.now();
|
|
4508
|
+
const sleep3 = (delayMs) => new Promise((resolve) => {
|
|
4509
|
+
setTimeout(resolve, delayMs);
|
|
4510
|
+
});
|
|
4494
4511
|
while (Date.now() - startedAt < DRAIN_MAX_MS) {
|
|
4495
|
-
const
|
|
4496
|
-
|
|
4512
|
+
const now = Date.now();
|
|
4513
|
+
const renderStarted = lastAssistantAudioAt >= renderPendingSince || now - renderPendingSince >= DRAIN_RENDER_START_MS;
|
|
4514
|
+
if (!renderStarted) {
|
|
4515
|
+
await sleep3(DRAIN_POLL_MS);
|
|
4516
|
+
continue;
|
|
4517
|
+
}
|
|
4518
|
+
const streamQuiet = now - lastAssistantAudioAt >= DRAIN_QUIET_MS;
|
|
4519
|
+
const playbackDrained = assistantSpeechEndsAt + DRAIN_TAIL_BUFFER_MS <= now;
|
|
4520
|
+
if (streamQuiet && playbackDrained)
|
|
4497
4521
|
return;
|
|
4498
|
-
await
|
|
4499
|
-
setTimeout(resolve, Math.min(remaining, DRAIN_POLL_MS));
|
|
4500
|
-
});
|
|
4522
|
+
await sleep3(DRAIN_POLL_MS);
|
|
4501
4523
|
}
|
|
4502
4524
|
};
|
|
4503
4525
|
const completeInternal = async (result, input = {}) => {
|
|
@@ -4534,7 +4556,8 @@ var createVoiceSession = (options) => {
|
|
|
4534
4556
|
return;
|
|
4535
4557
|
}
|
|
4536
4558
|
if (disposition === "completed") {
|
|
4537
|
-
await drainAssistantSpeech();
|
|
4559
|
+
await drainAssistantSpeech(lastTtsSendAt);
|
|
4560
|
+
await assistantAudioQueue;
|
|
4538
4561
|
}
|
|
4539
4562
|
await appendTrace({
|
|
4540
4563
|
payload: {
|
|
@@ -5204,7 +5227,10 @@ var createVoiceSession = (options) => {
|
|
|
5204
5227
|
});
|
|
5205
5228
|
if (options.realtime) {
|
|
5206
5229
|
openedSession.on("audio", ({ chunk, format, receivedAt }) => {
|
|
5207
|
-
|
|
5230
|
+
runAudioSerial(async () => {
|
|
5231
|
+
if (activeAdapterGeneration !== generation) {
|
|
5232
|
+
return;
|
|
5233
|
+
}
|
|
5208
5234
|
await sendAssistantAudio(chunk, {
|
|
5209
5235
|
format,
|
|
5210
5236
|
receivedAt
|
|
@@ -5233,7 +5259,7 @@ var createVoiceSession = (options) => {
|
|
|
5233
5259
|
});
|
|
5234
5260
|
ttsSession = openedSession;
|
|
5235
5261
|
openedSession.on("audio", ({ chunk, format, receivedAt }) => {
|
|
5236
|
-
|
|
5262
|
+
runAudioSerial(async () => {
|
|
5237
5263
|
if (ttsSession !== openedSession) {
|
|
5238
5264
|
return;
|
|
5239
5265
|
}
|
|
@@ -5361,6 +5387,7 @@ var createVoiceSession = (options) => {
|
|
|
5361
5387
|
try {
|
|
5362
5388
|
await ttsSession2.send(text);
|
|
5363
5389
|
charsSent += text.length;
|
|
5390
|
+
lastTtsSendAt = Date.now();
|
|
5364
5391
|
} catch (error) {
|
|
5365
5392
|
logger.warn("voice assistant audio send failed", {
|
|
5366
5393
|
error: toError(error).message,
|
|
@@ -5643,6 +5670,7 @@ var createVoiceSession = (options) => {
|
|
|
5643
5670
|
turnId: turn.id
|
|
5644
5671
|
});
|
|
5645
5672
|
await activeTTSSession.send(output.assistantText);
|
|
5673
|
+
lastTtsSendAt = Date.now();
|
|
5646
5674
|
if (options.costAccountant) {
|
|
5647
5675
|
options.costAccountant.recordTTS({
|
|
5648
5676
|
characters: output.assistantText.length
|
|
@@ -6067,10 +6095,12 @@ var createVoiceSession = (options) => {
|
|
|
6067
6095
|
if (greetingTTSSession) {
|
|
6068
6096
|
activeTTSTurnId = greetingTurnId;
|
|
6069
6097
|
await greetingTTSSession.send(greetingText);
|
|
6098
|
+
lastTtsSendAt = Date.now();
|
|
6070
6099
|
} else if (options.realtime) {
|
|
6071
6100
|
const greetingRealtimeSession = await ensureAdapter();
|
|
6072
6101
|
activeTTSTurnId = greetingTurnId;
|
|
6073
6102
|
await greetingRealtimeSession.send(greetingText);
|
|
6103
|
+
lastTtsSendAt = Date.now();
|
|
6074
6104
|
}
|
|
6075
6105
|
} catch {}
|
|
6076
6106
|
}
|
package/dist/testing/index.js
CHANGED
|
@@ -1736,6 +1736,8 @@ var DEFAULT_PLAYBACK_RATE = 1;
|
|
|
1736
1736
|
var MIN_PLAYBACK_RATE = 0.5;
|
|
1737
1737
|
var MAX_PLAYBACK_RATE = 2;
|
|
1738
1738
|
var STRETCH_BYPASS_EPSILON = 0.01;
|
|
1739
|
+
var ANALYSER_FFT_SIZE = 256;
|
|
1740
|
+
var PCM_BYTE_MIDPOINT = 128;
|
|
1739
1741
|
var createInitialState = () => ({
|
|
1740
1742
|
activeSourceCount: 0,
|
|
1741
1743
|
error: null,
|
|
@@ -1796,6 +1798,8 @@ var createVoiceAudioPlayer = (source, options = {}) => {
|
|
|
1796
1798
|
let state = createInitialState();
|
|
1797
1799
|
let audioContext = null;
|
|
1798
1800
|
let outputNode = null;
|
|
1801
|
+
let analyserNode = null;
|
|
1802
|
+
let analyserBuffer = null;
|
|
1799
1803
|
let volume = clampVolume(options.volume);
|
|
1800
1804
|
let playbackRate = clampPlaybackRate(options.playbackRate);
|
|
1801
1805
|
let stretcher = null;
|
|
@@ -1892,6 +1896,12 @@ var createVoiceAudioPlayer = (source, options = {}) => {
|
|
|
1892
1896
|
if (audioContext.createGain) {
|
|
1893
1897
|
outputNode = audioContext.createGain();
|
|
1894
1898
|
outputNode.connect?.(audioContext.destination);
|
|
1899
|
+
if (audioContext.createAnalyser) {
|
|
1900
|
+
analyserNode = audioContext.createAnalyser();
|
|
1901
|
+
analyserNode.fftSize = ANALYSER_FFT_SIZE;
|
|
1902
|
+
analyserBuffer = new Uint8Array(analyserNode.fftSize);
|
|
1903
|
+
outputNode.connect?.(analyserNode);
|
|
1904
|
+
}
|
|
1895
1905
|
}
|
|
1896
1906
|
queueEndTime = audioContext.currentTime;
|
|
1897
1907
|
return audioContext;
|
|
@@ -2016,6 +2026,9 @@ var createVoiceAudioPlayer = (source, options = {}) => {
|
|
|
2016
2026
|
audioContext = null;
|
|
2017
2027
|
outputNode?.disconnect?.();
|
|
2018
2028
|
outputNode = null;
|
|
2029
|
+
analyserNode?.disconnect?.();
|
|
2030
|
+
analyserNode = null;
|
|
2031
|
+
analyserBuffer = null;
|
|
2019
2032
|
queueEndTime = 0;
|
|
2020
2033
|
setState({
|
|
2021
2034
|
activeSourceCount: 0,
|
|
@@ -2026,6 +2039,18 @@ var createVoiceAudioPlayer = (source, options = {}) => {
|
|
|
2026
2039
|
get error() {
|
|
2027
2040
|
return state.error;
|
|
2028
2041
|
},
|
|
2042
|
+
getOutputLevel: () => {
|
|
2043
|
+
if (!analyserNode || !analyserBuffer) {
|
|
2044
|
+
return 0;
|
|
2045
|
+
}
|
|
2046
|
+
analyserNode.getByteTimeDomainData(analyserBuffer);
|
|
2047
|
+
let sumSquares = 0;
|
|
2048
|
+
for (const sample of analyserBuffer) {
|
|
2049
|
+
const centered = (sample - PCM_BYTE_MIDPOINT) / PCM_BYTE_MIDPOINT;
|
|
2050
|
+
sumSquares += centered * centered;
|
|
2051
|
+
}
|
|
2052
|
+
return Math.sqrt(sumSquares / analyserBuffer.length);
|
|
2053
|
+
},
|
|
2029
2054
|
getSnapshot: () => state,
|
|
2030
2055
|
interrupt: async () => {
|
|
2031
2056
|
const startedAt = Date.now();
|
|
@@ -6053,6 +6078,8 @@ var createVoiceSession = (options) => {
|
|
|
6053
6078
|
let activeAdapterGeneration = 0;
|
|
6054
6079
|
let activeTTSTurnId;
|
|
6055
6080
|
let assistantSpeechEndsAt = 0;
|
|
6081
|
+
let lastAssistantAudioAt = 0;
|
|
6082
|
+
let lastTtsSendAt = 0;
|
|
6056
6083
|
let fillerTimer = null;
|
|
6057
6084
|
let fillerActive = false;
|
|
6058
6085
|
let fillerToken = 0;
|
|
@@ -6326,6 +6353,15 @@ var createVoiceSession = (options) => {
|
|
|
6326
6353
|
});
|
|
6327
6354
|
return result;
|
|
6328
6355
|
};
|
|
6356
|
+
let assistantAudioQueue = Promise.resolve();
|
|
6357
|
+
const runAudioSerial = (operation) => {
|
|
6358
|
+
const next = assistantAudioQueue.then(operation);
|
|
6359
|
+
assistantAudioQueue = next.then(() => {
|
|
6360
|
+
return;
|
|
6361
|
+
}, () => {
|
|
6362
|
+
return;
|
|
6363
|
+
});
|
|
6364
|
+
};
|
|
6329
6365
|
const closeAdapter = async (reason) => {
|
|
6330
6366
|
if (!sttSession) {
|
|
6331
6367
|
return;
|
|
@@ -6494,6 +6530,7 @@ var createVoiceSession = (options) => {
|
|
|
6494
6530
|
const chunkMs = normalizedChunk.byteLength / bytesPerSecond * 1000;
|
|
6495
6531
|
assistantSpeechEndsAt = Math.max(assistantSpeechEndsAt, Date.now()) + chunkMs;
|
|
6496
6532
|
}
|
|
6533
|
+
lastAssistantAudioAt = Date.now();
|
|
6497
6534
|
if (activeTTSTurnId) {
|
|
6498
6535
|
await appendTurnLatencyStage({
|
|
6499
6536
|
at: input.receivedAt,
|
|
@@ -6603,18 +6640,28 @@ var createVoiceSession = (options) => {
|
|
|
6603
6640
|
session
|
|
6604
6641
|
});
|
|
6605
6642
|
};
|
|
6606
|
-
const DRAIN_POLL_MS =
|
|
6643
|
+
const DRAIN_POLL_MS = 100;
|
|
6607
6644
|
const DRAIN_TAIL_BUFFER_MS = 300;
|
|
6608
|
-
const
|
|
6609
|
-
const
|
|
6645
|
+
const DRAIN_QUIET_MS = 600;
|
|
6646
|
+
const DRAIN_RENDER_START_MS = 4000;
|
|
6647
|
+
const DRAIN_MAX_MS = 20000;
|
|
6648
|
+
const drainAssistantSpeech = async (renderPendingSince) => {
|
|
6610
6649
|
const startedAt = Date.now();
|
|
6650
|
+
const sleep2 = (delayMs) => new Promise((resolve2) => {
|
|
6651
|
+
setTimeout(resolve2, delayMs);
|
|
6652
|
+
});
|
|
6611
6653
|
while (Date.now() - startedAt < DRAIN_MAX_MS) {
|
|
6612
|
-
const
|
|
6613
|
-
|
|
6654
|
+
const now = Date.now();
|
|
6655
|
+
const renderStarted = lastAssistantAudioAt >= renderPendingSince || now - renderPendingSince >= DRAIN_RENDER_START_MS;
|
|
6656
|
+
if (!renderStarted) {
|
|
6657
|
+
await sleep2(DRAIN_POLL_MS);
|
|
6658
|
+
continue;
|
|
6659
|
+
}
|
|
6660
|
+
const streamQuiet = now - lastAssistantAudioAt >= DRAIN_QUIET_MS;
|
|
6661
|
+
const playbackDrained = assistantSpeechEndsAt + DRAIN_TAIL_BUFFER_MS <= now;
|
|
6662
|
+
if (streamQuiet && playbackDrained)
|
|
6614
6663
|
return;
|
|
6615
|
-
await
|
|
6616
|
-
setTimeout(resolve2, Math.min(remaining, DRAIN_POLL_MS));
|
|
6617
|
-
});
|
|
6664
|
+
await sleep2(DRAIN_POLL_MS);
|
|
6618
6665
|
}
|
|
6619
6666
|
};
|
|
6620
6667
|
const completeInternal = async (result, input = {}) => {
|
|
@@ -6651,7 +6698,8 @@ var createVoiceSession = (options) => {
|
|
|
6651
6698
|
return;
|
|
6652
6699
|
}
|
|
6653
6700
|
if (disposition === "completed") {
|
|
6654
|
-
await drainAssistantSpeech();
|
|
6701
|
+
await drainAssistantSpeech(lastTtsSendAt);
|
|
6702
|
+
await assistantAudioQueue;
|
|
6655
6703
|
}
|
|
6656
6704
|
await appendTrace({
|
|
6657
6705
|
payload: {
|
|
@@ -7321,7 +7369,10 @@ var createVoiceSession = (options) => {
|
|
|
7321
7369
|
});
|
|
7322
7370
|
if (options.realtime) {
|
|
7323
7371
|
openedSession.on("audio", ({ chunk, format, receivedAt }) => {
|
|
7324
|
-
|
|
7372
|
+
runAudioSerial(async () => {
|
|
7373
|
+
if (activeAdapterGeneration !== generation) {
|
|
7374
|
+
return;
|
|
7375
|
+
}
|
|
7325
7376
|
await sendAssistantAudio(chunk, {
|
|
7326
7377
|
format,
|
|
7327
7378
|
receivedAt
|
|
@@ -7350,7 +7401,7 @@ var createVoiceSession = (options) => {
|
|
|
7350
7401
|
});
|
|
7351
7402
|
ttsSession = openedSession;
|
|
7352
7403
|
openedSession.on("audio", ({ chunk, format, receivedAt }) => {
|
|
7353
|
-
|
|
7404
|
+
runAudioSerial(async () => {
|
|
7354
7405
|
if (ttsSession !== openedSession) {
|
|
7355
7406
|
return;
|
|
7356
7407
|
}
|
|
@@ -7478,6 +7529,7 @@ var createVoiceSession = (options) => {
|
|
|
7478
7529
|
try {
|
|
7479
7530
|
await ttsSession2.send(text);
|
|
7480
7531
|
charsSent += text.length;
|
|
7532
|
+
lastTtsSendAt = Date.now();
|
|
7481
7533
|
} catch (error) {
|
|
7482
7534
|
logger.warn("voice assistant audio send failed", {
|
|
7483
7535
|
error: toError(error).message,
|
|
@@ -7760,6 +7812,7 @@ var createVoiceSession = (options) => {
|
|
|
7760
7812
|
turnId: turn.id
|
|
7761
7813
|
});
|
|
7762
7814
|
await activeTTSSession.send(output.assistantText);
|
|
7815
|
+
lastTtsSendAt = Date.now();
|
|
7763
7816
|
if (options.costAccountant) {
|
|
7764
7817
|
options.costAccountant.recordTTS({
|
|
7765
7818
|
characters: output.assistantText.length
|
|
@@ -8184,10 +8237,12 @@ var createVoiceSession = (options) => {
|
|
|
8184
8237
|
if (greetingTTSSession) {
|
|
8185
8238
|
activeTTSTurnId = greetingTurnId;
|
|
8186
8239
|
await greetingTTSSession.send(greetingText);
|
|
8240
|
+
lastTtsSendAt = Date.now();
|
|
8187
8241
|
} else if (options.realtime) {
|
|
8188
8242
|
const greetingRealtimeSession = await ensureAdapter();
|
|
8189
8243
|
activeTTSTurnId = greetingTurnId;
|
|
8190
8244
|
await greetingRealtimeSession.send(greetingText);
|
|
8245
|
+
lastTtsSendAt = Date.now();
|
|
8191
8246
|
}
|
|
8192
8247
|
} catch {}
|
|
8193
8248
|
}
|