@absolutejs/voice 0.0.22-beta.580 → 0.0.22-beta.582

This diff shows the contents of publicly available package versions that have been released to one of the supported registries. It is provided for informational purposes only and reflects the changes between those package versions as they appear in their respective public registries.
@@ -22,9 +22,16 @@ type MinimalGainNode = {
22
22
  value: number;
23
23
  };
24
24
  };
25
+ type MinimalAnalyserNode = {
26
+ connect?: (destination: unknown) => void;
27
+ disconnect?: () => void;
28
+ fftSize: number;
29
+ getByteTimeDomainData: (array: Uint8Array) => void;
30
+ };
25
31
  type MinimalAudioContext = {
26
32
  baseLatency?: number;
27
33
  close: () => Promise<void>;
34
+ createAnalyser?: () => MinimalAnalyserNode;
28
35
  createBuffer: (numberOfChannels: number, length: number, sampleRate: number) => MinimalAudioBuffer;
29
36
  createBufferSource: () => MinimalAudioBufferSourceNode;
30
37
  createGain?: () => MinimalGainNode;
@@ -1693,6 +1693,8 @@ var DEFAULT_PLAYBACK_RATE = 1;
1693
1693
  var MIN_PLAYBACK_RATE = 0.5;
1694
1694
  var MAX_PLAYBACK_RATE = 2;
1695
1695
  var STRETCH_BYPASS_EPSILON = 0.01;
1696
+ var ANALYSER_FFT_SIZE = 256;
1697
+ var PCM_BYTE_MIDPOINT = 128;
1696
1698
  var createInitialState3 = () => ({
1697
1699
  activeSourceCount: 0,
1698
1700
  error: null,
@@ -1753,6 +1755,8 @@ var createVoiceAudioPlayer = (source, options = {}) => {
1753
1755
  let state = createInitialState3();
1754
1756
  let audioContext = null;
1755
1757
  let outputNode = null;
1758
+ let analyserNode = null;
1759
+ let analyserBuffer = null;
1756
1760
  let volume = clampVolume(options.volume);
1757
1761
  let playbackRate = clampPlaybackRate(options.playbackRate);
1758
1762
  let stretcher = null;
@@ -1849,6 +1853,12 @@ var createVoiceAudioPlayer = (source, options = {}) => {
1849
1853
  if (audioContext.createGain) {
1850
1854
  outputNode = audioContext.createGain();
1851
1855
  outputNode.connect?.(audioContext.destination);
1856
+ if (audioContext.createAnalyser) {
1857
+ analyserNode = audioContext.createAnalyser();
1858
+ analyserNode.fftSize = ANALYSER_FFT_SIZE;
1859
+ analyserBuffer = new Uint8Array(analyserNode.fftSize);
1860
+ outputNode.connect?.(analyserNode);
1861
+ }
1852
1862
  }
1853
1863
  queueEndTime = audioContext.currentTime;
1854
1864
  return audioContext;
@@ -1973,6 +1983,9 @@ var createVoiceAudioPlayer = (source, options = {}) => {
1973
1983
  audioContext = null;
1974
1984
  outputNode?.disconnect?.();
1975
1985
  outputNode = null;
1986
+ analyserNode?.disconnect?.();
1987
+ analyserNode = null;
1988
+ analyserBuffer = null;
1976
1989
  queueEndTime = 0;
1977
1990
  setState({
1978
1991
  activeSourceCount: 0,
@@ -1983,6 +1996,18 @@ var createVoiceAudioPlayer = (source, options = {}) => {
1983
1996
  get error() {
1984
1997
  return state.error;
1985
1998
  },
1999
+ getOutputLevel: () => {
2000
+ if (!analyserNode || !analyserBuffer) {
2001
+ return 0;
2002
+ }
2003
+ analyserNode.getByteTimeDomainData(analyserBuffer);
2004
+ let sumSquares = 0;
2005
+ for (const sample of analyserBuffer) {
2006
+ const centered = (sample - PCM_BYTE_MIDPOINT) / PCM_BYTE_MIDPOINT;
2007
+ sumSquares += centered * centered;
2008
+ }
2009
+ return Math.sqrt(sumSquares / analyserBuffer.length);
2010
+ },
1986
2011
  getSnapshot: () => state,
1987
2012
  interrupt: async () => {
1988
2013
  const startedAt = Date.now();
@@ -529,6 +529,8 @@ var DEFAULT_PLAYBACK_RATE = 1;
529
529
  var MIN_PLAYBACK_RATE = 0.5;
530
530
  var MAX_PLAYBACK_RATE = 2;
531
531
  var STRETCH_BYPASS_EPSILON = 0.01;
532
+ var ANALYSER_FFT_SIZE = 256;
533
+ var PCM_BYTE_MIDPOINT = 128;
532
534
  var createInitialState = () => ({
533
535
  activeSourceCount: 0,
534
536
  error: null,
@@ -589,6 +591,8 @@ var createVoiceAudioPlayer = (source, options = {}) => {
589
591
  let state = createInitialState();
590
592
  let audioContext = null;
591
593
  let outputNode = null;
594
+ let analyserNode = null;
595
+ let analyserBuffer = null;
592
596
  let volume = clampVolume(options.volume);
593
597
  let playbackRate = clampPlaybackRate(options.playbackRate);
594
598
  let stretcher = null;
@@ -685,6 +689,12 @@ var createVoiceAudioPlayer = (source, options = {}) => {
685
689
  if (audioContext.createGain) {
686
690
  outputNode = audioContext.createGain();
687
691
  outputNode.connect?.(audioContext.destination);
692
+ if (audioContext.createAnalyser) {
693
+ analyserNode = audioContext.createAnalyser();
694
+ analyserNode.fftSize = ANALYSER_FFT_SIZE;
695
+ analyserBuffer = new Uint8Array(analyserNode.fftSize);
696
+ outputNode.connect?.(analyserNode);
697
+ }
688
698
  }
689
699
  queueEndTime = audioContext.currentTime;
690
700
  return audioContext;
@@ -809,6 +819,9 @@ var createVoiceAudioPlayer = (source, options = {}) => {
809
819
  audioContext = null;
810
820
  outputNode?.disconnect?.();
811
821
  outputNode = null;
822
+ analyserNode?.disconnect?.();
823
+ analyserNode = null;
824
+ analyserBuffer = null;
812
825
  queueEndTime = 0;
813
826
  setState({
814
827
  activeSourceCount: 0,
@@ -819,6 +832,18 @@ var createVoiceAudioPlayer = (source, options = {}) => {
819
832
  get error() {
820
833
  return state.error;
821
834
  },
835
+ getOutputLevel: () => {
836
+ if (!analyserNode || !analyserBuffer) {
837
+ return 0;
838
+ }
839
+ analyserNode.getByteTimeDomainData(analyserBuffer);
840
+ let sumSquares = 0;
841
+ for (const sample of analyserBuffer) {
842
+ const centered = (sample - PCM_BYTE_MIDPOINT) / PCM_BYTE_MIDPOINT;
843
+ sumSquares += centered * centered;
844
+ }
845
+ return Math.sqrt(sumSquares / analyserBuffer.length);
846
+ },
822
847
  getSnapshot: () => state,
823
848
  interrupt: async () => {
824
849
  const startedAt = Date.now();
@@ -1331,6 +1331,9 @@ export type VoiceAudioPlayerSource = {
1331
1331
  export type VoiceAudioPlayer = {
1332
1332
  close: () => Promise<void>;
1333
1333
  error: string | null;
1334
+ /** Instantaneous RMS amplitude (0..1) of the assistant's audio output — for
1335
+ * driving a visualizer from the actual voice. 0 when idle / no analyser. */
1336
+ getOutputLevel: () => number;
1334
1337
  getSnapshot: () => VoiceAudioPlayerState;
1335
1338
  activeSourceCount: number;
1336
1339
  isActive: boolean;
package/dist/index.js CHANGED
@@ -3936,6 +3936,8 @@ var createVoiceSession = (options) => {
3936
3936
  let activeAdapterGeneration = 0;
3937
3937
  let activeTTSTurnId;
3938
3938
  let assistantSpeechEndsAt = 0;
3939
+ let lastAssistantAudioAt = 0;
3940
+ let lastTtsSendAt = 0;
3939
3941
  let fillerTimer = null;
3940
3942
  let fillerActive = false;
3941
3943
  let fillerToken = 0;
@@ -4209,6 +4211,15 @@ var createVoiceSession = (options) => {
4209
4211
  });
4210
4212
  return result;
4211
4213
  };
4214
+ let assistantAudioQueue = Promise.resolve();
4215
+ const runAudioSerial = (operation) => {
4216
+ const next = assistantAudioQueue.then(operation);
4217
+ assistantAudioQueue = next.then(() => {
4218
+ return;
4219
+ }, () => {
4220
+ return;
4221
+ });
4222
+ };
4212
4223
  const closeAdapter = async (reason) => {
4213
4224
  if (!sttSession) {
4214
4225
  return;
@@ -4377,6 +4388,7 @@ var createVoiceSession = (options) => {
4377
4388
  const chunkMs = normalizedChunk.byteLength / bytesPerSecond * 1000;
4378
4389
  assistantSpeechEndsAt = Math.max(assistantSpeechEndsAt, Date.now()) + chunkMs;
4379
4390
  }
4391
+ lastAssistantAudioAt = Date.now();
4380
4392
  if (activeTTSTurnId) {
4381
4393
  await appendTurnLatencyStage({
4382
4394
  at: input.receivedAt,
@@ -4486,18 +4498,28 @@ var createVoiceSession = (options) => {
4486
4498
  session
4487
4499
  });
4488
4500
  };
4489
- const DRAIN_POLL_MS = 200;
4501
+ const DRAIN_POLL_MS = 100;
4490
4502
  const DRAIN_TAIL_BUFFER_MS = 300;
4491
- const DRAIN_MAX_MS = 12000;
4492
- const drainAssistantSpeech = async () => {
4503
+ const DRAIN_QUIET_MS = 600;
4504
+ const DRAIN_RENDER_START_MS = 4000;
4505
+ const DRAIN_MAX_MS = 20000;
4506
+ const drainAssistantSpeech = async (renderPendingSince) => {
4493
4507
  const startedAt = Date.now();
4508
+ const sleep3 = (delayMs) => new Promise((resolve) => {
4509
+ setTimeout(resolve, delayMs);
4510
+ });
4494
4511
  while (Date.now() - startedAt < DRAIN_MAX_MS) {
4495
- const remaining = assistantSpeechEndsAt + DRAIN_TAIL_BUFFER_MS - Date.now();
4496
- if (remaining <= 0)
4512
+ const now = Date.now();
4513
+ const renderStarted = lastAssistantAudioAt >= renderPendingSince || now - renderPendingSince >= DRAIN_RENDER_START_MS;
4514
+ if (!renderStarted) {
4515
+ await sleep3(DRAIN_POLL_MS);
4516
+ continue;
4517
+ }
4518
+ const streamQuiet = now - lastAssistantAudioAt >= DRAIN_QUIET_MS;
4519
+ const playbackDrained = assistantSpeechEndsAt + DRAIN_TAIL_BUFFER_MS <= now;
4520
+ if (streamQuiet && playbackDrained)
4497
4521
  return;
4498
- await new Promise((resolve) => {
4499
- setTimeout(resolve, Math.min(remaining, DRAIN_POLL_MS));
4500
- });
4522
+ await sleep3(DRAIN_POLL_MS);
4501
4523
  }
4502
4524
  };
4503
4525
  const completeInternal = async (result, input = {}) => {
@@ -4534,7 +4556,8 @@ var createVoiceSession = (options) => {
4534
4556
  return;
4535
4557
  }
4536
4558
  if (disposition === "completed") {
4537
- await drainAssistantSpeech();
4559
+ await drainAssistantSpeech(lastTtsSendAt);
4560
+ await assistantAudioQueue;
4538
4561
  }
4539
4562
  await appendTrace({
4540
4563
  payload: {
@@ -5204,7 +5227,10 @@ var createVoiceSession = (options) => {
5204
5227
  });
5205
5228
  if (options.realtime) {
5206
5229
  openedSession.on("audio", ({ chunk, format, receivedAt }) => {
5207
- runAdapterEvent("adapter.audio", async () => {
5230
+ runAudioSerial(async () => {
5231
+ if (activeAdapterGeneration !== generation) {
5232
+ return;
5233
+ }
5208
5234
  await sendAssistantAudio(chunk, {
5209
5235
  format,
5210
5236
  receivedAt
@@ -5233,7 +5259,7 @@ var createVoiceSession = (options) => {
5233
5259
  });
5234
5260
  ttsSession = openedSession;
5235
5261
  openedSession.on("audio", ({ chunk, format, receivedAt }) => {
5236
- runSerial("tts.audio", async () => {
5262
+ runAudioSerial(async () => {
5237
5263
  if (ttsSession !== openedSession) {
5238
5264
  return;
5239
5265
  }
@@ -5361,6 +5387,7 @@ var createVoiceSession = (options) => {
5361
5387
  try {
5362
5388
  await ttsSession2.send(text);
5363
5389
  charsSent += text.length;
5390
+ lastTtsSendAt = Date.now();
5364
5391
  } catch (error) {
5365
5392
  logger.warn("voice assistant audio send failed", {
5366
5393
  error: toError(error).message,
@@ -5643,6 +5670,7 @@ var createVoiceSession = (options) => {
5643
5670
  turnId: turn.id
5644
5671
  });
5645
5672
  await activeTTSSession.send(output.assistantText);
5673
+ lastTtsSendAt = Date.now();
5646
5674
  if (options.costAccountant) {
5647
5675
  options.costAccountant.recordTTS({
5648
5676
  characters: output.assistantText.length
@@ -6067,10 +6095,12 @@ var createVoiceSession = (options) => {
6067
6095
  if (greetingTTSSession) {
6068
6096
  activeTTSTurnId = greetingTurnId;
6069
6097
  await greetingTTSSession.send(greetingText);
6098
+ lastTtsSendAt = Date.now();
6070
6099
  } else if (options.realtime) {
6071
6100
  const greetingRealtimeSession = await ensureAdapter();
6072
6101
  activeTTSTurnId = greetingTurnId;
6073
6102
  await greetingRealtimeSession.send(greetingText);
6103
+ lastTtsSendAt = Date.now();
6074
6104
  }
6075
6105
  } catch {}
6076
6106
  }
@@ -1736,6 +1736,8 @@ var DEFAULT_PLAYBACK_RATE = 1;
1736
1736
  var MIN_PLAYBACK_RATE = 0.5;
1737
1737
  var MAX_PLAYBACK_RATE = 2;
1738
1738
  var STRETCH_BYPASS_EPSILON = 0.01;
1739
+ var ANALYSER_FFT_SIZE = 256;
1740
+ var PCM_BYTE_MIDPOINT = 128;
1739
1741
  var createInitialState = () => ({
1740
1742
  activeSourceCount: 0,
1741
1743
  error: null,
@@ -1796,6 +1798,8 @@ var createVoiceAudioPlayer = (source, options = {}) => {
1796
1798
  let state = createInitialState();
1797
1799
  let audioContext = null;
1798
1800
  let outputNode = null;
1801
+ let analyserNode = null;
1802
+ let analyserBuffer = null;
1799
1803
  let volume = clampVolume(options.volume);
1800
1804
  let playbackRate = clampPlaybackRate(options.playbackRate);
1801
1805
  let stretcher = null;
@@ -1892,6 +1896,12 @@ var createVoiceAudioPlayer = (source, options = {}) => {
1892
1896
  if (audioContext.createGain) {
1893
1897
  outputNode = audioContext.createGain();
1894
1898
  outputNode.connect?.(audioContext.destination);
1899
+ if (audioContext.createAnalyser) {
1900
+ analyserNode = audioContext.createAnalyser();
1901
+ analyserNode.fftSize = ANALYSER_FFT_SIZE;
1902
+ analyserBuffer = new Uint8Array(analyserNode.fftSize);
1903
+ outputNode.connect?.(analyserNode);
1904
+ }
1895
1905
  }
1896
1906
  queueEndTime = audioContext.currentTime;
1897
1907
  return audioContext;
@@ -2016,6 +2026,9 @@ var createVoiceAudioPlayer = (source, options = {}) => {
2016
2026
  audioContext = null;
2017
2027
  outputNode?.disconnect?.();
2018
2028
  outputNode = null;
2029
+ analyserNode?.disconnect?.();
2030
+ analyserNode = null;
2031
+ analyserBuffer = null;
2019
2032
  queueEndTime = 0;
2020
2033
  setState({
2021
2034
  activeSourceCount: 0,
@@ -2026,6 +2039,18 @@ var createVoiceAudioPlayer = (source, options = {}) => {
2026
2039
  get error() {
2027
2040
  return state.error;
2028
2041
  },
2042
+ getOutputLevel: () => {
2043
+ if (!analyserNode || !analyserBuffer) {
2044
+ return 0;
2045
+ }
2046
+ analyserNode.getByteTimeDomainData(analyserBuffer);
2047
+ let sumSquares = 0;
2048
+ for (const sample of analyserBuffer) {
2049
+ const centered = (sample - PCM_BYTE_MIDPOINT) / PCM_BYTE_MIDPOINT;
2050
+ sumSquares += centered * centered;
2051
+ }
2052
+ return Math.sqrt(sumSquares / analyserBuffer.length);
2053
+ },
2029
2054
  getSnapshot: () => state,
2030
2055
  interrupt: async () => {
2031
2056
  const startedAt = Date.now();
@@ -6053,6 +6078,8 @@ var createVoiceSession = (options) => {
6053
6078
  let activeAdapterGeneration = 0;
6054
6079
  let activeTTSTurnId;
6055
6080
  let assistantSpeechEndsAt = 0;
6081
+ let lastAssistantAudioAt = 0;
6082
+ let lastTtsSendAt = 0;
6056
6083
  let fillerTimer = null;
6057
6084
  let fillerActive = false;
6058
6085
  let fillerToken = 0;
@@ -6326,6 +6353,15 @@ var createVoiceSession = (options) => {
6326
6353
  });
6327
6354
  return result;
6328
6355
  };
6356
+ let assistantAudioQueue = Promise.resolve();
6357
+ const runAudioSerial = (operation) => {
6358
+ const next = assistantAudioQueue.then(operation);
6359
+ assistantAudioQueue = next.then(() => {
6360
+ return;
6361
+ }, () => {
6362
+ return;
6363
+ });
6364
+ };
6329
6365
  const closeAdapter = async (reason) => {
6330
6366
  if (!sttSession) {
6331
6367
  return;
@@ -6494,6 +6530,7 @@ var createVoiceSession = (options) => {
6494
6530
  const chunkMs = normalizedChunk.byteLength / bytesPerSecond * 1000;
6495
6531
  assistantSpeechEndsAt = Math.max(assistantSpeechEndsAt, Date.now()) + chunkMs;
6496
6532
  }
6533
+ lastAssistantAudioAt = Date.now();
6497
6534
  if (activeTTSTurnId) {
6498
6535
  await appendTurnLatencyStage({
6499
6536
  at: input.receivedAt,
@@ -6603,18 +6640,28 @@ var createVoiceSession = (options) => {
6603
6640
  session
6604
6641
  });
6605
6642
  };
6606
- const DRAIN_POLL_MS = 200;
6643
+ const DRAIN_POLL_MS = 100;
6607
6644
  const DRAIN_TAIL_BUFFER_MS = 300;
6608
- const DRAIN_MAX_MS = 12000;
6609
- const drainAssistantSpeech = async () => {
6645
+ const DRAIN_QUIET_MS = 600;
6646
+ const DRAIN_RENDER_START_MS = 4000;
6647
+ const DRAIN_MAX_MS = 20000;
6648
+ const drainAssistantSpeech = async (renderPendingSince) => {
6610
6649
  const startedAt = Date.now();
6650
+ const sleep2 = (delayMs) => new Promise((resolve2) => {
6651
+ setTimeout(resolve2, delayMs);
6652
+ });
6611
6653
  while (Date.now() - startedAt < DRAIN_MAX_MS) {
6612
- const remaining = assistantSpeechEndsAt + DRAIN_TAIL_BUFFER_MS - Date.now();
6613
- if (remaining <= 0)
6654
+ const now = Date.now();
6655
+ const renderStarted = lastAssistantAudioAt >= renderPendingSince || now - renderPendingSince >= DRAIN_RENDER_START_MS;
6656
+ if (!renderStarted) {
6657
+ await sleep2(DRAIN_POLL_MS);
6658
+ continue;
6659
+ }
6660
+ const streamQuiet = now - lastAssistantAudioAt >= DRAIN_QUIET_MS;
6661
+ const playbackDrained = assistantSpeechEndsAt + DRAIN_TAIL_BUFFER_MS <= now;
6662
+ if (streamQuiet && playbackDrained)
6614
6663
  return;
6615
- await new Promise((resolve2) => {
6616
- setTimeout(resolve2, Math.min(remaining, DRAIN_POLL_MS));
6617
- });
6664
+ await sleep2(DRAIN_POLL_MS);
6618
6665
  }
6619
6666
  };
6620
6667
  const completeInternal = async (result, input = {}) => {
@@ -6651,7 +6698,8 @@ var createVoiceSession = (options) => {
6651
6698
  return;
6652
6699
  }
6653
6700
  if (disposition === "completed") {
6654
- await drainAssistantSpeech();
6701
+ await drainAssistantSpeech(lastTtsSendAt);
6702
+ await assistantAudioQueue;
6655
6703
  }
6656
6704
  await appendTrace({
6657
6705
  payload: {
@@ -7321,7 +7369,10 @@ var createVoiceSession = (options) => {
7321
7369
  });
7322
7370
  if (options.realtime) {
7323
7371
  openedSession.on("audio", ({ chunk, format, receivedAt }) => {
7324
- runAdapterEvent("adapter.audio", async () => {
7372
+ runAudioSerial(async () => {
7373
+ if (activeAdapterGeneration !== generation) {
7374
+ return;
7375
+ }
7325
7376
  await sendAssistantAudio(chunk, {
7326
7377
  format,
7327
7378
  receivedAt
@@ -7350,7 +7401,7 @@ var createVoiceSession = (options) => {
7350
7401
  });
7351
7402
  ttsSession = openedSession;
7352
7403
  openedSession.on("audio", ({ chunk, format, receivedAt }) => {
7353
- runSerial("tts.audio", async () => {
7404
+ runAudioSerial(async () => {
7354
7405
  if (ttsSession !== openedSession) {
7355
7406
  return;
7356
7407
  }
@@ -7478,6 +7529,7 @@ var createVoiceSession = (options) => {
7478
7529
  try {
7479
7530
  await ttsSession2.send(text);
7480
7531
  charsSent += text.length;
7532
+ lastTtsSendAt = Date.now();
7481
7533
  } catch (error) {
7482
7534
  logger.warn("voice assistant audio send failed", {
7483
7535
  error: toError(error).message,
@@ -7760,6 +7812,7 @@ var createVoiceSession = (options) => {
7760
7812
  turnId: turn.id
7761
7813
  });
7762
7814
  await activeTTSSession.send(output.assistantText);
7815
+ lastTtsSendAt = Date.now();
7763
7816
  if (options.costAccountant) {
7764
7817
  options.costAccountant.recordTTS({
7765
7818
  characters: output.assistantText.length
@@ -8184,10 +8237,12 @@ var createVoiceSession = (options) => {
8184
8237
  if (greetingTTSSession) {
8185
8238
  activeTTSTurnId = greetingTurnId;
8186
8239
  await greetingTTSSession.send(greetingText);
8240
+ lastTtsSendAt = Date.now();
8187
8241
  } else if (options.realtime) {
8188
8242
  const greetingRealtimeSession = await ensureAdapter();
8189
8243
  activeTTSTurnId = greetingTurnId;
8190
8244
  await greetingRealtimeSession.send(greetingText);
8245
+ lastTtsSendAt = Date.now();
8191
8246
  }
8192
8247
  } catch {}
8193
8248
  }
package/package.json CHANGED
@@ -1,6 +1,6 @@
1
1
  {
2
2
  "name": "@absolutejs/voice",
3
- "version": "0.0.22-beta.580",
3
+ "version": "0.0.22-beta.582",
4
4
  "description": "Voice primitives and Elysia plugin for AbsoluteJS",
5
5
  "repository": {
6
6
  "type": "git",