@livekit/agents 1.0.25 → 1.0.30

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (116)
  1. package/dist/connection_pool.cjs +242 -0
  2. package/dist/connection_pool.cjs.map +1 -0
  3. package/dist/connection_pool.d.cts +123 -0
  4. package/dist/connection_pool.d.ts +123 -0
  5. package/dist/connection_pool.d.ts.map +1 -0
  6. package/dist/connection_pool.js +218 -0
  7. package/dist/connection_pool.js.map +1 -0
  8. package/dist/connection_pool.test.cjs +256 -0
  9. package/dist/connection_pool.test.cjs.map +1 -0
  10. package/dist/connection_pool.test.js +255 -0
  11. package/dist/connection_pool.test.js.map +1 -0
  12. package/dist/index.cjs +2 -0
  13. package/dist/index.cjs.map +1 -1
  14. package/dist/index.d.cts +1 -0
  15. package/dist/index.d.ts +1 -0
  16. package/dist/index.d.ts.map +1 -1
  17. package/dist/index.js +1 -0
  18. package/dist/index.js.map +1 -1
  19. package/dist/inference/tts.cjs +172 -58
  20. package/dist/inference/tts.cjs.map +1 -1
  21. package/dist/inference/tts.d.cts +3 -1
  22. package/dist/inference/tts.d.ts +3 -1
  23. package/dist/inference/tts.d.ts.map +1 -1
  24. package/dist/inference/tts.js +173 -59
  25. package/dist/inference/tts.js.map +1 -1
  26. package/dist/tts/stream_adapter.cjs +6 -3
  27. package/dist/tts/stream_adapter.cjs.map +1 -1
  28. package/dist/tts/stream_adapter.d.cts +1 -1
  29. package/dist/tts/stream_adapter.d.ts +1 -1
  30. package/dist/tts/stream_adapter.d.ts.map +1 -1
  31. package/dist/tts/stream_adapter.js +6 -3
  32. package/dist/tts/stream_adapter.js.map +1 -1
  33. package/dist/tts/tts.cjs +26 -15
  34. package/dist/tts/tts.cjs.map +1 -1
  35. package/dist/tts/tts.d.cts +7 -4
  36. package/dist/tts/tts.d.ts +7 -4
  37. package/dist/tts/tts.d.ts.map +1 -1
  38. package/dist/tts/tts.js +26 -15
  39. package/dist/tts/tts.js.map +1 -1
  40. package/dist/utils.cjs +20 -0
  41. package/dist/utils.cjs.map +1 -1
  42. package/dist/utils.d.cts +7 -0
  43. package/dist/utils.d.ts +7 -0
  44. package/dist/utils.d.ts.map +1 -1
  45. package/dist/utils.js +19 -0
  46. package/dist/utils.js.map +1 -1
  47. package/dist/voice/agent_activity.cjs +3 -1
  48. package/dist/voice/agent_activity.cjs.map +1 -1
  49. package/dist/voice/agent_activity.d.ts.map +1 -1
  50. package/dist/voice/agent_activity.js +3 -1
  51. package/dist/voice/agent_activity.js.map +1 -1
  52. package/dist/voice/agent_session.cjs +6 -1
  53. package/dist/voice/agent_session.cjs.map +1 -1
  54. package/dist/voice/agent_session.d.ts.map +1 -1
  55. package/dist/voice/agent_session.js +6 -1
  56. package/dist/voice/agent_session.js.map +1 -1
  57. package/dist/voice/avatar/datastream_io.cjs +1 -1
  58. package/dist/voice/avatar/datastream_io.cjs.map +1 -1
  59. package/dist/voice/avatar/datastream_io.js +1 -1
  60. package/dist/voice/avatar/datastream_io.js.map +1 -1
  61. package/dist/voice/background_audio.cjs +77 -37
  62. package/dist/voice/background_audio.cjs.map +1 -1
  63. package/dist/voice/background_audio.d.cts +10 -3
  64. package/dist/voice/background_audio.d.ts +10 -3
  65. package/dist/voice/background_audio.d.ts.map +1 -1
  66. package/dist/voice/background_audio.js +78 -37
  67. package/dist/voice/background_audio.js.map +1 -1
  68. package/dist/voice/index.cjs +1 -0
  69. package/dist/voice/index.cjs.map +1 -1
  70. package/dist/voice/index.d.cts +1 -0
  71. package/dist/voice/index.d.ts +1 -0
  72. package/dist/voice/index.d.ts.map +1 -1
  73. package/dist/voice/index.js +1 -0
  74. package/dist/voice/index.js.map +1 -1
  75. package/dist/voice/io.cjs +10 -1
  76. package/dist/voice/io.cjs.map +1 -1
  77. package/dist/voice/io.d.cts +18 -1
  78. package/dist/voice/io.d.ts +18 -1
  79. package/dist/voice/io.d.ts.map +1 -1
  80. package/dist/voice/io.js +10 -1
  81. package/dist/voice/io.js.map +1 -1
  82. package/dist/voice/recorder_io/recorder_io.cjs +1 -1
  83. package/dist/voice/recorder_io/recorder_io.cjs.map +1 -1
  84. package/dist/voice/recorder_io/recorder_io.js +1 -1
  85. package/dist/voice/recorder_io/recorder_io.js.map +1 -1
  86. package/dist/voice/room_io/_output.cjs +1 -1
  87. package/dist/voice/room_io/_output.cjs.map +1 -1
  88. package/dist/voice/room_io/_output.js +1 -1
  89. package/dist/voice/room_io/_output.js.map +1 -1
  90. package/dist/voice/transcription/synchronizer.cjs +1 -1
  91. package/dist/voice/transcription/synchronizer.cjs.map +1 -1
  92. package/dist/voice/transcription/synchronizer.js +1 -1
  93. package/dist/voice/transcription/synchronizer.js.map +1 -1
  94. package/dist/worker.cjs +4 -6
  95. package/dist/worker.cjs.map +1 -1
  96. package/dist/worker.d.ts.map +1 -1
  97. package/dist/worker.js +4 -6
  98. package/dist/worker.js.map +1 -1
  99. package/package.json +3 -3
  100. package/src/connection_pool.test.ts +346 -0
  101. package/src/connection_pool.ts +307 -0
  102. package/src/index.ts +1 -0
  103. package/src/inference/tts.ts +206 -65
  104. package/src/tts/stream_adapter.ts +10 -3
  105. package/src/tts/tts.ts +41 -18
  106. package/src/utils.ts +25 -0
  107. package/src/voice/agent_activity.ts +7 -1
  108. package/src/voice/agent_session.ts +6 -1
  109. package/src/voice/avatar/datastream_io.ts +1 -1
  110. package/src/voice/background_audio.ts +95 -55
  111. package/src/voice/index.ts +1 -0
  112. package/src/voice/io.ts +24 -0
  113. package/src/voice/recorder_io/recorder_io.ts +1 -1
  114. package/src/voice/room_io/_output.ts +1 -1
  115. package/src/voice/transcription/synchronizer.ts +1 -1
  116. package/src/worker.ts +4 -7
package/src/voice/background_audio.ts CHANGED
@@ -3,6 +3,7 @@
3
3
  // SPDX-License-Identifier: Apache-2.0
4
4
  import {
5
5
  AudioFrame,
6
+ AudioMixer,
6
7
  AudioSource,
7
8
  LocalAudioTrack,
8
9
  type LocalTrackPublication,
@@ -57,7 +58,7 @@ export interface BackgroundAudioPlayerOptions {
57
58
 
58
59
  /**
59
60
  * Sound to play when the agent is thinking.
60
- * TODO (Brian): Implement thinking sound when AudioMixer becomes available
61
+ * Plays when agent state changes to 'thinking' and stops when it changes to other states.
61
62
  */
62
63
  thinkingSound?: AudioSourceType | AudioConfig | AudioConfig[];
63
64
 
@@ -113,15 +114,16 @@ export class PlayHandle {
113
114
  * This class handles playing ambient sounds and manages audio track publishing.
114
115
  * It supports:
115
116
  * - Continuous ambient sound playback with looping
117
+ * - Thinking sound playback during agent processing
118
+ * - Multiple simultaneous audio streams via AudioMixer
116
119
  * - Volume control and probability-based sound selection
117
120
  * - Integration with LiveKit rooms and agent sessions
118
121
  *
119
- * Note: Thinking sound not yet supported
120
- *
121
122
  * @example
122
123
  * ```typescript
123
124
  * const player = new BackgroundAudioPlayer({
124
125
  * ambientSound: { source: BuiltinAudioClip.OFFICE_AMBIENCE, volume: 0.8 },
126
+ * thinkingSound: { source: BuiltinAudioClip.KEYBOARD_TYPING, volume: 0.6 },
125
127
  * });
126
128
  *
127
129
  * await player.start({ room, agentSession });
@@ -130,9 +132,12 @@ export class PlayHandle {
130
132
  export class BackgroundAudioPlayer {
131
133
  private ambientSound?: AudioSourceType | AudioConfig | AudioConfig[];
132
134
  private thinkingSound?: AudioSourceType | AudioConfig | AudioConfig[];
135
+ private streamTimeoutMs: number;
133
136
 
134
137
  private playTasks: Task<void>[] = [];
135
138
  private audioSource = new AudioSource(48000, 1, AUDIO_SOURCE_BUFFER_MS);
139
+ private audioMixer: AudioMixer;
140
+ private mixerTask?: Task<void>;
136
141
 
137
142
  private room?: Room;
138
143
  private agentSession?: AgentSession;
@@ -143,20 +148,24 @@ export class BackgroundAudioPlayer {
143
148
  private ambientHandle?: PlayHandle;
144
149
  private thinkingHandle?: PlayHandle;
145
150
 
151
+ private closed = true;
152
+
146
153
  // TODO (Brian): add lock
147
154
 
148
155
  #logger = log();
149
156
 
150
157
  constructor(options?: BackgroundAudioPlayerOptions) {
151
- const { ambientSound, thinkingSound } = options || {};
158
+ const { ambientSound, thinkingSound, streamTimeoutMs = 200 } = options || {};
152
159
 
153
160
  this.ambientSound = ambientSound;
154
161
  this.thinkingSound = thinkingSound;
162
+ this.streamTimeoutMs = streamTimeoutMs;
155
163
 
156
- if (this.thinkingSound) {
157
- this.#logger.warn('thinkingSound is not yet supported');
158
- // TODO: Implement thinking sound when AudioMixer becomes available
159
- }
164
+ this.audioMixer = new AudioMixer(48000, 1, {
165
+ blocksize: 4800, // 100ms at 48kHz
166
+ capacity: 1,
167
+ streamTimeoutMs: this.streamTimeoutMs,
168
+ });
160
169
  }
161
170
 
162
171
  /**
@@ -278,15 +287,24 @@ export class BackgroundAudioPlayer {
278
287
  this.agentSession = agentSession;
279
288
  this.trackPublishOptions = trackPublishOptions;
280
289
 
290
+ this.closed = false;
291
+
281
292
  await this.publishTrack();
282
293
 
283
294
  // TODO (Brian): check job context is not fake
284
295
 
285
- // TODO (Brian): start audio mixer task
296
+ this.mixerTask = Task.from(async () => {
297
+ try {
298
+ await this.runMixerTask();
299
+ } catch (err) {
300
+ if (this.closed) return; // expected when AudioSource is closed
301
+ throw err;
302
+ }
303
+ });
304
+
286
305
  this.room.on('reconnected', this.onReconnected);
287
306
 
288
307
  this.agentSession?.on(AgentSessionEventTypes.AgentStateChanged, this.onAgentStateChanged);
289
-
290
308
  if (!this.ambientSound) return;
291
309
 
292
310
  const normalized = this.normalizeSoundSource(this.ambientSound);
@@ -301,16 +319,21 @@ export class BackgroundAudioPlayer {
301
319
  * Close and cleanup the background audio system
302
320
  */
303
321
  async close(): Promise<void> {
322
+ this.closed = true;
323
+
304
324
  await cancelAndWait(this.playTasks, TASK_TIMEOUT_MS);
305
325
 
306
326
  if (this.republishTask) {
307
327
  await this.republishTask.cancelAndWait(TASK_TIMEOUT_MS);
308
328
  }
309
329
 
310
- // TODO (Brian): cancel audio mixer task and close audio mixer
311
-
330
+ await this.audioMixer.aclose();
312
331
  await this.audioSource.close();
313
332
 
333
+ if (this.mixerTask) {
334
+ await this.mixerTask.cancelAndWait(TASK_TIMEOUT_MS);
335
+ }
336
+
314
337
  this.agentSession?.off(AgentSessionEventTypes.AgentStateChanged, this.onAgentStateChanged);
315
338
  this.room?.off('reconnected', this.onReconnected);
316
339
 
@@ -362,6 +385,12 @@ export class BackgroundAudioPlayer {
362
385
  await this.publishTrack();
363
386
  }
364
387
 
388
+ private async runMixerTask(): Promise<void> {
389
+ for await (const frame of this.audioMixer) {
390
+ await this.audioSource.captureFrame(frame);
391
+ }
392
+ }
393
+
365
394
  private onAgentStateChanged = (ev: AgentStateChangedEvent): void => {
366
395
  if (!this.thinkingSound) {
367
396
  return;
@@ -372,12 +401,45 @@ export class BackgroundAudioPlayer {
372
401
  return;
373
402
  }
374
403
 
375
- // TODO (Brian): play thinking sound and assign to thinkingHandle
404
+ const normalized = this.normalizeSoundSource(this.thinkingSound);
405
+ if (normalized) {
406
+ const { source, volume } = normalized;
407
+ const selectedSound: AudioConfig = { source, volume, probability: 1.0 };
408
+ // Loop thinking sound while in thinking state (same as ambient)
409
+ this.thinkingHandle = this.play(selectedSound, typeof source === 'string');
410
+ }
376
411
  } else {
377
412
  this.thinkingHandle?.stop();
378
413
  }
379
414
  };
380
415
 
416
+ // Note: Python uses numpy, TS uses typed arrays for equivalent logic
417
+ private applyVolumeToFrame(frame: AudioFrame, volume: number): AudioFrame {
418
+ const int16Data = new Int16Array(
419
+ frame.data.buffer,
420
+ frame.data.byteOffset,
421
+ frame.data.byteLength / 2,
422
+ );
423
+ const float32Data = new Float32Array(int16Data.length);
424
+
425
+ for (let i = 0; i < int16Data.length; i++) {
426
+ float32Data[i] = int16Data[i]!;
427
+ }
428
+
429
+ const volumeFactor = 10 ** Math.log10(volume);
430
+ for (let i = 0; i < float32Data.length; i++) {
431
+ float32Data[i]! *= volumeFactor;
432
+ }
433
+
434
+ const outputData = new Int16Array(float32Data.length);
435
+ for (let i = 0; i < float32Data.length; i++) {
436
+ const clipped = Math.max(-32768, Math.min(32767, float32Data[i]!));
437
+ outputData[i] = Math.round(clipped);
438
+ }
439
+
440
+ return new AudioFrame(outputData, frame.sampleRate, frame.channels, frame.samplesPerChannel);
441
+ }
442
+
381
443
  private async playTask({
382
444
  playHandle,
383
445
  sound,
@@ -395,57 +457,35 @@ export class BackgroundAudioPlayer {
395
457
  sound = getBuiltinAudioPath(sound);
396
458
  }
397
459
 
460
+ let audioStream: AsyncIterable<AudioFrame>;
398
461
  if (typeof sound === 'string') {
399
- sound = loop
462
+ audioStream = loop
400
463
  ? loopAudioFramesFromFile(sound, { abortSignal: signal })
401
464
  : audioFramesFromFile(sound, { abortSignal: signal });
465
+ } else {
466
+ audioStream = sound;
402
467
  }
403
468
 
404
- try {
405
- for await (const frame of sound) {
469
+ const applyVolume = this.applyVolumeToFrame.bind(this);
470
+ async function* genWrapper(): AsyncGenerator<AudioFrame> {
471
+ for await (const frame of audioStream) {
406
472
  if (signal.aborted || playHandle.done()) break;
407
-
408
- let processedFrame: AudioFrame;
409
-
410
- if (volume !== 1.0) {
411
- const int16Data = new Int16Array(
412
- frame.data.buffer,
413
- frame.data.byteOffset,
414
- frame.data.byteLength / 2,
415
- );
416
- const float32Data = new Float32Array(int16Data.length);
417
-
418
- for (let i = 0; i < int16Data.length; i++) {
419
- float32Data[i] = int16Data[i]!;
420
- }
421
-
422
- const volumeFactor = 10 ** Math.log10(volume);
423
- for (let i = 0; i < float32Data.length; i++) {
424
- float32Data[i]! *= volumeFactor;
425
- }
426
-
427
- const outputData = new Int16Array(float32Data.length);
428
- for (let i = 0; i < float32Data.length; i++) {
429
- const clipped = Math.max(-32768, Math.min(32767, float32Data[i]!));
430
- outputData[i] = Math.round(clipped);
431
- }
432
-
433
- processedFrame = new AudioFrame(
434
- outputData,
435
- frame.sampleRate,
436
- frame.channels,
437
- frame.samplesPerChannel,
438
- );
439
- } else {
440
- processedFrame = frame;
441
- }
442
-
443
- // TODO (Brian): use AudioMixer to add/remove frame streams
444
- await this.audioSource.captureFrame(processedFrame);
473
+ yield volume !== 1.0 ? applyVolume(frame, volume) : frame;
445
474
  }
475
+ playHandle._markPlayoutDone();
476
+ }
477
+
478
+ const gen = genWrapper();
479
+ try {
480
+ this.audioMixer.addStream(gen);
481
+ await playHandle.waitForPlayout();
446
482
  } finally {
447
- // TODO: the waitForPlayout() may be innaccurate by 400ms
483
+ this.audioMixer.removeStream(gen);
448
484
  playHandle._markPlayoutDone();
485
+
486
+ if (playHandle.done()) {
487
+ await gen.return(undefined);
488
+ }
449
489
  }
450
490
  }
451
491
  }
package/src/voice/index.ts CHANGED
@@ -6,6 +6,7 @@ export { AgentSession, type AgentSessionOptions } from './agent_session.js';
6
6
  export * from './avatar/index.js';
7
7
  export * from './background_audio.js';
8
8
  export * from './events.js';
9
+ export { type TimedString } from './io.js';
9
10
  export * from './report.js';
10
11
  export * from './room_io/index.js';
11
12
  export { RunContext } from './run_context.js';
package/src/voice/io.ts CHANGED
@@ -29,6 +29,20 @@ export type TTSNode = (
29
29
  modelSettings: ModelSettings,
30
30
  ) => Promise<ReadableStream<AudioFrame> | null>;
31
31
 
32
+ /**
33
+ * A string with timing information for word-level alignment.
34
+ */
35
+ export interface TimedString {
36
+ text: string;
37
+ startTime?: number; // seconds
38
+ endTime?: number; // seconds
39
+ }
40
+
41
+ export interface AudioOutputCapabilities {
42
+ /** Whether this output supports pause/resume functionality */
43
+ pause: boolean;
44
+ }
45
+
32
46
  export abstract class AudioInput {
33
47
  protected deferredStream: DeferredReadableStream<AudioFrame> =
34
48
  new DeferredReadableStream<AudioFrame>();
@@ -54,12 +68,15 @@ export abstract class AudioOutput extends EventEmitter {
54
68
  interrupted: false,
55
69
  };
56
70
  protected logger = log();
71
+ protected readonly capabilities: AudioOutputCapabilities;
57
72
 
58
73
  constructor(
59
74
  public sampleRate?: number,
60
75
  protected readonly nextInChain?: AudioOutput,
76
+ capabilities: AudioOutputCapabilities = { pause: false },
61
77
  ) {
62
78
  super();
79
+ this.capabilities = capabilities;
63
80
  if (this.nextInChain) {
64
81
  this.nextInChain.on(AudioOutput.EVENT_PLAYBACK_FINISHED, (ev: PlaybackFinishedEvent) =>
65
82
  this.onPlaybackFinished(ev),
@@ -67,6 +84,13 @@ export abstract class AudioOutput extends EventEmitter {
67
84
  }
68
85
  }
69
86
 
87
+ /**
88
+ * Whether this output and all outputs in the chain support pause/resume.
89
+ */
90
+ get canPause(): boolean {
91
+ return this.capabilities.pause && (this.nextInChain?.canPause ?? true);
92
+ }
93
+
70
94
  /**
71
95
  * Capture an audio frame for playback, frames can be pushed faster than real-time
72
96
  */
package/src/voice/recorder_io/recorder_io.ts CHANGED
@@ -465,7 +465,7 @@ class RecorderAudioOutput extends AudioOutput {
465
465
  audioOutput: AudioOutput,
466
466
  writeFn: (buf: AudioFrame[]) => void,
467
467
  ) {
468
- super(audioOutput.sampleRate, audioOutput);
468
+ super(audioOutput.sampleRate, audioOutput, { pause: true });
469
469
  this.recorderIO = recorderIO;
470
470
  this.writeFn = writeFn;
471
471
  }
package/src/voice/room_io/_output.ts CHANGED
@@ -328,7 +328,7 @@ export class ParticipantAudioOutput extends AudioOutput {
328
328
  private interruptedFuture: Future<void> = new Future();
329
329
 
330
330
  constructor(room: Room, options: AudioOutputOptions) {
331
- super(options.sampleRate);
331
+ super(options.sampleRate, undefined, { pause: true });
332
332
  this.room = room;
333
333
  this.options = options;
334
334
  this.audioSource = new AudioSource(options.sampleRate, options.numChannels);
package/src/voice/transcription/synchronizer.ts CHANGED
@@ -362,7 +362,7 @@ class SyncedAudioOutput extends AudioOutput {
362
362
  public synchronizer: TranscriptionSynchronizer,
363
363
  private nextInChainAudio: AudioOutput,
364
364
  ) {
365
- super(nextInChainAudio.sampleRate, nextInChainAudio);
365
+ super(nextInChainAudio.sampleRate, nextInChainAudio, { pause: true });
366
366
  }
367
367
 
368
368
  async captureFrame(frame: AudioFrame): Promise<void> {
package/src/worker.ts CHANGED
@@ -384,7 +384,7 @@ export class AgentServer {
384
384
  try {
385
385
  await new Promise((resolve, reject) => {
386
386
  this.#session!.on('open', resolve);
387
- this.#session!.on('error', (error) => reject(error.message));
387
+ this.#session!.on('error', (error) => reject(error));
388
388
  this.#session!.on('close', (code) => reject(`WebSocket returned ${code}`));
389
389
  });
390
390
 
@@ -392,14 +392,10 @@ export class AgentServer {
392
392
  this.#logger.debug('connected to LiveKit server');
393
393
  await this.#runWS(this.#session);
394
394
  } catch (e: unknown) {
395
- if (e instanceof Error || e instanceof ErrorEvent) {
396
- e = e.message;
397
- }
398
-
399
395
  if (this.#closed) return;
400
396
  if (retries >= this.#opts.maxRetry) {
401
397
  throw new WorkerError(
402
- `failed to connect to LiveKit server after ${retries} attempts: ${e}`,
398
+ `failed to connect to LiveKit server (${this.#opts.wsURL}) after ${retries} attempts: ${e}`,
403
399
  );
404
400
  }
405
401
 
@@ -407,7 +403,8 @@ export class AgentServer {
407
403
  const delay = Math.min(retries * 2, 10);
408
404
 
409
405
  this.#logger.warn(
410
- `failed to connect to LiveKit server, retrying in ${delay} seconds: ${e} (${retries}/${this.#opts.maxRetry})`,
406
+ e,
407
+ `failed to connect to LiveKit server (${this.#opts.wsURL}), retrying in ${delay} seconds: (${retries}/${this.#opts.maxRetry})`,
411
408
  );
412
409
 
413
410
  await new Promise((resolve) => setTimeout(resolve, delay * 1000));