@livekit/agents 1.0.48 → 1.0.50

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (53):
  1. package/dist/cpu.cjs +189 -0
  2. package/dist/cpu.cjs.map +1 -0
  3. package/dist/cpu.d.cts +24 -0
  4. package/dist/cpu.d.ts +24 -0
  5. package/dist/cpu.d.ts.map +1 -0
  6. package/dist/cpu.js +152 -0
  7. package/dist/cpu.js.map +1 -0
  8. package/dist/cpu.test.cjs +227 -0
  9. package/dist/cpu.test.cjs.map +1 -0
  10. package/dist/cpu.test.js +204 -0
  11. package/dist/cpu.test.js.map +1 -0
  12. package/dist/inference/llm.cjs.map +1 -1
  13. package/dist/inference/llm.d.cts +1 -1
  14. package/dist/inference/llm.d.ts +1 -1
  15. package/dist/inference/llm.d.ts.map +1 -1
  16. package/dist/inference/llm.js.map +1 -1
  17. package/dist/inference/tts.cjs.map +1 -1
  18. package/dist/inference/tts.d.cts +6 -0
  19. package/dist/inference/tts.d.ts +6 -0
  20. package/dist/inference/tts.d.ts.map +1 -1
  21. package/dist/inference/tts.js.map +1 -1
  22. package/dist/ipc/job_proc_lazy_main.cjs +13 -4
  23. package/dist/ipc/job_proc_lazy_main.cjs.map +1 -1
  24. package/dist/ipc/job_proc_lazy_main.js +13 -4
  25. package/dist/ipc/job_proc_lazy_main.js.map +1 -1
  26. package/dist/version.cjs +1 -1
  27. package/dist/version.js +1 -1
  28. package/dist/voice/agent_activity.cjs +39 -8
  29. package/dist/voice/agent_activity.cjs.map +1 -1
  30. package/dist/voice/agent_activity.d.ts.map +1 -1
  31. package/dist/voice/agent_activity.js +40 -9
  32. package/dist/voice/agent_activity.js.map +1 -1
  33. package/dist/voice/agent_session.cjs +27 -1
  34. package/dist/voice/agent_session.cjs.map +1 -1
  35. package/dist/voice/agent_session.d.cts +6 -0
  36. package/dist/voice/agent_session.d.ts +6 -0
  37. package/dist/voice/agent_session.d.ts.map +1 -1
  38. package/dist/voice/agent_session.js +27 -1
  39. package/dist/voice/agent_session.js.map +1 -1
  40. package/dist/worker.cjs +6 -29
  41. package/dist/worker.cjs.map +1 -1
  42. package/dist/worker.d.ts.map +1 -1
  43. package/dist/worker.js +6 -19
  44. package/dist/worker.js.map +1 -1
  45. package/package.json +1 -1
  46. package/src/cpu.test.ts +239 -0
  47. package/src/cpu.ts +173 -0
  48. package/src/inference/llm.ts +2 -0
  49. package/src/inference/tts.ts +8 -1
  50. package/src/ipc/job_proc_lazy_main.ts +15 -4
  51. package/src/voice/agent_activity.ts +68 -10
  52. package/src/voice/agent_session.ts +33 -2
  53. package/src/worker.ts +34 -50
@@ -7,7 +7,7 @@ import type { Span } from '@opentelemetry/api';
7
7
  import { ROOT_CONTEXT, context as otelContext, trace } from '@opentelemetry/api';
8
8
  import { Heap } from 'heap-js';
9
9
  import { AsyncLocalStorage } from 'node:async_hooks';
10
- import { ReadableStream } from 'node:stream/web';
10
+ import { ReadableStream, TransformStream } from 'node:stream/web';
11
11
  import { type ChatContext, ChatMessage } from '../llm/chat_context.js';
12
12
  import {
13
13
  type ChatItem,
@@ -485,15 +485,36 @@ export class AgentActivity implements RecognitionHooks {
485
485
  void this.audioStream.close();
486
486
  this.audioStream = new MultiInputStream<AudioFrame>();
487
487
 
488
+ // Filter is applied on this.audioStream.stream (downstream of MultiInputStream) rather
489
+ // than on the source audioStream via pipeThrough. pipeThrough locks its source stream, so
490
+ // if it were applied directly on audioStream, that lock would survive MultiInputStream.close()
491
+ // and make audioStream permanently locked for subsequent attachAudioInput calls (e.g. handoff).
492
+ const aecWarmupAudioFilter = new TransformStream<AudioFrame, AudioFrame>({
493
+ transform: (frame, controller) => {
494
+ const shouldDiscardForAecWarmup =
495
+ this.agentSession.agentState === 'speaking' && this.agentSession._aecWarmupRemaining > 0;
496
+ if (!shouldDiscardForAecWarmup) {
497
+ controller.enqueue(frame);
498
+ }
499
+ },
500
+ });
501
+
488
502
  this.audioStreamId = this.audioStream.addInputStream(audioStream);
489
- const [realtimeAudioStream, recognitionAudioStream] = this.audioStream.stream.tee();
490
503
 
491
- if (this.realtimeSession) {
504
+ if (this.realtimeSession && this.audioRecognition) {
505
+ const [realtimeAudioStream, recognitionAudioStream] = this.audioStream.stream
506
+ .pipeThrough(aecWarmupAudioFilter)
507
+ .tee();
492
508
  this.realtimeSession.setInputAudioStream(realtimeAudioStream);
493
- }
494
-
495
- if (this.audioRecognition) {
496
509
  this.audioRecognition.setInputAudioStream(recognitionAudioStream);
510
+ } else if (this.realtimeSession) {
511
+ this.realtimeSession.setInputAudioStream(
512
+ this.audioStream.stream.pipeThrough(aecWarmupAudioFilter),
513
+ );
514
+ } else if (this.audioRecognition) {
515
+ this.audioRecognition.setInputAudioStream(
516
+ this.audioStream.stream.pipeThrough(aecWarmupAudioFilter),
517
+ );
497
518
  }
498
519
  }
499
520
 
@@ -755,6 +776,11 @@ export class AgentActivity implements RecognitionHooks {
755
776
  }
756
777
 
757
778
  private interruptByAudioActivity(): void {
779
+ if (this.agentSession._aecWarmupRemaining > 0) {
780
+ // Disable interruption from audio activity while AEC warmup is active.
781
+ return;
782
+ }
783
+
758
784
  if (this.llm instanceof RealtimeModel && this.llm.capabilities.turnDetection) {
759
785
  // skip speech handle interruption if server side turn detection is enabled
760
786
  return;
@@ -1014,6 +1040,16 @@ export class AgentActivity implements RecognitionHooks {
1014
1040
  throw new Error('Speech queue is empty');
1015
1041
  }
1016
1042
  const speechHandle = heapItem[2];
1043
+
1044
+ // Skip speech handles that were already interrupted/done before being
1045
+ // picked up from the queue (e.g. interrupted during shutdown before the
1046
+ // main loop had a chance to process them). Calling _authorizeGeneration
1047
+ // on a done handle would create a generation Future that nobody resolves,
1048
+ // causing the main loop to hang forever.
1049
+ if (speechHandle.interrupted || speechHandle.done()) {
1050
+ continue;
1051
+ }
1052
+
1017
1053
  this._currentSpeech = speechHandle;
1018
1054
  speechHandle._authorizeGeneration();
1019
1055
  await speechHandle._waitForGeneration();
@@ -1210,7 +1246,24 @@ export class AgentActivity implements RecognitionHooks {
1210
1246
 
1211
1247
  this.realtimeSession?.interrupt();
1212
1248
 
1213
- if (currentSpeech === undefined) {
1249
+ if (force) {
1250
+ // Force-interrupt (used during shutdown): cancel all speech tasks so they
1251
+ // don't block on I/O that will never complete (e.g. audioOutput.waitForPlayout()
1252
+ // when the room is disconnected). Mark the current speech as done immediately
1253
+ // so the interrupt future resolves without waiting for tasks to finish.
1254
+ // Clear the queue so mainTask doesn't dequeue already-interrupted handles
1255
+ // and hang on _waitForGeneration() (the generation future created by
1256
+ // _authorizeGeneration would never resolve since _markDone is a no-op
1257
+ // once doneFut is already settled).
1258
+ for (const task of this.speechTasks) {
1259
+ task.cancel();
1260
+ }
1261
+ if (currentSpeech && !currentSpeech.done()) {
1262
+ currentSpeech._markDone();
1263
+ }
1264
+ this.speechQueue.clear();
1265
+ future.resolve();
1266
+ } else if (currentSpeech === undefined) {
1214
1267
  future.resolve();
1215
1268
  } else {
1216
1269
  currentSpeech.addDoneCallback(() => {
@@ -1718,9 +1771,7 @@ export class AgentActivity implements RecognitionHooks {
1718
1771
  }
1719
1772
 
1720
1773
  replyAbortController.abort();
1721
- await Promise.allSettled(
1722
- tasks.map((task) => task.cancelAndWait(AgentActivity.REPLY_TASK_CANCEL_TIMEOUT)),
1723
- );
1774
+ await cancelAndWait(tasks, AgentActivity.REPLY_TASK_CANCEL_TIMEOUT);
1724
1775
 
1725
1776
  let forwardedText = textOut?.text || '';
1726
1777
 
@@ -2549,6 +2600,13 @@ export class AgentActivity implements RecognitionHooks {
2549
2600
  const unlock = await this.lock.lock();
2550
2601
  try {
2551
2602
  this.cancelPreemptiveGeneration();
2603
+
2604
+ await cancelAndWait(Array.from(this.speechTasks), AgentActivity.REPLY_TASK_CANCEL_TIMEOUT);
2605
+
2606
+ if (this._currentSpeech && !this._currentSpeech.done()) {
2607
+ this._currentSpeech._markDone();
2608
+ }
2609
+
2552
2610
  await this._closeSessionResources();
2553
2611
 
2554
2612
  if (this._mainTask) {
@@ -77,6 +77,7 @@ export interface VoiceOptions {
77
77
  maxToolSteps: number;
78
78
  preemptiveGeneration: boolean;
79
79
  userAwayTimeout?: number | null;
80
+ aecWarmupDuration: number | null;
80
81
  useTtsAlignedTranscript: boolean;
81
82
  }
82
83
 
@@ -90,6 +91,7 @@ const defaultVoiceOptions: VoiceOptions = {
90
91
  maxToolSteps: 3,
91
92
  preemptiveGeneration: false,
92
93
  userAwayTimeout: 15.0,
94
+ aecWarmupDuration: 3000,
93
95
  useTtsAlignedTranscript: true,
94
96
  } as const;
95
97
 
@@ -158,6 +160,8 @@ export class AgentSession<
158
160
  private closingTask: Promise<void> | null = null;
159
161
  private userAwayTimer: NodeJS.Timeout | null = null;
160
162
 
163
+ private _aecWarmupTimer: NodeJS.Timeout | null = null;
164
+
161
165
  // Connection options for STT, LLM, and TTS
162
166
  private _connOptions: ResolvedSessionConnectOptions;
163
167
 
@@ -169,6 +173,9 @@ export class AgentSession<
169
173
  private userSpeakingSpan?: Span;
170
174
  private agentSpeakingSpan?: Span;
171
175
 
176
+ /** @internal */
177
+ _aecWarmupRemaining = 0;
178
+
172
179
  /** @internal */
173
180
  _recorderIO?: RecorderIO;
174
181
 
@@ -241,6 +248,7 @@ export class AgentSession<
241
248
  // This is the "global" chat context, it holds the entire conversation history
242
249
  this._chatCtx = ChatContext.empty();
243
250
  this.options = { ...defaultVoiceOptions, ...voiceOptions };
251
+ this._aecWarmupRemaining = this.options.aecWarmupDuration ?? 0;
244
252
 
245
253
  this._onUserInputTranscribed = this._onUserInputTranscribed.bind(this);
246
254
  this.on(AgentSessionEventTypes.UserInputTranscribed, this._onUserInputTranscribed);
@@ -774,7 +782,9 @@ export class AgentSession<
774
782
  if (this.closingTask) {
775
783
  return;
776
784
  }
777
- this.closeImpl(reason, error, drain);
785
+ this.closingTask = this.closeImpl(reason, error, drain).finally(() => {
786
+ this.closingTask = null;
787
+ });
778
788
  }
779
789
 
780
790
  /** @internal */
@@ -845,6 +855,14 @@ export class AgentSession<
845
855
  this.agentSpeakingSpan = undefined;
846
856
  }
847
857
 
858
+ if (state === 'speaking' && this._aecWarmupRemaining > 0 && this._aecWarmupTimer === null) {
859
+ this._aecWarmupTimer = setTimeout(() => this._onAecWarmupExpired(), this._aecWarmupRemaining);
860
+ this.logger.debug(
861
+ { warmupDurationMs: this._aecWarmupRemaining },
862
+ 'aec warmup active, disabling interruptions',
863
+ );
864
+ }
865
+
848
866
  const oldState = this._agentState;
849
867
  this._agentState = state;
850
868
 
@@ -938,6 +956,19 @@ export class AgentSession<
938
956
  }
939
957
  }
940
958
 
959
+ /** @internal */
960
+ _onAecWarmupExpired(): void {
961
+ if (this._aecWarmupRemaining > 0) {
962
+ this.logger.debug('aec warmup expired, re-enabling interruptions');
963
+ }
964
+
965
+ this._aecWarmupRemaining = 0;
966
+ if (this._aecWarmupTimer !== null) {
967
+ clearTimeout(this._aecWarmupTimer);
968
+ this._aecWarmupTimer = null;
969
+ }
970
+ }
971
+
941
972
  private _onUserInputTranscribed(ev: UserInputTranscribedEvent): void {
942
973
  if (this.userState === 'away' && ev.isFinal) {
943
974
  this.logger.debug('User returned from away state due to speech input');
@@ -969,6 +1000,7 @@ export class AgentSession<
969
1000
  }
970
1001
 
971
1002
  this._cancelUserAwayTimer();
1003
+ this._onAecWarmupExpired();
972
1004
  this.off(AgentSessionEventTypes.UserInputTranscribed, this._onUserInputTranscribed);
973
1005
 
974
1006
  if (this.activity) {
@@ -976,7 +1008,6 @@ export class AgentSession<
976
1008
  try {
977
1009
  await this.activity.interrupt({ force: true }).await;
978
1010
  } catch (error) {
979
- // Uninterruptible speech can throw during forced interruption.
980
1011
  this.logger.warn({ error }, 'Error interrupting activity');
981
1012
  }
982
1013
  }
package/src/worker.ts CHANGED
@@ -13,8 +13,8 @@ import {
13
13
  import type { ParticipantInfo } from 'livekit-server-sdk';
14
14
  import { AccessToken, RoomServiceClient } from 'livekit-server-sdk';
15
15
  import { EventEmitter } from 'node:events';
16
- import os from 'node:os';
17
16
  import { WebSocket } from 'ws';
17
+ import { getCpuMonitor } from './cpu.js';
18
18
  import { HTTPServer } from './http_server.js';
19
19
  import { InferenceRunner } from './inference_runner.js';
20
20
  import { InferenceProcExecutor } from './ipc/inference_proc_executor.js';
@@ -79,32 +79,11 @@ const defaultRequestFunc = async (ctx: JobRequest) => {
79
79
  await ctx.accept();
80
80
  };
81
81
 
82
- // eslint-disable-next-line @typescript-eslint/no-unused-vars
83
- const defaultCpuLoad = async (worker: AgentServer): Promise<number> => {
84
- return new Promise((resolve) => {
85
- const cpus1 = os.cpus();
86
-
87
- setTimeout(() => {
88
- const cpus2 = os.cpus();
89
-
90
- let idle = 0;
91
- let total = 0;
92
-
93
- for (let i = 0; i < cpus1.length; i++) {
94
- const cpu1 = cpus1[i]!.times;
95
- const cpu2 = cpus2[i]!.times;
96
-
97
- idle += cpu2.idle - cpu1.idle;
98
-
99
- const total1 = Object.values(cpu1).reduce((acc, i) => acc + i, 0);
100
- const total2 = Object.values(cpu2).reduce((acc, i) => acc + i, 0);
82
+ const cpuMonitor = getCpuMonitor();
101
83
 
102
- total += total2 - total1;
103
- }
104
-
105
- resolve(+(1 - idle / total).toFixed(2));
106
- }, UPDATE_LOAD_INTERVAL);
107
- });
84
+ // eslint-disable-next-line @typescript-eslint/no-unused-vars
85
+ const defaultCpuLoad = async (_worker: AgentServer): Promise<number> => {
86
+ return cpuMonitor.cpuPercent(UPDATE_LOAD_INTERVAL);
108
87
  };
109
88
 
110
89
  /** Participant permissions to pass to every agent spun up by this worker. */
@@ -651,33 +630,38 @@ export class AgentServer {
651
630
  if (closingWS) clearInterval(loadMonitor);
652
631
 
653
632
  const oldStatus = currentStatus;
654
- this.#opts.loadFunc(this).then((currentLoad: number) => {
655
- const isFull = currentLoad >= this.#opts.loadThreshold;
656
- const currentlyAvailable = !isFull;
657
- currentStatus = currentlyAvailable ? WorkerStatus.WS_AVAILABLE : WorkerStatus.WS_FULL;
658
-
659
- if (oldStatus != currentStatus) {
660
- const extra = { load: currentLoad, loadThreshold: this.#opts.loadThreshold };
661
- if (isFull) {
662
- this.#logger.child(extra).info('worker is at full capacity, marking as unavailable');
663
- } else {
664
- this.#logger.child(extra).info('worker is below capacity, marking as available');
633
+ this.#opts
634
+ .loadFunc(this)
635
+ .then((currentLoad: number) => {
636
+ const isFull = currentLoad >= this.#opts.loadThreshold;
637
+ const currentlyAvailable = !isFull;
638
+ currentStatus = currentlyAvailable ? WorkerStatus.WS_AVAILABLE : WorkerStatus.WS_FULL;
639
+
640
+ if (oldStatus != currentStatus) {
641
+ const extra = { load: currentLoad, loadThreshold: this.#opts.loadThreshold };
642
+ if (isFull) {
643
+ this.#logger.child(extra).info('worker is at full capacity, marking as unavailable');
644
+ } else {
645
+ this.#logger.child(extra).info('worker is below capacity, marking as available');
646
+ }
665
647
  }
666
- }
667
648
 
668
- this.event.emit(
669
- 'worker_msg',
670
- new WorkerMessage({
671
- message: {
672
- case: 'updateWorker',
673
- value: {
674
- load: currentLoad,
675
- status: currentStatus,
649
+ this.event.emit(
650
+ 'worker_msg',
651
+ new WorkerMessage({
652
+ message: {
653
+ case: 'updateWorker',
654
+ value: {
655
+ load: currentLoad,
656
+ status: currentStatus,
657
+ },
676
658
  },
677
- },
678
- }),
679
- );
680
- });
659
+ }),
660
+ );
661
+ })
662
+ .catch((e) => {
663
+ this.#logger.warn({ error: e }, 'failed to measure CPU load');
664
+ });
681
665
  }, UPDATE_LOAD_INTERVAL);
682
666
 
683
667
  await close;