@livekit/agents 1.0.48 → 1.0.49
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/dist/cpu.cjs +189 -0
- package/dist/cpu.cjs.map +1 -0
- package/dist/cpu.d.cts +24 -0
- package/dist/cpu.d.ts +24 -0
- package/dist/cpu.d.ts.map +1 -0
- package/dist/cpu.js +152 -0
- package/dist/cpu.js.map +1 -0
- package/dist/cpu.test.cjs +227 -0
- package/dist/cpu.test.cjs.map +1 -0
- package/dist/cpu.test.js +204 -0
- package/dist/cpu.test.js.map +1 -0
- package/dist/inference/llm.cjs.map +1 -1
- package/dist/inference/llm.d.cts +1 -1
- package/dist/inference/llm.d.ts +1 -1
- package/dist/inference/llm.d.ts.map +1 -1
- package/dist/inference/llm.js.map +1 -1
- package/dist/inference/tts.cjs.map +1 -1
- package/dist/inference/tts.d.cts +6 -0
- package/dist/inference/tts.d.ts +6 -0
- package/dist/inference/tts.d.ts.map +1 -1
- package/dist/inference/tts.js.map +1 -1
- package/dist/version.cjs +1 -1
- package/dist/version.js +1 -1
- package/dist/voice/agent_activity.cjs +36 -8
- package/dist/voice/agent_activity.cjs.map +1 -1
- package/dist/voice/agent_activity.d.ts.map +1 -1
- package/dist/voice/agent_activity.js +37 -9
- package/dist/voice/agent_activity.js.map +1 -1
- package/dist/voice/agent_session.cjs +27 -1
- package/dist/voice/agent_session.cjs.map +1 -1
- package/dist/voice/agent_session.d.cts +6 -0
- package/dist/voice/agent_session.d.ts +6 -0
- package/dist/voice/agent_session.d.ts.map +1 -1
- package/dist/voice/agent_session.js +27 -1
- package/dist/voice/agent_session.js.map +1 -1
- package/dist/worker.cjs +6 -29
- package/dist/worker.cjs.map +1 -1
- package/dist/worker.d.ts.map +1 -1
- package/dist/worker.js +6 -19
- package/dist/worker.js.map +1 -1
- package/package.json +1 -1
- package/src/cpu.test.ts +239 -0
- package/src/cpu.ts +173 -0
- package/src/inference/llm.ts +2 -0
- package/src/inference/tts.ts +8 -1
- package/src/voice/agent_activity.ts +58 -10
- package/src/voice/agent_session.ts +33 -2
- package/src/worker.ts +34 -50
package/src/voice/agent_activity.ts
CHANGED
|
@@ -7,7 +7,7 @@ import type { Span } from '@opentelemetry/api';
|
|
|
7
7
|
import { ROOT_CONTEXT, context as otelContext, trace } from '@opentelemetry/api';
|
|
8
8
|
import { Heap } from 'heap-js';
|
|
9
9
|
import { AsyncLocalStorage } from 'node:async_hooks';
|
|
10
|
-
import { ReadableStream } from 'node:stream/web';
|
|
10
|
+
import { ReadableStream, TransformStream } from 'node:stream/web';
|
|
11
11
|
import { type ChatContext, ChatMessage } from '../llm/chat_context.js';
|
|
12
12
|
import {
|
|
13
13
|
type ChatItem,
|
|
@@ -485,15 +485,36 @@ export class AgentActivity implements RecognitionHooks {
|
|
|
485
485
|
void this.audioStream.close();
|
|
486
486
|
this.audioStream = new MultiInputStream<AudioFrame>();
|
|
487
487
|
|
|
488
|
+
// Filter is applied on this.audioStream.stream (downstream of MultiInputStream) rather
|
|
489
|
+
// than on the source audioStream via pipeThrough. pipeThrough locks its source stream, so
|
|
490
|
+
// if it were applied directly on audioStream, that lock would survive MultiInputStream.close()
|
|
491
|
+
// and make audioStream permanently locked for subsequent attachAudioInput calls (e.g. handoff).
|
|
492
|
+
const aecWarmupAudioFilter = new TransformStream<AudioFrame, AudioFrame>({
|
|
493
|
+
transform: (frame, controller) => {
|
|
494
|
+
const shouldDiscardForAecWarmup =
|
|
495
|
+
this.agentSession.agentState === 'speaking' && this.agentSession._aecWarmupRemaining > 0;
|
|
496
|
+
if (!shouldDiscardForAecWarmup) {
|
|
497
|
+
controller.enqueue(frame);
|
|
498
|
+
}
|
|
499
|
+
},
|
|
500
|
+
});
|
|
501
|
+
|
|
488
502
|
this.audioStreamId = this.audioStream.addInputStream(audioStream);
|
|
489
|
-
const [realtimeAudioStream, recognitionAudioStream] = this.audioStream.stream.tee();
|
|
490
503
|
|
|
491
|
-
if (this.realtimeSession) {
|
|
504
|
+
if (this.realtimeSession && this.audioRecognition) {
|
|
505
|
+
const [realtimeAudioStream, recognitionAudioStream] = this.audioStream.stream
|
|
506
|
+
.pipeThrough(aecWarmupAudioFilter)
|
|
507
|
+
.tee();
|
|
492
508
|
this.realtimeSession.setInputAudioStream(realtimeAudioStream);
|
|
493
|
-
}
|
|
494
|
-
|
|
495
|
-
if (this.audioRecognition) {
|
|
496
509
|
this.audioRecognition.setInputAudioStream(recognitionAudioStream);
|
|
510
|
+
} else if (this.realtimeSession) {
|
|
511
|
+
this.realtimeSession.setInputAudioStream(
|
|
512
|
+
this.audioStream.stream.pipeThrough(aecWarmupAudioFilter),
|
|
513
|
+
);
|
|
514
|
+
} else if (this.audioRecognition) {
|
|
515
|
+
this.audioRecognition.setInputAudioStream(
|
|
516
|
+
this.audioStream.stream.pipeThrough(aecWarmupAudioFilter),
|
|
517
|
+
);
|
|
497
518
|
}
|
|
498
519
|
}
|
|
499
520
|
|
|
@@ -755,6 +776,11 @@ export class AgentActivity implements RecognitionHooks {
|
|
|
755
776
|
}
|
|
756
777
|
|
|
757
778
|
private interruptByAudioActivity(): void {
|
|
779
|
+
if (this.agentSession._aecWarmupRemaining > 0) {
|
|
780
|
+
// Disable interruption from audio activity while AEC warmup is active.
|
|
781
|
+
return;
|
|
782
|
+
}
|
|
783
|
+
|
|
758
784
|
if (this.llm instanceof RealtimeModel && this.llm.capabilities.turnDetection) {
|
|
759
785
|
// skip speech handle interruption if server side turn detection is enabled
|
|
760
786
|
return;
|
|
@@ -1210,7 +1236,24 @@ export class AgentActivity implements RecognitionHooks {
|
|
|
1210
1236
|
|
|
1211
1237
|
this.realtimeSession?.interrupt();
|
|
1212
1238
|
|
|
1213
|
-
if (
|
|
1239
|
+
if (force) {
|
|
1240
|
+
// Force-interrupt (used during shutdown): cancel all speech tasks so they
|
|
1241
|
+
// don't block on I/O that will never complete (e.g. audioOutput.waitForPlayout()
|
|
1242
|
+
// when the room is disconnected). Mark the current speech as done immediately
|
|
1243
|
+
// so the interrupt future resolves without waiting for tasks to finish.
|
|
1244
|
+
// Clear the queue so mainTask doesn't dequeue already-interrupted handles
|
|
1245
|
+
// and hang on _waitForGeneration() (the generation future created by
|
|
1246
|
+
// _authorizeGeneration would never resolve since _markDone is a no-op
|
|
1247
|
+
// once doneFut is already settled).
|
|
1248
|
+
for (const task of this.speechTasks) {
|
|
1249
|
+
task.cancel();
|
|
1250
|
+
}
|
|
1251
|
+
if (currentSpeech && !currentSpeech.done()) {
|
|
1252
|
+
currentSpeech._markDone();
|
|
1253
|
+
}
|
|
1254
|
+
this.speechQueue.clear();
|
|
1255
|
+
future.resolve();
|
|
1256
|
+
} else if (currentSpeech === undefined) {
|
|
1214
1257
|
future.resolve();
|
|
1215
1258
|
} else {
|
|
1216
1259
|
currentSpeech.addDoneCallback(() => {
|
|
@@ -1718,9 +1761,7 @@ export class AgentActivity implements RecognitionHooks {
|
|
|
1718
1761
|
}
|
|
1719
1762
|
|
|
1720
1763
|
replyAbortController.abort();
|
|
1721
|
-
await
|
|
1722
|
-
tasks.map((task) => task.cancelAndWait(AgentActivity.REPLY_TASK_CANCEL_TIMEOUT)),
|
|
1723
|
-
);
|
|
1764
|
+
await cancelAndWait(tasks, AgentActivity.REPLY_TASK_CANCEL_TIMEOUT);
|
|
1724
1765
|
|
|
1725
1766
|
let forwardedText = textOut?.text || '';
|
|
1726
1767
|
|
|
@@ -2549,6 +2590,13 @@ export class AgentActivity implements RecognitionHooks {
|
|
|
2549
2590
|
const unlock = await this.lock.lock();
|
|
2550
2591
|
try {
|
|
2551
2592
|
this.cancelPreemptiveGeneration();
|
|
2593
|
+
|
|
2594
|
+
await cancelAndWait(Array.from(this.speechTasks), AgentActivity.REPLY_TASK_CANCEL_TIMEOUT);
|
|
2595
|
+
|
|
2596
|
+
if (this._currentSpeech && !this._currentSpeech.done()) {
|
|
2597
|
+
this._currentSpeech._markDone();
|
|
2598
|
+
}
|
|
2599
|
+
|
|
2552
2600
|
await this._closeSessionResources();
|
|
2553
2601
|
|
|
2554
2602
|
if (this._mainTask) {
|
package/src/voice/agent_session.ts
CHANGED
|
@@ -77,6 +77,7 @@ export interface VoiceOptions {
|
|
|
77
77
|
maxToolSteps: number;
|
|
78
78
|
preemptiveGeneration: boolean;
|
|
79
79
|
userAwayTimeout?: number | null;
|
|
80
|
+
aecWarmupDuration: number | null;
|
|
80
81
|
useTtsAlignedTranscript: boolean;
|
|
81
82
|
}
|
|
82
83
|
|
|
@@ -90,6 +91,7 @@ const defaultVoiceOptions: VoiceOptions = {
|
|
|
90
91
|
maxToolSteps: 3,
|
|
91
92
|
preemptiveGeneration: false,
|
|
92
93
|
userAwayTimeout: 15.0,
|
|
94
|
+
aecWarmupDuration: 3000,
|
|
93
95
|
useTtsAlignedTranscript: true,
|
|
94
96
|
} as const;
|
|
95
97
|
|
|
@@ -158,6 +160,8 @@ export class AgentSession<
|
|
|
158
160
|
private closingTask: Promise<void> | null = null;
|
|
159
161
|
private userAwayTimer: NodeJS.Timeout | null = null;
|
|
160
162
|
|
|
163
|
+
private _aecWarmupTimer: NodeJS.Timeout | null = null;
|
|
164
|
+
|
|
161
165
|
// Connection options for STT, LLM, and TTS
|
|
162
166
|
private _connOptions: ResolvedSessionConnectOptions;
|
|
163
167
|
|
|
@@ -169,6 +173,9 @@ export class AgentSession<
|
|
|
169
173
|
private userSpeakingSpan?: Span;
|
|
170
174
|
private agentSpeakingSpan?: Span;
|
|
171
175
|
|
|
176
|
+
/** @internal */
|
|
177
|
+
_aecWarmupRemaining = 0;
|
|
178
|
+
|
|
172
179
|
/** @internal */
|
|
173
180
|
_recorderIO?: RecorderIO;
|
|
174
181
|
|
|
@@ -241,6 +248,7 @@ export class AgentSession<
|
|
|
241
248
|
// This is the "global" chat context, it holds the entire conversation history
|
|
242
249
|
this._chatCtx = ChatContext.empty();
|
|
243
250
|
this.options = { ...defaultVoiceOptions, ...voiceOptions };
|
|
251
|
+
this._aecWarmupRemaining = this.options.aecWarmupDuration ?? 0;
|
|
244
252
|
|
|
245
253
|
this._onUserInputTranscribed = this._onUserInputTranscribed.bind(this);
|
|
246
254
|
this.on(AgentSessionEventTypes.UserInputTranscribed, this._onUserInputTranscribed);
|
|
@@ -774,7 +782,9 @@ export class AgentSession<
|
|
|
774
782
|
if (this.closingTask) {
|
|
775
783
|
return;
|
|
776
784
|
}
|
|
777
|
-
this.closeImpl(reason, error, drain)
|
|
785
|
+
this.closingTask = this.closeImpl(reason, error, drain).finally(() => {
|
|
786
|
+
this.closingTask = null;
|
|
787
|
+
});
|
|
778
788
|
}
|
|
779
789
|
|
|
780
790
|
/** @internal */
|
|
@@ -845,6 +855,14 @@ export class AgentSession<
|
|
|
845
855
|
this.agentSpeakingSpan = undefined;
|
|
846
856
|
}
|
|
847
857
|
|
|
858
|
+
if (state === 'speaking' && this._aecWarmupRemaining > 0 && this._aecWarmupTimer === null) {
|
|
859
|
+
this._aecWarmupTimer = setTimeout(() => this._onAecWarmupExpired(), this._aecWarmupRemaining);
|
|
860
|
+
this.logger.debug(
|
|
861
|
+
{ warmupDurationMs: this._aecWarmupRemaining },
|
|
862
|
+
'aec warmup active, disabling interruptions',
|
|
863
|
+
);
|
|
864
|
+
}
|
|
865
|
+
|
|
848
866
|
const oldState = this._agentState;
|
|
849
867
|
this._agentState = state;
|
|
850
868
|
|
|
@@ -938,6 +956,19 @@ export class AgentSession<
|
|
|
938
956
|
}
|
|
939
957
|
}
|
|
940
958
|
|
|
959
|
+
/** @internal */
|
|
960
|
+
_onAecWarmupExpired(): void {
|
|
961
|
+
if (this._aecWarmupRemaining > 0) {
|
|
962
|
+
this.logger.debug('aec warmup expired, re-enabling interruptions');
|
|
963
|
+
}
|
|
964
|
+
|
|
965
|
+
this._aecWarmupRemaining = 0;
|
|
966
|
+
if (this._aecWarmupTimer !== null) {
|
|
967
|
+
clearTimeout(this._aecWarmupTimer);
|
|
968
|
+
this._aecWarmupTimer = null;
|
|
969
|
+
}
|
|
970
|
+
}
|
|
971
|
+
|
|
941
972
|
private _onUserInputTranscribed(ev: UserInputTranscribedEvent): void {
|
|
942
973
|
if (this.userState === 'away' && ev.isFinal) {
|
|
943
974
|
this.logger.debug('User returned from away state due to speech input');
|
|
@@ -969,6 +1000,7 @@ export class AgentSession<
|
|
|
969
1000
|
}
|
|
970
1001
|
|
|
971
1002
|
this._cancelUserAwayTimer();
|
|
1003
|
+
this._onAecWarmupExpired();
|
|
972
1004
|
this.off(AgentSessionEventTypes.UserInputTranscribed, this._onUserInputTranscribed);
|
|
973
1005
|
|
|
974
1006
|
if (this.activity) {
|
|
@@ -976,7 +1008,6 @@ export class AgentSession<
|
|
|
976
1008
|
try {
|
|
977
1009
|
await this.activity.interrupt({ force: true }).await;
|
|
978
1010
|
} catch (error) {
|
|
979
|
-
// Uninterruptible speech can throw during forced interruption.
|
|
980
1011
|
this.logger.warn({ error }, 'Error interrupting activity');
|
|
981
1012
|
}
|
|
982
1013
|
}
|
package/src/worker.ts
CHANGED
|
@@ -13,8 +13,8 @@ import {
|
|
|
13
13
|
import type { ParticipantInfo } from 'livekit-server-sdk';
|
|
14
14
|
import { AccessToken, RoomServiceClient } from 'livekit-server-sdk';
|
|
15
15
|
import { EventEmitter } from 'node:events';
|
|
16
|
-
import os from 'node:os';
|
|
17
16
|
import { WebSocket } from 'ws';
|
|
17
|
+
import { getCpuMonitor } from './cpu.js';
|
|
18
18
|
import { HTTPServer } from './http_server.js';
|
|
19
19
|
import { InferenceRunner } from './inference_runner.js';
|
|
20
20
|
import { InferenceProcExecutor } from './ipc/inference_proc_executor.js';
|
|
@@ -79,32 +79,11 @@ const defaultRequestFunc = async (ctx: JobRequest) => {
|
|
|
79
79
|
await ctx.accept();
|
|
80
80
|
};
|
|
81
81
|
|
|
82
|
-
|
|
83
|
-
const defaultCpuLoad = async (worker: AgentServer): Promise<number> => {
|
|
84
|
-
return new Promise((resolve) => {
|
|
85
|
-
const cpus1 = os.cpus();
|
|
86
|
-
|
|
87
|
-
setTimeout(() => {
|
|
88
|
-
const cpus2 = os.cpus();
|
|
89
|
-
|
|
90
|
-
let idle = 0;
|
|
91
|
-
let total = 0;
|
|
92
|
-
|
|
93
|
-
for (let i = 0; i < cpus1.length; i++) {
|
|
94
|
-
const cpu1 = cpus1[i]!.times;
|
|
95
|
-
const cpu2 = cpus2[i]!.times;
|
|
96
|
-
|
|
97
|
-
idle += cpu2.idle - cpu1.idle;
|
|
98
|
-
|
|
99
|
-
const total1 = Object.values(cpu1).reduce((acc, i) => acc + i, 0);
|
|
100
|
-
const total2 = Object.values(cpu2).reduce((acc, i) => acc + i, 0);
|
|
82
|
+
const cpuMonitor = getCpuMonitor();
|
|
101
83
|
|
|
102
|
-
|
|
103
|
-
|
|
104
|
-
|
|
105
|
-
resolve(+(1 - idle / total).toFixed(2));
|
|
106
|
-
}, UPDATE_LOAD_INTERVAL);
|
|
107
|
-
});
|
|
84
|
+
// eslint-disable-next-line @typescript-eslint/no-unused-vars
|
|
85
|
+
const defaultCpuLoad = async (_worker: AgentServer): Promise<number> => {
|
|
86
|
+
return cpuMonitor.cpuPercent(UPDATE_LOAD_INTERVAL);
|
|
108
87
|
};
|
|
109
88
|
|
|
110
89
|
/** Participant permissions to pass to every agent spun up by this worker. */
|
|
@@ -651,33 +630,38 @@ export class AgentServer {
|
|
|
651
630
|
if (closingWS) clearInterval(loadMonitor);
|
|
652
631
|
|
|
653
632
|
const oldStatus = currentStatus;
|
|
654
|
-
this.#opts
|
|
655
|
-
|
|
656
|
-
|
|
657
|
-
|
|
658
|
-
|
|
659
|
-
|
|
660
|
-
|
|
661
|
-
if (
|
|
662
|
-
|
|
663
|
-
|
|
664
|
-
|
|
633
|
+
this.#opts
|
|
634
|
+
.loadFunc(this)
|
|
635
|
+
.then((currentLoad: number) => {
|
|
636
|
+
const isFull = currentLoad >= this.#opts.loadThreshold;
|
|
637
|
+
const currentlyAvailable = !isFull;
|
|
638
|
+
currentStatus = currentlyAvailable ? WorkerStatus.WS_AVAILABLE : WorkerStatus.WS_FULL;
|
|
639
|
+
|
|
640
|
+
if (oldStatus != currentStatus) {
|
|
641
|
+
const extra = { load: currentLoad, loadThreshold: this.#opts.loadThreshold };
|
|
642
|
+
if (isFull) {
|
|
643
|
+
this.#logger.child(extra).info('worker is at full capacity, marking as unavailable');
|
|
644
|
+
} else {
|
|
645
|
+
this.#logger.child(extra).info('worker is below capacity, marking as available');
|
|
646
|
+
}
|
|
665
647
|
}
|
|
666
|
-
}
|
|
667
648
|
|
|
668
|
-
|
|
669
|
-
|
|
670
|
-
|
|
671
|
-
|
|
672
|
-
|
|
673
|
-
|
|
674
|
-
|
|
675
|
-
|
|
649
|
+
this.event.emit(
|
|
650
|
+
'worker_msg',
|
|
651
|
+
new WorkerMessage({
|
|
652
|
+
message: {
|
|
653
|
+
case: 'updateWorker',
|
|
654
|
+
value: {
|
|
655
|
+
load: currentLoad,
|
|
656
|
+
status: currentStatus,
|
|
657
|
+
},
|
|
676
658
|
},
|
|
677
|
-
},
|
|
678
|
-
|
|
679
|
-
)
|
|
680
|
-
|
|
659
|
+
}),
|
|
660
|
+
);
|
|
661
|
+
})
|
|
662
|
+
.catch((e) => {
|
|
663
|
+
this.#logger.warn({ error: e }, 'failed to measure CPU load');
|
|
664
|
+
});
|
|
681
665
|
}, UPDATE_LOAD_INTERVAL);
|
|
682
666
|
|
|
683
667
|
await close;
|