@livekit/agents 1.2.1 → 1.2.2
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/dist/audio.cjs +10 -0
- package/dist/audio.cjs.map +1 -1
- package/dist/audio.d.cts +1 -1
- package/dist/audio.d.ts +1 -1
- package/dist/audio.d.ts.map +1 -1
- package/dist/audio.js +10 -0
- package/dist/audio.js.map +1 -1
- package/dist/inference/api_protos.d.cts +26 -26
- package/dist/inference/api_protos.d.ts +26 -26
- package/dist/inference/tts.cjs +14 -1
- package/dist/inference/tts.cjs.map +1 -1
- package/dist/inference/tts.d.ts.map +1 -1
- package/dist/inference/tts.js +24 -3
- package/dist/inference/tts.js.map +1 -1
- package/dist/ipc/job_proc_lazy_main.cjs +7 -2
- package/dist/ipc/job_proc_lazy_main.cjs.map +1 -1
- package/dist/ipc/job_proc_lazy_main.js +7 -2
- package/dist/ipc/job_proc_lazy_main.js.map +1 -1
- package/dist/ipc/supervised_proc.cjs +4 -1
- package/dist/ipc/supervised_proc.cjs.map +1 -1
- package/dist/ipc/supervised_proc.d.ts.map +1 -1
- package/dist/ipc/supervised_proc.js +4 -1
- package/dist/ipc/supervised_proc.js.map +1 -1
- package/dist/ipc/supervised_proc.test.cjs +82 -0
- package/dist/ipc/supervised_proc.test.cjs.map +1 -1
- package/dist/ipc/supervised_proc.test.js +82 -0
- package/dist/ipc/supervised_proc.test.js.map +1 -1
- package/dist/job.cjs +2 -1
- package/dist/job.cjs.map +1 -1
- package/dist/job.d.ts.map +1 -1
- package/dist/job.js +2 -1
- package/dist/job.js.map +1 -1
- package/dist/utils.cjs +28 -0
- package/dist/utils.cjs.map +1 -1
- package/dist/utils.d.cts +18 -0
- package/dist/utils.d.ts +18 -0
- package/dist/utils.d.ts.map +1 -1
- package/dist/utils.js +25 -0
- package/dist/utils.js.map +1 -1
- package/dist/version.cjs +1 -1
- package/dist/version.js +1 -1
- package/dist/voice/agent_activity.cjs +10 -0
- package/dist/voice/agent_activity.cjs.map +1 -1
- package/dist/voice/agent_activity.d.ts.map +1 -1
- package/dist/voice/agent_activity.js +11 -0
- package/dist/voice/agent_activity.js.map +1 -1
- package/dist/voice/agent_session.cjs +1 -1
- package/dist/voice/agent_session.cjs.map +1 -1
- package/dist/voice/agent_session.d.cts +4 -2
- package/dist/voice/agent_session.d.ts +4 -2
- package/dist/voice/agent_session.d.ts.map +1 -1
- package/dist/voice/agent_session.js +1 -1
- package/dist/voice/agent_session.js.map +1 -1
- package/dist/voice/events.cjs +11 -0
- package/dist/voice/events.cjs.map +1 -1
- package/dist/voice/events.d.cts +12 -1
- package/dist/voice/events.d.ts +12 -1
- package/dist/voice/events.d.ts.map +1 -1
- package/dist/voice/events.js +10 -0
- package/dist/voice/events.js.map +1 -1
- package/dist/voice/generation.cjs +23 -4
- package/dist/voice/generation.cjs.map +1 -1
- package/dist/voice/generation.d.ts.map +1 -1
- package/dist/voice/generation.js +32 -5
- package/dist/voice/generation.js.map +1 -1
- package/dist/voice/generation_tts_timeout.test.cjs +85 -0
- package/dist/voice/generation_tts_timeout.test.cjs.map +1 -0
- package/dist/voice/generation_tts_timeout.test.js +84 -0
- package/dist/voice/generation_tts_timeout.test.js.map +1 -0
- package/dist/voice/index.cjs.map +1 -1
- package/dist/voice/index.d.cts +1 -1
- package/dist/voice/index.d.ts +1 -1
- package/dist/voice/index.d.ts.map +1 -1
- package/dist/voice/index.js +3 -1
- package/dist/voice/index.js.map +1 -1
- package/dist/voice/recorder_io/recorder_io.cjs +1 -2
- package/dist/voice/recorder_io/recorder_io.cjs.map +1 -1
- package/dist/voice/recorder_io/recorder_io.d.ts.map +1 -1
- package/dist/voice/recorder_io/recorder_io.js +2 -3
- package/dist/voice/recorder_io/recorder_io.js.map +1 -1
- package/dist/voice/report.cjs +1 -1
- package/dist/voice/report.cjs.map +1 -1
- package/dist/voice/report.js +1 -1
- package/dist/voice/report.js.map +1 -1
- package/dist/voice/report.test.cjs +70 -0
- package/dist/voice/report.test.cjs.map +1 -1
- package/dist/voice/report.test.js +70 -0
- package/dist/voice/report.test.js.map +1 -1
- package/dist/voice/room_io/room_io.cjs +5 -1
- package/dist/voice/room_io/room_io.cjs.map +1 -1
- package/dist/voice/room_io/room_io.d.ts.map +1 -1
- package/dist/voice/room_io/room_io.js +5 -1
- package/dist/voice/room_io/room_io.js.map +1 -1
- package/dist/voice/room_io/room_io.test.cjs +18 -0
- package/dist/voice/room_io/room_io.test.cjs.map +1 -0
- package/dist/voice/room_io/room_io.test.js +17 -0
- package/dist/voice/room_io/room_io.test.js.map +1 -0
- package/package.json +1 -1
- package/src/audio.ts +12 -1
- package/src/inference/tts.ts +25 -3
- package/src/ipc/job_proc_lazy_main.ts +7 -2
- package/src/ipc/supervised_proc.test.ts +96 -0
- package/src/ipc/supervised_proc.ts +8 -1
- package/src/job.ts +1 -0
- package/src/utils.ts +43 -0
- package/src/voice/agent_activity.ts +11 -0
- package/src/voice/agent_session.ts +13 -7
- package/src/voice/events.ts +21 -0
- package/src/voice/generation.ts +35 -8
- package/src/voice/generation_tts_timeout.test.ts +112 -0
- package/src/voice/index.ts +6 -1
- package/src/voice/recorder_io/recorder_io.ts +2 -7
- package/src/voice/report.test.ts +78 -0
- package/src/voice/report.ts +1 -1
- package/src/voice/room_io/room_io.test.ts +38 -0
- package/src/voice/room_io/room_io.ts +7 -2
|
@@ -125,6 +125,102 @@ describe('IPC send on dead process', () => {
|
|
|
125
125
|
});
|
|
126
126
|
});
|
|
127
127
|
|
|
128
|
+
describe('init timeout rejection handling', () => {
|
|
129
|
+
it('does not produce unhandled rejection when init times out', async () => {
|
|
130
|
+
// Regression test: before the fix, run() was called without await in start().
|
|
131
|
+
// When init timed out, the rejection in run()'s `await this.init.await` escaped
|
|
132
|
+
// as an unhandled rejection — crashing the Node.js process.
|
|
133
|
+
const unhandled: unknown[] = [];
|
|
134
|
+
const handler = (reason: unknown) => unhandled.push(reason);
|
|
135
|
+
process.on('unhandledRejection', handler);
|
|
136
|
+
|
|
137
|
+
// Child that responds AFTER the timeout — simulates slow init under CPU pressure.
|
|
138
|
+
// Timeout fires at 50ms (init.reject), child responds at 200ms (once() resolves).
|
|
139
|
+
// Before the fix, init.reject caused an unhandled rejection in run().
|
|
140
|
+
const slowScript = join(tmpdir(), 'test_slow_init_child.mjs');
|
|
141
|
+
writeFileSync(
|
|
142
|
+
slowScript,
|
|
143
|
+
`process.on('message', () => {
|
|
144
|
+
setTimeout(() => process.send({ case: 'initializeResponse' }), 200);
|
|
145
|
+
});
|
|
146
|
+
setInterval(() => {}, 1000);`,
|
|
147
|
+
);
|
|
148
|
+
|
|
149
|
+
const { SupervisedProc } = await import('./supervised_proc.js');
|
|
150
|
+
class TestProc extends SupervisedProc {
|
|
151
|
+
createProcess() {
|
|
152
|
+
return fork(slowScript, [], { stdio: ['pipe', 'pipe', 'pipe', 'ipc'] });
|
|
153
|
+
}
|
|
154
|
+
async mainTask() {}
|
|
155
|
+
}
|
|
156
|
+
|
|
157
|
+
const proc = new TestProc(
|
|
158
|
+
50, // initializeTimeout — fires before child responds at 200ms
|
|
159
|
+
1000, // closeTimeout
|
|
160
|
+
0, // memoryWarnMB
|
|
161
|
+
0, // memoryLimitMB
|
|
162
|
+
5000, // pingInterval
|
|
163
|
+
60000, // pingTimeout
|
|
164
|
+
2500, // highPingThreshold
|
|
165
|
+
);
|
|
166
|
+
|
|
167
|
+
await proc.start();
|
|
168
|
+
// initialize() returns normally: child responds at 200ms, once() resolves,
|
|
169
|
+
// but init was already rejected at 50ms — run() gets the rejection.
|
|
170
|
+
await proc.initialize();
|
|
171
|
+
|
|
172
|
+
// Give the event loop a tick for any unhandled rejection to surface
|
|
173
|
+
await new Promise((r) => setTimeout(r, 100));
|
|
174
|
+
|
|
175
|
+
process.off('unhandledRejection', handler);
|
|
176
|
+
proc.proc?.kill();
|
|
177
|
+
try {
|
|
178
|
+
unlinkSync(slowScript);
|
|
179
|
+
} catch {}
|
|
180
|
+
|
|
181
|
+
expect(unhandled).toEqual([]);
|
|
182
|
+
});
|
|
183
|
+
|
|
184
|
+
it('join() resolves after init timeout instead of hanging forever', async () => {
|
|
185
|
+
// When run() fails early (before registering proc event handlers),
|
|
186
|
+
// #join must still resolve so that join() and close() don't hang.
|
|
187
|
+
const slowScript = join(tmpdir(), 'test_slow_init_child_join.mjs');
|
|
188
|
+
writeFileSync(
|
|
189
|
+
slowScript,
|
|
190
|
+
`process.on('message', () => {
|
|
191
|
+
setTimeout(() => process.send({ case: 'initializeResponse' }), 200);
|
|
192
|
+
});
|
|
193
|
+
setInterval(() => {}, 1000);`,
|
|
194
|
+
);
|
|
195
|
+
|
|
196
|
+
const { SupervisedProc } = await import('./supervised_proc.js');
|
|
197
|
+
class TestProc extends SupervisedProc {
|
|
198
|
+
createProcess() {
|
|
199
|
+
return fork(slowScript, [], { stdio: ['pipe', 'pipe', 'pipe', 'ipc'] });
|
|
200
|
+
}
|
|
201
|
+
async mainTask() {}
|
|
202
|
+
}
|
|
203
|
+
|
|
204
|
+
const proc = new TestProc(50, 1000, 0, 0, 5000, 60000, 2500);
|
|
205
|
+
|
|
206
|
+
await proc.start();
|
|
207
|
+
await proc.initialize();
|
|
208
|
+
|
|
209
|
+
// join() must resolve within a reasonable time, not hang forever
|
|
210
|
+
const result = await Promise.race([
|
|
211
|
+
proc.join().then(() => 'resolved'),
|
|
212
|
+
new Promise((r) => setTimeout(() => r('timeout'), 2000)),
|
|
213
|
+
]);
|
|
214
|
+
|
|
215
|
+
proc.proc?.kill();
|
|
216
|
+
try {
|
|
217
|
+
unlinkSync(slowScript);
|
|
218
|
+
} catch {}
|
|
219
|
+
|
|
220
|
+
expect(result).toBe('resolved');
|
|
221
|
+
});
|
|
222
|
+
});
|
|
223
|
+
|
|
128
224
|
describe('timer cleanup', () => {
|
|
129
225
|
it('clearInterval stops the interval', async () => {
|
|
130
226
|
let count = 0;
|
|
@@ -84,7 +84,14 @@ export abstract class SupervisedProc {
|
|
|
84
84
|
this.proc = this.createProcess();
|
|
85
85
|
|
|
86
86
|
this.#started = true;
|
|
87
|
-
this.run()
|
|
87
|
+
this.run().catch((err) => {
|
|
88
|
+
this.#logger.child({ err }).warn('supervised process run failed');
|
|
89
|
+
// Note: we intentionally do NOT kill the child process here. Killing it
|
|
90
|
+
// would race with initialize()'s `once(proc, 'message')`, causing
|
|
91
|
+
// initialize() to hang forever and deadlocking the caller (proc_pool).
|
|
92
|
+
// The child process is cleaned up when the pool shuts down.
|
|
93
|
+
this.#join.resolve();
|
|
94
|
+
});
|
|
88
95
|
}
|
|
89
96
|
|
|
90
97
|
async run() {
|
package/src/job.ts
CHANGED
package/src/utils.ts
CHANGED
|
@@ -9,6 +9,7 @@ import type {
|
|
|
9
9
|
TrackKind,
|
|
10
10
|
} from '@livekit/rtc-node';
|
|
11
11
|
import { AudioFrame, AudioResampler, RoomEvent } from '@livekit/rtc-node';
|
|
12
|
+
import type { Throws } from '@livekit/throws-transformer/throws';
|
|
12
13
|
import { AsyncLocalStorage } from 'node:async_hooks';
|
|
13
14
|
import { EventEmitter, once } from 'node:events';
|
|
14
15
|
import type { ReadableStream } from 'node:stream/web';
|
|
@@ -752,6 +753,21 @@ export function isStreamClosedError(error: unknown): boolean {
|
|
|
752
753
|
);
|
|
753
754
|
}
|
|
754
755
|
|
|
756
|
+
/** FFmpeg error messages expected during normal teardown/shutdown. */
|
|
757
|
+
const FFMPEG_TEARDOWN_ERRORS = ['Output stream closed', 'received signal 2', 'SIGKILL', 'SIGINT'];
|
|
758
|
+
|
|
759
|
+
/**
|
|
760
|
+
* Check if an error is an expected FFmpeg teardown error that can be safely ignored during cleanup.
|
|
761
|
+
*
|
|
762
|
+
* @param error - The error to check.
|
|
763
|
+
* @returns True if the error is an expected FFmpeg shutdown error.
|
|
764
|
+
*/
|
|
765
|
+
export function isFfmpegTeardownError(error: unknown): boolean {
|
|
766
|
+
return (
|
|
767
|
+
error instanceof Error && FFMPEG_TEARDOWN_ERRORS.some((msg) => error.message?.includes(msg))
|
|
768
|
+
);
|
|
769
|
+
}
|
|
770
|
+
|
|
755
771
|
/**
|
|
756
772
|
* In JS an error can be any arbitrary value.
|
|
757
773
|
* This function converts an unknown error to an Error and stores the original value in the error object.
|
|
@@ -804,6 +820,33 @@ export function delay(ms: number, options: DelayOptions = {}): Promise<void> {
|
|
|
804
820
|
});
|
|
805
821
|
}
|
|
806
822
|
|
|
823
|
+
export class IdleTimeoutError extends Error {
|
|
824
|
+
constructor(message = 'idle timeout') {
|
|
825
|
+
super(message);
|
|
826
|
+
this.name = 'IdleTimeoutError';
|
|
827
|
+
}
|
|
828
|
+
}
|
|
829
|
+
|
|
830
|
+
/**
|
|
831
|
+
* Race a promise against an idle timeout. If the promise does not settle within
|
|
832
|
+
* `timeoutMs` milliseconds, the returned promise rejects with {@link IdleTimeoutError}
|
|
833
|
+
* (or the error returned by `throwError` when provided).
|
|
834
|
+
* The timer is properly cleaned up on settlement to avoid leaking handles.
|
|
835
|
+
*/
|
|
836
|
+
export function waitUntilTimeout<T, E extends Error = IdleTimeoutError>(
|
|
837
|
+
promise: Promise<T>,
|
|
838
|
+
timeoutMs: number,
|
|
839
|
+
throwError?: () => E,
|
|
840
|
+
): Promise<Throws<T, E>> {
|
|
841
|
+
let timer: ReturnType<typeof setTimeout> | undefined;
|
|
842
|
+
return Promise.race([
|
|
843
|
+
promise,
|
|
844
|
+
new Promise<never>((_, reject) => {
|
|
845
|
+
timer = setTimeout(() => reject(throwError?.() ?? new IdleTimeoutError()), timeoutMs);
|
|
846
|
+
}),
|
|
847
|
+
]).finally(() => clearTimeout(timer)) as Promise<Throws<T, E>>;
|
|
848
|
+
}
|
|
849
|
+
|
|
807
850
|
/**
|
|
808
851
|
* Returns a participant that matches the given identity. If identity is None, the first
|
|
809
852
|
* participant that joins the room will be returned.
|
|
@@ -67,6 +67,7 @@ import {
|
|
|
67
67
|
createErrorEvent,
|
|
68
68
|
createFunctionToolsExecutedEvent,
|
|
69
69
|
createMetricsCollectedEvent,
|
|
70
|
+
createSessionUsageUpdatedEvent,
|
|
70
71
|
createSpeechCreatedEvent,
|
|
71
72
|
createUserInputTranscribedEvent,
|
|
72
73
|
} from './events.js';
|
|
@@ -157,10 +158,15 @@ export class AgentActivity implements RecognitionHooks {
|
|
|
157
158
|
|
|
158
159
|
private readonly onInterruptionMetricsCollected = (ev: InterruptionMetrics): void => {
|
|
159
160
|
this.agentSession._usageCollector.collect(ev);
|
|
161
|
+
const usage = this.agentSession.usage;
|
|
160
162
|
this.agentSession.emit(
|
|
161
163
|
AgentSessionEventTypes.MetricsCollected,
|
|
162
164
|
createMetricsCollectedEvent({ metrics: ev }),
|
|
163
165
|
);
|
|
166
|
+
this.agentSession.emit(
|
|
167
|
+
AgentSessionEventTypes.SessionUsageUpdated,
|
|
168
|
+
createSessionUsageUpdatedEvent({ usage }),
|
|
169
|
+
);
|
|
164
170
|
};
|
|
165
171
|
|
|
166
172
|
private readonly onInterruptionError = (ev: InterruptionDetectionError): void => {
|
|
@@ -730,11 +736,16 @@ export class AgentActivity implements RecognitionHooks {
|
|
|
730
736
|
}
|
|
731
737
|
|
|
732
738
|
this.agentSession._usageCollector.collect(ev);
|
|
739
|
+
const usage = this.agentSession.usage;
|
|
733
740
|
|
|
734
741
|
this.agentSession.emit(
|
|
735
742
|
AgentSessionEventTypes.MetricsCollected,
|
|
736
743
|
createMetricsCollectedEvent({ metrics: ev }),
|
|
737
744
|
);
|
|
745
|
+
this.agentSession.emit(
|
|
746
|
+
AgentSessionEventTypes.SessionUsageUpdated,
|
|
747
|
+
createSessionUsageUpdatedEvent({ usage }),
|
|
748
|
+
);
|
|
738
749
|
};
|
|
739
750
|
|
|
740
751
|
private onError(ev: RealtimeModelError | STTError | TTSError | LLMError): void {
|
|
@@ -52,6 +52,7 @@ import {
|
|
|
52
52
|
type ErrorEvent,
|
|
53
53
|
type FunctionToolsExecutedEvent,
|
|
54
54
|
type MetricsCollectedEvent,
|
|
55
|
+
type SessionUsageUpdatedEvent,
|
|
55
56
|
type ShutdownReason,
|
|
56
57
|
type SpeechCreatedEvent,
|
|
57
58
|
type UserInputTranscribedEvent,
|
|
@@ -131,6 +132,7 @@ export type AgentSessionCallbacks = {
|
|
|
131
132
|
[AgentSessionEventTypes.ConversationItemAdded]: (ev: ConversationItemAddedEvent) => void;
|
|
132
133
|
[AgentSessionEventTypes.FunctionToolsExecuted]: (ev: FunctionToolsExecutedEvent) => void;
|
|
133
134
|
[AgentSessionEventTypes.MetricsCollected]: (ev: MetricsCollectedEvent) => void;
|
|
135
|
+
[AgentSessionEventTypes.SessionUsageUpdated]: (ev: SessionUsageUpdatedEvent) => void;
|
|
134
136
|
[AgentSessionEventTypes.SpeechCreated]: (ev: SpeechCreatedEvent) => void;
|
|
135
137
|
[AgentSessionEventTypes.Error]: (ev: ErrorEvent) => void;
|
|
136
138
|
[AgentSessionEventTypes.Close]: (ev: CloseEvent) => void;
|
|
@@ -649,7 +651,8 @@ export class AgentSession<
|
|
|
649
651
|
}
|
|
650
652
|
|
|
651
653
|
generateReply(options?: {
|
|
652
|
-
userInput?: string;
|
|
654
|
+
userInput?: string | ChatMessage;
|
|
655
|
+
chatCtx?: ChatContext;
|
|
653
656
|
instructions?: string;
|
|
654
657
|
toolChoice?: ToolChoice;
|
|
655
658
|
allowInterruptions?: boolean;
|
|
@@ -658,12 +661,15 @@ export class AgentSession<
|
|
|
658
661
|
throw new Error('AgentSession is not running');
|
|
659
662
|
}
|
|
660
663
|
|
|
661
|
-
const userMessage =
|
|
662
|
-
|
|
663
|
-
|
|
664
|
-
|
|
665
|
-
|
|
666
|
-
|
|
664
|
+
const userMessage =
|
|
665
|
+
options?.userInput instanceof ChatMessage
|
|
666
|
+
? options.userInput
|
|
667
|
+
: options?.userInput
|
|
668
|
+
? new ChatMessage({
|
|
669
|
+
role: 'user',
|
|
670
|
+
content: options.userInput,
|
|
671
|
+
})
|
|
672
|
+
: undefined;
|
|
667
673
|
|
|
668
674
|
const doGenerateReply = (activity: AgentActivity, nextActivity?: AgentActivity) => {
|
|
669
675
|
if (activity.schedulingPaused) {
|
package/src/voice/events.ts
CHANGED
|
@@ -18,6 +18,7 @@ import type { STT } from '../stt/index.js';
|
|
|
18
18
|
import type { STTError } from '../stt/stt.js';
|
|
19
19
|
import type { TTS } from '../tts/index.js';
|
|
20
20
|
import type { TTSError } from '../tts/tts.js';
|
|
21
|
+
import type { AgentSessionUsage } from './agent_session.js';
|
|
21
22
|
import type { SpeechHandle } from './speech_handle.js';
|
|
22
23
|
|
|
23
24
|
export enum AgentSessionEventTypes {
|
|
@@ -27,6 +28,7 @@ export enum AgentSessionEventTypes {
|
|
|
27
28
|
ConversationItemAdded = 'conversation_item_added',
|
|
28
29
|
FunctionToolsExecuted = 'function_tools_executed',
|
|
29
30
|
MetricsCollected = 'metrics_collected',
|
|
31
|
+
SessionUsageUpdated = 'session_usage_updated',
|
|
30
32
|
SpeechCreated = 'speech_created',
|
|
31
33
|
OverlappingSpeech = 'overlapping_speech',
|
|
32
34
|
Error = 'error',
|
|
@@ -133,6 +135,24 @@ export const createMetricsCollectedEvent = ({
|
|
|
133
135
|
createdAt,
|
|
134
136
|
});
|
|
135
137
|
|
|
138
|
+
export type SessionUsageUpdatedEvent = {
|
|
139
|
+
type: 'session_usage_updated';
|
|
140
|
+
usage: AgentSessionUsage;
|
|
141
|
+
createdAt: number;
|
|
142
|
+
};
|
|
143
|
+
|
|
144
|
+
export const createSessionUsageUpdatedEvent = ({
|
|
145
|
+
usage,
|
|
146
|
+
createdAt = Date.now(),
|
|
147
|
+
}: {
|
|
148
|
+
usage: AgentSessionUsage;
|
|
149
|
+
createdAt?: number;
|
|
150
|
+
}): SessionUsageUpdatedEvent => ({
|
|
151
|
+
type: 'session_usage_updated',
|
|
152
|
+
usage,
|
|
153
|
+
createdAt,
|
|
154
|
+
});
|
|
155
|
+
|
|
136
156
|
export type ConversationItemAddedEvent = {
|
|
137
157
|
type: 'conversation_item_added';
|
|
138
158
|
item: ChatMessage;
|
|
@@ -264,6 +284,7 @@ export type AgentEvent =
|
|
|
264
284
|
| UserStateChangedEvent
|
|
265
285
|
| AgentStateChangedEvent
|
|
266
286
|
| MetricsCollectedEvent
|
|
287
|
+
| SessionUsageUpdatedEvent
|
|
267
288
|
| ConversationItemAddedEvent
|
|
268
289
|
| FunctionToolsExecutedEvent
|
|
269
290
|
| SpeechCreatedEvent
|
package/src/voice/generation.ts
CHANGED
|
@@ -25,7 +25,15 @@ import { log } from '../log.js';
|
|
|
25
25
|
import { IdentityTransform } from '../stream/identity_transform.js';
|
|
26
26
|
import { traceTypes, tracer } from '../telemetry/index.js';
|
|
27
27
|
import { USERDATA_TIMED_TRANSCRIPT } from '../types.js';
|
|
28
|
-
import { Future, Task, shortuuid, toError, waitForAbort } from '../utils.js';
|
|
28
|
+
import {
|
|
29
|
+
Future,
|
|
30
|
+
IdleTimeoutError,
|
|
31
|
+
Task,
|
|
32
|
+
shortuuid,
|
|
33
|
+
toError,
|
|
34
|
+
waitForAbort,
|
|
35
|
+
waitUntilTimeout,
|
|
36
|
+
} from '../utils.js';
|
|
29
37
|
import {
|
|
30
38
|
type Agent,
|
|
31
39
|
type ModelSettings,
|
|
@@ -46,6 +54,8 @@ import {
|
|
|
46
54
|
import { RunContext } from './run_context.js';
|
|
47
55
|
import type { SpeechHandle } from './speech_handle.js';
|
|
48
56
|
|
|
57
|
+
const TTS_READ_IDLE_TIMEOUT_MS = 10_000;
|
|
58
|
+
|
|
49
59
|
/** @internal */
|
|
50
60
|
export class _LLMGenerationData {
|
|
51
61
|
generatedText: string = '';
|
|
@@ -550,6 +560,7 @@ export function performTTSInference(
|
|
|
550
560
|
model?: string,
|
|
551
561
|
provider?: string,
|
|
552
562
|
): [Task<void>, _TTSGenerationData] {
|
|
563
|
+
const logger = log();
|
|
553
564
|
const audioStream = new IdentityTransform<AudioFrame>();
|
|
554
565
|
const outputWriter = audioStream.writable.getWriter();
|
|
555
566
|
const audioOutputStream = audioStream.readable;
|
|
@@ -624,12 +635,15 @@ export function performTTSInference(
|
|
|
624
635
|
// JS currently only does single inference, so initialPushedDuration is always 0.
|
|
625
636
|
// TODO: Add FlushSentinel + multi-segment loop
|
|
626
637
|
const initialPushedDuration = pushedDuration;
|
|
627
|
-
|
|
628
638
|
while (true) {
|
|
629
639
|
if (signal.aborted) {
|
|
630
640
|
break;
|
|
631
641
|
}
|
|
632
|
-
|
|
642
|
+
|
|
643
|
+
const { done, value: frame } = await waitUntilTimeout(
|
|
644
|
+
ttsStreamReader.read(),
|
|
645
|
+
TTS_READ_IDLE_TIMEOUT_MS,
|
|
646
|
+
);
|
|
633
647
|
if (done) {
|
|
634
648
|
break;
|
|
635
649
|
}
|
|
@@ -671,14 +685,15 @@ export function performTTSInference(
|
|
|
671
685
|
pushedDuration += frameDuration;
|
|
672
686
|
}
|
|
673
687
|
} catch (error) {
|
|
674
|
-
if (error instanceof DOMException && error.name === 'AbortError') {
|
|
675
|
-
|
|
688
|
+
if (error instanceof IdleTimeoutError) {
|
|
689
|
+
logger.warn('TTS stream stalled after producing audio, forcing close');
|
|
690
|
+
} else if (error instanceof DOMException && error.name === 'AbortError') {
|
|
676
691
|
return;
|
|
692
|
+
} else {
|
|
693
|
+
throw error;
|
|
677
694
|
}
|
|
678
|
-
throw error;
|
|
679
695
|
} finally {
|
|
680
696
|
if (!timedTextsFut.done) {
|
|
681
|
-
// Ensure downstream consumers don't hang on errors.
|
|
682
697
|
timedTextsFut.resolve(null);
|
|
683
698
|
}
|
|
684
699
|
ttsStreamReader?.releaseLock();
|
|
@@ -773,9 +788,12 @@ async function forwardAudio(
|
|
|
773
788
|
out: _AudioOut,
|
|
774
789
|
signal?: AbortSignal,
|
|
775
790
|
): Promise<void> {
|
|
791
|
+
const logger = log();
|
|
776
792
|
const reader = ttsStream.getReader();
|
|
777
793
|
let resampler: AudioResampler | null = null;
|
|
778
794
|
|
|
795
|
+
const FORWARD_AUDIO_IDLE_TIMEOUT_MS = 10_000;
|
|
796
|
+
|
|
779
797
|
const onPlaybackStarted = (ev: { createdAt: number }) => {
|
|
780
798
|
if (!out.firstFrameFut.done) {
|
|
781
799
|
out.firstFrameFut.resolve(ev.createdAt);
|
|
@@ -791,7 +809,10 @@ async function forwardAudio(
|
|
|
791
809
|
break;
|
|
792
810
|
}
|
|
793
811
|
|
|
794
|
-
const { done, value: frame } = await reader.read();
|
|
812
|
+
const { done, value: frame } = await waitUntilTimeout(
|
|
813
|
+
reader.read(),
|
|
814
|
+
FORWARD_AUDIO_IDLE_TIMEOUT_MS,
|
|
815
|
+
);
|
|
795
816
|
if (done) break;
|
|
796
817
|
|
|
797
818
|
out.audio.push(frame);
|
|
@@ -819,6 +840,12 @@ async function forwardAudio(
|
|
|
819
840
|
await audioOutput.captureFrame(f);
|
|
820
841
|
}
|
|
821
842
|
}
|
|
843
|
+
} catch (e) {
|
|
844
|
+
if (e instanceof IdleTimeoutError) {
|
|
845
|
+
logger.warn('audio forwarding stalled waiting for TTS frames, forcing close');
|
|
846
|
+
} else {
|
|
847
|
+
throw e;
|
|
848
|
+
}
|
|
822
849
|
} finally {
|
|
823
850
|
audioOutput.off(AudioOutput.EVENT_PLAYBACK_STARTED, onPlaybackStarted);
|
|
824
851
|
|
|
@@ -0,0 +1,112 @@
|
|
|
1
|
+
// SPDX-FileCopyrightText: 2026 LiveKit, Inc.
|
|
2
|
+
//
|
|
3
|
+
// SPDX-License-Identifier: Apache-2.0
|
|
4
|
+
import { AudioFrame } from '@livekit/rtc-node';
|
|
5
|
+
import { ReadableStream } from 'stream/web';
|
|
6
|
+
import { describe, expect, it, vi } from 'vitest';
|
|
7
|
+
import { initializeLogger } from '../log.js';
|
|
8
|
+
import { performAudioForwarding, performTTSInference } from './generation.js';
|
|
9
|
+
import { AudioOutput } from './io.js';
|
|
10
|
+
|
|
11
|
+
function createSilentFrame(sampleRate = 24000, channels = 1, durationMs = 20): AudioFrame {
|
|
12
|
+
const samplesPerChannel = Math.floor((sampleRate * durationMs) / 1000);
|
|
13
|
+
const data = new Int16Array(samplesPerChannel * channels);
|
|
14
|
+
return new AudioFrame(data, sampleRate, channels, samplesPerChannel);
|
|
15
|
+
}
|
|
16
|
+
|
|
17
|
+
class MockAudioOutput extends AudioOutput {
|
|
18
|
+
capturedFrames: AudioFrame[] = [];
|
|
19
|
+
|
|
20
|
+
constructor() {
|
|
21
|
+
super(24000);
|
|
22
|
+
}
|
|
23
|
+
|
|
24
|
+
async captureFrame(frame: AudioFrame): Promise<void> {
|
|
25
|
+
await super.captureFrame(frame);
|
|
26
|
+
this.capturedFrames.push(frame);
|
|
27
|
+
this.onPlaybackStarted(Date.now());
|
|
28
|
+
}
|
|
29
|
+
|
|
30
|
+
clearBuffer(): void {
|
|
31
|
+
// no-op for mock
|
|
32
|
+
}
|
|
33
|
+
}
|
|
34
|
+
|
|
35
|
+
describe('TTS stream idle timeout', () => {
|
|
36
|
+
initializeLogger({ pretty: false, level: 'silent' });
|
|
37
|
+
|
|
38
|
+
it('forwardAudio completes when TTS stream stalls after producing frames', async () => {
|
|
39
|
+
const stalledStream = new ReadableStream<AudioFrame>({
|
|
40
|
+
start(controller) {
|
|
41
|
+
controller.enqueue(createSilentFrame());
|
|
42
|
+
controller.enqueue(createSilentFrame());
|
|
43
|
+
},
|
|
44
|
+
});
|
|
45
|
+
|
|
46
|
+
const audioOutput = new MockAudioOutput();
|
|
47
|
+
const controller = new AbortController();
|
|
48
|
+
|
|
49
|
+
const [task, audioOut] = performAudioForwarding(stalledStream, audioOutput, controller);
|
|
50
|
+
|
|
51
|
+
vi.useFakeTimers();
|
|
52
|
+
|
|
53
|
+
const taskPromise = task.result;
|
|
54
|
+
await vi.advanceTimersByTimeAsync(11_000);
|
|
55
|
+
await taskPromise;
|
|
56
|
+
|
|
57
|
+
vi.useRealTimers();
|
|
58
|
+
|
|
59
|
+
expect(audioOutput.capturedFrames.length).toBe(2);
|
|
60
|
+
expect(audioOut.firstFrameFut.done).toBe(true);
|
|
61
|
+
}, 10_000);
|
|
62
|
+
|
|
63
|
+
it('forwardAudio completes normally when TTS stream closes properly', async () => {
|
|
64
|
+
const normalStream = new ReadableStream<AudioFrame>({
|
|
65
|
+
start(controller) {
|
|
66
|
+
controller.enqueue(createSilentFrame());
|
|
67
|
+
controller.enqueue(createSilentFrame());
|
|
68
|
+
controller.enqueue(createSilentFrame());
|
|
69
|
+
controller.close();
|
|
70
|
+
},
|
|
71
|
+
});
|
|
72
|
+
|
|
73
|
+
const audioOutput = new MockAudioOutput();
|
|
74
|
+
const controller = new AbortController();
|
|
75
|
+
|
|
76
|
+
const [task, audioOut] = performAudioForwarding(normalStream, audioOutput, controller);
|
|
77
|
+
|
|
78
|
+
await task.result;
|
|
79
|
+
|
|
80
|
+
expect(audioOutput.capturedFrames.length).toBe(3);
|
|
81
|
+
expect(audioOut.firstFrameFut.done).toBe(true);
|
|
82
|
+
});
|
|
83
|
+
|
|
84
|
+
it('performTTSInference completes when TTS node returns stalled stream', async () => {
|
|
85
|
+
const stalledTtsStream = new ReadableStream<AudioFrame>({
|
|
86
|
+
start(controller) {
|
|
87
|
+
controller.enqueue(createSilentFrame());
|
|
88
|
+
},
|
|
89
|
+
});
|
|
90
|
+
|
|
91
|
+
const ttsNode = async () => stalledTtsStream;
|
|
92
|
+
const textInput = new ReadableStream<string>({
|
|
93
|
+
start(controller) {
|
|
94
|
+
controller.enqueue('Hello world');
|
|
95
|
+
controller.close();
|
|
96
|
+
},
|
|
97
|
+
});
|
|
98
|
+
|
|
99
|
+
const controller = new AbortController();
|
|
100
|
+
const [task, genData] = performTTSInference(ttsNode, textInput, {}, controller);
|
|
101
|
+
|
|
102
|
+
vi.useFakeTimers();
|
|
103
|
+
|
|
104
|
+
const taskPromise = task.result;
|
|
105
|
+
await vi.advanceTimersByTimeAsync(11_000);
|
|
106
|
+
await taskPromise;
|
|
107
|
+
|
|
108
|
+
vi.useRealTimers();
|
|
109
|
+
|
|
110
|
+
expect(genData.ttfb).toBeDefined();
|
|
111
|
+
}, 10_000);
|
|
112
|
+
});
|
package/src/voice/index.ts
CHANGED
|
@@ -2,7 +2,12 @@
|
|
|
2
2
|
//
|
|
3
3
|
// SPDX-License-Identifier: Apache-2.0
|
|
4
4
|
export { Agent, AgentTask, StopResponse, type AgentOptions, type ModelSettings } from './agent.js';
|
|
5
|
-
export { AgentSession, type AgentSessionOptions, type VoiceOptions } from './agent_session.js';
|
|
5
|
+
export {
|
|
6
|
+
AgentSession,
|
|
7
|
+
type AgentSessionOptions,
|
|
8
|
+
type AgentSessionUsage,
|
|
9
|
+
type VoiceOptions,
|
|
10
|
+
} from './agent_session.js';
|
|
6
11
|
export * from './avatar/index.js';
|
|
7
12
|
export * from './background_audio.js';
|
|
8
13
|
export {
|
|
@@ -13,7 +13,7 @@ import { TransformStream } from 'node:stream/web';
|
|
|
13
13
|
import { log } from '../../log.js';
|
|
14
14
|
import { isStreamReaderReleaseError } from '../../stream/deferred_stream.js';
|
|
15
15
|
import { type StreamChannel, createStreamChannel } from '../../stream/stream_channel.js';
|
|
16
|
-
import { Future, Task, cancelAndWait, delay } from '../../utils.js';
|
|
16
|
+
import { Future, Task, cancelAndWait, delay, isFfmpegTeardownError } from '../../utils.js';
|
|
17
17
|
import type { AgentSession } from '../agent_session.js';
|
|
18
18
|
import { AudioInput, AudioOutput, type PlaybackFinishedEvent } from '../io.js';
|
|
19
19
|
|
|
@@ -203,12 +203,7 @@ export class RecorderIO {
|
|
|
203
203
|
})
|
|
204
204
|
.on('error', (err) => {
|
|
205
205
|
// Ignore errors from intentional stream closure or SIGINT during shutdown
|
|
206
|
-
if (
|
|
207
|
-
err.message?.includes('Output stream closed') ||
|
|
208
|
-
err.message?.includes('received signal 2') ||
|
|
209
|
-
err.message?.includes('SIGKILL') ||
|
|
210
|
-
err.message?.includes('SIGINT')
|
|
211
|
-
) {
|
|
206
|
+
if (isFfmpegTeardownError(err)) {
|
|
212
207
|
resolve();
|
|
213
208
|
} else {
|
|
214
209
|
this.logger.error({ err }, 'FFmpeg encoding error');
|
package/src/voice/report.test.ts
CHANGED
|
@@ -3,7 +3,10 @@
|
|
|
3
3
|
// SPDX-License-Identifier: Apache-2.0
|
|
4
4
|
import { describe, expect, it } from 'vitest';
|
|
5
5
|
import { ChatContext } from '../llm/chat_context.js';
|
|
6
|
+
import type { ModelUsage } from '../metrics/model_usage.js';
|
|
6
7
|
import type { AgentSessionOptions, VoiceOptions } from './agent_session.js';
|
|
8
|
+
import { AgentSessionEventTypes, createSessionUsageUpdatedEvent } from './events.js';
|
|
9
|
+
import type { AgentSessionUsage } from './index.js';
|
|
7
10
|
import { createSessionReport, sessionReportToJSON } from './report.js';
|
|
8
11
|
|
|
9
12
|
type ReportOptions = AgentSessionOptions & Partial<VoiceOptions>;
|
|
@@ -133,4 +136,79 @@ describe('sessionReportToJSON', () => {
|
|
|
133
136
|
max_tool_steps: 3,
|
|
134
137
|
});
|
|
135
138
|
});
|
|
139
|
+
|
|
140
|
+
it('serializes model usage as usage', () => {
|
|
141
|
+
const usage: ModelUsage[] = [
|
|
142
|
+
{
|
|
143
|
+
type: 'tts_usage',
|
|
144
|
+
provider: 'elevenlabs',
|
|
145
|
+
model: 'eleven_flash_v2_5',
|
|
146
|
+
inputTokens: 0,
|
|
147
|
+
outputTokens: 0,
|
|
148
|
+
charactersCount: 42,
|
|
149
|
+
audioDurationMs: 1200,
|
|
150
|
+
},
|
|
151
|
+
];
|
|
152
|
+
|
|
153
|
+
const report = createSessionReport({
|
|
154
|
+
jobId: 'job',
|
|
155
|
+
roomId: 'room-id',
|
|
156
|
+
room: 'room',
|
|
157
|
+
options: baseOptions(),
|
|
158
|
+
events: [],
|
|
159
|
+
chatHistory: ChatContext.empty(),
|
|
160
|
+
enableRecording: false,
|
|
161
|
+
timestamp: 0,
|
|
162
|
+
startedAt: 0,
|
|
163
|
+
modelUsage: usage,
|
|
164
|
+
});
|
|
165
|
+
|
|
166
|
+
const payload = sessionReportToJSON(report);
|
|
167
|
+
expect(payload.usage).toEqual([
|
|
168
|
+
{
|
|
169
|
+
type: 'tts_usage',
|
|
170
|
+
provider: 'elevenlabs',
|
|
171
|
+
model: 'eleven_flash_v2_5',
|
|
172
|
+
charactersCount: 42,
|
|
173
|
+
audioDurationMs: 1200,
|
|
174
|
+
},
|
|
175
|
+
]);
|
|
176
|
+
});
|
|
177
|
+
|
|
178
|
+
it('omits session usage update events from serialized events', () => {
|
|
179
|
+
const report = createSessionReport({
|
|
180
|
+
jobId: 'job',
|
|
181
|
+
roomId: 'room-id',
|
|
182
|
+
room: 'room',
|
|
183
|
+
options: baseOptions(),
|
|
184
|
+
events: [
|
|
185
|
+
createSessionUsageUpdatedEvent({
|
|
186
|
+
usage: {
|
|
187
|
+
modelUsage: [
|
|
188
|
+
{
|
|
189
|
+
type: 'tts_usage',
|
|
190
|
+
provider: 'elevenlabs',
|
|
191
|
+
model: 'eleven_flash_v2_5',
|
|
192
|
+
},
|
|
193
|
+
],
|
|
194
|
+
},
|
|
195
|
+
createdAt: 123,
|
|
196
|
+
}),
|
|
197
|
+
],
|
|
198
|
+
chatHistory: ChatContext.empty(),
|
|
199
|
+
enableRecording: false,
|
|
200
|
+
timestamp: 0,
|
|
201
|
+
startedAt: 0,
|
|
202
|
+
});
|
|
203
|
+
|
|
204
|
+
const payload = sessionReportToJSON(report);
|
|
205
|
+
expect(payload.events).toEqual([]);
|
|
206
|
+
});
|
|
207
|
+
|
|
208
|
+
it('exports AgentSessionUsage from the voice barrel', () => {
|
|
209
|
+
const usage: AgentSessionUsage = { modelUsage: [] };
|
|
210
|
+
const eventType: AgentSessionEventTypes = AgentSessionEventTypes.SessionUsageUpdated;
|
|
211
|
+
expect(usage.modelUsage).toEqual([]);
|
|
212
|
+
expect(eventType).toBe('session_usage_updated');
|
|
213
|
+
});
|
|
136
214
|
});
|
package/src/voice/report.ts
CHANGED
|
@@ -111,7 +111,7 @@ export function sessionReportToJSON(report: SessionReport): Record<string, unkno
|
|
|
111
111
|
options.voiceOptions?.maxEndpointingDelay;
|
|
112
112
|
|
|
113
113
|
for (const event of report.events) {
|
|
114
|
-
if (event.type === 'metrics_collected') {
|
|
114
|
+
if (event.type === 'metrics_collected' || event.type === 'session_usage_updated') {
|
|
115
115
|
continue; // metrics are too noisy, Cloud is using the chat_history as the source of truth
|
|
116
116
|
}
|
|
117
117
|
|