@livekit/agents 1.2.1 → 1.2.2

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (116) hide show
  1. package/dist/audio.cjs +10 -0
  2. package/dist/audio.cjs.map +1 -1
  3. package/dist/audio.d.cts +1 -1
  4. package/dist/audio.d.ts +1 -1
  5. package/dist/audio.d.ts.map +1 -1
  6. package/dist/audio.js +10 -0
  7. package/dist/audio.js.map +1 -1
  8. package/dist/inference/api_protos.d.cts +26 -26
  9. package/dist/inference/api_protos.d.ts +26 -26
  10. package/dist/inference/tts.cjs +14 -1
  11. package/dist/inference/tts.cjs.map +1 -1
  12. package/dist/inference/tts.d.ts.map +1 -1
  13. package/dist/inference/tts.js +24 -3
  14. package/dist/inference/tts.js.map +1 -1
  15. package/dist/ipc/job_proc_lazy_main.cjs +7 -2
  16. package/dist/ipc/job_proc_lazy_main.cjs.map +1 -1
  17. package/dist/ipc/job_proc_lazy_main.js +7 -2
  18. package/dist/ipc/job_proc_lazy_main.js.map +1 -1
  19. package/dist/ipc/supervised_proc.cjs +4 -1
  20. package/dist/ipc/supervised_proc.cjs.map +1 -1
  21. package/dist/ipc/supervised_proc.d.ts.map +1 -1
  22. package/dist/ipc/supervised_proc.js +4 -1
  23. package/dist/ipc/supervised_proc.js.map +1 -1
  24. package/dist/ipc/supervised_proc.test.cjs +82 -0
  25. package/dist/ipc/supervised_proc.test.cjs.map +1 -1
  26. package/dist/ipc/supervised_proc.test.js +82 -0
  27. package/dist/ipc/supervised_proc.test.js.map +1 -1
  28. package/dist/job.cjs +2 -1
  29. package/dist/job.cjs.map +1 -1
  30. package/dist/job.d.ts.map +1 -1
  31. package/dist/job.js +2 -1
  32. package/dist/job.js.map +1 -1
  33. package/dist/utils.cjs +28 -0
  34. package/dist/utils.cjs.map +1 -1
  35. package/dist/utils.d.cts +18 -0
  36. package/dist/utils.d.ts +18 -0
  37. package/dist/utils.d.ts.map +1 -1
  38. package/dist/utils.js +25 -0
  39. package/dist/utils.js.map +1 -1
  40. package/dist/version.cjs +1 -1
  41. package/dist/version.js +1 -1
  42. package/dist/voice/agent_activity.cjs +10 -0
  43. package/dist/voice/agent_activity.cjs.map +1 -1
  44. package/dist/voice/agent_activity.d.ts.map +1 -1
  45. package/dist/voice/agent_activity.js +11 -0
  46. package/dist/voice/agent_activity.js.map +1 -1
  47. package/dist/voice/agent_session.cjs +1 -1
  48. package/dist/voice/agent_session.cjs.map +1 -1
  49. package/dist/voice/agent_session.d.cts +4 -2
  50. package/dist/voice/agent_session.d.ts +4 -2
  51. package/dist/voice/agent_session.d.ts.map +1 -1
  52. package/dist/voice/agent_session.js +1 -1
  53. package/dist/voice/agent_session.js.map +1 -1
  54. package/dist/voice/events.cjs +11 -0
  55. package/dist/voice/events.cjs.map +1 -1
  56. package/dist/voice/events.d.cts +12 -1
  57. package/dist/voice/events.d.ts +12 -1
  58. package/dist/voice/events.d.ts.map +1 -1
  59. package/dist/voice/events.js +10 -0
  60. package/dist/voice/events.js.map +1 -1
  61. package/dist/voice/generation.cjs +23 -4
  62. package/dist/voice/generation.cjs.map +1 -1
  63. package/dist/voice/generation.d.ts.map +1 -1
  64. package/dist/voice/generation.js +32 -5
  65. package/dist/voice/generation.js.map +1 -1
  66. package/dist/voice/generation_tts_timeout.test.cjs +85 -0
  67. package/dist/voice/generation_tts_timeout.test.cjs.map +1 -0
  68. package/dist/voice/generation_tts_timeout.test.js +84 -0
  69. package/dist/voice/generation_tts_timeout.test.js.map +1 -0
  70. package/dist/voice/index.cjs.map +1 -1
  71. package/dist/voice/index.d.cts +1 -1
  72. package/dist/voice/index.d.ts +1 -1
  73. package/dist/voice/index.d.ts.map +1 -1
  74. package/dist/voice/index.js +3 -1
  75. package/dist/voice/index.js.map +1 -1
  76. package/dist/voice/recorder_io/recorder_io.cjs +1 -2
  77. package/dist/voice/recorder_io/recorder_io.cjs.map +1 -1
  78. package/dist/voice/recorder_io/recorder_io.d.ts.map +1 -1
  79. package/dist/voice/recorder_io/recorder_io.js +2 -3
  80. package/dist/voice/recorder_io/recorder_io.js.map +1 -1
  81. package/dist/voice/report.cjs +1 -1
  82. package/dist/voice/report.cjs.map +1 -1
  83. package/dist/voice/report.js +1 -1
  84. package/dist/voice/report.js.map +1 -1
  85. package/dist/voice/report.test.cjs +70 -0
  86. package/dist/voice/report.test.cjs.map +1 -1
  87. package/dist/voice/report.test.js +70 -0
  88. package/dist/voice/report.test.js.map +1 -1
  89. package/dist/voice/room_io/room_io.cjs +5 -1
  90. package/dist/voice/room_io/room_io.cjs.map +1 -1
  91. package/dist/voice/room_io/room_io.d.ts.map +1 -1
  92. package/dist/voice/room_io/room_io.js +5 -1
  93. package/dist/voice/room_io/room_io.js.map +1 -1
  94. package/dist/voice/room_io/room_io.test.cjs +18 -0
  95. package/dist/voice/room_io/room_io.test.cjs.map +1 -0
  96. package/dist/voice/room_io/room_io.test.js +17 -0
  97. package/dist/voice/room_io/room_io.test.js.map +1 -0
  98. package/package.json +1 -1
  99. package/src/audio.ts +12 -1
  100. package/src/inference/tts.ts +25 -3
  101. package/src/ipc/job_proc_lazy_main.ts +7 -2
  102. package/src/ipc/supervised_proc.test.ts +96 -0
  103. package/src/ipc/supervised_proc.ts +8 -1
  104. package/src/job.ts +1 -0
  105. package/src/utils.ts +43 -0
  106. package/src/voice/agent_activity.ts +11 -0
  107. package/src/voice/agent_session.ts +13 -7
  108. package/src/voice/events.ts +21 -0
  109. package/src/voice/generation.ts +35 -8
  110. package/src/voice/generation_tts_timeout.test.ts +112 -0
  111. package/src/voice/index.ts +6 -1
  112. package/src/voice/recorder_io/recorder_io.ts +2 -7
  113. package/src/voice/report.test.ts +78 -0
  114. package/src/voice/report.ts +1 -1
  115. package/src/voice/room_io/room_io.test.ts +38 -0
  116. package/src/voice/room_io/room_io.ts +7 -2
@@ -125,6 +125,102 @@ describe('IPC send on dead process', () => {
125
125
  });
126
126
  });
127
127
 
128
+ describe('init timeout rejection handling', () => {
129
+ it('does not produce unhandled rejection when init times out', async () => {
130
+ // Regression test: before the fix, run() was called without await in start().
131
+ // When init timed out, the rejection in run()'s `await this.init.await` escaped
132
+ // as an unhandled rejection — crashing the Node.js process.
133
+ const unhandled: unknown[] = [];
134
+ const handler = (reason: unknown) => unhandled.push(reason);
135
+ process.on('unhandledRejection', handler);
136
+
137
+ // Child that responds AFTER the timeout — simulates slow init under CPU pressure.
138
+ // Timeout fires at 50ms (init.reject), child responds at 200ms (once() resolves).
139
+ // Before the fix, init.reject caused an unhandled rejection in run().
140
+ const slowScript = join(tmpdir(), 'test_slow_init_child.mjs');
141
+ writeFileSync(
142
+ slowScript,
143
+ `process.on('message', () => {
144
+ setTimeout(() => process.send({ case: 'initializeResponse' }), 200);
145
+ });
146
+ setInterval(() => {}, 1000);`,
147
+ );
148
+
149
+ const { SupervisedProc } = await import('./supervised_proc.js');
150
+ class TestProc extends SupervisedProc {
151
+ createProcess() {
152
+ return fork(slowScript, [], { stdio: ['pipe', 'pipe', 'pipe', 'ipc'] });
153
+ }
154
+ async mainTask() {}
155
+ }
156
+
157
+ const proc = new TestProc(
158
+ 50, // initializeTimeout — fires before child responds at 200ms
159
+ 1000, // closeTimeout
160
+ 0, // memoryWarnMB
161
+ 0, // memoryLimitMB
162
+ 5000, // pingInterval
163
+ 60000, // pingTimeout
164
+ 2500, // highPingThreshold
165
+ );
166
+
167
+ await proc.start();
168
+ // initialize() returns normally: child responds at 200ms, once() resolves,
169
+ // but init was already rejected at 50ms — run() gets the rejection.
170
+ await proc.initialize();
171
+
172
+ // Give the event loop a tick for any unhandled rejection to surface
173
+ await new Promise((r) => setTimeout(r, 100));
174
+
175
+ process.off('unhandledRejection', handler);
176
+ proc.proc?.kill();
177
+ try {
178
+ unlinkSync(slowScript);
179
+ } catch {}
180
+
181
+ expect(unhandled).toEqual([]);
182
+ });
183
+
184
+ it('join() resolves after init timeout instead of hanging forever', async () => {
185
+ // When run() fails early (before registering proc event handlers),
186
+ // #join must still resolve so that join() and close() don't hang.
187
+ const slowScript = join(tmpdir(), 'test_slow_init_child_join.mjs');
188
+ writeFileSync(
189
+ slowScript,
190
+ `process.on('message', () => {
191
+ setTimeout(() => process.send({ case: 'initializeResponse' }), 200);
192
+ });
193
+ setInterval(() => {}, 1000);`,
194
+ );
195
+
196
+ const { SupervisedProc } = await import('./supervised_proc.js');
197
+ class TestProc extends SupervisedProc {
198
+ createProcess() {
199
+ return fork(slowScript, [], { stdio: ['pipe', 'pipe', 'pipe', 'ipc'] });
200
+ }
201
+ async mainTask() {}
202
+ }
203
+
204
+ const proc = new TestProc(50, 1000, 0, 0, 5000, 60000, 2500);
205
+
206
+ await proc.start();
207
+ await proc.initialize();
208
+
209
+ // join() must resolve within a reasonable time, not hang forever
210
+ const result = await Promise.race([
211
+ proc.join().then(() => 'resolved'),
212
+ new Promise((r) => setTimeout(() => r('timeout'), 2000)),
213
+ ]);
214
+
215
+ proc.proc?.kill();
216
+ try {
217
+ unlinkSync(slowScript);
218
+ } catch {}
219
+
220
+ expect(result).toBe('resolved');
221
+ });
222
+ });
223
+
128
224
  describe('timer cleanup', () => {
129
225
  it('clearInterval stops the interval', async () => {
130
226
  let count = 0;
@@ -84,7 +84,14 @@ export abstract class SupervisedProc {
84
84
  this.proc = this.createProcess();
85
85
 
86
86
  this.#started = true;
87
- this.run();
87
+ this.run().catch((err) => {
88
+ this.#logger.child({ err }).warn('supervised process run failed');
89
+ // Note: we intentionally do NOT kill the child process here. Killing it
90
+ // would race with initialize()'s `once(proc, 'message')`, causing
91
+ // initialize() to hang forever and deadlocking the caller (proc_pool).
92
+ // The child process is cleaned up when the pool shuts down.
93
+ this.#join.resolve();
94
+ });
88
95
  }
89
96
 
90
97
  async run() {
package/src/job.ts CHANGED
@@ -283,6 +283,7 @@ export class JobContext {
283
283
  startedAt: targetSession._startedAt,
284
284
  audioRecordingPath: recorderIO?.outputPath,
285
285
  audioRecordingStartedAt: recorderIO?.recordingStartedAt,
286
+ modelUsage: targetSession._usageCollector.flatten(),
286
287
  });
287
288
  }
288
289
 
package/src/utils.ts CHANGED
@@ -9,6 +9,7 @@ import type {
9
9
  TrackKind,
10
10
  } from '@livekit/rtc-node';
11
11
  import { AudioFrame, AudioResampler, RoomEvent } from '@livekit/rtc-node';
12
+ import type { Throws } from '@livekit/throws-transformer/throws';
12
13
  import { AsyncLocalStorage } from 'node:async_hooks';
13
14
  import { EventEmitter, once } from 'node:events';
14
15
  import type { ReadableStream } from 'node:stream/web';
@@ -752,6 +753,21 @@ export function isStreamClosedError(error: unknown): boolean {
752
753
  );
753
754
  }
754
755
 
756
+ /** FFmpeg error messages expected during normal teardown/shutdown. */
757
+ const FFMPEG_TEARDOWN_ERRORS = ['Output stream closed', 'received signal 2', 'SIGKILL', 'SIGINT'];
758
+
759
+ /**
760
+ * Check if an error is an expected FFmpeg teardown error that can be safely ignored during cleanup.
761
+ *
762
+ * @param error - The error to check.
763
+ * @returns True if the error is an expected FFmpeg shutdown error.
764
+ */
765
+ export function isFfmpegTeardownError(error: unknown): boolean {
766
+ return (
767
+ error instanceof Error && FFMPEG_TEARDOWN_ERRORS.some((msg) => error.message?.includes(msg))
768
+ );
769
+ }
770
+
755
771
  /**
756
772
  * In JS an error can be any arbitrary value.
757
773
  * This function converts an unknown error to an Error and stores the original value in the error object.
@@ -804,6 +820,33 @@ export function delay(ms: number, options: DelayOptions = {}): Promise<void> {
804
820
  });
805
821
  }
806
822
 
823
+ export class IdleTimeoutError extends Error {
824
+ constructor(message = 'idle timeout') {
825
+ super(message);
826
+ this.name = 'IdleTimeoutError';
827
+ }
828
+ }
829
+
830
+ /**
831
+ * Race a promise against an idle timeout. If the promise does not settle within
832
+ * `timeoutMs` milliseconds, the returned promise rejects with {@link IdleTimeoutError}
833
+ * (or the error returned by `throwError` when provided).
834
+ * The timer is properly cleaned up on settlement to avoid leaking handles.
835
+ */
836
+ export function waitUntilTimeout<T, E extends Error = IdleTimeoutError>(
837
+ promise: Promise<T>,
838
+ timeoutMs: number,
839
+ throwError?: () => E,
840
+ ): Promise<Throws<T, E>> {
841
+ let timer: ReturnType<typeof setTimeout> | undefined;
842
+ return Promise.race([
843
+ promise,
844
+ new Promise<never>((_, reject) => {
845
+ timer = setTimeout(() => reject(throwError?.() ?? new IdleTimeoutError()), timeoutMs);
846
+ }),
847
+ ]).finally(() => clearTimeout(timer)) as Promise<Throws<T, E>>;
848
+ }
849
+
807
850
  /**
808
851
  * Returns a participant that matches the given identity. If identity is None, the first
809
852
  * participant that joins the room will be returned.
@@ -67,6 +67,7 @@ import {
67
67
  createErrorEvent,
68
68
  createFunctionToolsExecutedEvent,
69
69
  createMetricsCollectedEvent,
70
+ createSessionUsageUpdatedEvent,
70
71
  createSpeechCreatedEvent,
71
72
  createUserInputTranscribedEvent,
72
73
  } from './events.js';
@@ -157,10 +158,15 @@ export class AgentActivity implements RecognitionHooks {
157
158
 
158
159
  private readonly onInterruptionMetricsCollected = (ev: InterruptionMetrics): void => {
159
160
  this.agentSession._usageCollector.collect(ev);
161
+ const usage = this.agentSession.usage;
160
162
  this.agentSession.emit(
161
163
  AgentSessionEventTypes.MetricsCollected,
162
164
  createMetricsCollectedEvent({ metrics: ev }),
163
165
  );
166
+ this.agentSession.emit(
167
+ AgentSessionEventTypes.SessionUsageUpdated,
168
+ createSessionUsageUpdatedEvent({ usage }),
169
+ );
164
170
  };
165
171
 
166
172
  private readonly onInterruptionError = (ev: InterruptionDetectionError): void => {
@@ -730,11 +736,16 @@ export class AgentActivity implements RecognitionHooks {
730
736
  }
731
737
 
732
738
  this.agentSession._usageCollector.collect(ev);
739
+ const usage = this.agentSession.usage;
733
740
 
734
741
  this.agentSession.emit(
735
742
  AgentSessionEventTypes.MetricsCollected,
736
743
  createMetricsCollectedEvent({ metrics: ev }),
737
744
  );
745
+ this.agentSession.emit(
746
+ AgentSessionEventTypes.SessionUsageUpdated,
747
+ createSessionUsageUpdatedEvent({ usage }),
748
+ );
738
749
  };
739
750
 
740
751
  private onError(ev: RealtimeModelError | STTError | TTSError | LLMError): void {
@@ -52,6 +52,7 @@ import {
52
52
  type ErrorEvent,
53
53
  type FunctionToolsExecutedEvent,
54
54
  type MetricsCollectedEvent,
55
+ type SessionUsageUpdatedEvent,
55
56
  type ShutdownReason,
56
57
  type SpeechCreatedEvent,
57
58
  type UserInputTranscribedEvent,
@@ -131,6 +132,7 @@ export type AgentSessionCallbacks = {
131
132
  [AgentSessionEventTypes.ConversationItemAdded]: (ev: ConversationItemAddedEvent) => void;
132
133
  [AgentSessionEventTypes.FunctionToolsExecuted]: (ev: FunctionToolsExecutedEvent) => void;
133
134
  [AgentSessionEventTypes.MetricsCollected]: (ev: MetricsCollectedEvent) => void;
135
+ [AgentSessionEventTypes.SessionUsageUpdated]: (ev: SessionUsageUpdatedEvent) => void;
134
136
  [AgentSessionEventTypes.SpeechCreated]: (ev: SpeechCreatedEvent) => void;
135
137
  [AgentSessionEventTypes.Error]: (ev: ErrorEvent) => void;
136
138
  [AgentSessionEventTypes.Close]: (ev: CloseEvent) => void;
@@ -649,7 +651,8 @@ export class AgentSession<
649
651
  }
650
652
 
651
653
  generateReply(options?: {
652
- userInput?: string;
654
+ userInput?: string | ChatMessage;
655
+ chatCtx?: ChatContext;
653
656
  instructions?: string;
654
657
  toolChoice?: ToolChoice;
655
658
  allowInterruptions?: boolean;
@@ -658,12 +661,15 @@ export class AgentSession<
658
661
  throw new Error('AgentSession is not running');
659
662
  }
660
663
 
661
- const userMessage = options?.userInput
662
- ? new ChatMessage({
663
- role: 'user',
664
- content: options.userInput,
665
- })
666
- : undefined;
664
+ const userMessage =
665
+ options?.userInput instanceof ChatMessage
666
+ ? options.userInput
667
+ : options?.userInput
668
+ ? new ChatMessage({
669
+ role: 'user',
670
+ content: options.userInput,
671
+ })
672
+ : undefined;
667
673
 
668
674
  const doGenerateReply = (activity: AgentActivity, nextActivity?: AgentActivity) => {
669
675
  if (activity.schedulingPaused) {
@@ -18,6 +18,7 @@ import type { STT } from '../stt/index.js';
18
18
  import type { STTError } from '../stt/stt.js';
19
19
  import type { TTS } from '../tts/index.js';
20
20
  import type { TTSError } from '../tts/tts.js';
21
+ import type { AgentSessionUsage } from './agent_session.js';
21
22
  import type { SpeechHandle } from './speech_handle.js';
22
23
 
23
24
  export enum AgentSessionEventTypes {
@@ -27,6 +28,7 @@ export enum AgentSessionEventTypes {
27
28
  ConversationItemAdded = 'conversation_item_added',
28
29
  FunctionToolsExecuted = 'function_tools_executed',
29
30
  MetricsCollected = 'metrics_collected',
31
+ SessionUsageUpdated = 'session_usage_updated',
30
32
  SpeechCreated = 'speech_created',
31
33
  OverlappingSpeech = 'overlapping_speech',
32
34
  Error = 'error',
@@ -133,6 +135,24 @@ export const createMetricsCollectedEvent = ({
133
135
  createdAt,
134
136
  });
135
137
 
138
+ export type SessionUsageUpdatedEvent = {
139
+ type: 'session_usage_updated';
140
+ usage: AgentSessionUsage;
141
+ createdAt: number;
142
+ };
143
+
144
+ export const createSessionUsageUpdatedEvent = ({
145
+ usage,
146
+ createdAt = Date.now(),
147
+ }: {
148
+ usage: AgentSessionUsage;
149
+ createdAt?: number;
150
+ }): SessionUsageUpdatedEvent => ({
151
+ type: 'session_usage_updated',
152
+ usage,
153
+ createdAt,
154
+ });
155
+
136
156
  export type ConversationItemAddedEvent = {
137
157
  type: 'conversation_item_added';
138
158
  item: ChatMessage;
@@ -264,6 +284,7 @@ export type AgentEvent =
264
284
  | UserStateChangedEvent
265
285
  | AgentStateChangedEvent
266
286
  | MetricsCollectedEvent
287
+ | SessionUsageUpdatedEvent
267
288
  | ConversationItemAddedEvent
268
289
  | FunctionToolsExecutedEvent
269
290
  | SpeechCreatedEvent
@@ -25,7 +25,15 @@ import { log } from '../log.js';
25
25
  import { IdentityTransform } from '../stream/identity_transform.js';
26
26
  import { traceTypes, tracer } from '../telemetry/index.js';
27
27
  import { USERDATA_TIMED_TRANSCRIPT } from '../types.js';
28
- import { Future, Task, shortuuid, toError, waitForAbort } from '../utils.js';
28
+ import {
29
+ Future,
30
+ IdleTimeoutError,
31
+ Task,
32
+ shortuuid,
33
+ toError,
34
+ waitForAbort,
35
+ waitUntilTimeout,
36
+ } from '../utils.js';
29
37
  import {
30
38
  type Agent,
31
39
  type ModelSettings,
@@ -46,6 +54,8 @@ import {
46
54
  import { RunContext } from './run_context.js';
47
55
  import type { SpeechHandle } from './speech_handle.js';
48
56
 
57
+ const TTS_READ_IDLE_TIMEOUT_MS = 10_000;
58
+
49
59
  /** @internal */
50
60
  export class _LLMGenerationData {
51
61
  generatedText: string = '';
@@ -550,6 +560,7 @@ export function performTTSInference(
550
560
  model?: string,
551
561
  provider?: string,
552
562
  ): [Task<void>, _TTSGenerationData] {
563
+ const logger = log();
553
564
  const audioStream = new IdentityTransform<AudioFrame>();
554
565
  const outputWriter = audioStream.writable.getWriter();
555
566
  const audioOutputStream = audioStream.readable;
@@ -624,12 +635,15 @@ export function performTTSInference(
624
635
  // JS currently only does single inference, so initialPushedDuration is always 0.
625
636
  // TODO: Add FlushSentinel + multi-segment loop
626
637
  const initialPushedDuration = pushedDuration;
627
-
628
638
  while (true) {
629
639
  if (signal.aborted) {
630
640
  break;
631
641
  }
632
- const { done, value: frame } = await ttsStreamReader.read();
642
+
643
+ const { done, value: frame } = await waitUntilTimeout(
644
+ ttsStreamReader.read(),
645
+ TTS_READ_IDLE_TIMEOUT_MS,
646
+ );
633
647
  if (done) {
634
648
  break;
635
649
  }
@@ -671,14 +685,15 @@ export function performTTSInference(
671
685
  pushedDuration += frameDuration;
672
686
  }
673
687
  } catch (error) {
674
- if (error instanceof DOMException && error.name === 'AbortError') {
675
- // Abort signal was triggered, handle gracefully
688
+ if (error instanceof IdleTimeoutError) {
689
+ logger.warn('TTS stream stalled after producing audio, forcing close');
690
+ } else if (error instanceof DOMException && error.name === 'AbortError') {
676
691
  return;
692
+ } else {
693
+ throw error;
677
694
  }
678
- throw error;
679
695
  } finally {
680
696
  if (!timedTextsFut.done) {
681
- // Ensure downstream consumers don't hang on errors.
682
697
  timedTextsFut.resolve(null);
683
698
  }
684
699
  ttsStreamReader?.releaseLock();
@@ -773,9 +788,12 @@ async function forwardAudio(
773
788
  out: _AudioOut,
774
789
  signal?: AbortSignal,
775
790
  ): Promise<void> {
791
+ const logger = log();
776
792
  const reader = ttsStream.getReader();
777
793
  let resampler: AudioResampler | null = null;
778
794
 
795
+ const FORWARD_AUDIO_IDLE_TIMEOUT_MS = 10_000;
796
+
779
797
  const onPlaybackStarted = (ev: { createdAt: number }) => {
780
798
  if (!out.firstFrameFut.done) {
781
799
  out.firstFrameFut.resolve(ev.createdAt);
@@ -791,7 +809,10 @@ async function forwardAudio(
791
809
  break;
792
810
  }
793
811
 
794
- const { done, value: frame } = await reader.read();
812
+ const { done, value: frame } = await waitUntilTimeout(
813
+ reader.read(),
814
+ FORWARD_AUDIO_IDLE_TIMEOUT_MS,
815
+ );
795
816
  if (done) break;
796
817
 
797
818
  out.audio.push(frame);
@@ -819,6 +840,12 @@ async function forwardAudio(
819
840
  await audioOutput.captureFrame(f);
820
841
  }
821
842
  }
843
+ } catch (e) {
844
+ if (e instanceof IdleTimeoutError) {
845
+ logger.warn('audio forwarding stalled waiting for TTS frames, forcing close');
846
+ } else {
847
+ throw e;
848
+ }
822
849
  } finally {
823
850
  audioOutput.off(AudioOutput.EVENT_PLAYBACK_STARTED, onPlaybackStarted);
824
851
 
@@ -0,0 +1,112 @@
1
+ // SPDX-FileCopyrightText: 2026 LiveKit, Inc.
2
+ //
3
+ // SPDX-License-Identifier: Apache-2.0
4
+ import { AudioFrame } from '@livekit/rtc-node';
5
+ import { ReadableStream } from 'stream/web';
6
+ import { describe, expect, it, vi } from 'vitest';
7
+ import { initializeLogger } from '../log.js';
8
+ import { performAudioForwarding, performTTSInference } from './generation.js';
9
+ import { AudioOutput } from './io.js';
10
+
11
+ function createSilentFrame(sampleRate = 24000, channels = 1, durationMs = 20): AudioFrame {
12
+ const samplesPerChannel = Math.floor((sampleRate * durationMs) / 1000);
13
+ const data = new Int16Array(samplesPerChannel * channels);
14
+ return new AudioFrame(data, sampleRate, channels, samplesPerChannel);
15
+ }
16
+
17
+ class MockAudioOutput extends AudioOutput {
18
+ capturedFrames: AudioFrame[] = [];
19
+
20
+ constructor() {
21
+ super(24000);
22
+ }
23
+
24
+ async captureFrame(frame: AudioFrame): Promise<void> {
25
+ await super.captureFrame(frame);
26
+ this.capturedFrames.push(frame);
27
+ this.onPlaybackStarted(Date.now());
28
+ }
29
+
30
+ clearBuffer(): void {
31
+ // no-op for mock
32
+ }
33
+ }
34
+
35
+ describe('TTS stream idle timeout', () => {
36
+ initializeLogger({ pretty: false, level: 'silent' });
37
+
38
+ it('forwardAudio completes when TTS stream stalls after producing frames', async () => {
39
+ const stalledStream = new ReadableStream<AudioFrame>({
40
+ start(controller) {
41
+ controller.enqueue(createSilentFrame());
42
+ controller.enqueue(createSilentFrame());
43
+ },
44
+ });
45
+
46
+ const audioOutput = new MockAudioOutput();
47
+ const controller = new AbortController();
48
+
49
+ const [task, audioOut] = performAudioForwarding(stalledStream, audioOutput, controller);
50
+
51
+ vi.useFakeTimers();
52
+
53
+ const taskPromise = task.result;
54
+ await vi.advanceTimersByTimeAsync(11_000);
55
+ await taskPromise;
56
+
57
+ vi.useRealTimers();
58
+
59
+ expect(audioOutput.capturedFrames.length).toBe(2);
60
+ expect(audioOut.firstFrameFut.done).toBe(true);
61
+ }, 10_000);
62
+
63
+ it('forwardAudio completes normally when TTS stream closes properly', async () => {
64
+ const normalStream = new ReadableStream<AudioFrame>({
65
+ start(controller) {
66
+ controller.enqueue(createSilentFrame());
67
+ controller.enqueue(createSilentFrame());
68
+ controller.enqueue(createSilentFrame());
69
+ controller.close();
70
+ },
71
+ });
72
+
73
+ const audioOutput = new MockAudioOutput();
74
+ const controller = new AbortController();
75
+
76
+ const [task, audioOut] = performAudioForwarding(normalStream, audioOutput, controller);
77
+
78
+ await task.result;
79
+
80
+ expect(audioOutput.capturedFrames.length).toBe(3);
81
+ expect(audioOut.firstFrameFut.done).toBe(true);
82
+ });
83
+
84
+ it('performTTSInference completes when TTS node returns stalled stream', async () => {
85
+ const stalledTtsStream = new ReadableStream<AudioFrame>({
86
+ start(controller) {
87
+ controller.enqueue(createSilentFrame());
88
+ },
89
+ });
90
+
91
+ const ttsNode = async () => stalledTtsStream;
92
+ const textInput = new ReadableStream<string>({
93
+ start(controller) {
94
+ controller.enqueue('Hello world');
95
+ controller.close();
96
+ },
97
+ });
98
+
99
+ const controller = new AbortController();
100
+ const [task, genData] = performTTSInference(ttsNode, textInput, {}, controller);
101
+
102
+ vi.useFakeTimers();
103
+
104
+ const taskPromise = task.result;
105
+ await vi.advanceTimersByTimeAsync(11_000);
106
+ await taskPromise;
107
+
108
+ vi.useRealTimers();
109
+
110
+ expect(genData.ttfb).toBeDefined();
111
+ }, 10_000);
112
+ });
@@ -2,7 +2,12 @@
2
2
  //
3
3
  // SPDX-License-Identifier: Apache-2.0
4
4
  export { Agent, AgentTask, StopResponse, type AgentOptions, type ModelSettings } from './agent.js';
5
- export { AgentSession, type AgentSessionOptions, type VoiceOptions } from './agent_session.js';
5
+ export {
6
+ AgentSession,
7
+ type AgentSessionOptions,
8
+ type AgentSessionUsage,
9
+ type VoiceOptions,
10
+ } from './agent_session.js';
6
11
  export * from './avatar/index.js';
7
12
  export * from './background_audio.js';
8
13
  export {
@@ -13,7 +13,7 @@ import { TransformStream } from 'node:stream/web';
13
13
  import { log } from '../../log.js';
14
14
  import { isStreamReaderReleaseError } from '../../stream/deferred_stream.js';
15
15
  import { type StreamChannel, createStreamChannel } from '../../stream/stream_channel.js';
16
- import { Future, Task, cancelAndWait, delay } from '../../utils.js';
16
+ import { Future, Task, cancelAndWait, delay, isFfmpegTeardownError } from '../../utils.js';
17
17
  import type { AgentSession } from '../agent_session.js';
18
18
  import { AudioInput, AudioOutput, type PlaybackFinishedEvent } from '../io.js';
19
19
 
@@ -203,12 +203,7 @@ export class RecorderIO {
203
203
  })
204
204
  .on('error', (err) => {
205
205
  // Ignore errors from intentional stream closure or SIGINT during shutdown
206
- if (
207
- err.message?.includes('Output stream closed') ||
208
- err.message?.includes('received signal 2') ||
209
- err.message?.includes('SIGKILL') ||
210
- err.message?.includes('SIGINT')
211
- ) {
206
+ if (isFfmpegTeardownError(err)) {
212
207
  resolve();
213
208
  } else {
214
209
  this.logger.error({ err }, 'FFmpeg encoding error');
@@ -3,7 +3,10 @@
3
3
  // SPDX-License-Identifier: Apache-2.0
4
4
  import { describe, expect, it } from 'vitest';
5
5
  import { ChatContext } from '../llm/chat_context.js';
6
+ import type { ModelUsage } from '../metrics/model_usage.js';
6
7
  import type { AgentSessionOptions, VoiceOptions } from './agent_session.js';
8
+ import { AgentSessionEventTypes, createSessionUsageUpdatedEvent } from './events.js';
9
+ import type { AgentSessionUsage } from './index.js';
7
10
  import { createSessionReport, sessionReportToJSON } from './report.js';
8
11
 
9
12
  type ReportOptions = AgentSessionOptions & Partial<VoiceOptions>;
@@ -133,4 +136,79 @@ describe('sessionReportToJSON', () => {
133
136
  max_tool_steps: 3,
134
137
  });
135
138
  });
139
+
140
+ it('serializes model usage as usage', () => {
141
+ const usage: ModelUsage[] = [
142
+ {
143
+ type: 'tts_usage',
144
+ provider: 'elevenlabs',
145
+ model: 'eleven_flash_v2_5',
146
+ inputTokens: 0,
147
+ outputTokens: 0,
148
+ charactersCount: 42,
149
+ audioDurationMs: 1200,
150
+ },
151
+ ];
152
+
153
+ const report = createSessionReport({
154
+ jobId: 'job',
155
+ roomId: 'room-id',
156
+ room: 'room',
157
+ options: baseOptions(),
158
+ events: [],
159
+ chatHistory: ChatContext.empty(),
160
+ enableRecording: false,
161
+ timestamp: 0,
162
+ startedAt: 0,
163
+ modelUsage: usage,
164
+ });
165
+
166
+ const payload = sessionReportToJSON(report);
167
+ expect(payload.usage).toEqual([
168
+ {
169
+ type: 'tts_usage',
170
+ provider: 'elevenlabs',
171
+ model: 'eleven_flash_v2_5',
172
+ charactersCount: 42,
173
+ audioDurationMs: 1200,
174
+ },
175
+ ]);
176
+ });
177
+
178
+ it('omits session usage update events from serialized events', () => {
179
+ const report = createSessionReport({
180
+ jobId: 'job',
181
+ roomId: 'room-id',
182
+ room: 'room',
183
+ options: baseOptions(),
184
+ events: [
185
+ createSessionUsageUpdatedEvent({
186
+ usage: {
187
+ modelUsage: [
188
+ {
189
+ type: 'tts_usage',
190
+ provider: 'elevenlabs',
191
+ model: 'eleven_flash_v2_5',
192
+ },
193
+ ],
194
+ },
195
+ createdAt: 123,
196
+ }),
197
+ ],
198
+ chatHistory: ChatContext.empty(),
199
+ enableRecording: false,
200
+ timestamp: 0,
201
+ startedAt: 0,
202
+ });
203
+
204
+ const payload = sessionReportToJSON(report);
205
+ expect(payload.events).toEqual([]);
206
+ });
207
+
208
+ it('exports AgentSessionUsage from the voice barrel', () => {
209
+ const usage: AgentSessionUsage = { modelUsage: [] };
210
+ const eventType: AgentSessionEventTypes = AgentSessionEventTypes.SessionUsageUpdated;
211
+ expect(usage.modelUsage).toEqual([]);
212
+ expect(eventType).toBe('session_usage_updated');
213
+ });
136
214
  });
@@ -111,7 +111,7 @@ export function sessionReportToJSON(report: SessionReport): Record<string, unkno
111
111
  options.voiceOptions?.maxEndpointingDelay;
112
112
 
113
113
  for (const event of report.events) {
114
- if (event.type === 'metrics_collected') {
114
+ if (event.type === 'metrics_collected' || event.type === 'session_usage_updated') {
115
115
  continue; // metrics are too noisy, Cloud is using the chat_history as the source of truth
116
116
  }
117
117