@livekit/agents 1.0.45 → 1.0.47

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (225)
  1. package/dist/cli.cjs +14 -20
  2. package/dist/cli.cjs.map +1 -1
  3. package/dist/cli.d.ts.map +1 -1
  4. package/dist/cli.js +14 -20
  5. package/dist/cli.js.map +1 -1
  6. package/dist/ipc/job_proc_lazy_main.cjs +14 -5
  7. package/dist/ipc/job_proc_lazy_main.cjs.map +1 -1
  8. package/dist/ipc/job_proc_lazy_main.js +14 -5
  9. package/dist/ipc/job_proc_lazy_main.js.map +1 -1
  10. package/dist/llm/chat_context.cjs +19 -0
  11. package/dist/llm/chat_context.cjs.map +1 -1
  12. package/dist/llm/chat_context.d.cts +4 -0
  13. package/dist/llm/chat_context.d.ts +4 -0
  14. package/dist/llm/chat_context.d.ts.map +1 -1
  15. package/dist/llm/chat_context.js +19 -0
  16. package/dist/llm/chat_context.js.map +1 -1
  17. package/dist/llm/provider_format/index.cjs +2 -0
  18. package/dist/llm/provider_format/index.cjs.map +1 -1
  19. package/dist/llm/provider_format/index.d.cts +1 -1
  20. package/dist/llm/provider_format/index.d.ts +1 -1
  21. package/dist/llm/provider_format/index.d.ts.map +1 -1
  22. package/dist/llm/provider_format/index.js +6 -1
  23. package/dist/llm/provider_format/index.js.map +1 -1
  24. package/dist/llm/provider_format/openai.cjs +82 -2
  25. package/dist/llm/provider_format/openai.cjs.map +1 -1
  26. package/dist/llm/provider_format/openai.d.cts +1 -0
  27. package/dist/llm/provider_format/openai.d.ts +1 -0
  28. package/dist/llm/provider_format/openai.d.ts.map +1 -1
  29. package/dist/llm/provider_format/openai.js +80 -1
  30. package/dist/llm/provider_format/openai.js.map +1 -1
  31. package/dist/llm/provider_format/openai.test.cjs +326 -0
  32. package/dist/llm/provider_format/openai.test.cjs.map +1 -1
  33. package/dist/llm/provider_format/openai.test.js +327 -1
  34. package/dist/llm/provider_format/openai.test.js.map +1 -1
  35. package/dist/llm/provider_format/utils.cjs +4 -3
  36. package/dist/llm/provider_format/utils.cjs.map +1 -1
  37. package/dist/llm/provider_format/utils.d.ts.map +1 -1
  38. package/dist/llm/provider_format/utils.js +4 -3
  39. package/dist/llm/provider_format/utils.js.map +1 -1
  40. package/dist/llm/realtime.cjs.map +1 -1
  41. package/dist/llm/realtime.d.cts +1 -0
  42. package/dist/llm/realtime.d.ts +1 -0
  43. package/dist/llm/realtime.d.ts.map +1 -1
  44. package/dist/llm/realtime.js.map +1 -1
  45. package/dist/log.cjs +5 -2
  46. package/dist/log.cjs.map +1 -1
  47. package/dist/log.d.ts.map +1 -1
  48. package/dist/log.js +5 -2
  49. package/dist/log.js.map +1 -1
  50. package/dist/stream/deferred_stream.cjs +15 -6
  51. package/dist/stream/deferred_stream.cjs.map +1 -1
  52. package/dist/stream/deferred_stream.d.ts.map +1 -1
  53. package/dist/stream/deferred_stream.js +15 -6
  54. package/dist/stream/deferred_stream.js.map +1 -1
  55. package/dist/stream/index.cjs +3 -0
  56. package/dist/stream/index.cjs.map +1 -1
  57. package/dist/stream/index.d.cts +1 -0
  58. package/dist/stream/index.d.ts +1 -0
  59. package/dist/stream/index.d.ts.map +1 -1
  60. package/dist/stream/index.js +2 -0
  61. package/dist/stream/index.js.map +1 -1
  62. package/dist/stream/multi_input_stream.cjs +139 -0
  63. package/dist/stream/multi_input_stream.cjs.map +1 -0
  64. package/dist/stream/multi_input_stream.d.cts +55 -0
  65. package/dist/stream/multi_input_stream.d.ts +55 -0
  66. package/dist/stream/multi_input_stream.d.ts.map +1 -0
  67. package/dist/stream/multi_input_stream.js +115 -0
  68. package/dist/stream/multi_input_stream.js.map +1 -0
  69. package/dist/stream/multi_input_stream.test.cjs +340 -0
  70. package/dist/stream/multi_input_stream.test.cjs.map +1 -0
  71. package/dist/stream/multi_input_stream.test.js +339 -0
  72. package/dist/stream/multi_input_stream.test.js.map +1 -0
  73. package/dist/telemetry/trace_types.cjs +42 -0
  74. package/dist/telemetry/trace_types.cjs.map +1 -1
  75. package/dist/telemetry/trace_types.d.cts +14 -0
  76. package/dist/telemetry/trace_types.d.ts +14 -0
  77. package/dist/telemetry/trace_types.d.ts.map +1 -1
  78. package/dist/telemetry/trace_types.js +28 -0
  79. package/dist/telemetry/trace_types.js.map +1 -1
  80. package/dist/utils.cjs +44 -2
  81. package/dist/utils.cjs.map +1 -1
  82. package/dist/utils.d.cts +8 -0
  83. package/dist/utils.d.ts +8 -0
  84. package/dist/utils.d.ts.map +1 -1
  85. package/dist/utils.js +44 -2
  86. package/dist/utils.js.map +1 -1
  87. package/dist/utils.test.cjs +71 -0
  88. package/dist/utils.test.cjs.map +1 -1
  89. package/dist/utils.test.js +71 -0
  90. package/dist/utils.test.js.map +1 -1
  91. package/dist/version.cjs +1 -1
  92. package/dist/version.cjs.map +1 -1
  93. package/dist/version.d.cts +1 -1
  94. package/dist/version.d.ts +1 -1
  95. package/dist/version.d.ts.map +1 -1
  96. package/dist/version.js +1 -1
  97. package/dist/version.js.map +1 -1
  98. package/dist/voice/agent.cjs +144 -12
  99. package/dist/voice/agent.cjs.map +1 -1
  100. package/dist/voice/agent.d.cts +29 -4
  101. package/dist/voice/agent.d.ts +29 -4
  102. package/dist/voice/agent.d.ts.map +1 -1
  103. package/dist/voice/agent.js +140 -11
  104. package/dist/voice/agent.js.map +1 -1
  105. package/dist/voice/agent.test.cjs +120 -0
  106. package/dist/voice/agent.test.cjs.map +1 -1
  107. package/dist/voice/agent.test.js +122 -2
  108. package/dist/voice/agent.test.js.map +1 -1
  109. package/dist/voice/agent_activity.cjs +402 -292
  110. package/dist/voice/agent_activity.cjs.map +1 -1
  111. package/dist/voice/agent_activity.d.cts +35 -7
  112. package/dist/voice/agent_activity.d.ts +35 -7
  113. package/dist/voice/agent_activity.d.ts.map +1 -1
  114. package/dist/voice/agent_activity.js +402 -287
  115. package/dist/voice/agent_activity.js.map +1 -1
  116. package/dist/voice/agent_session.cjs +156 -44
  117. package/dist/voice/agent_session.cjs.map +1 -1
  118. package/dist/voice/agent_session.d.cts +22 -9
  119. package/dist/voice/agent_session.d.ts +22 -9
  120. package/dist/voice/agent_session.d.ts.map +1 -1
  121. package/dist/voice/agent_session.js +156 -44
  122. package/dist/voice/agent_session.js.map +1 -1
  123. package/dist/voice/audio_recognition.cjs +89 -36
  124. package/dist/voice/audio_recognition.cjs.map +1 -1
  125. package/dist/voice/audio_recognition.d.cts +22 -1
  126. package/dist/voice/audio_recognition.d.ts +22 -1
  127. package/dist/voice/audio_recognition.d.ts.map +1 -1
  128. package/dist/voice/audio_recognition.js +93 -36
  129. package/dist/voice/audio_recognition.js.map +1 -1
  130. package/dist/voice/audio_recognition_span.test.cjs +233 -0
  131. package/dist/voice/audio_recognition_span.test.cjs.map +1 -0
  132. package/dist/voice/audio_recognition_span.test.js +232 -0
  133. package/dist/voice/audio_recognition_span.test.js.map +1 -0
  134. package/dist/voice/generation.cjs +39 -19
  135. package/dist/voice/generation.cjs.map +1 -1
  136. package/dist/voice/generation.d.ts.map +1 -1
  137. package/dist/voice/generation.js +44 -20
  138. package/dist/voice/generation.js.map +1 -1
  139. package/dist/voice/index.cjs +2 -0
  140. package/dist/voice/index.cjs.map +1 -1
  141. package/dist/voice/index.d.cts +1 -1
  142. package/dist/voice/index.d.ts +1 -1
  143. package/dist/voice/index.d.ts.map +1 -1
  144. package/dist/voice/index.js +2 -1
  145. package/dist/voice/index.js.map +1 -1
  146. package/dist/voice/io.cjs +6 -3
  147. package/dist/voice/io.cjs.map +1 -1
  148. package/dist/voice/io.d.cts +3 -2
  149. package/dist/voice/io.d.ts +3 -2
  150. package/dist/voice/io.d.ts.map +1 -1
  151. package/dist/voice/io.js +6 -3
  152. package/dist/voice/io.js.map +1 -1
  153. package/dist/voice/recorder_io/recorder_io.cjs +3 -1
  154. package/dist/voice/recorder_io/recorder_io.cjs.map +1 -1
  155. package/dist/voice/recorder_io/recorder_io.d.ts.map +1 -1
  156. package/dist/voice/recorder_io/recorder_io.js +3 -1
  157. package/dist/voice/recorder_io/recorder_io.js.map +1 -1
  158. package/dist/voice/room_io/_input.cjs +17 -17
  159. package/dist/voice/room_io/_input.cjs.map +1 -1
  160. package/dist/voice/room_io/_input.d.cts +2 -2
  161. package/dist/voice/room_io/_input.d.ts +2 -2
  162. package/dist/voice/room_io/_input.d.ts.map +1 -1
  163. package/dist/voice/room_io/_input.js +7 -6
  164. package/dist/voice/room_io/_input.js.map +1 -1
  165. package/dist/voice/room_io/room_io.cjs +9 -0
  166. package/dist/voice/room_io/room_io.cjs.map +1 -1
  167. package/dist/voice/room_io/room_io.d.cts +3 -1
  168. package/dist/voice/room_io/room_io.d.ts +3 -1
  169. package/dist/voice/room_io/room_io.d.ts.map +1 -1
  170. package/dist/voice/room_io/room_io.js +9 -0
  171. package/dist/voice/room_io/room_io.js.map +1 -1
  172. package/dist/voice/speech_handle.cjs +7 -1
  173. package/dist/voice/speech_handle.cjs.map +1 -1
  174. package/dist/voice/speech_handle.d.cts +2 -0
  175. package/dist/voice/speech_handle.d.ts +2 -0
  176. package/dist/voice/speech_handle.d.ts.map +1 -1
  177. package/dist/voice/speech_handle.js +8 -2
  178. package/dist/voice/speech_handle.js.map +1 -1
  179. package/dist/voice/testing/run_result.cjs +66 -15
  180. package/dist/voice/testing/run_result.cjs.map +1 -1
  181. package/dist/voice/testing/run_result.d.cts +14 -3
  182. package/dist/voice/testing/run_result.d.ts +14 -3
  183. package/dist/voice/testing/run_result.d.ts.map +1 -1
  184. package/dist/voice/testing/run_result.js +66 -15
  185. package/dist/voice/testing/run_result.js.map +1 -1
  186. package/dist/voice/utils.cjs +47 -0
  187. package/dist/voice/utils.cjs.map +1 -0
  188. package/dist/voice/utils.d.cts +4 -0
  189. package/dist/voice/utils.d.ts +4 -0
  190. package/dist/voice/utils.d.ts.map +1 -0
  191. package/dist/voice/utils.js +23 -0
  192. package/dist/voice/utils.js.map +1 -0
  193. package/package.json +1 -1
  194. package/src/cli.ts +20 -33
  195. package/src/ipc/job_proc_lazy_main.ts +16 -5
  196. package/src/llm/chat_context.ts +35 -0
  197. package/src/llm/provider_format/index.ts +7 -2
  198. package/src/llm/provider_format/openai.test.ts +385 -1
  199. package/src/llm/provider_format/openai.ts +103 -0
  200. package/src/llm/provider_format/utils.ts +6 -4
  201. package/src/llm/realtime.ts +1 -0
  202. package/src/log.ts +5 -2
  203. package/src/stream/deferred_stream.ts +17 -6
  204. package/src/stream/index.ts +1 -0
  205. package/src/stream/multi_input_stream.test.ts +540 -0
  206. package/src/stream/multi_input_stream.ts +172 -0
  207. package/src/telemetry/trace_types.ts +18 -0
  208. package/src/utils.test.ts +87 -0
  209. package/src/utils.ts +52 -2
  210. package/src/version.ts +1 -1
  211. package/src/voice/agent.test.ts +140 -2
  212. package/src/voice/agent.ts +189 -10
  213. package/src/voice/agent_activity.ts +449 -286
  214. package/src/voice/agent_session.ts +195 -51
  215. package/src/voice/audio_recognition.ts +118 -38
  216. package/src/voice/audio_recognition_span.test.ts +261 -0
  217. package/src/voice/generation.ts +52 -23
  218. package/src/voice/index.ts +1 -1
  219. package/src/voice/io.ts +7 -4
  220. package/src/voice/recorder_io/recorder_io.ts +2 -1
  221. package/src/voice/room_io/_input.ts +11 -7
  222. package/src/voice/room_io/room_io.ts +12 -0
  223. package/src/voice/speech_handle.ts +9 -2
  224. package/src/voice/testing/run_result.ts +81 -23
  225. package/src/voice/utils.ts +29 -0
@@ -0,0 +1,261 @@
1
+ // SPDX-FileCopyrightText: 2026 LiveKit, Inc.
2
+ //
3
+ // SPDX-License-Identifier: Apache-2.0
4
+ import { ParticipantKind } from '@livekit/rtc-node';
5
+ import { InMemorySpanExporter, SimpleSpanProcessor } from '@opentelemetry/sdk-trace-base';
6
+ import { NodeTracerProvider } from '@opentelemetry/sdk-trace-node';
7
+ import { describe, expect, it, vi } from 'vitest';
8
+ import { initializeLogger } from '../log.js';
9
+ import { type SpeechEvent, SpeechEventType } from '../stt/stt.js';
10
+ import { setTracerProvider } from '../telemetry/index.js';
11
+ import { VAD, type VADEvent, VADEventType, type VADStream } from '../vad.js';
12
+ import { AudioRecognition, type _TurnDetector } from './audio_recognition.js';
13
+
14
+ function setupInMemoryTracing() {
15
+ const exporter = new InMemorySpanExporter();
16
+ const provider = new NodeTracerProvider();
17
+ provider.addSpanProcessor(new SimpleSpanProcessor(exporter));
18
+ provider.register();
19
+ setTracerProvider(provider);
20
+ return { exporter };
21
+ }
22
+
23
+ function spanByName(spans: any[], name: string) {
24
+ return spans.find((s) => s.name === name);
25
+ }
26
+
27
+ class FakeVADStream extends (Object as unknown as { new (): VADStream }) {
28
+ // We intentionally avoid extending the real VADStream (it is not exported as a value in JS output
29
+ // in some bundling contexts). Instead we emulate the async iterator shape used by AudioRecognition.
30
+ private events: VADEvent[];
31
+ private idx = 0;
32
+ constructor(events: VADEvent[]) {
33
+ super();
34
+ this.events = events;
35
+ }
36
+ updateInputStream() {}
37
+ detachInputStream() {}
38
+ close() {}
39
+ [Symbol.asyncIterator]() {
40
+ return this;
41
+ }
42
+ async next(): Promise<IteratorResult<VADEvent>> {
43
+ if (this.idx >= this.events.length) {
44
+ return { done: true, value: undefined };
45
+ }
46
+ const value = this.events[this.idx++]!;
47
+ return { done: false, value };
48
+ }
49
+ }
50
+
51
+ class FakeVAD extends VAD {
52
+ label = 'fake-vad';
53
+ private events: VADEvent[];
54
+ constructor(events: VADEvent[]) {
55
+ super({ updateInterval: 1 });
56
+ this.events = events;
57
+ }
58
+ stream(): any {
59
+ return new FakeVADStream(this.events);
60
+ }
61
+ }
62
+
63
+ const alwaysTrueTurnDetector: _TurnDetector = {
64
+ supportsLanguage: async () => true,
65
+ unlikelyThreshold: async () => undefined,
66
+ predictEndOfTurn: async () => 1.0,
67
+ };
68
+
69
+ describe('AudioRecognition user_turn span parity', () => {
70
+ initializeLogger({ pretty: false, level: 'silent' });
71
+
72
+ it('creates user_turn and parents eou_detection under it (stt mode)', async () => {
73
+ const { exporter } = setupInMemoryTracing();
74
+
75
+ const hooks = {
76
+ onStartOfSpeech: vi.fn(),
77
+ onVADInferenceDone: vi.fn(),
78
+ onEndOfSpeech: vi.fn(),
79
+ onInterimTranscript: vi.fn(),
80
+ onFinalTranscript: vi.fn(),
81
+ onPreemptiveGeneration: vi.fn(),
82
+ retrieveChatCtx: () =>
83
+ ({
84
+ copy() {
85
+ return this;
86
+ },
87
+ addMessage() {},
88
+ toJSON() {
89
+ return { items: [] };
90
+ },
91
+ }) as any,
92
+ onEndOfTurn: vi.fn(async () => true),
93
+ };
94
+
95
+ const sttEvents: SpeechEvent[] = [
96
+ { type: SpeechEventType.START_OF_SPEECH },
97
+ {
98
+ type: SpeechEventType.FINAL_TRANSCRIPT,
99
+ alternatives: [
100
+ {
101
+ language: 'en',
102
+ text: 'hello',
103
+ startTime: 0,
104
+ endTime: 0,
105
+ confidence: 0.9,
106
+ },
107
+ ],
108
+ },
109
+ { type: SpeechEventType.END_OF_SPEECH },
110
+ ];
111
+
112
+ const sttNode = async () =>
113
+ new ReadableStream<SpeechEvent>({
114
+ start(controller) {
115
+ for (const ev of sttEvents) controller.enqueue(ev);
116
+ controller.close();
117
+ },
118
+ });
119
+
120
+ const ar = new AudioRecognition({
121
+ recognitionHooks: hooks as any,
122
+ stt: sttNode as any,
123
+ vad: undefined,
124
+ turnDetector: alwaysTrueTurnDetector,
125
+ turnDetectionMode: 'stt',
126
+ minEndpointingDelay: 0,
127
+ maxEndpointingDelay: 0,
128
+ sttModel: 'deepgram-nova2',
129
+ sttProvider: 'deepgram',
130
+ getLinkedParticipant: () => ({ sid: 'p1', identity: 'bob', kind: ParticipantKind.AGENT }),
131
+ });
132
+
133
+ await ar.start();
134
+ // allow background task to drain
135
+ await new Promise((r) => setTimeout(r, 20));
136
+ await ar.close();
137
+
138
+ const spans = exporter.getFinishedSpans();
139
+ const userTurn = spanByName(spans, 'user_turn');
140
+ const eou = spanByName(spans, 'eou_detection');
141
+ expect(userTurn, 'user_turn span missing').toBeTruthy();
142
+ expect(eou, 'eou_detection span missing').toBeTruthy();
143
+
144
+ expect(eou.parentSpanId).toBe(userTurn.spanContext().spanId);
145
+
146
+ // creation-time attributes
147
+ expect(userTurn.attributes['lk.participant_id']).toBe('p1');
148
+ expect(userTurn.attributes['lk.participant_identity']).toBe('bob');
149
+ expect(userTurn.attributes['lk.participant_kind']).toBe('AGENT');
150
+ expect(userTurn.attributes['gen_ai.request.model']).toBe('deepgram-nova2');
151
+ expect(userTurn.attributes['gen_ai.provider.name']).toBe('deepgram');
152
+
153
+ // end-of-turn attributes
154
+ expect(userTurn.attributes['lk.user_transcript']).toContain('hello');
155
+ expect(userTurn.attributes['lk.transcript_confidence']).toBeGreaterThan(0);
156
+ });
157
+
158
+ it('creates user_turn from VAD startTime (vad mode) and keeps same parenting', async () => {
159
+ const { exporter } = setupInMemoryTracing();
160
+
161
+ const hooks = {
162
+ onStartOfSpeech: vi.fn(),
163
+ onVADInferenceDone: vi.fn(),
164
+ onEndOfSpeech: vi.fn(),
165
+ onInterimTranscript: vi.fn(),
166
+ onFinalTranscript: vi.fn(),
167
+ onPreemptiveGeneration: vi.fn(),
168
+ retrieveChatCtx: () =>
169
+ ({
170
+ copy() {
171
+ return this;
172
+ },
173
+ addMessage() {},
174
+ toJSON() {
175
+ return { items: [] };
176
+ },
177
+ }) as any,
178
+ onEndOfTurn: vi.fn(async () => true),
179
+ };
180
+
181
+ const now = Date.now();
182
+ const vadEvents: VADEvent[] = [
183
+ {
184
+ type: VADEventType.START_OF_SPEECH,
185
+ samplesIndex: 0,
186
+ timestamp: now,
187
+ speechDuration: 100,
188
+ silenceDuration: 0,
189
+ frames: [],
190
+ probability: 0,
191
+ inferenceDuration: 0,
192
+ speaking: true,
193
+ rawAccumulatedSilence: 0,
194
+ rawAccumulatedSpeech: 0,
195
+ },
196
+ {
197
+ type: VADEventType.END_OF_SPEECH,
198
+ samplesIndex: 0,
199
+ timestamp: now + 200,
200
+ speechDuration: 100,
201
+ silenceDuration: 100,
202
+ frames: [],
203
+ probability: 0,
204
+ inferenceDuration: 0,
205
+ speaking: false,
206
+ rawAccumulatedSilence: 0,
207
+ rawAccumulatedSpeech: 0,
208
+ },
209
+ ];
210
+
211
+ const sttEvents: SpeechEvent[] = [
212
+ {
213
+ type: SpeechEventType.FINAL_TRANSCRIPT,
214
+ alternatives: [
215
+ {
216
+ language: 'en',
217
+ text: 'test',
218
+ startTime: 0,
219
+ endTime: 0,
220
+ confidence: 0.8,
221
+ },
222
+ ],
223
+ },
224
+ ];
225
+
226
+ const sttNode = async () =>
227
+ new ReadableStream<SpeechEvent>({
228
+ start(controller) {
229
+ for (const ev of sttEvents) controller.enqueue(ev);
230
+ controller.close();
231
+ },
232
+ });
233
+
234
+ const ar = new AudioRecognition({
235
+ recognitionHooks: hooks as any,
236
+ stt: sttNode as any,
237
+ vad: new FakeVAD(vadEvents) as any,
238
+ turnDetector: alwaysTrueTurnDetector,
239
+ turnDetectionMode: 'vad',
240
+ minEndpointingDelay: 0,
241
+ maxEndpointingDelay: 0,
242
+ sttModel: 'stt-model',
243
+ sttProvider: 'stt-provider',
244
+ getLinkedParticipant: () => ({ sid: 'p2', identity: 'alice', kind: ParticipantKind.AGENT }),
245
+ });
246
+
247
+ await ar.start();
248
+ await new Promise((r) => setTimeout(r, 20));
249
+ await ar.close();
250
+
251
+ const spans = exporter.getFinishedSpans();
252
+ const userTurn = spanByName(spans, 'user_turn');
253
+ const eou = spanByName(spans, 'eou_detection');
254
+ expect(userTurn).toBeTruthy();
255
+ expect(eou).toBeTruthy();
256
+ expect(eou.parentSpanId).toBe(userTurn.spanContext().spanId);
257
+
258
+ expect(hooks.onStartOfSpeech).toHaveBeenCalled();
259
+ expect(hooks.onEndOfSpeech).toHaveBeenCalled();
260
+ });
261
+ });
@@ -26,7 +26,13 @@ import { IdentityTransform } from '../stream/identity_transform.js';
26
26
  import { traceTypes, tracer } from '../telemetry/index.js';
27
27
  import { USERDATA_TIMED_TRANSCRIPT } from '../types.js';
28
28
  import { Future, Task, shortuuid, toError, waitForAbort } from '../utils.js';
29
- import { type Agent, type ModelSettings, asyncLocalStorage, isStopResponse } from './agent.js';
29
+ import {
30
+ type Agent,
31
+ type ModelSettings,
32
+ _setActivityTaskInfo,
33
+ functionCallStorage,
34
+ isStopResponse,
35
+ } from './agent.js';
30
36
  import type { AgentSession } from './agent_session.js';
31
37
  import {
32
38
  AudioOutput,
@@ -719,7 +725,7 @@ export interface _AudioOut {
719
725
 
720
726
  async function forwardAudio(
721
727
  ttsStream: ReadableStream<AudioFrame>,
722
- audioOuput: AudioOutput,
728
+ audioOutput: AudioOutput,
723
729
  out: _AudioOut,
724
730
  signal?: AbortSignal,
725
731
  ): Promise<void> {
@@ -733,8 +739,8 @@ async function forwardAudio(
733
739
  };
734
740
 
735
741
  try {
736
- audioOuput.on(AudioOutput.EVENT_PLAYBACK_STARTED, onPlaybackStarted);
737
- audioOuput.resume();
742
+ audioOutput.on(AudioOutput.EVENT_PLAYBACK_STARTED, onPlaybackStarted);
743
+ audioOutput.resume();
738
744
 
739
745
  while (true) {
740
746
  if (signal?.aborted) {
@@ -748,36 +754,36 @@ async function forwardAudio(
748
754
 
749
755
  if (
750
756
  !out.firstFrameFut.done &&
751
- audioOuput.sampleRate &&
752
- audioOuput.sampleRate !== frame.sampleRate &&
757
+ audioOutput.sampleRate &&
758
+ audioOutput.sampleRate !== frame.sampleRate &&
753
759
  !resampler
754
760
  ) {
755
- resampler = new AudioResampler(frame.sampleRate, audioOuput.sampleRate, 1);
761
+ resampler = new AudioResampler(frame.sampleRate, audioOutput.sampleRate, 1);
756
762
  }
757
763
 
758
764
  if (resampler) {
759
765
  for (const f of resampler.push(frame)) {
760
- await audioOuput.captureFrame(f);
766
+ await audioOutput.captureFrame(f);
761
767
  }
762
768
  } else {
763
- await audioOuput.captureFrame(frame);
769
+ await audioOutput.captureFrame(frame);
764
770
  }
765
771
  }
766
772
 
767
773
  if (resampler) {
768
774
  for (const f of resampler.flush()) {
769
- await audioOuput.captureFrame(f);
775
+ await audioOutput.captureFrame(f);
770
776
  }
771
777
  }
772
778
  } finally {
773
- audioOuput.off(AudioOutput.EVENT_PLAYBACK_STARTED, onPlaybackStarted);
779
+ audioOutput.off(AudioOutput.EVENT_PLAYBACK_STARTED, onPlaybackStarted);
774
780
 
775
781
  if (!out.firstFrameFut.done) {
776
782
  out.firstFrameFut.reject(new Error('audio forwarding cancelled before playback started'));
777
783
  }
778
784
 
779
785
  reader?.releaseLock();
780
- audioOuput.flush();
786
+ audioOutput.flush();
781
787
  }
782
788
  }
783
789
 
@@ -836,7 +842,7 @@ export function performToolExecutions({
836
842
  const signal = controller.signal;
837
843
  const reader = toolCallStream.getReader();
838
844
 
839
- const tasks: Promise<any>[] = [];
845
+ const tasks: Task<void>[] = [];
840
846
  while (!signal.aborted) {
841
847
  const { done, value: toolCall } = await reader.read();
842
848
  if (signal.aborted) break;
@@ -929,14 +935,6 @@ export function performToolExecutions({
929
935
  'Executing LLM tool call',
930
936
  );
931
937
 
932
- const toolExecution = asyncLocalStorage.run({ functionCall: toolCall }, async () => {
933
- return await tool.execute(parsedArgs, {
934
- ctx: new RunContext(session, speechHandle, toolCall),
935
- toolCallId: toolCall.callId,
936
- abortSignal: signal,
937
- });
938
- });
939
-
940
938
  const _tracableToolExecutionImpl = async (toolExecTask: Promise<unknown>, span: Span) => {
941
939
  span.setAttribute(traceTypes.ATTR_FUNCTION_TOOL_NAME, toolCall.name);
942
940
  span.setAttribute(traceTypes.ATTR_FUNCTION_TOOL_ARGS, toolCall.args);
@@ -993,11 +991,42 @@ export function performToolExecutions({
993
991
  name: 'function_tool',
994
992
  });
995
993
 
994
+ const toolTask = Task.from(
995
+ async () => {
996
+ // Ensure this task is marked inline before user tool code executes.
997
+ const currentTask = Task.current();
998
+ if (currentTask) {
999
+ _setActivityTaskInfo(currentTask, {
1000
+ speechHandle,
1001
+ functionCall: toolCall,
1002
+ inlineTask: true,
1003
+ });
1004
+ }
1005
+
1006
+ const toolExecution = functionCallStorage.run({ functionCall: toolCall }, async () => {
1007
+ return await tool.execute(parsedArgs, {
1008
+ ctx: new RunContext(session, speechHandle, toolCall),
1009
+ toolCallId: toolCall.callId,
1010
+ abortSignal: signal,
1011
+ });
1012
+ });
1013
+
1014
+ await tracableToolExecution(toolExecution);
1015
+ },
1016
+ controller,
1017
+ `performToolExecution:${toolCall.name}`,
1018
+ );
1019
+
1020
+ _setActivityTaskInfo(toolTask, {
1021
+ speechHandle,
1022
+ functionCall: toolCall,
1023
+ inlineTask: true,
1024
+ });
996
1025
  // wait, not cancelling all tool calling tasks
997
- tasks.push(tracableToolExecution(toolExecution));
1026
+ tasks.push(toolTask);
998
1027
  }
999
1028
 
1000
- await Promise.allSettled(tasks);
1029
+ await Promise.allSettled(tasks.map((task) => task.result));
1001
1030
  if (toolOutput.output.length > 0) {
1002
1031
  logger.debug(
1003
1032
  {
@@ -1,7 +1,7 @@
1
1
  // SPDX-FileCopyrightText: 2025 LiveKit, Inc.
2
2
  //
3
3
  // SPDX-License-Identifier: Apache-2.0
4
- export { Agent, StopResponse, type AgentOptions, type ModelSettings } from './agent.js';
4
+ export { Agent, AgentTask, StopResponse, type AgentOptions, type ModelSettings } from './agent.js';
5
5
  export { AgentSession, type AgentSessionOptions, type VoiceOptions } from './agent_session.js';
6
6
  export * from './avatar/index.js';
7
7
  export * from './background_audio.js';
package/src/voice/io.ts CHANGED
@@ -8,7 +8,7 @@ import type { ChatContext } from '../llm/chat_context.js';
8
8
  import type { ChatChunk } from '../llm/llm.js';
9
9
  import type { ToolContext } from '../llm/tool_context.js';
10
10
  import { log } from '../log.js';
11
- import { DeferredReadableStream } from '../stream/deferred_stream.js';
11
+ import { MultiInputStream } from '../stream/multi_input_stream.js';
12
12
  import type { SpeechEvent } from '../stt/stt.js';
13
13
  import { Future } from '../utils.js';
14
14
  import type { ModelSettings } from './agent.js';
@@ -84,11 +84,14 @@ export interface AudioOutputCapabilities {
84
84
  }
85
85
 
86
86
  export abstract class AudioInput {
87
- protected deferredStream: DeferredReadableStream<AudioFrame> =
88
- new DeferredReadableStream<AudioFrame>();
87
+ protected multiStream: MultiInputStream<AudioFrame> = new MultiInputStream<AudioFrame>();
89
88
 
90
89
  get stream(): ReadableStream<AudioFrame> {
91
- return this.deferredStream.stream;
90
+ return this.multiStream.stream;
91
+ }
92
+
93
+ async close(): Promise<void> {
94
+ await this.multiStream.close();
92
95
  }
93
96
 
94
97
  onAttached(): void {}
@@ -105,6 +105,7 @@ export class RecorderIO {
105
105
  await this.outChan.close();
106
106
  await this.closeFuture.await;
107
107
  await cancelAndWait([this.forwardTask!, this.encodeTask!]);
108
+ await this.inRecord?.close();
108
109
 
109
110
  this.started = false;
110
111
  } finally {
@@ -378,7 +379,7 @@ class RecorderAudioInput extends AudioInput {
378
379
  this.source = source;
379
380
 
380
381
  // Set up the intercepting stream
381
- this.deferredStream.setSource(this.createInterceptingStream());
382
+ this.multiStream.addInputStream(this.createInterceptingStream());
382
383
  }
383
384
 
384
385
  /**
@@ -1,9 +1,10 @@
1
1
  // SPDX-FileCopyrightText: 2025 LiveKit, Inc.
2
2
  //
3
3
  // SPDX-License-Identifier: Apache-2.0
4
- import { type AudioFrame, FrameProcessor } from '@livekit/rtc-node';
5
4
  import {
5
+ type AudioFrame,
6
6
  AudioStream,
7
+ FrameProcessor,
7
8
  type NoiseCancellationOptions,
8
9
  RemoteParticipant,
9
10
  type RemoteTrack,
@@ -25,7 +26,9 @@ export class ParticipantAudioInputStream extends AudioInput {
25
26
  private frameProcessor?: FrameProcessor<AudioFrame>;
26
27
  private publication: RemoteTrackPublication | null = null;
27
28
  private participantIdentity: string | null = null;
29
+ private currentInputId: string | null = null;
28
30
  private logger = log();
31
+
29
32
  constructor({
30
33
  room,
31
34
  sampleRate,
@@ -121,8 +124,9 @@ export class ParticipantAudioInputStream extends AudioInput {
121
124
  };
122
125
 
123
126
  private closeStream() {
124
- if (this.deferredStream.isSourceSet) {
125
- this.deferredStream.detachSource();
127
+ if (this.currentInputId) {
128
+ void this.multiStream.removeInputStream(this.currentInputId);
129
+ this.currentInputId = null;
126
130
  }
127
131
 
128
132
  this.publication = null;
@@ -143,7 +147,7 @@ export class ParticipantAudioInputStream extends AudioInput {
143
147
  }
144
148
  this.closeStream();
145
149
  this.publication = publication;
146
- this.deferredStream.setSource(
150
+ this.currentInputId = this.multiStream.addInputStream(
147
151
  resampleStream({
148
152
  stream: this.createStream(track),
149
153
  outputRate: this.sampleRate,
@@ -179,14 +183,14 @@ export class ParticipantAudioInputStream extends AudioInput {
179
183
  }) as unknown as ReadableStream<AudioFrame>;
180
184
  }
181
185
 
182
- async close() {
186
+ override async close() {
183
187
  this.room.off(RoomEvent.TrackSubscribed, this.onTrackSubscribed);
184
188
  this.room.off(RoomEvent.TrackUnpublished, this.onTrackUnpublished);
185
189
  this.room.off(RoomEvent.TokenRefreshed, this.onTokenRefreshed);
186
190
  this.closeStream();
191
+ await super.close();
192
+
187
193
  this.frameProcessor?.close();
188
194
  this.frameProcessor = undefined;
189
- // Ignore errors - stream may be locked by RecorderIO or already cancelled
190
- await this.deferredStream.stream.cancel().catch(() => {});
191
195
  }
192
196
  }
@@ -376,6 +376,18 @@ export class RoomIO {
376
376
  return this.participantAvailableFuture.done;
377
377
  }
378
378
 
379
+ get linkedParticipant(): RemoteParticipant | undefined {
380
+ if (!this.isParticipantAvailable) {
381
+ return undefined;
382
+ }
383
+
384
+ return this.participantAvailableFuture.result;
385
+ }
386
+
387
+ get localParticipant(): Participant | undefined {
388
+ return this.room.localParticipant ?? undefined;
389
+ }
390
+
379
391
  /** Switch to a different participant */
380
392
  setParticipant(participantIdentity: string | null) {
381
393
  this.logger.debug({ participantIdentity }, 'setting participant');
@@ -5,7 +5,7 @@ import type { Context } from '@opentelemetry/api';
5
5
  import type { ChatItem } from '../llm/index.js';
6
6
  import type { Task } from '../utils.js';
7
7
  import { Event, Future, shortuuid } from '../utils.js';
8
- import { asyncLocalStorage } from './agent.js';
8
+ import { functionCallStorage } from './agent.js';
9
9
 
10
10
  /** Symbol used to identify SpeechHandle instances */
11
11
  const SPEECH_HANDLE_SYMBOL = Symbol.for('livekit.agents.SpeechHandle');
@@ -46,6 +46,9 @@ export class SpeechHandle {
46
46
  /** @internal - OpenTelemetry context for the agent turn span */
47
47
  _agentTurnContext?: Context;
48
48
 
49
+ /** @internal - used by AgentTask/RunResult final output plumbing */
50
+ _maybeRunFinalOutput?: unknown;
51
+
49
52
  private itemAddedCallbacks: Set<(item: ChatItem) => void> = new Set();
50
53
  private doneCallbacks: Set<(sh: SpeechHandle) => void> = new Set();
51
54
 
@@ -148,7 +151,7 @@ export class SpeechHandle {
148
151
  * has entirely played out, including any tool calls and response follow-ups.
149
152
  */
150
153
  async waitForPlayout(): Promise<void> {
151
- const store = asyncLocalStorage.getStore();
154
+ const store = functionCallStorage.getStore();
152
155
  if (store && store?.functionCall) {
153
156
  throw new Error(
154
157
  `Cannot call 'SpeechHandle.waitForPlayout()' from inside the function tool '${store.functionCall.name}'. ` +
@@ -167,6 +170,10 @@ export class SpeechHandle {
167
170
  }
168
171
 
169
172
  addDoneCallback(callback: (sh: SpeechHandle) => void) {
173
+ if (this.done()) {
174
+ queueMicrotask(() => callback(this));
175
+ return;
176
+ }
170
177
  this.doneCallbacks.add(callback);
171
178
  }
172
179