@livekit/agents 1.0.45 → 1.0.47

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (225)
  1. package/dist/cli.cjs +14 -20
  2. package/dist/cli.cjs.map +1 -1
  3. package/dist/cli.d.ts.map +1 -1
  4. package/dist/cli.js +14 -20
  5. package/dist/cli.js.map +1 -1
  6. package/dist/ipc/job_proc_lazy_main.cjs +14 -5
  7. package/dist/ipc/job_proc_lazy_main.cjs.map +1 -1
  8. package/dist/ipc/job_proc_lazy_main.js +14 -5
  9. package/dist/ipc/job_proc_lazy_main.js.map +1 -1
  10. package/dist/llm/chat_context.cjs +19 -0
  11. package/dist/llm/chat_context.cjs.map +1 -1
  12. package/dist/llm/chat_context.d.cts +4 -0
  13. package/dist/llm/chat_context.d.ts +4 -0
  14. package/dist/llm/chat_context.d.ts.map +1 -1
  15. package/dist/llm/chat_context.js +19 -0
  16. package/dist/llm/chat_context.js.map +1 -1
  17. package/dist/llm/provider_format/index.cjs +2 -0
  18. package/dist/llm/provider_format/index.cjs.map +1 -1
  19. package/dist/llm/provider_format/index.d.cts +1 -1
  20. package/dist/llm/provider_format/index.d.ts +1 -1
  21. package/dist/llm/provider_format/index.d.ts.map +1 -1
  22. package/dist/llm/provider_format/index.js +6 -1
  23. package/dist/llm/provider_format/index.js.map +1 -1
  24. package/dist/llm/provider_format/openai.cjs +82 -2
  25. package/dist/llm/provider_format/openai.cjs.map +1 -1
  26. package/dist/llm/provider_format/openai.d.cts +1 -0
  27. package/dist/llm/provider_format/openai.d.ts +1 -0
  28. package/dist/llm/provider_format/openai.d.ts.map +1 -1
  29. package/dist/llm/provider_format/openai.js +80 -1
  30. package/dist/llm/provider_format/openai.js.map +1 -1
  31. package/dist/llm/provider_format/openai.test.cjs +326 -0
  32. package/dist/llm/provider_format/openai.test.cjs.map +1 -1
  33. package/dist/llm/provider_format/openai.test.js +327 -1
  34. package/dist/llm/provider_format/openai.test.js.map +1 -1
  35. package/dist/llm/provider_format/utils.cjs +4 -3
  36. package/dist/llm/provider_format/utils.cjs.map +1 -1
  37. package/dist/llm/provider_format/utils.d.ts.map +1 -1
  38. package/dist/llm/provider_format/utils.js +4 -3
  39. package/dist/llm/provider_format/utils.js.map +1 -1
  40. package/dist/llm/realtime.cjs.map +1 -1
  41. package/dist/llm/realtime.d.cts +1 -0
  42. package/dist/llm/realtime.d.ts +1 -0
  43. package/dist/llm/realtime.d.ts.map +1 -1
  44. package/dist/llm/realtime.js.map +1 -1
  45. package/dist/log.cjs +5 -2
  46. package/dist/log.cjs.map +1 -1
  47. package/dist/log.d.ts.map +1 -1
  48. package/dist/log.js +5 -2
  49. package/dist/log.js.map +1 -1
  50. package/dist/stream/deferred_stream.cjs +15 -6
  51. package/dist/stream/deferred_stream.cjs.map +1 -1
  52. package/dist/stream/deferred_stream.d.ts.map +1 -1
  53. package/dist/stream/deferred_stream.js +15 -6
  54. package/dist/stream/deferred_stream.js.map +1 -1
  55. package/dist/stream/index.cjs +3 -0
  56. package/dist/stream/index.cjs.map +1 -1
  57. package/dist/stream/index.d.cts +1 -0
  58. package/dist/stream/index.d.ts +1 -0
  59. package/dist/stream/index.d.ts.map +1 -1
  60. package/dist/stream/index.js +2 -0
  61. package/dist/stream/index.js.map +1 -1
  62. package/dist/stream/multi_input_stream.cjs +139 -0
  63. package/dist/stream/multi_input_stream.cjs.map +1 -0
  64. package/dist/stream/multi_input_stream.d.cts +55 -0
  65. package/dist/stream/multi_input_stream.d.ts +55 -0
  66. package/dist/stream/multi_input_stream.d.ts.map +1 -0
  67. package/dist/stream/multi_input_stream.js +115 -0
  68. package/dist/stream/multi_input_stream.js.map +1 -0
  69. package/dist/stream/multi_input_stream.test.cjs +340 -0
  70. package/dist/stream/multi_input_stream.test.cjs.map +1 -0
  71. package/dist/stream/multi_input_stream.test.js +339 -0
  72. package/dist/stream/multi_input_stream.test.js.map +1 -0
  73. package/dist/telemetry/trace_types.cjs +42 -0
  74. package/dist/telemetry/trace_types.cjs.map +1 -1
  75. package/dist/telemetry/trace_types.d.cts +14 -0
  76. package/dist/telemetry/trace_types.d.ts +14 -0
  77. package/dist/telemetry/trace_types.d.ts.map +1 -1
  78. package/dist/telemetry/trace_types.js +28 -0
  79. package/dist/telemetry/trace_types.js.map +1 -1
  80. package/dist/utils.cjs +44 -2
  81. package/dist/utils.cjs.map +1 -1
  82. package/dist/utils.d.cts +8 -0
  83. package/dist/utils.d.ts +8 -0
  84. package/dist/utils.d.ts.map +1 -1
  85. package/dist/utils.js +44 -2
  86. package/dist/utils.js.map +1 -1
  87. package/dist/utils.test.cjs +71 -0
  88. package/dist/utils.test.cjs.map +1 -1
  89. package/dist/utils.test.js +71 -0
  90. package/dist/utils.test.js.map +1 -1
  91. package/dist/version.cjs +1 -1
  92. package/dist/version.cjs.map +1 -1
  93. package/dist/version.d.cts +1 -1
  94. package/dist/version.d.ts +1 -1
  95. package/dist/version.d.ts.map +1 -1
  96. package/dist/version.js +1 -1
  97. package/dist/version.js.map +1 -1
  98. package/dist/voice/agent.cjs +144 -12
  99. package/dist/voice/agent.cjs.map +1 -1
  100. package/dist/voice/agent.d.cts +29 -4
  101. package/dist/voice/agent.d.ts +29 -4
  102. package/dist/voice/agent.d.ts.map +1 -1
  103. package/dist/voice/agent.js +140 -11
  104. package/dist/voice/agent.js.map +1 -1
  105. package/dist/voice/agent.test.cjs +120 -0
  106. package/dist/voice/agent.test.cjs.map +1 -1
  107. package/dist/voice/agent.test.js +122 -2
  108. package/dist/voice/agent.test.js.map +1 -1
  109. package/dist/voice/agent_activity.cjs +402 -292
  110. package/dist/voice/agent_activity.cjs.map +1 -1
  111. package/dist/voice/agent_activity.d.cts +35 -7
  112. package/dist/voice/agent_activity.d.ts +35 -7
  113. package/dist/voice/agent_activity.d.ts.map +1 -1
  114. package/dist/voice/agent_activity.js +402 -287
  115. package/dist/voice/agent_activity.js.map +1 -1
  116. package/dist/voice/agent_session.cjs +156 -44
  117. package/dist/voice/agent_session.cjs.map +1 -1
  118. package/dist/voice/agent_session.d.cts +22 -9
  119. package/dist/voice/agent_session.d.ts +22 -9
  120. package/dist/voice/agent_session.d.ts.map +1 -1
  121. package/dist/voice/agent_session.js +156 -44
  122. package/dist/voice/agent_session.js.map +1 -1
  123. package/dist/voice/audio_recognition.cjs +89 -36
  124. package/dist/voice/audio_recognition.cjs.map +1 -1
  125. package/dist/voice/audio_recognition.d.cts +22 -1
  126. package/dist/voice/audio_recognition.d.ts +22 -1
  127. package/dist/voice/audio_recognition.d.ts.map +1 -1
  128. package/dist/voice/audio_recognition.js +93 -36
  129. package/dist/voice/audio_recognition.js.map +1 -1
  130. package/dist/voice/audio_recognition_span.test.cjs +233 -0
  131. package/dist/voice/audio_recognition_span.test.cjs.map +1 -0
  132. package/dist/voice/audio_recognition_span.test.js +232 -0
  133. package/dist/voice/audio_recognition_span.test.js.map +1 -0
  134. package/dist/voice/generation.cjs +39 -19
  135. package/dist/voice/generation.cjs.map +1 -1
  136. package/dist/voice/generation.d.ts.map +1 -1
  137. package/dist/voice/generation.js +44 -20
  138. package/dist/voice/generation.js.map +1 -1
  139. package/dist/voice/index.cjs +2 -0
  140. package/dist/voice/index.cjs.map +1 -1
  141. package/dist/voice/index.d.cts +1 -1
  142. package/dist/voice/index.d.ts +1 -1
  143. package/dist/voice/index.d.ts.map +1 -1
  144. package/dist/voice/index.js +2 -1
  145. package/dist/voice/index.js.map +1 -1
  146. package/dist/voice/io.cjs +6 -3
  147. package/dist/voice/io.cjs.map +1 -1
  148. package/dist/voice/io.d.cts +3 -2
  149. package/dist/voice/io.d.ts +3 -2
  150. package/dist/voice/io.d.ts.map +1 -1
  151. package/dist/voice/io.js +6 -3
  152. package/dist/voice/io.js.map +1 -1
  153. package/dist/voice/recorder_io/recorder_io.cjs +3 -1
  154. package/dist/voice/recorder_io/recorder_io.cjs.map +1 -1
  155. package/dist/voice/recorder_io/recorder_io.d.ts.map +1 -1
  156. package/dist/voice/recorder_io/recorder_io.js +3 -1
  157. package/dist/voice/recorder_io/recorder_io.js.map +1 -1
  158. package/dist/voice/room_io/_input.cjs +17 -17
  159. package/dist/voice/room_io/_input.cjs.map +1 -1
  160. package/dist/voice/room_io/_input.d.cts +2 -2
  161. package/dist/voice/room_io/_input.d.ts +2 -2
  162. package/dist/voice/room_io/_input.d.ts.map +1 -1
  163. package/dist/voice/room_io/_input.js +7 -6
  164. package/dist/voice/room_io/_input.js.map +1 -1
  165. package/dist/voice/room_io/room_io.cjs +9 -0
  166. package/dist/voice/room_io/room_io.cjs.map +1 -1
  167. package/dist/voice/room_io/room_io.d.cts +3 -1
  168. package/dist/voice/room_io/room_io.d.ts +3 -1
  169. package/dist/voice/room_io/room_io.d.ts.map +1 -1
  170. package/dist/voice/room_io/room_io.js +9 -0
  171. package/dist/voice/room_io/room_io.js.map +1 -1
  172. package/dist/voice/speech_handle.cjs +7 -1
  173. package/dist/voice/speech_handle.cjs.map +1 -1
  174. package/dist/voice/speech_handle.d.cts +2 -0
  175. package/dist/voice/speech_handle.d.ts +2 -0
  176. package/dist/voice/speech_handle.d.ts.map +1 -1
  177. package/dist/voice/speech_handle.js +8 -2
  178. package/dist/voice/speech_handle.js.map +1 -1
  179. package/dist/voice/testing/run_result.cjs +66 -15
  180. package/dist/voice/testing/run_result.cjs.map +1 -1
  181. package/dist/voice/testing/run_result.d.cts +14 -3
  182. package/dist/voice/testing/run_result.d.ts +14 -3
  183. package/dist/voice/testing/run_result.d.ts.map +1 -1
  184. package/dist/voice/testing/run_result.js +66 -15
  185. package/dist/voice/testing/run_result.js.map +1 -1
  186. package/dist/voice/utils.cjs +47 -0
  187. package/dist/voice/utils.cjs.map +1 -0
  188. package/dist/voice/utils.d.cts +4 -0
  189. package/dist/voice/utils.d.ts +4 -0
  190. package/dist/voice/utils.d.ts.map +1 -0
  191. package/dist/voice/utils.js +23 -0
  192. package/dist/voice/utils.js.map +1 -0
  193. package/package.json +1 -1
  194. package/src/cli.ts +20 -33
  195. package/src/ipc/job_proc_lazy_main.ts +16 -5
  196. package/src/llm/chat_context.ts +35 -0
  197. package/src/llm/provider_format/index.ts +7 -2
  198. package/src/llm/provider_format/openai.test.ts +385 -1
  199. package/src/llm/provider_format/openai.ts +103 -0
  200. package/src/llm/provider_format/utils.ts +6 -4
  201. package/src/llm/realtime.ts +1 -0
  202. package/src/log.ts +5 -2
  203. package/src/stream/deferred_stream.ts +17 -6
  204. package/src/stream/index.ts +1 -0
  205. package/src/stream/multi_input_stream.test.ts +540 -0
  206. package/src/stream/multi_input_stream.ts +172 -0
  207. package/src/telemetry/trace_types.ts +18 -0
  208. package/src/utils.test.ts +87 -0
  209. package/src/utils.ts +52 -2
  210. package/src/version.ts +1 -1
  211. package/src/voice/agent.test.ts +140 -2
  212. package/src/voice/agent.ts +189 -10
  213. package/src/voice/agent_activity.ts +449 -286
  214. package/src/voice/agent_session.ts +195 -51
  215. package/src/voice/audio_recognition.ts +118 -38
  216. package/src/voice/audio_recognition_span.test.ts +261 -0
  217. package/src/voice/generation.ts +52 -23
  218. package/src/voice/index.ts +1 -1
  219. package/src/voice/io.ts +7 -4
  220. package/src/voice/recorder_io/recorder_io.ts +2 -1
  221. package/src/voice/room_io/_input.ts +11 -7
  222. package/src/voice/room_io/room_io.ts +12 -0
  223. package/src/voice/speech_handle.ts +9 -2
  224. package/src/voice/testing/run_result.ts +81 -23
  225. package/src/voice/utils.ts +29 -0
package/src/voice/agent_session.ts
@@ -1,12 +1,14 @@
 // SPDX-FileCopyrightText: 2024 LiveKit, Inc.
 //
 // SPDX-License-Identifier: Apache-2.0
+import { Mutex } from '@livekit/mutex';
 import type { AudioFrame, Room } from '@livekit/rtc-node';
 import type { TypedEventEmitter as TypedEmitter } from '@livekit/typed-emitter';
 import type { Context, Span } from '@opentelemetry/api';
 import { ROOT_CONTEXT, context as otelContext, trace } from '@opentelemetry/api';
 import { EventEmitter } from 'node:events';
 import type { ReadableStream } from 'node:stream/web';
+import type { z } from 'zod';
 import {
   LLM as InferenceLLM,
   STT as InferenceSTT,
@@ -31,6 +33,7 @@ import {
   type ResolvedSessionConnectOptions,
   type SessionConnectOptions,
 } from '../types.js';
+import { Task } from '../utils.js';
 import type { VAD } from '../vad.js';
 import type { Agent } from './agent.js';
 import { AgentActivity } from './agent_activity.js';
@@ -62,6 +65,7 @@ import { RoomIO, type RoomInputOptions, type RoomOutputOptions } from './room_io
 import type { UnknownUserData } from './run_context.js';
 import type { SpeechHandle } from './speech_handle.js';
 import { RunResult } from './testing/run_result.js';
+import { setParticipantSpanAttributes } from './utils.js';
 
 export interface VoiceOptions {
   allowInterruptions: boolean;
@@ -114,6 +118,13 @@ export type AgentSessionOptions<UserData = UnknownUserData> = {
   connOptions?: SessionConnectOptions;
 };
 
+type ActivityTransitionOptions = {
+  previousActivity?: 'close' | 'pause';
+  newActivity?: 'start' | 'resume';
+  blockedTasks?: Task<any>[];
+  waitOnEnter?: boolean;
+};
+
 export class AgentSession<
   UserData = UnknownUserData,
 > extends (EventEmitter as new () => TypedEmitter<AgentSessionCallbacks>) {
@@ -128,10 +139,13 @@ export class AgentSession<
   private agent?: Agent;
   private activity?: AgentActivity;
   private nextActivity?: AgentActivity;
+  private updateActivityTask?: Task<void>;
   private started = false;
   private userState: UserState = 'listening';
+  private readonly activityLock = new Mutex();
 
-  private roomIO?: RoomIO;
+  /** @internal */
+  _roomIO?: RoomIO;
   private logger = log();
 
   private _chatCtx: ChatContext;
@@ -294,7 +308,7 @@ export class AgentSession<
 
     const tasks: Promise<void>[] = [];
 
-    if (room && !this.roomIO) {
+    if (room && !this._roomIO) {
       // Check for existing input/output configuration and warn if needed
       if (this.input.audio && inputOptions?.audioEnabled !== false) {
         this.logger.warn(
@@ -314,13 +328,13 @@ export class AgentSession<
         );
       }
 
-      this.roomIO = new RoomIO({
+      this._roomIO = new RoomIO({
        agentSession: this,
        room,
        inputOptions,
        outputOptions,
      });
-      this.roomIO.start();
+      this._roomIO.start();
     }
 
     let ctx: JobContext | undefined = undefined;
@@ -358,7 +372,8 @@ export class AgentSession<
     }
 
     // TODO(AJS-265): add shutdown callback to job context
-    tasks.push(this.updateActivity(this.agent));
+    // Initial start does not wait on onEnter
+    tasks.push(this._updateActivity(this.agent, { waitOnEnter: false }));
 
     await Promise.allSettled(tasks);
 
@@ -430,8 +445,34 @@ export class AgentSession<
   updateAgent(agent: Agent): void {
     this.agent = agent;
 
-    if (this.started) {
-      this.updateActivity(agent);
+    if (!this.started) {
+      return;
+    }
+
+    const _updateActivityTask = async (oldTask: Task<void> | undefined, agent: Agent) => {
+      if (oldTask) {
+        try {
+          await oldTask.result;
+        } catch (error) {
+          this.logger.error(error, 'previous updateAgent transition failed');
+        }
+      }
+
+      await this._updateActivity(agent);
+    };
+
+    const oldTask = this.updateActivityTask;
+    this.updateActivityTask = Task.from(
+      async () => _updateActivityTask(oldTask, agent),
+      undefined,
+      'AgentSession_updateActivityTask',
+    );
+
+    const runState = this._globalRunState;
+    if (runState) {
+      // Don't mark the RunResult as done, if there is currently an agent transition happening.
+      // (used to make sure we're correctly adding the AgentHandoffResult before completion)
+      runState._watchHandle(this.updateActivityTask);
     }
   }
 
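
For reference, the transition queueing used by updateAgent() above can be sketched in isolation. This is a minimal illustration using plain Promises in place of the package's internal Task helper; the names below are not part of the package API:

    // Each new transition waits for the previous one to settle before running,
    // so concurrent updateAgent() calls are applied strictly in call order.
    let previousTransition: Promise<void> | undefined;

    function queueTransition(run: () => Promise<void>): Promise<void> {
      const prior = previousTransition;
      const next = (async () => {
        if (prior) {
          // A failed earlier transition is logged elsewhere; the new one still runs.
          await prior.catch(() => {});
        }
        await run();
      })();
      previousTransition = next;
      return next;
    }
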
@@ -462,24 +503,42 @@
       throw new Error('AgentSession is not running');
     }
 
-    const doSay = (activity: AgentActivity) => {
+    const doSay = (activity: AgentActivity, nextActivity?: AgentActivity) => {
+      if (activity.schedulingPaused) {
+        if (!nextActivity) {
+          throw new Error('AgentSession is closing, cannot use say()');
+        }
+        return nextActivity.say(text, options);
+      }
       return activity.say(text, options);
     };
 
+    const runState = this._globalRunState;
+    let handle: SpeechHandle;
+
     // attach to the session span if called outside of the AgentSession
     const activeSpan = trace.getActiveSpan();
     if (!activeSpan && this.rootSpanContext) {
-      return otelContext.with(this.rootSpanContext, () => doSay(this.activity!));
+      handle = otelContext.with(this.rootSpanContext, () =>
+        doSay(this.activity!, this.nextActivity),
+      );
+    } else {
+      handle = doSay(this.activity, this.nextActivity);
+    }
+
+    if (runState) {
+      runState._watchHandle(handle);
     }
 
-    return doSay(this.activity);
+    return handle;
   }
 
-  interrupt() {
+  interrupt(options?: { force?: boolean }) {
     if (!this.activity) {
       throw new Error('AgentSession is not running');
     }
-    return this.activity.interrupt();
+
+    return this.activity.interrupt(options);
   }
 
   generateReply(options?: {
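
A brief usage sketch of the updated say()/interrupt() surface above; the session variable and prompt text are illustrative, only the two method signatures come from this diff:

    // say() returns a SpeechHandle; with this change it is also registered with an
    // active RunResult and routed to the next activity when scheduling is paused.
    const handle = session.say('One moment while I check that.');

    // interrupt() now accepts an optional force flag, used during close to cut off
    // speech that would otherwise be uninterruptible.
    session.interrupt({ force: true });
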
@@ -500,7 +559,7 @@
         : undefined;
 
     const doGenerateReply = (activity: AgentActivity, nextActivity?: AgentActivity) => {
-      if (activity.draining) {
+      if (activity.schedulingPaused) {
         if (!nextActivity) {
           throw new Error('AgentSession is closing, cannot use generateReply()');
         }
@@ -540,53 +599,128 @@
    * result.expect.noMoreEvents();
    * ```
    *
-   * @param options - Run options including user input
+   * @param options - Run options including user input and optional output type
    * @returns A RunResult that resolves when the agent finishes responding
-   *
-   * TODO: Add outputType parameter for typed outputs (parity with Python)
    */
-  run(options: { userInput: string }): RunResult {
+  run<T = unknown>({
+    userInput,
+    outputType,
+  }: {
+    userInput: string;
+    outputType?: z.ZodType<T>;
+  }): RunResult<T> {
     if (this._globalRunState && !this._globalRunState.done()) {
       throw new Error('nested runs are not supported');
     }
 
-    const runState = new RunResult({ userInput: options.userInput });
+    const runState = new RunResult<T>({
+      userInput,
+      outputType,
+    });
+
     this._globalRunState = runState;
-    this.generateReply({ userInput: options.userInput });
+
+    // Defer generateReply through the activityLock to ensure any in-progress
+    // activity transition (e.g. AgentTask started from onEnter) completes first.
+    // TS Task.from starts onEnter synchronously, so the transition may already be
+    // mid-flight by the time run() is called after session.start() resolves.
+    // Acquiring and immediately releasing the lock guarantees FIFO ordering:
+    // the transition's lock section finishes before we route generateReply.
+    (async () => {
+      try {
+        const unlock = await this.activityLock.lock();
+        unlock();
+        this.generateReply({ userInput });
+      } catch (e) {
+        runState._reject(e instanceof Error ? e : new Error(String(e)));
+      }
+    })();
 
     return runState;
   }
 
-  private async updateActivity(agent: Agent): Promise<void> {
+  /** @internal */
+  async _updateActivity(agent: Agent, options: ActivityTransitionOptions = {}): Promise<void> {
+    const { previousActivity = 'close', newActivity = 'start', blockedTasks = [] } = options;
+    const waitOnEnter = options.waitOnEnter ?? newActivity === 'start';
+
     const runWithContext = async () => {
-      // TODO(AJS-129): add lock to agent activity core lifecycle
-      this.nextActivity = new AgentActivity(agent, this);
+      const unlock = await this.activityLock.lock();
+      let onEnterTask: Task<void> | undefined;
 
-      const previousActivity = this.activity;
+      try {
+        this.agent = agent;
+        const prevActivityObj = this.activity;
+
+        if (newActivity === 'start') {
+          const prevAgent = prevActivityObj?.agent;
+          if (
+            agent._agentActivity &&
+            // allow updating the same agent that is running
+            (agent !== prevAgent || previousActivity !== 'close')
+          ) {
+            throw new Error('Cannot start agent: an activity is already running');
+          }
+          this.nextActivity = new AgentActivity(agent, this);
+        } else if (newActivity === 'resume') {
+          if (!agent._agentActivity) {
+            throw new Error('Cannot resume agent: no existing activity to resume');
+          }
+          this.nextActivity = agent._agentActivity;
+        }
 
-      if (this.activity) {
-        await this.activity.drain();
-        await this.activity.close();
-      }
+        if (prevActivityObj && prevActivityObj !== this.nextActivity) {
+          if (previousActivity === 'pause') {
+            await prevActivityObj.pause({ blockedTasks });
+          } else {
+            await prevActivityObj.drain();
+            await prevActivityObj.close();
+          }
+        }
 
-      this.activity = this.nextActivity;
-      this.nextActivity = undefined;
+        this.activity = this.nextActivity;
+        this.nextActivity = undefined;
 
-      this._chatCtx.insert(
-        new AgentHandoffItem({
-          oldAgentId: previousActivity?.agent.id,
+        const runState = this._globalRunState;
+        const handoffItem = new AgentHandoffItem({
+          oldAgentId: prevActivityObj?.agent.id,
           newAgentId: agent.id,
-        }),
-      );
-      this.logger.debug(
-        { previousAgentId: previousActivity?.agent.id, newAgentId: agent.id },
-        'Agent handoff inserted into chat context',
-      );
+        });
+
+        if (runState) {
+          runState._agentHandoff({
+            item: handoffItem,
+            oldAgent: prevActivityObj?.agent,
+            newAgent: this.activity!.agent,
+          });
+        }
+
+        this._chatCtx.insert(handoffItem);
+        this.logger.debug(
+          { previousAgentId: prevActivityObj?.agent.id, newAgentId: agent.id },
+          'Agent handoff inserted into chat context',
+        );
+
+        if (newActivity === 'start') {
+          await this.activity!.start();
+        } else {
+          await this.activity!.resume();
+        }
 
-      await this.activity.start();
+        onEnterTask = this.activity!._onEnterTask;
 
-      if (this._input.audio) {
-        this.activity.attachAudioInput(this._input.audio.stream);
+        if (this._input.audio) {
+          this.activity!.attachAudioInput(this._input.audio.stream);
+        }
+      } finally {
+        unlock();
+      }
+
+      if (waitOnEnter) {
+        if (!onEnterTask) {
+          throw new Error('expected onEnter task to be available while waitOnEnter=true');
+        }
+        await onEnterTask.result;
      }
    };
 
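
A hypothetical sketch of the new typed run() signature above; the zod schema and user input are invented for illustration, the session variable is assumed, and assertions on the returned RunResult follow the existing expect API referenced in the doc comment:

    import { z } from 'zod';

    const WeatherReport = z.object({
      city: z.string(),
      temperatureC: z.number(),
    });

    // outputType is optional; when provided, the result is typed as RunResult<T>.
    const result = session.run({
      userInput: "What's the weather in Tokyo?",
      outputType: WeatherReport,
    });
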
@@ -700,8 +834,10 @@
           startTime: options?.startTime,
         });
 
-        // TODO(brian): PR4 - Set participant attributes if roomIO.room.localParticipant is available
-        // (Ref: Python agent_session.py line 1161-1164)
+        const localParticipant = this._roomIO?.localParticipant;
+        if (localParticipant) {
+          setParticipantSpanAttributes(this.agentSpeakingSpan, localParticipant);
+        }
       }
     } else if (this.agentSpeakingSpan !== undefined) {
       // TODO(brian): PR4 - Set ATTR_END_TIME attribute if available
@@ -738,8 +874,10 @@
         startTime: lastSpeakingTime,
       });
 
-      // TODO(brian): PR4 - Set participant attributes if roomIO.linkedParticipant is available
-      // (Ref: Python agent_session.py line 1192-1195)
+      const linked = this._roomIO?.linkedParticipant;
+      if (linked) {
+        setParticipantSpanAttributes(this.userSpeakingSpan, linked);
+      }
     } else if (this.userSpeakingSpan !== undefined) {
       this.userSpeakingSpan.end(lastSpeakingTime);
       this.userSpeakingSpan = undefined;
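
setParticipantSpanAttributes is defined in the new package/src/voice/utils.ts, which is listed in the file summary but not shown in this excerpt. A plausible sketch of such a helper, with attribute keys that are assumptions rather than the package's actual constants:

    import type { Span } from '@opentelemetry/api';

    // Assumed helper shape: copy basic participant metadata onto a span. The
    // participant only needs sid/identity/kind, matching the ParticipantLike
    // interface introduced in audio_recognition.ts further down.
    function setParticipantSpanAttributes(
      span: Span,
      participant: { sid?: string; identity: string; kind: unknown },
    ): void {
      if (participant.sid) span.setAttribute('lk.participant.sid', participant.sid);
      span.setAttribute('lk.participant.identity', participant.identity);
      span.setAttribute('lk.participant.kind', String(participant.kind));
    }
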
@@ -783,7 +921,7 @@
       return;
     }
 
-    if (this.roomIO && !this.roomIO.isParticipantAvailable) {
+    if (this._roomIO && !this._roomIO.isParticipantAvailable) {
       return;
     }
 
@@ -836,15 +974,21 @@
     if (this.activity) {
       if (!drain) {
         try {
-          this.activity.interrupt();
+          await this.activity.interrupt({ force: true }).await;
         } catch (error) {
-          // TODO(shubhra): force interrupt or wait for it to finish?
-          // it might be an audio played from the error callback
+          // Uninterruptible speech can throw during forced interruption.
+          this.logger.warn({ error }, 'Error interrupting activity');
        }
      }
+
       await this.activity.drain();
       // wait any uninterruptible speech to finish
       await this.activity.currentSpeech?.waitForPlayout();
+
+      if (reason !== CloseReason.ERROR) {
+        this.activity.commitUserTurn({ audioDetached: true, throwIfNotReady: false });
+      }
+
       try {
         this.activity.detachAudioInput();
       } catch (error) {
@@ -862,8 +1006,8 @@
     this.output.audio = null;
     this.output.transcription = null;
 
-    await this.roomIO?.close();
-    this.roomIO = undefined;
+    await this._roomIO?.close();
+    this._roomIO = undefined;
 
     await this.activity?.close();
     this.activity = undefined;
package/src/voice/audio_recognition.ts
@@ -1,8 +1,15 @@
 // SPDX-FileCopyrightText: 2025 LiveKit, Inc.
 //
 // SPDX-License-Identifier: Apache-2.0
+import type { ParticipantKind } from '@livekit/rtc-node';
 import { AudioFrame } from '@livekit/rtc-node';
-import type { Context, Span } from '@opentelemetry/api';
+import {
+  type Context,
+  ROOT_CONTEXT,
+  type Span,
+  context as otelContext,
+  trace,
+} from '@opentelemetry/api';
 import type { WritableStreamDefaultWriter } from 'node:stream/web';
 import { ReadableStream } from 'node:stream/web';
 import { type ChatContext } from '../llm/chat_context.js';
@@ -16,6 +23,7 @@ import { Task, delay } from '../utils.js';
 import { type VAD, type VADEvent, VADEventType } from '../vad.js';
 import type { TurnDetectionMode } from './agent_session.js';
 import type { STTNode } from './io.js';
+import { setParticipantSpanAttributes } from './utils.js';
 
 export interface EndOfTurnInfo {
   /** The new transcript text from the user's speech. */
@@ -72,6 +80,22 @@ export interface AudioRecognitionOptions {
   maxEndpointingDelay: number;
   /** Root span context for tracing. */
   rootSpanContext?: Context;
+  /** STT model name for tracing */
+  sttModel?: string;
+  /** STT provider name for tracing */
+  sttProvider?: string;
+  /** Getter for linked participant for span attribution */
+  getLinkedParticipant?: () => ParticipantLike | undefined;
+}
+
+/**
+ * Minimal participant shape for span attribution.
+ * Compatible with both `LocalParticipant` and `RemoteParticipant` from `@livekit/rtc-node`.
+ */
+export interface ParticipantLike {
+  sid: string | undefined;
+  identity: string;
+  kind: ParticipantKind;
 }
 
 export class AudioRecognition {
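
An illustrative partial options object for the new tracing fields above; the values and the linkedParticipant variable are hypothetical, and the interface's other fields are omitted:

    const tracingOptions: Partial<AudioRecognitionOptions> = {
      sttModel: 'nova-2',        // assumed model name; attached to the user_turn span
      sttProvider: 'deepgram',   // assumed provider name
      // Any object exposing sid/identity/kind satisfies ParticipantLike.
      getLinkedParticipant: () => linkedParticipant,
    };
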
@@ -84,6 +108,9 @@ export class AudioRecognition {
   private maxEndpointingDelay: number;
   private lastLanguage?: string;
   private rootSpanContext?: Context;
+  private sttModel?: string;
+  private sttProvider?: string;
+  private getLinkedParticipant?: () => ParticipantLike | undefined;
 
   private deferredInputStream: DeferredReadableStream<AudioFrame>;
   private logger = log();
@@ -121,6 +148,9 @@ export class AudioRecognition {
     this.maxEndpointingDelay = opts.maxEndpointingDelay;
     this.lastLanguage = undefined;
     this.rootSpanContext = opts.rootSpanContext;
+    this.sttModel = opts.sttModel;
+    this.sttProvider = opts.sttProvider;
+    this.getLinkedParticipant = opts.getLinkedParticipant;
 
     this.deferredInputStream = new DeferredReadableStream<AudioFrame>();
     const [vadInputStream, sttInputStream] = this.deferredInputStream.stream.tee();
@@ -151,6 +181,37 @@
     });
   }
 
+  private ensureUserTurnSpan(startTime?: number): Span {
+    if (this.userTurnSpan && this.userTurnSpan.isRecording()) {
+      return this.userTurnSpan;
+    }
+
+    this.userTurnSpan = tracer.startSpan({
+      name: 'user_turn',
+      context: this.rootSpanContext,
+      startTime,
+    });
+
+    const participant = this.getLinkedParticipant?.();
+    if (participant) {
+      setParticipantSpanAttributes(this.userTurnSpan, participant);
+    }
+
+    if (this.sttModel) {
+      this.userTurnSpan.setAttribute(traceTypes.ATTR_GEN_AI_REQUEST_MODEL, this.sttModel);
+    }
+    if (this.sttProvider) {
+      this.userTurnSpan.setAttribute(traceTypes.ATTR_GEN_AI_PROVIDER_NAME, this.sttProvider);
+    }
+
+    return this.userTurnSpan;
+  }
+
+  private userTurnContext(span: Span): Context {
+    const base = this.rootSpanContext ?? ROOT_CONTEXT;
+    return trace.setSpan(base, span);
+  }
+
   private async onSTTEvent(ev: SpeechEvent) {
     if (
       this.turnDetectionMode === 'manual' &&
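
A minimal sketch of the span-propagation pattern that ensureUserTurnSpan/userTurnContext rely on, written against the standard @opentelemetry/api surface rather than the package's tracer wrapper (names are illustrative):

    import { context, trace } from '@opentelemetry/api';

    const tracer = trace.getTracer('example');       // illustrative tracer name
    const userTurn = tracer.startSpan('user_turn');  // standard API; the package uses its own wrapper
    const ctx = trace.setSpan(context.active(), userTurn);

    // Everything started inside context.with() sees userTurn as the active span,
    // so child spans (e.g. eou_detection) are parented under it.
    context.with(ctx, () => {
      const child = tracer.startSpan('eou_detection');
      child.end();
    });
    userTurn.end();
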
@@ -299,19 +360,25 @@
         break;
       case SpeechEventType.START_OF_SPEECH:
         if (this.turnDetectionMode !== 'stt') break;
-        this.hooks.onStartOfSpeech({
-          type: VADEventType.START_OF_SPEECH,
-          samplesIndex: 0,
-          timestamp: Date.now(),
-          speechDuration: 0,
-          silenceDuration: 0,
-          frames: [],
-          probability: 0,
-          inferenceDuration: 0,
-          speaking: true,
-          rawAccumulatedSilence: 0,
-          rawAccumulatedSpeech: 0,
-        });
+        {
+          const span = this.ensureUserTurnSpan(Date.now());
+          const ctx = this.userTurnContext(span);
+          otelContext.with(ctx, () => {
+            this.hooks.onStartOfSpeech({
+              type: VADEventType.START_OF_SPEECH,
+              samplesIndex: 0,
+              timestamp: Date.now(),
+              speechDuration: 0,
+              silenceDuration: 0,
+              frames: [],
+              probability: 0,
+              inferenceDuration: 0,
+              speaking: true,
+              rawAccumulatedSilence: 0,
+              rawAccumulatedSpeech: 0,
+            });
+          });
+        }
         this.speaking = true;
         this.lastSpeakingTime = Date.now();
 
@@ -319,19 +386,25 @@
         break;
       case SpeechEventType.END_OF_SPEECH:
         if (this.turnDetectionMode !== 'stt') break;
-        this.hooks.onEndOfSpeech({
-          type: VADEventType.END_OF_SPEECH,
-          samplesIndex: 0,
-          timestamp: Date.now(),
-          speechDuration: 0,
-          silenceDuration: 0,
-          frames: [],
-          probability: 0,
-          inferenceDuration: 0,
-          speaking: false,
-          rawAccumulatedSilence: 0,
-          rawAccumulatedSpeech: 0,
-        });
+        {
+          const span = this.ensureUserTurnSpan();
+          const ctx = this.userTurnContext(span);
+          otelContext.with(ctx, () => {
+            this.hooks.onEndOfSpeech({
+              type: VADEventType.END_OF_SPEECH,
+              samplesIndex: 0,
+              timestamp: Date.now(),
+              speechDuration: 0,
+              silenceDuration: 0,
+              frames: [],
+              probability: 0,
+              inferenceDuration: 0,
+              speaking: false,
+              rawAccumulatedSilence: 0,
+              rawAccumulatedSpeech: 0,
+            });
+          });
+        }
         this.speaking = false;
         this.userTurnCommitted = true;
         this.lastSpeakingTime = Date.now();
@@ -376,6 +449,9 @@
       async (controller: AbortController) => {
         let endpointingDelay = this.minEndpointingDelay;
 
+        const userTurnSpan = this.ensureUserTurnSpan();
+        const userTurnCtx = this.userTurnContext(userTurnSpan);
+
         if (turnDetector) {
           await tracer.startActiveSpan(
             async (span) => {
@@ -415,7 +491,7 @@
             },
             {
               name: 'eou_detection',
-              context: this.rootSpanContext,
+              context: userTurnCtx,
             },
           );
         }
@@ -577,17 +653,13 @@
       switch (ev.type) {
         case VADEventType.START_OF_SPEECH:
           this.logger.debug('VAD task: START_OF_SPEECH');
-          this.hooks.onStartOfSpeech(ev);
-          this.speaking = true;
-
-          if (!this.userTurnSpan) {
+          {
            const startTime = Date.now() - ev.speechDuration;
-            this.userTurnSpan = tracer.startSpan({
-              name: 'user_turn',
-              context: this.rootSpanContext,
-              startTime,
-            });
+            const span = this.ensureUserTurnSpan(startTime);
+            const ctx = this.userTurnContext(span);
+            otelContext.with(ctx, () => this.hooks.onStartOfSpeech(ev));
           }
+          this.speaking = true;
 
           // Capture sample rate from the first VAD event if not already set
           if (ev.frames.length > 0 && ev.frames[0]) {
@@ -609,7 +681,11 @@
           break;
         case VADEventType.END_OF_SPEECH:
           this.logger.debug('VAD task: END_OF_SPEECH');
-          this.hooks.onEndOfSpeech(ev);
+          {
+            const span = this.ensureUserTurnSpan();
+            const ctx = this.userTurnContext(span);
+            otelContext.with(ctx, () => this.hooks.onEndOfSpeech(ev));
+          }
 
           // when VAD fires END_OF_SPEECH, it already waited for the silence_duration
           this.speaking = false;
@@ -692,6 +768,10 @@
         this.logger.debug('User turn committed');
       })
       .catch((err: unknown) => {
+        if (err instanceof Error && err.name === 'AbortError') {
+          this.logger.debug('User turn commit task cancelled');
+          return;
+        }
         this.logger.error(err, 'Error in user turn commit task:');
       });
   }
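
The AbortError filter above distinguishes deliberate cancellation from real failures. A standalone sketch of the same pattern, using an AbortController and a hand-rolled abortable promise (names illustrative):

    const controller = new AbortController();

    const work = new Promise<void>((resolve, reject) => {
      const timer = setTimeout(resolve, 1_000);
      controller.signal.addEventListener('abort', () => {
        clearTimeout(timer);
        const err = new Error('aborted');
        err.name = 'AbortError'; // same shape the filter above checks for
        reject(err);
      });
    }).catch((err: unknown) => {
      if (err instanceof Error && err.name === 'AbortError') {
        return; // deliberate cancellation, not a failure
      }
      throw err;
    });

    controller.abort();
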