@livekit/agents 1.0.45 → 1.0.47

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (225)
  1. package/dist/cli.cjs +14 -20
  2. package/dist/cli.cjs.map +1 -1
  3. package/dist/cli.d.ts.map +1 -1
  4. package/dist/cli.js +14 -20
  5. package/dist/cli.js.map +1 -1
  6. package/dist/ipc/job_proc_lazy_main.cjs +14 -5
  7. package/dist/ipc/job_proc_lazy_main.cjs.map +1 -1
  8. package/dist/ipc/job_proc_lazy_main.js +14 -5
  9. package/dist/ipc/job_proc_lazy_main.js.map +1 -1
  10. package/dist/llm/chat_context.cjs +19 -0
  11. package/dist/llm/chat_context.cjs.map +1 -1
  12. package/dist/llm/chat_context.d.cts +4 -0
  13. package/dist/llm/chat_context.d.ts +4 -0
  14. package/dist/llm/chat_context.d.ts.map +1 -1
  15. package/dist/llm/chat_context.js +19 -0
  16. package/dist/llm/chat_context.js.map +1 -1
  17. package/dist/llm/provider_format/index.cjs +2 -0
  18. package/dist/llm/provider_format/index.cjs.map +1 -1
  19. package/dist/llm/provider_format/index.d.cts +1 -1
  20. package/dist/llm/provider_format/index.d.ts +1 -1
  21. package/dist/llm/provider_format/index.d.ts.map +1 -1
  22. package/dist/llm/provider_format/index.js +6 -1
  23. package/dist/llm/provider_format/index.js.map +1 -1
  24. package/dist/llm/provider_format/openai.cjs +82 -2
  25. package/dist/llm/provider_format/openai.cjs.map +1 -1
  26. package/dist/llm/provider_format/openai.d.cts +1 -0
  27. package/dist/llm/provider_format/openai.d.ts +1 -0
  28. package/dist/llm/provider_format/openai.d.ts.map +1 -1
  29. package/dist/llm/provider_format/openai.js +80 -1
  30. package/dist/llm/provider_format/openai.js.map +1 -1
  31. package/dist/llm/provider_format/openai.test.cjs +326 -0
  32. package/dist/llm/provider_format/openai.test.cjs.map +1 -1
  33. package/dist/llm/provider_format/openai.test.js +327 -1
  34. package/dist/llm/provider_format/openai.test.js.map +1 -1
  35. package/dist/llm/provider_format/utils.cjs +4 -3
  36. package/dist/llm/provider_format/utils.cjs.map +1 -1
  37. package/dist/llm/provider_format/utils.d.ts.map +1 -1
  38. package/dist/llm/provider_format/utils.js +4 -3
  39. package/dist/llm/provider_format/utils.js.map +1 -1
  40. package/dist/llm/realtime.cjs.map +1 -1
  41. package/dist/llm/realtime.d.cts +1 -0
  42. package/dist/llm/realtime.d.ts +1 -0
  43. package/dist/llm/realtime.d.ts.map +1 -1
  44. package/dist/llm/realtime.js.map +1 -1
  45. package/dist/log.cjs +5 -2
  46. package/dist/log.cjs.map +1 -1
  47. package/dist/log.d.ts.map +1 -1
  48. package/dist/log.js +5 -2
  49. package/dist/log.js.map +1 -1
  50. package/dist/stream/deferred_stream.cjs +15 -6
  51. package/dist/stream/deferred_stream.cjs.map +1 -1
  52. package/dist/stream/deferred_stream.d.ts.map +1 -1
  53. package/dist/stream/deferred_stream.js +15 -6
  54. package/dist/stream/deferred_stream.js.map +1 -1
  55. package/dist/stream/index.cjs +3 -0
  56. package/dist/stream/index.cjs.map +1 -1
  57. package/dist/stream/index.d.cts +1 -0
  58. package/dist/stream/index.d.ts +1 -0
  59. package/dist/stream/index.d.ts.map +1 -1
  60. package/dist/stream/index.js +2 -0
  61. package/dist/stream/index.js.map +1 -1
  62. package/dist/stream/multi_input_stream.cjs +139 -0
  63. package/dist/stream/multi_input_stream.cjs.map +1 -0
  64. package/dist/stream/multi_input_stream.d.cts +55 -0
  65. package/dist/stream/multi_input_stream.d.ts +55 -0
  66. package/dist/stream/multi_input_stream.d.ts.map +1 -0
  67. package/dist/stream/multi_input_stream.js +115 -0
  68. package/dist/stream/multi_input_stream.js.map +1 -0
  69. package/dist/stream/multi_input_stream.test.cjs +340 -0
  70. package/dist/stream/multi_input_stream.test.cjs.map +1 -0
  71. package/dist/stream/multi_input_stream.test.js +339 -0
  72. package/dist/stream/multi_input_stream.test.js.map +1 -0
  73. package/dist/telemetry/trace_types.cjs +42 -0
  74. package/dist/telemetry/trace_types.cjs.map +1 -1
  75. package/dist/telemetry/trace_types.d.cts +14 -0
  76. package/dist/telemetry/trace_types.d.ts +14 -0
  77. package/dist/telemetry/trace_types.d.ts.map +1 -1
  78. package/dist/telemetry/trace_types.js +28 -0
  79. package/dist/telemetry/trace_types.js.map +1 -1
  80. package/dist/utils.cjs +44 -2
  81. package/dist/utils.cjs.map +1 -1
  82. package/dist/utils.d.cts +8 -0
  83. package/dist/utils.d.ts +8 -0
  84. package/dist/utils.d.ts.map +1 -1
  85. package/dist/utils.js +44 -2
  86. package/dist/utils.js.map +1 -1
  87. package/dist/utils.test.cjs +71 -0
  88. package/dist/utils.test.cjs.map +1 -1
  89. package/dist/utils.test.js +71 -0
  90. package/dist/utils.test.js.map +1 -1
  91. package/dist/version.cjs +1 -1
  92. package/dist/version.cjs.map +1 -1
  93. package/dist/version.d.cts +1 -1
  94. package/dist/version.d.ts +1 -1
  95. package/dist/version.d.ts.map +1 -1
  96. package/dist/version.js +1 -1
  97. package/dist/version.js.map +1 -1
  98. package/dist/voice/agent.cjs +144 -12
  99. package/dist/voice/agent.cjs.map +1 -1
  100. package/dist/voice/agent.d.cts +29 -4
  101. package/dist/voice/agent.d.ts +29 -4
  102. package/dist/voice/agent.d.ts.map +1 -1
  103. package/dist/voice/agent.js +140 -11
  104. package/dist/voice/agent.js.map +1 -1
  105. package/dist/voice/agent.test.cjs +120 -0
  106. package/dist/voice/agent.test.cjs.map +1 -1
  107. package/dist/voice/agent.test.js +122 -2
  108. package/dist/voice/agent.test.js.map +1 -1
  109. package/dist/voice/agent_activity.cjs +402 -292
  110. package/dist/voice/agent_activity.cjs.map +1 -1
  111. package/dist/voice/agent_activity.d.cts +35 -7
  112. package/dist/voice/agent_activity.d.ts +35 -7
  113. package/dist/voice/agent_activity.d.ts.map +1 -1
  114. package/dist/voice/agent_activity.js +402 -287
  115. package/dist/voice/agent_activity.js.map +1 -1
  116. package/dist/voice/agent_session.cjs +156 -44
  117. package/dist/voice/agent_session.cjs.map +1 -1
  118. package/dist/voice/agent_session.d.cts +22 -9
  119. package/dist/voice/agent_session.d.ts +22 -9
  120. package/dist/voice/agent_session.d.ts.map +1 -1
  121. package/dist/voice/agent_session.js +156 -44
  122. package/dist/voice/agent_session.js.map +1 -1
  123. package/dist/voice/audio_recognition.cjs +89 -36
  124. package/dist/voice/audio_recognition.cjs.map +1 -1
  125. package/dist/voice/audio_recognition.d.cts +22 -1
  126. package/dist/voice/audio_recognition.d.ts +22 -1
  127. package/dist/voice/audio_recognition.d.ts.map +1 -1
  128. package/dist/voice/audio_recognition.js +93 -36
  129. package/dist/voice/audio_recognition.js.map +1 -1
  130. package/dist/voice/audio_recognition_span.test.cjs +233 -0
  131. package/dist/voice/audio_recognition_span.test.cjs.map +1 -0
  132. package/dist/voice/audio_recognition_span.test.js +232 -0
  133. package/dist/voice/audio_recognition_span.test.js.map +1 -0
  134. package/dist/voice/generation.cjs +39 -19
  135. package/dist/voice/generation.cjs.map +1 -1
  136. package/dist/voice/generation.d.ts.map +1 -1
  137. package/dist/voice/generation.js +44 -20
  138. package/dist/voice/generation.js.map +1 -1
  139. package/dist/voice/index.cjs +2 -0
  140. package/dist/voice/index.cjs.map +1 -1
  141. package/dist/voice/index.d.cts +1 -1
  142. package/dist/voice/index.d.ts +1 -1
  143. package/dist/voice/index.d.ts.map +1 -1
  144. package/dist/voice/index.js +2 -1
  145. package/dist/voice/index.js.map +1 -1
  146. package/dist/voice/io.cjs +6 -3
  147. package/dist/voice/io.cjs.map +1 -1
  148. package/dist/voice/io.d.cts +3 -2
  149. package/dist/voice/io.d.ts +3 -2
  150. package/dist/voice/io.d.ts.map +1 -1
  151. package/dist/voice/io.js +6 -3
  152. package/dist/voice/io.js.map +1 -1
  153. package/dist/voice/recorder_io/recorder_io.cjs +3 -1
  154. package/dist/voice/recorder_io/recorder_io.cjs.map +1 -1
  155. package/dist/voice/recorder_io/recorder_io.d.ts.map +1 -1
  156. package/dist/voice/recorder_io/recorder_io.js +3 -1
  157. package/dist/voice/recorder_io/recorder_io.js.map +1 -1
  158. package/dist/voice/room_io/_input.cjs +17 -17
  159. package/dist/voice/room_io/_input.cjs.map +1 -1
  160. package/dist/voice/room_io/_input.d.cts +2 -2
  161. package/dist/voice/room_io/_input.d.ts +2 -2
  162. package/dist/voice/room_io/_input.d.ts.map +1 -1
  163. package/dist/voice/room_io/_input.js +7 -6
  164. package/dist/voice/room_io/_input.js.map +1 -1
  165. package/dist/voice/room_io/room_io.cjs +9 -0
  166. package/dist/voice/room_io/room_io.cjs.map +1 -1
  167. package/dist/voice/room_io/room_io.d.cts +3 -1
  168. package/dist/voice/room_io/room_io.d.ts +3 -1
  169. package/dist/voice/room_io/room_io.d.ts.map +1 -1
  170. package/dist/voice/room_io/room_io.js +9 -0
  171. package/dist/voice/room_io/room_io.js.map +1 -1
  172. package/dist/voice/speech_handle.cjs +7 -1
  173. package/dist/voice/speech_handle.cjs.map +1 -1
  174. package/dist/voice/speech_handle.d.cts +2 -0
  175. package/dist/voice/speech_handle.d.ts +2 -0
  176. package/dist/voice/speech_handle.d.ts.map +1 -1
  177. package/dist/voice/speech_handle.js +8 -2
  178. package/dist/voice/speech_handle.js.map +1 -1
  179. package/dist/voice/testing/run_result.cjs +66 -15
  180. package/dist/voice/testing/run_result.cjs.map +1 -1
  181. package/dist/voice/testing/run_result.d.cts +14 -3
  182. package/dist/voice/testing/run_result.d.ts +14 -3
  183. package/dist/voice/testing/run_result.d.ts.map +1 -1
  184. package/dist/voice/testing/run_result.js +66 -15
  185. package/dist/voice/testing/run_result.js.map +1 -1
  186. package/dist/voice/utils.cjs +47 -0
  187. package/dist/voice/utils.cjs.map +1 -0
  188. package/dist/voice/utils.d.cts +4 -0
  189. package/dist/voice/utils.d.ts +4 -0
  190. package/dist/voice/utils.d.ts.map +1 -0
  191. package/dist/voice/utils.js +23 -0
  192. package/dist/voice/utils.js.map +1 -0
  193. package/package.json +1 -1
  194. package/src/cli.ts +20 -33
  195. package/src/ipc/job_proc_lazy_main.ts +16 -5
  196. package/src/llm/chat_context.ts +35 -0
  197. package/src/llm/provider_format/index.ts +7 -2
  198. package/src/llm/provider_format/openai.test.ts +385 -1
  199. package/src/llm/provider_format/openai.ts +103 -0
  200. package/src/llm/provider_format/utils.ts +6 -4
  201. package/src/llm/realtime.ts +1 -0
  202. package/src/log.ts +5 -2
  203. package/src/stream/deferred_stream.ts +17 -6
  204. package/src/stream/index.ts +1 -0
  205. package/src/stream/multi_input_stream.test.ts +540 -0
  206. package/src/stream/multi_input_stream.ts +172 -0
  207. package/src/telemetry/trace_types.ts +18 -0
  208. package/src/utils.test.ts +87 -0
  209. package/src/utils.ts +52 -2
  210. package/src/version.ts +1 -1
  211. package/src/voice/agent.test.ts +140 -2
  212. package/src/voice/agent.ts +189 -10
  213. package/src/voice/agent_activity.ts +449 -286
  214. package/src/voice/agent_session.ts +195 -51
  215. package/src/voice/audio_recognition.ts +118 -38
  216. package/src/voice/audio_recognition_span.test.ts +261 -0
  217. package/src/voice/generation.ts +52 -23
  218. package/src/voice/index.ts +1 -1
  219. package/src/voice/io.ts +7 -4
  220. package/src/voice/recorder_io/recorder_io.ts +2 -1
  221. package/src/voice/room_io/_input.ts +11 -7
  222. package/src/voice/room_io/room_io.ts +12 -0
  223. package/src/voice/speech_handle.ts +9 -2
  224. package/src/voice/testing/run_result.ts +81 -23
  225. package/src/voice/utils.ts +29 -0
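
Two themes dominate the agent_activity.ts hunks below: audio input moves from the single-source DeferredReadableStream to a new MultiInputStream, and the old "draining" flag becomes a pause/resume scheduling lifecycle. As orientation for the first change, here is a minimal sketch of the multi-input pattern (several ReadableStreams merged into one output) written against the standard Web Streams API; it is an illustrative stand-in, not the implementation shipped in package/src/stream/multi_input_stream.ts.

    // Illustrative stand-in only: the real MultiInputStream lives in
    // package/src/stream/multi_input_stream.ts and exposes addInputStream()/close()
    // per the call sites visible in the hunks below. Error handling omitted.
    function mergeStreams<T>(inputs: ReadableStream<T>[]): ReadableStream<T> {
      let active = inputs.length;
      return new ReadableStream<T>({
        start(controller) {
          if (active === 0) {
            controller.close();
            return;
          }
          for (const input of inputs) {
            void (async () => {
              const reader = input.getReader();
              try {
                for (;;) {
                  const { done, value } = await reader.read();
                  if (done) break;
                  controller.enqueue(value); // interleave chunks as they arrive
                }
              } finally {
                reader.releaseLock();
                active -= 1;
                if (active === 0) controller.close(); // close once every input has ended
              }
            })();
          }
        },
      });
    }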
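
The rewritten createSpeechTask (hunk at old line 814 below) wraps every task body in agentActivityStorage.run() and, when the task is owned by a speech handle, speechHandleStorage.run(), so code anywhere in the task's async call tree can recover its activity and handle without threading parameters through. A minimal sketch of the mechanism, using Node's standard AsyncLocalStorage (the store name below is a hypothetical stand-in):

    import { AsyncLocalStorage } from 'node:async_hooks';

    // Hypothetical stand-in for speechHandleStorage / agentActivityStorage.
    const speechStore = new AsyncLocalStorage<{ id: string }>();

    async function deepInsideTask(): Promise<void> {
      // Every async callee inside run()'s call tree sees the same store value.
      console.log(speechStore.getStore()?.id); // -> 'speech-1'
    }

    speechStore.run({ id: 'speech-1' }, () => {
      void deepInsideTask();
    });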
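
commitUserTurn (hunk at old line 426 below) drops its hardcoded audioDetached constant in favor of an options object. A sketch of the call shapes, using only names from that hunk; the structural type is a stand-in for an AgentActivity instance:

    // Call shapes for the widened commitUserTurn(); defaults come from the hunk below.
    declare const activity: {
      commitUserTurn(options?: { audioDetached?: boolean; throwIfNotReady?: boolean }): void;
    };

    activity.commitUserTurn(); // audioDetached=false, throwIfNotReady=true (may throw)
    activity.commitUserTurn({ throwIfNotReady: false }); // no-op if recognition is not ready
    activity.commitUserTurn({ audioDetached: true }); // commit a turn whose audio input was detached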
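
interrupt() (hunk at old line 1040 below) now cancels any preemptive generation up front and forwards a force flag to the current speech and every queued speech handle. A sketch of the two call shapes, per the signature in that hunk:

    // `activity` is a structural stand-in; the real method returns the package's Future<void>.
    declare const activity: { interrupt(options?: { force?: boolean }): unknown };

    activity.interrupt(); // cooperative interrupt, as before
    activity.interrupt({ force: true }); // also force-interrupts queued speech handles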
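
Taken together, start/resume (old line 210), pause (old line 2324), drain, and close give AgentActivity a full pause/resume lifecycle. A structural sketch of the ordering the hunks imply; the method names are real, the harness around them is assumed:

    // Lifecycle ordering implied by this diff; the structural type mirrors the
    // methods shown in the hunks, not an exported type.
    async function lifecycle(activity: {
      start(): Promise<void>;
      pause(options?: { blockedTasks?: unknown[] }): Promise<void>;
      resume(): Promise<void>;
      drain(): Promise<void>;
      close(): Promise<void>;
    }): Promise<void> {
      await activity.start();  // opens a root span, wires models, runs onEnter
      await activity.pause();  // pauses speech scheduling, closes session resources
      await activity.resume(); // restarts scheduling; onEnter is not re-run
      await activity.drain();  // runs onExit, then waits for pending speech tasks
      await activity.close();  // unregisters handlers, cancels the main task
    }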
package/src/voice/agent_activity.ts

@@ -35,7 +35,7 @@ import type {
   TTSMetrics,
   VADMetrics,
 } from '../metrics/base.js';
-import { DeferredReadableStream } from '../stream/deferred_stream.js';
+import { MultiInputStream } from '../stream/multi_input_stream.js';
 import { STT, type STTError, type SpeechEvent } from '../stt/stt.js';
 import { recordRealtimeMetrics, traceTypes, tracer } from '../telemetry/index.js';
 import { splitWords } from '../tokenize/basic/word.js';
@@ -43,7 +43,13 @@ import { TTS, type TTSError } from '../tts/tts.js';
 import { Future, Task, cancelAndWait, waitFor } from '../utils.js';
 import { VAD, type VADEvent } from '../vad.js';
 import type { Agent, ModelSettings } from './agent.js';
-import { StopResponse, asyncLocalStorage } from './agent.js';
+import {
+  StopResponse,
+  _getActivityTaskInfo,
+  _setActivityTaskInfo,
+  functionCallStorage,
+  speechHandleStorage,
+} from './agent.js';
 import { type AgentSession, type TurnDetectionMode } from './agent_session.js';
 import {
   AudioRecognition,
@@ -60,7 +66,7 @@ import {
   createSpeechCreatedEvent,
   createUserInputTranscribedEvent,
 } from './events.js';
-import type { ToolExecutionOutput, _TTSGenerationData } from './generation.js';
+import type { ToolExecutionOutput, ToolOutput, _TTSGenerationData } from './generation.js';
 import {
   type _AudioOut,
   type _TextOut,
@@ -74,8 +80,9 @@ import {
 } from './generation.js';
 import type { TimedString } from './io.js';
 import { SpeechHandle } from './speech_handle.js';
+import { setParticipantSpanAttributes } from './utils.js';
 
-const speechHandleStorage = new AsyncLocalStorage<SpeechHandle>();
+export const agentActivityStorage = new AsyncLocalStorage<AgentActivity>();
 
 interface PreemptiveGeneration {
   speechHandle: SpeechHandle;
@@ -88,31 +95,47 @@ interface PreemptiveGeneration {
 }
 
 export class AgentActivity implements RecognitionHooks {
+  agent: Agent;
+  agentSession: AgentSession;
+
   private static readonly REPLY_TASK_CANCEL_TIMEOUT = 5000;
+
   private started = false;
   private audioRecognition?: AudioRecognition;
   private realtimeSession?: RealtimeSession;
   private realtimeSpans?: Map<string, Span>; // Maps response_id to OTEL span for metrics recording
   private turnDetectionMode?: Exclude<TurnDetectionMode, _TurnDetector>;
   private logger = log();
-  private _draining = false;
+  private _schedulingPaused = true;
+  private _drainBlockedTasks: Task<any>[] = [];
   private _currentSpeech?: SpeechHandle;
   private speechQueue: Heap<[number, number, SpeechHandle]>; // [priority, timestamp, speechHandle]
   private q_updated: Future;
   private speechTasks: Set<Task<void>> = new Set();
   private lock = new Mutex();
-  private audioStream = new DeferredReadableStream<AudioFrame>();
+  private audioStream = new MultiInputStream<AudioFrame>();
+  private audioStreamId?: string;
+
   // default to null as None, which maps to the default provider tool choice value
   private toolChoice: ToolChoice | null = null;
   private _preemptiveGeneration?: PreemptiveGeneration;
 
-  agent: Agent;
-  agentSession: AgentSession;
-
   /** @internal */
   _mainTask?: Task<void>;
-  _userTurnCompletedTask?: Promise<void>;
-
+  _onEnterTask?: Task<void>;
+  _onExitTask?: Task<void>;
+  _userTurnCompletedTask?: Task<void>;
+
+  private readonly onRealtimeGenerationCreated = (ev: GenerationCreatedEvent) =>
+    this.onGenerationCreated(ev);
+  private readonly onRealtimeInputSpeechStarted = (ev: InputSpeechStartedEvent) =>
+    this.onInputSpeechStarted(ev);
+  private readonly onRealtimeInputSpeechStopped = (ev: InputSpeechStoppedEvent) =>
+    this.onInputSpeechStopped(ev);
+  private readonly onRealtimeInputAudioTranscriptionCompleted = (ev: InputTranscriptionCompleted) =>
+    this.onInputAudioTranscriptionCompleted(ev);
+  private readonly onModelError = (ev: RealtimeModelError | STTError | TTSError | LLMError) =>
+    this.onError(ev);
   constructor(agent: Agent, agentSession: AgentSession) {
     this.agent = agent;
     this.agentSession = agentSession;
@@ -132,7 +155,7 @@ export class AgentActivity implements RecognitionHooks {
 
     if (this.turnDetectionMode === 'vad' && this.vad === undefined) {
       this.logger.warn(
-        'turnDetection is set to "vad", but no VAD model is provided, ignoring the turnDdetection setting',
+        'turnDetection is set to "vad", but no VAD model is provided, ignoring the turnDetection setting',
       );
       this.turnDetectionMode = undefined;
     }
@@ -210,117 +233,138 @@ export class AgentActivity implements RecognitionHooks {
   async start(): Promise<void> {
     const unlock = await this.lock.lock();
     try {
-      // Create start_agent_activity as a ROOT span (new trace) to match Python behavior
-      const startSpan = tracer.startSpan({
-        name: 'start_agent_activity',
-        attributes: { [traceTypes.ATTR_AGENT_LABEL]: this.agent.id },
-        context: ROOT_CONTEXT,
-      });
+      await this._startSession({ spanName: 'start_agent_activity', runOnEnter: true });
+    } finally {
+      unlock();
+    }
+  }
 
-      this.agent._agentActivity = this;
+  async resume(): Promise<void> {
+    const unlock = await this.lock.lock();
+    try {
+      await this._startSession({ spanName: 'resume_agent_activity', runOnEnter: false });
+    } finally {
+      unlock();
+    }
+  }
 
-      if (this.llm instanceof RealtimeModel) {
-        this.realtimeSession = this.llm.session();
-        this.realtimeSpans = new Map<string, Span>();
-        this.realtimeSession.on('generation_created', (ev) => this.onGenerationCreated(ev));
-        this.realtimeSession.on('input_speech_started', (ev) => this.onInputSpeechStarted(ev));
-        this.realtimeSession.on('input_speech_stopped', (ev) => this.onInputSpeechStopped(ev));
-        this.realtimeSession.on('input_audio_transcription_completed', (ev) =>
-          this.onInputAudioTranscriptionCompleted(ev),
-        );
-        this.realtimeSession.on('metrics_collected', (ev) => this.onMetricsCollected(ev));
-        this.realtimeSession.on('error', (ev) => this.onError(ev));
-
-        removeInstructions(this.agent._chatCtx);
-        try {
-          await this.realtimeSession.updateInstructions(this.agent.instructions);
-        } catch (error) {
-          this.logger.error(error, 'failed to update the instructions');
-        }
+  private async _startSession(options: {
+    spanName: 'start_agent_activity' | 'resume_agent_activity';
+    runOnEnter: boolean;
+  }): Promise<void> {
+    const { spanName, runOnEnter } = options;
+    const startSpan = tracer.startSpan({
+      name: spanName,
+      attributes: { [traceTypes.ATTR_AGENT_LABEL]: this.agent.id },
+      context: ROOT_CONTEXT,
+    });
 
-      try {
-        await this.realtimeSession.updateChatCtx(this.agent.chatCtx);
-      } catch (error) {
-        this.logger.error(error, 'failed to update the chat context');
-      }
+    this.agent._agentActivity = this;
 
-      try {
-        await this.realtimeSession.updateTools(this.tools);
-      } catch (error) {
-        this.logger.error(error, 'failed to update the tools');
-      }
+    if (this.llm instanceof RealtimeModel) {
+      this.realtimeSession = this.llm.session();
+      this.realtimeSpans = new Map<string, Span>();
+      this.realtimeSession.on('generation_created', this.onRealtimeGenerationCreated);
+      this.realtimeSession.on('input_speech_started', this.onRealtimeInputSpeechStarted);
+      this.realtimeSession.on('input_speech_stopped', this.onRealtimeInputSpeechStopped);
+      this.realtimeSession.on(
+        'input_audio_transcription_completed',
+        this.onRealtimeInputAudioTranscriptionCompleted,
+      );
+      this.realtimeSession.on('metrics_collected', this.onMetricsCollected);
+      this.realtimeSession.on('error', this.onModelError);
 
-      if (!this.llm.capabilities.audioOutput && !this.tts && this.agentSession.output.audio) {
-        this.logger.error(
-          'audio output is enabled but RealtimeModel has no audio modality ' +
-            'and no TTS is set. Either enable audio modality in the RealtimeModel ' +
-            'or set a TTS model.',
-        );
-      }
-    } else if (this.llm instanceof LLM) {
-      try {
-        updateInstructions({
-          chatCtx: this.agent._chatCtx,
-          instructions: this.agent.instructions,
-          addIfMissing: true,
-        });
-      } catch (error) {
-        this.logger.error('failed to update the instructions', error);
-      }
+      removeInstructions(this.agent._chatCtx);
+      try {
+        await this.realtimeSession.updateInstructions(this.agent.instructions);
+      } catch (error) {
+        this.logger.error(error, 'failed to update the instructions');
      }
 
-      // metrics and error handling
-      if (this.llm instanceof LLM) {
-        this.llm.on('metrics_collected', (ev) => this.onMetricsCollected(ev));
-        this.llm.on('error', (ev) => this.onError(ev));
+      try {
+        await this.realtimeSession.updateChatCtx(this.agent.chatCtx);
+      } catch (error) {
+        this.logger.error(error, 'failed to update the chat context');
      }
 
-      if (this.stt instanceof STT) {
-        this.stt.on('metrics_collected', (ev) => this.onMetricsCollected(ev));
-        this.stt.on('error', (ev) => this.onError(ev));
+      try {
+        await this.realtimeSession.updateTools(this.tools);
+      } catch (error) {
+        this.logger.error(error, 'failed to update the tools');
      }
 
-      if (this.tts instanceof TTS) {
-        this.tts.on('metrics_collected', (ev) => this.onMetricsCollected(ev));
-        this.tts.on('error', (ev) => this.onError(ev));
+      if (!this.llm.capabilities.audioOutput && !this.tts && this.agentSession.output.audio) {
+        this.logger.error(
+          'audio output is enabled but RealtimeModel has no audio modality ' +
+            'and no TTS is set. Either enable audio modality in the RealtimeModel ' +
+            'or set a TTS model.',
+        );
      }
-
-      if (this.vad instanceof VAD) {
-        this.vad.on('metrics_collected', (ev) => this.onMetricsCollected(ev));
+    } else if (this.llm instanceof LLM) {
+      try {
+        updateInstructions({
+          chatCtx: this.agent._chatCtx,
+          instructions: this.agent.instructions,
+          addIfMissing: true,
+        });
+      } catch (error) {
+        this.logger.error('failed to update the instructions', error);
      }
+    }
 
-      this.audioRecognition = new AudioRecognition({
-        recognitionHooks: this,
-        // Disable stt node if stt is not provided
-        stt: this.stt ? (...args) => this.agent.sttNode(...args) : undefined,
-        vad: this.vad,
-        turnDetector: typeof this.turnDetection === 'string' ? undefined : this.turnDetection,
-        turnDetectionMode: this.turnDetectionMode,
-        minEndpointingDelay: this.agentSession.options.minEndpointingDelay,
-        maxEndpointingDelay: this.agentSession.options.maxEndpointingDelay,
-        rootSpanContext: this.agentSession.rootSpanContext,
-      });
-      this.audioRecognition.start();
-      this.started = true;
+    // metrics and error handling
+    if (this.llm instanceof LLM) {
+      this.llm.on('metrics_collected', this.onMetricsCollected);
+      this.llm.on('error', this.onModelError);
+    }
 
-      this._mainTask = Task.from(({ signal }) => this.mainTask(signal));
+    if (this.stt instanceof STT) {
+      this.stt.on('metrics_collected', this.onMetricsCollected);
+      this.stt.on('error', this.onModelError);
+    }
 
-      // Create on_enter as a child of start_agent_activity in the new trace
-      const onEnterTask = tracer.startActiveSpan(async () => this.agent.onEnter(), {
-        name: 'on_enter',
-        context: trace.setSpan(ROOT_CONTEXT, startSpan),
-        attributes: { [traceTypes.ATTR_AGENT_LABEL]: this.agent.id },
-      });
+    if (this.tts instanceof TTS) {
+      this.tts.on('metrics_collected', this.onMetricsCollected);
+      this.tts.on('error', this.onModelError);
+    }
 
-      this.createSpeechTask({
-        task: Task.from(() => onEnterTask),
+    if (this.vad instanceof VAD) {
+      this.vad.on('metrics_collected', this.onMetricsCollected);
+    }
+
+    this.audioRecognition = new AudioRecognition({
+      recognitionHooks: this,
+      // Disable stt node if stt is not provided
+      stt: this.stt ? (...args) => this.agent.sttNode(...args) : undefined,
+      vad: this.vad,
+      turnDetector: typeof this.turnDetection === 'string' ? undefined : this.turnDetection,
+      turnDetectionMode: this.turnDetectionMode,
+      minEndpointingDelay: this.agentSession.options.minEndpointingDelay,
+      maxEndpointingDelay: this.agentSession.options.maxEndpointingDelay,
+      rootSpanContext: this.agentSession.rootSpanContext,
+      sttModel: this.stt?.label,
+      sttProvider: this.getSttProvider(),
+      getLinkedParticipant: () => this.agentSession._roomIO?.linkedParticipant,
+    });
+    this.audioRecognition.start();
+    this.started = true;
+
+    this._resumeSchedulingTask();
+
+    if (runOnEnter) {
+      this._onEnterTask = this.createSpeechTask({
+        taskFn: () =>
+          tracer.startActiveSpan(async () => this.agent.onEnter(), {
+            name: 'on_enter',
+            context: trace.setSpan(ROOT_CONTEXT, startSpan),
+            attributes: { [traceTypes.ATTR_AGENT_LABEL]: this.agent.id },
+          }),
+        inlineTask: true,
         name: 'AgentActivity_onEnter',
       });
-
-      startSpan.end();
-    } finally {
-      unlock();
     }
+
+    startSpan.end();
   }
 
   get currentSpeech(): SpeechHandle | undefined {
@@ -335,6 +379,17 @@ export class AgentActivity implements RecognitionHooks {
     return this.agent.stt || this.agentSession.stt;
   }
 
+  private getSttProvider(): string | undefined {
+    const label = this.stt?.label;
+    if (!label) {
+      return undefined;
+    }
+
+    // Heuristic: most labels look like "<provider>-<model>"
+    const [provider] = label.split('-', 1);
+    return provider || label;
+  }
+
   get llm(): LLM | RealtimeModel | undefined {
     return this.agent.llm || this.agentSession.llm;
   }
@@ -347,8 +402,8 @@ export class AgentActivity implements RecognitionHooks {
     return this.agent.toolCtx;
   }
 
-  get draining(): boolean {
-    return this._draining;
+  get schedulingPaused(): boolean {
+    return this._schedulingPaused;
   }
 
   get realtimeLLMSession(): RealtimeSession | undefined {
@@ -402,18 +457,10 @@ export class AgentActivity implements RecognitionHooks {
   }
 
   attachAudioInput(audioStream: ReadableStream<AudioFrame>): void {
-    if (this.audioStream.isSourceSet) {
-      this.logger.debug('detaching existing audio input in agent activity');
-      this.audioStream.detachSource();
-    }
+    void this.audioStream.close();
+    this.audioStream = new MultiInputStream<AudioFrame>();
 
-    /**
-     * We need to add a deferred ReadableStream layer on top of the audioStream from the agent session.
-     * The tee() operation should be applied to the deferred stream, not the original audioStream.
-     * This is important because teeing the original stream directly makes it very difficult—if not
-     * impossible—to implement stream unlock logic cleanly.
-     */
-    this.audioStream.setSource(audioStream);
+    this.audioStreamId = this.audioStream.addInputStream(audioStream);
     const [realtimeAudioStream, recognitionAudioStream] = this.audioStream.stream.tee();
 
     if (this.realtimeSession) {
@@ -426,16 +473,29 @@ export class AgentActivity implements RecognitionHooks {
   }
 
   detachAudioInput(): void {
-    this.audioStream.detachSource();
+    if (this.audioStreamId === undefined) {
+      return;
+    }
+
+    void this.audioStream.close();
+    this.audioStream = new MultiInputStream<AudioFrame>();
+    this.audioStreamId = undefined;
   }
 
-  commitUserTurn() {
+  commitUserTurn(
+    options: {
+      audioDetached?: boolean;
+      throwIfNotReady?: boolean;
+    } = {},
+  ) {
+    const { audioDetached = false, throwIfNotReady = true } = options;
     if (!this.audioRecognition) {
-      throw new Error('AudioRecognition is not initialized');
+      if (throwIfNotReady) {
+        throw new Error('AudioRecognition is not initialized');
+      }
+      return;
     }
 
-    // TODO(brian): add audio_detached flag
-    const audioDetached = false;
     this.audioRecognition.commitUserTurn(audioDetached);
   }
 
@@ -493,14 +553,13 @@ export class AgentActivity implements RecognitionHooks {
       }),
     );
     const task = this.createSpeechTask({
-      task: Task.from((abortController: AbortController) =>
+      taskFn: (abortController: AbortController) =>
        this.ttsTask(handle, text, addToChatCtx, {}, abortController, audio),
-      ),
      ownedSpeechHandle: handle,
      name: 'AgentActivity.say_tts',
    });
 
-    task.finally(() => this.onPipelineReplyDone());
+    task.result.finally(() => this.onPipelineReplyDone());
    this.scheduleSpeech(handle, SpeechHandle.SPEECH_PRIORITY_NORMAL);
    return handle;
  }
@@ -613,9 +672,9 @@ export class AgentActivity implements RecognitionHooks {
      return;
    }
 
-    if (this.draining) {
+    if (this.schedulingPaused) {
      // TODO(shubhra): should we "forward" this new turn to the next agent?
-      this.logger.warn('skipping new realtime generation, the agent is draining');
+      this.logger.warn('skipping new realtime generation, the speech scheduling is not running');
      return;
    }
 
@@ -633,9 +692,8 @@ export class AgentActivity implements RecognitionHooks {
    this.logger.info({ speech_id: handle.id }, 'Creating speech handle');
 
    this.createSpeechTask({
-      task: Task.from((abortController: AbortController) =>
+      taskFn: (abortController: AbortController) =>
        this.realtimeGenerationTask(handle, ev, {}, abortController),
-      ),
      ownedSpeechHandle: handle,
      name: 'AgentActivity.realtimeGeneration',
    });
@@ -767,7 +825,7 @@ export class AgentActivity implements RecognitionHooks {
  onPreemptiveGeneration(info: PreemptiveGenerationInfo): void {
    if (
      !this.agentSession.options.preemptiveGeneration ||
-      this.draining ||
+      this.schedulingPaused ||
      (this._currentSpeech !== undefined && !this._currentSpeech.interrupted) ||
      !(this.llm instanceof LLM)
    ) {
@@ -814,11 +872,32 @@ export class AgentActivity implements RecognitionHooks {
  }
 
  private createSpeechTask(options: {
-    task: Task<void>;
+    taskFn: (controller: AbortController) => Promise<void>;
+    controller?: AbortController;
    ownedSpeechHandle?: SpeechHandle;
+    inlineTask?: boolean;
    name?: string;
-  }): Promise<void> {
-    const { task, ownedSpeechHandle } = options;
+  }): Task<void> {
+    const { taskFn, controller, ownedSpeechHandle, inlineTask, name } = options;
+
+    const wrappedFn = (ctrl: AbortController) => {
+      return agentActivityStorage.run(this, () => {
+        // Mark inline/speech metadata at task runtime to avoid a race where taskFn executes
+        // before post-construction metadata is attached to the Task instance.
+        const currentTask = Task.current();
+        if (currentTask) {
+          _setActivityTaskInfo(currentTask, { speechHandle: ownedSpeechHandle, inlineTask });
+        }
+
+        if (ownedSpeechHandle) {
+          return speechHandleStorage.run(ownedSpeechHandle, () => taskFn(ctrl));
+        }
+        return taskFn(ctrl);
+      });
+    };
+
+    const task = Task.from(wrappedFn, controller, name);
+    _setActivityTaskInfo(task, { speechHandle: ownedSpeechHandle, inlineTask });
 
    this.speechTasks.add(task);
    task.addDoneCallback(() => {
@@ -838,13 +917,16 @@ export class AgentActivity implements RecognitionHooks {
      this.wakeupMainTask();
    });
 
-    return task.result;
+    return task;
  }
 
  async onEndOfTurn(info: EndOfTurnInfo): Promise<boolean> {
-    if (this.draining) {
+    if (this.schedulingPaused) {
      this.cancelPreemptiveGeneration();
-      this.logger.warn({ user_input: info.newTranscript }, 'skipping user input, task is draining');
+      this.logger.warn(
+        { user_input: info.newTranscript },
+        'skipping user input, speech scheduling is paused',
+      );
      // TODO(shubhra): should we "forward" this new turn to the next agent/activity?
      return true;
    }
@@ -877,7 +959,7 @@ export class AgentActivity implements RecognitionHooks {
 
    const oldTask = this._userTurnCompletedTask;
    this._userTurnCompletedTask = this.createSpeechTask({
-      task: Task.from(() => this.userTurnCompleted(info, oldTask)),
+      taskFn: () => this.userTurnCompleted(info, oldTask),
      name: 'AgentActivity.userTurnCompleted',
    });
    return true;
@@ -913,10 +995,12 @@ export class AgentActivity implements RecognitionHooks {
      this._currentSpeech = undefined;
    }
 
-      // If we're draining and there are no more speech tasks, we can exit.
-      // Only speech tasks can bypass draining to create a tool response
-      if (this.draining && this.speechTasks.size === 0) {
-        this.logger.info('mainTask: draining and no more speech tasks');
+      // if we're draining/pausing and there are no more speech tasks, we can exit.
+      // only speech tasks can bypass draining to create a tool response (see scheduleSpeech)
+      const toWait = this.getDrainPendingSpeechTasks();
+
+      if (this._schedulingPaused && toWait.length === 0) {
+        this.logger.info('mainTask: scheduling paused and no more speech tasks to wait');
        break;
      }
 
@@ -926,6 +1010,39 @@ export class AgentActivity implements RecognitionHooks {
    this.logger.info('AgentActivity mainTask: exiting');
  }
 
+  private getDrainPendingSpeechTasks(): Task<void>[] {
+    const blockedHandles: SpeechHandle[] = [];
+
+    for (const task of this._drainBlockedTasks) {
+      const info = _getActivityTaskInfo(task);
+      if (!info) {
+        this.logger.error('blocked task without activity info; skipping.');
+        continue;
+      }
+
+      if (!info.speechHandle) {
+        continue; // onEnter/onExit
+      }
+
+      blockedHandles.push(info.speechHandle);
+    }
+
+    const toWait: Task<void>[] = [];
+    for (const task of this.speechTasks) {
+      if (this._drainBlockedTasks.includes(task)) {
+        continue;
+      }
+
+      const info = _getActivityTaskInfo(task);
+      if (info && info.speechHandle && blockedHandles.includes(info.speechHandle)) {
+        continue;
+      }
+
+      toWait.push(task);
+    }
+    return toWait;
+  }
+
  private wakeupMainTask(): void {
    this.q_updated.resolve();
  }
@@ -967,7 +1084,7 @@ export class AgentActivity implements RecognitionHooks {
      throw new Error('trying to generate reply without an LLM model');
    }
 
-    const functionCall = asyncLocalStorage.getStore()?.functionCall;
+    const functionCall = functionCallStorage.getStore()?.functionCall;
    if (toolChoice === undefined && functionCall !== undefined) {
      // when generateReply is called inside a tool, set toolChoice to 'none' by default
      toolChoice = 'none';
@@ -989,7 +1106,7 @@ export class AgentActivity implements RecognitionHooks {
 
    if (this.llm instanceof RealtimeModel) {
      this.createSpeechTask({
-        task: Task.from((abortController: AbortController) =>
+        taskFn: (abortController: AbortController) =>
          this.realtimeReplyTask({
            speechHandle: handle,
            // TODO(brian): support llm.ChatMessage for the realtime model
@@ -1001,7 +1118,6 @@ export class AgentActivity implements RecognitionHooks {
          },
          abortController,
        }),
-        ),
        ownedSpeechHandle: handle,
        name: 'AgentActivity.realtimeReply',
      });
@@ -1014,7 +1130,7 @@ export class AgentActivity implements RecognitionHooks {
    }
 
    const task = this.createSpeechTask({
-      task: Task.from((abortController: AbortController) =>
+      taskFn: (abortController: AbortController) =>
        this.pipelineReplyTask(
          handle,
          chatCtx ?? this.agent.chatCtx,
@@ -1026,12 +1142,11 @@ export class AgentActivity implements RecognitionHooks {
          instructions,
          userMessage,
        ),
-      ),
      ownedSpeechHandle: handle,
      name: 'AgentActivity.pipelineReply',
    });
 
-      task.finally(() => this.onPipelineReplyDone());
+      task.result.finally(() => this.onPipelineReplyDone());
    }
 
    if (scheduleSpeech) {
@@ -1040,16 +1155,19 @@ export class AgentActivity implements RecognitionHooks {
    return handle;
  }
 
-  interrupt(): Future<void> {
+  interrupt(options: { force?: boolean } = {}): Future<void> {
+    const { force = false } = options;
+    this.cancelPreemptiveGeneration();
+
    const future = new Future<void>();
    const currentSpeech = this._currentSpeech;
 
    //TODO(AJS-273): add interrupt for background speeches
 
-    currentSpeech?.interrupt();
+    currentSpeech?.interrupt(force);
 
    for (const [_, __, speech] of this.speechQueue) {
-      speech.interrupt();
+      speech.interrupt(force);
    }
 
    this.realtimeSession?.interrupt();
@@ -1072,13 +1190,13 @@ export class AgentActivity implements RecognitionHooks {
    }
  }
 
-  private async userTurnCompleted(info: EndOfTurnInfo, oldTask?: Promise<void>): Promise<void> {
+  private async userTurnCompleted(info: EndOfTurnInfo, oldTask?: Task<void>): Promise<void> {
    if (oldTask) {
      // We never cancel user code as this is very confusing.
      // So we wait for the old execution of onUserTurnCompleted to finish.
      // In practice this is OK because most speeches will be interrupted if a new turn
      // is detected. So the previous execution should complete quickly.
-      await oldTask;
+      await oldTask.result;
    }
 
    // When the audio recognition detects the end of a user turn:
@@ -1355,6 +1473,11 @@ export class AgentActivity implements RecognitionHooks {
      span.setAttribute(traceTypes.ATTR_USER_INPUT, newMessage.textContent || '');
    }
 
+    const localParticipant = this.agentSession._roomIO?.localParticipant;
+    if (localParticipant) {
+      setParticipantSpanAttributes(span, localParticipant);
+    }
+
    speechHandleStorage.enterWith(speechHandle);
 
    const audioOutput = this.agentSession.output.audioEnabled
@@ -1531,13 +1654,15 @@ export class AgentActivity implements RecognitionHooks {
      for (const msg of toolsMessages) {
        msg.createdAt = replyStartedAt;
      }
-      this.agent._chatCtx.insert(toolsMessages);
-      // Only add FunctionCallOutput items to session history since FunctionCall items
-      // were already added by onToolExecutionStarted when the tool execution began
+      // Only insert FunctionCallOutput items into agent._chatCtx since FunctionCall items
+      // were already added by onToolExecutionStarted when the tool execution began.
+      // Inserting function_calls again would create duplicates that break provider APIs
+      // (e.g. Google's "function response parts != function call parts" error).
      const toolCallOutputs = toolsMessages.filter(
        (m): m is FunctionCallOutput => m.type === 'function_call_output',
      );
      if (toolCallOutputs.length > 0) {
+        this.agent._chatCtx.insert(toolCallOutputs);
        this.agentSession._toolItemsAdded(toolCallOutputs);
      }
    }
@@ -1645,52 +1770,18 @@ export class AgentActivity implements RecognitionHooks {
      return;
    }
 
-    const functionToolsExecutedEvent = createFunctionToolsExecutedEvent({
-      functionCalls: [],
-      functionCallOutputs: [],
-    });
-    let shouldGenerateToolReply: boolean = false;
-    let newAgentTask: Agent | null = null;
-    let ignoreTaskSwitch: boolean = false;
-
-    for (const sanitizedOut of toolOutput.output) {
-      if (sanitizedOut.toolCallOutput !== undefined) {
-        functionToolsExecutedEvent.functionCalls.push(sanitizedOut.toolCall);
-        functionToolsExecutedEvent.functionCallOutputs.push(sanitizedOut.toolCallOutput);
-        if (sanitizedOut.replyRequired) {
-          shouldGenerateToolReply = true;
-        }
-      }
-
-      if (newAgentTask !== null && sanitizedOut.agentTask !== undefined) {
-        this.logger.error('expected to receive only one agent task from the tool executions');
-        ignoreTaskSwitch = true;
-        // TODO(brian): should we mark the function call as failed to notify the LLM?
-      }
-
-      newAgentTask = sanitizedOut.agentTask ?? null;
-
-      this.logger.debug(
-        {
-          speechId: speechHandle.id,
-          name: sanitizedOut.toolCall?.name,
-          args: sanitizedOut.toolCall.args,
-          output: sanitizedOut.toolCallOutput?.output,
-          isError: sanitizedOut.toolCallOutput?.isError,
-        },
-        'Tool call execution finished',
-      );
-    }
+    const { functionToolsExecutedEvent, shouldGenerateToolReply, newAgentTask, ignoreTaskSwitch } =
+      this.summarizeToolExecutionOutput(toolOutput, speechHandle);
 
    this.agentSession.emit(
      AgentSessionEventTypes.FunctionToolsExecuted,
      functionToolsExecutedEvent,
    );
 
-    let draining = this.draining;
+    let schedulingPaused = this.schedulingPaused;
    if (!ignoreTaskSwitch && newAgentTask !== null) {
      this.agentSession.updateAgent(newAgentTask);
-      draining = true;
+      schedulingPaused = true;
    }
 
    const toolMessages = [
@@ -1705,11 +1796,12 @@ export class AgentActivity implements RecognitionHooks {
 
    // Avoid setting tool_choice to "required" or a specific function when
    // passing tool response back to the LLM
-    const respondToolChoice = draining || modelSettings.toolChoice === 'none' ? 'none' : 'auto';
+    const respondToolChoice =
+      schedulingPaused || modelSettings.toolChoice === 'none' ? 'none' : 'auto';
 
    // Reuse same speechHandle for tool response (parity with Python agent_activity.py L2122-2140)
    const toolResponseTask = this.createSpeechTask({
-      task: Task.from(() =>
+      taskFn: () =>
        this.pipelineReplyTask(
          speechHandle,
          chatCtx,
@@ -1720,12 +1812,11 @@ export class AgentActivity implements RecognitionHooks {
          undefined,
          toolMessages,
        ),
-      ),
      ownedSpeechHandle: speechHandle,
      name: 'AgentActivity.pipelineReply',
    });
 
-    toolResponseTask.finally(() => this.onPipelineReplyDone());
+    toolResponseTask.result.finally(() => this.onPipelineReplyDone());
 
    this.scheduleSpeech(speechHandle, SpeechHandle.SPEECH_PRIORITY_NORMAL, true);
  } else if (functionToolsExecutedEvent.functionCallOutputs.length > 0) {
@@ -1733,15 +1824,12 @@ export class AgentActivity implements RecognitionHooks {
      msg.createdAt = replyStartedAt;
    }
 
-    this.agent._chatCtx.insert(toolMessages);
-
-    // Only add FunctionCallOutput items to session history since FunctionCall items
-    // were already added by onToolExecutionStarted when the tool execution began
    const toolCallOutputs = toolMessages.filter(
      (m): m is FunctionCallOutput => m.type === 'function_call_output',
    );
 
    if (toolCallOutputs.length > 0) {
+      this.agent._chatCtx.insert(toolCallOutputs);
      this.agentSession._toolItemsAdded(toolCallOutputs);
    }
  }
@@ -1815,6 +1903,11 @@ export class AgentActivity implements RecognitionHooks {
 
    span.setAttribute(traceTypes.ATTR_SPEECH_ID, speechHandle.id);
 
+    const localParticipant = this.agentSession._roomIO?.localParticipant;
+    if (localParticipant) {
+      setParticipantSpanAttributes(span, localParticipant);
+    }
+
    speechHandleStorage.enterWith(speechHandle);
 
    if (!this.realtimeSession) {
@@ -2139,50 +2232,18 @@ export class AgentActivity implements RecognitionHooks {
      return;
    }
 
-    const functionToolsExecutedEvent = createFunctionToolsExecutedEvent({
-      functionCalls: [],
-      functionCallOutputs: [],
-    });
-    let shouldGenerateToolReply: boolean = false;
-    let newAgentTask: Agent | null = null;
-    let ignoreTaskSwitch: boolean = false;
-
-    for (const sanitizedOut of toolOutput.output) {
-      if (sanitizedOut.toolCallOutput !== undefined) {
-        functionToolsExecutedEvent.functionCallOutputs.push(sanitizedOut.toolCallOutput);
-        if (sanitizedOut.replyRequired) {
-          shouldGenerateToolReply = true;
-        }
-      }
-
-      if (newAgentTask !== null && sanitizedOut.agentTask !== undefined) {
-        this.logger.error('expected to receive only one agent task from the tool executions');
-        ignoreTaskSwitch = true;
-      }
-
-      newAgentTask = sanitizedOut.agentTask ?? null;
-
-      this.logger.debug(
-        {
-          speechId: speechHandle.id,
-          name: sanitizedOut.toolCall?.name,
-          args: sanitizedOut.toolCall.args,
-          output: sanitizedOut.toolCallOutput?.output,
-          isError: sanitizedOut.toolCallOutput?.isError,
-        },
-        'Tool call execution finished',
-      );
-    }
+    const { functionToolsExecutedEvent, shouldGenerateToolReply, newAgentTask, ignoreTaskSwitch } =
+      this.summarizeToolExecutionOutput(toolOutput, speechHandle);
 
    this.agentSession.emit(
      AgentSessionEventTypes.FunctionToolsExecuted,
      functionToolsExecutedEvent,
    );
 
-    let draining = this.draining;
+    let schedulingPaused = this.schedulingPaused;
    if (!ignoreTaskSwitch && newAgentTask !== null) {
      this.agentSession.updateAgent(newAgentTask);
-      draining = true;
+      schedulingPaused = true;
    }
 
    if (functionToolsExecutedEvent.functionCallOutputs.length > 0) {
@@ -2238,15 +2299,14 @@ export class AgentActivity implements RecognitionHooks {
      }),
    );
 
-    const toolChoice = draining || modelSettings.toolChoice === 'none' ? 'none' : 'auto';
+    const toolChoice = schedulingPaused || modelSettings.toolChoice === 'none' ? 'none' : 'auto';
    this.createSpeechTask({
-      task: Task.from((abortController: AbortController) =>
+      taskFn: (abortController: AbortController) =>
        this.realtimeReplyTask({
          speechHandle: replySpeechHandle,
          modelSettings: { toolChoice },
          abortController,
        }),
-      ),
      ownedSpeechHandle: replySpeechHandle,
      name: 'AgentActivity.realtime_reply',
    });
@@ -2254,6 +2314,53 @@ export class AgentActivity implements RecognitionHooks {
    this.scheduleSpeech(replySpeechHandle, SpeechHandle.SPEECH_PRIORITY_NORMAL, true);
  }
 
+  private summarizeToolExecutionOutput(toolOutput: ToolOutput, speechHandle: SpeechHandle) {
+    const functionToolsExecutedEvent = createFunctionToolsExecutedEvent({
+      functionCalls: [],
+      functionCallOutputs: [],
+    });
+
+    let shouldGenerateToolReply = false;
+    let newAgentTask: Agent | null = null;
+    let ignoreTaskSwitch = false;
+
+    for (const sanitizedOut of toolOutput.output) {
+      if (sanitizedOut.toolCallOutput !== undefined) {
+        // Keep event payload symmetric for pipeline + realtime paths.
+        functionToolsExecutedEvent.functionCalls.push(sanitizedOut.toolCall);
+        functionToolsExecutedEvent.functionCallOutputs.push(sanitizedOut.toolCallOutput);
+        if (sanitizedOut.replyRequired) {
+          shouldGenerateToolReply = true;
+        }
+      }
+
+      if (newAgentTask !== null && sanitizedOut.agentTask !== undefined) {
+        this.logger.error('expected to receive only one agent task from the tool executions');
+        ignoreTaskSwitch = true;
+      }
+
+      newAgentTask = sanitizedOut.agentTask ?? null;
+
+      this.logger.debug(
+        {
+          speechId: speechHandle.id,
+          name: sanitizedOut.toolCall?.name,
+          args: sanitizedOut.toolCall.args,
+          output: sanitizedOut.toolCallOutput?.output,
+          isError: sanitizedOut.toolCallOutput?.isError,
+        },
+        'Tool call execution finished',
+      );
    }
+
+    return {
+      functionToolsExecutedEvent,
+      shouldGenerateToolReply,
+      newAgentTask,
+      ignoreTaskSwitch,
+    };
+  }
+
  private async realtimeReplyTask({
    speechHandle,
    modelSettings: { toolChoice },
@@ -2312,10 +2419,10 @@ export class AgentActivity implements RecognitionHooks {
    priority: number,
    force: boolean = false,
  ): void {
-    // when force=true, we allow tool responses to bypass draining
+    // when force=true, we allow tool responses to bypass scheduling pause
    // This allows for tool responses to be generated before the AgentActivity is finalized
-    if (this.draining && !force) {
-      throw new Error('cannot schedule new speech, the agent is draining');
+    if (this.schedulingPaused && !force) {
+      throw new Error('cannot schedule new speech, the speech scheduling is draining/pausing');
    }
 
    // Monotonic time to avoid near 0 collisions
@@ -2324,6 +2431,48 @@ export class AgentActivity implements RecognitionHooks {
    this.wakeupMainTask();
  }
 
+  private async _pauseSchedulingTask(blockedTasks: Task<any>[]): Promise<void> {
+    if (this._schedulingPaused) return;
+
+    this._schedulingPaused = true;
+    this._drainBlockedTasks = blockedTasks;
+    this.wakeupMainTask();
+
+    if (this._mainTask) {
+      // When pausing/draining, we ensure that all speech_tasks complete fully.
+      // This means that even if the SpeechHandle themselves have finished,
+      // we still wait for the entire execution (e.g function_tools)
+      await this._mainTask.result;
+    }
+  }
+
+  private _resumeSchedulingTask(): void {
+    if (!this._schedulingPaused) return;
+
+    this._schedulingPaused = false;
+    this._mainTask = Task.from(({ signal }) => this.mainTask(signal));
+  }
+
+  async pause(options: { blockedTasks?: Task<any>[] } = {}): Promise<void> {
+    const { blockedTasks = [] } = options;
+    const unlock = await this.lock.lock();
+
+    try {
+      const span = tracer.startSpan({
+        name: 'pause_agent_activity',
+        attributes: { [traceTypes.ATTR_AGENT_LABEL]: this.agent.id },
+      });
+      try {
+        await this._pauseSchedulingTask(blockedTasks);
+        await this._closeSessionResources();
+      } finally {
+        span.end();
+      }
+    } finally {
+      unlock();
+    }
+  }
+
  async drain(): Promise<void> {
    // Create drain_agent_activity as a ROOT span (new trace) to match Python behavior
    return tracer.startActiveSpan(async (span) => this._drainImpl(span), {
@@ -2337,23 +2486,22 @@ export class AgentActivity implements RecognitionHooks {
 
    const unlock = await this.lock.lock();
    try {
-      if (this._draining) return;
-
-      this.cancelPreemptiveGeneration();
+      if (this._schedulingPaused) return;
 
-      const onExitTask = tracer.startActiveSpan(async () => this.agent.onExit(), {
-        name: 'on_exit',
-        attributes: { [traceTypes.ATTR_AGENT_LABEL]: this.agent.id },
-      });
-
-      this.createSpeechTask({
-        task: Task.from(() => onExitTask),
+      this._onExitTask = this.createSpeechTask({
+        taskFn: () =>
+          tracer.startActiveSpan(async () => this.agent.onExit(), {
+            name: 'on_exit',
+            attributes: { [traceTypes.ATTR_AGENT_LABEL]: this.agent.id },
+          }),
+        inlineTask: true,
        name: 'AgentActivity_onExit',
      });
 
-      this.wakeupMainTask();
-      this._draining = true;
-      await this._mainTask?.result;
+      this.cancelPreemptiveGeneration();
+
+      await this._onExitTask.result;
+      await this._pauseSchedulingTask([]);
    } finally {
      unlock();
    }
@@ -2362,44 +2510,59 @@ export class AgentActivity implements RecognitionHooks {
  async close(): Promise<void> {
    const unlock = await this.lock.lock();
    try {
-      if (!this._draining) {
-        this.logger.warn('task closing without draining');
-      }
-
      this.cancelPreemptiveGeneration();
-      // Unregister event handlers to prevent duplicate metrics
-      if (this.llm instanceof LLM) {
-        this.llm.off('metrics_collected', this.onMetricsCollected);
-      }
-      if (this.realtimeSession) {
-        this.realtimeSession.off('generation_created', this.onGenerationCreated);
-        this.realtimeSession.off('input_speech_started', this.onInputSpeechStarted);
-        this.realtimeSession.off('input_speech_stopped', this.onInputSpeechStopped);
-        this.realtimeSession.off(
-          'input_audio_transcription_completed',
-          this.onInputAudioTranscriptionCompleted,
-        );
-        this.realtimeSession.off('metrics_collected', this.onMetricsCollected);
-      }
-      if (this.stt instanceof STT) {
-        this.stt.off('metrics_collected', this.onMetricsCollected);
-      }
-      if (this.tts instanceof TTS) {
-        this.tts.off('metrics_collected', this.onMetricsCollected);
-      }
-      if (this.vad instanceof VAD) {
-        this.vad.off('metrics_collected', this.onMetricsCollected);
+      await this._closeSessionResources();
+
+      if (this._mainTask) {
+        await this._mainTask.cancelAndWait();
      }
 
-      this.detachAudioInput();
-      this.realtimeSpans?.clear();
-      await this.realtimeSession?.close();
-      await this.audioRecognition?.close();
-      await this._mainTask?.cancelAndWait();
+      this.agent._agentActivity = undefined;
    } finally {
      unlock();
    }
  }
+
+  private async _closeSessionResources(): Promise<void> {
+    // Unregister event handlers to prevent duplicate metrics
+    if (this.llm instanceof LLM) {
+      this.llm.off('metrics_collected', this.onMetricsCollected);
+      this.llm.off('error', this.onModelError);
+    }
+
+    if (this.realtimeSession) {
+      this.realtimeSession.off('generation_created', this.onRealtimeGenerationCreated);
+      this.realtimeSession.off('input_speech_started', this.onRealtimeInputSpeechStarted);
+      this.realtimeSession.off('input_speech_stopped', this.onRealtimeInputSpeechStopped);
+      this.realtimeSession.off(
+        'input_audio_transcription_completed',
+        this.onRealtimeInputAudioTranscriptionCompleted,
+      );
+      this.realtimeSession.off('metrics_collected', this.onMetricsCollected);
+      this.realtimeSession.off('error', this.onModelError);
+    }
+
+    if (this.stt instanceof STT) {
+      this.stt.off('metrics_collected', this.onMetricsCollected);
+      this.stt.off('error', this.onModelError);
+    }
+
+    if (this.tts instanceof TTS) {
+      this.tts.off('metrics_collected', this.onMetricsCollected);
+      this.tts.off('error', this.onModelError);
+    }
+
+    if (this.vad instanceof VAD) {
+      this.vad.off('metrics_collected', this.onMetricsCollected);
+    }
+
+    this.detachAudioInput();
+    this.realtimeSpans?.clear();
+    await this.realtimeSession?.close();
+    await this.audioRecognition?.close();
+    this.realtimeSession = undefined;
+    this.audioRecognition = undefined;
+  }
 }
 
 function toOaiToolChoice(toolChoice: ToolChoice | null): ToolChoice | undefined {