@livekit/agents 1.0.46 → 1.0.48

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (228)
  1. package/dist/beta/index.cjs +29 -0
  2. package/dist/beta/index.cjs.map +1 -0
  3. package/dist/beta/index.d.cts +2 -0
  4. package/dist/beta/index.d.ts +2 -0
  5. package/dist/beta/index.d.ts.map +1 -0
  6. package/dist/beta/index.js +7 -0
  7. package/dist/beta/index.js.map +1 -0
  8. package/dist/beta/workflows/index.cjs +29 -0
  9. package/dist/beta/workflows/index.cjs.map +1 -0
  10. package/dist/beta/workflows/index.d.cts +2 -0
  11. package/dist/beta/workflows/index.d.ts +2 -0
  12. package/dist/beta/workflows/index.d.ts.map +1 -0
  13. package/dist/beta/workflows/index.js +7 -0
  14. package/dist/beta/workflows/index.js.map +1 -0
  15. package/dist/beta/workflows/task_group.cjs +162 -0
  16. package/dist/beta/workflows/task_group.cjs.map +1 -0
  17. package/dist/beta/workflows/task_group.d.cts +32 -0
  18. package/dist/beta/workflows/task_group.d.ts +32 -0
  19. package/dist/beta/workflows/task_group.d.ts.map +1 -0
  20. package/dist/beta/workflows/task_group.js +138 -0
  21. package/dist/beta/workflows/task_group.js.map +1 -0
  22. package/dist/cli.cjs +14 -20
  23. package/dist/cli.cjs.map +1 -1
  24. package/dist/cli.d.ts.map +1 -1
  25. package/dist/cli.js +14 -20
  26. package/dist/cli.js.map +1 -1
  27. package/dist/index.cjs +3 -0
  28. package/dist/index.cjs.map +1 -1
  29. package/dist/index.d.cts +2 -1
  30. package/dist/index.d.ts +2 -1
  31. package/dist/index.d.ts.map +1 -1
  32. package/dist/index.js +2 -0
  33. package/dist/index.js.map +1 -1
  34. package/dist/inference/api_protos.d.cts +59 -59
  35. package/dist/inference/api_protos.d.ts +59 -59
  36. package/dist/ipc/job_proc_lazy_main.cjs +14 -5
  37. package/dist/ipc/job_proc_lazy_main.cjs.map +1 -1
  38. package/dist/ipc/job_proc_lazy_main.js +14 -5
  39. package/dist/ipc/job_proc_lazy_main.js.map +1 -1
  40. package/dist/llm/chat_context.cjs +108 -1
  41. package/dist/llm/chat_context.cjs.map +1 -1
  42. package/dist/llm/chat_context.d.cts +14 -1
  43. package/dist/llm/chat_context.d.ts +14 -1
  44. package/dist/llm/chat_context.d.ts.map +1 -1
  45. package/dist/llm/chat_context.js +108 -1
  46. package/dist/llm/chat_context.js.map +1 -1
  47. package/dist/llm/chat_context.test.cjs +43 -0
  48. package/dist/llm/chat_context.test.cjs.map +1 -1
  49. package/dist/llm/chat_context.test.js +43 -0
  50. package/dist/llm/chat_context.test.js.map +1 -1
  51. package/dist/llm/index.cjs +2 -0
  52. package/dist/llm/index.cjs.map +1 -1
  53. package/dist/llm/index.d.cts +1 -1
  54. package/dist/llm/index.d.ts +1 -1
  55. package/dist/llm/index.d.ts.map +1 -1
  56. package/dist/llm/index.js +3 -1
  57. package/dist/llm/index.js.map +1 -1
  58. package/dist/llm/provider_format/index.cjs +2 -0
  59. package/dist/llm/provider_format/index.cjs.map +1 -1
  60. package/dist/llm/provider_format/index.d.cts +2 -2
  61. package/dist/llm/provider_format/index.d.ts +2 -2
  62. package/dist/llm/provider_format/index.d.ts.map +1 -1
  63. package/dist/llm/provider_format/index.js +6 -1
  64. package/dist/llm/provider_format/index.js.map +1 -1
  65. package/dist/llm/provider_format/openai.cjs +82 -2
  66. package/dist/llm/provider_format/openai.cjs.map +1 -1
  67. package/dist/llm/provider_format/openai.d.cts +1 -0
  68. package/dist/llm/provider_format/openai.d.ts +1 -0
  69. package/dist/llm/provider_format/openai.d.ts.map +1 -1
  70. package/dist/llm/provider_format/openai.js +80 -1
  71. package/dist/llm/provider_format/openai.js.map +1 -1
  72. package/dist/llm/provider_format/openai.test.cjs +326 -0
  73. package/dist/llm/provider_format/openai.test.cjs.map +1 -1
  74. package/dist/llm/provider_format/openai.test.js +327 -1
  75. package/dist/llm/provider_format/openai.test.js.map +1 -1
  76. package/dist/llm/provider_format/utils.cjs +4 -3
  77. package/dist/llm/provider_format/utils.cjs.map +1 -1
  78. package/dist/llm/provider_format/utils.d.ts.map +1 -1
  79. package/dist/llm/provider_format/utils.js +4 -3
  80. package/dist/llm/provider_format/utils.js.map +1 -1
  81. package/dist/llm/realtime.cjs.map +1 -1
  82. package/dist/llm/realtime.d.cts +1 -0
  83. package/dist/llm/realtime.d.ts +1 -0
  84. package/dist/llm/realtime.d.ts.map +1 -1
  85. package/dist/llm/realtime.js.map +1 -1
  86. package/dist/llm/tool_context.cjs +7 -0
  87. package/dist/llm/tool_context.cjs.map +1 -1
  88. package/dist/llm/tool_context.d.cts +10 -2
  89. package/dist/llm/tool_context.d.ts +10 -2
  90. package/dist/llm/tool_context.d.ts.map +1 -1
  91. package/dist/llm/tool_context.js +6 -0
  92. package/dist/llm/tool_context.js.map +1 -1
  93. package/dist/log.cjs +5 -2
  94. package/dist/log.cjs.map +1 -1
  95. package/dist/log.d.ts.map +1 -1
  96. package/dist/log.js +5 -2
  97. package/dist/log.js.map +1 -1
  98. package/dist/stream/deferred_stream.cjs +15 -6
  99. package/dist/stream/deferred_stream.cjs.map +1 -1
  100. package/dist/stream/deferred_stream.d.ts.map +1 -1
  101. package/dist/stream/deferred_stream.js +15 -6
  102. package/dist/stream/deferred_stream.js.map +1 -1
  103. package/dist/utils.cjs +32 -2
  104. package/dist/utils.cjs.map +1 -1
  105. package/dist/utils.d.cts +7 -0
  106. package/dist/utils.d.ts +7 -0
  107. package/dist/utils.d.ts.map +1 -1
  108. package/dist/utils.js +32 -2
  109. package/dist/utils.js.map +1 -1
  110. package/dist/utils.test.cjs +71 -0
  111. package/dist/utils.test.cjs.map +1 -1
  112. package/dist/utils.test.js +71 -0
  113. package/dist/utils.test.js.map +1 -1
  114. package/dist/version.cjs +1 -1
  115. package/dist/version.cjs.map +1 -1
  116. package/dist/version.d.cts +1 -1
  117. package/dist/version.d.ts +1 -1
  118. package/dist/version.d.ts.map +1 -1
  119. package/dist/version.js +1 -1
  120. package/dist/version.js.map +1 -1
  121. package/dist/voice/agent.cjs +153 -12
  122. package/dist/voice/agent.cjs.map +1 -1
  123. package/dist/voice/agent.d.cts +30 -4
  124. package/dist/voice/agent.d.ts +30 -4
  125. package/dist/voice/agent.d.ts.map +1 -1
  126. package/dist/voice/agent.js +149 -11
  127. package/dist/voice/agent.js.map +1 -1
  128. package/dist/voice/agent.test.cjs +120 -0
  129. package/dist/voice/agent.test.cjs.map +1 -1
  130. package/dist/voice/agent.test.js +122 -2
  131. package/dist/voice/agent.test.js.map +1 -1
  132. package/dist/voice/agent_activity.cjs +406 -298
  133. package/dist/voice/agent_activity.cjs.map +1 -1
  134. package/dist/voice/agent_activity.d.cts +41 -7
  135. package/dist/voice/agent_activity.d.ts +41 -7
  136. package/dist/voice/agent_activity.d.ts.map +1 -1
  137. package/dist/voice/agent_activity.js +407 -294
  138. package/dist/voice/agent_activity.js.map +1 -1
  139. package/dist/voice/agent_session.cjs +140 -40
  140. package/dist/voice/agent_session.cjs.map +1 -1
  141. package/dist/voice/agent_session.d.cts +19 -7
  142. package/dist/voice/agent_session.d.ts +19 -7
  143. package/dist/voice/agent_session.d.ts.map +1 -1
  144. package/dist/voice/agent_session.js +137 -37
  145. package/dist/voice/agent_session.js.map +1 -1
  146. package/dist/voice/audio_recognition.cjs +4 -0
  147. package/dist/voice/audio_recognition.cjs.map +1 -1
  148. package/dist/voice/audio_recognition.d.ts.map +1 -1
  149. package/dist/voice/audio_recognition.js +4 -0
  150. package/dist/voice/audio_recognition.js.map +1 -1
  151. package/dist/voice/generation.cjs +39 -19
  152. package/dist/voice/generation.cjs.map +1 -1
  153. package/dist/voice/generation.d.ts.map +1 -1
  154. package/dist/voice/generation.js +44 -20
  155. package/dist/voice/generation.js.map +1 -1
  156. package/dist/voice/index.cjs +2 -0
  157. package/dist/voice/index.cjs.map +1 -1
  158. package/dist/voice/index.d.cts +1 -1
  159. package/dist/voice/index.d.ts +1 -1
  160. package/dist/voice/index.d.ts.map +1 -1
  161. package/dist/voice/index.js +2 -1
  162. package/dist/voice/index.js.map +1 -1
  163. package/dist/voice/room_io/room_io.cjs +11 -2
  164. package/dist/voice/room_io/room_io.cjs.map +1 -1
  165. package/dist/voice/room_io/room_io.d.ts.map +1 -1
  166. package/dist/voice/room_io/room_io.js +12 -3
  167. package/dist/voice/room_io/room_io.js.map +1 -1
  168. package/dist/voice/speech_handle.cjs +7 -1
  169. package/dist/voice/speech_handle.cjs.map +1 -1
  170. package/dist/voice/speech_handle.d.cts +2 -0
  171. package/dist/voice/speech_handle.d.ts +2 -0
  172. package/dist/voice/speech_handle.d.ts.map +1 -1
  173. package/dist/voice/speech_handle.js +8 -2
  174. package/dist/voice/speech_handle.js.map +1 -1
  175. package/dist/voice/testing/fake_llm.cjs +127 -0
  176. package/dist/voice/testing/fake_llm.cjs.map +1 -0
  177. package/dist/voice/testing/fake_llm.d.cts +30 -0
  178. package/dist/voice/testing/fake_llm.d.ts +30 -0
  179. package/dist/voice/testing/fake_llm.d.ts.map +1 -0
  180. package/dist/voice/testing/fake_llm.js +103 -0
  181. package/dist/voice/testing/fake_llm.js.map +1 -0
  182. package/dist/voice/testing/index.cjs +3 -0
  183. package/dist/voice/testing/index.cjs.map +1 -1
  184. package/dist/voice/testing/index.d.cts +1 -0
  185. package/dist/voice/testing/index.d.ts +1 -0
  186. package/dist/voice/testing/index.d.ts.map +1 -1
  187. package/dist/voice/testing/index.js +2 -0
  188. package/dist/voice/testing/index.js.map +1 -1
  189. package/dist/voice/testing/run_result.cjs +66 -15
  190. package/dist/voice/testing/run_result.cjs.map +1 -1
  191. package/dist/voice/testing/run_result.d.cts +14 -3
  192. package/dist/voice/testing/run_result.d.ts +14 -3
  193. package/dist/voice/testing/run_result.d.ts.map +1 -1
  194. package/dist/voice/testing/run_result.js +66 -15
  195. package/dist/voice/testing/run_result.js.map +1 -1
  196. package/package.json +1 -1
  197. package/src/beta/index.ts +9 -0
  198. package/src/beta/workflows/index.ts +9 -0
  199. package/src/beta/workflows/task_group.ts +194 -0
  200. package/src/cli.ts +20 -33
  201. package/src/index.ts +2 -1
  202. package/src/ipc/job_proc_lazy_main.ts +16 -5
  203. package/src/llm/chat_context.test.ts +48 -0
  204. package/src/llm/chat_context.ts +158 -0
  205. package/src/llm/index.ts +1 -0
  206. package/src/llm/provider_format/index.ts +7 -2
  207. package/src/llm/provider_format/openai.test.ts +385 -1
  208. package/src/llm/provider_format/openai.ts +103 -0
  209. package/src/llm/provider_format/utils.ts +6 -4
  210. package/src/llm/realtime.ts +1 -0
  211. package/src/llm/tool_context.ts +14 -0
  212. package/src/log.ts +5 -2
  213. package/src/stream/deferred_stream.ts +17 -6
  214. package/src/utils.test.ts +87 -0
  215. package/src/utils.ts +41 -2
  216. package/src/version.ts +1 -1
  217. package/src/voice/agent.test.ts +140 -2
  218. package/src/voice/agent.ts +200 -10
  219. package/src/voice/agent_activity.ts +466 -290
  220. package/src/voice/agent_session.ts +178 -40
  221. package/src/voice/audio_recognition.ts +4 -0
  222. package/src/voice/generation.ts +52 -23
  223. package/src/voice/index.ts +1 -1
  224. package/src/voice/room_io/room_io.ts +14 -3
  225. package/src/voice/speech_handle.ts +9 -2
  226. package/src/voice/testing/fake_llm.ts +138 -0
  227. package/src/voice/testing/index.ts +2 -0
  228. package/src/voice/testing/run_result.ts +81 -23
@@ -23,6 +23,7 @@ import {
23
23
  type RealtimeSession,
24
24
  type ToolChoice,
25
25
  type ToolContext,
26
+ ToolFlag,
26
27
  } from '../llm/index.js';
27
28
  import type { LLMError } from '../llm/llm.js';
28
29
  import { isSameToolChoice, isSameToolContext } from '../llm/tool_context.js';
@@ -35,7 +36,7 @@ import type {
35
36
  TTSMetrics,
36
37
  VADMetrics,
37
38
  } from '../metrics/base.js';
38
- import { DeferredReadableStream } from '../stream/deferred_stream.js';
39
+ import { MultiInputStream } from '../stream/multi_input_stream.js';
39
40
  import { STT, type STTError, type SpeechEvent } from '../stt/stt.js';
40
41
  import { recordRealtimeMetrics, traceTypes, tracer } from '../telemetry/index.js';
41
42
  import { splitWords } from '../tokenize/basic/word.js';
@@ -43,7 +44,13 @@ import { TTS, type TTSError } from '../tts/tts.js';
43
44
  import { Future, Task, cancelAndWait, waitFor } from '../utils.js';
44
45
  import { VAD, type VADEvent } from '../vad.js';
45
46
  import type { Agent, ModelSettings } from './agent.js';
46
- import { StopResponse, asyncLocalStorage } from './agent.js';
47
+ import {
48
+ StopResponse,
49
+ _getActivityTaskInfo,
50
+ _setActivityTaskInfo,
51
+ functionCallStorage,
52
+ speechHandleStorage,
53
+ } from './agent.js';
47
54
  import { type AgentSession, type TurnDetectionMode } from './agent_session.js';
48
55
  import {
49
56
  AudioRecognition,
@@ -60,7 +67,7 @@ import {
60
67
  createSpeechCreatedEvent,
61
68
  createUserInputTranscribedEvent,
62
69
  } from './events.js';
63
- import type { ToolExecutionOutput, _TTSGenerationData } from './generation.js';
70
+ import type { ToolExecutionOutput, ToolOutput, _TTSGenerationData } from './generation.js';
64
71
  import {
65
72
  type _AudioOut,
66
73
  type _TextOut,
@@ -76,7 +83,13 @@ import type { TimedString } from './io.js';
76
83
  import { SpeechHandle } from './speech_handle.js';
77
84
  import { setParticipantSpanAttributes } from './utils.js';
78
85
 
79
- const speechHandleStorage = new AsyncLocalStorage<SpeechHandle>();
86
+ export const agentActivityStorage = new AsyncLocalStorage<AgentActivity>();
87
+ export const onEnterStorage = new AsyncLocalStorage<OnEnterData>();
88
+
89
+ interface OnEnterData {
90
+ session: AgentSession;
91
+ agent: Agent;
92
+ }
80
93
 
81
94
  interface PreemptiveGeneration {
82
95
  speechHandle: SpeechHandle;
@@ -89,31 +102,47 @@ interface PreemptiveGeneration {
89
102
  }
90
103
 
91
104
  export class AgentActivity implements RecognitionHooks {
105
+ agent: Agent;
106
+ agentSession: AgentSession;
107
+
92
108
  private static readonly REPLY_TASK_CANCEL_TIMEOUT = 5000;
109
+
93
110
  private started = false;
94
111
  private audioRecognition?: AudioRecognition;
95
112
  private realtimeSession?: RealtimeSession;
96
113
  private realtimeSpans?: Map<string, Span>; // Maps response_id to OTEL span for metrics recording
97
114
  private turnDetectionMode?: Exclude<TurnDetectionMode, _TurnDetector>;
98
115
  private logger = log();
99
- private _draining = false;
116
+ private _schedulingPaused = true;
117
+ private _drainBlockedTasks: Task<any>[] = [];
100
118
  private _currentSpeech?: SpeechHandle;
101
119
  private speechQueue: Heap<[number, number, SpeechHandle]>; // [priority, timestamp, speechHandle]
102
120
  private q_updated: Future;
103
121
  private speechTasks: Set<Task<void>> = new Set();
104
122
  private lock = new Mutex();
105
- private audioStream = new DeferredReadableStream<AudioFrame>();
123
+ private audioStream = new MultiInputStream<AudioFrame>();
124
+ private audioStreamId?: string;
125
+
106
126
  // default to null as None, which maps to the default provider tool choice value
107
127
  private toolChoice: ToolChoice | null = null;
108
128
  private _preemptiveGeneration?: PreemptiveGeneration;
109
129
 
110
- agent: Agent;
111
- agentSession: AgentSession;
112
-
113
130
  /** @internal */
114
131
  _mainTask?: Task<void>;
115
- _userTurnCompletedTask?: Promise<void>;
116
-
132
+ _onEnterTask?: Task<void>;
133
+ _onExitTask?: Task<void>;
134
+ _userTurnCompletedTask?: Task<void>;
135
+
136
+ private readonly onRealtimeGenerationCreated = (ev: GenerationCreatedEvent) =>
137
+ this.onGenerationCreated(ev);
138
+ private readonly onRealtimeInputSpeechStarted = (ev: InputSpeechStartedEvent) =>
139
+ this.onInputSpeechStarted(ev);
140
+ private readonly onRealtimeInputSpeechStopped = (ev: InputSpeechStoppedEvent) =>
141
+ this.onInputSpeechStopped(ev);
142
+ private readonly onRealtimeInputAudioTranscriptionCompleted = (ev: InputTranscriptionCompleted) =>
143
+ this.onInputAudioTranscriptionCompleted(ev);
144
+ private readonly onModelError = (ev: RealtimeModelError | STTError | TTSError | LLMError) =>
145
+ this.onError(ev);
117
146
  constructor(agent: Agent, agentSession: AgentSession) {
118
147
  this.agent = agent;
119
148
  this.agentSession = agentSession;
@@ -133,7 +162,7 @@ export class AgentActivity implements RecognitionHooks {
133
162
 
134
163
  if (this.turnDetectionMode === 'vad' && this.vad === undefined) {
135
164
  this.logger.warn(
136
- 'turnDetection is set to "vad", but no VAD model is provided, ignoring the turnDdetection setting',
165
+ 'turnDetection is set to "vad", but no VAD model is provided, ignoring the turnDetection setting',
137
166
  );
138
167
  this.turnDetectionMode = undefined;
139
168
  }
@@ -211,120 +240,142 @@ export class AgentActivity implements RecognitionHooks {
211
240
  async start(): Promise<void> {
212
241
  const unlock = await this.lock.lock();
213
242
  try {
214
- // Create start_agent_activity as a ROOT span (new trace) to match Python behavior
215
- const startSpan = tracer.startSpan({
216
- name: 'start_agent_activity',
217
- attributes: { [traceTypes.ATTR_AGENT_LABEL]: this.agent.id },
218
- context: ROOT_CONTEXT,
219
- });
243
+ await this._startSession({ spanName: 'start_agent_activity', runOnEnter: true });
244
+ } finally {
245
+ unlock();
246
+ }
247
+ }
220
248
 
221
- this.agent._agentActivity = this;
249
+ async resume(): Promise<void> {
250
+ const unlock = await this.lock.lock();
251
+ try {
252
+ await this._startSession({ spanName: 'resume_agent_activity', runOnEnter: false });
253
+ } finally {
254
+ unlock();
255
+ }
256
+ }
222
257
 
223
- if (this.llm instanceof RealtimeModel) {
224
- this.realtimeSession = this.llm.session();
225
- this.realtimeSpans = new Map<string, Span>();
226
- this.realtimeSession.on('generation_created', (ev) => this.onGenerationCreated(ev));
227
- this.realtimeSession.on('input_speech_started', (ev) => this.onInputSpeechStarted(ev));
228
- this.realtimeSession.on('input_speech_stopped', (ev) => this.onInputSpeechStopped(ev));
229
- this.realtimeSession.on('input_audio_transcription_completed', (ev) =>
230
- this.onInputAudioTranscriptionCompleted(ev),
231
- );
232
- this.realtimeSession.on('metrics_collected', (ev) => this.onMetricsCollected(ev));
233
- this.realtimeSession.on('error', (ev) => this.onError(ev));
234
-
235
- removeInstructions(this.agent._chatCtx);
236
- try {
237
- await this.realtimeSession.updateInstructions(this.agent.instructions);
238
- } catch (error) {
239
- this.logger.error(error, 'failed to update the instructions');
240
- }
258
+ private async _startSession(options: {
259
+ spanName: 'start_agent_activity' | 'resume_agent_activity';
260
+ runOnEnter: boolean;
261
+ }): Promise<void> {
262
+ const { spanName, runOnEnter } = options;
263
+ const startSpan = tracer.startSpan({
264
+ name: spanName,
265
+ attributes: { [traceTypes.ATTR_AGENT_LABEL]: this.agent.id },
266
+ context: ROOT_CONTEXT,
267
+ });
241
268
 
242
- try {
243
- await this.realtimeSession.updateChatCtx(this.agent.chatCtx);
244
- } catch (error) {
245
- this.logger.error(error, 'failed to update the chat context');
246
- }
269
+ this.agent._agentActivity = this;
247
270
 
248
- try {
249
- await this.realtimeSession.updateTools(this.tools);
250
- } catch (error) {
251
- this.logger.error(error, 'failed to update the tools');
252
- }
271
+ if (this.llm instanceof RealtimeModel) {
272
+ this.realtimeSession = this.llm.session();
273
+ this.realtimeSpans = new Map<string, Span>();
274
+ this.realtimeSession.on('generation_created', this.onRealtimeGenerationCreated);
275
+ this.realtimeSession.on('input_speech_started', this.onRealtimeInputSpeechStarted);
276
+ this.realtimeSession.on('input_speech_stopped', this.onRealtimeInputSpeechStopped);
277
+ this.realtimeSession.on(
278
+ 'input_audio_transcription_completed',
279
+ this.onRealtimeInputAudioTranscriptionCompleted,
280
+ );
281
+ this.realtimeSession.on('metrics_collected', this.onMetricsCollected);
282
+ this.realtimeSession.on('error', this.onModelError);
253
283
 
254
- if (!this.llm.capabilities.audioOutput && !this.tts && this.agentSession.output.audio) {
255
- this.logger.error(
256
- 'audio output is enabled but RealtimeModel has no audio modality ' +
257
- 'and no TTS is set. Either enable audio modality in the RealtimeModel ' +
258
- 'or set a TTS model.',
259
- );
260
- }
261
- } else if (this.llm instanceof LLM) {
262
- try {
263
- updateInstructions({
264
- chatCtx: this.agent._chatCtx,
265
- instructions: this.agent.instructions,
266
- addIfMissing: true,
267
- });
268
- } catch (error) {
269
- this.logger.error('failed to update the instructions', error);
270
- }
284
+ removeInstructions(this.agent._chatCtx);
285
+ try {
286
+ await this.realtimeSession.updateInstructions(this.agent.instructions);
287
+ } catch (error) {
288
+ this.logger.error(error, 'failed to update the instructions');
271
289
  }
272
290
 
273
- // metrics and error handling
274
- if (this.llm instanceof LLM) {
275
- this.llm.on('metrics_collected', (ev) => this.onMetricsCollected(ev));
276
- this.llm.on('error', (ev) => this.onError(ev));
291
+ try {
292
+ await this.realtimeSession.updateChatCtx(this.agent.chatCtx);
293
+ } catch (error) {
294
+ this.logger.error(error, 'failed to update the chat context');
277
295
  }
278
296
 
279
- if (this.stt instanceof STT) {
280
- this.stt.on('metrics_collected', (ev) => this.onMetricsCollected(ev));
281
- this.stt.on('error', (ev) => this.onError(ev));
297
+ try {
298
+ await this.realtimeSession.updateTools(this.tools);
299
+ } catch (error) {
300
+ this.logger.error(error, 'failed to update the tools');
282
301
  }
283
302
 
284
- if (this.tts instanceof TTS) {
285
- this.tts.on('metrics_collected', (ev) => this.onMetricsCollected(ev));
286
- this.tts.on('error', (ev) => this.onError(ev));
303
+ if (!this.llm.capabilities.audioOutput && !this.tts && this.agentSession.output.audio) {
304
+ this.logger.error(
305
+ 'audio output is enabled but RealtimeModel has no audio modality ' +
306
+ 'and no TTS is set. Either enable audio modality in the RealtimeModel ' +
307
+ 'or set a TTS model.',
308
+ );
287
309
  }
288
-
289
- if (this.vad instanceof VAD) {
290
- this.vad.on('metrics_collected', (ev) => this.onMetricsCollected(ev));
310
+ } else if (this.llm instanceof LLM) {
311
+ try {
312
+ updateInstructions({
313
+ chatCtx: this.agent._chatCtx,
314
+ instructions: this.agent.instructions,
315
+ addIfMissing: true,
316
+ });
317
+ } catch (error) {
318
+ this.logger.error('failed to update the instructions', error);
291
319
  }
320
+ }
292
321
 
293
- this.audioRecognition = new AudioRecognition({
294
- recognitionHooks: this,
295
- // Disable stt node if stt is not provided
296
- stt: this.stt ? (...args) => this.agent.sttNode(...args) : undefined,
297
- vad: this.vad,
298
- turnDetector: typeof this.turnDetection === 'string' ? undefined : this.turnDetection,
299
- turnDetectionMode: this.turnDetectionMode,
300
- minEndpointingDelay: this.agentSession.options.minEndpointingDelay,
301
- maxEndpointingDelay: this.agentSession.options.maxEndpointingDelay,
302
- rootSpanContext: this.agentSession.rootSpanContext,
303
- sttModel: this.stt?.label,
304
- sttProvider: this.getSttProvider(),
305
- getLinkedParticipant: () => this.agentSession._roomIO?.linkedParticipant,
306
- });
307
- this.audioRecognition.start();
308
- this.started = true;
322
+ // TODO(parity): Record initial AgentConfigUpdate in chat context
309
323
 
310
- this._mainTask = Task.from(({ signal }) => this.mainTask(signal));
324
+ // metrics and error handling
325
+ if (this.llm instanceof LLM) {
326
+ this.llm.on('metrics_collected', this.onMetricsCollected);
327
+ this.llm.on('error', this.onModelError);
328
+ }
311
329
 
312
- // Create on_enter as a child of start_agent_activity in the new trace
313
- const onEnterTask = tracer.startActiveSpan(async () => this.agent.onEnter(), {
314
- name: 'on_enter',
315
- context: trace.setSpan(ROOT_CONTEXT, startSpan),
316
- attributes: { [traceTypes.ATTR_AGENT_LABEL]: this.agent.id },
317
- });
330
+ if (this.stt instanceof STT) {
331
+ this.stt.on('metrics_collected', this.onMetricsCollected);
332
+ this.stt.on('error', this.onModelError);
333
+ }
318
334
 
319
- this.createSpeechTask({
320
- task: Task.from(() => onEnterTask),
335
+ if (this.tts instanceof TTS) {
336
+ this.tts.on('metrics_collected', this.onMetricsCollected);
337
+ this.tts.on('error', this.onModelError);
338
+ }
339
+
340
+ if (this.vad instanceof VAD) {
341
+ this.vad.on('metrics_collected', this.onMetricsCollected);
342
+ }
343
+
344
+ this.audioRecognition = new AudioRecognition({
345
+ recognitionHooks: this,
346
+ // Disable stt node if stt is not provided
347
+ stt: this.stt ? (...args) => this.agent.sttNode(...args) : undefined,
348
+ vad: this.vad,
349
+ turnDetector: typeof this.turnDetection === 'string' ? undefined : this.turnDetection,
350
+ turnDetectionMode: this.turnDetectionMode,
351
+ minEndpointingDelay: this.agentSession.options.minEndpointingDelay,
352
+ maxEndpointingDelay: this.agentSession.options.maxEndpointingDelay,
353
+ rootSpanContext: this.agentSession.rootSpanContext,
354
+ sttModel: this.stt?.label,
355
+ sttProvider: this.getSttProvider(),
356
+ getLinkedParticipant: () => this.agentSession._roomIO?.linkedParticipant,
357
+ });
358
+ this.audioRecognition.start();
359
+ this.started = true;
360
+
361
+ this._resumeSchedulingTask();
362
+
363
+ if (runOnEnter) {
364
+ this._onEnterTask = this.createSpeechTask({
365
+ taskFn: () =>
366
+ onEnterStorage.run({ session: this.agentSession, agent: this.agent }, () =>
367
+ tracer.startActiveSpan(async () => this.agent.onEnter(), {
368
+ name: 'on_enter',
369
+ context: trace.setSpan(ROOT_CONTEXT, startSpan),
370
+ attributes: { [traceTypes.ATTR_AGENT_LABEL]: this.agent.id },
371
+ }),
372
+ ),
373
+ inlineTask: true,
321
374
  name: 'AgentActivity_onEnter',
322
375
  });
323
-
324
- startSpan.end();
325
- } finally {
326
- unlock();
327
376
  }
377
+
378
+ startSpan.end();
328
379
  }
329
380
 
330
381
  get currentSpeech(): SpeechHandle | undefined {
@@ -362,8 +413,8 @@ export class AgentActivity implements RecognitionHooks {
362
413
  return this.agent.toolCtx;
363
414
  }
364
415
 
365
- get draining(): boolean {
366
- return this._draining;
416
+ get schedulingPaused(): boolean {
417
+ return this._schedulingPaused;
367
418
  }
368
419
 
369
420
  get realtimeLLMSession(): RealtimeSession | undefined {
@@ -406,6 +457,20 @@ export class AgentActivity implements RecognitionHooks {
406
457
  }
407
458
  }
408
459
 
460
+ // TODO: Record an AgentConfigUpdate for the tool change once AgentConfigUpdate is ported to ChatContext.
461
+ async updateTools(tools: ToolContext): Promise<void> {
462
+ this.agent._tools = { ...tools };
463
+
464
+ if (this.realtimeSession) {
465
+ await this.realtimeSession.updateTools(tools);
466
+ }
467
+
468
+ if (this.llm instanceof LLM) {
469
+ // for realtime LLM, we assume the server will remove unvalid tool messages
470
+ await this.updateChatCtx(this.agent._chatCtx.copy({ toolCtx: tools }));
471
+ }
472
+ }
473
+
409
474
  updateOptions({ toolChoice }: { toolChoice?: ToolChoice | null }): void {
410
475
  if (toolChoice !== undefined) {
411
476
  this.toolChoice = toolChoice;
@@ -417,18 +482,10 @@ export class AgentActivity implements RecognitionHooks {
417
482
  }
418
483
 
419
484
  attachAudioInput(audioStream: ReadableStream<AudioFrame>): void {
420
- if (this.audioStream.isSourceSet) {
421
- this.logger.debug('detaching existing audio input in agent activity');
422
- this.audioStream.detachSource();
423
- }
485
+ void this.audioStream.close();
486
+ this.audioStream = new MultiInputStream<AudioFrame>();
424
487
 
425
- /**
426
- * We need to add a deferred ReadableStream layer on top of the audioStream from the agent session.
427
- * The tee() operation should be applied to the deferred stream, not the original audioStream.
428
- * This is important because teeing the original stream directly makes it very difficult—if not
429
- * impossible—to implement stream unlock logic cleanly.
430
- */
431
- this.audioStream.setSource(audioStream);
488
+ this.audioStreamId = this.audioStream.addInputStream(audioStream);
432
489
  const [realtimeAudioStream, recognitionAudioStream] = this.audioStream.stream.tee();
433
490
 
434
491
  if (this.realtimeSession) {
@@ -441,16 +498,29 @@ export class AgentActivity implements RecognitionHooks {
441
498
  }
442
499
 
443
500
  detachAudioInput(): void {
444
- this.audioStream.detachSource();
501
+ if (this.audioStreamId === undefined) {
502
+ return;
503
+ }
504
+
505
+ void this.audioStream.close();
506
+ this.audioStream = new MultiInputStream<AudioFrame>();
507
+ this.audioStreamId = undefined;
445
508
  }
446
509
 
447
- commitUserTurn() {
510
+ commitUserTurn(
511
+ options: {
512
+ audioDetached?: boolean;
513
+ throwIfNotReady?: boolean;
514
+ } = {},
515
+ ) {
516
+ const { audioDetached = false, throwIfNotReady = true } = options;
448
517
  if (!this.audioRecognition) {
449
- throw new Error('AudioRecognition is not initialized');
518
+ if (throwIfNotReady) {
519
+ throw new Error('AudioRecognition is not initialized');
520
+ }
521
+ return;
450
522
  }
451
523
 
452
- // TODO(brian): add audio_detached flag
453
- const audioDetached = false;
454
524
  this.audioRecognition.commitUserTurn(audioDetached);
455
525
  }
456
526
 
@@ -508,14 +578,13 @@ export class AgentActivity implements RecognitionHooks {
508
578
  }),
509
579
  );
510
580
  const task = this.createSpeechTask({
511
- task: Task.from((abortController: AbortController) =>
581
+ taskFn: (abortController: AbortController) =>
512
582
  this.ttsTask(handle, text, addToChatCtx, {}, abortController, audio),
513
- ),
514
583
  ownedSpeechHandle: handle,
515
584
  name: 'AgentActivity.say_tts',
516
585
  });
517
586
 
518
- task.finally(() => this.onPipelineReplyDone());
587
+ task.result.finally(() => this.onPipelineReplyDone());
519
588
  this.scheduleSpeech(handle, SpeechHandle.SPEECH_PRIORITY_NORMAL);
520
589
  return handle;
521
590
  }
@@ -628,9 +697,9 @@ export class AgentActivity implements RecognitionHooks {
628
697
  return;
629
698
  }
630
699
 
631
- if (this.draining) {
700
+ if (this.schedulingPaused) {
632
701
  // TODO(shubhra): should we "forward" this new turn to the next agent?
633
- this.logger.warn('skipping new realtime generation, the agent is draining');
702
+ this.logger.warn('skipping new realtime generation, the speech scheduling is not running');
634
703
  return;
635
704
  }
636
705
 
@@ -648,9 +717,8 @@ export class AgentActivity implements RecognitionHooks {
648
717
  this.logger.info({ speech_id: handle.id }, 'Creating speech handle');
649
718
 
650
719
  this.createSpeechTask({
651
- task: Task.from((abortController: AbortController) =>
720
+ taskFn: (abortController: AbortController) =>
652
721
  this.realtimeGenerationTask(handle, ev, {}, abortController),
653
- ),
654
722
  ownedSpeechHandle: handle,
655
723
  name: 'AgentActivity.realtimeGeneration',
656
724
  });
@@ -782,7 +850,7 @@ export class AgentActivity implements RecognitionHooks {
782
850
  onPreemptiveGeneration(info: PreemptiveGenerationInfo): void {
783
851
  if (
784
852
  !this.agentSession.options.preemptiveGeneration ||
785
- this.draining ||
853
+ this.schedulingPaused ||
786
854
  (this._currentSpeech !== undefined && !this._currentSpeech.interrupted) ||
787
855
  !(this.llm instanceof LLM)
788
856
  ) {
@@ -829,11 +897,32 @@ export class AgentActivity implements RecognitionHooks {
829
897
  }
830
898
 
831
899
  private createSpeechTask(options: {
832
- task: Task<void>;
900
+ taskFn: (controller: AbortController) => Promise<void>;
901
+ controller?: AbortController;
833
902
  ownedSpeechHandle?: SpeechHandle;
903
+ inlineTask?: boolean;
834
904
  name?: string;
835
- }): Promise<void> {
836
- const { task, ownedSpeechHandle } = options;
905
+ }): Task<void> {
906
+ const { taskFn, controller, ownedSpeechHandle, inlineTask, name } = options;
907
+
908
+ const wrappedFn = (ctrl: AbortController) => {
909
+ return agentActivityStorage.run(this, () => {
910
+ // Mark inline/speech metadata at task runtime to avoid a race where taskFn executes
911
+ // before post-construction metadata is attached to the Task instance.
912
+ const currentTask = Task.current();
913
+ if (currentTask) {
914
+ _setActivityTaskInfo(currentTask, { speechHandle: ownedSpeechHandle, inlineTask });
915
+ }
916
+
917
+ if (ownedSpeechHandle) {
918
+ return speechHandleStorage.run(ownedSpeechHandle, () => taskFn(ctrl));
919
+ }
920
+ return taskFn(ctrl);
921
+ });
922
+ };
923
+
924
+ const task = Task.from(wrappedFn, controller, name);
925
+ _setActivityTaskInfo(task, { speechHandle: ownedSpeechHandle, inlineTask });
837
926
 
838
927
  this.speechTasks.add(task);
839
928
  task.addDoneCallback(() => {
@@ -853,13 +942,16 @@ export class AgentActivity implements RecognitionHooks {
853
942
  this.wakeupMainTask();
854
943
  });
855
944
 
856
- return task.result;
945
+ return task;
857
946
  }
858
947
 
859
948
  async onEndOfTurn(info: EndOfTurnInfo): Promise<boolean> {
860
- if (this.draining) {
949
+ if (this.schedulingPaused) {
861
950
  this.cancelPreemptiveGeneration();
862
- this.logger.warn({ user_input: info.newTranscript }, 'skipping user input, task is draining');
951
+ this.logger.warn(
952
+ { user_input: info.newTranscript },
953
+ 'skipping user input, speech scheduling is paused',
954
+ );
863
955
  // TODO(shubhra): should we "forward" this new turn to the next agent/activity?
864
956
  return true;
865
957
  }
@@ -892,7 +984,7 @@ export class AgentActivity implements RecognitionHooks {
892
984
 
893
985
  const oldTask = this._userTurnCompletedTask;
894
986
  this._userTurnCompletedTask = this.createSpeechTask({
895
- task: Task.from(() => this.userTurnCompleted(info, oldTask)),
987
+ taskFn: () => this.userTurnCompleted(info, oldTask),
896
988
  name: 'AgentActivity.userTurnCompleted',
897
989
  });
898
990
  return true;
@@ -928,10 +1020,12 @@ export class AgentActivity implements RecognitionHooks {
928
1020
  this._currentSpeech = undefined;
929
1021
  }
930
1022
 
931
- // If we're draining and there are no more speech tasks, we can exit.
932
- // Only speech tasks can bypass draining to create a tool response
933
- if (this.draining && this.speechTasks.size === 0) {
934
- this.logger.info('mainTask: draining and no more speech tasks');
1023
+ // if we're draining/pausing and there are no more speech tasks, we can exit.
1024
+ // only speech tasks can bypass draining to create a tool response (see scheduleSpeech)
1025
+ const toWait = this.getDrainPendingSpeechTasks();
1026
+
1027
+ if (this._schedulingPaused && toWait.length === 0) {
1028
+ this.logger.info('mainTask: scheduling paused and no more speech tasks to wait');
935
1029
  break;
936
1030
  }
937
1031
 
@@ -941,6 +1035,39 @@ export class AgentActivity implements RecognitionHooks {
941
1035
  this.logger.info('AgentActivity mainTask: exiting');
942
1036
  }
943
1037
 
1038
+ private getDrainPendingSpeechTasks(): Task<void>[] {
1039
+ const blockedHandles: SpeechHandle[] = [];
1040
+
1041
+ for (const task of this._drainBlockedTasks) {
1042
+ const info = _getActivityTaskInfo(task);
1043
+ if (!info) {
1044
+ this.logger.error('blocked task without activity info; skipping.');
1045
+ continue;
1046
+ }
1047
+
1048
+ if (!info.speechHandle) {
1049
+ continue; // onEnter/onExit
1050
+ }
1051
+
1052
+ blockedHandles.push(info.speechHandle);
1053
+ }
1054
+
1055
+ const toWait: Task<void>[] = [];
1056
+ for (const task of this.speechTasks) {
1057
+ if (this._drainBlockedTasks.includes(task)) {
1058
+ continue;
1059
+ }
1060
+
1061
+ const info = _getActivityTaskInfo(task);
1062
+ if (info && info.speechHandle && blockedHandles.includes(info.speechHandle)) {
1063
+ continue;
1064
+ }
1065
+
1066
+ toWait.push(task);
1067
+ }
1068
+ return toWait;
1069
+ }
1070
+
944
1071
  private wakeupMainTask(): void {
945
1072
  this.q_updated.resolve();
946
1073
  }
@@ -982,7 +1109,7 @@ export class AgentActivity implements RecognitionHooks {
982
1109
  throw new Error('trying to generate reply without an LLM model');
983
1110
  }
984
1111
 
985
- const functionCall = asyncLocalStorage.getStore()?.functionCall;
1112
+ const functionCall = functionCallStorage.getStore()?.functionCall;
986
1113
  if (toolChoice === undefined && functionCall !== undefined) {
987
1114
  // when generateReply is called inside a tool, set toolChoice to 'none' by default
988
1115
  toolChoice = 'none';
@@ -1004,7 +1131,7 @@ export class AgentActivity implements RecognitionHooks {
1004
1131
 
1005
1132
  if (this.llm instanceof RealtimeModel) {
1006
1133
  this.createSpeechTask({
1007
- task: Task.from((abortController: AbortController) =>
1134
+ taskFn: (abortController: AbortController) =>
1008
1135
  this.realtimeReplyTask({
1009
1136
  speechHandle: handle,
1010
1137
  // TODO(brian): support llm.ChatMessage for the realtime model
@@ -1016,7 +1143,6 @@ export class AgentActivity implements RecognitionHooks {
1016
1143
  },
1017
1144
  abortController,
1018
1145
  }),
1019
- ),
1020
1146
  ownedSpeechHandle: handle,
1021
1147
  name: 'AgentActivity.realtimeReply',
1022
1148
  });
@@ -1028,12 +1154,25 @@ export class AgentActivity implements RecognitionHooks {
1028
1154
  instructions = `${this.agent.instructions}\n${instructions}`;
1029
1155
  }
1030
1156
 
1157
+ // Filter out tools with IGNORE_ON_ENTER flag when generateReply is called inside onEnter
1158
+ const onEnterData = onEnterStorage.getStore();
1159
+ const shouldFilterTools =
1160
+ onEnterData?.agent === this.agent && onEnterData?.session === this.agentSession;
1161
+
1162
+ const tools = shouldFilterTools
1163
+ ? Object.fromEntries(
1164
+ Object.entries(this.agent.toolCtx).filter(
1165
+ ([, fnTool]) => !(fnTool.flags & ToolFlag.IGNORE_ON_ENTER),
1166
+ ),
1167
+ )
1168
+ : this.agent.toolCtx;
1169
+
1031
1170
  const task = this.createSpeechTask({
1032
- task: Task.from((abortController: AbortController) =>
1171
+ taskFn: (abortController: AbortController) =>
1033
1172
  this.pipelineReplyTask(
1034
1173
  handle,
1035
1174
  chatCtx ?? this.agent.chatCtx,
1036
- this.agent.toolCtx,
1175
+ tools,
1037
1176
  {
1038
1177
  toolChoice: toOaiToolChoice(toolChoice !== undefined ? toolChoice : this.toolChoice),
1039
1178
  },
@@ -1041,12 +1180,11 @@ export class AgentActivity implements RecognitionHooks {
1041
1180
  instructions,
1042
1181
  userMessage,
1043
1182
  ),
1044
- ),
1045
1183
  ownedSpeechHandle: handle,
1046
1184
  name: 'AgentActivity.pipelineReply',
1047
1185
  });
1048
1186
 
1049
- task.finally(() => this.onPipelineReplyDone());
1187
+ task.result.finally(() => this.onPipelineReplyDone());
1050
1188
  }
1051
1189
 
1052
1190
  if (scheduleSpeech) {
@@ -1055,16 +1193,19 @@ export class AgentActivity implements RecognitionHooks {
1055
1193
  return handle;
1056
1194
  }
1057
1195
 
1058
- interrupt(): Future<void> {
1196
+ interrupt(options: { force?: boolean } = {}): Future<void> {
1197
+ const { force = false } = options;
1198
+ this.cancelPreemptiveGeneration();
1199
+
1059
1200
  const future = new Future<void>();
1060
1201
  const currentSpeech = this._currentSpeech;
1061
1202
 
1062
1203
  //TODO(AJS-273): add interrupt for background speeches
1063
1204
 
1064
- currentSpeech?.interrupt();
1205
+ currentSpeech?.interrupt(force);
1065
1206
 
1066
1207
  for (const [_, __, speech] of this.speechQueue) {
1067
- speech.interrupt();
1208
+ speech.interrupt(force);
1068
1209
  }
1069
1210
 
1070
1211
  this.realtimeSession?.interrupt();
@@ -1087,13 +1228,13 @@ export class AgentActivity implements RecognitionHooks {
1087
1228
  }
1088
1229
  }
1089
1230
 
1090
- private async userTurnCompleted(info: EndOfTurnInfo, oldTask?: Promise<void>): Promise<void> {
1231
+ private async userTurnCompleted(info: EndOfTurnInfo, oldTask?: Task<void>): Promise<void> {
1091
1232
  if (oldTask) {
1092
1233
  // We never cancel user code as this is very confusing.
1093
1234
  // So we wait for the old execution of onUserTurnCompleted to finish.
1094
1235
  // In practice this is OK because most speeches will be interrupted if a new turn
1095
1236
  // is detected. So the previous execution should complete quickly.
1096
- await oldTask;
1237
+ await oldTask.result;
1097
1238
  }
1098
1239
 
1099
1240
  // When the audio recognition detects the end of a user turn:
@@ -1551,13 +1692,15 @@ export class AgentActivity implements RecognitionHooks {
1551
1692
  for (const msg of toolsMessages) {
1552
1693
  msg.createdAt = replyStartedAt;
1553
1694
  }
1554
- this.agent._chatCtx.insert(toolsMessages);
1555
- // Only add FunctionCallOutput items to session history since FunctionCall items
1556
- // were already added by onToolExecutionStarted when the tool execution began
1695
+ // Only insert FunctionCallOutput items into agent._chatCtx since FunctionCall items
1696
+ // were already added by onToolExecutionStarted when the tool execution began.
1697
+ // Inserting function_calls again would create duplicates that break provider APIs
1698
+ // (e.g. Google's "function response parts != function call parts" error).
1557
1699
  const toolCallOutputs = toolsMessages.filter(
1558
1700
  (m): m is FunctionCallOutput => m.type === 'function_call_output',
1559
1701
  );
1560
1702
  if (toolCallOutputs.length > 0) {
1703
+ this.agent._chatCtx.insert(toolCallOutputs);
1561
1704
  this.agentSession._toolItemsAdded(toolCallOutputs);
1562
1705
  }
1563
1706
  }
@@ -1665,52 +1808,18 @@ export class AgentActivity implements RecognitionHooks {
1665
1808
  return;
1666
1809
  }
1667
1810
 
1668
- const functionToolsExecutedEvent = createFunctionToolsExecutedEvent({
1669
- functionCalls: [],
1670
- functionCallOutputs: [],
1671
- });
1672
- let shouldGenerateToolReply: boolean = false;
1673
- let newAgentTask: Agent | null = null;
1674
- let ignoreTaskSwitch: boolean = false;
1675
-
1676
- for (const sanitizedOut of toolOutput.output) {
1677
- if (sanitizedOut.toolCallOutput !== undefined) {
1678
- functionToolsExecutedEvent.functionCalls.push(sanitizedOut.toolCall);
1679
- functionToolsExecutedEvent.functionCallOutputs.push(sanitizedOut.toolCallOutput);
1680
- if (sanitizedOut.replyRequired) {
1681
- shouldGenerateToolReply = true;
1682
- }
1683
- }
1684
-
1685
- if (newAgentTask !== null && sanitizedOut.agentTask !== undefined) {
1686
- this.logger.error('expected to receive only one agent task from the tool executions');
1687
- ignoreTaskSwitch = true;
1688
- // TODO(brian): should we mark the function call as failed to notify the LLM?
1689
- }
1690
-
1691
- newAgentTask = sanitizedOut.agentTask ?? null;
1692
-
1693
- this.logger.debug(
1694
- {
1695
- speechId: speechHandle.id,
1696
- name: sanitizedOut.toolCall?.name,
1697
- args: sanitizedOut.toolCall.args,
1698
- output: sanitizedOut.toolCallOutput?.output,
1699
- isError: sanitizedOut.toolCallOutput?.isError,
1700
- },
1701
- 'Tool call execution finished',
1702
- );
1703
- }
1811
+ const { functionToolsExecutedEvent, shouldGenerateToolReply, newAgentTask, ignoreTaskSwitch } =
1812
+ this.summarizeToolExecutionOutput(toolOutput, speechHandle);
1704
1813
 
1705
1814
  this.agentSession.emit(
1706
1815
  AgentSessionEventTypes.FunctionToolsExecuted,
1707
1816
  functionToolsExecutedEvent,
1708
1817
  );
1709
1818
 
1710
- let draining = this.draining;
1819
+ let schedulingPaused = this.schedulingPaused;
1711
1820
  if (!ignoreTaskSwitch && newAgentTask !== null) {
1712
1821
  this.agentSession.updateAgent(newAgentTask);
1713
- draining = true;
1822
+ schedulingPaused = true;
1714
1823
  }
1715
1824
 
1716
1825
  const toolMessages = [
@@ -1725,11 +1834,12 @@ export class AgentActivity implements RecognitionHooks {
1725
1834
 
1726
1835
  // Avoid setting tool_choice to "required" or a specific function when
1727
1836
  // passing tool response back to the LLM
1728
- const respondToolChoice = draining || modelSettings.toolChoice === 'none' ? 'none' : 'auto';
1837
+ const respondToolChoice =
1838
+ schedulingPaused || modelSettings.toolChoice === 'none' ? 'none' : 'auto';
1729
1839
 
1730
1840
  // Reuse same speechHandle for tool response (parity with Python agent_activity.py L2122-2140)
1731
1841
  const toolResponseTask = this.createSpeechTask({
1732
- task: Task.from(() =>
1842
+ taskFn: () =>
1733
1843
  this.pipelineReplyTask(
1734
1844
  speechHandle,
1735
1845
  chatCtx,
@@ -1740,12 +1850,11 @@ export class AgentActivity implements RecognitionHooks {
1740
1850
  undefined,
1741
1851
  toolMessages,
1742
1852
  ),
1743
- ),
1744
1853
  ownedSpeechHandle: speechHandle,
1745
1854
  name: 'AgentActivity.pipelineReply',
1746
1855
  });
1747
1856
 
1748
- toolResponseTask.finally(() => this.onPipelineReplyDone());
1857
+ toolResponseTask.result.finally(() => this.onPipelineReplyDone());
1749
1858
 
1750
1859
  this.scheduleSpeech(speechHandle, SpeechHandle.SPEECH_PRIORITY_NORMAL, true);
1751
1860
  } else if (functionToolsExecutedEvent.functionCallOutputs.length > 0) {
@@ -1753,15 +1862,12 @@ export class AgentActivity implements RecognitionHooks {
1753
1862
  msg.createdAt = replyStartedAt;
1754
1863
  }
1755
1864
 
1756
- this.agent._chatCtx.insert(toolMessages);
1757
-
1758
- // Only add FunctionCallOutput items to session history since FunctionCall items
1759
- // were already added by onToolExecutionStarted when the tool execution began
1760
1865
  const toolCallOutputs = toolMessages.filter(
1761
1866
  (m): m is FunctionCallOutput => m.type === 'function_call_output',
1762
1867
  );
1763
1868
 
1764
1869
  if (toolCallOutputs.length > 0) {
1870
+ this.agent._chatCtx.insert(toolCallOutputs);
1765
1871
  this.agentSession._toolItemsAdded(toolCallOutputs);
1766
1872
  }
1767
1873
  }
@@ -2164,50 +2270,18 @@ export class AgentActivity implements RecognitionHooks {
2164
2270
  return;
2165
2271
  }
2166
2272
 
2167
- const functionToolsExecutedEvent = createFunctionToolsExecutedEvent({
2168
- functionCalls: [],
2169
- functionCallOutputs: [],
2170
- });
2171
- let shouldGenerateToolReply: boolean = false;
2172
- let newAgentTask: Agent | null = null;
2173
- let ignoreTaskSwitch: boolean = false;
2174
-
2175
- for (const sanitizedOut of toolOutput.output) {
2176
- if (sanitizedOut.toolCallOutput !== undefined) {
2177
- functionToolsExecutedEvent.functionCallOutputs.push(sanitizedOut.toolCallOutput);
2178
- if (sanitizedOut.replyRequired) {
2179
- shouldGenerateToolReply = true;
2180
- }
2181
- }
2182
-
2183
- if (newAgentTask !== null && sanitizedOut.agentTask !== undefined) {
2184
- this.logger.error('expected to receive only one agent task from the tool executions');
2185
- ignoreTaskSwitch = true;
2186
- }
2187
-
2188
- newAgentTask = sanitizedOut.agentTask ?? null;
2189
-
2190
- this.logger.debug(
2191
- {
2192
- speechId: speechHandle.id,
2193
- name: sanitizedOut.toolCall?.name,
2194
- args: sanitizedOut.toolCall.args,
2195
- output: sanitizedOut.toolCallOutput?.output,
2196
- isError: sanitizedOut.toolCallOutput?.isError,
2197
- },
2198
- 'Tool call execution finished',
2199
- );
2200
- }
2273
+ const { functionToolsExecutedEvent, shouldGenerateToolReply, newAgentTask, ignoreTaskSwitch } =
2274
+ this.summarizeToolExecutionOutput(toolOutput, speechHandle);
2201
2275
 
2202
2276
  this.agentSession.emit(
2203
2277
  AgentSessionEventTypes.FunctionToolsExecuted,
2204
2278
  functionToolsExecutedEvent,
2205
2279
  );
2206
2280
 
2207
- let draining = this.draining;
2281
+ let schedulingPaused = this.schedulingPaused;
2208
2282
  if (!ignoreTaskSwitch && newAgentTask !== null) {
2209
2283
  this.agentSession.updateAgent(newAgentTask);
2210
- draining = true;
2284
+ schedulingPaused = true;
2211
2285
  }
2212
2286
 
2213
2287
  if (functionToolsExecutedEvent.functionCallOutputs.length > 0) {
@@ -2263,15 +2337,14 @@ export class AgentActivity implements RecognitionHooks {
2263
2337
  }),
2264
2338
  );
2265
2339
 
2266
- const toolChoice = draining || modelSettings.toolChoice === 'none' ? 'none' : 'auto';
2340
+ const toolChoice = schedulingPaused || modelSettings.toolChoice === 'none' ? 'none' : 'auto';
2267
2341
  this.createSpeechTask({
2268
- task: Task.from((abortController: AbortController) =>
2342
+ taskFn: (abortController: AbortController) =>
2269
2343
  this.realtimeReplyTask({
2270
2344
  speechHandle: replySpeechHandle,
2271
2345
  modelSettings: { toolChoice },
2272
2346
  abortController,
2273
2347
  }),
2274
- ),
2275
2348
  ownedSpeechHandle: replySpeechHandle,
2276
2349
  name: 'AgentActivity.realtime_reply',
2277
2350
  });
@@ -2279,6 +2352,53 @@ export class AgentActivity implements RecognitionHooks {
2279
2352
  this.scheduleSpeech(replySpeechHandle, SpeechHandle.SPEECH_PRIORITY_NORMAL, true);
2280
2353
  }
2281
2354
 
2355
+ private summarizeToolExecutionOutput(toolOutput: ToolOutput, speechHandle: SpeechHandle) {
2356
+ const functionToolsExecutedEvent = createFunctionToolsExecutedEvent({
2357
+ functionCalls: [],
2358
+ functionCallOutputs: [],
2359
+ });
2360
+
2361
+ let shouldGenerateToolReply = false;
2362
+ let newAgentTask: Agent | null = null;
2363
+ let ignoreTaskSwitch = false;
2364
+
2365
+ for (const sanitizedOut of toolOutput.output) {
2366
+ if (sanitizedOut.toolCallOutput !== undefined) {
2367
+ // Keep event payload symmetric for pipeline + realtime paths.
2368
+ functionToolsExecutedEvent.functionCalls.push(sanitizedOut.toolCall);
2369
+ functionToolsExecutedEvent.functionCallOutputs.push(sanitizedOut.toolCallOutput);
2370
+ if (sanitizedOut.replyRequired) {
2371
+ shouldGenerateToolReply = true;
2372
+ }
2373
+ }
2374
+
2375
+ if (newAgentTask !== null && sanitizedOut.agentTask !== undefined) {
2376
+ this.logger.error('expected to receive only one agent task from the tool executions');
2377
+ ignoreTaskSwitch = true;
2378
+ }
2379
+
2380
+ newAgentTask = sanitizedOut.agentTask ?? null;
2381
+
2382
+ this.logger.debug(
2383
+ {
2384
+ speechId: speechHandle.id,
2385
+ name: sanitizedOut.toolCall?.name,
2386
+ args: sanitizedOut.toolCall.args,
2387
+ output: sanitizedOut.toolCallOutput?.output,
2388
+ isError: sanitizedOut.toolCallOutput?.isError,
2389
+ },
2390
+ 'Tool call execution finished',
2391
+ );
2392
+ }
2393
+
2394
+ return {
2395
+ functionToolsExecutedEvent,
2396
+ shouldGenerateToolReply,
2397
+ newAgentTask,
2398
+ ignoreTaskSwitch,
2399
+ };
2400
+ }
2401
+
2282
2402
  private async realtimeReplyTask({
2283
2403
  speechHandle,
2284
2404
  modelSettings: { toolChoice },
@@ -2337,10 +2457,10 @@ export class AgentActivity implements RecognitionHooks {
2337
2457
  priority: number,
2338
2458
  force: boolean = false,
2339
2459
  ): void {
2340
- // when force=true, we allow tool responses to bypass draining
2460
+ // when force=true, we allow tool responses to bypass scheduling pause
2341
2461
  // This allows for tool responses to be generated before the AgentActivity is finalized
2342
- if (this.draining && !force) {
2343
- throw new Error('cannot schedule new speech, the agent is draining');
2462
+ if (this.schedulingPaused && !force) {
2463
+ throw new Error('cannot schedule new speech, the speech scheduling is draining/pausing');
2344
2464
  }
2345
2465
 
2346
2466
  // Monotonic time to avoid near 0 collisions
@@ -2349,6 +2469,48 @@ export class AgentActivity implements RecognitionHooks {
2349
2469
  this.wakeupMainTask();
2350
2470
  }
2351
2471
 
2472
+ private async _pauseSchedulingTask(blockedTasks: Task<any>[]): Promise<void> {
2473
+ if (this._schedulingPaused) return;
2474
+
2475
+ this._schedulingPaused = true;
2476
+ this._drainBlockedTasks = blockedTasks;
2477
+ this.wakeupMainTask();
2478
+
2479
+ if (this._mainTask) {
2480
+ // When pausing/draining, we ensure that all speech_tasks complete fully.
2481
+ // This means that even if the SpeechHandle themselves have finished,
2482
+ // we still wait for the entire execution (e.g function_tools)
2483
+ await this._mainTask.result;
2484
+ }
2485
+ }
2486
+
2487
+ private _resumeSchedulingTask(): void {
2488
+ if (!this._schedulingPaused) return;
2489
+
2490
+ this._schedulingPaused = false;
2491
+ this._mainTask = Task.from(({ signal }) => this.mainTask(signal));
2492
+ }
2493
+
2494
+ async pause(options: { blockedTasks?: Task<any>[] } = {}): Promise<void> {
2495
+ const { blockedTasks = [] } = options;
2496
+ const unlock = await this.lock.lock();
2497
+
2498
+ try {
2499
+ const span = tracer.startSpan({
2500
+ name: 'pause_agent_activity',
2501
+ attributes: { [traceTypes.ATTR_AGENT_LABEL]: this.agent.id },
2502
+ });
2503
+ try {
2504
+ await this._pauseSchedulingTask(blockedTasks);
2505
+ await this._closeSessionResources();
2506
+ } finally {
2507
+ span.end();
2508
+ }
2509
+ } finally {
2510
+ unlock();
2511
+ }
2512
+ }
2513
+
2352
2514
  async drain(): Promise<void> {
2353
2515
  // Create drain_agent_activity as a ROOT span (new trace) to match Python behavior
2354
2516
  return tracer.startActiveSpan(async (span) => this._drainImpl(span), {
@@ -2362,23 +2524,22 @@ export class AgentActivity implements RecognitionHooks {
2362
2524
 
2363
2525
  const unlock = await this.lock.lock();
2364
2526
  try {
2365
- if (this._draining) return;
2527
+ if (this._schedulingPaused) return;
2366
2528
 
2367
- this.cancelPreemptiveGeneration();
2368
-
2369
- const onExitTask = tracer.startActiveSpan(async () => this.agent.onExit(), {
2370
- name: 'on_exit',
2371
- attributes: { [traceTypes.ATTR_AGENT_LABEL]: this.agent.id },
2372
- });
2373
-
2374
- this.createSpeechTask({
2375
- task: Task.from(() => onExitTask),
2529
+ this._onExitTask = this.createSpeechTask({
2530
+ taskFn: () =>
2531
+ tracer.startActiveSpan(async () => this.agent.onExit(), {
2532
+ name: 'on_exit',
2533
+ attributes: { [traceTypes.ATTR_AGENT_LABEL]: this.agent.id },
2534
+ }),
2535
+ inlineTask: true,
2376
2536
  name: 'AgentActivity_onExit',
2377
2537
  });
2378
2538
 
2379
- this.wakeupMainTask();
2380
- this._draining = true;
2381
- await this._mainTask?.result;
2539
+ this.cancelPreemptiveGeneration();
2540
+
2541
+ await this._onExitTask.result;
2542
+ await this._pauseSchedulingTask([]);
2382
2543
  } finally {
2383
2544
  unlock();
2384
2545
  }
@@ -2387,44 +2548,59 @@ export class AgentActivity implements RecognitionHooks {
2387
2548
  async close(): Promise<void> {
2388
2549
  const unlock = await this.lock.lock();
2389
2550
  try {
2390
- if (!this._draining) {
2391
- this.logger.warn('task closing without draining');
2392
- }
2393
-
2394
2551
  this.cancelPreemptiveGeneration();
2395
- // Unregister event handlers to prevent duplicate metrics
2396
- if (this.llm instanceof LLM) {
2397
- this.llm.off('metrics_collected', this.onMetricsCollected);
2398
- }
2399
- if (this.realtimeSession) {
2400
- this.realtimeSession.off('generation_created', this.onGenerationCreated);
2401
- this.realtimeSession.off('input_speech_started', this.onInputSpeechStarted);
2402
- this.realtimeSession.off('input_speech_stopped', this.onInputSpeechStopped);
2403
- this.realtimeSession.off(
2404
- 'input_audio_transcription_completed',
2405
- this.onInputAudioTranscriptionCompleted,
2406
- );
2407
- this.realtimeSession.off('metrics_collected', this.onMetricsCollected);
2408
- }
2409
- if (this.stt instanceof STT) {
2410
- this.stt.off('metrics_collected', this.onMetricsCollected);
2411
- }
2412
- if (this.tts instanceof TTS) {
2413
- this.tts.off('metrics_collected', this.onMetricsCollected);
2414
- }
2415
- if (this.vad instanceof VAD) {
2416
- this.vad.off('metrics_collected', this.onMetricsCollected);
2552
+ await this._closeSessionResources();
2553
+
2554
+ if (this._mainTask) {
2555
+ await this._mainTask.cancelAndWait();
2417
2556
  }
2418
2557
 
2419
- this.detachAudioInput();
2420
- this.realtimeSpans?.clear();
2421
- await this.realtimeSession?.close();
2422
- await this.audioRecognition?.close();
2423
- await this._mainTask?.cancelAndWait();
2558
+ this.agent._agentActivity = undefined;
2424
2559
  } finally {
2425
2560
  unlock();
2426
2561
  }
2427
2562
  }
2563
+
2564
+ private async _closeSessionResources(): Promise<void> {
2565
+ // Unregister event handlers to prevent duplicate metrics
2566
+ if (this.llm instanceof LLM) {
2567
+ this.llm.off('metrics_collected', this.onMetricsCollected);
2568
+ this.llm.off('error', this.onModelError);
2569
+ }
2570
+
2571
+ if (this.realtimeSession) {
2572
+ this.realtimeSession.off('generation_created', this.onRealtimeGenerationCreated);
2573
+ this.realtimeSession.off('input_speech_started', this.onRealtimeInputSpeechStarted);
2574
+ this.realtimeSession.off('input_speech_stopped', this.onRealtimeInputSpeechStopped);
2575
+ this.realtimeSession.off(
2576
+ 'input_audio_transcription_completed',
2577
+ this.onRealtimeInputAudioTranscriptionCompleted,
2578
+ );
2579
+ this.realtimeSession.off('metrics_collected', this.onMetricsCollected);
2580
+ this.realtimeSession.off('error', this.onModelError);
2581
+ }
2582
+
2583
+ if (this.stt instanceof STT) {
2584
+ this.stt.off('metrics_collected', this.onMetricsCollected);
2585
+ this.stt.off('error', this.onModelError);
2586
+ }
2587
+
2588
+ if (this.tts instanceof TTS) {
2589
+ this.tts.off('metrics_collected', this.onMetricsCollected);
2590
+ this.tts.off('error', this.onModelError);
2591
+ }
2592
+
2593
+ if (this.vad instanceof VAD) {
2594
+ this.vad.off('metrics_collected', this.onMetricsCollected);
2595
+ }
2596
+
2597
+ this.detachAudioInput();
2598
+ this.realtimeSpans?.clear();
2599
+ await this.realtimeSession?.close();
2600
+ await this.audioRecognition?.close();
2601
+ this.realtimeSession = undefined;
2602
+ this.audioRecognition = undefined;
2603
+ }
2428
2604
  }
2429
2605
 
2430
2606
  function toOaiToolChoice(toolChoice: ToolChoice | null): ToolChoice | undefined {