@livekit/agents 1.0.46 → 1.0.47

This diff compares the contents of two package versions as published to a supported public registry, and is provided for informational purposes only.
Files changed (151)
  1. package/dist/cli.cjs +14 -20
  2. package/dist/cli.cjs.map +1 -1
  3. package/dist/cli.d.ts.map +1 -1
  4. package/dist/cli.js +14 -20
  5. package/dist/cli.js.map +1 -1
  6. package/dist/ipc/job_proc_lazy_main.cjs +14 -5
  7. package/dist/ipc/job_proc_lazy_main.cjs.map +1 -1
  8. package/dist/ipc/job_proc_lazy_main.js +14 -5
  9. package/dist/ipc/job_proc_lazy_main.js.map +1 -1
  10. package/dist/llm/chat_context.cjs +19 -0
  11. package/dist/llm/chat_context.cjs.map +1 -1
  12. package/dist/llm/chat_context.d.cts +4 -0
  13. package/dist/llm/chat_context.d.ts +4 -0
  14. package/dist/llm/chat_context.d.ts.map +1 -1
  15. package/dist/llm/chat_context.js +19 -0
  16. package/dist/llm/chat_context.js.map +1 -1
  17. package/dist/llm/provider_format/index.cjs +2 -0
  18. package/dist/llm/provider_format/index.cjs.map +1 -1
  19. package/dist/llm/provider_format/index.d.cts +1 -1
  20. package/dist/llm/provider_format/index.d.ts +1 -1
  21. package/dist/llm/provider_format/index.d.ts.map +1 -1
  22. package/dist/llm/provider_format/index.js +6 -1
  23. package/dist/llm/provider_format/index.js.map +1 -1
  24. package/dist/llm/provider_format/openai.cjs +82 -2
  25. package/dist/llm/provider_format/openai.cjs.map +1 -1
  26. package/dist/llm/provider_format/openai.d.cts +1 -0
  27. package/dist/llm/provider_format/openai.d.ts +1 -0
  28. package/dist/llm/provider_format/openai.d.ts.map +1 -1
  29. package/dist/llm/provider_format/openai.js +80 -1
  30. package/dist/llm/provider_format/openai.js.map +1 -1
  31. package/dist/llm/provider_format/openai.test.cjs +326 -0
  32. package/dist/llm/provider_format/openai.test.cjs.map +1 -1
  33. package/dist/llm/provider_format/openai.test.js +327 -1
  34. package/dist/llm/provider_format/openai.test.js.map +1 -1
  35. package/dist/llm/provider_format/utils.cjs +4 -3
  36. package/dist/llm/provider_format/utils.cjs.map +1 -1
  37. package/dist/llm/provider_format/utils.d.ts.map +1 -1
  38. package/dist/llm/provider_format/utils.js +4 -3
  39. package/dist/llm/provider_format/utils.js.map +1 -1
  40. package/dist/llm/realtime.cjs.map +1 -1
  41. package/dist/llm/realtime.d.cts +1 -0
  42. package/dist/llm/realtime.d.ts +1 -0
  43. package/dist/llm/realtime.d.ts.map +1 -1
  44. package/dist/llm/realtime.js.map +1 -1
  45. package/dist/log.cjs +5 -2
  46. package/dist/log.cjs.map +1 -1
  47. package/dist/log.d.ts.map +1 -1
  48. package/dist/log.js +5 -2
  49. package/dist/log.js.map +1 -1
  50. package/dist/stream/deferred_stream.cjs +15 -6
  51. package/dist/stream/deferred_stream.cjs.map +1 -1
  52. package/dist/stream/deferred_stream.d.ts.map +1 -1
  53. package/dist/stream/deferred_stream.js +15 -6
  54. package/dist/stream/deferred_stream.js.map +1 -1
  55. package/dist/utils.cjs +31 -2
  56. package/dist/utils.cjs.map +1 -1
  57. package/dist/utils.d.cts +7 -0
  58. package/dist/utils.d.ts +7 -0
  59. package/dist/utils.d.ts.map +1 -1
  60. package/dist/utils.js +31 -2
  61. package/dist/utils.js.map +1 -1
  62. package/dist/utils.test.cjs +71 -0
  63. package/dist/utils.test.cjs.map +1 -1
  64. package/dist/utils.test.js +71 -0
  65. package/dist/utils.test.js.map +1 -1
  66. package/dist/version.cjs +1 -1
  67. package/dist/version.cjs.map +1 -1
  68. package/dist/version.d.cts +1 -1
  69. package/dist/version.d.ts +1 -1
  70. package/dist/version.d.ts.map +1 -1
  71. package/dist/version.js +1 -1
  72. package/dist/version.js.map +1 -1
  73. package/dist/voice/agent.cjs +144 -12
  74. package/dist/voice/agent.cjs.map +1 -1
  75. package/dist/voice/agent.d.cts +29 -4
  76. package/dist/voice/agent.d.ts +29 -4
  77. package/dist/voice/agent.d.ts.map +1 -1
  78. package/dist/voice/agent.js +140 -11
  79. package/dist/voice/agent.js.map +1 -1
  80. package/dist/voice/agent.test.cjs +120 -0
  81. package/dist/voice/agent.test.cjs.map +1 -1
  82. package/dist/voice/agent.test.js +122 -2
  83. package/dist/voice/agent.test.js.map +1 -1
  84. package/dist/voice/agent_activity.cjs +383 -298
  85. package/dist/voice/agent_activity.cjs.map +1 -1
  86. package/dist/voice/agent_activity.d.cts +34 -7
  87. package/dist/voice/agent_activity.d.ts +34 -7
  88. package/dist/voice/agent_activity.d.ts.map +1 -1
  89. package/dist/voice/agent_activity.js +383 -293
  90. package/dist/voice/agent_activity.js.map +1 -1
  91. package/dist/voice/agent_session.cjs +140 -40
  92. package/dist/voice/agent_session.cjs.map +1 -1
  93. package/dist/voice/agent_session.d.cts +19 -7
  94. package/dist/voice/agent_session.d.ts +19 -7
  95. package/dist/voice/agent_session.d.ts.map +1 -1
  96. package/dist/voice/agent_session.js +137 -37
  97. package/dist/voice/agent_session.js.map +1 -1
  98. package/dist/voice/audio_recognition.cjs +4 -0
  99. package/dist/voice/audio_recognition.cjs.map +1 -1
  100. package/dist/voice/audio_recognition.d.ts.map +1 -1
  101. package/dist/voice/audio_recognition.js +4 -0
  102. package/dist/voice/audio_recognition.js.map +1 -1
  103. package/dist/voice/generation.cjs +39 -19
  104. package/dist/voice/generation.cjs.map +1 -1
  105. package/dist/voice/generation.d.ts.map +1 -1
  106. package/dist/voice/generation.js +44 -20
  107. package/dist/voice/generation.js.map +1 -1
  108. package/dist/voice/index.cjs +2 -0
  109. package/dist/voice/index.cjs.map +1 -1
  110. package/dist/voice/index.d.cts +1 -1
  111. package/dist/voice/index.d.ts +1 -1
  112. package/dist/voice/index.d.ts.map +1 -1
  113. package/dist/voice/index.js +2 -1
  114. package/dist/voice/index.js.map +1 -1
  115. package/dist/voice/speech_handle.cjs +7 -1
  116. package/dist/voice/speech_handle.cjs.map +1 -1
  117. package/dist/voice/speech_handle.d.cts +2 -0
  118. package/dist/voice/speech_handle.d.ts +2 -0
  119. package/dist/voice/speech_handle.d.ts.map +1 -1
  120. package/dist/voice/speech_handle.js +8 -2
  121. package/dist/voice/speech_handle.js.map +1 -1
  122. package/dist/voice/testing/run_result.cjs +66 -15
  123. package/dist/voice/testing/run_result.cjs.map +1 -1
  124. package/dist/voice/testing/run_result.d.cts +14 -3
  125. package/dist/voice/testing/run_result.d.ts +14 -3
  126. package/dist/voice/testing/run_result.d.ts.map +1 -1
  127. package/dist/voice/testing/run_result.js +66 -15
  128. package/dist/voice/testing/run_result.js.map +1 -1
  129. package/package.json +1 -1
  130. package/src/cli.ts +20 -33
  131. package/src/ipc/job_proc_lazy_main.ts +16 -5
  132. package/src/llm/chat_context.ts +35 -0
  133. package/src/llm/provider_format/index.ts +7 -2
  134. package/src/llm/provider_format/openai.test.ts +385 -1
  135. package/src/llm/provider_format/openai.ts +103 -0
  136. package/src/llm/provider_format/utils.ts +6 -4
  137. package/src/llm/realtime.ts +1 -0
  138. package/src/log.ts +5 -2
  139. package/src/stream/deferred_stream.ts +17 -6
  140. package/src/utils.test.ts +87 -0
  141. package/src/utils.ts +36 -2
  142. package/src/version.ts +1 -1
  143. package/src/voice/agent.test.ts +140 -2
  144. package/src/voice/agent.ts +189 -10
  145. package/src/voice/agent_activity.ts +427 -289
  146. package/src/voice/agent_session.ts +178 -40
  147. package/src/voice/audio_recognition.ts +4 -0
  148. package/src/voice/generation.ts +52 -23
  149. package/src/voice/index.ts +1 -1
  150. package/src/voice/speech_handle.ts +9 -2
  151. package/src/voice/testing/run_result.ts +81 -23
package/src/voice/agent_activity.ts

@@ -35,7 +35,7 @@ import type {
   TTSMetrics,
   VADMetrics,
 } from '../metrics/base.js';
-import { DeferredReadableStream } from '../stream/deferred_stream.js';
+import { MultiInputStream } from '../stream/multi_input_stream.js';
 import { STT, type STTError, type SpeechEvent } from '../stt/stt.js';
 import { recordRealtimeMetrics, traceTypes, tracer } from '../telemetry/index.js';
 import { splitWords } from '../tokenize/basic/word.js';
@@ -43,7 +43,13 @@ import { TTS, type TTSError } from '../tts/tts.js';
 import { Future, Task, cancelAndWait, waitFor } from '../utils.js';
 import { VAD, type VADEvent } from '../vad.js';
 import type { Agent, ModelSettings } from './agent.js';
-import { StopResponse, asyncLocalStorage } from './agent.js';
+import {
+  StopResponse,
+  _getActivityTaskInfo,
+  _setActivityTaskInfo,
+  functionCallStorage,
+  speechHandleStorage,
+} from './agent.js';
 import { type AgentSession, type TurnDetectionMode } from './agent_session.js';
 import {
   AudioRecognition,
@@ -60,7 +66,7 @@ import {
   createSpeechCreatedEvent,
   createUserInputTranscribedEvent,
 } from './events.js';
-import type { ToolExecutionOutput, _TTSGenerationData } from './generation.js';
+import type { ToolExecutionOutput, ToolOutput, _TTSGenerationData } from './generation.js';
 import {
   type _AudioOut,
   type _TextOut,
@@ -76,7 +82,7 @@ import type { TimedString } from './io.js';
 import { SpeechHandle } from './speech_handle.js';
 import { setParticipantSpanAttributes } from './utils.js';
 
-const speechHandleStorage = new AsyncLocalStorage<SpeechHandle>();
+export const agentActivityStorage = new AsyncLocalStorage<AgentActivity>();
 
 interface PreemptiveGeneration {
   speechHandle: SpeechHandle;
@@ -89,31 +95,47 @@ interface PreemptiveGeneration {
 }
 
 export class AgentActivity implements RecognitionHooks {
+  agent: Agent;
+  agentSession: AgentSession;
+
   private static readonly REPLY_TASK_CANCEL_TIMEOUT = 5000;
+
   private started = false;
   private audioRecognition?: AudioRecognition;
   private realtimeSession?: RealtimeSession;
   private realtimeSpans?: Map<string, Span>; // Maps response_id to OTEL span for metrics recording
   private turnDetectionMode?: Exclude<TurnDetectionMode, _TurnDetector>;
   private logger = log();
-  private _draining = false;
+  private _schedulingPaused = true;
+  private _drainBlockedTasks: Task<any>[] = [];
   private _currentSpeech?: SpeechHandle;
   private speechQueue: Heap<[number, number, SpeechHandle]>; // [priority, timestamp, speechHandle]
   private q_updated: Future;
   private speechTasks: Set<Task<void>> = new Set();
   private lock = new Mutex();
-  private audioStream = new DeferredReadableStream<AudioFrame>();
+  private audioStream = new MultiInputStream<AudioFrame>();
+  private audioStreamId?: string;
+
   // default to null as None, which maps to the default provider tool choice value
   private toolChoice: ToolChoice | null = null;
   private _preemptiveGeneration?: PreemptiveGeneration;
 
-  agent: Agent;
-  agentSession: AgentSession;
-
   /** @internal */
   _mainTask?: Task<void>;
-  _userTurnCompletedTask?: Promise<void>;
-
+  _onEnterTask?: Task<void>;
+  _onExitTask?: Task<void>;
+  _userTurnCompletedTask?: Task<void>;
+
+  private readonly onRealtimeGenerationCreated = (ev: GenerationCreatedEvent) =>
+    this.onGenerationCreated(ev);
+  private readonly onRealtimeInputSpeechStarted = (ev: InputSpeechStartedEvent) =>
+    this.onInputSpeechStarted(ev);
+  private readonly onRealtimeInputSpeechStopped = (ev: InputSpeechStoppedEvent) =>
    this.onInputSpeechStopped(ev);
+  private readonly onRealtimeInputAudioTranscriptionCompleted = (ev: InputTranscriptionCompleted) =>
+    this.onInputAudioTranscriptionCompleted(ev);
+  private readonly onModelError = (ev: RealtimeModelError | STTError | TTSError | LLMError) =>
+    this.onError(ev);
   constructor(agent: Agent, agentSession: AgentSession) {
     this.agent = agent;
     this.agentSession = agentSession;
@@ -133,7 +155,7 @@ export class AgentActivity implements RecognitionHooks {
 
     if (this.turnDetectionMode === 'vad' && this.vad === undefined) {
       this.logger.warn(
-        'turnDetection is set to "vad", but no VAD model is provided, ignoring the turnDdetection setting',
+        'turnDetection is set to "vad", but no VAD model is provided, ignoring the turnDetection setting',
       );
       this.turnDetectionMode = undefined;
     }
@@ -211,120 +233,138 @@
   async start(): Promise<void> {
     const unlock = await this.lock.lock();
     try {
-      // Create start_agent_activity as a ROOT span (new trace) to match Python behavior
-      const startSpan = tracer.startSpan({
-        name: 'start_agent_activity',
-        attributes: { [traceTypes.ATTR_AGENT_LABEL]: this.agent.id },
-        context: ROOT_CONTEXT,
-      });
+      await this._startSession({ spanName: 'start_agent_activity', runOnEnter: true });
+    } finally {
+      unlock();
+    }
+  }
 
-      this.agent._agentActivity = this;
+  async resume(): Promise<void> {
+    const unlock = await this.lock.lock();
+    try {
+      await this._startSession({ spanName: 'resume_agent_activity', runOnEnter: false });
+    } finally {
+      unlock();
+    }
+  }
 
-      if (this.llm instanceof RealtimeModel) {
-        this.realtimeSession = this.llm.session();
-        this.realtimeSpans = new Map<string, Span>();
-        this.realtimeSession.on('generation_created', (ev) => this.onGenerationCreated(ev));
-        this.realtimeSession.on('input_speech_started', (ev) => this.onInputSpeechStarted(ev));
-        this.realtimeSession.on('input_speech_stopped', (ev) => this.onInputSpeechStopped(ev));
-        this.realtimeSession.on('input_audio_transcription_completed', (ev) =>
-          this.onInputAudioTranscriptionCompleted(ev),
-        );
-        this.realtimeSession.on('metrics_collected', (ev) => this.onMetricsCollected(ev));
-        this.realtimeSession.on('error', (ev) => this.onError(ev));
-
-        removeInstructions(this.agent._chatCtx);
-        try {
-          await this.realtimeSession.updateInstructions(this.agent.instructions);
-        } catch (error) {
-          this.logger.error(error, 'failed to update the instructions');
-        }
+  private async _startSession(options: {
+    spanName: 'start_agent_activity' | 'resume_agent_activity';
+    runOnEnter: boolean;
+  }): Promise<void> {
+    const { spanName, runOnEnter } = options;
+    const startSpan = tracer.startSpan({
+      name: spanName,
+      attributes: { [traceTypes.ATTR_AGENT_LABEL]: this.agent.id },
+      context: ROOT_CONTEXT,
+    });
 
-      try {
-        await this.realtimeSession.updateChatCtx(this.agent.chatCtx);
-      } catch (error) {
-        this.logger.error(error, 'failed to update the chat context');
-      }
+    this.agent._agentActivity = this;
 
-      try {
-        await this.realtimeSession.updateTools(this.tools);
-      } catch (error) {
-        this.logger.error(error, 'failed to update the tools');
-      }
+    if (this.llm instanceof RealtimeModel) {
+      this.realtimeSession = this.llm.session();
+      this.realtimeSpans = new Map<string, Span>();
+      this.realtimeSession.on('generation_created', this.onRealtimeGenerationCreated);
+      this.realtimeSession.on('input_speech_started', this.onRealtimeInputSpeechStarted);
+      this.realtimeSession.on('input_speech_stopped', this.onRealtimeInputSpeechStopped);
+      this.realtimeSession.on(
+        'input_audio_transcription_completed',
+        this.onRealtimeInputAudioTranscriptionCompleted,
+      );
+      this.realtimeSession.on('metrics_collected', this.onMetricsCollected);
+      this.realtimeSession.on('error', this.onModelError);
 
-      if (!this.llm.capabilities.audioOutput && !this.tts && this.agentSession.output.audio) {
-        this.logger.error(
-          'audio output is enabled but RealtimeModel has no audio modality ' +
-            'and no TTS is set. Either enable audio modality in the RealtimeModel ' +
-            'or set a TTS model.',
-        );
-      }
-    } else if (this.llm instanceof LLM) {
-      try {
-        updateInstructions({
-          chatCtx: this.agent._chatCtx,
-          instructions: this.agent.instructions,
-          addIfMissing: true,
-        });
-      } catch (error) {
-        this.logger.error('failed to update the instructions', error);
-      }
+      removeInstructions(this.agent._chatCtx);
+      try {
+        await this.realtimeSession.updateInstructions(this.agent.instructions);
+      } catch (error) {
+        this.logger.error(error, 'failed to update the instructions');
      }
 
-    // metrics and error handling
-    if (this.llm instanceof LLM) {
-      this.llm.on('metrics_collected', (ev) => this.onMetricsCollected(ev));
-      this.llm.on('error', (ev) => this.onError(ev));
+      try {
+        await this.realtimeSession.updateChatCtx(this.agent.chatCtx);
+      } catch (error) {
+        this.logger.error(error, 'failed to update the chat context');
      }
 
-    if (this.stt instanceof STT) {
-      this.stt.on('metrics_collected', (ev) => this.onMetricsCollected(ev));
-      this.stt.on('error', (ev) => this.onError(ev));
+      try {
+        await this.realtimeSession.updateTools(this.tools);
+      } catch (error) {
+        this.logger.error(error, 'failed to update the tools');
      }
 
-    if (this.tts instanceof TTS) {
-      this.tts.on('metrics_collected', (ev) => this.onMetricsCollected(ev));
-      this.tts.on('error', (ev) => this.onError(ev));
+      if (!this.llm.capabilities.audioOutput && !this.tts && this.agentSession.output.audio) {
+        this.logger.error(
+          'audio output is enabled but RealtimeModel has no audio modality ' +
+            'and no TTS is set. Either enable audio modality in the RealtimeModel ' +
+            'or set a TTS model.',
+        );
      }
-
-    if (this.vad instanceof VAD) {
-      this.vad.on('metrics_collected', (ev) => this.onMetricsCollected(ev));
+    } else if (this.llm instanceof LLM) {
+      try {
+        updateInstructions({
+          chatCtx: this.agent._chatCtx,
+          instructions: this.agent.instructions,
+          addIfMissing: true,
+        });
+      } catch (error) {
+        this.logger.error('failed to update the instructions', error);
      }
+    }
 
-    this.audioRecognition = new AudioRecognition({
-      recognitionHooks: this,
-      // Disable stt node if stt is not provided
-      stt: this.stt ? (...args) => this.agent.sttNode(...args) : undefined,
-      vad: this.vad,
-      turnDetector: typeof this.turnDetection === 'string' ? undefined : this.turnDetection,
-      turnDetectionMode: this.turnDetectionMode,
-      minEndpointingDelay: this.agentSession.options.minEndpointingDelay,
-      maxEndpointingDelay: this.agentSession.options.maxEndpointingDelay,
-      rootSpanContext: this.agentSession.rootSpanContext,
-      sttModel: this.stt?.label,
-      sttProvider: this.getSttProvider(),
-      getLinkedParticipant: () => this.agentSession._roomIO?.linkedParticipant,
-    });
-    this.audioRecognition.start();
-    this.started = true;
+    // metrics and error handling
+    if (this.llm instanceof LLM) {
+      this.llm.on('metrics_collected', this.onMetricsCollected);
+      this.llm.on('error', this.onModelError);
+    }
 
-    this._mainTask = Task.from(({ signal }) => this.mainTask(signal));
+    if (this.stt instanceof STT) {
+      this.stt.on('metrics_collected', this.onMetricsCollected);
+      this.stt.on('error', this.onModelError);
+    }
 
-    // Create on_enter as a child of start_agent_activity in the new trace
-    const onEnterTask = tracer.startActiveSpan(async () => this.agent.onEnter(), {
-      name: 'on_enter',
-      context: trace.setSpan(ROOT_CONTEXT, startSpan),
-      attributes: { [traceTypes.ATTR_AGENT_LABEL]: this.agent.id },
-    });
+    if (this.tts instanceof TTS) {
+      this.tts.on('metrics_collected', this.onMetricsCollected);
+      this.tts.on('error', this.onModelError);
+    }
 
-    this.createSpeechTask({
-      task: Task.from(() => onEnterTask),
+    if (this.vad instanceof VAD) {
+      this.vad.on('metrics_collected', this.onMetricsCollected);
+    }
+
+    this.audioRecognition = new AudioRecognition({
+      recognitionHooks: this,
+      // Disable stt node if stt is not provided
+      stt: this.stt ? (...args) => this.agent.sttNode(...args) : undefined,
+      vad: this.vad,
+      turnDetector: typeof this.turnDetection === 'string' ? undefined : this.turnDetection,
+      turnDetectionMode: this.turnDetectionMode,
+      minEndpointingDelay: this.agentSession.options.minEndpointingDelay,
+      maxEndpointingDelay: this.agentSession.options.maxEndpointingDelay,
+      rootSpanContext: this.agentSession.rootSpanContext,
+      sttModel: this.stt?.label,
+      sttProvider: this.getSttProvider(),
+      getLinkedParticipant: () => this.agentSession._roomIO?.linkedParticipant,
+    });
+    this.audioRecognition.start();
+    this.started = true;
+
+    this._resumeSchedulingTask();
+
+    if (runOnEnter) {
+      this._onEnterTask = this.createSpeechTask({
+        taskFn: () =>
+          tracer.startActiveSpan(async () => this.agent.onEnter(), {
+            name: 'on_enter',
+            context: trace.setSpan(ROOT_CONTEXT, startSpan),
+            attributes: { [traceTypes.ATTR_AGENT_LABEL]: this.agent.id },
+          }),
+        inlineTask: true,
         name: 'AgentActivity_onEnter',
       });
-
-      startSpan.end();
-    } finally {
-      unlock();
     }
+
+    startSpan.end();
   }
 
   get currentSpeech(): SpeechHandle | undefined {
@@ -362,8 +402,8 @@
     return this.agent.toolCtx;
   }
 
-  get draining(): boolean {
-    return this._draining;
+  get schedulingPaused(): boolean {
+    return this._schedulingPaused;
   }
 
   get realtimeLLMSession(): RealtimeSession | undefined {
@@ -417,18 +457,10 @@
   }
 
   attachAudioInput(audioStream: ReadableStream<AudioFrame>): void {
-    if (this.audioStream.isSourceSet) {
-      this.logger.debug('detaching existing audio input in agent activity');
-      this.audioStream.detachSource();
-    }
+    void this.audioStream.close();
+    this.audioStream = new MultiInputStream<AudioFrame>();
 
-    /**
-     * We need to add a deferred ReadableStream layer on top of the audioStream from the agent session.
-     * The tee() operation should be applied to the deferred stream, not the original audioStream.
-     * This is important because teeing the original stream directly makes it very difficult—if not
-     * impossible—to implement stream unlock logic cleanly.
-     */
-    this.audioStream.setSource(audioStream);
+    this.audioStreamId = this.audioStream.addInputStream(audioStream);
     const [realtimeAudioStream, recognitionAudioStream] = this.audioStream.stream.tee();
 
     if (this.realtimeSession) {
@@ -441,16 +473,29 @@
   }
 
   detachAudioInput(): void {
-    this.audioStream.detachSource();
+    if (this.audioStreamId === undefined) {
+      return;
+    }
+
+    void this.audioStream.close();
+    this.audioStream = new MultiInputStream<AudioFrame>();
+    this.audioStreamId = undefined;
   }
 
-  commitUserTurn() {
+  commitUserTurn(
+    options: {
+      audioDetached?: boolean;
+      throwIfNotReady?: boolean;
+    } = {},
+  ) {
+    const { audioDetached = false, throwIfNotReady = true } = options;
     if (!this.audioRecognition) {
-      throw new Error('AudioRecognition is not initialized');
+      if (throwIfNotReady) {
+        throw new Error('AudioRecognition is not initialized');
+      }
+      return;
     }
 
-    // TODO(brian): add audio_detached flag
-    const audioDetached = false;
     this.audioRecognition.commitUserTurn(audioDetached);
   }
 
@@ -508,14 +553,13 @@
       }),
     );
    const task = this.createSpeechTask({
-      task: Task.from((abortController: AbortController) =>
+      taskFn: (abortController: AbortController) =>
        this.ttsTask(handle, text, addToChatCtx, {}, abortController, audio),
-      ),
      ownedSpeechHandle: handle,
      name: 'AgentActivity.say_tts',
    });
 
-    task.finally(() => this.onPipelineReplyDone());
+    task.result.finally(() => this.onPipelineReplyDone());
    this.scheduleSpeech(handle, SpeechHandle.SPEECH_PRIORITY_NORMAL);
    return handle;
  }
@@ -628,9 +672,9 @@
      return;
    }
 
-    if (this.draining) {
+    if (this.schedulingPaused) {
      // TODO(shubhra): should we "forward" this new turn to the next agent?
-      this.logger.warn('skipping new realtime generation, the agent is draining');
+      this.logger.warn('skipping new realtime generation, the speech scheduling is not running');
      return;
    }
 
@@ -648,9 +692,8 @@
    this.logger.info({ speech_id: handle.id }, 'Creating speech handle');
 
    this.createSpeechTask({
-      task: Task.from((abortController: AbortController) =>
+      taskFn: (abortController: AbortController) =>
        this.realtimeGenerationTask(handle, ev, {}, abortController),
-      ),
      ownedSpeechHandle: handle,
      name: 'AgentActivity.realtimeGeneration',
    });
@@ -782,7 +825,7 @@
  onPreemptiveGeneration(info: PreemptiveGenerationInfo): void {
    if (
      !this.agentSession.options.preemptiveGeneration ||
-      this.draining ||
+      this.schedulingPaused ||
      (this._currentSpeech !== undefined && !this._currentSpeech.interrupted) ||
      !(this.llm instanceof LLM)
    ) {
@@ -829,11 +872,32 @@
  }
 
  private createSpeechTask(options: {
-    task: Task<void>;
+    taskFn: (controller: AbortController) => Promise<void>;
+    controller?: AbortController;
    ownedSpeechHandle?: SpeechHandle;
+    inlineTask?: boolean;
    name?: string;
-  }): Promise<void> {
-    const { task, ownedSpeechHandle } = options;
+  }): Task<void> {
+    const { taskFn, controller, ownedSpeechHandle, inlineTask, name } = options;
+
+    const wrappedFn = (ctrl: AbortController) => {
+      return agentActivityStorage.run(this, () => {
+        // Mark inline/speech metadata at task runtime to avoid a race where taskFn executes
+        // before post-construction metadata is attached to the Task instance.
+        const currentTask = Task.current();
+        if (currentTask) {
+          _setActivityTaskInfo(currentTask, { speechHandle: ownedSpeechHandle, inlineTask });
+        }
+
+        if (ownedSpeechHandle) {
+          return speechHandleStorage.run(ownedSpeechHandle, () => taskFn(ctrl));
+        }
+        return taskFn(ctrl);
+      });
+    };
+
+    const task = Task.from(wrappedFn, controller, name);
+    _setActivityTaskInfo(task, { speechHandle: ownedSpeechHandle, inlineTask });
 
    this.speechTasks.add(task);
    task.addDoneCallback(() => {
@@ -853,13 +917,16 @@
      this.wakeupMainTask();
    });
 
-    return task.result;
+    return task;
  }
 
  async onEndOfTurn(info: EndOfTurnInfo): Promise<boolean> {
-    if (this.draining) {
+    if (this.schedulingPaused) {
      this.cancelPreemptiveGeneration();
-      this.logger.warn({ user_input: info.newTranscript }, 'skipping user input, task is draining');
+      this.logger.warn(
+        { user_input: info.newTranscript },
+        'skipping user input, speech scheduling is paused',
+      );
      // TODO(shubhra): should we "forward" this new turn to the next agent/activity?
      return true;
    }
@@ -892,7 +959,7 @@
 
    const oldTask = this._userTurnCompletedTask;
    this._userTurnCompletedTask = this.createSpeechTask({
-      task: Task.from(() => this.userTurnCompleted(info, oldTask)),
+      taskFn: () => this.userTurnCompleted(info, oldTask),
      name: 'AgentActivity.userTurnCompleted',
    });
    return true;
@@ -928,10 +995,12 @@
        this._currentSpeech = undefined;
      }
 
-      // If we're draining and there are no more speech tasks, we can exit.
-      // Only speech tasks can bypass draining to create a tool response
-      if (this.draining && this.speechTasks.size === 0) {
-        this.logger.info('mainTask: draining and no more speech tasks');
+      // if we're draining/pausing and there are no more speech tasks, we can exit.
+      // only speech tasks can bypass draining to create a tool response (see scheduleSpeech)
+      const toWait = this.getDrainPendingSpeechTasks();
+
+      if (this._schedulingPaused && toWait.length === 0) {
+        this.logger.info('mainTask: scheduling paused and no more speech tasks to wait');
        break;
      }
 
@@ -941,6 +1010,39 @@
    this.logger.info('AgentActivity mainTask: exiting');
  }
 
+  private getDrainPendingSpeechTasks(): Task<void>[] {
+    const blockedHandles: SpeechHandle[] = [];
+
+    for (const task of this._drainBlockedTasks) {
+      const info = _getActivityTaskInfo(task);
+      if (!info) {
+        this.logger.error('blocked task without activity info; skipping.');
+        continue;
+      }
+
+      if (!info.speechHandle) {
+        continue; // onEnter/onExit
+      }
+
+      blockedHandles.push(info.speechHandle);
+    }
+
+    const toWait: Task<void>[] = [];
+    for (const task of this.speechTasks) {
+      if (this._drainBlockedTasks.includes(task)) {
+        continue;
+      }
+
+      const info = _getActivityTaskInfo(task);
+      if (info && info.speechHandle && blockedHandles.includes(info.speechHandle)) {
+        continue;
+      }
+
+      toWait.push(task);
+    }
+    return toWait;
+  }
+
  private wakeupMainTask(): void {
    this.q_updated.resolve();
  }
@@ -982,7 +1084,7 @@
      throw new Error('trying to generate reply without an LLM model');
    }
 
-    const functionCall = asyncLocalStorage.getStore()?.functionCall;
+    const functionCall = functionCallStorage.getStore()?.functionCall;
    if (toolChoice === undefined && functionCall !== undefined) {
      // when generateReply is called inside a tool, set toolChoice to 'none' by default
      toolChoice = 'none';
@@ -1004,7 +1106,7 @@
 
    if (this.llm instanceof RealtimeModel) {
      this.createSpeechTask({
-        task: Task.from((abortController: AbortController) =>
+        taskFn: (abortController: AbortController) =>
          this.realtimeReplyTask({
            speechHandle: handle,
            // TODO(brian): support llm.ChatMessage for the realtime model
@@ -1016,7 +1118,6 @@
            },
            abortController,
          }),
-        ),
        ownedSpeechHandle: handle,
        name: 'AgentActivity.realtimeReply',
      });
@@ -1029,7 +1130,7 @@
    }
 
      const task = this.createSpeechTask({
-        task: Task.from((abortController: AbortController) =>
+        taskFn: (abortController: AbortController) =>
          this.pipelineReplyTask(
            handle,
            chatCtx ?? this.agent.chatCtx,
@@ -1041,12 +1142,11 @@
            instructions,
            userMessage,
          ),
-        ),
        ownedSpeechHandle: handle,
        name: 'AgentActivity.pipelineReply',
      });
 
-      task.finally(() => this.onPipelineReplyDone());
+      task.result.finally(() => this.onPipelineReplyDone());
    }
 
    if (scheduleSpeech) {
@@ -1055,16 +1155,19 @@
    return handle;
  }
 
-  interrupt(): Future<void> {
+  interrupt(options: { force?: boolean } = {}): Future<void> {
+    const { force = false } = options;
+    this.cancelPreemptiveGeneration();
+
    const future = new Future<void>();
    const currentSpeech = this._currentSpeech;
 
    //TODO(AJS-273): add interrupt for background speeches
 
-    currentSpeech?.interrupt();
+    currentSpeech?.interrupt(force);
 
    for (const [_, __, speech] of this.speechQueue) {
-      speech.interrupt();
+      speech.interrupt(force);
    }
 
    this.realtimeSession?.interrupt();
@@ -1087,13 +1190,13 @@
    }
  }
 
-  private async userTurnCompleted(info: EndOfTurnInfo, oldTask?: Promise<void>): Promise<void> {
+  private async userTurnCompleted(info: EndOfTurnInfo, oldTask?: Task<void>): Promise<void> {
    if (oldTask) {
      // We never cancel user code as this is very confusing.
      // So we wait for the old execution of onUserTurnCompleted to finish.
      // In practice this is OK because most speeches will be interrupted if a new turn
      // is detected. So the previous execution should complete quickly.
-      await oldTask;
+      await oldTask.result;
    }
 
    // When the audio recognition detects the end of a user turn:
@@ -1551,13 +1654,15 @@
      for (const msg of toolsMessages) {
        msg.createdAt = replyStartedAt;
      }
-      this.agent._chatCtx.insert(toolsMessages);
-      // Only add FunctionCallOutput items to session history since FunctionCall items
-      // were already added by onToolExecutionStarted when the tool execution began
+      // Only insert FunctionCallOutput items into agent._chatCtx since FunctionCall items
+      // were already added by onToolExecutionStarted when the tool execution began.
+      // Inserting function_calls again would create duplicates that break provider APIs
+      // (e.g. Google's "function response parts != function call parts" error).
      const toolCallOutputs = toolsMessages.filter(
        (m): m is FunctionCallOutput => m.type === 'function_call_output',
      );
      if (toolCallOutputs.length > 0) {
+        this.agent._chatCtx.insert(toolCallOutputs);
        this.agentSession._toolItemsAdded(toolCallOutputs);
      }
    }
@@ -1665,52 +1770,18 @@
      return;
    }
 
-    const functionToolsExecutedEvent = createFunctionToolsExecutedEvent({
-      functionCalls: [],
-      functionCallOutputs: [],
-    });
-    let shouldGenerateToolReply: boolean = false;
-    let newAgentTask: Agent | null = null;
-    let ignoreTaskSwitch: boolean = false;
-
-    for (const sanitizedOut of toolOutput.output) {
-      if (sanitizedOut.toolCallOutput !== undefined) {
-        functionToolsExecutedEvent.functionCalls.push(sanitizedOut.toolCall);
-        functionToolsExecutedEvent.functionCallOutputs.push(sanitizedOut.toolCallOutput);
-        if (sanitizedOut.replyRequired) {
-          shouldGenerateToolReply = true;
-        }
-      }
-
-      if (newAgentTask !== null && sanitizedOut.agentTask !== undefined) {
-        this.logger.error('expected to receive only one agent task from the tool executions');
-        ignoreTaskSwitch = true;
-        // TODO(brian): should we mark the function call as failed to notify the LLM?
-      }
-
-      newAgentTask = sanitizedOut.agentTask ?? null;
-
-      this.logger.debug(
-        {
-          speechId: speechHandle.id,
-          name: sanitizedOut.toolCall?.name,
-          args: sanitizedOut.toolCall.args,
-          output: sanitizedOut.toolCallOutput?.output,
-          isError: sanitizedOut.toolCallOutput?.isError,
-        },
-        'Tool call execution finished',
-      );
-    }
+    const { functionToolsExecutedEvent, shouldGenerateToolReply, newAgentTask, ignoreTaskSwitch } =
+      this.summarizeToolExecutionOutput(toolOutput, speechHandle);
 
    this.agentSession.emit(
      AgentSessionEventTypes.FunctionToolsExecuted,
      functionToolsExecutedEvent,
    );
 
-    let draining = this.draining;
+    let schedulingPaused = this.schedulingPaused;
    if (!ignoreTaskSwitch && newAgentTask !== null) {
      this.agentSession.updateAgent(newAgentTask);
-      draining = true;
+      schedulingPaused = true;
    }
 
    const toolMessages = [
@@ -1725,11 +1796,12 @@
 
      // Avoid setting tool_choice to "required" or a specific function when
      // passing tool response back to the LLM
-      const respondToolChoice = draining || modelSettings.toolChoice === 'none' ? 'none' : 'auto';
+      const respondToolChoice =
+        schedulingPaused || modelSettings.toolChoice === 'none' ? 'none' : 'auto';
 
      // Reuse same speechHandle for tool response (parity with Python agent_activity.py L2122-2140)
      const toolResponseTask = this.createSpeechTask({
-        task: Task.from(() =>
+        taskFn: () =>
          this.pipelineReplyTask(
            speechHandle,
            chatCtx,
@@ -1740,12 +1812,11 @@
            undefined,
            toolMessages,
          ),
-        ),
        ownedSpeechHandle: speechHandle,
        name: 'AgentActivity.pipelineReply',
      });
 
-      toolResponseTask.finally(() => this.onPipelineReplyDone());
+      toolResponseTask.result.finally(() => this.onPipelineReplyDone());
 
      this.scheduleSpeech(speechHandle, SpeechHandle.SPEECH_PRIORITY_NORMAL, true);
    } else if (functionToolsExecutedEvent.functionCallOutputs.length > 0) {
@@ -1753,15 +1824,12 @@
        msg.createdAt = replyStartedAt;
      }
 
-      this.agent._chatCtx.insert(toolMessages);
-
-      // Only add FunctionCallOutput items to session history since FunctionCall items
-      // were already added by onToolExecutionStarted when the tool execution began
      const toolCallOutputs = toolMessages.filter(
        (m): m is FunctionCallOutput => m.type === 'function_call_output',
      );
 
      if (toolCallOutputs.length > 0) {
+        this.agent._chatCtx.insert(toolCallOutputs);
        this.agentSession._toolItemsAdded(toolCallOutputs);
      }
    }
@@ -2164,50 +2232,18 @@
      return;
    }
 
-    const functionToolsExecutedEvent = createFunctionToolsExecutedEvent({
-      functionCalls: [],
-      functionCallOutputs: [],
-    });
-    let shouldGenerateToolReply: boolean = false;
-    let newAgentTask: Agent | null = null;
-    let ignoreTaskSwitch: boolean = false;
-
-    for (const sanitizedOut of toolOutput.output) {
-      if (sanitizedOut.toolCallOutput !== undefined) {
-        functionToolsExecutedEvent.functionCallOutputs.push(sanitizedOut.toolCallOutput);
-        if (sanitizedOut.replyRequired) {
-          shouldGenerateToolReply = true;
-        }
-      }
-
-      if (newAgentTask !== null && sanitizedOut.agentTask !== undefined) {
-        this.logger.error('expected to receive only one agent task from the tool executions');
-        ignoreTaskSwitch = true;
-      }
-
-      newAgentTask = sanitizedOut.agentTask ?? null;
-
-      this.logger.debug(
-        {
-          speechId: speechHandle.id,
-          name: sanitizedOut.toolCall?.name,
-          args: sanitizedOut.toolCall.args,
-          output: sanitizedOut.toolCallOutput?.output,
-          isError: sanitizedOut.toolCallOutput?.isError,
-        },
-        'Tool call execution finished',
-      );
-    }
+    const { functionToolsExecutedEvent, shouldGenerateToolReply, newAgentTask, ignoreTaskSwitch } =
+      this.summarizeToolExecutionOutput(toolOutput, speechHandle);
 
    this.agentSession.emit(
      AgentSessionEventTypes.FunctionToolsExecuted,
      functionToolsExecutedEvent,
    );
 
-    let draining = this.draining;
+    let schedulingPaused = this.schedulingPaused;
    if (!ignoreTaskSwitch && newAgentTask !== null) {
      this.agentSession.updateAgent(newAgentTask);
-      draining = true;
+      schedulingPaused = true;
    }
 
    if (functionToolsExecutedEvent.functionCallOutputs.length > 0) {
@@ -2263,15 +2299,14 @@
        }),
      );
 
-      const toolChoice = draining || modelSettings.toolChoice === 'none' ? 'none' : 'auto';
+      const toolChoice = schedulingPaused || modelSettings.toolChoice === 'none' ? 'none' : 'auto';
      this.createSpeechTask({
-        task: Task.from((abortController: AbortController) =>
+        taskFn: (abortController: AbortController) =>
          this.realtimeReplyTask({
            speechHandle: replySpeechHandle,
            modelSettings: { toolChoice },
            abortController,
          }),
-        ),
        ownedSpeechHandle: replySpeechHandle,
        name: 'AgentActivity.realtime_reply',
      });
@@ -2279,6 +2314,53 @@
    this.scheduleSpeech(replySpeechHandle, SpeechHandle.SPEECH_PRIORITY_NORMAL, true);
  }
 
+  private summarizeToolExecutionOutput(toolOutput: ToolOutput, speechHandle: SpeechHandle) {
+    const functionToolsExecutedEvent = createFunctionToolsExecutedEvent({
+      functionCalls: [],
+      functionCallOutputs: [],
+    });
+
+    let shouldGenerateToolReply = false;
+    let newAgentTask: Agent | null = null;
+    let ignoreTaskSwitch = false;
+
+    for (const sanitizedOut of toolOutput.output) {
+      if (sanitizedOut.toolCallOutput !== undefined) {
+        // Keep event payload symmetric for pipeline + realtime paths.
+        functionToolsExecutedEvent.functionCalls.push(sanitizedOut.toolCall);
+        functionToolsExecutedEvent.functionCallOutputs.push(sanitizedOut.toolCallOutput);
+        if (sanitizedOut.replyRequired) {
+          shouldGenerateToolReply = true;
+        }
+      }
+
+      if (newAgentTask !== null && sanitizedOut.agentTask !== undefined) {
+        this.logger.error('expected to receive only one agent task from the tool executions');
+        ignoreTaskSwitch = true;
+      }
+
+      newAgentTask = sanitizedOut.agentTask ?? null;
+
+      this.logger.debug(
+        {
+          speechId: speechHandle.id,
+          name: sanitizedOut.toolCall?.name,
+          args: sanitizedOut.toolCall.args,
+          output: sanitizedOut.toolCallOutput?.output,
+          isError: sanitizedOut.toolCallOutput?.isError,
+        },
+        'Tool call execution finished',
+      );
+    }
+
+    return {
+      functionToolsExecutedEvent,
+      shouldGenerateToolReply,
+      newAgentTask,
+      ignoreTaskSwitch,
+    };
+  }
+
  private async realtimeReplyTask({
    speechHandle,
    modelSettings: { toolChoice },
@@ -2337,10 +2419,10 @@
    priority: number,
    force: boolean = false,
  ): void {
-    // when force=true, we allow tool responses to bypass draining
+    // when force=true, we allow tool responses to bypass scheduling pause
    // This allows for tool responses to be generated before the AgentActivity is finalized
-    if (this.draining && !force) {
-      throw new Error('cannot schedule new speech, the agent is draining');
+    if (this.schedulingPaused && !force) {
+      throw new Error('cannot schedule new speech, the speech scheduling is draining/pausing');
    }
 
    // Monotonic time to avoid near 0 collisions
@@ -2349,6 +2431,48 @@
    this.wakeupMainTask();
  }
 
+  private async _pauseSchedulingTask(blockedTasks: Task<any>[]): Promise<void> {
+    if (this._schedulingPaused) return;
+
+    this._schedulingPaused = true;
+    this._drainBlockedTasks = blockedTasks;
+    this.wakeupMainTask();
+
+    if (this._mainTask) {
+      // When pausing/draining, we ensure that all speech_tasks complete fully.
+      // This means that even if the SpeechHandle themselves have finished,
+      // we still wait for the entire execution (e.g function_tools)
+      await this._mainTask.result;
+    }
+  }
+
+  private _resumeSchedulingTask(): void {
+    if (!this._schedulingPaused) return;
+
+    this._schedulingPaused = false;
+    this._mainTask = Task.from(({ signal }) => this.mainTask(signal));
+  }
+
+  async pause(options: { blockedTasks?: Task<any>[] } = {}): Promise<void> {
+    const { blockedTasks = [] } = options;
+    const unlock = await this.lock.lock();
+
+    try {
+      const span = tracer.startSpan({
+        name: 'pause_agent_activity',
+        attributes: { [traceTypes.ATTR_AGENT_LABEL]: this.agent.id },
+      });
+      try {
+        await this._pauseSchedulingTask(blockedTasks);
+        await this._closeSessionResources();
+      } finally {
+        span.end();
+      }
+    } finally {
+      unlock();
+    }
+  }
+
  async drain(): Promise<void> {
    // Create drain_agent_activity as a ROOT span (new trace) to match Python behavior
    return tracer.startActiveSpan(async (span) => this._drainImpl(span), {
@@ -2362,23 +2486,22 @@
 
    const unlock = await this.lock.lock();
    try {
-      if (this._draining) return;
+      if (this._schedulingPaused) return;
 
-      this.cancelPreemptiveGeneration();
-
-      const onExitTask = tracer.startActiveSpan(async () => this.agent.onExit(), {
-        name: 'on_exit',
-        attributes: { [traceTypes.ATTR_AGENT_LABEL]: this.agent.id },
-      });
-
-      this.createSpeechTask({
-        task: Task.from(() => onExitTask),
+      this._onExitTask = this.createSpeechTask({
+        taskFn: () =>
+          tracer.startActiveSpan(async () => this.agent.onExit(), {
+            name: 'on_exit',
+            attributes: { [traceTypes.ATTR_AGENT_LABEL]: this.agent.id },
+          }),
+        inlineTask: true,
        name: 'AgentActivity_onExit',
      });
 
-      this.wakeupMainTask();
-      this._draining = true;
-      await this._mainTask?.result;
+      this.cancelPreemptiveGeneration();
+
+      await this._onExitTask.result;
+      await this._pauseSchedulingTask([]);
    } finally {
      unlock();
    }
@@ -2387,44 +2510,59 @@
  async close(): Promise<void> {
    const unlock = await this.lock.lock();
    try {
-      if (!this._draining) {
-        this.logger.warn('task closing without draining');
-      }
-
      this.cancelPreemptiveGeneration();
-      // Unregister event handlers to prevent duplicate metrics
-      if (this.llm instanceof LLM) {
-        this.llm.off('metrics_collected', this.onMetricsCollected);
-      }
-      if (this.realtimeSession) {
-        this.realtimeSession.off('generation_created', this.onGenerationCreated);
-        this.realtimeSession.off('input_speech_started', this.onInputSpeechStarted);
-        this.realtimeSession.off('input_speech_stopped', this.onInputSpeechStopped);
-        this.realtimeSession.off(
-          'input_audio_transcription_completed',
-          this.onInputAudioTranscriptionCompleted,
-        );
-        this.realtimeSession.off('metrics_collected', this.onMetricsCollected);
-      }
-      if (this.stt instanceof STT) {
-        this.stt.off('metrics_collected', this.onMetricsCollected);
-      }
-      if (this.tts instanceof TTS) {
-        this.tts.off('metrics_collected', this.onMetricsCollected);
-      }
-      if (this.vad instanceof VAD) {
-        this.vad.off('metrics_collected', this.onMetricsCollected);
+      await this._closeSessionResources();
+
+      if (this._mainTask) {
+        await this._mainTask.cancelAndWait();
      }
 
-      this.detachAudioInput();
-      this.realtimeSpans?.clear();
-      await this.realtimeSession?.close();
-      await this.audioRecognition?.close();
-      await this._mainTask?.cancelAndWait();
+      this.agent._agentActivity = undefined;
    } finally {
      unlock();
    }
  }
+
+  private async _closeSessionResources(): Promise<void> {
+    // Unregister event handlers to prevent duplicate metrics
+    if (this.llm instanceof LLM) {
+      this.llm.off('metrics_collected', this.onMetricsCollected);
+      this.llm.off('error', this.onModelError);
+    }
+
+    if (this.realtimeSession) {
+      this.realtimeSession.off('generation_created', this.onRealtimeGenerationCreated);
+      this.realtimeSession.off('input_speech_started', this.onRealtimeInputSpeechStarted);
+      this.realtimeSession.off('input_speech_stopped', this.onRealtimeInputSpeechStopped);
+      this.realtimeSession.off(
+        'input_audio_transcription_completed',
+        this.onRealtimeInputAudioTranscriptionCompleted,
+      );
+      this.realtimeSession.off('metrics_collected', this.onMetricsCollected);
+      this.realtimeSession.off('error', this.onModelError);
+    }
+
+    if (this.stt instanceof STT) {
+      this.stt.off('metrics_collected', this.onMetricsCollected);
+      this.stt.off('error', this.onModelError);
+    }
+
+    if (this.tts instanceof TTS) {
+      this.tts.off('metrics_collected', this.onMetricsCollected);
+      this.tts.off('error', this.onModelError);
+    }
+
+    if (this.vad instanceof VAD) {
+      this.vad.off('metrics_collected', this.onMetricsCollected);
+    }
+
+    this.detachAudioInput();
+    this.realtimeSpans?.clear();
+    await this.realtimeSession?.close();
+    await this.audioRecognition?.close();
+    this.realtimeSession = undefined;
+    this.audioRecognition = undefined;
+  }
 }
 
 function toOaiToolChoice(toolChoice: ToolChoice | null): ToolChoice | undefined {