@livekit/agents 1.0.45 → 1.0.47

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (225)
  1. package/dist/cli.cjs +14 -20
  2. package/dist/cli.cjs.map +1 -1
  3. package/dist/cli.d.ts.map +1 -1
  4. package/dist/cli.js +14 -20
  5. package/dist/cli.js.map +1 -1
  6. package/dist/ipc/job_proc_lazy_main.cjs +14 -5
  7. package/dist/ipc/job_proc_lazy_main.cjs.map +1 -1
  8. package/dist/ipc/job_proc_lazy_main.js +14 -5
  9. package/dist/ipc/job_proc_lazy_main.js.map +1 -1
  10. package/dist/llm/chat_context.cjs +19 -0
  11. package/dist/llm/chat_context.cjs.map +1 -1
  12. package/dist/llm/chat_context.d.cts +4 -0
  13. package/dist/llm/chat_context.d.ts +4 -0
  14. package/dist/llm/chat_context.d.ts.map +1 -1
  15. package/dist/llm/chat_context.js +19 -0
  16. package/dist/llm/chat_context.js.map +1 -1
  17. package/dist/llm/provider_format/index.cjs +2 -0
  18. package/dist/llm/provider_format/index.cjs.map +1 -1
  19. package/dist/llm/provider_format/index.d.cts +1 -1
  20. package/dist/llm/provider_format/index.d.ts +1 -1
  21. package/dist/llm/provider_format/index.d.ts.map +1 -1
  22. package/dist/llm/provider_format/index.js +6 -1
  23. package/dist/llm/provider_format/index.js.map +1 -1
  24. package/dist/llm/provider_format/openai.cjs +82 -2
  25. package/dist/llm/provider_format/openai.cjs.map +1 -1
  26. package/dist/llm/provider_format/openai.d.cts +1 -0
  27. package/dist/llm/provider_format/openai.d.ts +1 -0
  28. package/dist/llm/provider_format/openai.d.ts.map +1 -1
  29. package/dist/llm/provider_format/openai.js +80 -1
  30. package/dist/llm/provider_format/openai.js.map +1 -1
  31. package/dist/llm/provider_format/openai.test.cjs +326 -0
  32. package/dist/llm/provider_format/openai.test.cjs.map +1 -1
  33. package/dist/llm/provider_format/openai.test.js +327 -1
  34. package/dist/llm/provider_format/openai.test.js.map +1 -1
  35. package/dist/llm/provider_format/utils.cjs +4 -3
  36. package/dist/llm/provider_format/utils.cjs.map +1 -1
  37. package/dist/llm/provider_format/utils.d.ts.map +1 -1
  38. package/dist/llm/provider_format/utils.js +4 -3
  39. package/dist/llm/provider_format/utils.js.map +1 -1
  40. package/dist/llm/realtime.cjs.map +1 -1
  41. package/dist/llm/realtime.d.cts +1 -0
  42. package/dist/llm/realtime.d.ts +1 -0
  43. package/dist/llm/realtime.d.ts.map +1 -1
  44. package/dist/llm/realtime.js.map +1 -1
  45. package/dist/log.cjs +5 -2
  46. package/dist/log.cjs.map +1 -1
  47. package/dist/log.d.ts.map +1 -1
  48. package/dist/log.js +5 -2
  49. package/dist/log.js.map +1 -1
  50. package/dist/stream/deferred_stream.cjs +15 -6
  51. package/dist/stream/deferred_stream.cjs.map +1 -1
  52. package/dist/stream/deferred_stream.d.ts.map +1 -1
  53. package/dist/stream/deferred_stream.js +15 -6
  54. package/dist/stream/deferred_stream.js.map +1 -1
  55. package/dist/stream/index.cjs +3 -0
  56. package/dist/stream/index.cjs.map +1 -1
  57. package/dist/stream/index.d.cts +1 -0
  58. package/dist/stream/index.d.ts +1 -0
  59. package/dist/stream/index.d.ts.map +1 -1
  60. package/dist/stream/index.js +2 -0
  61. package/dist/stream/index.js.map +1 -1
  62. package/dist/stream/multi_input_stream.cjs +139 -0
  63. package/dist/stream/multi_input_stream.cjs.map +1 -0
  64. package/dist/stream/multi_input_stream.d.cts +55 -0
  65. package/dist/stream/multi_input_stream.d.ts +55 -0
  66. package/dist/stream/multi_input_stream.d.ts.map +1 -0
  67. package/dist/stream/multi_input_stream.js +115 -0
  68. package/dist/stream/multi_input_stream.js.map +1 -0
  69. package/dist/stream/multi_input_stream.test.cjs +340 -0
  70. package/dist/stream/multi_input_stream.test.cjs.map +1 -0
  71. package/dist/stream/multi_input_stream.test.js +339 -0
  72. package/dist/stream/multi_input_stream.test.js.map +1 -0
  73. package/dist/telemetry/trace_types.cjs +42 -0
  74. package/dist/telemetry/trace_types.cjs.map +1 -1
  75. package/dist/telemetry/trace_types.d.cts +14 -0
  76. package/dist/telemetry/trace_types.d.ts +14 -0
  77. package/dist/telemetry/trace_types.d.ts.map +1 -1
  78. package/dist/telemetry/trace_types.js +28 -0
  79. package/dist/telemetry/trace_types.js.map +1 -1
  80. package/dist/utils.cjs +44 -2
  81. package/dist/utils.cjs.map +1 -1
  82. package/dist/utils.d.cts +8 -0
  83. package/dist/utils.d.ts +8 -0
  84. package/dist/utils.d.ts.map +1 -1
  85. package/dist/utils.js +44 -2
  86. package/dist/utils.js.map +1 -1
  87. package/dist/utils.test.cjs +71 -0
  88. package/dist/utils.test.cjs.map +1 -1
  89. package/dist/utils.test.js +71 -0
  90. package/dist/utils.test.js.map +1 -1
  91. package/dist/version.cjs +1 -1
  92. package/dist/version.cjs.map +1 -1
  93. package/dist/version.d.cts +1 -1
  94. package/dist/version.d.ts +1 -1
  95. package/dist/version.d.ts.map +1 -1
  96. package/dist/version.js +1 -1
  97. package/dist/version.js.map +1 -1
  98. package/dist/voice/agent.cjs +144 -12
  99. package/dist/voice/agent.cjs.map +1 -1
  100. package/dist/voice/agent.d.cts +29 -4
  101. package/dist/voice/agent.d.ts +29 -4
  102. package/dist/voice/agent.d.ts.map +1 -1
  103. package/dist/voice/agent.js +140 -11
  104. package/dist/voice/agent.js.map +1 -1
  105. package/dist/voice/agent.test.cjs +120 -0
  106. package/dist/voice/agent.test.cjs.map +1 -1
  107. package/dist/voice/agent.test.js +122 -2
  108. package/dist/voice/agent.test.js.map +1 -1
  109. package/dist/voice/agent_activity.cjs +402 -292
  110. package/dist/voice/agent_activity.cjs.map +1 -1
  111. package/dist/voice/agent_activity.d.cts +35 -7
  112. package/dist/voice/agent_activity.d.ts +35 -7
  113. package/dist/voice/agent_activity.d.ts.map +1 -1
  114. package/dist/voice/agent_activity.js +402 -287
  115. package/dist/voice/agent_activity.js.map +1 -1
  116. package/dist/voice/agent_session.cjs +156 -44
  117. package/dist/voice/agent_session.cjs.map +1 -1
  118. package/dist/voice/agent_session.d.cts +22 -9
  119. package/dist/voice/agent_session.d.ts +22 -9
  120. package/dist/voice/agent_session.d.ts.map +1 -1
  121. package/dist/voice/agent_session.js +156 -44
  122. package/dist/voice/agent_session.js.map +1 -1
  123. package/dist/voice/audio_recognition.cjs +89 -36
  124. package/dist/voice/audio_recognition.cjs.map +1 -1
  125. package/dist/voice/audio_recognition.d.cts +22 -1
  126. package/dist/voice/audio_recognition.d.ts +22 -1
  127. package/dist/voice/audio_recognition.d.ts.map +1 -1
  128. package/dist/voice/audio_recognition.js +93 -36
  129. package/dist/voice/audio_recognition.js.map +1 -1
  130. package/dist/voice/audio_recognition_span.test.cjs +233 -0
  131. package/dist/voice/audio_recognition_span.test.cjs.map +1 -0
  132. package/dist/voice/audio_recognition_span.test.js +232 -0
  133. package/dist/voice/audio_recognition_span.test.js.map +1 -0
  134. package/dist/voice/generation.cjs +39 -19
  135. package/dist/voice/generation.cjs.map +1 -1
  136. package/dist/voice/generation.d.ts.map +1 -1
  137. package/dist/voice/generation.js +44 -20
  138. package/dist/voice/generation.js.map +1 -1
  139. package/dist/voice/index.cjs +2 -0
  140. package/dist/voice/index.cjs.map +1 -1
  141. package/dist/voice/index.d.cts +1 -1
  142. package/dist/voice/index.d.ts +1 -1
  143. package/dist/voice/index.d.ts.map +1 -1
  144. package/dist/voice/index.js +2 -1
  145. package/dist/voice/index.js.map +1 -1
  146. package/dist/voice/io.cjs +6 -3
  147. package/dist/voice/io.cjs.map +1 -1
  148. package/dist/voice/io.d.cts +3 -2
  149. package/dist/voice/io.d.ts +3 -2
  150. package/dist/voice/io.d.ts.map +1 -1
  151. package/dist/voice/io.js +6 -3
  152. package/dist/voice/io.js.map +1 -1
  153. package/dist/voice/recorder_io/recorder_io.cjs +3 -1
  154. package/dist/voice/recorder_io/recorder_io.cjs.map +1 -1
  155. package/dist/voice/recorder_io/recorder_io.d.ts.map +1 -1
  156. package/dist/voice/recorder_io/recorder_io.js +3 -1
  157. package/dist/voice/recorder_io/recorder_io.js.map +1 -1
  158. package/dist/voice/room_io/_input.cjs +17 -17
  159. package/dist/voice/room_io/_input.cjs.map +1 -1
  160. package/dist/voice/room_io/_input.d.cts +2 -2
  161. package/dist/voice/room_io/_input.d.ts +2 -2
  162. package/dist/voice/room_io/_input.d.ts.map +1 -1
  163. package/dist/voice/room_io/_input.js +7 -6
  164. package/dist/voice/room_io/_input.js.map +1 -1
  165. package/dist/voice/room_io/room_io.cjs +9 -0
  166. package/dist/voice/room_io/room_io.cjs.map +1 -1
  167. package/dist/voice/room_io/room_io.d.cts +3 -1
  168. package/dist/voice/room_io/room_io.d.ts +3 -1
  169. package/dist/voice/room_io/room_io.d.ts.map +1 -1
  170. package/dist/voice/room_io/room_io.js +9 -0
  171. package/dist/voice/room_io/room_io.js.map +1 -1
  172. package/dist/voice/speech_handle.cjs +7 -1
  173. package/dist/voice/speech_handle.cjs.map +1 -1
  174. package/dist/voice/speech_handle.d.cts +2 -0
  175. package/dist/voice/speech_handle.d.ts +2 -0
  176. package/dist/voice/speech_handle.d.ts.map +1 -1
  177. package/dist/voice/speech_handle.js +8 -2
  178. package/dist/voice/speech_handle.js.map +1 -1
  179. package/dist/voice/testing/run_result.cjs +66 -15
  180. package/dist/voice/testing/run_result.cjs.map +1 -1
  181. package/dist/voice/testing/run_result.d.cts +14 -3
  182. package/dist/voice/testing/run_result.d.ts +14 -3
  183. package/dist/voice/testing/run_result.d.ts.map +1 -1
  184. package/dist/voice/testing/run_result.js +66 -15
  185. package/dist/voice/testing/run_result.js.map +1 -1
  186. package/dist/voice/utils.cjs +47 -0
  187. package/dist/voice/utils.cjs.map +1 -0
  188. package/dist/voice/utils.d.cts +4 -0
  189. package/dist/voice/utils.d.ts +4 -0
  190. package/dist/voice/utils.d.ts.map +1 -0
  191. package/dist/voice/utils.js +23 -0
  192. package/dist/voice/utils.js.map +1 -0
  193. package/package.json +1 -1
  194. package/src/cli.ts +20 -33
  195. package/src/ipc/job_proc_lazy_main.ts +16 -5
  196. package/src/llm/chat_context.ts +35 -0
  197. package/src/llm/provider_format/index.ts +7 -2
  198. package/src/llm/provider_format/openai.test.ts +385 -1
  199. package/src/llm/provider_format/openai.ts +103 -0
  200. package/src/llm/provider_format/utils.ts +6 -4
  201. package/src/llm/realtime.ts +1 -0
  202. package/src/log.ts +5 -2
  203. package/src/stream/deferred_stream.ts +17 -6
  204. package/src/stream/index.ts +1 -0
  205. package/src/stream/multi_input_stream.test.ts +540 -0
  206. package/src/stream/multi_input_stream.ts +172 -0
  207. package/src/telemetry/trace_types.ts +18 -0
  208. package/src/utils.test.ts +87 -0
  209. package/src/utils.ts +52 -2
  210. package/src/version.ts +1 -1
  211. package/src/voice/agent.test.ts +140 -2
  212. package/src/voice/agent.ts +189 -10
  213. package/src/voice/agent_activity.ts +449 -286
  214. package/src/voice/agent_session.ts +195 -51
  215. package/src/voice/audio_recognition.ts +118 -38
  216. package/src/voice/audio_recognition_span.test.ts +261 -0
  217. package/src/voice/generation.ts +52 -23
  218. package/src/voice/index.ts +1 -1
  219. package/src/voice/io.ts +7 -4
  220. package/src/voice/recorder_io/recorder_io.ts +2 -1
  221. package/src/voice/room_io/_input.ts +11 -7
  222. package/src/voice/room_io/room_io.ts +12 -0
  223. package/src/voice/speech_handle.ts +9 -2
  224. package/src/voice/testing/run_result.ts +81 -23
  225. package/src/voice/utils.ts +29 -0
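
Two themes dominate the agent_activity.ts hunks below: audio input moves from the single-source DeferredReadableStream to a new MultiInputStream, and the old "draining" flag becomes a pause/resume scheduling lifecycle. As orientation for the first change, here is a minimal sketch of the multi-input pattern (several ReadableStreams merged into one output) written against the standard Web Streams API; it is an illustrative stand-in, not the implementation shipped in package/src/stream/multi_input_stream.ts.

    // Illustrative stand-in only: the real MultiInputStream lives in
    // package/src/stream/multi_input_stream.ts and exposes addInputStream()/close()
    // per the call sites visible in the hunks below. Error handling omitted.
    function mergeStreams<T>(inputs: ReadableStream<T>[]): ReadableStream<T> {
      let active = inputs.length;
      return new ReadableStream<T>({
        start(controller) {
          if (active === 0) {
            controller.close();
            return;
          }
          for (const input of inputs) {
            void (async () => {
              const reader = input.getReader();
              try {
                for (;;) {
                  const { done, value } = await reader.read();
                  if (done) break;
                  controller.enqueue(value); // interleave chunks as they arrive
                }
              } finally {
                reader.releaseLock();
                active -= 1;
                if (active === 0) controller.close(); // close once every input has ended
              }
            })();
          }
        },
      });
    }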
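
The rewritten createSpeechTask (hunk at old line 814 below) wraps every task body in agentActivityStorage.run() and, when the task is owned by a speech handle, speechHandleStorage.run(), so code anywhere in the task's async call tree can recover its activity and handle without threading parameters through. A minimal sketch of the mechanism, using Node's standard AsyncLocalStorage (the store name below is a hypothetical stand-in):

    import { AsyncLocalStorage } from 'node:async_hooks';

    // Hypothetical stand-in for speechHandleStorage / agentActivityStorage.
    const speechStore = new AsyncLocalStorage<{ id: string }>();

    async function deepInsideTask(): Promise<void> {
      // Every async callee inside run()'s call tree sees the same store value.
      console.log(speechStore.getStore()?.id); // -> 'speech-1'
    }

    speechStore.run({ id: 'speech-1' }, () => {
      void deepInsideTask();
    });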
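
commitUserTurn (hunk at old line 426 below) drops its hardcoded audioDetached constant in favor of an options object. A sketch of the call shapes, using only names from that hunk; the structural type is a stand-in for an AgentActivity instance:

    // Call shapes for the widened commitUserTurn(); defaults come from the hunk below.
    declare const activity: {
      commitUserTurn(options?: { audioDetached?: boolean; throwIfNotReady?: boolean }): void;
    };

    activity.commitUserTurn(); // audioDetached=false, throwIfNotReady=true (may throw)
    activity.commitUserTurn({ throwIfNotReady: false }); // no-op if recognition is not ready
    activity.commitUserTurn({ audioDetached: true }); // commit a turn whose audio input was detached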
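
interrupt() (hunk at old line 1040 below) now cancels any preemptive generation up front and forwards a force flag to the current speech and every queued speech handle. A sketch of the two call shapes, per the signature in that hunk:

    // `activity` is a structural stand-in; the real method returns the package's Future<void>.
    declare const activity: { interrupt(options?: { force?: boolean }): unknown };

    activity.interrupt(); // cooperative interrupt, as before
    activity.interrupt({ force: true }); // also force-interrupts queued speech handles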
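
Taken together, start/resume (old line 210), pause (old line 2324), drain, and close give AgentActivity a full pause/resume lifecycle. A structural sketch of the ordering the hunks imply; the method names are real, the harness around them is assumed:

    // Lifecycle ordering implied by this diff; the structural type mirrors the
    // methods shown in the hunks, not an exported type.
    async function lifecycle(activity: {
      start(): Promise<void>;
      pause(options?: { blockedTasks?: unknown[] }): Promise<void>;
      resume(): Promise<void>;
      drain(): Promise<void>;
      close(): Promise<void>;
    }): Promise<void> {
      await activity.start();  // opens a root span, wires models, runs onEnter
      await activity.pause();  // pauses speech scheduling, closes session resources
      await activity.resume(); // restarts scheduling; onEnter is not re-run
      await activity.drain();  // runs onExit, then waits for pending speech tasks
      await activity.close();  // unregisters handlers, cancels the main task
    }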
package/src/voice/agent_activity.ts

@@ -35,7 +35,7 @@ import type {
   TTSMetrics,
   VADMetrics,
 } from '../metrics/base.js';
-import { DeferredReadableStream } from '../stream/deferred_stream.js';
+import { MultiInputStream } from '../stream/multi_input_stream.js';
 import { STT, type STTError, type SpeechEvent } from '../stt/stt.js';
 import { recordRealtimeMetrics, traceTypes, tracer } from '../telemetry/index.js';
 import { splitWords } from '../tokenize/basic/word.js';
@@ -43,7 +43,13 @@ import { TTS, type TTSError } from '../tts/tts.js';
 import { Future, Task, cancelAndWait, waitFor } from '../utils.js';
 import { VAD, type VADEvent } from '../vad.js';
 import type { Agent, ModelSettings } from './agent.js';
-import { StopResponse, asyncLocalStorage } from './agent.js';
+import {
+  StopResponse,
+  _getActivityTaskInfo,
+  _setActivityTaskInfo,
+  functionCallStorage,
+  speechHandleStorage,
+} from './agent.js';
 import { type AgentSession, type TurnDetectionMode } from './agent_session.js';
 import {
   AudioRecognition,
@@ -60,7 +66,7 @@ import {
   createSpeechCreatedEvent,
   createUserInputTranscribedEvent,
 } from './events.js';
-import type { ToolExecutionOutput, _TTSGenerationData } from './generation.js';
+import type { ToolExecutionOutput, ToolOutput, _TTSGenerationData } from './generation.js';
 import {
   type _AudioOut,
   type _TextOut,
@@ -74,8 +80,9 @@ import {
 } from './generation.js';
 import type { TimedString } from './io.js';
 import { SpeechHandle } from './speech_handle.js';
+import { setParticipantSpanAttributes } from './utils.js';
 
-const speechHandleStorage = new AsyncLocalStorage<SpeechHandle>();
+export const agentActivityStorage = new AsyncLocalStorage<AgentActivity>();
 
 interface PreemptiveGeneration {
   speechHandle: SpeechHandle;
@@ -88,31 +95,47 @@ interface PreemptiveGeneration {
 }
 
 export class AgentActivity implements RecognitionHooks {
+  agent: Agent;
+  agentSession: AgentSession;
+
   private static readonly REPLY_TASK_CANCEL_TIMEOUT = 5000;
+
   private started = false;
   private audioRecognition?: AudioRecognition;
   private realtimeSession?: RealtimeSession;
   private realtimeSpans?: Map<string, Span>; // Maps response_id to OTEL span for metrics recording
   private turnDetectionMode?: Exclude<TurnDetectionMode, _TurnDetector>;
   private logger = log();
-  private _draining = false;
+  private _schedulingPaused = true;
+  private _drainBlockedTasks: Task<any>[] = [];
   private _currentSpeech?: SpeechHandle;
   private speechQueue: Heap<[number, number, SpeechHandle]>; // [priority, timestamp, speechHandle]
   private q_updated: Future;
   private speechTasks: Set<Task<void>> = new Set();
   private lock = new Mutex();
-  private audioStream = new DeferredReadableStream<AudioFrame>();
+  private audioStream = new MultiInputStream<AudioFrame>();
+  private audioStreamId?: string;
+
   // default to null as None, which maps to the default provider tool choice value
   private toolChoice: ToolChoice | null = null;
   private _preemptiveGeneration?: PreemptiveGeneration;
 
-  agent: Agent;
-  agentSession: AgentSession;
-
   /** @internal */
   _mainTask?: Task<void>;
-  _userTurnCompletedTask?: Promise<void>;
-
+  _onEnterTask?: Task<void>;
+  _onExitTask?: Task<void>;
+  _userTurnCompletedTask?: Task<void>;
+
+  private readonly onRealtimeGenerationCreated = (ev: GenerationCreatedEvent) =>
+    this.onGenerationCreated(ev);
+  private readonly onRealtimeInputSpeechStarted = (ev: InputSpeechStartedEvent) =>
+    this.onInputSpeechStarted(ev);
+  private readonly onRealtimeInputSpeechStopped = (ev: InputSpeechStoppedEvent) =>
+    this.onInputSpeechStopped(ev);
+  private readonly onRealtimeInputAudioTranscriptionCompleted = (ev: InputTranscriptionCompleted) =>
+    this.onInputAudioTranscriptionCompleted(ev);
+  private readonly onModelError = (ev: RealtimeModelError | STTError | TTSError | LLMError) =>
+    this.onError(ev);
   constructor(agent: Agent, agentSession: AgentSession) {
     this.agent = agent;
     this.agentSession = agentSession;
@@ -132,7 +155,7 @@ export class AgentActivity implements RecognitionHooks {
 
     if (this.turnDetectionMode === 'vad' && this.vad === undefined) {
       this.logger.warn(
-        'turnDetection is set to "vad", but no VAD model is provided, ignoring the turnDdetection setting',
+        'turnDetection is set to "vad", but no VAD model is provided, ignoring the turnDetection setting',
       );
       this.turnDetectionMode = undefined;
     }
@@ -210,117 +233,138 @@ export class AgentActivity implements RecognitionHooks {
   async start(): Promise<void> {
     const unlock = await this.lock.lock();
     try {
-      // Create start_agent_activity as a ROOT span (new trace) to match Python behavior
-      const startSpan = tracer.startSpan({
-        name: 'start_agent_activity',
-        attributes: { [traceTypes.ATTR_AGENT_LABEL]: this.agent.id },
-        context: ROOT_CONTEXT,
-      });
+      await this._startSession({ spanName: 'start_agent_activity', runOnEnter: true });
+    } finally {
+      unlock();
+    }
+  }
 
-      this.agent._agentActivity = this;
+  async resume(): Promise<void> {
+    const unlock = await this.lock.lock();
+    try {
+      await this._startSession({ spanName: 'resume_agent_activity', runOnEnter: false });
+    } finally {
+      unlock();
+    }
+  }
 
-      if (this.llm instanceof RealtimeModel) {
-        this.realtimeSession = this.llm.session();
-        this.realtimeSpans = new Map<string, Span>();
-        this.realtimeSession.on('generation_created', (ev) => this.onGenerationCreated(ev));
-        this.realtimeSession.on('input_speech_started', (ev) => this.onInputSpeechStarted(ev));
-        this.realtimeSession.on('input_speech_stopped', (ev) => this.onInputSpeechStopped(ev));
-        this.realtimeSession.on('input_audio_transcription_completed', (ev) =>
-          this.onInputAudioTranscriptionCompleted(ev),
-        );
-        this.realtimeSession.on('metrics_collected', (ev) => this.onMetricsCollected(ev));
-        this.realtimeSession.on('error', (ev) => this.onError(ev));
-
-        removeInstructions(this.agent._chatCtx);
-        try {
-          await this.realtimeSession.updateInstructions(this.agent.instructions);
-        } catch (error) {
-          this.logger.error(error, 'failed to update the instructions');
-        }
+  private async _startSession(options: {
+    spanName: 'start_agent_activity' | 'resume_agent_activity';
+    runOnEnter: boolean;
+  }): Promise<void> {
+    const { spanName, runOnEnter } = options;
+    const startSpan = tracer.startSpan({
+      name: spanName,
+      attributes: { [traceTypes.ATTR_AGENT_LABEL]: this.agent.id },
+      context: ROOT_CONTEXT,
+    });
 
-      try {
-        await this.realtimeSession.updateChatCtx(this.agent.chatCtx);
-      } catch (error) {
-        this.logger.error(error, 'failed to update the chat context');
-      }
+    this.agent._agentActivity = this;
 
-      try {
-        await this.realtimeSession.updateTools(this.tools);
-      } catch (error) {
-        this.logger.error(error, 'failed to update the tools');
-      }
+    if (this.llm instanceof RealtimeModel) {
+      this.realtimeSession = this.llm.session();
+      this.realtimeSpans = new Map<string, Span>();
+      this.realtimeSession.on('generation_created', this.onRealtimeGenerationCreated);
+      this.realtimeSession.on('input_speech_started', this.onRealtimeInputSpeechStarted);
+      this.realtimeSession.on('input_speech_stopped', this.onRealtimeInputSpeechStopped);
+      this.realtimeSession.on(
+        'input_audio_transcription_completed',
+        this.onRealtimeInputAudioTranscriptionCompleted,
+      );
+      this.realtimeSession.on('metrics_collected', this.onMetricsCollected);
+      this.realtimeSession.on('error', this.onModelError);
 
-      if (!this.llm.capabilities.audioOutput && !this.tts && this.agentSession.output.audio) {
-        this.logger.error(
-          'audio output is enabled but RealtimeModel has no audio modality ' +
-            'and no TTS is set. Either enable audio modality in the RealtimeModel ' +
-            'or set a TTS model.',
-        );
-      }
-    } else if (this.llm instanceof LLM) {
-      try {
-        updateInstructions({
-          chatCtx: this.agent._chatCtx,
-          instructions: this.agent.instructions,
-          addIfMissing: true,
-        });
-      } catch (error) {
-        this.logger.error('failed to update the instructions', error);
-      }
+      removeInstructions(this.agent._chatCtx);
+      try {
+        await this.realtimeSession.updateInstructions(this.agent.instructions);
+      } catch (error) {
+        this.logger.error(error, 'failed to update the instructions');
      }
 
-      // metrics and error handling
-      if (this.llm instanceof LLM) {
-        this.llm.on('metrics_collected', (ev) => this.onMetricsCollected(ev));
-        this.llm.on('error', (ev) => this.onError(ev));
+      try {
+        await this.realtimeSession.updateChatCtx(this.agent.chatCtx);
+      } catch (error) {
+        this.logger.error(error, 'failed to update the chat context');
      }
 
-      if (this.stt instanceof STT) {
-        this.stt.on('metrics_collected', (ev) => this.onMetricsCollected(ev));
-        this.stt.on('error', (ev) => this.onError(ev));
+      try {
+        await this.realtimeSession.updateTools(this.tools);
+      } catch (error) {
+        this.logger.error(error, 'failed to update the tools');
      }
 
-      if (this.tts instanceof TTS) {
-        this.tts.on('metrics_collected', (ev) => this.onMetricsCollected(ev));
-        this.tts.on('error', (ev) => this.onError(ev));
+      if (!this.llm.capabilities.audioOutput && !this.tts && this.agentSession.output.audio) {
+        this.logger.error(
+          'audio output is enabled but RealtimeModel has no audio modality ' +
+            'and no TTS is set. Either enable audio modality in the RealtimeModel ' +
+            'or set a TTS model.',
+        );
      }
-
-      if (this.vad instanceof VAD) {
-        this.vad.on('metrics_collected', (ev) => this.onMetricsCollected(ev));
+    } else if (this.llm instanceof LLM) {
+      try {
+        updateInstructions({
+          chatCtx: this.agent._chatCtx,
+          instructions: this.agent.instructions,
+          addIfMissing: true,
+        });
+      } catch (error) {
+        this.logger.error('failed to update the instructions', error);
      }
+    }
 
-      this.audioRecognition = new AudioRecognition({
-        recognitionHooks: this,
-        // Disable stt node if stt is not provided
-        stt: this.stt ? (...args) => this.agent.sttNode(...args) : undefined,
-        vad: this.vad,
-        turnDetector: typeof this.turnDetection === 'string' ? undefined : this.turnDetection,
-        turnDetectionMode: this.turnDetectionMode,
-        minEndpointingDelay: this.agentSession.options.minEndpointingDelay,
-        maxEndpointingDelay: this.agentSession.options.maxEndpointingDelay,
-        rootSpanContext: this.agentSession.rootSpanContext,
-      });
-      this.audioRecognition.start();
-      this.started = true;
+    // metrics and error handling
+    if (this.llm instanceof LLM) {
+      this.llm.on('metrics_collected', this.onMetricsCollected);
+      this.llm.on('error', this.onModelError);
+    }
 
-      this._mainTask = Task.from(({ signal }) => this.mainTask(signal));
+    if (this.stt instanceof STT) {
+      this.stt.on('metrics_collected', this.onMetricsCollected);
+      this.stt.on('error', this.onModelError);
+    }
 
-      // Create on_enter as a child of start_agent_activity in the new trace
-      const onEnterTask = tracer.startActiveSpan(async () => this.agent.onEnter(), {
-        name: 'on_enter',
-        context: trace.setSpan(ROOT_CONTEXT, startSpan),
-        attributes: { [traceTypes.ATTR_AGENT_LABEL]: this.agent.id },
-      });
+    if (this.tts instanceof TTS) {
+      this.tts.on('metrics_collected', this.onMetricsCollected);
+      this.tts.on('error', this.onModelError);
+    }
 
-      this.createSpeechTask({
-        task: Task.from(() => onEnterTask),
+    if (this.vad instanceof VAD) {
+      this.vad.on('metrics_collected', this.onMetricsCollected);
+    }
+
+    this.audioRecognition = new AudioRecognition({
+      recognitionHooks: this,
+      // Disable stt node if stt is not provided
+      stt: this.stt ? (...args) => this.agent.sttNode(...args) : undefined,
+      vad: this.vad,
+      turnDetector: typeof this.turnDetection === 'string' ? undefined : this.turnDetection,
+      turnDetectionMode: this.turnDetectionMode,
+      minEndpointingDelay: this.agentSession.options.minEndpointingDelay,
+      maxEndpointingDelay: this.agentSession.options.maxEndpointingDelay,
+      rootSpanContext: this.agentSession.rootSpanContext,
+      sttModel: this.stt?.label,
+      sttProvider: this.getSttProvider(),
+      getLinkedParticipant: () => this.agentSession._roomIO?.linkedParticipant,
+    });
+    this.audioRecognition.start();
+    this.started = true;
+
+    this._resumeSchedulingTask();
+
+    if (runOnEnter) {
+      this._onEnterTask = this.createSpeechTask({
+        taskFn: () =>
+          tracer.startActiveSpan(async () => this.agent.onEnter(), {
+            name: 'on_enter',
+            context: trace.setSpan(ROOT_CONTEXT, startSpan),
+            attributes: { [traceTypes.ATTR_AGENT_LABEL]: this.agent.id },
+          }),
+        inlineTask: true,
         name: 'AgentActivity_onEnter',
       });
-
-      startSpan.end();
-    } finally {
-      unlock();
     }
+
+    startSpan.end();
   }
 
   get currentSpeech(): SpeechHandle | undefined {
@@ -335,6 +379,17 @@ export class AgentActivity implements RecognitionHooks {
     return this.agent.stt || this.agentSession.stt;
   }
 
+  private getSttProvider(): string | undefined {
+    const label = this.stt?.label;
+    if (!label) {
+      return undefined;
+    }
+
+    // Heuristic: most labels look like "<provider>-<model>"
+    const [provider] = label.split('-', 1);
+    return provider || label;
+  }
+
   get llm(): LLM | RealtimeModel | undefined {
     return this.agent.llm || this.agentSession.llm;
   }
@@ -347,8 +402,8 @@ export class AgentActivity implements RecognitionHooks {
     return this.agent.toolCtx;
   }
 
-  get draining(): boolean {
-    return this._draining;
+  get schedulingPaused(): boolean {
+    return this._schedulingPaused;
   }
 
   get realtimeLLMSession(): RealtimeSession | undefined {
@@ -402,18 +457,10 @@ export class AgentActivity implements RecognitionHooks {
   }
 
   attachAudioInput(audioStream: ReadableStream<AudioFrame>): void {
-    if (this.audioStream.isSourceSet) {
-      this.logger.debug('detaching existing audio input in agent activity');
-      this.audioStream.detachSource();
-    }
+    void this.audioStream.close();
+    this.audioStream = new MultiInputStream<AudioFrame>();
 
-    /**
-     * We need to add a deferred ReadableStream layer on top of the audioStream from the agent session.
-     * The tee() operation should be applied to the deferred stream, not the original audioStream.
-     * This is important because teeing the original stream directly makes it very difficult—if not
-     * impossible—to implement stream unlock logic cleanly.
-     */
-    this.audioStream.setSource(audioStream);
+    this.audioStreamId = this.audioStream.addInputStream(audioStream);
     const [realtimeAudioStream, recognitionAudioStream] = this.audioStream.stream.tee();
 
     if (this.realtimeSession) {
@@ -426,16 +473,29 @@ export class AgentActivity implements RecognitionHooks {
   }
 
   detachAudioInput(): void {
-    this.audioStream.detachSource();
+    if (this.audioStreamId === undefined) {
+      return;
+    }
+
+    void this.audioStream.close();
+    this.audioStream = new MultiInputStream<AudioFrame>();
+    this.audioStreamId = undefined;
   }
 
-  commitUserTurn() {
+  commitUserTurn(
+    options: {
+      audioDetached?: boolean;
+      throwIfNotReady?: boolean;
+    } = {},
+  ) {
+    const { audioDetached = false, throwIfNotReady = true } = options;
     if (!this.audioRecognition) {
-      throw new Error('AudioRecognition is not initialized');
+      if (throwIfNotReady) {
+        throw new Error('AudioRecognition is not initialized');
+      }
+      return;
     }
 
-    // TODO(brian): add audio_detached flag
-    const audioDetached = false;
     this.audioRecognition.commitUserTurn(audioDetached);
   }
 
@@ -493,14 +553,13 @@ export class AgentActivity implements RecognitionHooks {
       }),
     );
     const task = this.createSpeechTask({
-      task: Task.from((abortController: AbortController) =>
+      taskFn: (abortController: AbortController) =>
        this.ttsTask(handle, text, addToChatCtx, {}, abortController, audio),
-      ),
      ownedSpeechHandle: handle,
      name: 'AgentActivity.say_tts',
    });
 
-    task.finally(() => this.onPipelineReplyDone());
+    task.result.finally(() => this.onPipelineReplyDone());
    this.scheduleSpeech(handle, SpeechHandle.SPEECH_PRIORITY_NORMAL);
    return handle;
  }
@@ -613,9 +672,9 @@ export class AgentActivity implements RecognitionHooks {
      return;
    }
 
-    if (this.draining) {
+    if (this.schedulingPaused) {
      // TODO(shubhra): should we "forward" this new turn to the next agent?
-      this.logger.warn('skipping new realtime generation, the agent is draining');
+      this.logger.warn('skipping new realtime generation, the speech scheduling is not running');
      return;
    }
 
@@ -633,9 +692,8 @@ export class AgentActivity implements RecognitionHooks {
    this.logger.info({ speech_id: handle.id }, 'Creating speech handle');
 
    this.createSpeechTask({
-      task: Task.from((abortController: AbortController) =>
+      taskFn: (abortController: AbortController) =>
        this.realtimeGenerationTask(handle, ev, {}, abortController),
-      ),
      ownedSpeechHandle: handle,
      name: 'AgentActivity.realtimeGeneration',
    });
@@ -767,7 +825,7 @@ export class AgentActivity implements RecognitionHooks {
  onPreemptiveGeneration(info: PreemptiveGenerationInfo): void {
    if (
      !this.agentSession.options.preemptiveGeneration ||
-      this.draining ||
+      this.schedulingPaused ||
      (this._currentSpeech !== undefined && !this._currentSpeech.interrupted) ||
      !(this.llm instanceof LLM)
    ) {
@@ -814,11 +872,32 @@ export class AgentActivity implements RecognitionHooks {
  }
 
  private createSpeechTask(options: {
-    task: Task<void>;
+    taskFn: (controller: AbortController) => Promise<void>;
+    controller?: AbortController;
    ownedSpeechHandle?: SpeechHandle;
+    inlineTask?: boolean;
    name?: string;
-  }): Promise<void> {
-    const { task, ownedSpeechHandle } = options;
+  }): Task<void> {
+    const { taskFn, controller, ownedSpeechHandle, inlineTask, name } = options;
+
+    const wrappedFn = (ctrl: AbortController) => {
+      return agentActivityStorage.run(this, () => {
+        // Mark inline/speech metadata at task runtime to avoid a race where taskFn executes
+        // before post-construction metadata is attached to the Task instance.
+        const currentTask = Task.current();
+        if (currentTask) {
+          _setActivityTaskInfo(currentTask, { speechHandle: ownedSpeechHandle, inlineTask });
+        }
+
+        if (ownedSpeechHandle) {
+          return speechHandleStorage.run(ownedSpeechHandle, () => taskFn(ctrl));
+        }
+        return taskFn(ctrl);
+      });
+    };
+
+    const task = Task.from(wrappedFn, controller, name);
+    _setActivityTaskInfo(task, { speechHandle: ownedSpeechHandle, inlineTask });
 
    this.speechTasks.add(task);
    task.addDoneCallback(() => {
@@ -838,13 +917,16 @@ export class AgentActivity implements RecognitionHooks {
      this.wakeupMainTask();
    });
 
-    return task.result;
+    return task;
  }
 
  async onEndOfTurn(info: EndOfTurnInfo): Promise<boolean> {
-    if (this.draining) {
+    if (this.schedulingPaused) {
      this.cancelPreemptiveGeneration();
-      this.logger.warn({ user_input: info.newTranscript }, 'skipping user input, task is draining');
+      this.logger.warn(
+        { user_input: info.newTranscript },
+        'skipping user input, speech scheduling is paused',
+      );
      // TODO(shubhra): should we "forward" this new turn to the next agent/activity?
      return true;
    }
@@ -877,7 +959,7 @@ export class AgentActivity implements RecognitionHooks {
 
    const oldTask = this._userTurnCompletedTask;
    this._userTurnCompletedTask = this.createSpeechTask({
-      task: Task.from(() => this.userTurnCompleted(info, oldTask)),
+      taskFn: () => this.userTurnCompleted(info, oldTask),
      name: 'AgentActivity.userTurnCompleted',
    });
    return true;
@@ -913,10 +995,12 @@ export class AgentActivity implements RecognitionHooks {
      this._currentSpeech = undefined;
    }
 
-      // If we're draining and there are no more speech tasks, we can exit.
-      // Only speech tasks can bypass draining to create a tool response
-      if (this.draining && this.speechTasks.size === 0) {
-        this.logger.info('mainTask: draining and no more speech tasks');
+      // if we're draining/pausing and there are no more speech tasks, we can exit.
+      // only speech tasks can bypass draining to create a tool response (see scheduleSpeech)
+      const toWait = this.getDrainPendingSpeechTasks();
+
+      if (this._schedulingPaused && toWait.length === 0) {
+        this.logger.info('mainTask: scheduling paused and no more speech tasks to wait');
        break;
      }
 
@@ -926,6 +1010,39 @@ export class AgentActivity implements RecognitionHooks {
    this.logger.info('AgentActivity mainTask: exiting');
  }
 
+  private getDrainPendingSpeechTasks(): Task<void>[] {
+    const blockedHandles: SpeechHandle[] = [];
+
+    for (const task of this._drainBlockedTasks) {
+      const info = _getActivityTaskInfo(task);
+      if (!info) {
+        this.logger.error('blocked task without activity info; skipping.');
+        continue;
+      }
+
+      if (!info.speechHandle) {
+        continue; // onEnter/onExit
+      }
+
+      blockedHandles.push(info.speechHandle);
+    }
+
+    const toWait: Task<void>[] = [];
+    for (const task of this.speechTasks) {
+      if (this._drainBlockedTasks.includes(task)) {
+        continue;
+      }
+
+      const info = _getActivityTaskInfo(task);
+      if (info && info.speechHandle && blockedHandles.includes(info.speechHandle)) {
+        continue;
+      }
+
+      toWait.push(task);
+    }
+    return toWait;
+  }
+
  private wakeupMainTask(): void {
    this.q_updated.resolve();
  }
@@ -967,7 +1084,7 @@ export class AgentActivity implements RecognitionHooks {
      throw new Error('trying to generate reply without an LLM model');
    }
 
-    const functionCall = asyncLocalStorage.getStore()?.functionCall;
+    const functionCall = functionCallStorage.getStore()?.functionCall;
    if (toolChoice === undefined && functionCall !== undefined) {
      // when generateReply is called inside a tool, set toolChoice to 'none' by default
      toolChoice = 'none';
@@ -989,7 +1106,7 @@ export class AgentActivity implements RecognitionHooks {
 
    if (this.llm instanceof RealtimeModel) {
      this.createSpeechTask({
-        task: Task.from((abortController: AbortController) =>
+        taskFn: (abortController: AbortController) =>
          this.realtimeReplyTask({
            speechHandle: handle,
            // TODO(brian): support llm.ChatMessage for the realtime model
@@ -1001,7 +1118,6 @@ export class AgentActivity implements RecognitionHooks {
          },
          abortController,
        }),
-        ),
        ownedSpeechHandle: handle,
        name: 'AgentActivity.realtimeReply',
      });
@@ -1014,7 +1130,7 @@ export class AgentActivity implements RecognitionHooks {
    }
 
    const task = this.createSpeechTask({
-      task: Task.from((abortController: AbortController) =>
+      taskFn: (abortController: AbortController) =>
        this.pipelineReplyTask(
          handle,
          chatCtx ?? this.agent.chatCtx,
@@ -1026,12 +1142,11 @@ export class AgentActivity implements RecognitionHooks {
          instructions,
          userMessage,
        ),
-      ),
      ownedSpeechHandle: handle,
      name: 'AgentActivity.pipelineReply',
    });
 
-      task.finally(() => this.onPipelineReplyDone());
+      task.result.finally(() => this.onPipelineReplyDone());
    }
 
    if (scheduleSpeech) {
@@ -1040,16 +1155,19 @@ export class AgentActivity implements RecognitionHooks {
    return handle;
  }
 
-  interrupt(): Future<void> {
+  interrupt(options: { force?: boolean } = {}): Future<void> {
+    const { force = false } = options;
+    this.cancelPreemptiveGeneration();
+
    const future = new Future<void>();
    const currentSpeech = this._currentSpeech;
 
    //TODO(AJS-273): add interrupt for background speeches
 
-    currentSpeech?.interrupt();
+    currentSpeech?.interrupt(force);
 
    for (const [_, __, speech] of this.speechQueue) {
-      speech.interrupt();
+      speech.interrupt(force);
    }
 
    this.realtimeSession?.interrupt();
@@ -1072,13 +1190,13 @@ export class AgentActivity implements RecognitionHooks {
    }
  }
 
-  private async userTurnCompleted(info: EndOfTurnInfo, oldTask?: Promise<void>): Promise<void> {
+  private async userTurnCompleted(info: EndOfTurnInfo, oldTask?: Task<void>): Promise<void> {
    if (oldTask) {
      // We never cancel user code as this is very confusing.
      // So we wait for the old execution of onUserTurnCompleted to finish.
      // In practice this is OK because most speeches will be interrupted if a new turn
      // is detected. So the previous execution should complete quickly.
-      await oldTask;
+      await oldTask.result;
    }
 
    // When the audio recognition detects the end of a user turn:
@@ -1355,6 +1473,11 @@ export class AgentActivity implements RecognitionHooks {
      span.setAttribute(traceTypes.ATTR_USER_INPUT, newMessage.textContent || '');
    }
 
+    const localParticipant = this.agentSession._roomIO?.localParticipant;
+    if (localParticipant) {
+      setParticipantSpanAttributes(span, localParticipant);
+    }
+
    speechHandleStorage.enterWith(speechHandle);
 
    const audioOutput = this.agentSession.output.audioEnabled
@@ -1531,13 +1654,15 @@ export class AgentActivity implements RecognitionHooks {
      for (const msg of toolsMessages) {
        msg.createdAt = replyStartedAt;
      }
-      this.agent._chatCtx.insert(toolsMessages);
-      // Only add FunctionCallOutput items to session history since FunctionCall items
-      // were already added by onToolExecutionStarted when the tool execution began
+      // Only insert FunctionCallOutput items into agent._chatCtx since FunctionCall items
+      // were already added by onToolExecutionStarted when the tool execution began.
+      // Inserting function_calls again would create duplicates that break provider APIs
+      // (e.g. Google's "function response parts != function call parts" error).
      const toolCallOutputs = toolsMessages.filter(
        (m): m is FunctionCallOutput => m.type === 'function_call_output',
      );
      if (toolCallOutputs.length > 0) {
+        this.agent._chatCtx.insert(toolCallOutputs);
        this.agentSession._toolItemsAdded(toolCallOutputs);
      }
    }
@@ -1645,52 +1770,18 @@ export class AgentActivity implements RecognitionHooks {
      return;
    }
 
-    const functionToolsExecutedEvent = createFunctionToolsExecutedEvent({
-      functionCalls: [],
-      functionCallOutputs: [],
-    });
-    let shouldGenerateToolReply: boolean = false;
-    let newAgentTask: Agent | null = null;
-    let ignoreTaskSwitch: boolean = false;
-
-    for (const sanitizedOut of toolOutput.output) {
-      if (sanitizedOut.toolCallOutput !== undefined) {
-        functionToolsExecutedEvent.functionCalls.push(sanitizedOut.toolCall);
-        functionToolsExecutedEvent.functionCallOutputs.push(sanitizedOut.toolCallOutput);
-        if (sanitizedOut.replyRequired) {
-          shouldGenerateToolReply = true;
-        }
-      }
-
-      if (newAgentTask !== null && sanitizedOut.agentTask !== undefined) {
-        this.logger.error('expected to receive only one agent task from the tool executions');
-        ignoreTaskSwitch = true;
-        // TODO(brian): should we mark the function call as failed to notify the LLM?
-      }
-
-      newAgentTask = sanitizedOut.agentTask ?? null;
-
-      this.logger.debug(
-        {
-          speechId: speechHandle.id,
-          name: sanitizedOut.toolCall?.name,
-          args: sanitizedOut.toolCall.args,
-          output: sanitizedOut.toolCallOutput?.output,
-          isError: sanitizedOut.toolCallOutput?.isError,
-        },
-        'Tool call execution finished',
-      );
-    }
+    const { functionToolsExecutedEvent, shouldGenerateToolReply, newAgentTask, ignoreTaskSwitch } =
+      this.summarizeToolExecutionOutput(toolOutput, speechHandle);
 
    this.agentSession.emit(
      AgentSessionEventTypes.FunctionToolsExecuted,
      functionToolsExecutedEvent,
    );
 
-    let draining = this.draining;
+    let schedulingPaused = this.schedulingPaused;
    if (!ignoreTaskSwitch && newAgentTask !== null) {
      this.agentSession.updateAgent(newAgentTask);
-      draining = true;
+      schedulingPaused = true;
    }
 
    const toolMessages = [
@@ -1705,11 +1796,12 @@ export class AgentActivity implements RecognitionHooks {
 
    // Avoid setting tool_choice to "required" or a specific function when
    // passing tool response back to the LLM
-    const respondToolChoice = draining || modelSettings.toolChoice === 'none' ? 'none' : 'auto';
+    const respondToolChoice =
+      schedulingPaused || modelSettings.toolChoice === 'none' ? 'none' : 'auto';
 
    // Reuse same speechHandle for tool response (parity with Python agent_activity.py L2122-2140)
    const toolResponseTask = this.createSpeechTask({
-      task: Task.from(() =>
+      taskFn: () =>
        this.pipelineReplyTask(
          speechHandle,
          chatCtx,
@@ -1720,12 +1812,11 @@ export class AgentActivity implements RecognitionHooks {
          undefined,
          toolMessages,
        ),
-      ),
      ownedSpeechHandle: speechHandle,
      name: 'AgentActivity.pipelineReply',
    });
 
-    toolResponseTask.finally(() => this.onPipelineReplyDone());
+    toolResponseTask.result.finally(() => this.onPipelineReplyDone());
 
    this.scheduleSpeech(speechHandle, SpeechHandle.SPEECH_PRIORITY_NORMAL, true);
  } else if (functionToolsExecutedEvent.functionCallOutputs.length > 0) {
@@ -1733,15 +1824,12 @@ export class AgentActivity implements RecognitionHooks {
      msg.createdAt = replyStartedAt;
    }
 
-    this.agent._chatCtx.insert(toolMessages);
-
-    // Only add FunctionCallOutput items to session history since FunctionCall items
-    // were already added by onToolExecutionStarted when the tool execution began
    const toolCallOutputs = toolMessages.filter(
      (m): m is FunctionCallOutput => m.type === 'function_call_output',
    );
 
    if (toolCallOutputs.length > 0) {
+      this.agent._chatCtx.insert(toolCallOutputs);
      this.agentSession._toolItemsAdded(toolCallOutputs);
    }
  }
@@ -1815,6 +1903,11 @@ export class AgentActivity implements RecognitionHooks {
 
    span.setAttribute(traceTypes.ATTR_SPEECH_ID, speechHandle.id);
 
+    const localParticipant = this.agentSession._roomIO?.localParticipant;
+    if (localParticipant) {
+      setParticipantSpanAttributes(span, localParticipant);
+    }
+
    speechHandleStorage.enterWith(speechHandle);
 
    if (!this.realtimeSession) {
@@ -2139,50 +2232,18 @@ export class AgentActivity implements RecognitionHooks {
      return;
    }
 
-    const functionToolsExecutedEvent = createFunctionToolsExecutedEvent({
-      functionCalls: [],
-      functionCallOutputs: [],
-    });
-    let shouldGenerateToolReply: boolean = false;
-    let newAgentTask: Agent | null = null;
-    let ignoreTaskSwitch: boolean = false;
-
-    for (const sanitizedOut of toolOutput.output) {
-      if (sanitizedOut.toolCallOutput !== undefined) {
-        functionToolsExecutedEvent.functionCallOutputs.push(sanitizedOut.toolCallOutput);
-        if (sanitizedOut.replyRequired) {
-          shouldGenerateToolReply = true;
-        }
-      }
-
-      if (newAgentTask !== null && sanitizedOut.agentTask !== undefined) {
-        this.logger.error('expected to receive only one agent task from the tool executions');
-        ignoreTaskSwitch = true;
-      }
-
-      newAgentTask = sanitizedOut.agentTask ?? null;
-
-      this.logger.debug(
-        {
-          speechId: speechHandle.id,
-          name: sanitizedOut.toolCall?.name,
-          args: sanitizedOut.toolCall.args,
-          output: sanitizedOut.toolCallOutput?.output,
-          isError: sanitizedOut.toolCallOutput?.isError,
-        },
-        'Tool call execution finished',
-      );
-    }
+    const { functionToolsExecutedEvent, shouldGenerateToolReply, newAgentTask, ignoreTaskSwitch } =
+      this.summarizeToolExecutionOutput(toolOutput, speechHandle);
 
    this.agentSession.emit(
      AgentSessionEventTypes.FunctionToolsExecuted,
      functionToolsExecutedEvent,
    );
 
-    let draining = this.draining;
+    let schedulingPaused = this.schedulingPaused;
    if (!ignoreTaskSwitch && newAgentTask !== null) {
      this.agentSession.updateAgent(newAgentTask);
-      draining = true;
+      schedulingPaused = true;
    }
 
    if (functionToolsExecutedEvent.functionCallOutputs.length > 0) {
@@ -2238,15 +2299,14 @@ export class AgentActivity implements RecognitionHooks {
      }),
    );
 
-    const toolChoice = draining || modelSettings.toolChoice === 'none' ? 'none' : 'auto';
+    const toolChoice = schedulingPaused || modelSettings.toolChoice === 'none' ? 'none' : 'auto';
    this.createSpeechTask({
-      task: Task.from((abortController: AbortController) =>
+      taskFn: (abortController: AbortController) =>
        this.realtimeReplyTask({
          speechHandle: replySpeechHandle,
          modelSettings: { toolChoice },
          abortController,
        }),
-      ),
      ownedSpeechHandle: replySpeechHandle,
      name: 'AgentActivity.realtime_reply',
    });
@@ -2254,6 +2314,53 @@ export class AgentActivity implements RecognitionHooks {
    this.scheduleSpeech(replySpeechHandle, SpeechHandle.SPEECH_PRIORITY_NORMAL, true);
  }
 
+  private summarizeToolExecutionOutput(toolOutput: ToolOutput, speechHandle: SpeechHandle) {
+    const functionToolsExecutedEvent = createFunctionToolsExecutedEvent({
+      functionCalls: [],
+      functionCallOutputs: [],
+    });
+
+    let shouldGenerateToolReply = false;
+    let newAgentTask: Agent | null = null;
+    let ignoreTaskSwitch = false;
+
+    for (const sanitizedOut of toolOutput.output) {
+      if (sanitizedOut.toolCallOutput !== undefined) {
+        // Keep event payload symmetric for pipeline + realtime paths.
+        functionToolsExecutedEvent.functionCalls.push(sanitizedOut.toolCall);
+        functionToolsExecutedEvent.functionCallOutputs.push(sanitizedOut.toolCallOutput);
+        if (sanitizedOut.replyRequired) {
+          shouldGenerateToolReply = true;
+        }
+      }
+
+      if (newAgentTask !== null && sanitizedOut.agentTask !== undefined) {
+        this.logger.error('expected to receive only one agent task from the tool executions');
+        ignoreTaskSwitch = true;
+      }
+
+      newAgentTask = sanitizedOut.agentTask ?? null;
+
+      this.logger.debug(
+        {
+          speechId: speechHandle.id,
+          name: sanitizedOut.toolCall?.name,
+          args: sanitizedOut.toolCall.args,
+          output: sanitizedOut.toolCallOutput?.output,
+          isError: sanitizedOut.toolCallOutput?.isError,
+        },
+        'Tool call execution finished',
+      );
    }
+
+    return {
+      functionToolsExecutedEvent,
+      shouldGenerateToolReply,
+      newAgentTask,
+      ignoreTaskSwitch,
+    };
+  }
+
  private async realtimeReplyTask({
    speechHandle,
    modelSettings: { toolChoice },
@@ -2312,10 +2419,10 @@ export class AgentActivity implements RecognitionHooks {
    priority: number,
    force: boolean = false,
  ): void {
-    // when force=true, we allow tool responses to bypass draining
+    // when force=true, we allow tool responses to bypass scheduling pause
    // This allows for tool responses to be generated before the AgentActivity is finalized
-    if (this.draining && !force) {
-      throw new Error('cannot schedule new speech, the agent is draining');
+    if (this.schedulingPaused && !force) {
+      throw new Error('cannot schedule new speech, the speech scheduling is draining/pausing');
    }
 
    // Monotonic time to avoid near 0 collisions
@@ -2324,6 +2431,48 @@ export class AgentActivity implements RecognitionHooks {
    this.wakeupMainTask();
  }
 
+  private async _pauseSchedulingTask(blockedTasks: Task<any>[]): Promise<void> {
+    if (this._schedulingPaused) return;
+
+    this._schedulingPaused = true;
+    this._drainBlockedTasks = blockedTasks;
+    this.wakeupMainTask();
+
+    if (this._mainTask) {
+      // When pausing/draining, we ensure that all speech_tasks complete fully.
+      // This means that even if the SpeechHandle themselves have finished,
+      // we still wait for the entire execution (e.g function_tools)
+      await this._mainTask.result;
+    }
+  }
+
+  private _resumeSchedulingTask(): void {
+    if (!this._schedulingPaused) return;
+
+    this._schedulingPaused = false;
+    this._mainTask = Task.from(({ signal }) => this.mainTask(signal));
+  }
+
+  async pause(options: { blockedTasks?: Task<any>[] } = {}): Promise<void> {
+    const { blockedTasks = [] } = options;
+    const unlock = await this.lock.lock();
+
+    try {
+      const span = tracer.startSpan({
+        name: 'pause_agent_activity',
+        attributes: { [traceTypes.ATTR_AGENT_LABEL]: this.agent.id },
+      });
+      try {
+        await this._pauseSchedulingTask(blockedTasks);
+        await this._closeSessionResources();
+      } finally {
+        span.end();
+      }
+    } finally {
+      unlock();
+    }
+  }
+
  async drain(): Promise<void> {
    // Create drain_agent_activity as a ROOT span (new trace) to match Python behavior
    return tracer.startActiveSpan(async (span) => this._drainImpl(span), {
@@ -2337,23 +2486,22 @@ export class AgentActivity implements RecognitionHooks {
 
    const unlock = await this.lock.lock();
    try {
-      if (this._draining) return;
-
-      this.cancelPreemptiveGeneration();
+      if (this._schedulingPaused) return;
 
-      const onExitTask = tracer.startActiveSpan(async () => this.agent.onExit(), {
-        name: 'on_exit',
-        attributes: { [traceTypes.ATTR_AGENT_LABEL]: this.agent.id },
-      });
-
-      this.createSpeechTask({
-        task: Task.from(() => onExitTask),
+      this._onExitTask = this.createSpeechTask({
+        taskFn: () =>
+          tracer.startActiveSpan(async () => this.agent.onExit(), {
+            name: 'on_exit',
+            attributes: { [traceTypes.ATTR_AGENT_LABEL]: this.agent.id },
+          }),
+        inlineTask: true,
        name: 'AgentActivity_onExit',
      });
 
-      this.wakeupMainTask();
-      this._draining = true;
-      await this._mainTask?.result;
+      this.cancelPreemptiveGeneration();
+
+      await this._onExitTask.result;
+      await this._pauseSchedulingTask([]);
    } finally {
      unlock();
    }
@@ -2362,44 +2510,59 @@ export class AgentActivity implements RecognitionHooks {
  async close(): Promise<void> {
    const unlock = await this.lock.lock();
    try {
-      if (!this._draining) {
-        this.logger.warn('task closing without draining');
-      }
-
      this.cancelPreemptiveGeneration();
-      // Unregister event handlers to prevent duplicate metrics
-      if (this.llm instanceof LLM) {
-        this.llm.off('metrics_collected', this.onMetricsCollected);
-      }
-      if (this.realtimeSession) {
-        this.realtimeSession.off('generation_created', this.onGenerationCreated);
-        this.realtimeSession.off('input_speech_started', this.onInputSpeechStarted);
-        this.realtimeSession.off('input_speech_stopped', this.onInputSpeechStopped);
-        this.realtimeSession.off(
-          'input_audio_transcription_completed',
-          this.onInputAudioTranscriptionCompleted,
-        );
-        this.realtimeSession.off('metrics_collected', this.onMetricsCollected);
-      }
-      if (this.stt instanceof STT) {
-        this.stt.off('metrics_collected', this.onMetricsCollected);
-      }
-      if (this.tts instanceof TTS) {
-        this.tts.off('metrics_collected', this.onMetricsCollected);
-      }
-      if (this.vad instanceof VAD) {
-        this.vad.off('metrics_collected', this.onMetricsCollected);
+      await this._closeSessionResources();
+
+      if (this._mainTask) {
+        await this._mainTask.cancelAndWait();
      }
 
-      this.detachAudioInput();
-      this.realtimeSpans?.clear();
-      await this.realtimeSession?.close();
-      await this.audioRecognition?.close();
-      await this._mainTask?.cancelAndWait();
+      this.agent._agentActivity = undefined;
    } finally {
      unlock();
    }
  }
+
+  private async _closeSessionResources(): Promise<void> {
+    // Unregister event handlers to prevent duplicate metrics
+    if (this.llm instanceof LLM) {
+      this.llm.off('metrics_collected', this.onMetricsCollected);
+      this.llm.off('error', this.onModelError);
+    }
+
+    if (this.realtimeSession) {
+      this.realtimeSession.off('generation_created', this.onRealtimeGenerationCreated);
+      this.realtimeSession.off('input_speech_started', this.onRealtimeInputSpeechStarted);
+      this.realtimeSession.off('input_speech_stopped', this.onRealtimeInputSpeechStopped);
+      this.realtimeSession.off(
+        'input_audio_transcription_completed',
+        this.onRealtimeInputAudioTranscriptionCompleted,
+      );
+      this.realtimeSession.off('metrics_collected', this.onMetricsCollected);
+      this.realtimeSession.off('error', this.onModelError);
+    }
+
+    if (this.stt instanceof STT) {
+      this.stt.off('metrics_collected', this.onMetricsCollected);
+      this.stt.off('error', this.onModelError);
+    }
+
+    if (this.tts instanceof TTS) {
+      this.tts.off('metrics_collected', this.onMetricsCollected);
+      this.tts.off('error', this.onModelError);
+    }
+
+    if (this.vad instanceof VAD) {
+      this.vad.off('metrics_collected', this.onMetricsCollected);
+    }
+
+    this.detachAudioInput();
+    this.realtimeSpans?.clear();
+    await this.realtimeSession?.close();
+    await this.audioRecognition?.close();
+    this.realtimeSession = undefined;
+    this.audioRecognition = undefined;
+  }
 }
 
 function toOaiToolChoice(toolChoice: ToolChoice | null): ToolChoice | undefined {