npm - @livekit/agents - Versions diffs - 1.0.46 → 1.0.47 - Mend

@livekit/agents 1.0.46 → 1.0.47

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Files changed (151)

package/dist/cli.cjs +14 -20
package/dist/cli.cjs.map +1 -1
package/dist/cli.d.ts.map +1 -1
package/dist/cli.js +14 -20
package/dist/cli.js.map +1 -1
package/dist/ipc/job_proc_lazy_main.cjs +14 -5
package/dist/ipc/job_proc_lazy_main.cjs.map +1 -1
package/dist/ipc/job_proc_lazy_main.js +14 -5
package/dist/ipc/job_proc_lazy_main.js.map +1 -1
package/dist/llm/chat_context.cjs +19 -0
package/dist/llm/chat_context.cjs.map +1 -1
package/dist/llm/chat_context.d.cts +4 -0
package/dist/llm/chat_context.d.ts +4 -0
package/dist/llm/chat_context.d.ts.map +1 -1
package/dist/llm/chat_context.js +19 -0
package/dist/llm/chat_context.js.map +1 -1
package/dist/llm/provider_format/index.cjs +2 -0
package/dist/llm/provider_format/index.cjs.map +1 -1
package/dist/llm/provider_format/index.d.cts +1 -1
package/dist/llm/provider_format/index.d.ts +1 -1
package/dist/llm/provider_format/index.d.ts.map +1 -1
package/dist/llm/provider_format/index.js +6 -1
package/dist/llm/provider_format/index.js.map +1 -1
package/dist/llm/provider_format/openai.cjs +82 -2
package/dist/llm/provider_format/openai.cjs.map +1 -1
package/dist/llm/provider_format/openai.d.cts +1 -0
package/dist/llm/provider_format/openai.d.ts +1 -0
package/dist/llm/provider_format/openai.d.ts.map +1 -1
package/dist/llm/provider_format/openai.js +80 -1
package/dist/llm/provider_format/openai.js.map +1 -1
package/dist/llm/provider_format/openai.test.cjs +326 -0
package/dist/llm/provider_format/openai.test.cjs.map +1 -1
package/dist/llm/provider_format/openai.test.js +327 -1
package/dist/llm/provider_format/openai.test.js.map +1 -1
package/dist/llm/provider_format/utils.cjs +4 -3
package/dist/llm/provider_format/utils.cjs.map +1 -1
package/dist/llm/provider_format/utils.d.ts.map +1 -1
package/dist/llm/provider_format/utils.js +4 -3
package/dist/llm/provider_format/utils.js.map +1 -1
package/dist/llm/realtime.cjs.map +1 -1
package/dist/llm/realtime.d.cts +1 -0
package/dist/llm/realtime.d.ts +1 -0
package/dist/llm/realtime.d.ts.map +1 -1
package/dist/llm/realtime.js.map +1 -1
package/dist/log.cjs +5 -2
package/dist/log.cjs.map +1 -1
package/dist/log.d.ts.map +1 -1
package/dist/log.js +5 -2
package/dist/log.js.map +1 -1
package/dist/stream/deferred_stream.cjs +15 -6
package/dist/stream/deferred_stream.cjs.map +1 -1
package/dist/stream/deferred_stream.d.ts.map +1 -1
package/dist/stream/deferred_stream.js +15 -6
package/dist/stream/deferred_stream.js.map +1 -1
package/dist/utils.cjs +31 -2
package/dist/utils.cjs.map +1 -1
package/dist/utils.d.cts +7 -0
package/dist/utils.d.ts +7 -0
package/dist/utils.d.ts.map +1 -1
package/dist/utils.js +31 -2
package/dist/utils.js.map +1 -1
package/dist/utils.test.cjs +71 -0
package/dist/utils.test.cjs.map +1 -1
package/dist/utils.test.js +71 -0
package/dist/utils.test.js.map +1 -1
package/dist/version.cjs +1 -1
package/dist/version.cjs.map +1 -1
package/dist/version.d.cts +1 -1
package/dist/version.d.ts +1 -1
package/dist/version.d.ts.map +1 -1
package/dist/version.js +1 -1
package/dist/version.js.map +1 -1
package/dist/voice/agent.cjs +144 -12
package/dist/voice/agent.cjs.map +1 -1
package/dist/voice/agent.d.cts +29 -4
package/dist/voice/agent.d.ts +29 -4
package/dist/voice/agent.d.ts.map +1 -1
package/dist/voice/agent.js +140 -11
package/dist/voice/agent.js.map +1 -1
package/dist/voice/agent.test.cjs +120 -0
package/dist/voice/agent.test.cjs.map +1 -1
package/dist/voice/agent.test.js +122 -2
package/dist/voice/agent.test.js.map +1 -1
package/dist/voice/agent_activity.cjs +383 -298
package/dist/voice/agent_activity.cjs.map +1 -1
package/dist/voice/agent_activity.d.cts +34 -7
package/dist/voice/agent_activity.d.ts +34 -7
package/dist/voice/agent_activity.d.ts.map +1 -1
package/dist/voice/agent_activity.js +383 -293
package/dist/voice/agent_activity.js.map +1 -1
package/dist/voice/agent_session.cjs +140 -40
package/dist/voice/agent_session.cjs.map +1 -1
package/dist/voice/agent_session.d.cts +19 -7
package/dist/voice/agent_session.d.ts +19 -7
package/dist/voice/agent_session.d.ts.map +1 -1
package/dist/voice/agent_session.js +137 -37
package/dist/voice/agent_session.js.map +1 -1
package/dist/voice/audio_recognition.cjs +4 -0
package/dist/voice/audio_recognition.cjs.map +1 -1
package/dist/voice/audio_recognition.d.ts.map +1 -1
package/dist/voice/audio_recognition.js +4 -0
package/dist/voice/audio_recognition.js.map +1 -1
package/dist/voice/generation.cjs +39 -19
package/dist/voice/generation.cjs.map +1 -1
package/dist/voice/generation.d.ts.map +1 -1
package/dist/voice/generation.js +44 -20
package/dist/voice/generation.js.map +1 -1
package/dist/voice/index.cjs +2 -0
package/dist/voice/index.cjs.map +1 -1
package/dist/voice/index.d.cts +1 -1
package/dist/voice/index.d.ts +1 -1
package/dist/voice/index.d.ts.map +1 -1
package/dist/voice/index.js +2 -1
package/dist/voice/index.js.map +1 -1
package/dist/voice/speech_handle.cjs +7 -1
package/dist/voice/speech_handle.cjs.map +1 -1
package/dist/voice/speech_handle.d.cts +2 -0
package/dist/voice/speech_handle.d.ts +2 -0
package/dist/voice/speech_handle.d.ts.map +1 -1
package/dist/voice/speech_handle.js +8 -2
package/dist/voice/speech_handle.js.map +1 -1
package/dist/voice/testing/run_result.cjs +66 -15
package/dist/voice/testing/run_result.cjs.map +1 -1
package/dist/voice/testing/run_result.d.cts +14 -3
package/dist/voice/testing/run_result.d.ts +14 -3
package/dist/voice/testing/run_result.d.ts.map +1 -1
package/dist/voice/testing/run_result.js +66 -15
package/dist/voice/testing/run_result.js.map +1 -1
package/package.json +1 -1
package/src/cli.ts +20 -33
package/src/ipc/job_proc_lazy_main.ts +16 -5
package/src/llm/chat_context.ts +35 -0
package/src/llm/provider_format/index.ts +7 -2
package/src/llm/provider_format/openai.test.ts +385 -1
package/src/llm/provider_format/openai.ts +103 -0
package/src/llm/provider_format/utils.ts +6 -4
package/src/llm/realtime.ts +1 -0
package/src/log.ts +5 -2
package/src/stream/deferred_stream.ts +17 -6
package/src/utils.test.ts +87 -0
package/src/utils.ts +36 -2
package/src/version.ts +1 -1
package/src/voice/agent.test.ts +140 -2
package/src/voice/agent.ts +189 -10
package/src/voice/agent_activity.ts +427 -289
package/src/voice/agent_session.ts +178 -40
package/src/voice/audio_recognition.ts +4 -0
package/src/voice/generation.ts +52 -23
package/src/voice/index.ts +1 -1
package/src/voice/speech_handle.ts +9 -2
package/src/voice/testing/run_result.ts +81 -23

package/dist/voice/agent_activity.js (changed)

@@ -10,14 +10,20 @@ import {
 } from "../llm/index.js";
 import { isSameToolChoice, isSameToolContext } from "../llm/tool_context.js";
 import { log } from "../log.js";
-import { DeferredReadableStream } from "../stream/deferred_stream.js";
+import { MultiInputStream } from "../stream/multi_input_stream.js";
 import { STT } from "../stt/stt.js";
 import { recordRealtimeMetrics, traceTypes, tracer } from "../telemetry/index.js";
 import { splitWords } from "../tokenize/basic/word.js";
 import { TTS } from "../tts/tts.js";
 import { Future, Task, cancelAndWait, waitFor } from "../utils.js";
 import { VAD } from "../vad.js";
-import { StopResponse, asyncLocalStorage } from "./agent.js";
+import {
+  StopResponse,
+  _getActivityTaskInfo,
+  _setActivityTaskInfo,
+  functionCallStorage,
+  speechHandleStorage
+} from "./agent.js";
 import {} from "./agent_session.js";
 import {
   AudioRecognition
@@ -41,8 +47,10 @@ import {
 } from "./generation.js";
 import { SpeechHandle } from "./speech_handle.js";
 import { setParticipantSpanAttributes } from "./utils.js";
-const speechHandleStorage = new AsyncLocalStorage();
+const agentActivityStorage = new AsyncLocalStorage();
 class AgentActivity {
+  agent;
+  agentSession;
   static REPLY_TASK_CANCEL_TIMEOUT = 5e3;
   started = false;
   audioRecognition;
@@ -51,22 +59,29 @@ class AgentActivity {
   // Maps response_id to OTEL span for metrics recording
   turnDetectionMode;
   logger = log();
-  _draining = false;
+  _schedulingPaused = true;
+  _drainBlockedTasks = [];
   _currentSpeech;
   speechQueue;
   // [priority, timestamp, speechHandle]
   q_updated;
   speechTasks = /* @__PURE__ */ new Set();
   lock = new Mutex();
-  audioStream = new DeferredReadableStream();
+  audioStream = new MultiInputStream();
+  audioStreamId;
   // default to null as None, which maps to the default provider tool choice value
   toolChoice = null;
   _preemptiveGeneration;
-  agent;
-  agentSession;
   /** @internal */
   _mainTask;
+  _onEnterTask;
+  _onExitTask;
   _userTurnCompletedTask;
+  onRealtimeGenerationCreated = (ev) => this.onGenerationCreated(ev);
+  onRealtimeInputSpeechStarted = (ev) => this.onInputSpeechStarted(ev);
+  onRealtimeInputSpeechStopped = (ev) => this.onInputSpeechStopped(ev);
+  onRealtimeInputAudioTranscriptionCompleted = (ev) => this.onInputAudioTranscriptionCompleted(ev);
+  onModelError = (ev) => this.onError(ev);
   constructor(agent, agentSession) {
     this.agent = agent;
     this.agentSession = agentSession;
@@ -77,7 +92,7 @@ class AgentActivity {
     this.turnDetectionMode = typeof this.turnDetection === "string" ? this.turnDetection : void 0;
     if (this.turnDetectionMode === "vad" && this.vad === void 0) {
       this.logger.warn(
-        'turnDetection is set to "vad", but no VAD model is provided, ignoring the turnDdetection setting'
+        'turnDetection is set to "vad", but no VAD model is provided, ignoring the turnDetection setting'
       );
       this.turnDetectionMode = void 0;
     }
@@ -127,107 +142,121 @@ class AgentActivity {
     }
   }
   async start() {
-    var _a;
     const unlock = await this.lock.lock();
     try {
-      const startSpan = tracer.startSpan({
-        name: "start_agent_activity",
-        attributes: { [traceTypes.ATTR_AGENT_LABEL]: this.agent.id },
-        context: ROOT_CONTEXT
-      });
-      this.agent._agentActivity = this;
-      if (this.llm instanceof RealtimeModel) {
-        this.realtimeSession = this.llm.session();
-        this.realtimeSpans = /* @__PURE__ */ new Map();
-        this.realtimeSession.on("generation_created", (ev) => this.onGenerationCreated(ev));
-        this.realtimeSession.on("input_speech_started", (ev) => this.onInputSpeechStarted(ev));
-        this.realtimeSession.on("input_speech_stopped", (ev) => this.onInputSpeechStopped(ev));
-        this.realtimeSession.on(
-          "input_audio_transcription_completed",
-          (ev) => this.onInputAudioTranscriptionCompleted(ev)
-        );
-        this.realtimeSession.on("metrics_collected", (ev) => this.onMetricsCollected(ev));
-        this.realtimeSession.on("error", (ev) => this.onError(ev));
-        removeInstructions(this.agent._chatCtx);
-        try {
-          await this.realtimeSession.updateInstructions(this.agent.instructions);
-        } catch (error) {
-          this.logger.error(error, "failed to update the instructions");
-        }
-        try {
-          await this.realtimeSession.updateChatCtx(this.agent.chatCtx);
-        } catch (error) {
-          this.logger.error(error, "failed to update the chat context");
-        }
-        try {
-          await this.realtimeSession.updateTools(this.tools);
-        } catch (error) {
-          this.logger.error(error, "failed to update the tools");
-        }
-        if (!this.llm.capabilities.audioOutput && !this.tts && this.agentSession.output.audio) {
-          this.logger.error(
-            "audio output is enabled but RealtimeModel has no audio modality and no TTS is set. Either enable audio modality in the RealtimeModel or set a TTS model."
-          );
-        }
-      } else if (this.llm instanceof LLM) {
-        try {
-          updateInstructions({
-            chatCtx: this.agent._chatCtx,
-            instructions: this.agent.instructions,
-            addIfMissing: true
-          });
-        } catch (error) {
-          this.logger.error("failed to update the instructions", error);
-        }
+      await this._startSession({ spanName: "start_agent_activity", runOnEnter: true });
+    } finally {
+      unlock();
+    }
+  }
+  async resume() {
+    const unlock = await this.lock.lock();
+    try {
+      await this._startSession({ spanName: "resume_agent_activity", runOnEnter: false });
+    } finally {
+      unlock();
+    }
+  }
+  async _startSession(options) {
+    var _a;
+    const { spanName, runOnEnter } = options;
+    const startSpan = tracer.startSpan({
+      name: spanName,
+      attributes: { [traceTypes.ATTR_AGENT_LABEL]: this.agent.id },
+      context: ROOT_CONTEXT
+    });
+    this.agent._agentActivity = this;
+    if (this.llm instanceof RealtimeModel) {
+      this.realtimeSession = this.llm.session();
+      this.realtimeSpans = /* @__PURE__ */ new Map();
+      this.realtimeSession.on("generation_created", this.onRealtimeGenerationCreated);
+      this.realtimeSession.on("input_speech_started", this.onRealtimeInputSpeechStarted);
+      this.realtimeSession.on("input_speech_stopped", this.onRealtimeInputSpeechStopped);
+      this.realtimeSession.on(
+        "input_audio_transcription_completed",
+        this.onRealtimeInputAudioTranscriptionCompleted
+      );
+      this.realtimeSession.on("metrics_collected", this.onMetricsCollected);
+      this.realtimeSession.on("error", this.onModelError);
+      removeInstructions(this.agent._chatCtx);
+      try {
+        await this.realtimeSession.updateInstructions(this.agent.instructions);
+      } catch (error) {
+        this.logger.error(error, "failed to update the instructions");
       }
-      if (this.llm instanceof LLM) {
-        this.llm.on("metrics_collected", (ev) => this.onMetricsCollected(ev));
-        this.llm.on("error", (ev) => this.onError(ev));
+      try {
+        await this.realtimeSession.updateChatCtx(this.agent.chatCtx);
+      } catch (error) {
+        this.logger.error(error, "failed to update the chat context");
       }
-      if (this.stt instanceof STT) {
-        this.stt.on("metrics_collected", (ev) => this.onMetricsCollected(ev));
-        this.stt.on("error", (ev) => this.onError(ev));
+      try {
+        await this.realtimeSession.updateTools(this.tools);
+      } catch (error) {
+        this.logger.error(error, "failed to update the tools");
       }
-      if (this.tts instanceof TTS) {
-        this.tts.on("metrics_collected", (ev) => this.onMetricsCollected(ev));
-        this.tts.on("error", (ev) => this.onError(ev));
+      if (!this.llm.capabilities.audioOutput && !this.tts && this.agentSession.output.audio) {
+        this.logger.error(
+          "audio output is enabled but RealtimeModel has no audio modality and no TTS is set. Either enable audio modality in the RealtimeModel or set a TTS model."
+        );
       }
-      if (this.vad instanceof VAD) {
-        this.vad.on("metrics_collected", (ev) => this.onMetricsCollected(ev));
+    } else if (this.llm instanceof LLM) {
+      try {
+        updateInstructions({
+          chatCtx: this.agent._chatCtx,
+          instructions: this.agent.instructions,
+          addIfMissing: true
+        });
+      } catch (error) {
+        this.logger.error("failed to update the instructions", error);
       }
-      this.audioRecognition = new AudioRecognition({
-        recognitionHooks: this,
-        // Disable stt node if stt is not provided
-        stt: this.stt ? (...args) => this.agent.sttNode(...args) : void 0,
-        vad: this.vad,
-        turnDetector: typeof this.turnDetection === "string" ? void 0 : this.turnDetection,
-        turnDetectionMode: this.turnDetectionMode,
-        minEndpointingDelay: this.agentSession.options.minEndpointingDelay,
-        maxEndpointingDelay: this.agentSession.options.maxEndpointingDelay,
-        rootSpanContext: this.agentSession.rootSpanContext,
-        sttModel: (_a = this.stt) == null ? void 0 : _a.label,
-        sttProvider: this.getSttProvider(),
-        getLinkedParticipant: () => {
-          var _a2;
-          return (_a2 = this.agentSession._roomIO) == null ? void 0 : _a2.linkedParticipant;
-        }
-      });
-      this.audioRecognition.start();
-      this.started = true;
-      this._mainTask = Task.from(({ signal }) => this.mainTask(signal));
-      const onEnterTask = tracer.startActiveSpan(async () => this.agent.onEnter(), {
-        name: "on_enter",
-        context: trace.setSpan(ROOT_CONTEXT, startSpan),
-        attributes: { [traceTypes.ATTR_AGENT_LABEL]: this.agent.id }
-      });
-      this.createSpeechTask({
-        task: Task.from(() => onEnterTask),
+    }
+    if (this.llm instanceof LLM) {
+      this.llm.on("metrics_collected", this.onMetricsCollected);
+      this.llm.on("error", this.onModelError);
+    }
+    if (this.stt instanceof STT) {
+      this.stt.on("metrics_collected", this.onMetricsCollected);
+      this.stt.on("error", this.onModelError);
+    }
+    if (this.tts instanceof TTS) {
+      this.tts.on("metrics_collected", this.onMetricsCollected);
+      this.tts.on("error", this.onModelError);
+    }
+    if (this.vad instanceof VAD) {
+      this.vad.on("metrics_collected", this.onMetricsCollected);
+    }
+    this.audioRecognition = new AudioRecognition({
+      recognitionHooks: this,
+      // Disable stt node if stt is not provided
+      stt: this.stt ? (...args) => this.agent.sttNode(...args) : void 0,
+      vad: this.vad,
+      turnDetector: typeof this.turnDetection === "string" ? void 0 : this.turnDetection,
+      turnDetectionMode: this.turnDetectionMode,
+      minEndpointingDelay: this.agentSession.options.minEndpointingDelay,
+      maxEndpointingDelay: this.agentSession.options.maxEndpointingDelay,
+      rootSpanContext: this.agentSession.rootSpanContext,
+      sttModel: (_a = this.stt) == null ? void 0 : _a.label,
+      sttProvider: this.getSttProvider(),
+      getLinkedParticipant: () => {
+        var _a2;
+        return (_a2 = this.agentSession._roomIO) == null ? void 0 : _a2.linkedParticipant;
+      }
+    });
+    this.audioRecognition.start();
+    this.started = true;
+    this._resumeSchedulingTask();
+    if (runOnEnter) {
+      this._onEnterTask = this.createSpeechTask({
+        taskFn: () => tracer.startActiveSpan(async () => this.agent.onEnter(), {
+          name: "on_enter",
+          context: trace.setSpan(ROOT_CONTEXT, startSpan),
+          attributes: { [traceTypes.ATTR_AGENT_LABEL]: this.agent.id }
+        }),
+        inlineTask: true,
         name: "AgentActivity_onEnter"
       });
-      startSpan.end();
-    } finally {
-      unlock();
     }
+    startSpan.end();
   }
   get currentSpeech() {
     return this._currentSpeech;
@@ -256,8 +285,8 @@ class AgentActivity {
   get tools() {
     return this.agent.toolCtx;
   }
-  get draining() {
-    return this._draining;
+  get schedulingPaused() {
+    return this._schedulingPaused;
   }
   get realtimeLLMSession() {
     return this.realtimeSession;
@@ -297,11 +326,9 @@ class AgentActivity {
     }
   }
   attachAudioInput(audioStream) {
-    if (this.audioStream.isSourceSet) {
-      this.logger.debug("detaching existing audio input in agent activity");
-      this.audioStream.detachSource();
-    }
-    this.audioStream.setSource(audioStream);
+    void this.audioStream.close();
+    this.audioStream = new MultiInputStream();
+    this.audioStreamId = this.audioStream.addInputStream(audioStream);
     const [realtimeAudioStream, recognitionAudioStream] = this.audioStream.stream.tee();
     if (this.realtimeSession) {
       this.realtimeSession.setInputAudioStream(realtimeAudioStream);
@@ -311,13 +338,21 @@ class AgentActivity {
     }
   }
   detachAudioInput() {
-    this.audioStream.detachSource();
+    if (this.audioStreamId === void 0) {
+      return;
+    }
+    void this.audioStream.close();
+    this.audioStream = new MultiInputStream();
+    this.audioStreamId = void 0;
   }
-  commitUserTurn() {
+  commitUserTurn(options = {}) {
+    const { audioDetached = false, throwIfNotReady = true } = options;
     if (!this.audioRecognition) {
-      throw new Error("AudioRecognition is not initialized");
+      if (throwIfNotReady) {
+        throw new Error("AudioRecognition is not initialized");
+      }
+      return;
     }
-    const audioDetached = false;
     this.audioRecognition.commitUserTurn(audioDetached);
   }
   clearUserTurn() {
@@ -353,13 +388,11 @@ class AgentActivity {
       })
     );
     const task = this.createSpeechTask({
-      task: Task.from(
-        (abortController) => this.ttsTask(handle, text, addToChatCtx, {}, abortController, audio)
-      ),
+      taskFn: (abortController) => this.ttsTask(handle, text, addToChatCtx, {}, abortController, audio),
       ownedSpeechHandle: handle,
       name: "AgentActivity.say_tts"
     });
-    task.finally(() => this.onPipelineReplyDone());
+    task.result.finally(() => this.onPipelineReplyDone());
     this.scheduleSpeech(handle, SpeechHandle.SPEECH_PRIORITY_NORMAL);
     return handle;
   }
@@ -449,8 +482,8 @@ class AgentActivity {
     if (ev.userInitiated) {
       return;
     }
-    if (this.draining) {
-      this.logger.warn("skipping new realtime generation, the agent is draining");
+    if (this.schedulingPaused) {
+      this.logger.warn("skipping new realtime generation, the speech scheduling is not running");
       return;
     }
     const handle = SpeechHandle.create({
@@ -466,9 +499,7 @@ class AgentActivity {
     );
     this.logger.info({ speech_id: handle.id }, "Creating speech handle");
     this.createSpeechTask({
-      task: Task.from(
-        (abortController) => this.realtimeGenerationTask(handle, ev, {}, abortController)
-      ),
+      taskFn: (abortController) => this.realtimeGenerationTask(handle, ev, {}, abortController),
       ownedSpeechHandle: handle,
       name: "AgentActivity.realtimeGeneration"
     });
@@ -555,7 +586,7 @@ class AgentActivity {
     }
   }
   onPreemptiveGeneration(info) {
-    if (!this.agentSession.options.preemptiveGeneration || this.draining || this._currentSpeech !== void 0 && !this._currentSpeech.interrupted || !(this.llm instanceof LLM)) {
+    if (!this.agentSession.options.preemptiveGeneration || this.schedulingPaused || this._currentSpeech !== void 0 && !this._currentSpeech.interrupted || !(this.llm instanceof LLM)) {
       return;
     }
     this.cancelPreemptiveGeneration();
@@ -593,7 +624,21 @@ class AgentActivity {
     }
   }
   createSpeechTask(options) {
-    const { task, ownedSpeechHandle } = options;
+    const { taskFn, controller, ownedSpeechHandle, inlineTask, name } = options;
+    const wrappedFn = (ctrl) => {
+      return agentActivityStorage.run(this, () => {
+        const currentTask = Task.current();
+        if (currentTask) {
+          _setActivityTaskInfo(currentTask, { speechHandle: ownedSpeechHandle, inlineTask });
+        }
+        if (ownedSpeechHandle) {
+          return speechHandleStorage.run(ownedSpeechHandle, () => taskFn(ctrl));
+        }
+        return taskFn(ctrl);
+      });
+    };
+    const task = Task.from(wrappedFn, controller, name);
+    _setActivityTaskInfo(task, { speechHandle: ownedSpeechHandle, inlineTask });
     this.speechTasks.add(task);
     task.addDoneCallback(() => {
       this.speechTasks.delete(task);
@@ -609,12 +654,15 @@ class AgentActivity {
     task.addDoneCallback(() => {
       this.wakeupMainTask();
     });
-    return task.result;
+    return task;
   }
   async onEndOfTurn(info) {
-    if (this.draining) {
+    if (this.schedulingPaused) {
       this.cancelPreemptiveGeneration();
-      this.logger.warn({ user_input: info.newTranscript }, "skipping user input, task is draining");
+      this.logger.warn(
+        { user_input: info.newTranscript },
+        "skipping user input, speech scheduling is paused"
+      );
       return true;
     }
     if (this.stt && this.turnDetection !== "manual" && this._currentSpeech && this._currentSpeech.allowInterruptions && !this._currentSpeech.interrupted && this.agentSession.options.minInterruptionWords > 0) {
@@ -633,7 +681,7 @@ class AgentActivity {
     }
     const oldTask = this._userTurnCompletedTask;
     this._userTurnCompletedTask = this.createSpeechTask({
-      task: Task.from(() => this.userTurnCompleted(info, oldTask)),
+      taskFn: () => this.userTurnCompleted(info, oldTask),
       name: "AgentActivity.userTurnCompleted"
     });
     return true;
@@ -663,14 +711,41 @@ class AgentActivity {
         await speechHandle._waitForGeneration();
         this._currentSpeech = void 0;
       }
-      if (this.draining && this.speechTasks.size === 0) {
-        this.logger.info("mainTask: draining and no more speech tasks");
+      const toWait = this.getDrainPendingSpeechTasks();
+      if (this._schedulingPaused && toWait.length === 0) {
+        this.logger.info("mainTask: scheduling paused and no more speech tasks to wait");
         break;
       }
       this.q_updated = new Future();
     }
     this.logger.info("AgentActivity mainTask: exiting");
   }
+  getDrainPendingSpeechTasks() {
+    const blockedHandles = [];
+    for (const task of this._drainBlockedTasks) {
+      const info = _getActivityTaskInfo(task);
+      if (!info) {
+        this.logger.error("blocked task without activity info; skipping.");
+        continue;
+      }
+      if (!info.speechHandle) {
+        continue;
+      }
+      blockedHandles.push(info.speechHandle);
+    }
+    const toWait = [];
+    for (const task of this.speechTasks) {
+      if (this._drainBlockedTasks.includes(task)) {
+        continue;
+      }
+      const info = _getActivityTaskInfo(task);
+      if (info && info.speechHandle && blockedHandles.includes(info.speechHandle)) {
+        continue;
+      }
+      toWait.push(task);
+    }
+    return toWait;
+  }
   wakeupMainTask() {
     this.q_updated.resolve();
   }
@@ -696,7 +771,7 @@ class AgentActivity {
     if (this.llm === void 0) {
       throw new Error("trying to generate reply without an LLM model");
     }
-    const functionCall = (_a = asyncLocalStorage.getStore()) == null ? void 0 : _a.functionCall;
+    const functionCall = (_a = functionCallStorage.getStore()) == null ? void 0 : _a.functionCall;
     if (toolChoice === void 0 && functionCall !== void 0) {
       toolChoice = "none";
     }
@@ -714,19 +789,17 @@ class AgentActivity {
     this.logger.info({ speech_id: handle.id }, "Creating speech handle");
     if (this.llm instanceof RealtimeModel) {
       this.createSpeechTask({
-        task: Task.from(
-          (abortController) => this.realtimeReplyTask({
-            speechHandle: handle,
-            // TODO(brian): support llm.ChatMessage for the realtime model
-            userInput: userMessage == null ? void 0 : userMessage.textContent,
-            instructions,
-            modelSettings: {
-              // isGiven(toolChoice) = toolChoice !== undefined
-              toolChoice: toOaiToolChoice(toolChoice !== void 0 ? toolChoice : this.toolChoice)
-            },
-            abortController
-          })
-        ),
+        taskFn: (abortController) => this.realtimeReplyTask({
+          speechHandle: handle,
+          // TODO(brian): support llm.ChatMessage for the realtime model
+          userInput: userMessage == null ? void 0 : userMessage.textContent,
+          instructions,
+          modelSettings: {
+            // isGiven(toolChoice) = toolChoice !== undefined
+            toolChoice: toOaiToolChoice(toolChoice !== void 0 ? toolChoice : this.toolChoice)
+          },
+          abortController
+        }),
         ownedSpeechHandle: handle,
         name: "AgentActivity.realtimeReply"
       });
@@ -736,36 +809,36 @@ class AgentActivity {
 ${instructions}`;
       }
       const task = this.createSpeechTask({
-        task: Task.from(
-          (abortController) => this.pipelineReplyTask(
-            handle,
-            chatCtx ?? this.agent.chatCtx,
-            this.agent.toolCtx,
-            {
-              toolChoice: toOaiToolChoice(toolChoice !== void 0 ? toolChoice : this.toolChoice)
-            },
-            abortController,
-            instructions,
-            userMessage
-          )
+        taskFn: (abortController) => this.pipelineReplyTask(
+          handle,
+          chatCtx ?? this.agent.chatCtx,
+          this.agent.toolCtx,
+          {
+            toolChoice: toOaiToolChoice(toolChoice !== void 0 ? toolChoice : this.toolChoice)
+          },
+          abortController,
+          instructions,
+          userMessage
         ),
         ownedSpeechHandle: handle,
         name: "AgentActivity.pipelineReply"
       });
-      task.finally(() => this.onPipelineReplyDone());
+      task.result.finally(() => this.onPipelineReplyDone());
     }
     if (scheduleSpeech) {
       this.scheduleSpeech(handle, SpeechHandle.SPEECH_PRIORITY_NORMAL);
     }
     return handle;
   }
-  interrupt() {
+  interrupt(options = {}) {
     var _a;
+    const { force = false } = options;
+    this.cancelPreemptiveGeneration();
     const future = new Future();
     const currentSpeech = this._currentSpeech;
-    currentSpeech == null ? void 0 : currentSpeech.interrupt();
+    currentSpeech == null ? void 0 : currentSpeech.interrupt(force);
     for (const [_, __, speech] of this.speechQueue) {
-      speech.interrupt();
+      speech.interrupt(force);
     }
     (_a = this.realtimeSession) == null ? void 0 : _a.interrupt();
     if (currentSpeech === void 0) {
@@ -786,7 +859,7 @@ ${instructions}`;
   async userTurnCompleted(info, oldTask) {
     var _a, _b;
     if (oldTask) {
-      await oldTask;
+      await oldTask.result;
     }
     if (this.llm instanceof RealtimeModel) {
       if (this.llm.capabilities.turnDetection) {
@@ -973,7 +1046,7 @@ ${instructions}`;
     toolsMessages,
     span
   }) => {
-    var _a, _b, _c, _d, _e;
+    var _a, _b;
     speechHandle._agentTurnContext = otelContext.active();
     span.setAttribute(traceTypes.ATTR_SPEECH_ID, speechHandle.id);
     if (instructions) {
@@ -1119,11 +1192,11 @@ ${instructions}`;
       for (const msg of toolsMessages) {
         msg.createdAt = replyStartedAt;
       }
-      this.agent._chatCtx.insert(toolsMessages);
       const toolCallOutputs = toolsMessages.filter(
         (m) => m.type === "function_call_output"
       );
       if (toolCallOutputs.length > 0) {
+        this.agent._chatCtx.insert(toolCallOutputs);
         this.agentSession._toolItemsAdded(toolCallOutputs);
       }
     }
@@ -1211,45 +1284,15 @@ ${instructions}`;
       );
       return;
     }
-    const functionToolsExecutedEvent = createFunctionToolsExecutedEvent({
-      functionCalls: [],
-      functionCallOutputs: []
-    });
-    let shouldGenerateToolReply = false;
-    let newAgentTask = null;
-    let ignoreTaskSwitch = false;
-    for (const sanitizedOut of toolOutput.output) {
-      if (sanitizedOut.toolCallOutput !== void 0) {
-        functionToolsExecutedEvent.functionCalls.push(sanitizedOut.toolCall);
-        functionToolsExecutedEvent.functionCallOutputs.push(sanitizedOut.toolCallOutput);
-        if (sanitizedOut.replyRequired) {
-          shouldGenerateToolReply = true;
-        }
-      }
-      if (newAgentTask !== null && sanitizedOut.agentTask !== void 0) {
-        this.logger.error("expected to receive only one agent task from the tool executions");
-        ignoreTaskSwitch = true;
-      }
-      newAgentTask = sanitizedOut.agentTask ?? null;
-      this.logger.debug(
-        {
-          speechId: speechHandle.id,
-          name: (_c = sanitizedOut.toolCall) == null ? void 0 : _c.name,
-          args: sanitizedOut.toolCall.args,
-          output: (_d = sanitizedOut.toolCallOutput) == null ? void 0 : _d.output,
-          isError: (_e = sanitizedOut.toolCallOutput) == null ? void 0 : _e.isError
-        },
-        "Tool call execution finished"
-      );
-    }
+    const { functionToolsExecutedEvent, shouldGenerateToolReply, newAgentTask, ignoreTaskSwitch } = this.summarizeToolExecutionOutput(toolOutput, speechHandle);
     this.agentSession.emit(
       AgentSessionEventTypes.FunctionToolsExecuted,
       functionToolsExecutedEvent
     );
-    let draining = this.draining;
+    let schedulingPaused = this.schedulingPaused;
     if (!ignoreTaskSwitch && newAgentTask !== null) {
       this.agentSession.updateAgent(newAgentTask);
-      draining = true;
+      schedulingPaused = true;
     }
     const toolMessages = [
       ...functionToolsExecutedEvent.functionCalls,
@@ -1258,34 +1301,32 @@ ${instructions}`;
     if (shouldGenerateToolReply) {
       chatCtx.insert(toolMessages);
       speechHandle._numSteps += 1;
-      const respondToolChoice = draining || modelSettings.toolChoice === "none" ? "none" : "auto";
+      const respondToolChoice = schedulingPaused || modelSettings.toolChoice === "none" ? "none" : "auto";
       const toolResponseTask = this.createSpeechTask({
-        task: Task.from(
-          () => this.pipelineReplyTask(
-            speechHandle,
-            chatCtx,
-            toolCtx,
-            { toolChoice: respondToolChoice },
-            replyAbortController,
-            instructions,
-            void 0,
-            toolMessages
-          )
+        taskFn: () => this.pipelineReplyTask(
+          speechHandle,
+          chatCtx,
+          toolCtx,
+          { toolChoice: respondToolChoice },
+          replyAbortController,
+          instructions,
+          void 0,
+          toolMessages
         ),
         ownedSpeechHandle: speechHandle,
         name: "AgentActivity.pipelineReply"
       });
-      toolResponseTask.finally(() => this.onPipelineReplyDone());
+      toolResponseTask.result.finally(() => this.onPipelineReplyDone());
       this.scheduleSpeech(speechHandle, SpeechHandle.SPEECH_PRIORITY_NORMAL, true);
     } else if (functionToolsExecutedEvent.functionCallOutputs.length > 0) {
       for (const msg of toolMessages) {
         msg.createdAt = replyStartedAt;
       }
-      this.agent._chatCtx.insert(toolMessages);
       const toolCallOutputs = toolMessages.filter(
         (m) => m.type === "function_call_output"
       );
       if (toolCallOutputs.length > 0) {
+        this.agent._chatCtx.insert(toolCallOutputs);
         this.agentSession._toolItemsAdded(toolCallOutputs);
       }
     }
@@ -1329,7 +1370,7 @@ ${instructions}`;
     replyAbortController,
     span
   }) {
-    var _a, _b, _c, _d;
+    var _a;
     speechHandle._agentTurnContext = otelContext.active();
     span.setAttribute(traceTypes.ATTR_SPEECH_ID, speechHandle.id);
     const localParticipant = (_a = this.agentSession._roomIO) == null ? void 0 : _a.localParticipant;
@@ -1589,44 +1630,15 @@ ${instructions}`;
       );
       return;
     }
-    const functionToolsExecutedEvent = createFunctionToolsExecutedEvent({
-      functionCalls: [],
-      functionCallOutputs: []
-    });
-    let shouldGenerateToolReply = false;
-    let newAgentTask = null;
-    let ignoreTaskSwitch = false;
-    for (const sanitizedOut of toolOutput.output) {
-      if (sanitizedOut.toolCallOutput !== void 0) {
-        functionToolsExecutedEvent.functionCallOutputs.push(sanitizedOut.toolCallOutput);
-        if (sanitizedOut.replyRequired) {
-          shouldGenerateToolReply = true;
-        }
-      }
-      if (newAgentTask !== null && sanitizedOut.agentTask !== void 0) {
-        this.logger.error("expected to receive only one agent task from the tool executions");
-        ignoreTaskSwitch = true;
-      }
-      newAgentTask = sanitizedOut.agentTask ?? null;
-      this.logger.debug(
-        {
-          speechId: speechHandle.id,
-          name: (_b = sanitizedOut.toolCall) == null ? void 0 : _b.name,
-          args: sanitizedOut.toolCall.args,
-          output: (_c = sanitizedOut.toolCallOutput) == null ? void 0 : _c.output,
-          isError: (_d = sanitizedOut.toolCallOutput) == null ? void 0 : _d.isError
-        },
-        "Tool call execution finished"
-      );
-    }
+    const { functionToolsExecutedEvent, shouldGenerateToolReply, newAgentTask, ignoreTaskSwitch } = this.summarizeToolExecutionOutput(toolOutput, speechHandle);
     this.agentSession.emit(
       AgentSessionEventTypes.FunctionToolsExecuted,
       functionToolsExecutedEvent
     );
-    let draining = this.draining;
+    let schedulingPaused = this.schedulingPaused;
     if (!ignoreTaskSwitch && newAgentTask !== null) {
       this.agentSession.updateAgent(newAgentTask);
-      draining = true;
+      schedulingPaused = true;
     }
     if (functionToolsExecutedEvent.functionCallOutputs.length > 0) {
       while (this.currentSpeech || this.speechQueue.size() > 0) {
@@ -1667,20 +1679,58 @@ ${instructions}`;
         speechHandle: replySpeechHandle
       })
     );
-    const toolChoice = draining || modelSettings.toolChoice === "none" ? "none" : "auto";
+    const toolChoice = schedulingPaused || modelSettings.toolChoice === "none" ? "none" : "auto";
     this.createSpeechTask({
-      task: Task.from(
-        (abortController) => this.realtimeReplyTask({
-          speechHandle: replySpeechHandle,
-          modelSettings: { toolChoice },
-          abortController
-        })
-      ),
+      taskFn: (abortController) => this.realtimeReplyTask({
+        speechHandle: replySpeechHandle,
+        modelSettings: { toolChoice },
+        abortController
+      }),
       ownedSpeechHandle: replySpeechHandle,
       name: "AgentActivity.realtime_reply"
     });
     this.scheduleSpeech(replySpeechHandle, SpeechHandle.SPEECH_PRIORITY_NORMAL, true);
   }
+  summarizeToolExecutionOutput(toolOutput, speechHandle) {
+    var _a, _b, _c;
+    const functionToolsExecutedEvent = createFunctionToolsExecutedEvent({
+      functionCalls: [],
+      functionCallOutputs: []
+    });
+    let shouldGenerateToolReply = false;
+    let newAgentTask = null;
+    let ignoreTaskSwitch = false;
+    for (const sanitizedOut of toolOutput.output) {
+      if (sanitizedOut.toolCallOutput !== void 0) {
+        functionToolsExecutedEvent.functionCalls.push(sanitizedOut.toolCall);
+        functionToolsExecutedEvent.functionCallOutputs.push(sanitizedOut.toolCallOutput);
+        if (sanitizedOut.replyRequired) {
+          shouldGenerateToolReply = true;
+        }
+      }
+      if (newAgentTask !== null && sanitizedOut.agentTask !== void 0) {
+        this.logger.error("expected to receive only one agent task from the tool executions");
+        ignoreTaskSwitch = true;
+      }
+      newAgentTask = sanitizedOut.agentTask ?? null;
+      this.logger.debug(
+        {
+          speechId: speechHandle.id,
+          name: (_a = sanitizedOut.toolCall) == null ? void 0 : _a.name,
+          args: sanitizedOut.toolCall.args,
+          output: (_b = sanitizedOut.toolCallOutput) == null ? void 0 : _b.output,
+          isError: (_c = sanitizedOut.toolCallOutput) == null ? void 0 : _c.isError
+        },
+        "Tool call execution finished"
+      );
+    }
+    return {
+      functionToolsExecutedEvent,
+      shouldGenerateToolReply,
+      newAgentTask,
+      ignoreTaskSwitch
+    };
+  }
   async realtimeReplyTask({
     speechHandle,
     modelSettings: { toolChoice },
@@ -1722,13 +1772,45 @@ ${instructions}`;
     }
   }
   scheduleSpeech(speechHandle, priority, force = false) {
-    if (this.draining && !force) {
-      throw new Error("cannot schedule new speech, the agent is draining");
+    if (this.schedulingPaused && !force) {
+      throw new Error("cannot schedule new speech, the speech scheduling is draining/pausing");
     }
     this.speechQueue.push([priority, Number(process.hrtime.bigint()), speechHandle]);
     speechHandle._markScheduled();
     this.wakeupMainTask();
   }
+  async _pauseSchedulingTask(blockedTasks) {
+    if (this._schedulingPaused) return;
+    this._schedulingPaused = true;
+    this._drainBlockedTasks = blockedTasks;
+    this.wakeupMainTask();
+    if (this._mainTask) {
+      await this._mainTask.result;
+    }
+  }
+  _resumeSchedulingTask() {
+    if (!this._schedulingPaused) return;
+    this._schedulingPaused = false;
+    this._mainTask = Task.from(({ signal }) => this.mainTask(signal));
+  }
+  async pause(options = {}) {
+    const { blockedTasks = [] } = options;
+    const unlock = await this.lock.lock();
+    try {
+      const span = tracer.startSpan({
+        name: "pause_agent_activity",
+        attributes: { [traceTypes.ATTR_AGENT_LABEL]: this.agent.id }
+      });
+      try {
+        await this._pauseSchedulingTask(blockedTasks);
+        await this._closeSessionResources();
+      } finally {
+        span.end();
+      }
+    } finally {
+      unlock();
+    }
+  }
   async drain() {
     return tracer.startActiveSpan(async (span) => this._drainImpl(span), {
       name: "drain_agent_activity",
@@ -1736,71 +1818,79 @@ ${instructions}`;
     });
   }
   async _drainImpl(span) {
-    var _a;
     span.setAttribute(traceTypes.ATTR_AGENT_LABEL, this.agent.id);
     const unlock = await this.lock.lock();
     try {
-      if (this._draining) return;
-      this.cancelPreemptiveGeneration();
-      const onExitTask = tracer.startActiveSpan(async () => this.agent.onExit(), {
-        name: "on_exit",
-        attributes: { [traceTypes.ATTR_AGENT_LABEL]: this.agent.id }
-      });
-      this.createSpeechTask({
-        task: Task.from(() => onExitTask),
+      if (this._schedulingPaused) return;
+      this._onExitTask = this.createSpeechTask({
+        taskFn: () => tracer.startActiveSpan(async () => this.agent.onExit(), {
+          name: "on_exit",
+          attributes: { [traceTypes.ATTR_AGENT_LABEL]: this.agent.id }
+        }),
+        inlineTask: true,
         name: "AgentActivity_onExit"
       });
-      this.wakeupMainTask();
-      this._draining = true;
-      await ((_a = this._mainTask) == null ? void 0 : _a.result);
+      this.cancelPreemptiveGeneration();
+      await this._onExitTask.result;
+      await this._pauseSchedulingTask([]);
     } finally {
       unlock();
     }
   }
   async close() {
-    var _a, _b, _c, _d;
     const unlock = await this.lock.lock();
     try {
-      if (!this._draining) {
-        this.logger.warn("task closing without draining");
-      }
       this.cancelPreemptiveGeneration();
-      if (this.llm instanceof LLM) {
-        this.llm.off("metrics_collected", this.onMetricsCollected);
-      }
-      if (this.realtimeSession) {
-        this.realtimeSession.off("generation_created", this.onGenerationCreated);
-        this.realtimeSession.off("input_speech_started", this.onInputSpeechStarted);
-        this.realtimeSession.off("input_speech_stopped", this.onInputSpeechStopped);
-        this.realtimeSession.off(
-          "input_audio_transcription_completed",
-          this.onInputAudioTranscriptionCompleted
-        );
-        this.realtimeSession.off("metrics_collected", this.onMetricsCollected);
-      }
-      if (this.stt instanceof STT) {
-        this.stt.off("metrics_collected", this.onMetricsCollected);
+      await this._closeSessionResources();
+      if (this._mainTask) {
+        await this._mainTask.cancelAndWait();
       }
-      if (this.tts instanceof TTS) {
-        this.tts.off("metrics_collected", this.onMetricsCollected);
-      }
-      if (this.vad instanceof VAD) {
-        this.vad.off("metrics_collected", this.onMetricsCollected);
-      }
-      this.detachAudioInput();
-      (_a = this.realtimeSpans) == null ? void 0 : _a.clear();
-      await ((_b = this.realtimeSession) == null ? void 0 : _b.close());
-      await ((_c = this.audioRecognition) == null ? void 0 : _c.close());
-      await ((_d = this._mainTask) == null ? void 0 : _d.cancelAndWait());
+      this.agent._agentActivity = void 0;
     } finally {
       unlock();
     }
   }
+  async _closeSessionResources() {
+    var _a, _b, _c;
+    if (this.llm instanceof LLM) {
+      this.llm.off("metrics_collected", this.onMetricsCollected);
+      this.llm.off("error", this.onModelError);
+    }
+    if (this.realtimeSession) {
+      this.realtimeSession.off("generation_created", this.onRealtimeGenerationCreated);
+      this.realtimeSession.off("input_speech_started", this.onRealtimeInputSpeechStarted);
+      this.realtimeSession.off("input_speech_stopped", this.onRealtimeInputSpeechStopped);
+      this.realtimeSession.off(
+        "input_audio_transcription_completed",
+        this.onRealtimeInputAudioTranscriptionCompleted
+      );
+      this.realtimeSession.off("metrics_collected", this.onMetricsCollected);
+      this.realtimeSession.off("error", this.onModelError);
+    }
+    if (this.stt instanceof STT) {
+      this.stt.off("metrics_collected", this.onMetricsCollected);
+      this.stt.off("error", this.onModelError);
+    }
+    if (this.tts instanceof TTS) {
+      this.tts.off("metrics_collected", this.onMetricsCollected);
+      this.tts.off("error", this.onModelError);
+    }
+    if (this.vad instanceof VAD) {
+      this.vad.off("metrics_collected", this.onMetricsCollected);
+    }
+    this.detachAudioInput();
+    (_a = this.realtimeSpans) == null ? void 0 : _a.clear();
+    await ((_b = this.realtimeSession) == null ? void 0 : _b.close());
+    await ((_c = this.audioRecognition) == null ? void 0 : _c.close());
+    this.realtimeSession = void 0;
+    this.audioRecognition = void 0;
+  }
 }
 function toOaiToolChoice(toolChoice) {
   return toolChoice !== null ? toolChoice : void 0;
 }
 export {
-  AgentActivity
+  AgentActivity,
+  agentActivityStorage
 };
 //# sourceMappingURL=agent_activity.js.map