@livekit/agents 1.0.47 → 1.0.49

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (151) hide show
  1. package/dist/beta/index.cjs +29 -0
  2. package/dist/beta/index.cjs.map +1 -0
  3. package/dist/beta/index.d.cts +2 -0
  4. package/dist/beta/index.d.ts +2 -0
  5. package/dist/beta/index.d.ts.map +1 -0
  6. package/dist/beta/index.js +7 -0
  7. package/dist/beta/index.js.map +1 -0
  8. package/dist/beta/workflows/index.cjs +29 -0
  9. package/dist/beta/workflows/index.cjs.map +1 -0
  10. package/dist/beta/workflows/index.d.cts +2 -0
  11. package/dist/beta/workflows/index.d.ts +2 -0
  12. package/dist/beta/workflows/index.d.ts.map +1 -0
  13. package/dist/beta/workflows/index.js +7 -0
  14. package/dist/beta/workflows/index.js.map +1 -0
  15. package/dist/beta/workflows/task_group.cjs +162 -0
  16. package/dist/beta/workflows/task_group.cjs.map +1 -0
  17. package/dist/beta/workflows/task_group.d.cts +32 -0
  18. package/dist/beta/workflows/task_group.d.ts +32 -0
  19. package/dist/beta/workflows/task_group.d.ts.map +1 -0
  20. package/dist/beta/workflows/task_group.js +138 -0
  21. package/dist/beta/workflows/task_group.js.map +1 -0
  22. package/dist/cpu.cjs +189 -0
  23. package/dist/cpu.cjs.map +1 -0
  24. package/dist/cpu.d.cts +24 -0
  25. package/dist/cpu.d.ts +24 -0
  26. package/dist/cpu.d.ts.map +1 -0
  27. package/dist/cpu.js +152 -0
  28. package/dist/cpu.js.map +1 -0
  29. package/dist/cpu.test.cjs +227 -0
  30. package/dist/cpu.test.cjs.map +1 -0
  31. package/dist/cpu.test.js +204 -0
  32. package/dist/cpu.test.js.map +1 -0
  33. package/dist/index.cjs +3 -0
  34. package/dist/index.cjs.map +1 -1
  35. package/dist/index.d.cts +2 -1
  36. package/dist/index.d.ts +2 -1
  37. package/dist/index.d.ts.map +1 -1
  38. package/dist/index.js +2 -0
  39. package/dist/index.js.map +1 -1
  40. package/dist/inference/api_protos.d.cts +59 -59
  41. package/dist/inference/api_protos.d.ts +59 -59
  42. package/dist/inference/llm.cjs.map +1 -1
  43. package/dist/inference/llm.d.cts +1 -1
  44. package/dist/inference/llm.d.ts +1 -1
  45. package/dist/inference/llm.d.ts.map +1 -1
  46. package/dist/inference/llm.js.map +1 -1
  47. package/dist/inference/tts.cjs.map +1 -1
  48. package/dist/inference/tts.d.cts +6 -0
  49. package/dist/inference/tts.d.ts +6 -0
  50. package/dist/inference/tts.d.ts.map +1 -1
  51. package/dist/inference/tts.js.map +1 -1
  52. package/dist/llm/chat_context.cjs +89 -1
  53. package/dist/llm/chat_context.cjs.map +1 -1
  54. package/dist/llm/chat_context.d.cts +10 -1
  55. package/dist/llm/chat_context.d.ts +10 -1
  56. package/dist/llm/chat_context.d.ts.map +1 -1
  57. package/dist/llm/chat_context.js +89 -1
  58. package/dist/llm/chat_context.js.map +1 -1
  59. package/dist/llm/chat_context.test.cjs +43 -0
  60. package/dist/llm/chat_context.test.cjs.map +1 -1
  61. package/dist/llm/chat_context.test.js +43 -0
  62. package/dist/llm/chat_context.test.js.map +1 -1
  63. package/dist/llm/index.cjs +2 -0
  64. package/dist/llm/index.cjs.map +1 -1
  65. package/dist/llm/index.d.cts +1 -1
  66. package/dist/llm/index.d.ts +1 -1
  67. package/dist/llm/index.d.ts.map +1 -1
  68. package/dist/llm/index.js +3 -1
  69. package/dist/llm/index.js.map +1 -1
  70. package/dist/llm/provider_format/index.d.cts +1 -1
  71. package/dist/llm/provider_format/index.d.ts +1 -1
  72. package/dist/llm/tool_context.cjs +7 -0
  73. package/dist/llm/tool_context.cjs.map +1 -1
  74. package/dist/llm/tool_context.d.cts +10 -2
  75. package/dist/llm/tool_context.d.ts +10 -2
  76. package/dist/llm/tool_context.d.ts.map +1 -1
  77. package/dist/llm/tool_context.js +6 -0
  78. package/dist/llm/tool_context.js.map +1 -1
  79. package/dist/utils.cjs +1 -0
  80. package/dist/utils.cjs.map +1 -1
  81. package/dist/utils.d.ts.map +1 -1
  82. package/dist/utils.js +1 -0
  83. package/dist/utils.js.map +1 -1
  84. package/dist/version.cjs +1 -1
  85. package/dist/version.js +1 -1
  86. package/dist/voice/agent.cjs +9 -0
  87. package/dist/voice/agent.cjs.map +1 -1
  88. package/dist/voice/agent.d.cts +1 -0
  89. package/dist/voice/agent.d.ts +1 -0
  90. package/dist/voice/agent.d.ts.map +1 -1
  91. package/dist/voice/agent.js +9 -0
  92. package/dist/voice/agent.js.map +1 -1
  93. package/dist/voice/agent_activity.cjs +67 -16
  94. package/dist/voice/agent_activity.cjs.map +1 -1
  95. package/dist/voice/agent_activity.d.cts +7 -0
  96. package/dist/voice/agent_activity.d.ts +7 -0
  97. package/dist/voice/agent_activity.d.ts.map +1 -1
  98. package/dist/voice/agent_activity.js +68 -17
  99. package/dist/voice/agent_activity.js.map +1 -1
  100. package/dist/voice/agent_session.cjs +27 -1
  101. package/dist/voice/agent_session.cjs.map +1 -1
  102. package/dist/voice/agent_session.d.cts +6 -0
  103. package/dist/voice/agent_session.d.ts +6 -0
  104. package/dist/voice/agent_session.d.ts.map +1 -1
  105. package/dist/voice/agent_session.js +27 -1
  106. package/dist/voice/agent_session.js.map +1 -1
  107. package/dist/voice/room_io/room_io.cjs +11 -2
  108. package/dist/voice/room_io/room_io.cjs.map +1 -1
  109. package/dist/voice/room_io/room_io.d.ts.map +1 -1
  110. package/dist/voice/room_io/room_io.js +12 -3
  111. package/dist/voice/room_io/room_io.js.map +1 -1
  112. package/dist/voice/testing/fake_llm.cjs +127 -0
  113. package/dist/voice/testing/fake_llm.cjs.map +1 -0
  114. package/dist/voice/testing/fake_llm.d.cts +30 -0
  115. package/dist/voice/testing/fake_llm.d.ts +30 -0
  116. package/dist/voice/testing/fake_llm.d.ts.map +1 -0
  117. package/dist/voice/testing/fake_llm.js +103 -0
  118. package/dist/voice/testing/fake_llm.js.map +1 -0
  119. package/dist/voice/testing/index.cjs +3 -0
  120. package/dist/voice/testing/index.cjs.map +1 -1
  121. package/dist/voice/testing/index.d.cts +1 -0
  122. package/dist/voice/testing/index.d.ts +1 -0
  123. package/dist/voice/testing/index.d.ts.map +1 -1
  124. package/dist/voice/testing/index.js +2 -0
  125. package/dist/voice/testing/index.js.map +1 -1
  126. package/dist/worker.cjs +6 -29
  127. package/dist/worker.cjs.map +1 -1
  128. package/dist/worker.d.ts.map +1 -1
  129. package/dist/worker.js +6 -19
  130. package/dist/worker.js.map +1 -1
  131. package/package.json +1 -1
  132. package/src/beta/index.ts +9 -0
  133. package/src/beta/workflows/index.ts +9 -0
  134. package/src/beta/workflows/task_group.ts +194 -0
  135. package/src/cpu.test.ts +239 -0
  136. package/src/cpu.ts +173 -0
  137. package/src/index.ts +2 -1
  138. package/src/inference/llm.ts +2 -0
  139. package/src/inference/tts.ts +8 -1
  140. package/src/llm/chat_context.test.ts +48 -0
  141. package/src/llm/chat_context.ts +123 -0
  142. package/src/llm/index.ts +1 -0
  143. package/src/llm/tool_context.ts +14 -0
  144. package/src/utils.ts +5 -0
  145. package/src/voice/agent.ts +11 -0
  146. package/src/voice/agent_activity.ts +102 -16
  147. package/src/voice/agent_session.ts +33 -2
  148. package/src/voice/room_io/room_io.ts +14 -3
  149. package/src/voice/testing/fake_llm.ts +138 -0
  150. package/src/voice/testing/index.ts +2 -0
  151. package/src/worker.ts +34 -50
@@ -2,6 +2,8 @@
2
2
  //
3
3
  // SPDX-License-Identifier: Apache-2.0
4
4
  import { describe, expect, it } from 'vitest';
5
+ import { initializeLogger } from '../log.js';
6
+ import { FakeLLM } from '../voice/testing/fake_llm.js';
5
7
  import {
6
8
  type AudioContent,
7
9
  ChatContext,
@@ -13,6 +15,8 @@ import {
13
15
  ReadonlyChatContext,
14
16
  } from './chat_context.js';
15
17
 
18
+ initializeLogger({ pretty: false, level: 'error' });
19
+
16
20
  describe('ChatContext.toJSON', () => {
17
21
  it('should match snapshot for empty context', () => {
18
22
  const context = new ChatContext();
@@ -283,6 +287,50 @@ describe('ChatContext.toJSON', () => {
283
287
  });
284
288
  });
285
289
 
290
+ describe('ChatContext._summarize', () => {
291
+ it('keeps chronological timestamps with summary + tail', async () => {
292
+ const ctx = new ChatContext();
293
+ ctx.addMessage({ role: 'system', content: 'System prompt', createdAt: 0 });
294
+ ctx.addMessage({ role: 'user', content: 'hello', createdAt: 1000 });
295
+ ctx.addMessage({ role: 'assistant', content: 'hi there', createdAt: 2000 });
296
+ ctx.insert(
297
+ new FunctionCallOutput({
298
+ callId: 'call_1',
299
+ name: 'lookup',
300
+ output: '{"ok":true}',
301
+ isError: false,
302
+ createdAt: 3500,
303
+ }),
304
+ );
305
+ ctx.addMessage({ role: 'user', content: 'my color is blue', createdAt: 3000 });
306
+ ctx.addMessage({ role: 'assistant', content: 'noted', createdAt: 4000 });
307
+
308
+ const fake = new FakeLLM([
309
+ {
310
+ input: 'Conversation to summarize:\n\nuser: hello\nassistant: hi there',
311
+ content: 'condensed head',
312
+ },
313
+ ]);
314
+
315
+ await ctx._summarize(fake, { keepLastTurns: 1 });
316
+
317
+ const summary = ctx.items.find(
318
+ (item) =>
319
+ item.type === 'message' && item.role === 'assistant' && item.extra?.is_summary === true,
320
+ );
321
+ expect(summary).toBeDefined();
322
+ if (!summary || summary.type !== 'message') {
323
+ throw new Error('summary message is missing');
324
+ }
325
+
326
+ expect(summary.createdAt).toBeCloseTo(2999.999, 6);
327
+
328
+ const createdAts = ctx.items.map((item) => item.createdAt);
329
+ const sorted = [...createdAts].sort((a, b) => a - b);
330
+ expect(createdAts).toEqual(sorted);
331
+ });
332
+ });
333
+
286
334
  describe('ReadonlyChatContext with immutable array', () => {
287
335
  it('should have readonly property set to true', () => {
288
336
  const items: ChatItem[] = [
@@ -3,6 +3,7 @@
3
3
  // SPDX-License-Identifier: Apache-2.0
4
4
  import type { AudioFrame, VideoFrame } from '@livekit/rtc-node';
5
5
  import { createImmutableArray, shortuuid } from '../utils.js';
6
+ import type { LLM } from './llm.js';
6
7
  import { type ProviderFormat, toChatCtx } from './provider_format/index.js';
7
8
  import type { JSONObject, JSONValue, ToolContext } from './tool_context.js';
8
9
 
@@ -95,12 +96,15 @@ export class ChatMessage {
95
96
 
96
97
  createdAt: number;
97
98
 
99
+ extra: Record<string, unknown>;
100
+
98
101
  constructor(params: {
99
102
  role: ChatRole;
100
103
  content: ChatContent[] | string;
101
104
  id?: string;
102
105
  interrupted?: boolean;
103
106
  createdAt?: number;
107
+ extra?: Record<string, unknown>;
104
108
  }) {
105
109
  const {
106
110
  role,
@@ -108,12 +112,14 @@ export class ChatMessage {
108
112
  id = shortuuid('item_'),
109
113
  interrupted = false,
110
114
  createdAt = Date.now(),
115
+ extra = {},
111
116
  } = params;
112
117
  this.id = id;
113
118
  this.role = role;
114
119
  this.content = Array.isArray(content) ? content : [content];
115
120
  this.interrupted = interrupted;
116
121
  this.createdAt = createdAt;
122
+ this.extra = extra;
117
123
  }
118
124
 
119
125
  static create(params: {
@@ -122,6 +128,7 @@ export class ChatMessage {
122
128
  id?: string;
123
129
  interrupted?: boolean;
124
130
  createdAt?: number;
131
+ extra?: Record<string, unknown>;
125
132
  }) {
126
133
  return new ChatMessage(params);
127
134
  }
@@ -401,6 +408,7 @@ export class AgentHandoffItem {
401
408
  }
402
409
  }
403
410
 
411
+ // TODO(parity): Add AgentConfigUpdate type to ChatItem union
404
412
  export type ChatItem = ChatMessage | FunctionCall | FunctionCallOutput | AgentHandoffItem;
405
413
 
406
414
  export class ChatContext {
@@ -431,6 +439,7 @@ export class ChatContext {
431
439
  id?: string;
432
440
  interrupted?: boolean;
433
441
  createdAt?: number;
442
+ extra?: Record<string, unknown>;
434
443
  }): ChatMessage {
435
444
  const msg = new ChatMessage(params);
436
445
  if (params.createdAt !== undefined) {
@@ -463,11 +472,13 @@ export class ChatContext {
463
472
  return idx !== -1 ? idx : undefined;
464
473
  }
465
474
 
475
+ // TODO(parity): Add excludeConfigUpdate option when AgentConfigUpdate is ported
466
476
  copy(
467
477
  options: {
468
478
  excludeFunctionCall?: boolean;
469
479
  excludeInstructions?: boolean;
470
480
  excludeEmptyMessage?: boolean;
481
+ excludeHandoff?: boolean;
471
482
  toolCtx?: ToolContext<any>; // eslint-disable-line @typescript-eslint/no-explicit-any
472
483
  } = {},
473
484
  ): ChatContext {
@@ -475,6 +486,7 @@ export class ChatContext {
475
486
  excludeFunctionCall = false,
476
487
  excludeInstructions = false,
477
488
  excludeEmptyMessage = false,
489
+ excludeHandoff = false,
478
490
  toolCtx,
479
491
  } = options;
480
492
  const items: ChatItem[] = [];
@@ -500,6 +512,10 @@ export class ChatContext {
500
512
  continue;
501
513
  }
502
514
 
515
+ if (excludeHandoff && item.type === 'agent_handoff') {
516
+ continue;
517
+ }
518
+
503
519
  if (toolCtx !== undefined && isToolCallOrOutput(item) && toolCtx[item.name] === undefined) {
504
520
  continue;
505
521
  }
@@ -510,6 +526,7 @@ export class ChatContext {
510
526
  return new ChatContext(items);
511
527
  }
512
528
 
529
+ // TODO(parity): Add excludeConfigUpdate option when AgentConfigUpdate is ported
513
530
  merge(
514
531
  other: ChatContext,
515
532
  options: {
@@ -762,6 +779,112 @@ export class ChatContext {
762
779
  return true;
763
780
  }
764
781
 
782
+ async _summarize(llm: LLM, options: { keepLastTurns?: number } = {}): Promise<ChatContext> {
783
+ const { keepLastTurns = 2 } = options;
784
+
785
+ const toSummarize: ChatMessage[] = [];
786
+ for (const item of this._items) {
787
+ if (item.type !== 'message') continue;
788
+ if (item.role !== 'user' && item.role !== 'assistant') continue;
789
+ if (item.extra?.is_summary === true) continue;
790
+
791
+ const text = (item.textContent ?? '').trim();
792
+ if (text) {
793
+ toSummarize.push(item);
794
+ }
795
+ }
796
+
797
+ if (toSummarize.length === 0) {
798
+ return this;
799
+ }
800
+
801
+ const tailN = Math.max(0, Math.min(toSummarize.length, keepLastTurns * 2));
802
+ let head: ChatMessage[];
803
+ let tail: ChatMessage[];
804
+ if (tailN === 0) {
805
+ head = toSummarize;
806
+ tail = [];
807
+ } else {
808
+ head = toSummarize.slice(0, -tailN);
809
+ tail = toSummarize.slice(-tailN);
810
+ }
811
+
812
+ if (head.length === 0) {
813
+ return this;
814
+ }
815
+
816
+ const sourceText = head
817
+ .map((m) => `${m.role}: ${(m.textContent ?? '').trim()}`)
818
+ .join('\n')
819
+ .trim();
820
+
821
+ if (!sourceText) {
822
+ return this;
823
+ }
824
+
825
+ // TODO: refactor this into LLMStream.collect API.
826
+ const promptCtx = new ChatContext();
827
+ promptCtx.addMessage({
828
+ role: 'system',
829
+ content:
830
+ 'Compress older chat history into a short, faithful summary.\n' +
831
+ 'Focus on user goals, constraints, decisions, key facts/preferences/entities, and pending tasks.\n' +
832
+ 'Exclude chit-chat and greetings. Be concise.',
833
+ });
834
+ promptCtx.addMessage({
835
+ role: 'user',
836
+ content: `Conversation to summarize:\n\n${sourceText}`,
837
+ });
838
+
839
+ const chunks: string[] = [];
840
+ for await (const chunk of llm.chat({ chatCtx: promptCtx })) {
841
+ if (chunk.delta?.content) {
842
+ chunks.push(chunk.delta.content);
843
+ }
844
+ }
845
+
846
+ const summary = chunks.join('').trim();
847
+ if (!summary) {
848
+ return this;
849
+ }
850
+
851
+ const tailStartTs = tail.length > 0 ? tail[0]!.createdAt : Infinity;
852
+
853
+ const preserved: ChatItem[] = [];
854
+ for (const it of this._items) {
855
+ if (
856
+ (it.type === 'function_call' || it.type === 'function_call_output') &&
857
+ it.createdAt < tailStartTs
858
+ ) {
859
+ continue;
860
+ }
861
+
862
+ if (it.type === 'message' && (it.role === 'user' || it.role === 'assistant')) {
863
+ continue;
864
+ }
865
+
866
+ preserved.push(it);
867
+ }
868
+
869
+ this._items = preserved;
870
+
871
+ const createdAtHint =
872
+ tail.length > 0 ? tail[0]!.createdAt - 1e-3 : head[head.length - 1]!.createdAt + 1e-3;
873
+
874
+ this.addMessage({
875
+ role: 'assistant',
876
+ content: `[history summary]\n${summary}`,
877
+ createdAt: createdAtHint,
878
+ extra: { is_summary: true },
879
+ });
880
+
881
+ for (const msg of tail) {
882
+ this.insert(msg);
883
+ }
884
+
885
+ return this;
886
+ }
887
+
765
888
  /**
766
889
  * Indicates whether the context is read-only
767
890
  */
package/src/llm/index.ts CHANGED
@@ -6,6 +6,7 @@ export {
6
6
  isFunctionTool,
7
7
  tool,
8
8
  ToolError,
9
+ ToolFlag,
9
10
  type AgentHandoff,
10
11
  type FunctionTool,
11
12
  type ProviderDefinedTool,
@@ -80,6 +80,13 @@ export class ToolError extends Error {
80
80
  }
81
81
  }
82
82
 
83
+ export const ToolFlag = {
84
+ NONE: 0,
85
+ IGNORE_ON_ENTER: 1 << 0,
86
+ } as const;
87
+
88
+ export type ToolFlag = (typeof ToolFlag)[keyof typeof ToolFlag];
89
+
83
90
  export interface AgentHandoff {
84
91
  /**
85
92
  * The agent to handoff to.
@@ -178,6 +185,8 @@ export interface FunctionTool<
178
185
  */
179
186
  execute: ToolExecuteFunction<Parameters, UserData, Result>;
180
187
 
188
+ flags: number;
189
+
181
190
  [FUNCTION_TOOL_SYMBOL]: true;
182
191
  }
183
192
 
@@ -242,10 +251,12 @@ export function tool<
242
251
  description,
243
252
  parameters,
244
253
  execute,
254
+ flags,
245
255
  }: {
246
256
  description: string;
247
257
  parameters: Schema;
248
258
  execute: ToolExecuteFunction<InferToolInput<Schema>, UserData, Result>;
259
+ flags?: number;
249
260
  }): FunctionTool<InferToolInput<Schema>, UserData, Result>;
250
261
 
251
262
  /**
@@ -254,10 +265,12 @@ export function tool<
254
265
  export function tool<UserData = UnknownUserData, Result = unknown>({
255
266
  description,
256
267
  execute,
268
+ flags,
257
269
  }: {
258
270
  description: string;
259
271
  parameters?: never;
260
272
  execute: ToolExecuteFunction<Record<string, never>, UserData, Result>;
273
+ flags?: number;
261
274
  }): FunctionTool<Record<string, never>, UserData, Result>;
262
275
 
263
276
  /**
@@ -295,6 +308,7 @@ export function tool(tool: any): any {
295
308
  description: tool.description,
296
309
  parameters,
297
310
  execute: tool.execute,
311
+ flags: tool.flags ?? ToolFlag.NONE,
298
312
  [TOOL_SYMBOL]: true,
299
313
  [FUNCTION_TOOL_SYMBOL]: true,
300
314
  };
package/src/utils.ts CHANGED
@@ -173,6 +173,11 @@ export class Future<T = void> {
173
173
  this.#rejected = true;
174
174
  this.#error = error;
175
175
  this.#rejectPromise(error);
176
+ // Python calls Future.exception() right after set_exception() to silence
177
+ // "exception was never retrieved" warnings. In JS, consume the rejection
178
+ // immediately so Node does not emit unhandled-rejection noise before a
179
+ // later await/catch observes it.
180
+ void this.#await.catch(() => undefined);
176
181
  }
177
182
  }
178
183
 
@@ -302,6 +302,17 @@ export class Agent<UserData = any> {
302
302
  this._agentActivity.updateChatCtx(chatCtx);
303
303
  }
304
304
 
305
+ // TODO(parity): Add when AgentConfigUpdate is ported to ChatContext.
306
+ async updateTools(tools: ToolContext): Promise<void> {
307
+ if (!this._agentActivity) {
308
+ this._tools = { ...tools };
309
+ this._chatCtx = this._chatCtx.copy({ toolCtx: this._tools });
310
+ return;
311
+ }
312
+
313
+ await this._agentActivity.updateTools(tools);
314
+ }
315
+
305
316
  static default = {
306
317
  async sttNode(
307
318
  agent: Agent,
@@ -7,7 +7,7 @@ import type { Span } from '@opentelemetry/api';
7
7
  import { ROOT_CONTEXT, context as otelContext, trace } from '@opentelemetry/api';
8
8
  import { Heap } from 'heap-js';
9
9
  import { AsyncLocalStorage } from 'node:async_hooks';
10
- import { ReadableStream } from 'node:stream/web';
10
+ import { ReadableStream, TransformStream } from 'node:stream/web';
11
11
  import { type ChatContext, ChatMessage } from '../llm/chat_context.js';
12
12
  import {
13
13
  type ChatItem,
@@ -23,6 +23,7 @@ import {
23
23
  type RealtimeSession,
24
24
  type ToolChoice,
25
25
  type ToolContext,
26
+ ToolFlag,
26
27
  } from '../llm/index.js';
27
28
  import type { LLMError } from '../llm/llm.js';
28
29
  import { isSameToolChoice, isSameToolContext } from '../llm/tool_context.js';
@@ -83,6 +84,12 @@ import { SpeechHandle } from './speech_handle.js';
83
84
  import { setParticipantSpanAttributes } from './utils.js';
84
85
 
85
86
  export const agentActivityStorage = new AsyncLocalStorage<AgentActivity>();
87
+ export const onEnterStorage = new AsyncLocalStorage<OnEnterData>();
88
+
89
+ interface OnEnterData {
90
+ session: AgentSession;
91
+ agent: Agent;
92
+ }
86
93
 
87
94
  interface PreemptiveGeneration {
88
95
  speechHandle: SpeechHandle;
@@ -312,6 +319,8 @@ export class AgentActivity implements RecognitionHooks {
312
319
  }
313
320
  }
314
321
 
322
+ // TODO(parity): Record initial AgentConfigUpdate in chat context
323
+
315
324
  // metrics and error handling
316
325
  if (this.llm instanceof LLM) {
317
326
  this.llm.on('metrics_collected', this.onMetricsCollected);
@@ -354,11 +363,13 @@ export class AgentActivity implements RecognitionHooks {
354
363
  if (runOnEnter) {
355
364
  this._onEnterTask = this.createSpeechTask({
356
365
  taskFn: () =>
357
- tracer.startActiveSpan(async () => this.agent.onEnter(), {
358
- name: 'on_enter',
359
- context: trace.setSpan(ROOT_CONTEXT, startSpan),
360
- attributes: { [traceTypes.ATTR_AGENT_LABEL]: this.agent.id },
361
- }),
366
+ onEnterStorage.run({ session: this.agentSession, agent: this.agent }, () =>
367
+ tracer.startActiveSpan(async () => this.agent.onEnter(), {
368
+ name: 'on_enter',
369
+ context: trace.setSpan(ROOT_CONTEXT, startSpan),
370
+ attributes: { [traceTypes.ATTR_AGENT_LABEL]: this.agent.id },
371
+ }),
372
+ ),
362
373
  inlineTask: true,
363
374
  name: 'AgentActivity_onEnter',
364
375
  });
@@ -446,6 +457,20 @@ export class AgentActivity implements RecognitionHooks {
446
457
  }
447
458
  }
448
459
 
460
+ // TODO: Add when AgentConfigUpdate is ported to ChatContext.
461
+ async updateTools(tools: ToolContext): Promise<void> {
462
+ this.agent._tools = { ...tools };
463
+
464
+ if (this.realtimeSession) {
465
+ await this.realtimeSession.updateTools(tools);
466
+ }
467
+
468
+ if (this.llm instanceof LLM) {
469
  + // for realtime LLM, we assume the server will remove invalid tool messages
470
+ await this.updateChatCtx(this.agent._chatCtx.copy({ toolCtx: tools }));
471
+ }
472
+ }
473
+
449
474
  updateOptions({ toolChoice }: { toolChoice?: ToolChoice | null }): void {
450
475
  if (toolChoice !== undefined) {
451
476
  this.toolChoice = toolChoice;
@@ -460,15 +485,36 @@ export class AgentActivity implements RecognitionHooks {
460
485
  void this.audioStream.close();
461
486
  this.audioStream = new MultiInputStream<AudioFrame>();
462
487
 
488
+ // Filter is applied on this.audioStream.stream (downstream of MultiInputStream) rather
489
+ // than on the source audioStream via pipeThrough. pipeThrough locks its source stream, so
490
+ // if it were applied directly on audioStream, that lock would survive MultiInputStream.close()
491
+ // and make audioStream permanently locked for subsequent attachAudioInput calls (e.g. handoff).
492
+ const aecWarmupAudioFilter = new TransformStream<AudioFrame, AudioFrame>({
493
+ transform: (frame, controller) => {
494
+ const shouldDiscardForAecWarmup =
495
+ this.agentSession.agentState === 'speaking' && this.agentSession._aecWarmupRemaining > 0;
496
+ if (!shouldDiscardForAecWarmup) {
497
+ controller.enqueue(frame);
498
+ }
499
+ },
500
+ });
501
+
463
502
  this.audioStreamId = this.audioStream.addInputStream(audioStream);
464
- const [realtimeAudioStream, recognitionAudioStream] = this.audioStream.stream.tee();
465
503
 
466
- if (this.realtimeSession) {
504
+ if (this.realtimeSession && this.audioRecognition) {
505
+ const [realtimeAudioStream, recognitionAudioStream] = this.audioStream.stream
506
+ .pipeThrough(aecWarmupAudioFilter)
507
+ .tee();
467
508
  this.realtimeSession.setInputAudioStream(realtimeAudioStream);
468
- }
469
-
470
- if (this.audioRecognition) {
471
509
  this.audioRecognition.setInputAudioStream(recognitionAudioStream);
510
+ } else if (this.realtimeSession) {
511
+ this.realtimeSession.setInputAudioStream(
512
+ this.audioStream.stream.pipeThrough(aecWarmupAudioFilter),
513
+ );
514
+ } else if (this.audioRecognition) {
515
+ this.audioRecognition.setInputAudioStream(
516
+ this.audioStream.stream.pipeThrough(aecWarmupAudioFilter),
517
+ );
472
518
  }
473
519
  }
474
520
 
@@ -730,6 +776,11 @@ export class AgentActivity implements RecognitionHooks {
730
776
  }
731
777
 
732
778
  private interruptByAudioActivity(): void {
779
+ if (this.agentSession._aecWarmupRemaining > 0) {
780
+ // Disable interruption from audio activity while AEC warmup is active.
781
+ return;
782
+ }
783
+
733
784
  if (this.llm instanceof RealtimeModel && this.llm.capabilities.turnDetection) {
734
785
  // skip speech handle interruption if server side turn detection is enabled
735
786
  return;
@@ -1129,12 +1180,25 @@ export class AgentActivity implements RecognitionHooks {
1129
1180
  instructions = `${this.agent.instructions}\n${instructions}`;
1130
1181
  }
1131
1182
 
1183
+ // Filter out tools with IGNORE_ON_ENTER flag when generateReply is called inside onEnter
1184
+ const onEnterData = onEnterStorage.getStore();
1185
+ const shouldFilterTools =
1186
+ onEnterData?.agent === this.agent && onEnterData?.session === this.agentSession;
1187
+
1188
+ const tools = shouldFilterTools
1189
+ ? Object.fromEntries(
1190
+ Object.entries(this.agent.toolCtx).filter(
1191
+ ([, fnTool]) => !(fnTool.flags & ToolFlag.IGNORE_ON_ENTER),
1192
+ ),
1193
+ )
1194
+ : this.agent.toolCtx;
1195
+
1132
1196
  const task = this.createSpeechTask({
1133
1197
  taskFn: (abortController: AbortController) =>
1134
1198
  this.pipelineReplyTask(
1135
1199
  handle,
1136
1200
  chatCtx ?? this.agent.chatCtx,
1137
- this.agent.toolCtx,
1201
+ tools,
1138
1202
  {
1139
1203
  toolChoice: toOaiToolChoice(toolChoice !== undefined ? toolChoice : this.toolChoice),
1140
1204
  },
@@ -1172,7 +1236,24 @@ export class AgentActivity implements RecognitionHooks {
1172
1236
 
1173
1237
  this.realtimeSession?.interrupt();
1174
1238
 
1175
- if (currentSpeech === undefined) {
1239
+ if (force) {
1240
+ // Force-interrupt (used during shutdown): cancel all speech tasks so they
1241
+ // don't block on I/O that will never complete (e.g. audioOutput.waitForPlayout()
1242
+ // when the room is disconnected). Mark the current speech as done immediately
1243
+ // so the interrupt future resolves without waiting for tasks to finish.
1244
+ // Clear the queue so mainTask doesn't dequeue already-interrupted handles
1245
+ // and hang on _waitForGeneration() (the generation future created by
1246
+ // _authorizeGeneration would never resolve since _markDone is a no-op
1247
+ // once doneFut is already settled).
1248
+ for (const task of this.speechTasks) {
1249
+ task.cancel();
1250
+ }
1251
+ if (currentSpeech && !currentSpeech.done()) {
1252
+ currentSpeech._markDone();
1253
+ }
1254
+ this.speechQueue.clear();
1255
+ future.resolve();
1256
+ } else if (currentSpeech === undefined) {
1176
1257
  future.resolve();
1177
1258
  } else {
1178
1259
  currentSpeech.addDoneCallback(() => {
@@ -1680,9 +1761,7 @@ export class AgentActivity implements RecognitionHooks {
1680
1761
  }
1681
1762
 
1682
1763
  replyAbortController.abort();
1683
- await Promise.allSettled(
1684
- tasks.map((task) => task.cancelAndWait(AgentActivity.REPLY_TASK_CANCEL_TIMEOUT)),
1685
- );
1764
+ await cancelAndWait(tasks, AgentActivity.REPLY_TASK_CANCEL_TIMEOUT);
1686
1765
 
1687
1766
  let forwardedText = textOut?.text || '';
1688
1767
 
@@ -2511,6 +2590,13 @@ export class AgentActivity implements RecognitionHooks {
2511
2590
  const unlock = await this.lock.lock();
2512
2591
  try {
2513
2592
  this.cancelPreemptiveGeneration();
2593
+
2594
+ await cancelAndWait(Array.from(this.speechTasks), AgentActivity.REPLY_TASK_CANCEL_TIMEOUT);
2595
+
2596
+ if (this._currentSpeech && !this._currentSpeech.done()) {
2597
+ this._currentSpeech._markDone();
2598
+ }
2599
+
2514
2600
  await this._closeSessionResources();
2515
2601
 
2516
2602
  if (this._mainTask) {
@@ -77,6 +77,7 @@ export interface VoiceOptions {
77
77
  maxToolSteps: number;
78
78
  preemptiveGeneration: boolean;
79
79
  userAwayTimeout?: number | null;
80
+ aecWarmupDuration: number | null;
80
81
  useTtsAlignedTranscript: boolean;
81
82
  }
82
83
 
@@ -90,6 +91,7 @@ const defaultVoiceOptions: VoiceOptions = {
90
91
  maxToolSteps: 3,
91
92
  preemptiveGeneration: false,
92
93
  userAwayTimeout: 15.0,
94
+ aecWarmupDuration: 3000,
93
95
  useTtsAlignedTranscript: true,
94
96
  } as const;
95
97
 
@@ -158,6 +160,8 @@ export class AgentSession<
158
160
  private closingTask: Promise<void> | null = null;
159
161
  private userAwayTimer: NodeJS.Timeout | null = null;
160
162
 
163
+ private _aecWarmupTimer: NodeJS.Timeout | null = null;
164
+
161
165
  // Connection options for STT, LLM, and TTS
162
166
  private _connOptions: ResolvedSessionConnectOptions;
163
167
 
@@ -169,6 +173,9 @@ export class AgentSession<
169
173
  private userSpeakingSpan?: Span;
170
174
  private agentSpeakingSpan?: Span;
171
175
 
176
+ /** @internal */
177
+ _aecWarmupRemaining = 0;
178
+
172
179
  /** @internal */
173
180
  _recorderIO?: RecorderIO;
174
181
 
@@ -241,6 +248,7 @@ export class AgentSession<
241
248
  // This is the "global" chat context, it holds the entire conversation history
242
249
  this._chatCtx = ChatContext.empty();
243
250
  this.options = { ...defaultVoiceOptions, ...voiceOptions };
251
+ this._aecWarmupRemaining = this.options.aecWarmupDuration ?? 0;
244
252
 
245
253
  this._onUserInputTranscribed = this._onUserInputTranscribed.bind(this);
246
254
  this.on(AgentSessionEventTypes.UserInputTranscribed, this._onUserInputTranscribed);
@@ -774,7 +782,9 @@ export class AgentSession<
774
782
  if (this.closingTask) {
775
783
  return;
776
784
  }
777
- this.closeImpl(reason, error, drain);
785
+ this.closingTask = this.closeImpl(reason, error, drain).finally(() => {
786
+ this.closingTask = null;
787
+ });
778
788
  }
779
789
 
780
790
  /** @internal */
@@ -845,6 +855,14 @@ export class AgentSession<
845
855
  this.agentSpeakingSpan = undefined;
846
856
  }
847
857
 
858
+ if (state === 'speaking' && this._aecWarmupRemaining > 0 && this._aecWarmupTimer === null) {
859
+ this._aecWarmupTimer = setTimeout(() => this._onAecWarmupExpired(), this._aecWarmupRemaining);
860
+ this.logger.debug(
861
+ { warmupDurationMs: this._aecWarmupRemaining },
862
+ 'aec warmup active, disabling interruptions',
863
+ );
864
+ }
865
+
848
866
  const oldState = this._agentState;
849
867
  this._agentState = state;
850
868
 
@@ -938,6 +956,19 @@ export class AgentSession<
938
956
  }
939
957
  }
940
958
 
959
+ /** @internal */
960
+ _onAecWarmupExpired(): void {
961
+ if (this._aecWarmupRemaining > 0) {
962
+ this.logger.debug('aec warmup expired, re-enabling interruptions');
963
+ }
964
+
965
+ this._aecWarmupRemaining = 0;
966
+ if (this._aecWarmupTimer !== null) {
967
+ clearTimeout(this._aecWarmupTimer);
968
+ this._aecWarmupTimer = null;
969
+ }
970
+ }
971
+
941
972
  private _onUserInputTranscribed(ev: UserInputTranscribedEvent): void {
942
973
  if (this.userState === 'away' && ev.isFinal) {
943
974
  this.logger.debug('User returned from away state due to speech input');
@@ -969,6 +1000,7 @@ export class AgentSession<
969
1000
  }
970
1001
 
971
1002
  this._cancelUserAwayTimer();
1003
+ this._onAecWarmupExpired();
972
1004
  this.off(AgentSessionEventTypes.UserInputTranscribed, this._onUserInputTranscribed);
973
1005
 
974
1006
  if (this.activity) {
@@ -976,7 +1008,6 @@ export class AgentSession<
976
1008
  try {
977
1009
  await this.activity.interrupt({ force: true }).await;
978
1010
  } catch (error) {
979
- // Uninterruptible speech can throw during forced interruption.
980
1011
  this.logger.warn({ error }, 'Error interrupting activity');
981
1012
  }
982
1013
  }