npm - @livekit/agents - Versions diffs - 1.0.9 → 1.0.11 - Mend

@livekit/agents 1.0.9 → 1.0.11

This diff compares the contents of publicly available package versions that have been released to one of the supported registries. It is provided for informational purposes only and reflects the changes between those package versions as they appear in their respective public registries.

Files changed (69) hide show

package/dist/audio.cjs +3 -3
package/dist/audio.cjs.map +1 -1
package/dist/audio.d.cts +1 -1
package/dist/audio.d.ts +1 -1
package/dist/audio.d.ts.map +1 -1
package/dist/audio.js +2 -2
package/dist/audio.js.map +1 -1
package/dist/llm/llm.cjs +7 -4
package/dist/llm/llm.cjs.map +1 -1
package/dist/llm/llm.d.ts.map +1 -1
package/dist/llm/llm.js +7 -4
package/dist/llm/llm.js.map +1 -1
package/dist/metrics/base.cjs.map +1 -1
package/dist/metrics/base.d.cts +23 -18
package/dist/metrics/base.d.ts +23 -18
package/dist/metrics/base.d.ts.map +1 -1
package/dist/metrics/usage_collector.cjs +2 -2
package/dist/metrics/usage_collector.cjs.map +1 -1
package/dist/metrics/usage_collector.d.cts +1 -1
package/dist/metrics/usage_collector.d.ts +1 -1
package/dist/metrics/usage_collector.d.ts.map +1 -1
package/dist/metrics/usage_collector.js +2 -2
package/dist/metrics/usage_collector.js.map +1 -1
package/dist/metrics/utils.cjs +14 -7
package/dist/metrics/utils.cjs.map +1 -1
package/dist/metrics/utils.d.ts.map +1 -1
package/dist/metrics/utils.js +14 -7
package/dist/metrics/utils.js.map +1 -1
package/dist/stt/stt.cjs +5 -5
package/dist/stt/stt.cjs.map +1 -1
package/dist/stt/stt.js +6 -6
package/dist/stt/stt.js.map +1 -1
package/dist/tts/tts.cjs +11 -10
package/dist/tts/tts.cjs.map +1 -1
package/dist/tts/tts.d.ts.map +1 -1
package/dist/tts/tts.js +11 -10
package/dist/tts/tts.js.map +1 -1
package/dist/vad.cjs +5 -5
package/dist/vad.cjs.map +1 -1
package/dist/vad.js +5 -5
package/dist/vad.js.map +1 -1
package/dist/voice/agent_activity.cjs +7 -4
package/dist/voice/agent_activity.cjs.map +1 -1
package/dist/voice/agent_activity.d.ts.map +1 -1
package/dist/voice/agent_activity.js +7 -4
package/dist/voice/agent_activity.js.map +1 -1
package/dist/voice/generation_tools.test.cjs +236 -0
package/dist/voice/generation_tools.test.cjs.map +1 -0
package/dist/voice/generation_tools.test.js +235 -0
package/dist/voice/generation_tools.test.js.map +1 -0
package/dist/voice/index.cjs +3 -1
package/dist/voice/index.cjs.map +1 -1
package/dist/voice/index.d.cts +1 -0
package/dist/voice/index.d.ts +1 -0
package/dist/voice/index.d.ts.map +1 -1
package/dist/voice/index.js +1 -0
package/dist/voice/index.js.map +1 -1
package/package.json +1 -1
package/src/audio.ts +1 -1
package/src/llm/llm.ts +7 -4
package/src/metrics/base.ts +23 -18
package/src/metrics/usage_collector.ts +3 -3
package/src/metrics/utils.ts +16 -7
package/src/stt/stt.ts +6 -6
package/src/tts/tts.ts +11 -10
package/src/vad.ts +5 -5
package/src/voice/agent_activity.ts +8 -4
package/src/voice/generation_tools.test.ts +268 -0
package/src/voice/index.ts +1 -0

package/src/metrics/base.ts CHANGED Viewed

@@ -15,8 +15,10 @@ export type LLMMetrics = {
   label: string;
   requestId: string;
   timestamp: number;
-  duration: number;
-  ttft: number;
+  /** Duration of the request in milliseconds. */
+  durationMs: number;
+  /** Time to first token in milliseconds. */
+  ttftMs: number;
   cancelled: boolean;
   completionTokens: number;
   promptTokens: number;
@@ -32,13 +34,13 @@ export type STTMetrics = {
   requestId: string;
   timestamp: number;
   /**
-   * The request duration in seconds, 0.0 if the STT is streaming.
+   * The request duration in milliseconds, 0.0 if the STT is streaming.
    */
-  duration: number;
+  durationMs: number;
   /**
-   * The duration of the pushed audio in seconds.
+   * The duration of the pushed audio in milliseconds.
    */
-  audioDuration: number;
+  audioDurationMs: number;
   /**
    * Whether the STT is streaming (e.g using websocket).
    */
@@ -50,9 +52,12 @@ export type TTSMetrics = {
   label: string;
   requestId: string;
   timestamp: number;
-  ttfb: number;
-  duration: number;
-  audioDuration: number;
+  /** Time to first byte in milliseconds. */
+  ttfbMs: number;
+  /** Total synthesis duration in milliseconds. */
+  durationMs: number;
+  /** Generated audio duration in milliseconds. */
+  audioDurationMs: number;
   cancelled: boolean;
   charactersCount: number;
   streamed: boolean;
@@ -64,8 +69,8 @@ export type VADMetrics = {
   type: 'vad_metrics';
   label: string;
   timestamp: number;
-  idleTime: number;
-  inferenceDurationTotal: number;
+  idleTimeMs: number;
+  inferenceDurationTotalMs: number;
   inferenceCount: number;
 };
@@ -76,16 +81,16 @@ export type EOUMetrics = {
    * Amount of time between the end of speech from VAD and the decision to end the user's turn.
    * Set to 0.0 if the end of speech was not detected.
    */
-  endOfUtteranceDelay: number;
+  endOfUtteranceDelayMs: number;
   /**
    * Time taken to obtain the transcript after the end of the user's speech.
    * Set to 0.0 if the end of speech was not detected.
    */
-  transcriptionDelay: number;
+  transcriptionDelayMs: number;
   /**
    * Time taken to invoke the user's `Agent.onUserTurnCompleted` callback.
    */
-  onUserTurnCompletedDelay: number;
+  onUserTurnCompletedDelayMs: number;
   speechId?: string;
 };
@@ -118,13 +123,13 @@ export type RealtimeModelMetrics = {
    */
   timestamp: number;
   /**
-   * The duration of the response from created to done in seconds.
+   * The duration of the response from created to done in milliseconds.
    */
-  duration: number;
+  durationMs: number;
   /**
-   * Time to first audio token in seconds. -1 if no audio token was sent.
+   * Time to first audio token in milliseconds. -1 if no audio token was sent.
    */
-  ttft: number;
+  ttftMs: number;
   /**
    * Whether the request was cancelled.
    */

package/src/metrics/usage_collector.ts CHANGED Viewed

@@ -8,7 +8,7 @@ export interface UsageSummary {
   llmPromptCachedTokens: number;
   llmCompletionTokens: number;
   ttsCharactersCount: number;
-  sttAudioDuration: number;
+  sttAudioDurationMs: number;
 }
 export class UsageCollector {
@@ -20,7 +20,7 @@ export class UsageCollector {
       llmPromptCachedTokens: 0,
       llmCompletionTokens: 0,
       ttsCharactersCount: 0,
-      sttAudioDuration: 0,
+      sttAudioDurationMs: 0,
     };
   }
@@ -36,7 +36,7 @@ export class UsageCollector {
     } else if (metrics.type === 'tts_metrics') {
       this.summary.ttsCharactersCount += metrics.charactersCount;
     } else if (metrics.type === 'stt_metrics') {
-      this.summary.sttAudioDuration += metrics.audioDuration;
+      this.summary.sttAudioDurationMs += metrics.audioDurationMs;
     }
   }

package/src/metrics/utils.ts CHANGED Viewed

@@ -13,7 +13,7 @@ export const logMetrics = (metrics: AgentMetrics) => {
   if (metrics.type === 'llm_metrics') {
     logger
       .child({
-        ttft: roundTwoDecimals(metrics.ttft),
+        ttftMs: roundTwoDecimals(metrics.ttftMs),
         inputTokens: metrics.promptTokens,
         promptCachedTokens: metrics.promptCachedTokens,
         outputTokens: metrics.completionTokens,
@@ -23,7 +23,7 @@ export const logMetrics = (metrics: AgentMetrics) => {
   } else if (metrics.type === 'realtime_model_metrics') {
     logger
       .child({
-        ttft: roundTwoDecimals(metrics.ttft),
+        ttftMs: roundTwoDecimals(metrics.ttftMs),
         input_tokens: metrics.inputTokens,
         cached_input_tokens: metrics.inputTokenDetails.cachedTokens,
         output_tokens: metrics.outputTokens,
@@ -34,21 +34,30 @@ export const logMetrics = (metrics: AgentMetrics) => {
   } else if (metrics.type === 'tts_metrics') {
     logger
       .child({
-        ttfb: roundTwoDecimals(metrics.ttfb),
-        audioDuration: metrics.audioDuration,
+        ttfbMs: roundTwoDecimals(metrics.ttfbMs),
+        audioDurationMs: Math.round(metrics.audioDurationMs),
       })
       .info('TTS metrics');
   } else if (metrics.type === 'eou_metrics') {
     logger
       .child({
-        end_of_utterance_delay: roundTwoDecimals(metrics.endOfUtteranceDelay),
-        transcription_delay: roundTwoDecimals(metrics.transcriptionDelay),
+        endOfUtteranceDelayMs: roundTwoDecimals(metrics.endOfUtteranceDelayMs),
+        transcriptionDelayMs: roundTwoDecimals(metrics.transcriptionDelayMs),
+        onUserTurnCompletedDelayMs: roundTwoDecimals(metrics.onUserTurnCompletedDelayMs),
       })
       .info('EOU metrics');
+  } else if (metrics.type === 'vad_metrics') {
+    logger
+      .child({
+        idleTimeMs: Math.round(metrics.idleTimeMs),
+        inferenceDurationTotalMs: Math.round(metrics.inferenceDurationTotalMs),
+        inferenceCount: metrics.inferenceCount,
+      })
+      .info('VAD metrics');
   } else if (metrics.type === 'stt_metrics') {
     logger
       .child({
-        audioDuration: metrics.audioDuration,
+        audioDurationMs: Math.round(metrics.audioDurationMs),
       })
       .info('STT metrics');
   }

package/src/stt/stt.ts CHANGED Viewed

@@ -6,7 +6,7 @@ import type { TypedEventEmitter as TypedEmitter } from '@livekit/typed-emitter';
 import { EventEmitter } from 'node:events';
 import type { ReadableStream } from 'node:stream/web';
 import { APIConnectionError, APIError } from '../_exceptions.js';
-import { calculateAudioDuration } from '../audio.js';
+import { calculateAudioDurationSeconds } from '../audio.js';
 import { log } from '../log.js';
 import type { STTMetrics } from '../metrics/base.js';
 import { DeferredReadableStream } from '../stream/deferred_stream.js';
@@ -110,14 +110,14 @@ export abstract class STT extends (EventEmitter as new () => TypedEmitter<STTCal
   async recognize(frame: AudioBuffer): Promise<SpeechEvent> {
     const startTime = process.hrtime.bigint();
     const event = await this._recognize(frame);
-    const duration = Number((process.hrtime.bigint() - startTime) / BigInt(1000000));
+    const durationMs = Number((process.hrtime.bigint() - startTime) / BigInt(1000000));
     this.emit('metrics_collected', {
       type: 'stt_metrics',
       requestId: event.requestId ?? '',
       timestamp: Date.now(),
-      duration,
+      durationMs,
       label: this.label,
-      audioDuration: calculateAudioDuration(frame),
+      audioDurationMs: Math.round(calculateAudioDurationSeconds(frame) * 1000),
       streamed: false,
     });
     return event;
@@ -252,9 +252,9 @@ export abstract class SpeechStream implements AsyncIterableIterator<SpeechEvent>
         type: 'stt_metrics',
         timestamp: Date.now(),
         requestId: event.requestId!,
-        duration: 0,
+        durationMs: 0,
         label: this.#stt.label,
-        audioDuration: event.recognitionUsage!.audioDuration,
+        audioDurationMs: Math.round(event.recognitionUsage!.audioDuration * 1000),
         streamed: true,
       };
       this.#stt.emit('metrics_collected', metrics);

package/src/tts/tts.ts CHANGED Viewed

@@ -228,7 +228,7 @@ export abstract class SynthesizeStream
   protected async monitorMetrics() {
     const startTime = process.hrtime.bigint();
-    let audioDuration = 0;
+    let audioDurationMs = 0;
     let ttfb: bigint = BigInt(-1);
     let requestId = '';
@@ -236,14 +236,15 @@ export abstract class SynthesizeStream
       if (this.#metricsPendingTexts.length) {
         const text = this.#metricsPendingTexts.shift()!;
         const duration = process.hrtime.bigint() - startTime;
+        const roundedAudioDurationMs = Math.round(audioDurationMs);
         const metrics: TTSMetrics = {
           type: 'tts_metrics',
           timestamp: Date.now(),
           requestId,
-          ttfb: ttfb === BigInt(-1) ? -1 : Math.trunc(Number(ttfb / BigInt(1000000))),
-          duration: Math.trunc(Number(duration / BigInt(1000000))),
+          ttfbMs: ttfb === BigInt(-1) ? -1 : Math.trunc(Number(ttfb / BigInt(1000000))),
+          durationMs: Math.trunc(Number(duration / BigInt(1000000))),
           charactersCount: text.length,
-          audioDuration,
+          audioDurationMs: roundedAudioDurationMs,
           cancelled: this.abortController.signal.aborted,
           label: this.#tts.label,
           streamed: false,
@@ -263,7 +264,7 @@ export abstract class SynthesizeStream
         ttfb = process.hrtime.bigint() - startTime;
       }
       // TODO(AJS-102): use frame.durationMs once available in rtc-node
-      audioDuration += audio.frame.samplesPerChannel / audio.frame.sampleRate;
+      audioDurationMs += (audio.frame.samplesPerChannel / audio.frame.sampleRate) * 1000;
       if (audio.final) {
         emit();
       }
@@ -436,7 +437,7 @@ export abstract class ChunkedStream implements AsyncIterableIterator<Synthesized
   protected async monitorMetrics() {
     const startTime = process.hrtime.bigint();
-    let audioDuration = 0;
+    let audioDurationMs = 0;
     let ttfb: bigint = BigInt(-1);
     let requestId = '';
@@ -446,7 +447,7 @@ export abstract class ChunkedStream implements AsyncIterableIterator<Synthesized
       if (ttfb === BigInt(-1)) {
         ttfb = process.hrtime.bigint() - startTime;
       }
-      audioDuration += audio.frame.samplesPerChannel / audio.frame.sampleRate;
+      audioDurationMs += (audio.frame.samplesPerChannel / audio.frame.sampleRate) * 1000;
     }
     this.output.close();
@@ -455,10 +456,10 @@ export abstract class ChunkedStream implements AsyncIterableIterator<Synthesized
       type: 'tts_metrics',
       timestamp: Date.now(),
       requestId,
-      ttfb: ttfb === BigInt(-1) ? -1 : Math.trunc(Number(ttfb / BigInt(1000000))),
-      duration: Math.trunc(Number(duration / BigInt(1000000))),
+      ttfbMs: ttfb === BigInt(-1) ? -1 : Math.trunc(Number(ttfb / BigInt(1000000))),
+      durationMs: Math.trunc(Number(duration / BigInt(1000000))),
       charactersCount: this.#text.length,
-      audioDuration,
+      audioDurationMs: Math.round(audioDurationMs),
       cancelled: false, // TODO(AJS-186): support ChunkedStream with 1.0 - add this.abortController.signal.aborted here
       label: this.#tts.label,
       streamed: false,

package/src/vad.ts CHANGED Viewed

@@ -139,7 +139,7 @@ export abstract class VADStream implements AsyncIterableIterator<VADEvent> {
   }
   protected async monitorMetrics() {
-    let inferenceDurationTotal = 0;
+    let inferenceDurationTotalMs = 0;
     let inferenceCount = 0;
     const metricsReader = this.metricsStream.getReader();
     while (true) {
@@ -154,20 +154,20 @@ export abstract class VADStream implements AsyncIterableIterator<VADEvent> {
             this.#vad.emit('metrics_collected', {
               type: 'vad_metrics',
               timestamp: Date.now(),
-              idleTime: Math.trunc(
+              idleTimeMs: Math.trunc(
                 Number((process.hrtime.bigint() - this.#lastActivityTime) / BigInt(1000000)),
               ),
-              inferenceDurationTotal,
+              inferenceDurationTotalMs,
               inferenceCount,
               label: this.#vad.label,
             });
             inferenceCount = 0;
-            inferenceDurationTotal = 0;
+            inferenceDurationTotalMs = 0;
           }
           break;
         case VADEventType.INFERENCE_DONE:
-          inferenceDurationTotal += value.inferenceDuration;
+          inferenceDurationTotalMs += Math.round(value.inferenceDuration);
           this.#lastActivityTime = process.hrtime.bigint();
           break;
         case VADEventType.END_OF_SPEECH:

package/src/voice/agent_activity.ts CHANGED Viewed

@@ -984,9 +984,9 @@ export class AgentActivity implements RecognitionHooks {
     const eouMetrics: EOUMetrics = {
       type: 'eou_metrics',
       timestamp: Date.now(),
-      endOfUtteranceDelay: info.endOfUtteranceDelay,
-      transcriptionDelay: info.transcriptionDelay,
-      onUserTurnCompletedDelay: callbackDuration,
+      endOfUtteranceDelayMs: info.endOfUtteranceDelay,
+      transcriptionDelayMs: info.transcriptionDelay,
+      onUserTurnCompletedDelayMs: callbackDuration,
       speechId: speechHandle.id,
     };
@@ -1506,6 +1506,10 @@ export class AgentActivity implements RecognitionHooks {
       abortController: AbortController,
       outputs: Array<[string, _TextOut | null, _AudioOut | null]>,
     ) => {
+      replyAbortController.signal.addEventListener('abort', () => abortController.abort(), {
+        once: true,
+      });
       const forwardTasks: Array<Task<void>> = [];
       try {
         for await (const msg of ev.messageStream) {
@@ -1563,7 +1567,7 @@ export class AgentActivity implements RecognitionHooks {
     const tasks = [
       Task.from(
         (controller) => readMessages(controller, messageOutputs),
-        replyAbortController,
+        undefined,
         'AgentActivity.realtime_generation.read_messages',
       ),
     ];

package/src/voice/generation_tools.test.ts ADDED Viewed

@@ -0,0 +1,268 @@
+// SPDX-FileCopyrightText: 2025 LiveKit, Inc.
+//
+// SPDX-License-Identifier: Apache-2.0
+import { ReadableStream as NodeReadableStream } from 'stream/web';
+import { describe, expect, it } from 'vitest';
+import { z } from 'zod';
+import { FunctionCall, tool } from '../llm/index.js';
+import { initializeLogger } from '../log.js';
+import type { Task } from '../utils.js';
+import { cancelAndWait, delay } from '../utils.js';
+import { type _TextOut, performTextForwarding, performToolExecutions } from './generation.js';
+function createStringStream(chunks: string[], delayMs: number = 0): NodeReadableStream<string> {
+  return new NodeReadableStream<string>({
+    async start(controller) {
+      for (const c of chunks) {
+        if (delayMs > 0) {
+          await delay(delayMs);
+        }
+        controller.enqueue(c);
+      }
+      controller.close();
+    },
+  });
+}
+function createFunctionCallStream(fc: FunctionCall): NodeReadableStream<FunctionCall> {
+  return new NodeReadableStream<FunctionCall>({
+    start(controller) {
+      controller.enqueue(fc);
+      controller.close();
+    },
+  });
+}
+function createFunctionCallStreamFromArray(fcs: FunctionCall[]): NodeReadableStream<FunctionCall> {
+  return new NodeReadableStream<FunctionCall>({
+    start(controller) {
+      for (const fc of fcs) {
+        controller.enqueue(fc);
+      }
+      controller.close();
+    },
+  });
+}
+describe('Generation + Tool Execution', () => {
+  initializeLogger({ pretty: false, level: 'silent' });
+  it('should not abort tool when preamble forwarders are cleaned up', async () => {
+    const replyAbortController = new AbortController();
+    const forwarderController = new AbortController();
+    const chunks = Array.from({ length: 50 }, () => `Hi.`);
+    const fullPreambleText = chunks.join('');
+    const preamble = createStringStream(chunks, 20);
+    const [textForwardTask, textOut]: [Task<void>, _TextOut] = performTextForwarding(
+      preamble,
+      forwarderController,
+      null,
+    );
+    // Tool that takes > 5 seconds
+    let toolAborted = false;
+    const getWeather = tool({
+      description: 'weather',
+      parameters: z.object({ location: z.string() }),
+      execute: async ({ location }, { abortSignal }) => {
+        if (abortSignal) {
+          abortSignal.addEventListener('abort', () => {
+            toolAborted = true;
+          });
+        }
+        // 6s delay
+        await delay(6000);
+        return `Sunny in ${location}`;
+      },
+    });
+    const fc = FunctionCall.create({
+      callId: 'call_1',
+      name: 'getWeather',
+      args: JSON.stringify({ location: 'San Francisco' }),
+    });
+    const toolCallStream = createFunctionCallStream(fc);
+    const [execTask, toolOutput] = performToolExecutions({
+      session: {} as any,
+      speechHandle: { id: 'speech_test', _itemAdded: () => {} } as any,
+      toolCtx: { getWeather } as any,
+      toolCallStream,
+      controller: replyAbortController,
+      onToolExecutionStarted: () => {},
+      onToolExecutionCompleted: () => {},
+    });
+    // Ensure tool has started, then cancel forwarders mid-stream (without aborting parent AbortController)
+    await toolOutput.firstToolStartedFuture.await;
+    await delay(100);
+    await cancelAndWait([textForwardTask], 5000);
+    await execTask.result;
+    expect(toolOutput.output.length).toBe(1);
+    const out = toolOutput.output[0]!;
+    expect(out.toolCallOutput?.isError).toBe(false);
+    expect(out.toolCallOutput?.output).toContain('Sunny in San Francisco');
+    // Forwarder should have been cancelled before finishing all preamble chunks
+    expect(textOut.text).not.toBe(fullPreambleText);
+    // Tool's abort signal must not have fired
+    expect(toolAborted).toBe(false);
+  }, 30_000);
+  it('should return basic tool execution output', async () => {
+    const replyAbortController = new AbortController();
+    const echo = tool({
+      description: 'echo',
+      parameters: z.object({ msg: z.string() }),
+      execute: async ({ msg }) => `echo: ${msg}`,
+    });
+    const fc = FunctionCall.create({
+      callId: 'call_2',
+      name: 'echo',
+      args: JSON.stringify({ msg: 'hello' }),
+    });
+    const toolCallStream = createFunctionCallStream(fc);
+    const [execTask, toolOutput] = performToolExecutions({
+      session: {} as any,
+      speechHandle: { id: 'speech_test2', _itemAdded: () => {} } as any,
+      toolCtx: { echo } as any,
+      toolCallStream,
+      controller: replyAbortController,
+    });
+    await execTask.result;
+    expect(toolOutput.output.length).toBe(1);
+    const out = toolOutput.output[0];
+    expect(out?.toolCallOutput?.isError).toBe(false);
+    expect(out?.toolCallOutput?.output).toContain('echo: hello');
+  });
+  it('should abort tool when reply is aborted mid-execution', async () => {
+    const replyAbortController = new AbortController();
+    let aborted = false;
+    const longOp = tool({
+      description: 'longOp',
+      parameters: z.object({ ms: z.number() }),
+      execute: async ({ ms }, { abortSignal }) => {
+        if (abortSignal) {
+          abortSignal.addEventListener('abort', () => {
+            aborted = true;
+          });
+        }
+        await delay(ms);
+        return 'done';
+      },
+    });
+    const fc = FunctionCall.create({
+      callId: 'call_abort_1',
+      name: 'longOp',
+      args: JSON.stringify({ ms: 5000 }),
+    });
+    const toolCallStream = createFunctionCallStream(fc);
+    const [execTask, toolOutput] = performToolExecutions({
+      session: {} as any,
+      speechHandle: { id: 'speech_abort', _itemAdded: () => {} } as any,
+      toolCtx: { longOp } as any,
+      toolCallStream,
+      controller: replyAbortController,
+    });
+    await toolOutput.firstToolStartedFuture.await;
+    replyAbortController.abort();
+    await execTask.result;
+    expect(aborted).toBe(true);
+    expect(toolOutput.output.length).toBe(1);
+    const out = toolOutput.output[0];
+    expect(out?.toolCallOutput?.isError).toBe(true);
+  }, 20_000);
+  it('should return error output on invalid tool args (zod validation failure)', async () => {
+    const replyAbortController = new AbortController();
+    const echo = tool({
+      description: 'echo',
+      parameters: z.object({ msg: z.string() }),
+      execute: async ({ msg }) => `echo: ${msg}`,
+    });
+    // invalid: msg should be string
+    const fc = FunctionCall.create({
+      callId: 'call_invalid_args',
+      name: 'echo',
+      args: JSON.stringify({ msg: 123 }),
+    });
+    const toolCallStream = createFunctionCallStream(fc);
+    const [execTask, toolOutput] = performToolExecutions({
+      session: {} as any,
+      speechHandle: { id: 'speech_invalid', _itemAdded: () => {} } as any,
+      toolCtx: { echo } as any,
+      toolCallStream,
+      controller: replyAbortController,
+    });
+    await execTask.result;
+    expect(toolOutput.output.length).toBe(1);
+    const out = toolOutput.output[0];
+    expect(out?.toolCallOutput?.isError).toBe(true);
+  });
+  it('should handle multiple tool calls within a single stream', async () => {
+    const replyAbortController = new AbortController();
+    const sum = tool({
+      description: 'sum',
+      parameters: z.object({ a: z.number(), b: z.number() }),
+      execute: async ({ a, b }) => a + b,
+    });
+    const upper = tool({
+      description: 'upper',
+      parameters: z.object({ s: z.string() }),
+      execute: async ({ s }) => s.toUpperCase(),
+    });
+    const fc1 = FunctionCall.create({
+      callId: 'call_multi_1',
+      name: 'sum',
+      args: JSON.stringify({ a: 2, b: 3 }),
+    });
+    const fc2 = FunctionCall.create({
+      callId: 'call_multi_2',
+      name: 'upper',
+      args: JSON.stringify({ s: 'hey' }),
+    });
+    const toolCallStream = createFunctionCallStreamFromArray([fc1, fc2]);
+    const [execTask, toolOutput] = performToolExecutions({
+      session: {} as any,
+      speechHandle: { id: 'speech_multi', _itemAdded: () => {} } as any,
+      toolCtx: { sum, upper } as any,
+      toolCallStream,
+      controller: replyAbortController,
+    });
+    await execTask.result;
+    expect(toolOutput.output.length).toBe(2);
+    // sort by callId to assert deterministically
+    const sorted = [...toolOutput.output].sort((a, b) =>
+      a.toolCall.callId.localeCompare(b.toolCall.callId),
+    );
+    expect(sorted[0]?.toolCall.name).toBe('sum');
+    expect(sorted[0]?.toolCallOutput?.isError).toBe(false);
+    expect(sorted[0]?.toolCallOutput?.output).toBe('5');
+    expect(sorted[1]?.toolCall.name).toBe('upper');
+    expect(sorted[1]?.toolCallOutput?.isError).toBe(false);
+    expect(sorted[1]?.toolCallOutput?.output).toBe('"HEY"');
+  });
+});

package/src/voice/index.ts CHANGED Viewed

@@ -6,4 +6,5 @@ export { AgentSession, type AgentSessionOptions } from './agent_session.js';
 export * from './avatar/index.js';
 export * from './events.js';
+export * from './room_io/index.js';
 export { RunContext } from './run_context.js';