npm - @livekit/agents - Versions diffs - 1.0.15 → 1.0.17 - Mend

@livekit/agents 1.0.15 → 1.0.17

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Files changed (98) hide show

package/dist/cli.cjs +12 -12
package/dist/cli.cjs.map +1 -1
package/dist/cli.d.cts +3 -3
package/dist/cli.d.ts +3 -3
package/dist/cli.d.ts.map +1 -1
package/dist/cli.js +13 -13
package/dist/cli.js.map +1 -1
package/dist/inference/stt.cjs.map +1 -1
package/dist/inference/stt.d.ts.map +1 -1
package/dist/inference/stt.js +1 -1
package/dist/inference/stt.js.map +1 -1
package/dist/inference/tts.cjs.map +1 -1
package/dist/inference/tts.d.cts +2 -1
package/dist/inference/tts.d.ts +2 -1
package/dist/inference/tts.d.ts.map +1 -1
package/dist/inference/tts.js +1 -5
package/dist/inference/tts.js.map +1 -1
package/dist/llm/chat_context.cjs +78 -0
package/dist/llm/chat_context.cjs.map +1 -1
package/dist/llm/chat_context.d.cts +16 -0
package/dist/llm/chat_context.d.ts +16 -0
package/dist/llm/chat_context.d.ts.map +1 -1
package/dist/llm/chat_context.js +78 -0
package/dist/llm/chat_context.js.map +1 -1
package/dist/llm/chat_context.test.cjs +531 -0
package/dist/llm/chat_context.test.cjs.map +1 -1
package/dist/llm/chat_context.test.js +531 -0
package/dist/llm/chat_context.test.js.map +1 -1
package/dist/llm/tool_context.cjs +40 -0
package/dist/llm/tool_context.cjs.map +1 -1
package/dist/llm/tool_context.d.cts +2 -0
package/dist/llm/tool_context.d.ts +2 -0
package/dist/llm/tool_context.d.ts.map +1 -1
package/dist/llm/tool_context.js +38 -0
package/dist/llm/tool_context.js.map +1 -1
package/dist/metrics/base.cjs.map +1 -1
package/dist/metrics/base.d.cts +7 -0
package/dist/metrics/base.d.ts +7 -0
package/dist/metrics/base.d.ts.map +1 -1
package/dist/stt/stt.cjs +1 -1
package/dist/stt/stt.cjs.map +1 -1
package/dist/stt/stt.d.cts +7 -1
package/dist/stt/stt.d.ts +7 -1
package/dist/stt/stt.d.ts.map +1 -1
package/dist/stt/stt.js +1 -1
package/dist/stt/stt.js.map +1 -1
package/dist/tts/tts.cjs +2 -4
package/dist/tts/tts.cjs.map +1 -1
package/dist/tts/tts.d.ts.map +1 -1
package/dist/tts/tts.js +3 -5
package/dist/tts/tts.js.map +1 -1
package/dist/voice/agent_activity.cjs +83 -8
package/dist/voice/agent_activity.cjs.map +1 -1
package/dist/voice/agent_activity.d.cts +6 -2
package/dist/voice/agent_activity.d.ts +6 -2
package/dist/voice/agent_activity.d.ts.map +1 -1
package/dist/voice/agent_activity.js +83 -8
package/dist/voice/agent_activity.js.map +1 -1
package/dist/voice/agent_session.cjs +3 -2
package/dist/voice/agent_session.cjs.map +1 -1
package/dist/voice/agent_session.d.cts +2 -1
package/dist/voice/agent_session.d.ts +2 -1
package/dist/voice/agent_session.d.ts.map +1 -1
package/dist/voice/agent_session.js +3 -2
package/dist/voice/agent_session.js.map +1 -1
package/dist/voice/audio_recognition.cjs +138 -16
package/dist/voice/audio_recognition.cjs.map +1 -1
package/dist/voice/audio_recognition.d.cts +11 -0
package/dist/voice/audio_recognition.d.ts +11 -0
package/dist/voice/audio_recognition.d.ts.map +1 -1
package/dist/voice/audio_recognition.js +138 -16
package/dist/voice/audio_recognition.js.map +1 -1
package/dist/voice/room_io/_input.cjs.map +1 -1
package/dist/voice/room_io/_input.d.ts.map +1 -1
package/dist/voice/room_io/_input.js +0 -1
package/dist/voice/room_io/_input.js.map +1 -1
package/dist/worker.cjs +17 -11
package/dist/worker.cjs.map +1 -1
package/dist/worker.d.cts +16 -9
package/dist/worker.d.ts +16 -9
package/dist/worker.d.ts.map +1 -1
package/dist/worker.js +16 -12
package/dist/worker.js.map +1 -1
package/package.json +1 -1
package/src/cli.ts +17 -17
package/src/inference/stt.ts +2 -1
package/src/inference/tts.ts +2 -5
package/src/llm/chat_context.test.ts +607 -0
package/src/llm/chat_context.ts +106 -0
package/src/llm/tool_context.ts +44 -0
package/src/metrics/base.ts +7 -0
package/src/stt/stt.ts +8 -1
package/src/tts/tts.ts +7 -5
package/src/voice/agent_activity.ts +119 -9
package/src/voice/agent_session.ts +3 -1
package/src/voice/audio_recognition.ts +235 -57
package/src/voice/room_io/_input.ts +1 -1
package/src/worker.ts +29 -18

package/src/llm/tool_context.ts CHANGED Viewed

@@ -187,6 +187,50 @@ export type ToolContext<UserData = UnknownUserData> = {
   [name: string]: FunctionTool<any, UserData, any>;
 };
+export function isSameToolContext(ctx1: ToolContext, ctx2: ToolContext): boolean {
+  const toolNames = new Set(Object.keys(ctx1));
+  const toolNames2 = new Set(Object.keys(ctx2));
+  if (toolNames.size !== toolNames2.size) {
+    return false;
+  }
+  for (const name of toolNames) {
+    if (!toolNames2.has(name)) {
+      return false;
+    }
+    const tool1 = ctx1[name];
+    const tool2 = ctx2[name];
+    if (!tool1 || !tool2) {
+      return false;
+    }
+    if (tool1.description !== tool2.description) {
+      return false;
+    }
+  }
+  return true;
+}
+export function isSameToolChoice(choice1: ToolChoice | null, choice2: ToolChoice | null): boolean {
+  if (choice1 === choice2) {
+    return true;
+  }
+  if (choice1 === null || choice2 === null) {
+    return false;
+  }
+  if (typeof choice1 === 'string' && typeof choice2 === 'string') {
+    return choice1 === choice2;
+  }
+  if (typeof choice1 === 'object' && typeof choice2 === 'object') {
+    return choice1.type === choice2.type && choice1.function.name === choice2.function.name;
+  }
+  return false;
+}
 /**
  * Create a function tool with inferred parameters from the schema.
  */

package/src/metrics/base.ts CHANGED Viewed

@@ -91,6 +91,13 @@ export type EOUMetrics = {
    * Time taken to invoke the user's `Agent.onUserTurnCompleted` callback.
    */
   onUserTurnCompletedDelayMs: number;
+  /**
+   * The time the user stopped speaking.
+   */
+  lastSpeakingTimeMs: number;
+  /**
+   * The ID of the speech handle.
+   */
   speechId?: string;
 };

package/src/stt/stt.ts CHANGED Viewed

@@ -38,6 +38,12 @@ export enum SpeechEventType {
   END_OF_SPEECH = 3,
   /** Usage event, emitted periodically to indicate usage metrics. */
   RECOGNITION_USAGE = 4,
+  /**
+   * Preflight transcript, emitted before final transcript when STT has high confidence
+   * but hasn't fully committed yet. Includes all pre-committed transcripts including
+   * final transcript from the previous STT run.
+   */
+  PREFLIGHT_TRANSCRIPT = 5,
 }
 /** SpeechData contains metadata about this {@link SpeechEvent}. */
@@ -198,7 +204,8 @@ export abstract class SpeechStream implements AsyncIterableIterator<SpeechEvent>
               options: { retryable: false },
             });
           } else {
-            this.emitError({ error, recoverable: true });
+            // Don't emit error event for recoverable errors during retry loop
+            // to avoid ERR_UNHANDLED_ERROR or premature session termination
             this.logger.warn(
               { tts: this.#stt.label, attempt: i + 1, error },
               `failed to recognize speech, retrying in ${retryInterval}s`,

package/src/tts/tts.ts CHANGED Viewed

@@ -5,7 +5,7 @@ import type { AudioFrame } from '@livekit/rtc-node';
 import type { TypedEventEmitter as TypedEmitter } from '@livekit/typed-emitter';
 import { EventEmitter } from 'node:events';
 import type { ReadableStream } from 'node:stream/web';
-import { APIConnectionError, APIStatusError } from '../_exceptions.js';
+import { APIConnectionError, APIError } from '../_exceptions.js';
 import { log } from '../log.js';
 import type { TTSMetrics } from '../metrics/base.js';
 import { DeferredReadableStream } from '../stream/deferred_stream.js';
@@ -161,7 +161,7 @@ export abstract class SynthesizeStream
       try {
         return await this.run();
       } catch (error) {
-        if (error instanceof APIStatusError) {
+        if (error instanceof APIError) {
           const retryInterval = this._connOptions._intervalForRetry(i);
           if (this._connOptions.maxRetry === 0 || !error.retryable) {
@@ -174,7 +174,8 @@ export abstract class SynthesizeStream
               options: { retryable: false },
             });
           } else {
-            this.emitError({ error, recoverable: true });
+            // Don't emit error event for recoverable errors during retry loop
+            // to avoid ERR_UNHANDLED_ERROR or premature session termination
             this.logger.warn(
               { tts: this.#tts.label, attempt: i + 1, error },
               `failed to synthesize speech, retrying in  ${retryInterval}s`,
@@ -388,7 +389,7 @@ export abstract class ChunkedStream implements AsyncIterableIterator<Synthesized
       try {
         return await this.run();
       } catch (error) {
-        if (error instanceof APIStatusError) {
+        if (error instanceof APIError) {
           const retryInterval = this._connOptions._intervalForRetry(i);
           if (this._connOptions.maxRetry === 0 || !error.retryable) {
@@ -401,7 +402,8 @@ export abstract class ChunkedStream implements AsyncIterableIterator<Synthesized
               options: { retryable: false },
             });
           } else {
-            this.emitError({ error, recoverable: true });
+            // Don't emit error event for recoverable errors during retry loop
+            // to avoid ERR_UNHANDLED_ERROR or premature session termination
             this.logger.warn(
               { tts: this.#tts.label, attempt: i + 1, error },
               `failed to generate TTS completion, retrying in ${retryInterval}s`,

package/src/voice/agent_activity.ts CHANGED Viewed

@@ -22,6 +22,7 @@ import {
   type ToolContext,
 } from '../llm/index.js';
 import type { LLMError } from '../llm/llm.js';
+import { isSameToolChoice, isSameToolContext } from '../llm/tool_context.js';
 import { log } from '../log.js';
 import type {
   EOUMetrics,
@@ -43,6 +44,7 @@ import { type AgentSession, type TurnDetectionMode } from './agent_session.js';
 import {
   AudioRecognition,
   type EndOfTurnInfo,
+  type PreemptiveGenerationInfo,
   type RecognitionHooks,
   type _TurnDetector,
 } from './audio_recognition.js';
@@ -71,6 +73,16 @@ import { SpeechHandle } from './speech_handle.js';
 // equivalent to Python's contextvars
 const speechHandleStorage = new AsyncLocalStorage<SpeechHandle>();
+interface PreemptiveGeneration {
+  speechHandle: SpeechHandle;
+  userMessage: ChatMessage;
+  info: PreemptiveGenerationInfo;
+  chatCtx: ChatContext;
+  tools: ToolContext;
+  toolChoice: ToolChoice | null;
+  createdAt: number;
+}
 export class AgentActivity implements RecognitionHooks {
   private static readonly REPLY_TASK_CANCEL_TIMEOUT = 5000;
   private started = false;
@@ -87,6 +99,7 @@ export class AgentActivity implements RecognitionHooks {
   private audioStream = new DeferredReadableStream<AudioFrame>();
   // default to null as None, which maps to the default provider tool choice value
   private toolChoice: ToolChoice | null = null;
+  private _preemptiveGeneration?: PreemptiveGeneration;
   agent: Agent;
   agentSession: AgentSession;
@@ -589,8 +602,12 @@ export class AgentActivity implements RecognitionHooks {
     this.agentSession._updateUserState('speaking');
   }
-  onEndOfSpeech(_ev: VADEvent): void {
-    this.agentSession._updateUserState('listening');
+  onEndOfSpeech(ev: VADEvent): void {
+    let speechEndTime = Date.now();
+    if (ev) {
+      speechEndTime = speechEndTime - ev.silenceDuration;
+    }
+    this.agentSession._updateUserState('listening', speechEndTime);
   }
   onVADInferenceDone(ev: VADEvent): void {
@@ -664,6 +681,55 @@ export class AgentActivity implements RecognitionHooks {
     );
   }
+  onPreemptiveGeneration(info: PreemptiveGenerationInfo): void {
+    if (
+      !this.agentSession.options.preemptiveGeneration ||
+      this.draining ||
+      (this._currentSpeech !== undefined && !this._currentSpeech.interrupted) ||
+      !(this.llm instanceof LLM)
+    ) {
+      return;
+    }
+    this.cancelPreemptiveGeneration();
+    this.logger.info(
+      {
+        newTranscript: info.newTranscript,
+        transcriptConfidence: info.transcriptConfidence,
+      },
+      'starting preemptive generation',
+    );
+    const userMessage = ChatMessage.create({
+      role: 'user',
+      content: info.newTranscript,
+    });
+    const chatCtx = this.agent.chatCtx.copy();
+    const speechHandle = this.generateReply({
+      userMessage,
+      chatCtx,
+      scheduleSpeech: false,
+    });
+    this._preemptiveGeneration = {
+      speechHandle,
+      userMessage,
+      info,
+      chatCtx: chatCtx.copy(),
+      tools: { ...this.tools },
+      toolChoice: this.toolChoice,
+      createdAt: Date.now(),
+    };
+  }
+  private cancelPreemptiveGeneration(): void {
+    if (this._preemptiveGeneration !== undefined) {
+      this._preemptiveGeneration.speechHandle._cancel();
+      this._preemptiveGeneration = undefined;
+    }
+  }
   private createSpeechTask(options: {
     task: Task<void>;
     ownedSpeechHandle?: SpeechHandle;
@@ -694,6 +760,7 @@ export class AgentActivity implements RecognitionHooks {
   async onEndOfTurn(info: EndOfTurnInfo): Promise<boolean> {
     if (this.draining) {
+      this.cancelPreemptiveGeneration();
       this.logger.warn({ user_input: info.newTranscript }, 'skipping user input, task is draining');
       // copied from python:
       // TODO(shubhra): should we "forward" this new turn to the next agent/activity?
@@ -710,6 +777,7 @@ export class AgentActivity implements RecognitionHooks {
       info.newTranscript.split(' ').length < this.agentSession.options.minInterruptionWords
     ) {
       // avoid interruption if the new_transcript is too short
+      this.cancelPreemptiveGeneration();
       this.logger.info('skipping user input, new_transcript is too short');
       return false;
     }
@@ -775,6 +843,7 @@ export class AgentActivity implements RecognitionHooks {
     instructions?: string;
     toolChoice?: ToolChoice | null;
     allowInterruptions?: boolean;
+    scheduleSpeech?: boolean;
   }): SpeechHandle {
     const {
       userMessage,
@@ -782,6 +851,7 @@ export class AgentActivity implements RecognitionHooks {
       instructions: defaultInstructions,
       toolChoice: defaultToolChoice,
       allowInterruptions: defaultAllowInterruptions,
+      scheduleSpeech = true,
     } = options;
     let instructions = defaultInstructions;
@@ -871,7 +941,9 @@ export class AgentActivity implements RecognitionHooks {
       task.finally(() => this.onPipelineReplyDone());
     }
-    this.scheduleSpeech(handle, SpeechHandle.SPEECH_PRIORITY_NORMAL);
+    if (scheduleSpeech) {
+      this.scheduleSpeech(handle, SpeechHandle.SPEECH_PRIORITY_NORMAL);
+    }
     return handle;
   }
@@ -977,9 +1049,40 @@ export class AgentActivity implements RecognitionHooks {
       return;
     }
-    // Ensure the new message is passed to generateReply
-    // This preserves the original message id, making it easier for users to track responses
-    const speechHandle = this.generateReply({ userMessage, chatCtx });
+    let speechHandle: SpeechHandle | undefined;
+    if (this._preemptiveGeneration !== undefined) {
+      const preemptive = this._preemptiveGeneration;
+      // make sure the onUserTurnCompleted didn't change some request parameters
+      // otherwise invalidate the preemptive generation
+      if (
+        preemptive.info.newTranscript === userMessage?.textContent &&
+        preemptive.chatCtx.isEquivalent(chatCtx) &&
+        isSameToolContext(preemptive.tools, this.tools) &&
+        isSameToolChoice(preemptive.toolChoice, this.toolChoice)
+      ) {
+        speechHandle = preemptive.speechHandle;
+        this.scheduleSpeech(speechHandle, SpeechHandle.SPEECH_PRIORITY_NORMAL);
+        this.logger.debug(
+          {
+            preemptiveLeadTime: Date.now() - preemptive.createdAt,
+          },
+          'using preemptive generation',
+        );
+      } else {
+        this.logger.warn(
+          'preemptive generation enabled but chat context or tools have changed after `onUserTurnCompleted`',
+        );
+        preemptive.speechHandle._cancel();
+      }
+      this._preemptiveGeneration = undefined;
+    }
+    if (speechHandle === undefined) {
+      // Ensure the new message is passed to generateReply
+      // This preserves the original message id, making it easier for users to track responses
+      speechHandle = this.generateReply({ userMessage, chatCtx });
+    }
     const eouMetrics: EOUMetrics = {
       type: 'eou_metrics',
@@ -987,6 +1090,7 @@ export class AgentActivity implements RecognitionHooks {
       endOfUtteranceDelayMs: info.endOfUtteranceDelay,
       transcriptionDelayMs: info.transcriptionDelay,
       onUserTurnCompletedDelayMs: callbackDuration,
+      lastSpeakingTimeMs: info.stoppedSpeakingAt ?? 0,
       speechId: speechHandle.id,
     };
@@ -1139,10 +1243,9 @@ export class AgentActivity implements RecognitionHooks {
     chatCtx = chatCtx.copy();
+    // Insert new message into temporary chat context for LLM inference
     if (newMessage) {
       chatCtx.insert(newMessage);
-      this.agent._chatCtx.insert(newMessage);
-      this.agentSession._conversationItemAdded(newMessage);
     }
     if (instructions) {
@@ -1157,7 +1260,6 @@ export class AgentActivity implements RecognitionHooks {
       }
     }
-    this.agentSession._updateAgentState('thinking');
     const tasks: Array<Task<void>> = [];
     const [llmTask, llmGenData] = performLLMInference(
       // preserve  `this` context in llmNode
@@ -1185,6 +1287,12 @@ export class AgentActivity implements RecognitionHooks {
     await speechHandle.waitIfNotInterrupted([speechHandle._waitForScheduled()]);
+    // Add new message to actual chat context if the speech is scheduled
+    if (newMessage && speechHandle.scheduled) {
+      this.agent._chatCtx.insert(newMessage);
+      this.agentSession._conversationItemAdded(newMessage);
+    }
     if (speechHandle.interrupted) {
       replyAbortController.abort();
       await cancelAndWait(tasks, AgentActivity.REPLY_TASK_CANCEL_TIMEOUT);
@@ -1917,6 +2025,7 @@ export class AgentActivity implements RecognitionHooks {
     try {
       if (this._draining) return;
+      this.cancelPreemptiveGeneration();
       this.createSpeechTask({
         task: Task.from(() => this.agent.onExit()),
         name: 'AgentActivity_onExit',
@@ -1937,6 +2046,7 @@ export class AgentActivity implements RecognitionHooks {
         this.logger.warn('task closing without draining');
       }
+      this.cancelPreemptiveGeneration();
       // Unregister event handlers to prevent duplicate metrics
       if (this.llm instanceof LLM) {
         this.llm.off('metrics_collected', this.onMetricsCollected);

package/src/voice/agent_session.ts CHANGED Viewed

@@ -57,6 +57,7 @@ export interface VoiceOptions {
   minEndpointingDelay: number;
   maxEndpointingDelay: number;
   maxToolSteps: number;
+  preemptiveGeneration: boolean;
 }
 const defaultVoiceOptions: VoiceOptions = {
@@ -67,6 +68,7 @@ const defaultVoiceOptions: VoiceOptions = {
   minEndpointingDelay: 500,
   maxEndpointingDelay: 6000,
   maxToolSteps: 3,
+  preemptiveGeneration: false,
 } as const;
 export type TurnDetectionMode = 'stt' | 'vad' | 'realtime_llm' | 'manual' | _TurnDetector;
@@ -421,7 +423,7 @@ export class AgentSession<
   }
   /** @internal */
-  _updateUserState(state: UserState) {
+  _updateUserState(state: UserState, _lastSpeakingTime?: number) {
     if (this.userState === state) {
       return;
     }