npm - @livekit/agents - Versions diffs - 1.0.14 → 1.0.16 - Mend

@livekit/agents 1.0.14 → 1.0.16

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Files changed (133)

package/dist/cli.cjs +12 -12
package/dist/cli.cjs.map +1 -1
package/dist/cli.d.cts +3 -3
package/dist/cli.d.ts +3 -3
package/dist/cli.d.ts.map +1 -1
package/dist/cli.js +13 -13
package/dist/cli.js.map +1 -1
package/dist/inference/stt.cjs.map +1 -1
package/dist/inference/stt.d.ts.map +1 -1
package/dist/inference/stt.js +1 -1
package/dist/inference/stt.js.map +1 -1
package/dist/inference/tts.cjs.map +1 -1
package/dist/inference/tts.d.cts +2 -1
package/dist/inference/tts.d.ts +2 -1
package/dist/inference/tts.d.ts.map +1 -1
package/dist/inference/tts.js +1 -5
package/dist/inference/tts.js.map +1 -1
package/dist/llm/chat_context.cjs +78 -0
package/dist/llm/chat_context.cjs.map +1 -1
package/dist/llm/chat_context.d.cts +16 -0
package/dist/llm/chat_context.d.ts +16 -0
package/dist/llm/chat_context.d.ts.map +1 -1
package/dist/llm/chat_context.js +78 -0
package/dist/llm/chat_context.js.map +1 -1
package/dist/llm/chat_context.test.cjs +531 -0
package/dist/llm/chat_context.test.cjs.map +1 -1
package/dist/llm/chat_context.test.js +531 -0
package/dist/llm/chat_context.test.js.map +1 -1
package/dist/llm/tool_context.cjs +43 -2
package/dist/llm/tool_context.cjs.map +1 -1
package/dist/llm/tool_context.d.cts +39 -11
package/dist/llm/tool_context.d.ts +39 -11
package/dist/llm/tool_context.d.ts.map +1 -1
package/dist/llm/tool_context.js +42 -3
package/dist/llm/tool_context.js.map +1 -1
package/dist/llm/tool_context.test.cjs +197 -0
package/dist/llm/tool_context.test.cjs.map +1 -1
package/dist/llm/tool_context.test.js +175 -0
package/dist/llm/tool_context.test.js.map +1 -1
package/dist/llm/utils.cjs +17 -11
package/dist/llm/utils.cjs.map +1 -1
package/dist/llm/utils.d.cts +1 -2
package/dist/llm/utils.d.ts +1 -2
package/dist/llm/utils.d.ts.map +1 -1
package/dist/llm/utils.js +17 -11
package/dist/llm/utils.js.map +1 -1
package/dist/llm/zod-utils.cjs +99 -0
package/dist/llm/zod-utils.cjs.map +1 -0
package/dist/llm/zod-utils.d.cts +65 -0
package/dist/llm/zod-utils.d.ts +65 -0
package/dist/llm/zod-utils.d.ts.map +1 -0
package/dist/llm/zod-utils.js +61 -0
package/dist/llm/zod-utils.js.map +1 -0
package/dist/llm/zod-utils.test.cjs +389 -0
package/dist/llm/zod-utils.test.cjs.map +1 -0
package/dist/llm/zod-utils.test.js +372 -0
package/dist/llm/zod-utils.test.js.map +1 -0
package/dist/metrics/base.cjs.map +1 -1
package/dist/metrics/base.d.cts +7 -0
package/dist/metrics/base.d.ts +7 -0
package/dist/metrics/base.d.ts.map +1 -1
package/dist/stt/stt.cjs +1 -0
package/dist/stt/stt.cjs.map +1 -1
package/dist/stt/stt.d.cts +7 -1
package/dist/stt/stt.d.ts +7 -1
package/dist/stt/stt.d.ts.map +1 -1
package/dist/stt/stt.js +1 -0
package/dist/stt/stt.js.map +1 -1
package/dist/vad.cjs +16 -0
package/dist/vad.cjs.map +1 -1
package/dist/vad.d.cts +6 -0
package/dist/vad.d.ts +6 -0
package/dist/vad.d.ts.map +1 -1
package/dist/vad.js +16 -0
package/dist/vad.js.map +1 -1
package/dist/voice/agent_activity.cjs +83 -8
package/dist/voice/agent_activity.cjs.map +1 -1
package/dist/voice/agent_activity.d.cts +6 -2
package/dist/voice/agent_activity.d.ts +6 -2
package/dist/voice/agent_activity.d.ts.map +1 -1
package/dist/voice/agent_activity.js +83 -8
package/dist/voice/agent_activity.js.map +1 -1
package/dist/voice/agent_session.cjs +3 -2
package/dist/voice/agent_session.cjs.map +1 -1
package/dist/voice/agent_session.d.cts +2 -1
package/dist/voice/agent_session.d.ts +2 -1
package/dist/voice/agent_session.d.ts.map +1 -1
package/dist/voice/agent_session.js +3 -2
package/dist/voice/agent_session.js.map +1 -1
package/dist/voice/audio_recognition.cjs +138 -16
package/dist/voice/audio_recognition.cjs.map +1 -1
package/dist/voice/audio_recognition.d.cts +11 -0
package/dist/voice/audio_recognition.d.ts +11 -0
package/dist/voice/audio_recognition.d.ts.map +1 -1
package/dist/voice/audio_recognition.js +138 -16
package/dist/voice/audio_recognition.js.map +1 -1
package/dist/voice/generation.cjs +8 -3
package/dist/voice/generation.cjs.map +1 -1
package/dist/voice/generation.d.ts.map +1 -1
package/dist/voice/generation.js +8 -3
package/dist/voice/generation.js.map +1 -1
package/dist/voice/room_io/_input.cjs.map +1 -1
package/dist/voice/room_io/_input.d.ts.map +1 -1
package/dist/voice/room_io/_input.js +0 -1
package/dist/voice/room_io/_input.js.map +1 -1
package/dist/worker.cjs +17 -11
package/dist/worker.cjs.map +1 -1
package/dist/worker.d.cts +16 -9
package/dist/worker.d.ts +16 -9
package/dist/worker.d.ts.map +1 -1
package/dist/worker.js +16 -12
package/dist/worker.js.map +1 -1
package/package.json +5 -4
package/src/cli.ts +17 -17
package/src/inference/stt.ts +2 -1
package/src/inference/tts.ts +2 -5
package/src/llm/__snapshots__/zod-utils.test.ts.snap +341 -0
package/src/llm/chat_context.test.ts +607 -0
package/src/llm/chat_context.ts +106 -0
package/src/llm/tool_context.test.ts +210 -1
package/src/llm/tool_context.ts +101 -17
package/src/llm/utils.ts +18 -15
package/src/llm/zod-utils.test.ts +476 -0
package/src/llm/zod-utils.ts +144 -0
package/src/metrics/base.ts +7 -0
package/src/stt/stt.ts +6 -0
package/src/vad.ts +18 -0
package/src/voice/agent_activity.ts +119 -9
package/src/voice/agent_session.ts +3 -1
package/src/voice/audio_recognition.ts +235 -57
package/src/voice/generation.ts +8 -3
package/src/voice/room_io/_input.ts +1 -1
package/src/worker.ts +29 -18

package/src/voice/agent_activity.ts CHANGED Viewed

@@ -22,6 +22,7 @@ import {
   type ToolContext,
 } from '../llm/index.js';
 import type { LLMError } from '../llm/llm.js';
+import { isSameToolChoice, isSameToolContext } from '../llm/tool_context.js';
 import { log } from '../log.js';
 import type {
   EOUMetrics,
@@ -43,6 +44,7 @@ import { type AgentSession, type TurnDetectionMode } from './agent_session.js';
 import {
   AudioRecognition,
   type EndOfTurnInfo,
+  type PreemptiveGenerationInfo,
   type RecognitionHooks,
   type _TurnDetector,
 } from './audio_recognition.js';
@@ -71,6 +73,16 @@ import { SpeechHandle } from './speech_handle.js';
 // equivalent to Python's contextvars
 const speechHandleStorage = new AsyncLocalStorage<SpeechHandle>();
+interface PreemptiveGeneration {
+  speechHandle: SpeechHandle;
+  userMessage: ChatMessage;
+  info: PreemptiveGenerationInfo;
+  chatCtx: ChatContext;
+  tools: ToolContext;
+  toolChoice: ToolChoice | null;
+  createdAt: number;
+}
 export class AgentActivity implements RecognitionHooks {
   private static readonly REPLY_TASK_CANCEL_TIMEOUT = 5000;
   private started = false;
@@ -87,6 +99,7 @@ export class AgentActivity implements RecognitionHooks {
   private audioStream = new DeferredReadableStream<AudioFrame>();
   // default to null as None, which maps to the default provider tool choice value
   private toolChoice: ToolChoice | null = null;
+  private _preemptiveGeneration?: PreemptiveGeneration;
   agent: Agent;
   agentSession: AgentSession;
@@ -589,8 +602,12 @@ export class AgentActivity implements RecognitionHooks {
     this.agentSession._updateUserState('speaking');
   }
-  onEndOfSpeech(_ev: VADEvent): void {
-    this.agentSession._updateUserState('listening');
+  onEndOfSpeech(ev: VADEvent): void {
+    let speechEndTime = Date.now();
+    if (ev) {
+      speechEndTime = speechEndTime - ev.silenceDuration;
+    }
+    this.agentSession._updateUserState('listening', speechEndTime);
   }
   onVADInferenceDone(ev: VADEvent): void {
@@ -664,6 +681,55 @@ export class AgentActivity implements RecognitionHooks {
     );
   }
+  onPreemptiveGeneration(info: PreemptiveGenerationInfo): void {
+    if (
+      !this.agentSession.options.preemptiveGeneration ||
+      this.draining ||
+      (this._currentSpeech !== undefined && !this._currentSpeech.interrupted) ||
+      !(this.llm instanceof LLM)
+    ) {
+      return;
+    }
+    this.cancelPreemptiveGeneration();
+    this.logger.info(
+      {
+        newTranscript: info.newTranscript,
+        transcriptConfidence: info.transcriptConfidence,
+      },
+      'starting preemptive generation',
+    );
+    const userMessage = ChatMessage.create({
+      role: 'user',
+      content: info.newTranscript,
+    });
+    const chatCtx = this.agent.chatCtx.copy();
+    const speechHandle = this.generateReply({
+      userMessage,
+      chatCtx,
+      scheduleSpeech: false,
+    });
+    this._preemptiveGeneration = {
+      speechHandle,
+      userMessage,
+      info,
+      chatCtx: chatCtx.copy(),
+      tools: { ...this.tools },
+      toolChoice: this.toolChoice,
+      createdAt: Date.now(),
+    };
+  }
+  private cancelPreemptiveGeneration(): void {
+    if (this._preemptiveGeneration !== undefined) {
+      this._preemptiveGeneration.speechHandle._cancel();
+      this._preemptiveGeneration = undefined;
+    }
+  }
   private createSpeechTask(options: {
     task: Task<void>;
     ownedSpeechHandle?: SpeechHandle;
@@ -694,6 +760,7 @@ export class AgentActivity implements RecognitionHooks {
   async onEndOfTurn(info: EndOfTurnInfo): Promise<boolean> {
     if (this.draining) {
+      this.cancelPreemptiveGeneration();
       this.logger.warn({ user_input: info.newTranscript }, 'skipping user input, task is draining');
       // copied from python:
       // TODO(shubhra): should we "forward" this new turn to the next agent/activity?
@@ -710,6 +777,7 @@ export class AgentActivity implements RecognitionHooks {
       info.newTranscript.split(' ').length < this.agentSession.options.minInterruptionWords
     ) {
       // avoid interruption if the new_transcript is too short
+      this.cancelPreemptiveGeneration();
       this.logger.info('skipping user input, new_transcript is too short');
       return false;
     }
@@ -775,6 +843,7 @@ export class AgentActivity implements RecognitionHooks {
     instructions?: string;
     toolChoice?: ToolChoice | null;
     allowInterruptions?: boolean;
+    scheduleSpeech?: boolean;
   }): SpeechHandle {
     const {
       userMessage,
@@ -782,6 +851,7 @@ export class AgentActivity implements RecognitionHooks {
       instructions: defaultInstructions,
       toolChoice: defaultToolChoice,
       allowInterruptions: defaultAllowInterruptions,
+      scheduleSpeech = true,
     } = options;
     let instructions = defaultInstructions;
@@ -871,7 +941,9 @@ export class AgentActivity implements RecognitionHooks {
       task.finally(() => this.onPipelineReplyDone());
     }
-    this.scheduleSpeech(handle, SpeechHandle.SPEECH_PRIORITY_NORMAL);
+    if (scheduleSpeech) {
+      this.scheduleSpeech(handle, SpeechHandle.SPEECH_PRIORITY_NORMAL);
+    }
     return handle;
   }
@@ -977,9 +1049,40 @@ export class AgentActivity implements RecognitionHooks {
       return;
     }
-    // Ensure the new message is passed to generateReply
-    // This preserves the original message id, making it easier for users to track responses
-    const speechHandle = this.generateReply({ userMessage, chatCtx });
+    let speechHandle: SpeechHandle | undefined;
+    if (this._preemptiveGeneration !== undefined) {
+      const preemptive = this._preemptiveGeneration;
+      // make sure the onUserTurnCompleted didn't change some request parameters
+      // otherwise invalidate the preemptive generation
+      if (
+        preemptive.info.newTranscript === userMessage?.textContent &&
+        preemptive.chatCtx.isEquivalent(chatCtx) &&
+        isSameToolContext(preemptive.tools, this.tools) &&
+        isSameToolChoice(preemptive.toolChoice, this.toolChoice)
+      ) {
+        speechHandle = preemptive.speechHandle;
+        this.scheduleSpeech(speechHandle, SpeechHandle.SPEECH_PRIORITY_NORMAL);
+        this.logger.debug(
+          {
+            preemptiveLeadTime: Date.now() - preemptive.createdAt,
+          },
+          'using preemptive generation',
+        );
+      } else {
+        this.logger.warn(
+          'preemptive generation enabled but chat context or tools have changed after `onUserTurnCompleted`',
+        );
+        preemptive.speechHandle._cancel();
+      }
+      this._preemptiveGeneration = undefined;
+    }
+    if (speechHandle === undefined) {
+      // Ensure the new message is passed to generateReply
+      // This preserves the original message id, making it easier for users to track responses
+      speechHandle = this.generateReply({ userMessage, chatCtx });
+    }
     const eouMetrics: EOUMetrics = {
       type: 'eou_metrics',
@@ -987,6 +1090,7 @@ export class AgentActivity implements RecognitionHooks {
       endOfUtteranceDelayMs: info.endOfUtteranceDelay,
       transcriptionDelayMs: info.transcriptionDelay,
       onUserTurnCompletedDelayMs: callbackDuration,
+      lastSpeakingTimeMs: info.stoppedSpeakingAt ?? 0,
       speechId: speechHandle.id,
     };
@@ -1139,10 +1243,9 @@ export class AgentActivity implements RecognitionHooks {
     chatCtx = chatCtx.copy();
+    // Insert new message into temporary chat context for LLM inference
     if (newMessage) {
       chatCtx.insert(newMessage);
-      this.agent._chatCtx.insert(newMessage);
-      this.agentSession._conversationItemAdded(newMessage);
     }
     if (instructions) {
@@ -1157,7 +1260,6 @@ export class AgentActivity implements RecognitionHooks {
       }
     }
-    this.agentSession._updateAgentState('thinking');
     const tasks: Array<Task<void>> = [];
     const [llmTask, llmGenData] = performLLMInference(
       // preserve  `this` context in llmNode
@@ -1185,6 +1287,12 @@ export class AgentActivity implements RecognitionHooks {
     await speechHandle.waitIfNotInterrupted([speechHandle._waitForScheduled()]);
+    // Add new message to actual chat context if the speech is scheduled
+    if (newMessage && speechHandle.scheduled) {
+      this.agent._chatCtx.insert(newMessage);
+      this.agentSession._conversationItemAdded(newMessage);
+    }
     if (speechHandle.interrupted) {
       replyAbortController.abort();
       await cancelAndWait(tasks, AgentActivity.REPLY_TASK_CANCEL_TIMEOUT);
@@ -1917,6 +2025,7 @@ export class AgentActivity implements RecognitionHooks {
     try {
       if (this._draining) return;
+      this.cancelPreemptiveGeneration();
       this.createSpeechTask({
         task: Task.from(() => this.agent.onExit()),
         name: 'AgentActivity_onExit',
@@ -1937,6 +2046,7 @@ export class AgentActivity implements RecognitionHooks {
         this.logger.warn('task closing without draining');
       }
+      this.cancelPreemptiveGeneration();
       // Unregister event handlers to prevent duplicate metrics
       if (this.llm instanceof LLM) {
         this.llm.off('metrics_collected', this.onMetricsCollected);

package/src/voice/agent_session.ts CHANGED Viewed

@@ -57,6 +57,7 @@ export interface VoiceOptions {
   minEndpointingDelay: number;
   maxEndpointingDelay: number;
   maxToolSteps: number;
+  preemptiveGeneration: boolean;
 }
 const defaultVoiceOptions: VoiceOptions = {
@@ -67,6 +68,7 @@ const defaultVoiceOptions: VoiceOptions = {
   minEndpointingDelay: 500,
   maxEndpointingDelay: 6000,
   maxToolSteps: 3,
+  preemptiveGeneration: false,
 } as const;
 export type TurnDetectionMode = 'stt' | 'vad' | 'realtime_llm' | 'manual' | _TurnDetector;
@@ -421,7 +423,7 @@ export class AgentSession<
   }
   /** @internal */
-  _updateUserState(state: UserState) {
+  _updateUserState(state: UserState, _lastSpeakingTime?: number) {
     if (this.userState === state) {
       return;
     }

package/src/voice/audio_recognition.ts CHANGED Viewed

@@ -17,8 +17,16 @@ import type { STTNode } from './io.js';
 export interface EndOfTurnInfo {
   newTranscript: string;
+  transcriptConfidence: number;
   transcriptionDelay: number;
   endOfUtteranceDelay: number;
+  startedSpeakingAt: number | undefined;
+  stoppedSpeakingAt: number | undefined;
+}
+export interface PreemptiveGenerationInfo {
+  newTranscript: string;
+  transcriptConfidence: number;
 }
 export interface RecognitionHooks {
@@ -28,6 +36,7 @@ export interface RecognitionHooks {
   onInterimTranscript: (ev: SpeechEvent) => void;
   onFinalTranscript: (ev: SpeechEvent) => void;
   onEndOfTurn: (info: EndOfTurnInfo) => Promise<boolean>;
+  onPreemptiveGeneration: (info: PreemptiveGenerationInfo) => void;
   retrieveChatCtx: () => ChatContext;
 }
@@ -63,7 +72,10 @@ export class AudioRecognition {
   private lastFinalTranscriptTime = 0;
   private audioTranscript = '';
   private audioInterimTranscript = '';
-  private lastSpeakingTime = 0;
+  private audioPreflightTranscript = '';
+  private finalTranscriptConfidence: number[] = [];
+  private lastSpeakingTime: number | undefined;
+  private speechStartTime: number | undefined;
   private userTurnCommitted = false;
   private speaking = false;
   private sampleRate?: number;
@@ -144,6 +156,7 @@ export class AudioRecognition {
       case SpeechEventType.FINAL_TRANSCRIPT:
         this.hooks.onFinalTranscript(ev);
         const transcript = ev.alternatives?.[0]?.text;
+        const confidence = ev.alternatives?.[0]?.confidence ?? 0;
         this.lastLanguage = ev.alternatives?.[0]?.language;
         if (!transcript) {
@@ -162,34 +175,144 @@ export class AudioRecognition {
         this.lastFinalTranscriptTime = Date.now();
         this.audioTranscript += ` ${transcript}`;
         this.audioTranscript = this.audioTranscript.trimStart();
+        this.finalTranscriptConfidence.push(confidence);
+        const transcriptChanged = this.audioTranscript !== this.audioPreflightTranscript;
         this.audioInterimTranscript = '';
+        this.audioPreflightTranscript = '';
+        if (!this.vad || this.lastSpeakingTime === undefined) {
+          // vad disabled, use stt timestamp
+          // TODO: this would screw up transcription latency metrics
+          // but we'll live with it for now.
+          // the correct way is to ensure STT fires SpeechEventType.END_OF_SPEECH
+          // and using that timestamp for lastSpeakingTime
+          this.lastSpeakingTime = Date.now();
+        }
-        if (!this.speaking) {
-          if (!this.vad) {
-            // Copied from python agents:
-            // vad disabled, use stt timestamp
-            // TODO: this would screw up transcription latency metrics
-            // but we'll live with it for now.
-            // the correct way is to ensure STT fires SpeechEventType.END_OF_SPEECH
-            // and using that timestamp for _last_speaking_time
-            this.lastSpeakingTime = Date.now();
+        if (this.vadBaseTurnDetection || this.userTurnCommitted) {
+          if (transcriptChanged) {
+            this.logger.debug(
+              { transcript: this.audioTranscript },
+              'triggering preemptive generation (FINAL_TRANSCRIPT)',
+            );
+            this.hooks.onPreemptiveGeneration({
+              newTranscript: this.audioTranscript,
+              transcriptConfidence:
+                this.finalTranscriptConfidence.length > 0
+                  ? this.finalTranscriptConfidence.reduce((a, b) => a + b, 0) /
+                    this.finalTranscriptConfidence.length
+                  : 0,
+            });
           }
-          if (this.vadBaseTurnDetection || this.userTurnCommitted) {
+          if (!this.speaking) {
             const chatCtx = this.hooks.retrieveChatCtx();
             this.logger.debug('running EOU detection on stt FINAL_TRANSCRIPT');
             this.runEOUDetection(chatCtx);
           }
         }
         break;
+      case SpeechEventType.PREFLIGHT_TRANSCRIPT:
+        this.hooks.onInterimTranscript(ev);
+        const preflightTranscript = ev.alternatives?.[0]?.text ?? '';
+        const preflightConfidence = ev.alternatives?.[0]?.confidence ?? 0;
+        const preflightLanguage = ev.alternatives?.[0]?.language;
+        const MIN_LANGUAGE_DETECTION_LENGTH = 5;
+        if (
+          !this.lastLanguage ||
+          (preflightLanguage && preflightTranscript.length > MIN_LANGUAGE_DETECTION_LENGTH)
+        ) {
+          this.lastLanguage = preflightLanguage;
+        }
+        if (!preflightTranscript) {
+          return;
+        }
+        this.logger.debug(
+          {
+            user_transcript: preflightTranscript,
+            language: this.lastLanguage,
+          },
+          'received user preflight transcript',
+        );
+        // still need to increment it as it's used for turn detection,
+        this.lastFinalTranscriptTime = Date.now();
+        // preflight transcript includes all pre-committed transcripts (including final transcript from the previous STT run)
+        this.audioPreflightTranscript =
+          `${this.audioTranscript} ${preflightTranscript}`.trimStart();
+        this.audioInterimTranscript = preflightTranscript;
+        if (!this.vad || this.lastSpeakingTime === undefined) {
+          // vad disabled, use stt timestamp
+          this.lastSpeakingTime = Date.now();
+        }
+        if (this.turnDetectionMode !== 'manual' || this.userTurnCommitted) {
+          const confidenceVals = [...this.finalTranscriptConfidence, preflightConfidence];
+          this.logger.debug(
+            {
+              transcript:
+                this.audioPreflightTranscript.length > 100
+                  ? this.audioPreflightTranscript.slice(0, 100) + '...'
+                  : this.audioPreflightTranscript,
+            },
+            'triggering preemptive generation (PREFLIGHT_TRANSCRIPT)',
+          );
+          this.hooks.onPreemptiveGeneration({
+            newTranscript: this.audioPreflightTranscript,
+            transcriptConfidence:
+              confidenceVals.length > 0
+                ? confidenceVals.reduce((a, b) => a + b, 0) / confidenceVals.length
+                : 0,
+          });
+        }
+        break;
       case SpeechEventType.INTERIM_TRANSCRIPT:
         this.logger.debug({ transcript: ev.alternatives?.[0]?.text }, 'interim transcript');
         this.hooks.onInterimTranscript(ev);
         this.audioInterimTranscript = ev.alternatives?.[0]?.text ?? '';
         break;
+      case SpeechEventType.START_OF_SPEECH:
+        if (this.turnDetectionMode !== 'stt') break;
+        this.hooks.onStartOfSpeech({
+          type: VADEventType.START_OF_SPEECH,
+          samplesIndex: 0,
+          timestamp: Date.now(),
+          speechDuration: 0,
+          silenceDuration: 0,
+          frames: [],
+          probability: 0,
+          inferenceDuration: 0,
+          speaking: true,
+          rawAccumulatedSilence: 0,
+          rawAccumulatedSpeech: 0,
+        });
+        this.speaking = true;
+        this.lastSpeakingTime = Date.now();
+        this.bounceEOUTask?.cancel();
+        break;
       case SpeechEventType.END_OF_SPEECH:
         if (this.turnDetectionMode !== 'stt') break;
+        this.hooks.onEndOfSpeech({
+          type: VADEventType.END_OF_SPEECH,
+          samplesIndex: 0,
+          timestamp: Date.now(),
+          speechDuration: 0,
+          silenceDuration: 0,
+          frames: [],
+          probability: 0,
+          inferenceDuration: 0,
+          speaking: false,
+          rawAccumulatedSilence: 0,
+          rawAccumulatedSpeech: 0,
+        });
+        this.speaking = false;
         this.userTurnCommitted = true;
+        this.lastSpeakingTime = Date.now();
         if (!this.speaking) {
           const chatCtx = this.hooks.retrieveChatCtx();
@@ -222,61 +345,106 @@ export class AudioRecognition {
       // disable EOU model if manual turn detection enabled
       this.audioTranscript && this.turnDetectionMode !== 'manual' ? this.turnDetector : undefined;
-    const bounceEOUTask = (lastSpeakingTime: number) => async (controller: AbortController) => {
-      let endpointingDelay = this.minEndpointingDelay;
-      // TODO(AJS-74): need to support actual turn detection model plugins for following code to run
-      if (turnDetector) {
-        this.logger.debug('Running turn detector model');
-        if (!turnDetector.supportsLanguage(this.lastLanguage)) {
-          this.logger.debug(`Turn detector does not support language ${this.lastLanguage}`);
-        } else {
-          const endOfTurnProbability = await turnDetector.predictEndOfTurn(chatCtx);
-          this.logger.debug(
-            { endOfTurnProbability, language: this.lastLanguage },
-            'end of turn probability',
-          );
-          const unlikelyThreshold = await turnDetector.unlikelyThreshold(this.lastLanguage);
-          this.logger.debug(
-            {
-              unlikelyThreshold,
-              endOfTurnProbability,
-              language: this.lastLanguage,
-              transcript: this.audioTranscript,
-            },
-            'EOU Detection',
-          );
+    const bounceEOUTask =
+      (
+        lastSpeakingTime: number | undefined,
+        lastFinalTranscriptTime: number,
+        speechStartTime: number | undefined,
+      ) =>
+      async (controller: AbortController) => {
+        let endpointingDelay = this.minEndpointingDelay;
-          if (unlikelyThreshold && endOfTurnProbability < unlikelyThreshold) {
-            endpointingDelay = this.maxEndpointingDelay;
+        if (turnDetector) {
+          this.logger.debug('Running turn detector model');
+          if (!turnDetector.supportsLanguage(this.lastLanguage)) {
+            this.logger.debug(`Turn detector does not support language ${this.lastLanguage}`);
+          } else {
+            const endOfTurnProbability = await turnDetector.predictEndOfTurn(chatCtx);
+            this.logger.debug(
+              { endOfTurnProbability, language: this.lastLanguage },
+              'end of turn probability',
+            );
+            const unlikelyThreshold = await turnDetector.unlikelyThreshold(this.lastLanguage);
+            this.logger.debug(
+              {
+                unlikelyThreshold,
+                endOfTurnProbability,
+                language: this.lastLanguage,
+                transcript: this.audioTranscript,
+              },
+              'EOU Detection',
+            );
+            if (unlikelyThreshold && endOfTurnProbability < unlikelyThreshold) {
+              endpointingDelay = this.maxEndpointingDelay;
+            }
           }
         }
-      }
-      const extraSleep = lastSpeakingTime + endpointingDelay - Date.now();
-      // add delay to see if there's a potential upcoming EOU task that cancels this one
-      await delay(Math.max(extraSleep, 0), { signal: controller.signal });
+        let extraSleep = endpointingDelay;
+        if (lastSpeakingTime !== undefined) {
+          extraSleep += lastSpeakingTime - Date.now();
+        }
-      this.logger.debug({ transcript: this.audioTranscript }, 'end of user turn');
+        if (extraSleep > 0) {
+          // add delay to see if there's a potential upcoming EOU task that cancels this one
+          await delay(Math.max(extraSleep, 0), { signal: controller.signal });
+        }
-      const committed = await this.hooks.onEndOfTurn({
-        newTranscript: this.audioTranscript,
-        transcriptionDelay: Math.max(this.lastFinalTranscriptTime - lastSpeakingTime, 0),
-        endOfUtteranceDelay: Date.now() - lastSpeakingTime,
-      });
+        this.logger.debug({ transcript: this.audioTranscript }, 'end of user turn');
+        const confidenceAvg =
+          this.finalTranscriptConfidence.length > 0
+            ? this.finalTranscriptConfidence.reduce((a, b) => a + b, 0) /
+              this.finalTranscriptConfidence.length
+            : 0;
+        let startedSpeakingAt: number | undefined;
+        let stoppedSpeakingAt: number | undefined;
+        let transcriptionDelay: number | undefined;
+        let endOfUtteranceDelay: number | undefined;
+        // sometimes, we can't calculate the metrics because VAD was unreliable.
+        // in this case, we just ignore the calculation, it's better than providing likely wrong values
+        if (
+          lastFinalTranscriptTime !== 0 &&
+          lastSpeakingTime !== undefined &&
+          speechStartTime !== undefined
+        ) {
+          startedSpeakingAt = speechStartTime;
+          stoppedSpeakingAt = lastSpeakingTime;
+          transcriptionDelay = Math.max(lastFinalTranscriptTime - lastSpeakingTime, 0);
+          endOfUtteranceDelay = Date.now() - lastSpeakingTime;
+        }
-      if (committed) {
-        // clear the transcript if the user turn was committed
-        this.audioTranscript = '';
-      }
+        const committed = await this.hooks.onEndOfTurn({
+          newTranscript: this.audioTranscript,
+          transcriptConfidence: confidenceAvg,
+          transcriptionDelay: transcriptionDelay ?? 0,
+          endOfUtteranceDelay: endOfUtteranceDelay ?? 0,
+          startedSpeakingAt,
+          stoppedSpeakingAt,
+        });
+        if (committed) {
+          // clear the transcript if the user turn was committed
+          this.audioTranscript = '';
+          this.finalTranscriptConfidence = [];
+          this.lastSpeakingTime = undefined;
+          this.lastFinalTranscriptTime = 0;
+          this.speechStartTime = undefined;
+        }
-      this.userTurnCommitted = false;
-    };
+        this.userTurnCommitted = false;
+      };
     // cancel any existing EOU task
     this.bounceEOUTask?.cancel();
-    this.bounceEOUTask = Task.from(bounceEOUTask(this.lastSpeakingTime));
+    // copy the values before awaiting (the values can change)
+    this.bounceEOUTask = Task.from(
+      bounceEOUTask(this.lastSpeakingTime, this.lastFinalTranscriptTime, this.speechStartTime),
+    );
     this.bounceEOUTask.result
       .then(() => {
@@ -376,13 +544,21 @@ export class AudioRecognition {
             break;
           case VADEventType.INFERENCE_DONE:
             this.hooks.onVADInferenceDone(ev);
+            // for metrics, get the "earliest" signal of speech as possible
+            if (ev.rawAccumulatedSpeech > 0.0) {
+              this.lastSpeakingTime = Date.now();
+              if (this.speechStartTime === undefined) {
+                this.speechStartTime = Date.now();
+              }
+            }
             break;
           case VADEventType.END_OF_SPEECH:
             this.logger.debug('VAD task: END_OF_SPEECH');
             this.hooks.onEndOfSpeech(ev);
-            this.speaking = false;
             // when VAD fires END_OF_SPEECH, it already waited for the silence_duration
-            this.lastSpeakingTime = Date.now() - ev.silenceDuration;
+            this.speaking = false;
             if (
               this.vadBaseTurnDetection ||
@@ -412,6 +588,8 @@ export class AudioRecognition {
   clearUserTurn() {
     this.audioTranscript = '';
     this.audioInterimTranscript = '';
+    this.audioPreflightTranscript = '';
+    this.finalTranscriptConfidence = [];
     this.userTurnCommitted = false;
     this.sttTask?.cancelAndWait().finally(() => {

package/src/voice/generation.ts CHANGED Viewed

@@ -4,7 +4,6 @@
 import type { AudioFrame } from '@livekit/rtc-node';
 import { AudioResampler } from '@livekit/rtc-node';
 import type { ReadableStream, ReadableStreamDefaultReader } from 'stream/web';
-import { ZodObject } from 'zod';
 import {
   type ChatContext,
   ChatMessage,
@@ -19,6 +18,7 @@ import {
   isFunctionTool,
   isToolError,
 } from '../llm/tool_context.js';
+import { isZodSchema, parseZodSchema } from '../llm/zod-utils.js';
 import { log } from '../log.js';
 import { IdentityTransform } from '../stream/identity_transform.js';
 import { Future, Task, shortuuid, toError } from '../utils.js';
@@ -732,8 +732,13 @@ export function performToolExecutions({
       try {
         const jsonArgs = JSON.parse(toolCall.args);
-        if (tool.parameters instanceof ZodObject) {
-          parsedArgs = tool.parameters.parse(jsonArgs);
+        if (isZodSchema(tool.parameters)) {
+          const result = await parseZodSchema<object>(tool.parameters, jsonArgs);
+          if (result.success) {
+            parsedArgs = result.data;
+          } else {
+            throw result.error;
+          }
         } else {
           parsedArgs = jsonArgs;
         }