npm - @livekit/agents - Versions diffs - 1.0.16 → 1.0.18 - Mend

@livekit/agents 1.0.16 → 1.0.18

This diff shows the changes between publicly available package versions that have been released to one of the supported registries. It is provided for informational purposes only and reflects the package contents as they appear in their respective public registries.

Files changed (99)

package/dist/inference/llm.cjs +35 -13
package/dist/inference/llm.cjs.map +1 -1
package/dist/inference/llm.d.cts +10 -5
package/dist/inference/llm.d.ts +10 -5
package/dist/inference/llm.d.ts.map +1 -1
package/dist/inference/llm.js +35 -13
package/dist/inference/llm.js.map +1 -1
package/dist/llm/chat_context.d.cts +1 -1
package/dist/llm/chat_context.d.ts +1 -1
package/dist/llm/llm.cjs.map +1 -1
package/dist/llm/llm.d.cts +1 -1
package/dist/llm/llm.d.ts +1 -1
package/dist/llm/llm.d.ts.map +1 -1
package/dist/llm/llm.js.map +1 -1
package/dist/llm/provider_format/google.cjs.map +1 -1
package/dist/llm/provider_format/google.d.cts +1 -1
package/dist/llm/provider_format/google.d.ts +1 -1
package/dist/llm/provider_format/google.d.ts.map +1 -1
package/dist/llm/provider_format/google.js.map +1 -1
package/dist/llm/provider_format/index.d.cts +1 -1
package/dist/llm/provider_format/index.d.ts +1 -1
package/dist/llm/provider_format/index.d.ts.map +1 -1
package/dist/llm/realtime.cjs.map +1 -1
package/dist/llm/realtime.d.cts +4 -0
package/dist/llm/realtime.d.ts +4 -0
package/dist/llm/realtime.d.ts.map +1 -1
package/dist/llm/realtime.js.map +1 -1
package/dist/llm/utils.cjs +2 -2
package/dist/llm/utils.cjs.map +1 -1
package/dist/llm/utils.d.cts +1 -1
package/dist/llm/utils.d.ts +1 -1
package/dist/llm/utils.d.ts.map +1 -1
package/dist/llm/utils.js +2 -2
package/dist/llm/utils.js.map +1 -1
package/dist/llm/zod-utils.cjs +6 -3
package/dist/llm/zod-utils.cjs.map +1 -1
package/dist/llm/zod-utils.d.cts +1 -1
package/dist/llm/zod-utils.d.ts +1 -1
package/dist/llm/zod-utils.d.ts.map +1 -1
package/dist/llm/zod-utils.js +6 -3
package/dist/llm/zod-utils.js.map +1 -1
package/dist/llm/zod-utils.test.cjs +83 -0
package/dist/llm/zod-utils.test.cjs.map +1 -1
package/dist/llm/zod-utils.test.js +83 -0
package/dist/llm/zod-utils.test.js.map +1 -1
package/dist/stt/stt.cjs +0 -1
package/dist/stt/stt.cjs.map +1 -1
package/dist/stt/stt.d.ts.map +1 -1
package/dist/stt/stt.js +0 -1
package/dist/stt/stt.js.map +1 -1
package/dist/tts/tts.cjs +2 -4
package/dist/tts/tts.cjs.map +1 -1
package/dist/tts/tts.d.ts.map +1 -1
package/dist/tts/tts.js +3 -5
package/dist/tts/tts.js.map +1 -1
package/dist/utils.cjs.map +1 -1
package/dist/utils.d.cts +7 -0
package/dist/utils.d.ts +7 -0
package/dist/utils.d.ts.map +1 -1
package/dist/utils.js.map +1 -1
package/dist/voice/agent_activity.cjs +69 -20
package/dist/voice/agent_activity.cjs.map +1 -1
package/dist/voice/agent_activity.d.ts.map +1 -1
package/dist/voice/agent_activity.js +69 -20
package/dist/voice/agent_activity.js.map +1 -1
package/dist/voice/agent_session.cjs +40 -1
package/dist/voice/agent_session.cjs.map +1 -1
package/dist/voice/agent_session.d.cts +5 -0
package/dist/voice/agent_session.d.ts +5 -0
package/dist/voice/agent_session.d.ts.map +1 -1
package/dist/voice/agent_session.js +40 -1
package/dist/voice/agent_session.js.map +1 -1
package/dist/voice/interruption_detection.test.cjs +114 -0
package/dist/voice/interruption_detection.test.cjs.map +1 -0
package/dist/voice/interruption_detection.test.js +113 -0
package/dist/voice/interruption_detection.test.js.map +1 -0
package/dist/voice/room_io/room_io.cjs +3 -0
package/dist/voice/room_io/room_io.cjs.map +1 -1
package/dist/voice/room_io/room_io.d.cts +1 -0
package/dist/voice/room_io/room_io.d.ts +1 -0
package/dist/voice/room_io/room_io.d.ts.map +1 -1
package/dist/voice/room_io/room_io.js +3 -0
package/dist/voice/room_io/room_io.js.map +1 -1
package/package.json +3 -3
package/src/inference/llm.ts +53 -21
package/src/llm/__snapshots__/zod-utils.test.ts.snap +218 -0
package/src/llm/llm.ts +1 -1
package/src/llm/provider_format/google.ts +4 -4
package/src/llm/realtime.ts +8 -1
package/src/llm/utils.ts +7 -2
package/src/llm/zod-utils.test.ts +101 -0
package/src/llm/zod-utils.ts +12 -3
package/src/stt/stt.ts +2 -1
package/src/tts/tts.ts +7 -5
package/src/utils.ts +17 -0
package/src/voice/agent_activity.ts +96 -24
package/src/voice/agent_session.ts +54 -0
package/src/voice/interruption_detection.test.ts +151 -0
package/src/voice/room_io/room_io.ts +4 -0

package/src/voice/agent_activity.ts CHANGED Viewed

@@ -235,6 +235,14 @@ export class AgentActivity implements RecognitionHooks {
         } catch (error) {
           this.logger.error(error, 'failed to update the tools');
         }
+        if (!this.llm.capabilities.audioOutput && !this.tts && this.agentSession.output.audio) {
+          this.logger.error(
+            'audio output is enabled but RealtimeModel has no audio modality ' +
+              'and no TTS is set. Either enable audio modality in the RealtimeModel ' +
+              'or set a TTS model.',
+          );
+        }
       } else if (this.llm instanceof LLM) {
         try {
           updateInstructions({
@@ -625,11 +633,21 @@ export class AgentActivity implements RecognitionHooks {
       return;
     }
+    // Refactored interruption word count check:
+    // - Always apply minInterruptionWords filtering when STT is available and minInterruptionWords > 0
+    // - Apply check to all STT results: empty string, undefined, or any length
+    // - This ensures consistent behavior across all interruption scenarios
     if (this.stt && this.agentSession.options.minInterruptionWords > 0 && this.audioRecognition) {
       const text = this.audioRecognition.currentTranscript;
       // TODO(shubhra): better word splitting for multi-language
-      if (text && splitWords(text, true).length < this.agentSession.options.minInterruptionWords) {
+      // Normalize text: convert undefined/null to empty string for consistent word counting
+      const normalizedText = text ?? '';
+      const wordCount = splitWords(normalizedText, true).length;
+      // Only allow interruption if word count meets or exceeds minInterruptionWords
+      // This applies to all cases: empty strings, partial speech, and full speech
+      if (wordCount < this.agentSession.options.minInterruptionWords) {
         return;
       }
     }
@@ -767,19 +785,30 @@ export class AgentActivity implements RecognitionHooks {
       return true;
     }
+    // Refactored interruption word count check for consistency with onVADInferenceDone:
+    // - Always apply minInterruptionWords filtering when STT is available and minInterruptionWords > 0
+    // - Use consistent word splitting logic with splitWords (matching onVADInferenceDone pattern)
     if (
       this.stt &&
       this.turnDetection !== 'manual' &&
       this._currentSpeech &&
       this._currentSpeech.allowInterruptions &&
       !this._currentSpeech.interrupted &&
-      this.agentSession.options.minInterruptionWords > 0 &&
-      info.newTranscript.split(' ').length < this.agentSession.options.minInterruptionWords
+      this.agentSession.options.minInterruptionWords > 0
     ) {
-      // avoid interruption if the new_transcript is too short
-      this.cancelPreemptiveGeneration();
-      this.logger.info('skipping user input, new_transcript is too short');
-      return false;
+      const wordCount = splitWords(info.newTranscript, true).length;
+      if (wordCount < this.agentSession.options.minInterruptionWords) {
+        // avoid interruption if the new_transcript contains fewer words than minInterruptionWords
+        this.cancelPreemptiveGeneration();
+        this.logger.info(
+          {
+            wordCount,
+            minInterruptionWords: this.agentSession.options.minInterruptionWords,
+          },
+          'skipping user input, word count below minimum interruption threshold',
+        );
+        return false;
+      }
     }
     const oldTask = this._userTurnCompletedTask;
@@ -1612,7 +1641,7 @@ export class AgentActivity implements RecognitionHooks {
     const readMessages = async (
       abortController: AbortController,
-      outputs: Array<[string, _TextOut | null, _AudioOut | null]>,
+      outputs: Array<[string, _TextOut | null, _AudioOut | null, ('text' | 'audio')[] | undefined]>,
     ) => {
       replyAbortController.signal.addEventListener('abort', () => abortController.abort(), {
         once: true,
@@ -1627,7 +1656,25 @@ export class AgentActivity implements RecognitionHooks {
             );
             break;
           }
-          const trNodeResult = await this.agent.transcriptionNode(msg.textStream, modelSettings);
+          const msgModalities = msg.modalities ? await msg.modalities : undefined;
+          let ttsTextInput: ReadableStream<string> | null = null;
+          let trTextInput: ReadableStream<string>;
+          if (msgModalities && !msgModalities.includes('audio') && this.tts) {
+            if (this.llm instanceof RealtimeModel && this.llm.capabilities.audioOutput) {
+              this.logger.warn(
+                'text response received from realtime API, falling back to use a TTS model.',
+              );
+            }
+            const [_ttsTextInput, _trTextInput] = msg.textStream.tee();
+            ttsTextInput = _ttsTextInput;
+            trTextInput = _trTextInput;
+          } else {
+            trTextInput = msg.textStream;
+          }
+          const trNodeResult = await this.agent.transcriptionNode(trTextInput, modelSettings);
           let textOut: _TextOut | null = null;
           if (trNodeResult) {
             const [textForwardTask, _textOut] = performTextForwarding(
@@ -1638,30 +1685,51 @@ export class AgentActivity implements RecognitionHooks {
             forwardTasks.push(textForwardTask);
             textOut = _textOut;
           }
           let audioOut: _AudioOut | null = null;
           if (audioOutput) {
-            const realtimeAudio = await this.agent.realtimeAudioOutputNode(
-              msg.audioStream,
-              modelSettings,
-            );
-            if (realtimeAudio) {
+            let realtimeAudioResult: ReadableStream<AudioFrame> | null = null;
+            if (ttsTextInput) {
+              const [ttsTask, ttsStream] = performTTSInference(
+                (...args) => this.agent.ttsNode(...args),
+                ttsTextInput,
+                modelSettings,
+                abortController,
+              );
+              tasks.push(ttsTask);
+              realtimeAudioResult = ttsStream;
+            } else if (msgModalities && msgModalities.includes('audio')) {
+              realtimeAudioResult = await this.agent.realtimeAudioOutputNode(
+                msg.audioStream,
+                modelSettings,
+              );
+            } else if (this.llm instanceof RealtimeModel && this.llm.capabilities.audioOutput) {
+              this.logger.error(
+                'Text message received from Realtime API with audio modality. ' +
+                  'This usually happens when text chat context is synced to the API. ' +
+                  'Try to add a TTS model as fallback or use text modality with TTS instead.',
+              );
+            } else {
+              this.logger.warn(
+                'audio output is enabled but neither tts nor realtime audio is available',
+              );
+            }
+            if (realtimeAudioResult) {
               const [forwardTask, _audioOut] = performAudioForwarding(
-                realtimeAudio,
+                realtimeAudioResult,
                 audioOutput,
                 abortController,
               );
               forwardTasks.push(forwardTask);
               audioOut = _audioOut;
               audioOut.firstFrameFut.await.finally(onFirstFrame);
-            } else {
-              this.logger.warn(
-                'audio output is enabled but neither tts nor realtime audio is available',
-              );
             }
           } else if (textOut) {
             textOut.firstTextFut.await.finally(onFirstFrame);
           }
-          outputs.push([msg.messageId, textOut, audioOut]);
+          outputs.push([msg.messageId, textOut, audioOut, msgModalities]);
         }
         await waitFor(forwardTasks);
       } catch (error) {
@@ -1671,7 +1739,9 @@ export class AgentActivity implements RecognitionHooks {
       }
     };
-    const messageOutputs: Array<[string, _TextOut | null, _AudioOut | null]> = [];
+    const messageOutputs: Array<
+      [string, _TextOut | null, _AudioOut | null, ('text' | 'audio')[] | undefined]
+    > = [];
     const tasks = [
       Task.from(
         (controller) => readMessages(controller, messageOutputs),
@@ -1750,7 +1820,7 @@ export class AgentActivity implements RecognitionHooks {
       if (messageOutputs.length > 0) {
         // there should be only one message
-        const [msgId, textOut, audioOut] = messageOutputs[0]!;
+        const [msgId, textOut, audioOut, msgModalities] = messageOutputs[0]!;
         let forwardedText = textOut?.text || '';
         if (audioOutput) {
@@ -1775,6 +1845,8 @@ export class AgentActivity implements RecognitionHooks {
           this.realtimeSession.truncate({
             messageId: msgId,
             audioEndMs: Math.floor(playbackPosition),
+            modalities: msgModalities,
+            audioTranscript: forwardedText,
           });
         }
@@ -1805,7 +1877,7 @@ export class AgentActivity implements RecognitionHooks {
     if (messageOutputs.length > 0) {
       // there should be only one message
-      const [msgId, textOut, _] = messageOutputs[0]!;
+      const [msgId, textOut, _, __] = messageOutputs[0]!;
       const message = ChatMessage.create({
         role: 'assistant',
         content: textOut?.text || '',

package/src/voice/agent_session.ts CHANGED Viewed

@@ -58,6 +58,7 @@ export interface VoiceOptions {
   maxEndpointingDelay: number;
   maxToolSteps: number;
   preemptiveGeneration: boolean;
+  userAwayTimeout?: number | null;
 }
 const defaultVoiceOptions: VoiceOptions = {
@@ -69,6 +70,7 @@ const defaultVoiceOptions: VoiceOptions = {
   maxEndpointingDelay: 6000,
   maxToolSteps: 3,
   preemptiveGeneration: false,
+  userAwayTimeout: 15.0,
 } as const;
 export type TurnDetectionMode = 'stt' | 'vad' | 'realtime_llm' | 'manual' | _TurnDetector;
@@ -123,6 +125,7 @@ export class AgentSession<
   private _output: AgentOutput;
   private closingTask: Promise<void> | null = null;
+  private userAwayTimer: NodeJS.Timeout | null = null;
   constructor(opts: AgentSessionOptions<UserData>) {
     super();
@@ -167,6 +170,8 @@ export class AgentSession<
     // This is the "global" chat context, it holds the entire conversation history
     this._chatCtx = ChatContext.empty();
     this.options = { ...defaultVoiceOptions, ...voiceOptions };
+    this.on(AgentSessionEventTypes.UserInputTranscribed, this._onUserInputTranscribed.bind(this));
   }
   get input(): AgentInput {
@@ -416,6 +421,14 @@ export class AgentSession<
     const oldState = this._agentState;
     this._agentState = state;
+    // Handle user away timer based on state changes
+    if (state === 'listening' && this.userState === 'listening') {
+      this._setUserAwayTimer();
+    } else {
+      this._cancelUserAwayTimer();
+    }
     this.emit(
       AgentSessionEventTypes.AgentStateChanged,
       createAgentStateChangedEvent(oldState, state),
@@ -430,6 +443,14 @@ export class AgentSession<
     const oldState = this.userState;
     this.userState = state;
+    // Handle user away timer based on state changes
+    if (state === 'listening' && this._agentState === 'listening') {
+      this._setUserAwayTimer();
+    } else {
+      this._cancelUserAwayTimer();
+    }
     this.emit(
       AgentSessionEventTypes.UserStateChanged,
       createUserStateChangedEvent(oldState, state),
@@ -451,6 +472,37 @@ export class AgentSession<
   private onTextOutputChanged(): void {}
+  private _setUserAwayTimer(): void {
+    this._cancelUserAwayTimer();
+    if (this.options.userAwayTimeout === null || this.options.userAwayTimeout === undefined) {
+      return;
+    }
+    if (this.roomIO && !this.roomIO.isParticipantAvailable) {
+      return;
+    }
+    this.userAwayTimer = setTimeout(() => {
+      this.logger.debug('User away timeout triggered');
+      this._updateUserState('away');
+    }, this.options.userAwayTimeout * 1000);
+  }
+  private _cancelUserAwayTimer(): void {
+    if (this.userAwayTimer !== null) {
+      clearTimeout(this.userAwayTimer);
+      this.userAwayTimer = null;
+    }
+  }
+  private _onUserInputTranscribed(ev: UserInputTranscribedEvent): void {
+    if (this.userState === 'away' && ev.isFinal) {
+      this.logger.debug('User returned from away state due to speech input');
+      this._updateUserState('listening');
+    }
+  }
   private async closeImpl(
     reason: CloseReason,
     error: RealtimeModelError | LLMError | TTSError | STTError | null = null,
@@ -460,6 +512,8 @@ export class AgentSession<
       return;
     }
+    this._cancelUserAwayTimer();
     if (this.activity) {
       if (!drain) {
         try {

package/src/voice/interruption_detection.test.ts ADDED Viewed

@@ -0,0 +1,151 @@
+// SPDX-FileCopyrightText: 2025 LiveKit, Inc.
+//
+// SPDX-License-Identifier: Apache-2.0
+/**
+ * Unit tests for interruption detection logic in AgentActivity.
+ *
+ * Tests the refactored minInterruptionWords check which ensures:
+ * - Consistent word count filtering across all speech scenarios
+ * - Proper handling of empty strings, undefined, and short speech
+ * - Interruptions allowed only when word count meets or exceeds minInterruptionWords threshold
+ */
+import { describe, expect, it } from 'vitest';
+import { splitWords } from '../tokenize/basic/word.js';
+describe('Interruption Detection - Word Counting', () => {
+  describe('Word Splitting Behavior', () => {
+    it('should count empty string as 0 words', () => {
+      const text = '';
+      const wordCount = splitWords(text, true).length;
+      expect(wordCount).toBe(0);
+    });
+    it('should count single word correctly', () => {
+      const text = 'hello';
+      const wordCount = splitWords(text, true).length;
+      expect(wordCount).toBe(1);
+    });
+    it('should count two words correctly', () => {
+      const text = 'hello world';
+      const wordCount = splitWords(text, true).length;
+      expect(wordCount).toBe(2);
+    });
+    it('should count multiple words correctly', () => {
+      const text = 'hello this is a full sentence';
+      const wordCount = splitWords(text, true).length;
+      expect(wordCount).toBe(6);
+    });
+    it('should handle punctuation correctly', () => {
+      const text = 'hello, world!';
+      const wordCount = splitWords(text, true).length;
+      expect(wordCount).toBe(2);
+    });
+    it('should handle multiple spaces between words', () => {
+      const text = 'hello  world';
+      const wordCount = splitWords(text, true).length;
+      expect(wordCount).toBe(2);
+    });
+    it('should count whitespace-only string as 0 words', () => {
+      const text = '   ';
+      const wordCount = splitWords(text, true).length;
+      expect(wordCount).toBe(0);
+    });
+    it('should handle leading and trailing whitespace', () => {
+      const text = '  hello world  ';
+      const wordCount = splitWords(text, true).length;
+      expect(wordCount).toBe(2);
+    });
+  });
+  describe('Integration: Full Interruption Check Logic', () => {
+    it('should block interruption for empty transcript with threshold 2', () => {
+      const text = '';
+      const minInterruptionWords = 2;
+      const normalizedText = text ?? '';
+      const wordCount = splitWords(normalizedText, true).length;
+      const shouldBlock = wordCount < minInterruptionWords;
+      expect(normalizedText).toBe('');
+      expect(wordCount).toBe(0);
+      expect(shouldBlock).toBe(true);
+    });
+    it('should block interruption for undefined transcript with threshold 2', () => {
+      const text: string | undefined = undefined;
+      const minInterruptionWords = 2;
+      const normalizedText = text ?? '';
+      const wordCount = splitWords(normalizedText, true).length;
+      const shouldBlock = wordCount < minInterruptionWords;
+      expect(normalizedText).toBe('');
+      expect(wordCount).toBe(0);
+      expect(shouldBlock).toBe(true);
+    });
+    it('should block interruption for single word with threshold 2', () => {
+      const text = 'hello';
+      const minInterruptionWords = 2;
+      const normalizedText = text ?? '';
+      const wordCount = splitWords(normalizedText, true).length;
+      const shouldBlock = wordCount < minInterruptionWords;
+      expect(normalizedText).toBe('hello');
+      expect(wordCount).toBe(1);
+      expect(shouldBlock).toBe(true);
+    });
+    it('should allow interruption when word count exactly meets threshold', () => {
+      const text = 'hello world';
+      const minInterruptionWords = 2;
+      const normalizedText = text ?? '';
+      const wordCount = splitWords(normalizedText, true).length;
+      const shouldBlock = wordCount < minInterruptionWords;
+      expect(normalizedText).toBe('hello world');
+      expect(wordCount).toBe(2);
+      expect(shouldBlock).toBe(false);
+    });
+    it('should allow interruption when word count exceeds threshold', () => {
+      const text = 'hello this is a full sentence';
+      const minInterruptionWords = 2;
+      const normalizedText = text ?? '';
+      const wordCount = splitWords(normalizedText, true).length;
+      const shouldBlock = wordCount < minInterruptionWords;
+      expect(normalizedText).toBe('hello this is a full sentence');
+      expect(wordCount).toBe(6);
+      expect(shouldBlock).toBe(false);
+    });
+    it('should apply consistent word counting logic in both methods', () => {
+      const transcripts = ['', 'hello', 'hello world', 'this is a longer sentence'];
+      const threshold = 2;
+      transcripts.forEach((transcript) => {
+        const text1 = transcript;
+        const normalizedText1 = text1 ?? '';
+        const wordCount1 = splitWords(normalizedText1, true).length;
+        const shouldBlock1 = wordCount1 < threshold;
+        const wordCount2 = splitWords(transcript, true).length;
+        const shouldBlock2 = wordCount2 < threshold;
+        expect(wordCount1).toBe(wordCount2);
+        expect(shouldBlock1).toBe(shouldBlock2);
+      });
+    });
+  });
+});

package/src/voice/room_io/room_io.ts CHANGED Viewed

@@ -369,6 +369,10 @@ export class RoomIO {
     return this.transcriptionSynchronizer.textOutput;
   }
+  get isParticipantAvailable(): boolean {
+    return this.participantAvailableFuture.done;
+  }
   /** Switch to a different participant */
   setParticipant(participantIdentity: string | null) {
     this.logger.debug({ participantIdentity }, 'setting participant');