npm - @livekit/agents - Versions diffs - 1.0.16 → 1.0.18 - Mend

@livekit/agents 1.0.16 → 1.0.18

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Files changed (99)

package/dist/inference/llm.cjs +35 -13
package/dist/inference/llm.cjs.map +1 -1
package/dist/inference/llm.d.cts +10 -5
package/dist/inference/llm.d.ts +10 -5
package/dist/inference/llm.d.ts.map +1 -1
package/dist/inference/llm.js +35 -13
package/dist/inference/llm.js.map +1 -1
package/dist/llm/chat_context.d.cts +1 -1
package/dist/llm/chat_context.d.ts +1 -1
package/dist/llm/llm.cjs.map +1 -1
package/dist/llm/llm.d.cts +1 -1
package/dist/llm/llm.d.ts +1 -1
package/dist/llm/llm.d.ts.map +1 -1
package/dist/llm/llm.js.map +1 -1
package/dist/llm/provider_format/google.cjs.map +1 -1
package/dist/llm/provider_format/google.d.cts +1 -1
package/dist/llm/provider_format/google.d.ts +1 -1
package/dist/llm/provider_format/google.d.ts.map +1 -1
package/dist/llm/provider_format/google.js.map +1 -1
package/dist/llm/provider_format/index.d.cts +1 -1
package/dist/llm/provider_format/index.d.ts +1 -1
package/dist/llm/provider_format/index.d.ts.map +1 -1
package/dist/llm/realtime.cjs.map +1 -1
package/dist/llm/realtime.d.cts +4 -0
package/dist/llm/realtime.d.ts +4 -0
package/dist/llm/realtime.d.ts.map +1 -1
package/dist/llm/realtime.js.map +1 -1
package/dist/llm/utils.cjs +2 -2
package/dist/llm/utils.cjs.map +1 -1
package/dist/llm/utils.d.cts +1 -1
package/dist/llm/utils.d.ts +1 -1
package/dist/llm/utils.d.ts.map +1 -1
package/dist/llm/utils.js +2 -2
package/dist/llm/utils.js.map +1 -1
package/dist/llm/zod-utils.cjs +6 -3
package/dist/llm/zod-utils.cjs.map +1 -1
package/dist/llm/zod-utils.d.cts +1 -1
package/dist/llm/zod-utils.d.ts +1 -1
package/dist/llm/zod-utils.d.ts.map +1 -1
package/dist/llm/zod-utils.js +6 -3
package/dist/llm/zod-utils.js.map +1 -1
package/dist/llm/zod-utils.test.cjs +83 -0
package/dist/llm/zod-utils.test.cjs.map +1 -1
package/dist/llm/zod-utils.test.js +83 -0
package/dist/llm/zod-utils.test.js.map +1 -1
package/dist/stt/stt.cjs +0 -1
package/dist/stt/stt.cjs.map +1 -1
package/dist/stt/stt.d.ts.map +1 -1
package/dist/stt/stt.js +0 -1
package/dist/stt/stt.js.map +1 -1
package/dist/tts/tts.cjs +2 -4
package/dist/tts/tts.cjs.map +1 -1
package/dist/tts/tts.d.ts.map +1 -1
package/dist/tts/tts.js +3 -5
package/dist/tts/tts.js.map +1 -1
package/dist/utils.cjs.map +1 -1
package/dist/utils.d.cts +7 -0
package/dist/utils.d.ts +7 -0
package/dist/utils.d.ts.map +1 -1
package/dist/utils.js.map +1 -1
package/dist/voice/agent_activity.cjs +69 -20
package/dist/voice/agent_activity.cjs.map +1 -1
package/dist/voice/agent_activity.d.ts.map +1 -1
package/dist/voice/agent_activity.js +69 -20
package/dist/voice/agent_activity.js.map +1 -1
package/dist/voice/agent_session.cjs +40 -1
package/dist/voice/agent_session.cjs.map +1 -1
package/dist/voice/agent_session.d.cts +5 -0
package/dist/voice/agent_session.d.ts +5 -0
package/dist/voice/agent_session.d.ts.map +1 -1
package/dist/voice/agent_session.js +40 -1
package/dist/voice/agent_session.js.map +1 -1
package/dist/voice/interruption_detection.test.cjs +114 -0
package/dist/voice/interruption_detection.test.cjs.map +1 -0
package/dist/voice/interruption_detection.test.js +113 -0
package/dist/voice/interruption_detection.test.js.map +1 -0
package/dist/voice/room_io/room_io.cjs +3 -0
package/dist/voice/room_io/room_io.cjs.map +1 -1
package/dist/voice/room_io/room_io.d.cts +1 -0
package/dist/voice/room_io/room_io.d.ts +1 -0
package/dist/voice/room_io/room_io.d.ts.map +1 -1
package/dist/voice/room_io/room_io.js +3 -0
package/dist/voice/room_io/room_io.js.map +1 -1
package/package.json +3 -3
package/src/inference/llm.ts +53 -21
package/src/llm/__snapshots__/zod-utils.test.ts.snap +218 -0
package/src/llm/llm.ts +1 -1
package/src/llm/provider_format/google.ts +4 -4
package/src/llm/realtime.ts +8 -1
package/src/llm/utils.ts +7 -2
package/src/llm/zod-utils.test.ts +101 -0
package/src/llm/zod-utils.ts +12 -3
package/src/stt/stt.ts +2 -1
package/src/tts/tts.ts +7 -5
package/src/utils.ts +17 -0
package/src/voice/agent_activity.ts +96 -24
package/src/voice/agent_session.ts +54 -0
package/src/voice/interruption_detection.test.ts +151 -0
package/src/voice/room_io/room_io.ts +4 -0

package/dist/voice/agent_activity.js CHANGED

@@ -152,6 +152,11 @@ class AgentActivity {
         } catch (error) {
           this.logger.error(error, "failed to update the tools");
         }
+        if (!this.llm.capabilities.audioOutput && !this.tts && this.agentSession.output.audio) {
+          this.logger.error(
+            "audio output is enabled but RealtimeModel has no audio modality and no TTS is set. Either enable audio modality in the RealtimeModel or set a TTS model."
+          );
+        }
       } else if (this.llm instanceof LLM) {
         try {
           updateInstructions({
@@ -449,7 +454,9 @@ class AgentActivity {
     }
     if (this.stt && this.agentSession.options.minInterruptionWords > 0 && this.audioRecognition) {
       const text = this.audioRecognition.currentTranscript;
-      if (text && splitWords(text, true).length < this.agentSession.options.minInterruptionWords) {
+      const normalizedText = text ?? "";
+      const wordCount = splitWords(normalizedText, true).length;
+      if (wordCount < this.agentSession.options.minInterruptionWords) {
         return;
       }
     }
@@ -551,10 +558,19 @@ class AgentActivity {
       this.logger.warn({ user_input: info.newTranscript }, "skipping user input, task is draining");
       return true;
     }
-    if (this.stt && this.turnDetection !== "manual" && this._currentSpeech && this._currentSpeech.allowInterruptions && !this._currentSpeech.interrupted && this.agentSession.options.minInterruptionWords > 0 && info.newTranscript.split(" ").length < this.agentSession.options.minInterruptionWords) {
-      this.cancelPreemptiveGeneration();
-      this.logger.info("skipping user input, new_transcript is too short");
-      return false;
+    if (this.stt && this.turnDetection !== "manual" && this._currentSpeech && this._currentSpeech.allowInterruptions && !this._currentSpeech.interrupted && this.agentSession.options.minInterruptionWords > 0) {
+      const wordCount = splitWords(info.newTranscript, true).length;
+      if (wordCount < this.agentSession.options.minInterruptionWords) {
+        this.cancelPreemptiveGeneration();
+        this.logger.info(
+          {
+            wordCount,
+            minInterruptionWords: this.agentSession.options.minInterruptionWords
+          },
+          "skipping user input, word count below minimum interruption threshold"
+        );
+        return false;
+      }
     }
     const oldTask = this._userTurnCompletedTask;
     this._userTurnCompletedTask = this.createSpeechTask({
@@ -1197,7 +1213,22 @@ ${instructions}` : instructions,
             );
             break;
           }
-          const trNodeResult = await this.agent.transcriptionNode(msg.textStream, modelSettings);
+          const msgModalities = msg.modalities ? await msg.modalities : void 0;
+          let ttsTextInput = null;
+          let trTextInput;
+          if (msgModalities && !msgModalities.includes("audio") && this.tts) {
+            if (this.llm instanceof RealtimeModel && this.llm.capabilities.audioOutput) {
+              this.logger.warn(
+                "text response received from realtime API, falling back to use a TTS model."
+              );
+            }
+            const [_ttsTextInput, _trTextInput] = msg.textStream.tee();
+            ttsTextInput = _ttsTextInput;
+            trTextInput = _trTextInput;
+          } else {
+            trTextInput = msg.textStream;
+          }
+          const trNodeResult = await this.agent.transcriptionNode(trTextInput, modelSettings);
           let textOut = null;
           if (trNodeResult) {
             const [textForwardTask, _textOut] = performTextForwarding(
@@ -1210,28 +1241,44 @@ ${instructions}` : instructions,
           }
           let audioOut = null;
           if (audioOutput) {
-            const realtimeAudio = await this.agent.realtimeAudioOutputNode(
-              msg.audioStream,
-              modelSettings
-            );
-            if (realtimeAudio) {
+            let realtimeAudioResult = null;
+            if (ttsTextInput) {
+              const [ttsTask, ttsStream] = performTTSInference(
+                (...args) => this.agent.ttsNode(...args),
+                ttsTextInput,
+                modelSettings,
+                abortController
+              );
+              tasks.push(ttsTask);
+              realtimeAudioResult = ttsStream;
+            } else if (msgModalities && msgModalities.includes("audio")) {
+              realtimeAudioResult = await this.agent.realtimeAudioOutputNode(
+                msg.audioStream,
+                modelSettings
+              );
+            } else if (this.llm instanceof RealtimeModel && this.llm.capabilities.audioOutput) {
+              this.logger.error(
+                "Text message received from Realtime API with audio modality. This usually happens when text chat context is synced to the API. Try to add a TTS model as fallback or use text modality with TTS instead."
+              );
+            } else {
+              this.logger.warn(
+                "audio output is enabled but neither tts nor realtime audio is available"
+              );
+            }
+            if (realtimeAudioResult) {
               const [forwardTask, _audioOut] = performAudioForwarding(
-                realtimeAudio,
+                realtimeAudioResult,
                 audioOutput,
                 abortController
               );
               forwardTasks.push(forwardTask);
               audioOut = _audioOut;
               audioOut.firstFrameFut.await.finally(onFirstFrame);
-            } else {
-              this.logger.warn(
-                "audio output is enabled but neither tts nor realtime audio is available"
-              );
             }
           } else if (textOut) {
             textOut.firstTextFut.await.finally(onFirstFrame);
           }
-          outputs.push([msg.messageId, textOut, audioOut]);
+          outputs.push([msg.messageId, textOut, audioOut, msgModalities]);
         }
         await waitFor(forwardTasks);
       } catch (error) {
@@ -1301,7 +1348,7 @@ ${instructions}` : instructions,
       replyAbortController.abort();
       await cancelAndWait(tasks, AgentActivity.REPLY_TASK_CANCEL_TIMEOUT);
       if (messageOutputs.length > 0) {
-        const [msgId, textOut, audioOut] = messageOutputs[0];
+        const [msgId, textOut, audioOut, msgModalities] = messageOutputs[0];
         let forwardedText = (textOut == null ? void 0 : textOut.text) || "";
         if (audioOutput) {
           audioOutput.clearBuffer();
@@ -1321,7 +1368,9 @@ ${instructions}` : instructions,
           }
           this.realtimeSession.truncate({
             messageId: msgId,
-            audioEndMs: Math.floor(playbackPosition)
+            audioEndMs: Math.floor(playbackPosition),
+            modalities: msgModalities,
+            audioTranscript: forwardedText
           });
         }
         if (forwardedText) {
@@ -1345,7 +1394,7 @@ ${instructions}` : instructions,
       return;
     }
     if (messageOutputs.length > 0) {
-      const [msgId, textOut, _] = messageOutputs[0];
+      const [msgId, textOut, _, __] = messageOutputs[0];
       const message = ChatMessage.create({
         role: "assistant",
         content: (textOut == null ? void 0 : textOut.text) || "",