npm - @livekit/agents - Versions diffs - 1.0.37 → 1.0.39 - Mend

@livekit/agents 1.0.37 → 1.0.39

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Files changed (155)

package/dist/cli.cjs.map +1 -1
package/dist/inference/api_protos.cjs +68 -0
package/dist/inference/api_protos.cjs.map +1 -1
package/dist/inference/api_protos.d.cts +345 -4
package/dist/inference/api_protos.d.ts +345 -4
package/dist/inference/api_protos.d.ts.map +1 -1
package/dist/inference/api_protos.js +60 -0
package/dist/inference/api_protos.js.map +1 -1
package/dist/inference/llm.cjs +7 -3
package/dist/inference/llm.cjs.map +1 -1
package/dist/inference/llm.d.cts +5 -6
package/dist/inference/llm.d.ts +5 -6
package/dist/inference/llm.d.ts.map +1 -1
package/dist/inference/llm.js +7 -3
package/dist/inference/llm.js.map +1 -1
package/dist/inference/stt.cjs +32 -21
package/dist/inference/stt.cjs.map +1 -1
package/dist/inference/stt.d.cts +5 -4
package/dist/inference/stt.d.ts +5 -4
package/dist/inference/stt.d.ts.map +1 -1
package/dist/inference/stt.js +34 -21
package/dist/inference/stt.js.map +1 -1
package/dist/inference/tts.cjs.map +1 -1
package/dist/inference/tts.d.cts +10 -7
package/dist/inference/tts.d.ts +10 -7
package/dist/inference/tts.d.ts.map +1 -1
package/dist/inference/tts.js.map +1 -1
package/dist/ipc/inference_proc_executor.cjs.map +1 -1
package/dist/ipc/job_proc_executor.cjs.map +1 -1
package/dist/stt/stream_adapter.cjs +9 -1
package/dist/stt/stream_adapter.cjs.map +1 -1
package/dist/stt/stream_adapter.d.ts.map +1 -1
package/dist/stt/stream_adapter.js +9 -1
package/dist/stt/stream_adapter.js.map +1 -1
package/dist/stt/stt.cjs +10 -0
package/dist/stt/stt.cjs.map +1 -1
package/dist/stt/stt.d.cts +12 -0
package/dist/stt/stt.d.ts +12 -0
package/dist/stt/stt.d.ts.map +1 -1
package/dist/stt/stt.js +10 -0
package/dist/stt/stt.js.map +1 -1
package/dist/telemetry/traces.cjs +4 -3
package/dist/telemetry/traces.cjs.map +1 -1
package/dist/telemetry/traces.d.cts +2 -0
package/dist/telemetry/traces.d.ts +2 -0
package/dist/telemetry/traces.d.ts.map +1 -1
package/dist/telemetry/traces.js +4 -3
package/dist/telemetry/traces.js.map +1 -1
package/dist/utils.cjs +11 -0
package/dist/utils.cjs.map +1 -1
package/dist/utils.d.cts +10 -0
package/dist/utils.d.ts +10 -0
package/dist/utils.d.ts.map +1 -1
package/dist/utils.js +10 -0
package/dist/utils.js.map +1 -1
package/dist/voice/agent.cjs +6 -2
package/dist/voice/agent.cjs.map +1 -1
package/dist/voice/agent.d.ts.map +1 -1
package/dist/voice/agent.js +6 -2
package/dist/voice/agent.js.map +1 -1
package/dist/voice/agent_activity.cjs +72 -37
package/dist/voice/agent_activity.cjs.map +1 -1
package/dist/voice/agent_activity.d.cts +2 -1
package/dist/voice/agent_activity.d.ts +2 -1
package/dist/voice/agent_activity.d.ts.map +1 -1
package/dist/voice/agent_activity.js +73 -38
package/dist/voice/agent_activity.js.map +1 -1
package/dist/voice/agent_session.cjs +7 -5
package/dist/voice/agent_session.cjs.map +1 -1
package/dist/voice/agent_session.d.cts +5 -2
package/dist/voice/agent_session.d.ts +5 -2
package/dist/voice/agent_session.d.ts.map +1 -1
package/dist/voice/agent_session.js +7 -5
package/dist/voice/agent_session.js.map +1 -1
package/dist/voice/audio_recognition.cjs +3 -1
package/dist/voice/audio_recognition.cjs.map +1 -1
package/dist/voice/audio_recognition.d.ts.map +1 -1
package/dist/voice/audio_recognition.js +3 -1
package/dist/voice/audio_recognition.js.map +1 -1
package/dist/voice/avatar/datastream_io.cjs +6 -0
package/dist/voice/avatar/datastream_io.cjs.map +1 -1
package/dist/voice/avatar/datastream_io.d.cts +1 -0
package/dist/voice/avatar/datastream_io.d.ts +1 -0
package/dist/voice/avatar/datastream_io.d.ts.map +1 -1
package/dist/voice/avatar/datastream_io.js +6 -0
package/dist/voice/avatar/datastream_io.js.map +1 -1
package/dist/voice/background_audio.cjs.map +1 -1
package/dist/voice/generation.cjs +14 -5
package/dist/voice/generation.cjs.map +1 -1
package/dist/voice/generation.d.cts +3 -2
package/dist/voice/generation.d.ts +3 -2
package/dist/voice/generation.d.ts.map +1 -1
package/dist/voice/generation.js +14 -5
package/dist/voice/generation.js.map +1 -1
package/dist/voice/io.cjs +12 -0
package/dist/voice/io.cjs.map +1 -1
package/dist/voice/io.d.cts +19 -1
package/dist/voice/io.d.ts +19 -1
package/dist/voice/io.d.ts.map +1 -1
package/dist/voice/io.js +12 -0
package/dist/voice/io.js.map +1 -1
package/dist/voice/recorder_io/recorder_io.cjs +91 -28
package/dist/voice/recorder_io/recorder_io.cjs.map +1 -1
package/dist/voice/recorder_io/recorder_io.d.cts +7 -1
package/dist/voice/recorder_io/recorder_io.d.ts +7 -1
package/dist/voice/recorder_io/recorder_io.d.ts.map +1 -1
package/dist/voice/recorder_io/recorder_io.js +91 -28
package/dist/voice/recorder_io/recorder_io.js.map +1 -1
package/dist/voice/room_io/_input.cjs +40 -11
package/dist/voice/room_io/_input.cjs.map +1 -1
package/dist/voice/room_io/_input.d.cts +4 -1
package/dist/voice/room_io/_input.d.ts +4 -1
package/dist/voice/room_io/_input.d.ts.map +1 -1
package/dist/voice/room_io/_input.js +31 -2
package/dist/voice/room_io/_input.js.map +1 -1
package/dist/voice/room_io/_output.cjs +6 -0
package/dist/voice/room_io/_output.cjs.map +1 -1
package/dist/voice/room_io/_output.d.cts +1 -0
package/dist/voice/room_io/_output.d.ts +1 -0
package/dist/voice/room_io/_output.d.ts.map +1 -1
package/dist/voice/room_io/_output.js +6 -0
package/dist/voice/room_io/_output.js.map +1 -1
package/dist/voice/room_io/room_io.cjs.map +1 -1
package/dist/voice/room_io/room_io.d.cts +2 -2
package/dist/voice/room_io/room_io.d.ts +2 -2
package/dist/voice/room_io/room_io.d.ts.map +1 -1
package/dist/voice/room_io/room_io.js.map +1 -1
package/dist/voice/speech_handle.cjs +2 -0
package/dist/voice/speech_handle.cjs.map +1 -1
package/dist/voice/speech_handle.d.cts +3 -0
package/dist/voice/speech_handle.d.ts +3 -0
package/dist/voice/speech_handle.d.ts.map +1 -1
package/dist/voice/speech_handle.js +2 -0
package/dist/voice/speech_handle.js.map +1 -1
package/package.json +2 -2
package/src/inference/api_protos.ts +83 -0
package/src/inference/llm.ts +20 -15
package/src/inference/stt.ts +48 -29
package/src/inference/tts.ts +36 -16
package/src/stt/stream_adapter.ts +12 -1
package/src/stt/stt.ts +21 -0
package/src/telemetry/traces.ts +6 -2
package/src/utils.ts +21 -0
package/src/voice/agent.ts +11 -2
package/src/voice/agent_activity.ts +108 -41
package/src/voice/agent_session.ts +6 -5
package/src/voice/audio_recognition.ts +2 -0
package/src/voice/avatar/datastream_io.ts +8 -0
package/src/voice/generation.ts +24 -12
package/src/voice/io.ts +27 -5
package/src/voice/recorder_io/recorder_io.ts +123 -31
package/src/voice/room_io/_input.ts +32 -4
package/src/voice/room_io/_output.ts +8 -0
package/src/voice/room_io/room_io.ts +3 -1
package/src/voice/speech_handle.ts +4 -0

package/dist/voice/agent_activity.cjs (changed)

@@ -122,9 +122,9 @@ class AgentActivity {
       );
       this.turnDetectionMode = void 0;
     }
-    if (!this.vad && this.stt && this.llm instanceof import_llm.LLM && this.allowInterruptions && this.turnDetectionMode === void 0) {
+    if (!this.vad && this.stt && !this.stt.capabilities.streaming && this.llm instanceof import_llm.LLM && this.allowInterruptions && this.turnDetectionMode === void 0) {
       this.logger.warn(
-        "VAD is not set. Enabling VAD is recommended when using LLM and STT for more responsive interruption handling."
+        "VAD is not set. Enabling VAD is recommended when using LLM and non-streaming STT for more responsive interruption handling."
       );
     }
   }
@@ -458,8 +458,12 @@ class AgentActivity {
     this.scheduleSpeech(handle, import_speech_handle.SpeechHandle.SPEECH_PRIORITY_NORMAL);
   }
   // recognition hooks
-  onStartOfSpeech(_ev) {
-    this.agentSession._updateUserState("speaking");
+  onStartOfSpeech(ev) {
+    let speechStartTime = Date.now();
+    if (ev) {
+      speechStartTime = speechStartTime - ev.speechDuration;
+    }
+    this.agentSession._updateUserState("speaking", speechStartTime);
   }
   onEndOfSpeech(ev) {
     let speechEndTime = Date.now();
@@ -469,14 +473,16 @@ class AgentActivity {
     this.agentSession._updateUserState("listening", speechEndTime);
   }
   onVADInferenceDone(ev) {
-    var _a, _b;
     if (this.turnDetection === "manual" || this.turnDetection === "realtime_llm") {
       return;
     }
-    if (this.llm instanceof import_llm.RealtimeModel && this.llm.capabilities.turnDetection) {
-      return;
+    if (ev.speechDuration >= this.agentSession.options.minInterruptionDuration) {
+      this.interruptByAudioActivity();
     }
-    if (ev.speechDuration < this.agentSession.options.minInterruptionDuration) {
+  }
+  interruptByAudioActivity() {
+    var _a, _b;
+    if (this.llm instanceof import_llm.RealtimeModel && this.llm.capabilities.turnDetection) {
       return;
     }
     if (this.stt && this.agentSession.options.minInterruptionWords > 0 && this.audioRecognition) {
@@ -489,7 +495,10 @@ class AgentActivity {
     }
     (_a = this.realtimeSession) == null ? void 0 : _a.startUserActivity();
     if (this._currentSpeech && !this._currentSpeech.interrupted && this._currentSpeech.allowInterruptions) {
-      this.logger.info({ "speech id": this._currentSpeech.id }, "speech interrupted by VAD");
+      this.logger.info(
+        { "speech id": this._currentSpeech.id },
+        "speech interrupted by audio activity"
+      );
       (_b = this.realtimeSession) == null ? void 0 : _b.interrupt();
       this._currentSpeech.interrupt();
     }
@@ -507,6 +516,9 @@ class AgentActivity {
         // TODO(AJS-106): add multi participant support
       })
     );
+    if (ev.alternatives[0].text) {
+      this.interruptByAudioActivity();
+    }
   }
   onFinalTranscript(ev) {
     if (this.llm instanceof import_llm.RealtimeModel && this.llm.capabilities.userTranscription) {
@@ -521,6 +533,9 @@ class AgentActivity {
         // TODO(AJS-106): add multi participant support
       })
     );
+    if (this.audioRecognition && this.turnDetection !== "manual" && this.turnDetection !== "realtime_llm") {
+      this.interruptByAudioActivity();
+    }
   }
   onPreemptiveGeneration(info) {
     if (!this.agentSession.options.preemptiveGeneration || this.draining || this._currentSpeech !== void 0 && !this._currentSpeech.interrupted || !(this.llm instanceof import_llm.LLM)) {
@@ -836,6 +851,7 @@ ${instructions}` : instructions,
     );
   }
   async ttsTask(speechHandle, text, addToChatCtx, modelSettings, replyAbortController, audio) {
+    speechHandle._agentTurnContext = import_api.context.active();
     speechHandleStorage.enterWith(speechHandle);
     const transcriptionOutput = this.agentSession.output.transcriptionEnabled ? this.agentSession.output.transcription : null;
     const audioOutput = this.agentSession.output.audioEnabled ? this.agentSession.output.audio : null;
@@ -867,12 +883,15 @@ ${instructions}` : instructions,
       textOut = _textOut;
       tasks.push(textForwardTask);
     }
-    const onFirstFrame = () => {
-      this.agentSession._updateAgentState("speaking");
+    const onFirstFrame = (startedSpeakingAt) => {
+      this.agentSession._updateAgentState("speaking", {
+        startTime: startedSpeakingAt,
+        otelContext: speechHandle._agentTurnContext
+      });
     };
     if (!audioOutput) {
       if (textOut) {
-        textOut.firstTextFut.await.finally(onFirstFrame);
+        textOut.firstTextFut.await.then(() => onFirstFrame()).catch(() => this.logger.debug("firstTextFut cancelled before first frame"));
       }
     } else {
       let audioOut = null;
@@ -900,7 +919,7 @@ ${instructions}` : instructions,
         tasks.push(forwardTask);
         audioOut = _audioOut;
       }
-      audioOut.firstFrameFut.await.finally(onFirstFrame);
+      audioOut.firstFrameFut.await.then((ts) => onFirstFrame(ts)).catch(() => this.logger.debug("firstFrameFut cancelled before first frame"));
     }
     await speechHandle.waitIfNotInterrupted(tasks.map((task) => task.result));
     if (audioOutput) {
@@ -939,6 +958,7 @@ ${instructions}` : instructions,
     span
   }) => {
     var _a, _b, _c;
+    speechHandle._agentTurnContext = import_api.context.active();
     span.setAttribute(import_telemetry.traceTypes.ATTR_SPEECH_ID, speechHandle.id);
     if (instructions) {
       span.setAttribute(import_telemetry.traceTypes.ATTR_INSTRUCTIONS, instructions);
@@ -1015,8 +1035,11 @@ ${instructions}` : instructions,
       tasks.push(textForwardTask);
       textOut = _textOut;
     }
-    const onFirstFrame = () => {
-      this.agentSession._updateAgentState("speaking");
+    const onFirstFrame = (startedSpeakingAt) => {
+      this.agentSession._updateAgentState("speaking", {
+        startTime: startedSpeakingAt,
+        otelContext: speechHandle._agentTurnContext
+      });
     };
     let audioOut = null;
     if (audioOutput) {
@@ -1028,12 +1051,12 @@ ${instructions}` : instructions,
         );
         audioOut = _audioOut;
         tasks.push(forwardTask);
-        audioOut.firstFrameFut.await.finally(onFirstFrame);
+        audioOut.firstFrameFut.await.then((ts) => onFirstFrame(ts)).catch(() => this.logger.debug("firstFrameFut cancelled before first frame"));
       } else {
         throw Error("ttsStream is null when audioOutput is enabled");
       }
     } else {
-      textOut == null ? void 0 : textOut.firstTextFut.await.finally(onFirstFrame);
+      textOut == null ? void 0 : textOut.firstTextFut.await.then(() => onFirstFrame()).catch(() => this.logger.debug("firstTextFut cancelled before first frame"));
     }
     const onToolExecutionStarted = (f) => {
       speechHandle._itemAdded([f]);
@@ -1064,7 +1087,12 @@ ${instructions}` : instructions,
         msg.createdAt = replyStartedAt;
       }
       this.agent._chatCtx.insert(toolsMessages);
-      this.agentSession._toolItemsAdded(toolsMessages);
+      const toolCallOutputs = toolsMessages.filter(
+        (m) => m.type === "function_call_output"
+      );
+      if (toolCallOutputs.length > 0) {
+        this.agentSession._toolItemsAdded(toolCallOutputs);
+      }
     }
     if (speechHandle.interrupted) {
       this.logger.debug(
@@ -1081,9 +1109,9 @@ ${instructions}` : instructions,
       let forwardedText = (textOut == null ? void 0 : textOut.text) || "";
       if (audioOutput) {
         const playbackEv = await audioOutput.waitForPlayout();
-        if (audioOut == null ? void 0 : audioOut.firstFrameFut.done) {
+        if ((audioOut == null ? void 0 : audioOut.firstFrameFut.done) && !audioOut.firstFrameFut.rejected) {
           this.logger.info(
-            { speech_id: speechHandle.id, playbackPosition: playbackEv.playbackPosition },
+            { speech_id: speechHandle.id, playbackPositionInS: playbackEv.playbackPosition },
             "playout interrupted"
           );
           if (playbackEv.synchronizedTranscript) {
@@ -1221,7 +1249,12 @@ ${instructions}` : instructions,
         msg.createdAt = replyStartedAt;
       }
       this.agent._chatCtx.insert(toolMessages);
-      this.agentSession._toolItemsAdded(toolMessages);
+      const toolCallOutputs = toolMessages.filter(
+        (m) => m.type === "function_call_output"
+      );
+      if (toolCallOutputs.length > 0) {
+        this.agentSession._toolItemsAdded(toolCallOutputs);
+      }
     }
   };
   pipelineReplyTask = async (speechHandle, chatCtx, toolCtx, modelSettings, replyAbortController, instructions, newMessage, toolsMessages) => import_telemetry.tracer.startActiveSpan(
@@ -1264,6 +1297,7 @@ ${instructions}` : instructions,
     span
   }) {
     var _a, _b, _c;
+    speechHandle._agentTurnContext = import_api.context.active();
     span.setAttribute(import_telemetry.traceTypes.ATTR_SPEECH_ID, speechHandle.id);
     speechHandleStorage.enterWith(speechHandle);
     if (!this.realtimeSession) {
@@ -1288,8 +1322,11 @@ ${instructions}` : instructions,
     if (speechHandle.interrupted) {
       return;
     }
-    const onFirstFrame = () => {
-      this.agentSession._updateAgentState("speaking");
+    const onFirstFrame = (startedSpeakingAt) => {
+      this.agentSession._updateAgentState("speaking", {
+        startTime: startedSpeakingAt,
+        otelContext: speechHandle._agentTurnContext
+      });
     };
     const readMessages = async (abortController, outputs) => {
       replyAbortController.signal.addEventListener("abort", () => abortController.abort(), {
@@ -1364,10 +1401,10 @@ ${instructions}` : instructions,
               );
               forwardTasks.push(forwardTask);
               audioOut = _audioOut;
-              audioOut.firstFrameFut.await.finally(onFirstFrame);
+              audioOut.firstFrameFut.await.then((ts) => onFirstFrame(ts)).catch(() => this.logger.debug("firstFrameFut cancelled before first frame"));
             }
           } else if (textOut) {
-            textOut.firstTextFut.await.finally(onFirstFrame);
+            textOut.firstTextFut.await.then(() => onFirstFrame()).catch(() => this.logger.debug("firstTextFut cancelled before first frame"));
           }
           outputs.push([msg.messageId, textOut, audioOut, msgModalities]);
         }
@@ -1431,7 +1468,6 @@ ${instructions}` : instructions,
     await speechHandle.waitIfNotInterrupted(tasks.map((task) => task.result));
     if (audioOutput) {
       await speechHandle.waitIfNotInterrupted([audioOutput.waitForPlayout()]);
-      this.agentSession._updateAgentState("listening");
     }
     if (speechHandle.interrupted) {
       this.logger.debug(
@@ -1446,10 +1482,10 @@ ${instructions}` : instructions,
         if (audioOutput) {
           audioOutput.clearBuffer();
           const playbackEv = await audioOutput.waitForPlayout();
-          let playbackPosition = playbackEv.playbackPosition;
-          if (audioOut == null ? void 0 : audioOut.firstFrameFut.done) {
+          let playbackPositionInS = playbackEv.playbackPosition;
+          if ((audioOut == null ? void 0 : audioOut.firstFrameFut.done) && !audioOut.firstFrameFut.rejected) {
             this.logger.info(
-              { speech_id: speechHandle.id, playbackPosition: playbackEv.playbackPosition },
+              { speech_id: speechHandle.id, playbackPositionInS },
               "playout interrupted"
             );
             if (playbackEv.synchronizedTranscript) {
@@ -1457,11 +1493,11 @@ ${instructions}` : instructions,
             }
           } else {
             forwardedText = "";
-            playbackPosition = 0;
+            playbackPositionInS = 0;
           }
           this.realtimeSession.truncate({
             messageId: msgId,
-            audioEndMs: Math.floor(playbackPosition),
+            audioEndMs: Math.floor(playbackPositionInS * 1e3),
             modalities: msgModalities,
             audioTranscript: forwardedText
           });
@@ -1499,14 +1535,13 @@ ${instructions}` : instructions,
       this.agentSession._conversationItemAdded(message);
     }
     speechHandle._markGenerationDone();
-    toolOutput.firstToolStartedFuture.await.finally(() => {
-      this.agentSession._updateAgentState("thinking");
-    });
     await executeToolsTask.result;
+    if (toolOutput.output.length > 0) {
+      this.agentSession._updateAgentState("thinking");
+    } else if (this.agentSession.agentState === "speaking") {
+      this.agentSession._updateAgentState("listening");
+    }
     if (toolOutput.output.length === 0) {
-      if (!speechHandle.interrupted) {
-        this.agentSession._updateAgentState("listening");
-      }
       return;
     }
     const { maxToolSteps } = this.agentSession.options;