npm - @livekit/agents - Versions diffs - 1.0.3 → 1.0.4 - Mend

@livekit/agents 1.0.3 → 1.0.4

This diff compares the contents of publicly available package versions that have been released to one of the supported registries. It is provided for informational purposes only and reflects the changes between package versions as they appear in their respective public registries.

Files changed (99)

package/dist/index.cjs +2 -5
package/dist/index.cjs.map +1 -1
package/dist/index.d.cts +2 -3
package/dist/index.d.ts +2 -3
package/dist/index.d.ts.map +1 -1
package/dist/index.js +1 -3
package/dist/index.js.map +1 -1
package/dist/tokenize/basic/hyphenator.cjs.map +1 -1
package/dist/tokenize/basic/hyphenator.js.map +1 -1
package/dist/utils.cjs +77 -0
package/dist/utils.cjs.map +1 -1
package/dist/utils.d.cts +21 -0
package/dist/utils.d.ts +21 -0
package/dist/utils.d.ts.map +1 -1
package/dist/utils.js +76 -1
package/dist/utils.js.map +1 -1
package/dist/voice/agent_activity.cjs +112 -71
package/dist/voice/agent_activity.cjs.map +1 -1
package/dist/voice/agent_activity.d.ts.map +1 -1
package/dist/voice/agent_activity.js +112 -71
package/dist/voice/agent_activity.js.map +1 -1
package/dist/voice/avatar/datastream_io.cjs +204 -0
package/dist/voice/avatar/datastream_io.cjs.map +1 -0
package/dist/voice/avatar/datastream_io.d.cts +37 -0
package/dist/voice/avatar/datastream_io.d.ts +37 -0
package/dist/voice/avatar/datastream_io.d.ts.map +1 -0
package/dist/voice/avatar/datastream_io.js +188 -0
package/dist/voice/avatar/datastream_io.js.map +1 -0
package/dist/{multimodal → voice/avatar}/index.cjs +4 -4
package/dist/voice/avatar/index.cjs.map +1 -0
package/dist/voice/avatar/index.d.cts +2 -0
package/dist/voice/avatar/index.d.ts +2 -0
package/dist/voice/avatar/index.d.ts.map +1 -0
package/dist/voice/avatar/index.js +2 -0
package/dist/voice/avatar/index.js.map +1 -0
package/dist/voice/index.cjs +2 -0
package/dist/voice/index.cjs.map +1 -1
package/dist/voice/index.d.cts +1 -0
package/dist/voice/index.d.ts +1 -0
package/dist/voice/index.d.ts.map +1 -1
package/dist/voice/index.js +1 -0
package/dist/voice/index.js.map +1 -1
package/dist/voice/io.cjs.map +1 -1
package/dist/voice/io.d.cts +1 -1
package/dist/voice/io.d.ts +1 -1
package/dist/voice/io.d.ts.map +1 -1
package/dist/voice/io.js.map +1 -1
package/dist/voice/room_io/_input.cjs +2 -1
package/dist/voice/room_io/_input.cjs.map +1 -1
package/dist/voice/room_io/_input.d.ts.map +1 -1
package/dist/voice/room_io/_input.js +2 -1
package/dist/voice/room_io/_input.js.map +1 -1
package/dist/voice/run_context.cjs +13 -0
package/dist/voice/run_context.cjs.map +1 -1
package/dist/voice/run_context.d.cts +10 -0
package/dist/voice/run_context.d.ts +10 -0
package/dist/voice/run_context.d.ts.map +1 -1
package/dist/voice/run_context.js +13 -0
package/dist/voice/run_context.js.map +1 -1
package/dist/voice/speech_handle.cjs +152 -30
package/dist/voice/speech_handle.cjs.map +1 -1
package/dist/voice/speech_handle.d.cts +67 -16
package/dist/voice/speech_handle.d.ts +67 -16
package/dist/voice/speech_handle.d.ts.map +1 -1
package/dist/voice/speech_handle.js +153 -31
package/dist/voice/speech_handle.js.map +1 -1
package/dist/worker.cjs +4 -1
package/dist/worker.cjs.map +1 -1
package/dist/worker.d.ts.map +1 -1
package/dist/worker.js +4 -1
package/dist/worker.js.map +1 -1
package/package.json +2 -2
package/src/index.ts +2 -3
package/src/tokenize/basic/hyphenator.ts +1 -1
package/src/utils.ts +121 -1
package/src/voice/agent_activity.ts +128 -78
package/src/voice/avatar/datastream_io.ts +247 -0
package/src/voice/avatar/index.ts +4 -0
package/src/voice/index.ts +2 -0
package/src/voice/io.ts +1 -1
package/src/voice/room_io/_input.ts +8 -3
package/src/voice/run_context.ts +16 -2
package/src/voice/speech_handle.ts +183 -38
package/src/worker.ts +5 -1
package/dist/multimodal/agent_playout.cjs +0 -233
package/dist/multimodal/agent_playout.cjs.map +0 -1
package/dist/multimodal/agent_playout.d.cts +0 -34
package/dist/multimodal/agent_playout.d.ts +0 -34
package/dist/multimodal/agent_playout.d.ts.map +0 -1
package/dist/multimodal/agent_playout.js +0 -207
package/dist/multimodal/agent_playout.js.map +0 -1
package/dist/multimodal/index.cjs.map +0 -1
package/dist/multimodal/index.d.cts +0 -2
package/dist/multimodal/index.d.ts +0 -2
package/dist/multimodal/index.d.ts.map +0 -1
package/dist/multimodal/index.js +0 -2
package/dist/multimodal/index.js.map +0 -1
package/src/multimodal/agent_playout.ts +0 -266
package/src/multimodal/index.ts +0 -4

package/dist/voice/agent_activity.js CHANGED

@@ -190,7 +190,7 @@ class AgentActivity {
       this.started = true;
       this._mainTask = Task.from(({ signal }) => this.mainTask(signal));
       this.createSpeechTask({
-        promise: this.agent.onEnter(),
+        task: Task.from(() => this.agent.onEnter()),
         name: "AgentActivity_onEnter"
       });
     } finally {
@@ -309,7 +309,9 @@ class AgentActivity {
       })
     );
     const task = this.createSpeechTask({
-      promise: this.ttsTask(handle, text, addToChatCtx, {}, audio),
+      task: Task.from(
+        (abortController) => this.ttsTask(handle, text, addToChatCtx, {}, abortController, audio)
+      ),
       ownedSpeechHandle: handle,
       name: "AgentActivity.say_tts"
     });
@@ -413,7 +415,9 @@ class AgentActivity {
     );
     this.logger.info({ speech_id: handle.id }, "Creating speech handle");
     this.createSpeechTask({
-      promise: this.realtimeGenerationTask(handle, ev, {}),
+      task: Task.from(
+        (abortController) => this.realtimeGenerationTask(handle, ev, {}, abortController)
+      ),
       ownedSpeechHandle: handle,
       name: "AgentActivity.realtimeGeneration"
     });
@@ -477,16 +481,23 @@ class AgentActivity {
     );
   }
   createSpeechTask(options) {
-    const { promise, ownedSpeechHandle } = options;
-    this.speechTasks.add(promise);
-    promise.finally(() => {
-      this.speechTasks.delete(promise);
-      if (ownedSpeechHandle) {
-        ownedSpeechHandle._markPlayoutDone();
-      }
+    const { task, ownedSpeechHandle } = options;
+    this.speechTasks.add(task);
+    task.addDoneCallback(() => {
+      this.speechTasks.delete(task);
+    });
+    if (ownedSpeechHandle) {
+      ownedSpeechHandle._tasks.push(task);
+      task.addDoneCallback(() => {
+        if (ownedSpeechHandle._tasks.every((t) => t.done)) {
+          ownedSpeechHandle._markDone();
+        }
+      });
+    }
+    task.addDoneCallback(() => {
       this.wakeupMainTask();
     });
-    return promise;
+    return task.result;
   }
   async onEndOfTurn(info) {
     if (this.draining) {
@@ -499,7 +510,7 @@ class AgentActivity {
     }
     const oldTask = this._userTurnCompletedTask;
     this._userTurnCompletedTask = this.createSpeechTask({
-      promise: this.userTurnCompleted(info, oldTask),
+      task: Task.from(() => this.userTurnCompleted(info, oldTask)),
       name: "AgentActivity.userTurnCompleted"
     });
     return true;
@@ -525,8 +536,8 @@ class AgentActivity {
         }
         const speechHandle = heapItem[2];
         this._currentSpeech = speechHandle;
-        speechHandle._authorizePlayout();
-        await speechHandle.waitForPlayout();
+        speechHandle._authorizeGeneration();
+        await speechHandle._waitForGeneration();
         this._currentSpeech = void 0;
       }
       if (this.draining && this.speechTasks.size === 0) {
@@ -579,16 +590,19 @@ class AgentActivity {
     this.logger.info({ speech_id: handle.id }, "Creating speech handle");
     if (this.llm instanceof RealtimeModel) {
       this.createSpeechTask({
-        promise: this.realtimeReplyTask({
-          speechHandle: handle,
-          // TODO(brian): support llm.ChatMessage for the realtime model
-          userInput: userMessage == null ? void 0 : userMessage.textContent,
-          instructions,
-          modelSettings: {
-            // isGiven(toolChoice) = toolChoice !== undefined
-            toolChoice: toOaiToolChoice(toolChoice !== void 0 ? toolChoice : this.toolChoice)
-          }
-        }),
+        task: Task.from(
+          (abortController) => this.realtimeReplyTask({
+            speechHandle: handle,
+            // TODO(brian): support llm.ChatMessage for the realtime model
+            userInput: userMessage == null ? void 0 : userMessage.textContent,
+            instructions,
+            modelSettings: {
+              // isGiven(toolChoice) = toolChoice !== undefined
+              toolChoice: toOaiToolChoice(toolChoice !== void 0 ? toolChoice : this.toolChoice)
+            },
+            abortController
+          })
+        ),
         ownedSpeechHandle: handle,
         name: "AgentActivity.realtimeReply"
       });
@@ -598,14 +612,19 @@ class AgentActivity {
 ${instructions}`;
       }
       const task = this.createSpeechTask({
-        promise: this.pipelineReplyTask(
-          handle,
-          chatCtx ?? this.agent.chatCtx,
-          this.agent.toolCtx,
-          { toolChoice: toOaiToolChoice(toolChoice !== void 0 ? toolChoice : this.toolChoice) },
-          instructions ? `${this.agent.instructions}
+        task: Task.from(
+          (abortController) => this.pipelineReplyTask(
+            handle,
+            chatCtx ?? this.agent.chatCtx,
+            this.agent.toolCtx,
+            {
+              toolChoice: toOaiToolChoice(toolChoice !== void 0 ? toolChoice : this.toolChoice)
+            },
+            abortController,
+            instructions ? `${this.agent.instructions}
 ${instructions}` : instructions,
-          userMessage
+            userMessage
+          )
         ),
         ownedSpeechHandle: handle,
         name: "AgentActivity.pipelineReply"
@@ -627,7 +646,7 @@ ${instructions}` : instructions,
     if (currentSpeech === void 0) {
       future.resolve();
     } else {
-      currentSpeech.then(() => {
+      currentSpeech.addDoneCallback(() => {
         if (future.done) return;
         future.resolve();
       });
@@ -635,7 +654,7 @@ ${instructions}` : instructions,
     return future;
   }
   onPipelineReplyDone() {
-    if (!this.speechQueue.peek() && (!this._currentSpeech || this._currentSpeech.done)) {
+    if (!this.speechQueue.peek() && (!this._currentSpeech || this._currentSpeech.done())) {
       this.agentSession._updateAgentState("listening");
     }
   }
@@ -699,11 +718,10 @@ ${instructions}` : instructions,
       createMetricsCollectedEvent({ metrics: eouMetrics })
     );
   }
-  async ttsTask(speechHandle, text, addToChatCtx, modelSettings, audio) {
+  async ttsTask(speechHandle, text, addToChatCtx, modelSettings, replyAbortController, audio) {
     speechHandleStorage.enterWith(speechHandle);
     const transcriptionOutput = this.agentSession.output.transcriptionEnabled ? this.agentSession.output.transcription : null;
     const audioOutput = this.agentSession.output.audioEnabled ? this.agentSession.output.audio : null;
-    const replyAbortController = new AbortController();
     await speechHandle.waitIfNotInterrupted([speechHandle._waitForAuthorization()]);
     if (speechHandle.interrupted) {
       return;
@@ -792,10 +810,9 @@ ${instructions}` : instructions,
       this.agentSession._updateAgentState("listening");
     }
   }
-  async pipelineReplyTask(speechHandle, chatCtx, toolCtx, modelSettings, instructions, newMessage, toolsMessages) {
+  async pipelineReplyTask(speechHandle, chatCtx, toolCtx, modelSettings, replyAbortController, instructions, newMessage, toolsMessages) {
     var _a, _b, _c;
     speechHandleStorage.enterWith(speechHandle);
-    const replyAbortController = new AbortController();
     const audioOutput = this.agentSession.output.audioEnabled ? this.agentSession.output.audio : null;
     const transcriptionOutput = this.agentSession.output.transcriptionEnabled ? this.agentSession.output.transcription : null;
     chatCtx = chatCtx.copy();
@@ -838,12 +855,20 @@ ${instructions}` : instructions,
       );
       tasks.push(ttsTask);
     }
-    await speechHandle.waitIfNotInterrupted([speechHandle._waitForAuthorization()]);
+    await speechHandle.waitIfNotInterrupted([speechHandle._waitForScheduled()]);
+    if (newMessage && speechHandle.scheduled) {
+      chatCtx.insert(newMessage);
+      this.agent._chatCtx.insert(newMessage);
+      this.agentSession._conversationItemAdded(newMessage);
+    }
     if (speechHandle.interrupted) {
       replyAbortController.abort();
       await cancelAndWait(tasks, AgentActivity.REPLY_TASK_CANCEL_TIMEOUT);
       return;
     }
+    this.agentSession._updateAgentState("thinking");
+    await speechHandle.waitIfNotInterrupted([speechHandle._waitForAuthorization()]);
+    speechHandle._clearAuthorization();
     const replyStartedAt = Date.now();
     const trNodeResult = await this.agent.transcriptionNode(llmOutput, modelSettings);
     let textOut = null;
@@ -890,7 +915,6 @@ ${instructions}` : instructions,
       onToolExecutionStarted,
       onToolExecutionCompleted
     });
-    tasks.push(executeToolsTask);
     await speechHandle.waitIfNotInterrupted(tasks.map((task) => task.result));
     if (audioOutput) {
       await speechHandle.waitIfNotInterrupted([audioOutput.waitForPlayout()]);
@@ -945,7 +969,7 @@ ${instructions}` : instructions,
         { speech_id: speechHandle.id, message: forwardedText },
         "playout completed with interrupt"
       );
-      speechHandle._markPlayoutDone();
+      speechHandle._markGenerationDone();
       await executeToolsTask.cancelAndWait(AgentActivity.REPLY_TASK_CANCEL_TIMEOUT);
       return;
     }
@@ -970,11 +994,11 @@ ${instructions}` : instructions,
     } else if (this.agentSession.agentState === "speaking") {
       this.agentSession._updateAgentState("listening");
     }
-    speechHandle._markPlayoutDone();
+    speechHandle._markGenerationDone();
     await executeToolsTask.result;
     if (toolOutput.output.length === 0) return;
     const { maxToolSteps } = this.agentSession.options;
-    if (speechHandle.stepIndex >= maxToolSteps) {
+    if (speechHandle.numSteps >= maxToolSteps) {
       this.logger.warn(
         { speech_id: speechHandle.id, max_tool_steps: maxToolSteps },
         "maximum number of function calls steps reached"
@@ -1029,7 +1053,7 @@ ${instructions}` : instructions,
       chatCtx.insert(toolMessages);
       const handle = SpeechHandle.create({
         allowInterruptions: speechHandle.allowInterruptions,
-        stepIndex: speechHandle.stepIndex + 1,
+        stepIndex: speechHandle._stepIndex + 1,
         parent: speechHandle
       });
       this.agentSession.emit(
@@ -1042,14 +1066,17 @@ ${instructions}` : instructions,
       );
       const respondToolChoice = draining || modelSettings.toolChoice === "none" ? "none" : "auto";
       const toolResponseTask = this.createSpeechTask({
-        promise: this.pipelineReplyTask(
-          handle,
-          chatCtx,
-          toolCtx,
-          { toolChoice: respondToolChoice },
-          instructions,
-          void 0,
-          toolMessages
+        task: Task.from(
+          () => this.pipelineReplyTask(
+            handle,
+            chatCtx,
+            toolCtx,
+            { toolChoice: respondToolChoice },
+            replyAbortController,
+            instructions,
+            void 0,
+            toolMessages
+          )
         ),
         ownedSpeechHandle: handle,
         name: "AgentActivity.pipelineReply"
@@ -1063,7 +1090,7 @@ ${instructions}` : instructions,
       this.agent._chatCtx.insert(toolMessages);
     }
   }
-  async realtimeGenerationTask(speechHandle, ev, modelSettings) {
+  async realtimeGenerationTask(speechHandle, ev, modelSettings, replyAbortController) {
     var _a, _b, _c;
     speechHandleStorage.enterWith(speechHandle);
     if (!this.realtimeSession) {
@@ -1073,20 +1100,20 @@ ${instructions}` : instructions,
       throw new Error("llm is not a realtime model");
     }
     this.logger.debug(
-      { speech_id: speechHandle.id, stepIndex: speechHandle.stepIndex },
+      { speech_id: speechHandle.id, stepIndex: speechHandle.numSteps },
       "realtime generation started"
     );
     const audioOutput = this.agentSession.output.audioEnabled ? this.agentSession.output.audio : null;
     const textOutput = this.agentSession.output.transcriptionEnabled ? this.agentSession.output.transcription : null;
     const toolCtx = this.realtimeSession.tools;
     await speechHandle.waitIfNotInterrupted([speechHandle._waitForAuthorization()]);
+    speechHandle._clearAuthorization();
     if (speechHandle.interrupted) {
       return;
     }
     const onFirstFrame = () => {
       this.agentSession._updateAgentState("speaking");
     };
-    const replyAbortController = new AbortController();
     const readMessages = async (abortController, outputs) => {
       const forwardTasks = [];
       try {
@@ -1170,9 +1197,13 @@ ${instructions}` : instructions,
         "AgentActivity.realtime_generation.read_tool_stream"
       )
     );
-    const onToolExecutionStarted = (_) => {
+    const onToolExecutionStarted = (f) => {
+      speechHandle._itemAdded([f]);
     };
-    const onToolExecutionCompleted = (_) => {
+    const onToolExecutionCompleted = (out) => {
+      if (out.toolCallOutput) {
+        speechHandle._itemAdded([out.toolCallOutput]);
+      }
     };
     const [executeToolsTask, toolOutput] = performToolExecutions({
       session: this.agentSession,
@@ -1228,7 +1259,7 @@ ${instructions}` : instructions,
             interrupted: true
           });
           this.agent._chatCtx.insert(message);
-          speechHandle._setChatMessage(message);
+          speechHandle._itemAdded([message]);
           this.agentSession._conversationItemAdded(message);
         }
         this.logger.info(
@@ -1236,7 +1267,7 @@ ${instructions}` : instructions,
           "playout completed with interrupt"
         );
       }
-      speechHandle._markPlayoutDone();
+      speechHandle._markGenerationDone();
       await executeToolsTask.cancelAndWait(AgentActivity.REPLY_TASK_CANCEL_TIMEOUT);
       return;
     }
@@ -1249,17 +1280,17 @@ ${instructions}` : instructions,
         interrupted: false
       });
       this.agent._chatCtx.insert(message);
-      speechHandle._setChatMessage(message);
+      speechHandle._itemAdded([message]);
       this.agentSession._conversationItemAdded(message);
     }
-    speechHandle._markPlayoutDone();
+    speechHandle._markGenerationDone();
     toolOutput.firstToolStartedFuture.await.finally(() => {
       this.agentSession._updateAgentState("thinking");
     });
     await executeToolsTask.result;
     if (toolOutput.output.length === 0) return;
     const { maxToolSteps } = this.agentSession.options;
-    if (speechHandle.stepIndex >= maxToolSteps) {
+    if (speechHandle.numSteps >= maxToolSteps) {
       this.logger.warn(
         { speech_id: speechHandle.id, max_tool_steps: maxToolSteps },
         "maximum number of function calls steps reached"
@@ -1323,7 +1354,7 @@ ${instructions}` : instructions,
     this.realtimeSession.interrupt();
     const replySpeechHandle = SpeechHandle.create({
       allowInterruptions: speechHandle.allowInterruptions,
-      stepIndex: speechHandle.stepIndex + 1,
+      stepIndex: speechHandle.numSteps + 1,
       parent: speechHandle
     });
     this.agentSession.emit(
@@ -1336,10 +1367,13 @@ ${instructions}` : instructions,
     );
     const toolChoice = draining || modelSettings.toolChoice === "none" ? "none" : "auto";
     this.createSpeechTask({
-      promise: this.realtimeReplyTask({
-        speechHandle: replySpeechHandle,
-        modelSettings: { toolChoice }
-      }),
+      task: Task.from(
+        (abortController) => this.realtimeReplyTask({
+          speechHandle: replySpeechHandle,
+          modelSettings: { toolChoice },
+          abortController
+        })
+      ),
       ownedSpeechHandle: replySpeechHandle,
       name: "AgentActivity.realtime_reply"
     });
@@ -1349,7 +1383,8 @@ ${instructions}` : instructions,
     speechHandle,
     modelSettings: { toolChoice },
     userInput,
-    instructions
+    instructions,
+    abortController
   }) {
     speechHandleStorage.enterWith(speechHandle);
     if (!this.realtimeSession) {
@@ -1372,18 +1407,24 @@ ${instructions}` : instructions,
     }
     try {
       const generationEvent = await this.realtimeSession.generateReply(instructions);
-      await this.realtimeGenerationTask(speechHandle, generationEvent, { toolChoice });
+      await this.realtimeGenerationTask(
+        speechHandle,
+        generationEvent,
+        { toolChoice },
+        abortController
+      );
     } finally {
       if (toolChoice !== void 0 && toolChoice !== originalToolChoice) {
         this.realtimeSession.updateOptions({ toolChoice: originalToolChoice });
       }
     }
   }
-  scheduleSpeech(speechHandle, priority, bypassDraining = false) {
-    if (this.draining && !bypassDraining) {
+  scheduleSpeech(speechHandle, priority, force = false) {
+    if (this.draining && !force) {
       throw new Error("cannot schedule new speech, the agent is draining");
     }
     this.speechQueue.push([priority, Number(process.hrtime.bigint()), speechHandle]);
+    speechHandle._markScheduled();
     this.wakeupMainTask();
   }
   async drain() {
@@ -1392,7 +1433,7 @@ ${instructions}` : instructions,
     try {
       if (this._draining) return;
       this.createSpeechTask({
-        promise: this.agent.onExit(),
+        task: Task.from(() => this.agent.onExit()),
         name: "AgentActivity_onExit"
       });
       this.wakeupMainTask();